In [1]:
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Set

def load_json_file(file_path: str) -> Optional[Dict]:
    """Load a JSON file and validate its top-level structure.

    The file must hold a JSON object with an ``entities`` object
    (category -> items) and a ``relationships`` array, because callers
    iterate ``data['entities'].items()`` and ``data['relationships']``
    directly.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed dictionary, or ``None`` when the file cannot be read, is
        not valid JSON, or does not have the expected structure. An error
        message is printed in every failure case.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading {file_path}: {e}")
        return None

    # Check the container types, not just key presence, so that a malformed
    # file is rejected here instead of crashing later in the callers.
    if (
        not isinstance(data, dict)
        or not isinstance(data.get('entities'), dict)
        or not isinstance(data.get('relationships'), list)
    ):
        print(f"Error loading {file_path}: Invalid file structure in {file_path}")
        return None

    return data

def print_file_statistics(file_path: str, data: Dict) -> None:
    """Print a per-category entity count and the relationship count.

    Args:
        file_path: Label shown in the report header (the file the data
            came from).
        data: Mapping with an ``entities`` mapping (category -> sized
            collection) and a sized ``relationships`` collection.
    """
    header = f"\nStatistics for {file_path}:"
    print(header)
    print("Entities:")
    for name, members in data['entities'].items():
        count = len(members)
        print(f"  - {name}: {count} items")
    relationship_total = len(data['relationships'])
    print(f"Total relationships: {relationship_total}")

def merge_json_files(file_paths: List[str], output_path: str) -> None:
    """Merge multiple JSON files containing entities and relationships.

    Entities are unioned per category. Relationships are de-duplicated on
    their ``(subject, predicate, object)`` triple, keeping the first
    occurrence. Files that fail to load are skipped. Per-file and final
    statistics are printed, and the merged result is written to
    ``output_path`` as indented JSON.

    Entities and relationships are emitted in first-seen order, so running
    the merge twice on the same inputs produces the same output file.

    Args:
        file_paths: Paths of the input JSON files, processed in order.
        output_path: Path of the merged JSON file to write (UTF-8).

    Raises:
        KeyError: If a relationship lacks ``subject``, ``predicate`` or
            ``object``.
    """
    # category -> dict used as an insertion-ordered set. Dict keys
    # de-duplicate like a set but keep first-seen order, whereas a plain set
    # would serialize in an arbitrary, hash-seed-dependent order.
    merged_entities = defaultdict(dict)
    merged_relationships = []

    # Triples already emitted, used to skip duplicate relationships.
    relationship_tracker = set()

    for file_path in file_paths:
        print(f"\nProcessing: {file_path}")
        data = load_json_file(file_path)
        if not data:
            # load_json_file has already reported the problem.
            continue

        # Print statistics before merging
        print_file_statistics(file_path, data)

        # Merge entities; the key is created even when `items` is empty so
        # that empty categories still appear in the output.
        for category, items in data['entities'].items():
            merged_entities[category].update(dict.fromkeys(items))

        # Merge relationships (avoiding duplicates)
        for rel in data['relationships']:
            rel_id = (rel['subject'], rel['predicate'], rel['object'])
            if rel_id not in relationship_tracker:
                relationship_tracker.add(rel_id)
                merged_relationships.append(rel)

    # Convert the ordered-set dicts to lists for JSON serialization
    final_merged_data = {
        "entities": {k: list(v) for k, v in merged_entities.items()},
        "relationships": merged_relationships
    }

    # Print final statistics
    print("\nFinal merged statistics:")
    print("Entities:")
    for category, items in final_merged_data['entities'].items():
        print(f"  - {category}: {len(items)} items")
    print(f"Total relationships: {len(final_merged_data['relationships'])}")

    # Save merged data
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_merged_data, f, indent=2)
    print(f"\nMerged data saved to: {output_path}")

def verify_merge(input_files: List[str], output_file: str) -> None:
    """Verify that the merged file matches the union of the input files.

    Re-reads every input file, rebuilds the expected per-category entity
    sets and the expected set of ``(subject, predicate, object)`` triples,
    and compares them with the contents of ``output_file``. Differences are
    printed as missing/extra items, followed by an overall verdict.

    Args:
        input_files: Paths of the files that were merged. Files that fail to
            load are skipped.
        output_file: Path of the merged JSON file to check. If it cannot be
            loaded, the function returns without printing a verdict.
    """
    print("\nVerifying merge...")

    # Load merged file
    merged_data = load_json_file(output_file)
    if not merged_data:
        return

    # Expected content, rebuilt from the input files
    total_entities_by_category = defaultdict(set)
    total_relationships = set()

    for file_path in input_files:
        data = load_json_file(file_path)
        if not data:
            continue

        # Collect entities
        for category, items in data['entities'].items():
            total_entities_by_category[category].update(items)

        # Collect relationships
        for rel in data['relationships']:
            total_relationships.add((rel['subject'], rel['predicate'], rel['object']))

    # Compare every category seen in the inputs, then any category that
    # exists only in the merged file. Without the second group, an
    # unexpected extra category would never be reported.
    categories = list(total_entities_by_category)
    categories.extend(
        category for category in merged_data['entities']
        if category not in total_entities_by_category
    )

    # Verify entities
    all_entities_preserved = True
    for category in categories:
        # .get() avoids creating new keys in the defaultdict.
        items = total_entities_by_category.get(category, set())
        merged_items = set(merged_data['entities'].get(category, []))
        if items != merged_items:
            print(f"Warning: Mismatch in category {category}")
            print(f"  Missing items: {items - merged_items}")
            print(f"  Extra items: {merged_items - items}")
            all_entities_preserved = False

    # Verify relationships
    merged_relationships = {
        (rel['subject'], rel['predicate'], rel['object'])
        for rel in merged_data['relationships']
    }

    relationships_preserved = total_relationships == merged_relationships
    if not relationships_preserved:
        print("Warning: Relationship mismatch")
        print(f"  Missing relationships: {total_relationships - merged_relationships}")
        print(f"  Extra relationships: {merged_relationships - total_relationships}")

    if all_entities_preserved and relationships_preserved:
        print("Verification successful! All information preserved in merge.")
    else:
        print("Verification failed! Some information may have been lost.")

# Main execution: merge the three extraction outputs into one file, then
# re-read everything to confirm the merge matches the inputs.
if __name__ == "__main__":
    # Define base directory and files
    base_dir = Path(r"D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output")

    # Define input and output files with full paths
    input_files = [
        base_dir / "scd_entities_relationships_1.json",
        base_dir / "scd_entities_relationships_2.json",
        base_dir / "scd_entities_relationships_3.json"
    ]

    # Convert Path objects to strings and keep only the files that exist,
    # warning about each missing one.
    input_files_str = []
    for file_path in input_files:
        if file_path.exists():
            input_files_str.append(str(file_path))
        else:
            print(f"Warning: File not found: {file_path}")

    if not input_files_str:
        print("No input files found. Please check the file paths.")
        # `exit()` is injected by the `site` module for interactive use and
        # is not guaranteed to exist; raising SystemExit always works.
        raise SystemExit(1)

    # Define output file path
    output_file = str(base_dir / "scd_entities_relationships_total.json")

    # Perform merge
    merge_json_files(input_files_str, output_file)

    # Verify the merge
    verify_merge(input_files_str, output_file)


Processing: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_1.json

Statistics for D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_1.json:
Entities:
  - Conditions: 471 items
  - Symptoms: 525 items
  - Care_Providers: 113 items
  - Diagnostic_Tests: 87 items
  - Risk_Factors: 285 items
  - Treatments: 385 items
  - Equipment: 14 items
  - Complications: 193 items
  - Medications: 174 items
  - Supplies: 12 items
Total relationships: 713

Processing: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_2.json

Statistics for D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_2.json:
Entities:
  - Conditions: 1633 items
  - Symptoms: 2663 items
  - Treatments: 2043 items
  - Complications: 747 items
  - Diagnostic_Tests: 560 items
  - Care_Providers: 489 items
 