In [5]:
import json
from collections import defaultdict
from pathlib import Path

def analyze_deduplication(input_file):
    """
    Analyze the potential impact of deduplication before processing.

    Loads the JSON document at ``input_file``, prints a summary of its
    entity categories and relationships, and returns the collected counts.

    Args:
        input_file: Path of the JSON file to analyze. The top-level value
            must be a dictionary with a 'relationships' key; an 'entities'
            key is optional and treated as empty when absent.

    Returns:
        A dictionary with the keys 'total_relationships', 'unique_sources',
        'source_details', 'unique_triples', 'unique_subjects',
        'unique_predicates' and 'unique_objects', or None when the file
        cannot be read, has an unexpected structure, or contains a
        malformed relationship.
    """
    print(f"\nAnalyzing file: {input_file}")

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check data structure before touching any keys
        if not isinstance(data, dict) or 'relationships' not in data:
            print("Error: Unexpected data structure. Expected dictionary with 'relationships' key.")
            return None

        relationships = data['relationships']

        # A missing 'entities' section is reported as zero categories
        # instead of aborting the whole analysis with a KeyError.
        entities = data.get('entities', {})
        print("\nEntities Analysis:")
        print(f"Total entities categories: {len(entities)}")
        for category, items in entities.items():
            print(f"  - {category}: {len(items)} items")

        # Relationships per (title, page_number) source; the keys double as
        # the set of unique sources.
        source_details = defaultdict(int)
        triples = set()
        subjects = set()
        predicates = set()
        objects = set()

        for rel in relationships:
            # Relationships without a 'source' still count towards the
            # triple/element statistics, just not towards the sources.
            if 'source' in rel:
                source_key = (
                    rel['source']['title'],
                    rel['source']['page_number']
                )
                source_details[source_key] += 1

            subject, predicate, obj = rel['subject'], rel['predicate'], rel['object']
            triples.add((subject, predicate, obj))
            subjects.add(subject)
            predicates.add(predicate)
            objects.add(obj)

        analysis = {
            'total_relationships': len(relationships),
            'unique_sources': len(source_details),
            'source_details': dict(source_details),  # Convert to regular dict for storage
            'unique_triples': len(triples),
            'unique_subjects': len(subjects),
            'unique_predicates': len(predicates),
            'unique_objects': len(objects)
        }

        print("\nRelationships Analysis:")
        print(f"Total relationships: {analysis['total_relationships']}")
        print(f"Unique sources: {analysis['unique_sources']}")
        print("Source distribution:")
        # Only the five sources with the most relationships are listed
        top_sources = sorted(source_details.items(), key=lambda x: x[1], reverse=True)[:5]
        for source, count in top_sources:
            print(f"  - {source[0]} (page {source[1]}): {count} relationships")
        print(f"Unique triples: {analysis['unique_triples']}")
        print("Unique elements:")
        print(f"  - Subjects: {analysis['unique_subjects']}")
        print(f"  - Predicates: {analysis['unique_predicates']}")
        print(f"  - Objects: {analysis['unique_objects']}")
        print(f"Potential reduction: {analysis['total_relationships'] - analysis['unique_sources']} relationships")

        return analysis

    except Exception as e:
        # Deliberately broad: callers rely on a None result (not an
        # exception) for any unreadable or malformed input.
        print(f"Error analyzing file: {str(e)}")
        return None

def _relationship_source_key(relationship):
    """Return the (title, page_number) pair identifying a relationship's source."""
    source = relationship['source']
    return (source['title'], source['page_number'])


def deduplicate_relationships(input_file, output_file):
    """
    Deduplicate relationships by keeping one relationship per unique source.

    A source is identified by its (title, page_number) pair. The first
    relationship encountered for each source is kept; later relationships
    from the same source, and relationships without a 'source' key, are
    dropped. The result is written to ``output_file`` together with the
    input's 'entities' section (an empty mapping when the input has none).

    Args:
        input_file: Path of the JSON file to read. The top-level value must
            be a dictionary with a 'relationships' key.
        output_file: Path the deduplicated JSON document is written to.

    Returns:
        A tuple ``(original_count, final_count)`` with the number of
        relationships before and after deduplication, or ``(None, None)``
        when reading, processing or writing fails.
    """
    print(f"\nDeduplicating file: {input_file}")

    try:
        # Read the input JSON file
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, dict) or 'relationships' not in data:
            raise ValueError("Invalid input file structure")

        relationships = data['relationships']
        original_count = len(relationships)

        # Pre-deduplication source count
        pre_sources = {
            _relationship_source_key(rel)
            for rel in relationships
            if 'source' in rel
        }

        print("\nBefore deduplication:")
        print(f"Total relationships: {original_count}")
        print(f"Unique sources: {len(pre_sources)}")

        # Keep only the first relationship seen for each source; dicts
        # preserve insertion order, so the output order follows the input.
        source_relationships = {}
        for relationship in relationships:
            if 'source' in relationship:
                source_relationships.setdefault(
                    _relationship_source_key(relationship), relationship
                )

        merged_relationships = list(source_relationships.values())

        # Post-deduplication source count
        post_sources = {
            _relationship_source_key(rel)
            for rel in merged_relationships
            if 'source' in rel
        }

        print("\nAfter deduplication:")
        print(f"Total relationships: {len(merged_relationships)}")
        print(f"Unique sources: {len(post_sources)}")

        # Sanity check raised explicitly so it still runs under `python -O`,
        # which strips assert statements.
        if len(merged_relationships) != len(post_sources):
            raise RuntimeError("Number of relationships does not match number of unique sources")

        # Create output data structure
        output_data = {
            'entities': data.get('entities', {}),  # Preserve original entities
            'relationships': merged_relationships
        }

        # Write deduplicated data
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        final_count = len(merged_relationships)
        removed_count = original_count - final_count
        # An empty input has nothing to reduce; avoid dividing by zero after
        # the output file has already been written successfully.
        reduction_pct = (removed_count / original_count * 100) if original_count else 0.0

        print("\nDeduplication Summary:")
        print(f"Original relationships: {original_count}")
        print(f"Original unique sources: {len(pre_sources)}")
        print(f"After deduplication: {final_count}")
        print(f"Final unique sources: {len(post_sources)}")
        print(f"Removed relationships: {removed_count}")
        print(f"Reduction percentage: {reduction_pct:.2f}%")
        print(f"\nDeduplicated file saved to: {output_file}")

        return original_count, final_count

    except Exception as e:
        # Deliberately broad: callers rely on a (None, None) result rather
        # than an exception for any failure.
        print(f"Error during deduplication: {str(e)}")
        return None, None

if __name__ == "__main__":
    # Script entry point: analyze the input file, deduplicate it, then
    # re-run the analysis on the written output as a verification step.

    # Define file paths
    base_dir = Path(r"D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output")
    input_file = base_dir / "scd_entities_relationships_total.json"
    output_file = base_dir / "scd_entities_relationships_total_deduplicated.json"

    # Check if input file exists
    if not input_file.exists():
        print(f"Error: Input file not found: {input_file}")
        # `exit` is injected by the `site` module for interactive use and is
        # not guaranteed to exist; SystemExit is the portable equivalent.
        raise SystemExit(1)

    # First analyze the data
    analysis = analyze_deduplication(str(input_file))

    if analysis is not None:
        # Proceed with deduplication without asking for user input
        original_count, final_count = deduplicate_relationships(str(input_file), str(output_file))

        # Failure is signalled by None; a count of 0 is a valid result and
        # must not skip verification.
        if original_count is not None and final_count is not None:
            # Verify the deduplicated file
            print("\nVerifying deduplicated file...")
            verify_analysis = analyze_deduplication(str(output_file))


Analyzing file: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_total.json

Entities Analysis:
Total entities categories: 51
  - Conditions: 3228 items
  - Symptoms: 4490 items
  - Care_Providers: 1178 items
  - Diagnostic_Tests: 1331 items
  - Risk_Factors: 4699 items
  - Treatments: 4412 items
  - Equipment: 17 items
  - Complications: 1628 items
  - Medications: 528 items
  - Supplies: 16 items
  - Care_Settings: 8 items
  - Preventive_Measures: 94 items
  - Care_Facilities: 2 items
  - Anatomical_Features: 8 items
  - Developmental_Milestones: 145 items
  - Activities: 28 items
  - Developmental_Skills: 5 items
  - Nutrition: 39 items
  - Safe_Foods: 8 items
  - Behaviors: 5 items
  - Vaccines: 5 items
  - Safety_Measures: 14 items
  - Developmental_Changes: 8 items
  - Movement_Milestones: 10 items
  - Emotional_Milestones: 3 items
  - Social_Milestones: 10 items
  - Recommended_Foods: 30 items
  - Safety_Equipment: 44 it