In [None]:
# Cell 1: Imports
import pandas as pd
import sys
import os
import json

# Add utils to path
# The directory is resolved as <current working directory>/../utils, so the
# lookup depends on the directory the kernel is running in.
utils_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'utils'))
# The membership guard keeps re-runs of this cell from adding a duplicate
# entry; inserting at index 0 places the directory ahead of all other
# sys.path entries.
if utils_path not in sys.path:
    sys.path.insert(0, utils_path)

from create_training_dataset_utils import (
    load_medcat_export,
    get_validated_entities,
    get_validated_dates,
    get_validated_relative_dates,
    get_validated_relations,
    _is_valid_ann,
    doc_to_entities_json,
    doc_to_dates_json,
    doc_to_relative_dates_json,
    doc_to_relations_json,
    doc_to_relations_value_json,
    id2value_from_items,
    make_row
)

In [None]:
# Cell 2: Constants
# CUI passed to get_validated_entities, get_validated_dates and
# get_validated_relations; test_relations labels annotations carrying it as
# "Absolute Date".
# NOTE(review): looks like a SNOMED CT concept ID - confirm.
DATE_CUI = "410671006"
# CUI passed to get_validated_relative_dates and get_validated_relations;
# test_relations labels annotations carrying it as "Relative Date".
RELATIVE_DATE_CUI = "118578006"

In [None]:
# Cell 3: Load Data
def test_load_medcat_export(path="../data/MedCAT_Export_NPH.json"):
    """Load a MedCAT export and print a short structural summary.

    Args:
        path: Location of the export file handed to ``load_medcat_export``.
            Defaults to the export this notebook was written against, so
            calling the function without arguments behaves as before.

    Returns:
        The object returned by ``load_medcat_export`` (read here with
        ``.get`` and indexing, i.e. treated as a dict with a ``projects``
        list whose items hold a ``documents`` list).

    The summary stops early, with an explicit message, when the export has
    no projects or the first project has no documents, instead of failing
    with an opaque KeyError/IndexError while indexing element 0.
    """
    data = load_medcat_export(path)

    print("Data Structure:")
    # `or []` also covers a key that is present but set to None.
    projects = data.get('projects') or []
    print(f"Number of projects: {len(projects)}")
    if not projects:
        print("\nNo projects found in export; nothing further to inspect.")
        return data

    # Look at first project
    first_project = projects[0]
    print("\nFirst Project:")
    documents = first_project.get('documents') or []
    print(f"Number of documents: {len(documents)}")
    if not documents:
        print("\nFirst project has no documents; nothing further to inspect.")
        return data

    # Look at first document
    first_doc = documents[0]
    print("\nFirst Document:")
    print(f"Document ID: {first_doc.get('id')}")
    print(f"Number of annotations: {len(first_doc.get('annotations', []))}")

    return data

# Run test
data = test_load_medcat_export()

In [None]:
# Cell 4: Test Entities
def test_entities(data):
    """Summarise the validated entities of the first document.

    Takes the first document of the first project in ``data``, passes it to
    ``get_validated_entities`` together with ``DATE_CUI``, prints the number
    of results plus up to three samples, and returns the full result.
    """
    doc = data['projects'][0]['documents'][0]
    found = get_validated_entities(doc, DATE_CUI)

    print("Validated Entities in First Document:")
    print(f"Number of entities: {len(found)}")
    print("\nFirst 3 entities:")
    for ent in found[:3]:
        value, cui = ent['value'], ent['cui']
        span = f"{ent['start']}-{ent['end']}"
        print(f"- {value} (CUI: {cui}, Position: {span})")

    return found

# Run test
entities = test_entities(data)

In [None]:
# Cell 5: Test Dates
def test_dates(data):
    """Summarise the validated dates of the first document.

    Takes the first document of the first project in ``data``, passes it to
    ``get_validated_dates`` together with ``DATE_CUI``, prints the number of
    results plus up to three samples, and returns the full result.
    """
    doc = data['projects'][0]['documents'][0]
    found = get_validated_dates(doc, DATE_CUI)

    print("Validated Dates in First Document:")
    print(f"Number of dates: {len(found)}")
    print("\nFirst 3 dates:")
    for date_item in found[:3]:
        value = date_item['value']
        span = f"{date_item['start']}-{date_item['end']}"
        print(f"- {value} (Position: {span})")

    return found

# Run test
dates = test_dates(data)

In [None]:
# Cell 6: Test Relative Dates
def test_relative_dates(data):
    """Summarise the validated relative dates of the first document.

    Takes the first document of the first project in ``data``, passes it to
    ``get_validated_relative_dates`` together with ``RELATIVE_DATE_CUI``,
    prints the number of results plus up to three samples, and returns the
    full result.
    """
    doc = data['projects'][0]['documents'][0]
    found = get_validated_relative_dates(doc, RELATIVE_DATE_CUI)

    print("Validated Relative Dates in First Document:")
    print(f"Number of relative dates: {len(found)}")
    print("\nFirst 3 relative dates:")
    for rel_date in found[:3]:
        value = rel_date['value']
        span = f"{rel_date['start']}-{rel_date['end']}"
        print(f"- {value} (Position: {span})")

    return found

# Run test
relative_dates = test_relative_dates(data)

In [None]:
# Cell 7: Test Relations
def test_relations(data):
    """Print details of the first relations found in the first document.

    Calls ``get_validated_relations`` on the first document of the first
    project, then, for up to ten relations, looks up the annotations named by
    ``date_id`` and ``entity_id`` and prints their value, CUI and positions.
    Returns the full list of relations.
    """
    doc = data['projects'][0]['documents'][0]
    rels = get_validated_relations(doc, DATE_CUI, RELATIVE_DATE_CUI)

    print("Relations in First Document:")
    print(f"Number of relations: {len(rels)}")

    # Index the annotations accepted by _is_valid_ann by their id so each
    # relation endpoint can be resolved with a single lookup.
    valid_by_id = {}
    for ann in doc.get("annotations", []):
        if _is_valid_ann(ann):
            valid_by_id[ann["id"]] = ann

    # Look at the first few relations in detail
    print("\nFirst 10 relations with details:")
    for number, rel in enumerate(rels[:10], start=1):
        # An id with no accepted annotation falls back to {}, so every field
        # below prints as None instead of raising.
        date_ann = valid_by_id.get(rel['date_id'], {})
        entity_ann = valid_by_id.get(rel['entity_id'], {})

        date_cui = date_ann.get("cui")
        if date_cui == DATE_CUI:
            date_type = "Absolute Date"
        elif date_cui == RELATIVE_DATE_CUI:
            date_type = "Relative Date"
        else:
            date_type = "Unknown"

        print(f"\nRelation {number}:")
        print(f"Entity: {entity_ann.get('value')} (CUI: {entity_ann.get('cui')})")
        print(f"Date: {date_ann.get('value')} ({date_type}, CUI: {date_cui})")
        entity_span = f"{entity_ann.get('start')}-{entity_ann.get('end')}"
        date_span = f"{date_ann.get('start')}-{date_ann.get('end')}"
        print(f"Positions: Entity {entity_span}, Date {date_span}")

    return rels

# Run test
relations = test_relations(data)

In [None]:
# Cell 8: Test JSON Serialization
def test_json_serialization(data):
    """Check that every serializer output can be parsed back as JSON.

    For the first document of the first project, the validated entities,
    dates, relative dates and relations are each passed to their
    ``doc_to_*_json`` serializer. For every result the function prints its
    type, whether ``json.loads`` accepts it, and the number of parsed items.

    "Can parse back" reflects whether parsing succeeded, not the truthiness
    of the parsed value, so a valid but empty result (e.g. ``[]``) is
    reported as True.

    Args:
        data: Loaded export; indexed as ``data['projects'][0]['documents'][0]``.

    Returns:
        Tuple ``(entities_json, dates_json, relative_dates_json,
        relations_json)`` holding the serializer outputs.

    Raises:
        TypeError, json.JSONDecodeError: Re-raised, after printing
            "Can parse back: False", when an output cannot be parsed.
    """
    # The serializers are already imported in the notebook's import cell, so
    # no function-level re-import is needed here.

    # Get sample data from first document
    first_doc = data['projects'][0]['documents'][0]
    entities = get_validated_entities(first_doc, DATE_CUI)
    dates = get_validated_dates(first_doc, DATE_CUI)
    relative_dates = get_validated_relative_dates(first_doc, RELATIVE_DATE_CUI)
    relations = get_validated_relations(first_doc, DATE_CUI, RELATIVE_DATE_CUI)

    # (heading, noun used in the count line, serializer, items to serialize)
    checks = [
        ("Entities", "entities", doc_to_entities_json, entities),
        ("Dates", "dates", doc_to_dates_json, dates),
        ("Relative Dates", "relative dates", doc_to_relative_dates_json, relative_dates),
        ("Relations", "relations", doc_to_relations_json, relations),
    ]

    # Test each serialization function
    print("Testing JSON serialization:")
    serialized = []
    for heading, noun, serialize, items in checks:
        # Serialize inside the loop so earlier sections are already printed
        # if a later serializer fails.
        payload = serialize(items)
        print(f"\n{heading} JSON:")
        print(f"- Type: {type(payload)}")
        try:
            # Parse once and reuse the result for the item count.
            parsed = json.loads(payload)
        except (TypeError, json.JSONDecodeError):
            print("- Can parse back: False")
            raise
        print("- Can parse back: True")
        print(f"- Number of {noun}: {len(parsed)}")
        serialized.append(payload)

    return tuple(serialized)

# Run test
json_results = test_json_serialization(data)

In [None]:
# Cell 9: Test ID to Value Mapping
def test_id2value_mapping(data):
    """Check that ``id2value_from_items`` maps item ids to item values.

    Builds the mapping from the validated entities, dates and relative dates
    of the first document of the first project, then, for up to three items
    of each kind, prints the looked-up value and asserts that it equals the
    item's own ``value``.

    Args:
        data: Loaded export; indexed as ``data['projects'][0]['documents'][0]``.

    Returns:
        The mapping returned by ``id2value_from_items``.

    Raises:
        AssertionError: If an id is absent from the mapping or maps to a
            value different from the item's ``value``.
    """
    # id2value_from_items is already imported in the notebook's import cell,
    # so no function-level re-import is needed here.

    # Get sample data
    first_doc = data['projects'][0]['documents'][0]
    entities = get_validated_entities(first_doc, DATE_CUI)
    dates = get_validated_dates(first_doc, DATE_CUI)
    relative_dates = get_validated_relative_dates(first_doc, RELATIVE_DATE_CUI)

    # Create mapping
    id2value = id2value_from_items(entities, dates, relative_dates)

    print("ID to Value Mapping Results:")
    print(f"Total mapped items: {len(id2value)}")

    # (heading, noun used in assertion messages, items to check)
    groups = [
        ("Entity", "entity", entities),
        ("Date", "date", dates),
        ("Relative Date", "relative date", relative_dates),
    ]
    for heading, noun, items in groups:
        print(f"\n{heading} Mappings (first 3):")
        for item in items[:3]:
            item_id = item['id']
            mapped_value = id2value.get(item_id)
            print(f"- ID {item_id}: {item['value']} -> {mapped_value}")
            # .get() yields None for an absent id, which would compare equal
            # to an item whose value is None; check presence explicitly.
            assert item_id in id2value, f"No mapping for {noun} {item_id}"
            assert mapped_value == item['value'], f"Mapping mismatch for {noun} {item_id}"

    return id2value

# Run test
id2value = test_id2value_mapping(data)

In [None]:
# Cell 10: Test Row Creation
def test_row_creation(data):
    """Build one dataset row with ``make_row`` and report on its structure.

    The row is built from the first document of the first project: its id,
    its text, and the JSON strings produced by the ``doc_to_*_json``
    serializers (relations are serialized with
    ``doc_to_relations_value_json`` using the id-to-value mapping).

    The report lists which expected fields are present, the type of every
    field, and whether each JSON field parses. A JSON field that is missing
    or holds a non-string value is reported as a failure line instead of
    aborting the report.

    Args:
        data: Loaded export; indexed as ``data['projects'][0]['documents'][0]``.

    Returns:
        The row returned by ``make_row``.
    """
    # make_row is already imported in the notebook's import cell, so no
    # function-level re-import is needed here.

    # Get sample data
    first_doc = data['projects'][0]['documents'][0]
    entities = get_validated_entities(first_doc, DATE_CUI)
    dates = get_validated_dates(first_doc, DATE_CUI)
    relative_dates = get_validated_relative_dates(first_doc, RELATIVE_DATE_CUI)
    relations = get_validated_relations(first_doc, DATE_CUI, RELATIVE_DATE_CUI)

    # Create JSON strings
    entities_json = doc_to_entities_json(entities)
    dates_json = doc_to_dates_json(dates)
    relative_dates_json = doc_to_relative_dates_json(relative_dates)
    id2value = id2value_from_items(entities, dates, relative_dates)
    relations_json = doc_to_relations_value_json(relations, id2value)

    # Create row
    row = make_row(
        doc_id=first_doc.get("id"),
        note_text=first_doc.get("text", ""),
        entities_json=entities_json,
        dates_json=dates_json,
        relative_dates_json=relative_dates_json,
        relations_json=relations_json
    )

    # Single source of truth for the field names checked below.
    json_fields = ["entities_json", "dates_json", "relative_dates_json", "relations_json"]
    required_fields = ["doc_id", "note_text"] + json_fields

    print("Row Structure Verification:")
    print("\nRequired Fields:")
    for field in required_fields:
        print(f"- {field}: {'Present' if field in row else 'Missing'}")

    print("\nField Types:")
    for field, value in row.items():
        print(f"- {field}: {type(value)}")

    print("\nJSON Fields Can Parse:")
    for field in json_fields:
        # A field already reported as "Missing" above must not crash the
        # parse check with a KeyError.
        if field not in row:
            print(f"- {field}: No (missing)")
            continue
        try:
            parsed = json.loads(row[field])
        except (TypeError, json.JSONDecodeError):
            # TypeError: json.loads rejects values that are not str/bytes.
            print(f"- {field}: No (invalid JSON)")
        else:
            print(f"- {field}: Yes (contains {len(parsed)} items)")

    return row

# Run test
sample_row = test_row_creation(data)

In [None]:
# Cell 11: Test Error Handling
def test_error_handling():
    """Exercise the validation helpers with empty and malformed input.

    Three scenarios are printed: an empty document passed to every
    ``get_validated_*`` helper, a handful of hand-written annotations passed
    through ``_is_valid_ann``, and a document holding only hand-written
    relations passed to ``get_validated_relations``. Only counts are printed;
    nothing is asserted or returned.
    """
    from create_training_dataset_utils import (
        get_validated_entities,
        get_validated_dates,
        get_validated_relative_dates,
        get_validated_relations,
        _is_valid_ann
    )

    print("Testing Error Handling:")

    # Test with empty document
    blank_doc = {}
    print("\nEmpty Document:")
    entity_count = len(get_validated_entities(blank_doc, DATE_CUI))
    print(f"- Entities: {entity_count}")
    date_count = len(get_validated_dates(blank_doc, DATE_CUI))
    print(f"- Dates: {date_count}")
    relative_date_count = len(get_validated_relative_dates(blank_doc, RELATIVE_DATE_CUI))
    print(f"- Relative Dates: {relative_date_count}")
    relation_count = len(get_validated_relations(blank_doc, DATE_CUI, RELATIVE_DATE_CUI))
    print(f"- Relations: {relation_count}")

    # Test with invalid annotations
    sample_annotations = [
        {"id": 1},  # Missing required fields
        {"id": 2, "value": "test", "correct": None},  # None for correct
        {"id": 3, "value": "test", "deleted": None},  # None for deleted
        {"id": 4, "value": "test", "correct": True, "deleted": False}  # Valid
    ]
    invalid_doc = {"annotations": sample_annotations}

    print("\nInvalid Annotations:")
    accepted = [ann for ann in invalid_doc["annotations"] if _is_valid_ann(ann)]
    valid_count = len(accepted)
    print(f"- Valid annotations found: {valid_count} out of {len(invalid_doc['annotations'])}")

    # Test with invalid relations
    sample_relations = [
        {},  # Empty relation
        {"start_entity": 1},  # Missing end_entity
        {"end_entity": 2},  # Missing start_entity
        {"start_entity": 3, "end_entity": 4}  # Valid structure
    ]
    invalid_relations_doc = {"relations": sample_relations}

    print("\nInvalid Relations:")
    kept = get_validated_relations(invalid_relations_doc, DATE_CUI, RELATIVE_DATE_CUI)
    print(f"- Valid relations found: {len(kept)} out of {len(invalid_relations_doc['relations'])}")

# Run test
test_error_handling()