In [1]:
# Cell 1: Imports (unchanged)
import pandas as pd
import sys
import os
import json

# Add utils to path so the `general_utils` import below can be resolved.
# The directory is `../utils` relative to os.getcwd(), i.e. it depends on the
# working directory the kernel was started in, not on where this notebook lives.
utils_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'utils'))
if utils_path not in sys.path:
    # Prepend once; the membership check keeps re-runs of this cell from
    # stacking duplicate entries onto sys.path.
    sys.path.insert(0, utils_path)

from general_utils import (
    parse_jsonish,
    load_data,
    prepare_sample,
    prepare_all_samples,
    get_entity_date_pairs,
    calculate_metrics
)

In [2]:
# Cell 2: Test load_data
def test_load_data():
    """Test loading and parsing of training dataset"""
    df = load_data("../data/training_dataset.csv")
    
    print("Dataset Overview:")
    print(f"Number of documents: {len(df)}")
    print("\nColumns present:")
    for col in df.columns:
        print(f"- {col}")
    
    # Check first row
    first_row = df.iloc[0]
    print("\nFirst row contents:")
    print(f"Document ID: {first_row.get('doc_id')}")
    print(f"Text length: {len(first_row['note_text'])} characters")
    print(f"Number of entities: {len(first_row['entities_json'])}")
    print(f"Number of dates: {len(first_row['dates_json'])}")
    
    # Sample of parsed content
    print("\nSample entities (first 3):")
    for e in first_row['entities_json'][:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nSample dates (first 3):")
    for d in first_row['dates_json'][:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    return df

# Run test; the result is kept as the notebook-level `df`, which
# test_prepare_sample and test_prepare_all_samples read in later cells.
df = test_load_data()

Dataset Overview:
Number of documents: 10

Columns present:
- doc_id
- note_text
- entities_json
- dates_json
- relative_dates_json
- relations_json

First row contents:
Document ID: 26342
Text length: 5643 characters
Number of entities: 117
Number of dates: 27

Sample entities (first 3):
- LYMPHOCYTES (Position: 4748-4759)
- createnine (Position: 1611-1621)
- gliclazide (Position: 1862-1872)

Sample dates (first 3):
-  11/9/2019 (Position: 1773-1783)
- Dec 2018 (Position: 1629-1637)
- 10/09/2019 (Position: 4482-4492)


In [3]:
# Cell 3: Test prepare_sample
def test_prepare_sample():
    """Test preparation of a single sample"""
    # Get first row
    row = df.iloc[0]
    
    # Prepare sample
    note_text, entities_list, dates = prepare_sample(row)
    
    print("Sample Preparation Results:")
    print(f"\nText length: {len(note_text)} characters")
    print(f"Number of entities: {len(entities_list)}")
    print(f"Number of dates: {len(dates)}")
    
    print("\nFirst 3 entities:")
    for e in entities_list[:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nFirst 3 dates:")
    for d in dates[:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    return note_text, entities_list, dates

# Run test; the returned triple is unpacked into notebook-level names.
note_text, entities_list, dates = test_prepare_sample()

Sample Preparation Results:

Text length: 5643 characters
Number of entities: 117
Number of dates: 27

First 3 entities:
- LYMPHOCYTES (Position: 4748-4759)
- createnine (Position: 1611-1621)
- gliclazide (Position: 1862-1872)

First 3 dates:
-  11/9/2019 (Position: 1773-1783)
- Dec 2018 (Position: 1629-1637)
- 10/09/2019 (Position: 4482-4492)


In [4]:
# Cell 4: Test prepare_all_samples
def test_prepare_all_samples():
    """Test preparation of all samples"""
    samples = prepare_all_samples(df)
    
    print("All Samples Preparation Results:")
    print(f"Number of samples prepared: {len(samples)}")
    
    # Check first sample
    first_sample = samples[0]
    print("\nFirst sample contents:")
    print(f"- doc_id: {first_sample['doc_id']}")
    print(f"- Text length: {len(first_sample['note_text'])} characters")
    print(f"- Number of entities: {len(first_sample['entities_list'])}")
    print(f"- Number of dates: {len(first_sample['dates'])}")
    print(f"- Number of relative dates: {len(first_sample['relative_dates'])}")
    
    # Print first few entities and dates
    print("\nFirst 3 entities:")
    for e in first_sample['entities_list'][:3]:
        print(f"- {e['value']} (Position: {e['start']}-{e['end']})")
    
    print("\nFirst 3 dates:")
    for d in first_sample['dates'][:3]:
        print(f"- {d['value']} (Position: {d['start']}-{d['end']})")
    
    print("\nFirst 3 relative dates:")
    for rd in first_sample['relative_dates'][:3]:
        print(f"- {rd['value']} (Position: {rd['start']}-{rd['end']})")
    
    return samples

# Run test; the result is kept as the notebook-level `samples`, which
# test_get_entity_date_pairs and test_relative_dates read in later cells.
samples = test_prepare_all_samples()

All Samples Preparation Results:
Number of samples prepared: 10

First sample contents:
- doc_id: 26342
- Text length: 5643 characters
- Number of entities: 117
- Number of dates: 27
- Number of relative dates: 5

First 3 entities:
- LYMPHOCYTES (Position: 4748-4759)
- createnine (Position: 1611-1621)
- gliclazide (Position: 1862-1872)

First 3 dates:
-  11/9/2019 (Position: 1773-1783)
- Dec 2018 (Position: 1629-1637)
- 10/09/2019 (Position: 4482-4492)

First 3 relative dates:
- last few month (Position: 196-210)
-  last 3 months (Position: 317-331)
-  start of 2018 (Position: 292-306)


In [5]:
# Cell 5: Test get_entity_date_pairs
def test_get_entity_date_pairs():
    """Test creation of entity-date pairs"""
    # Get first sample
    sample = samples[0]
    
    # Get pairs
    pairs = get_entity_date_pairs(
        sample['entities_list'],
        sample['dates'],
        sample['relative_dates']
    )
    
    print("Entity-Date Pairs Results:")
    print(f"Total pairs generated: {len(pairs)}")
    
    print("\nFirst 5 pairs:")
    for i, pair in enumerate(pairs[:5]):
        print(f"\nPair {i+1}:")
        print(f"Entity: {pair['entity_label']} ({pair['entity']['start']}-{pair['entity']['end']})")
        print(f"Date: {pair['date']} ({pair['date_info']['start']}-{pair['date_info']['end']})")
        print(f"Distance: {pair['distance']} chars")
        print(f"Date type: {pair['date_type']}")
    
    return pairs

# Run test; the returned list is kept as the notebook-level `pairs`.
# test_relative_dates builds its own local `pairs` and does not read this one.
pairs = test_get_entity_date_pairs()

Entity-Date Pairs Results:
Total pairs generated: 3744

First 5 pairs:

Pair 1:
Entity: LYMPHOCYTES (4748-4759)
Date:  11/9/2019 (1773-1783)
Distance: 2975 chars
Date type: absolute

Pair 2:
Entity: LYMPHOCYTES (4748-4759)
Date: Dec 2018 (1629-1637)
Distance: 3119 chars
Date type: absolute

Pair 3:
Entity: LYMPHOCYTES (4748-4759)
Date: 10/09/2019 (4482-4492)
Distance: 266 chars
Date type: absolute

Pair 4:
Entity: LYMPHOCYTES (4748-4759)
Date: 10/09/2019 (4508-4518)
Distance: 240 chars
Date type: absolute

Pair 5:
Entity: LYMPHOCYTES (4748-4759)
Date: 10/09/2019 (4533-4543)
Distance: 215 chars
Date type: absolute


In [6]:
# Cell 6: Test Relative Date Handling
def test_relative_dates():
    """Test specific handling of relative dates in pairs"""
    
    # Get first sample
    sample = samples[0]
    
    print("Relative Date Analysis:")
    print(f"Total relative dates: {len(sample['relative_dates'])}")
    
    # Show all relative dates
    print("\nAll relative dates:")
    for rd in sample['relative_dates']:
        print(f"- {rd['value']} (Position: {rd['start']}-{rd['end']})")
    
    # Find pairs with relative dates
    pairs = get_entity_date_pairs(
        sample['entities_list'],
        sample['dates'],
        sample['relative_dates']
    )
    
    relative_pairs = [p for p in pairs if p['date_type'] == 'relative']
    print(f"\nPairs using relative dates: {len(relative_pairs)}")
    print("\nFirst 3 relative date pairs:")
    for p in relative_pairs[:3]:
        print(f"\nEntity: {p['entity_label']} ({p['entity']['start']}-{p['entity']['end']})")
        print(f"Date: {p['date']} ({p['date_info']['start']}-{p['date_info']['end']})")
        print(f"Distance: {p['distance']} chars")

# Run test; the function has no return statement, so the cell shows only
# what it prints.
test_relative_dates()

Relative Date Analysis:
Total relative dates: 5

All relative dates:
- last few month (Position: 196-210)
-  last 3 months (Position: 317-331)
-  start of 2018 (Position: 292-306)
- 2/7 (Position: 421-424)
- today (Position: 664-669)

Pairs using relative dates: 585

First 3 relative date pairs:

Entity: LYMPHOCYTES (4748-4759)
Date: last few month (196-210)
Distance: 4552 chars

Entity: LYMPHOCYTES (4748-4759)
Date:  last 3 months (317-331)
Distance: 4431 chars

Entity: LYMPHOCYTES (4748-4759)
Date:  start of 2018 (292-306)
Distance: 4456 chars
