In [None]:
#Imports
import pandas as pd
import sys
import os
import json

# Make the sibling ``utils`` directory importable. The path is resolved
# relative to the current working directory and is prepended to ``sys.path``
# only when absent, so re-running this cell never adds a duplicate entry.
working_dir = os.getcwd()
utils_path = os.path.abspath(os.path.join(working_dir, '..', 'utils'))
if sys.path.count(utils_path) == 0:
    sys.path.insert(0, utils_path)

from date_extractor_utils import clean_value, extract_absolute_dates, normalise_relative, extract_relative_dates, add_relative_dates
from general_utils import load_data
from naive_extractor_utils import naive_extraction

Test Absolute Dates

In [None]:
# Smoke test: a single sentence holding one numeric date and one
# month-name date, run through the absolute-date extractor.
test_text = (
    "Patient was seen on 15/06/2025 for follow-up. "
    "Next appointment scheduled for January 4th, 2026."
)
absolute_dates = extract_absolute_dates(test_text)

print("\nSimple test:", f"Text: {test_text}", "Results:", sep="\n")
for found in absolute_dates:
    # Each match is indexed by 'value', 'start' and 'end'.
    value, start, end = found['value'], found['start'], found['end']
    print(f"  '{value}' -> (start: {start}, end: {end})")

In [None]:
# Test on a multi-line note mixing several absolute date formats.
# The fixture is assembled line by line so that the whitespace-only separator
# lines (three spaces each) are explicit rather than invisible in the source.
test_text = "\n".join([
    "",
    "Various date formats:",
    "1. Standard formats:",
    "   - 15/06/2025",
    "   - 2025-06-15",
    "   - 15-06-2025",
    "   ",
    "2. Month name formats:",
    "   - June 15, 2025",
    "   - 15 June 2025",
    "   - Jun 15, 2025",
    "   ",
    "3. Mixed in text:",
    "   The patient was seen on 15/06/2025 and had a follow-up on June 15, 2025.",
    "   Next appointment scheduled for January 4th, 2026.",
    "",
])

print("\nComprehensive test:", f"Text: {test_text}", sep="\n")
absolute_dates = extract_absolute_dates(test_text)
print("\nResults:")
for found in absolute_dates:
    value, start, end = found['value'], found['start'], found['end']
    print(f"  '{value}' -> (start: {start}, end: {end})")

In [None]:
# Degenerate inputs for the absolute-date extractor: empty text, text without
# dates, out-of-range day/month numbers, and dates missing a component.
edge_cases = [
    "",                                        # Empty string
    "No dates here",                           # No dates
    "Invalid dates: 35/13/2025, 00/00/0000",   # Invalid dates
    "Partial dates: June 2025, 2025",          # Partial dates
]

print("\nTesting edge cases:")
for text in edge_cases:
    dates = extract_absolute_dates(text)
    print(f"\nText: '{text}'", f"Found {len(dates)} dates:", sep="\n")
    for found in dates:
        value, start, end = found['value'], found['start'], found['end']
        print(f"  '{value}' -> (start: {start}, end: {end})")

In [None]:
# Run the absolute-date extractor on the first note of the raw CSV.
print("\nTesting on dataset sample:")
df = pd.read_csv("../data/data.csv")

first_row = df.iloc[0]
sample_text = first_row['note_text']
text_preview = sample_text[:200]  # keep the echoed note short
print(f"Text: {text_preview}...")

dates = extract_absolute_dates(sample_text)
print(f"\nFound {len(dates)} dates:")
for found in dates:
    value, start, end = found['value'], found['start'], found['end']
    print(f"  '{value}' -> (start: {start}, end: {end})")

Test Relative Dates

In [None]:
# Single-sentence check of the relative-date extractor; the cell's last
# expression displays the raw extractor output.
test_text = (
    "Patient was seen last week for follow-up. "
    "Next appointment scheduled for tomorrow. "
    "Symptoms started 3 days ago."
)
relative_dates = extract_relative_dates(test_text)
relative_dates

In [None]:
# Test on a note that exercises many relative-date phrasings at once.
# The fixture is assembled line by line so that the trailing spaces on the
# first two sentences are explicit rather than invisible in the source.
test_text = "\n".join([
    "",
    "Patient was seen last week for follow-up. ",
    "Next appointment scheduled for tomorrow. ",
    "Symptoms started 3 days ago.",
    "Last visit was on Monday.",
    "Previous checkup was 2 weeks earlier.",
    "Past few days have been difficult.",
    "Several months ago the condition worsened.",
    "Earlier this week the patient improved.",
    "Last visit was productive.",
    "Next few days will be critical.",
    "",
])

print("Testing comprehensive relative date extraction:")
print(f"Text: {test_text.strip()}")
print("\nResults:")

results = extract_relative_dates(test_text)
for match in results:
    # Each match is indexed by 'value' and 'pattern_type'.
    value, pattern = match['value'], match['pattern_type']
    print(f"  '{value}' -> (pattern: {pattern})")

total_patterns = len(results)
print(f"\nTotal patterns found: {total_patterns}")

In [None]:
# Read the synthetic training notes through the shared ``load_data`` helper
# and report the frame's shape.
df = load_data("../data/training_dataset_synthetic2.csv")

dataset_shape = df.shape
print(f"Main dataset: {dataset_shape}")

In [None]:
# Run ``add_relative_dates`` over the dataset.
# NOTE(review): later cells read a 'relative_dates_json' column holding a JSON
# string per row, presumably created here - confirm against the helper.
# This rebinds ``df``; re-running the cell re-applies the helper to its output.
df = add_relative_dates(df)

print("Added relative_dates_json column")
final_shape = df.shape
print(f"Final dataset shape: {final_shape}")

In [None]:
# Boolean mask of rows whose serialized relative-date list is not the empty
# JSON array; later cells reuse ``has_relative_dates`` to filter ``df``.
relative_json = df['relative_dates_json']
has_relative_dates = relative_json.apply(lambda payload: payload != '[]')

rows_with_any = has_relative_dates.sum()
print(f"Rows with relative dates: {rows_with_any}")

# Total matches across the dataset: decode every row and add up the lengths.
total_found = 0
for payload in relative_json:
    total_found += len(json.loads(payload))
print(f"Total relative dates found: {total_found}")

In [None]:
# Keep only the rows for which at least one relative date was recorded.
rows_with_relative_dates = df[has_relative_dates]
total_rows = len(rows_with_relative_dates)

print(f"\nExamining {total_rows} rows with relative dates:")

# Show detailed results for the first 5 rows only, to keep the output short.
preview_rows = rows_with_relative_dates.head(5)
for position, (idx, row) in enumerate(preview_rows.iterrows(), start=1):
    print(f"\n--- Row {position} (Index {idx}) ---")
    note_preview = row['note_text'][:200]
    print(f"Text: {note_preview}...")

    # Decode the stored JSON and list every match with its pattern type.
    relative_dates = json.loads(row['relative_dates_json'])
    print(f"Found {len(relative_dates)} relative dates:")
    for rd in relative_dates:
        value, pattern = rd['value'], rd['pattern_type']
        print(f"  '{value}' -> (pattern: {pattern})")

# Summarise whatever was left out of the preview.
remaining = total_rows - 5
if remaining > 0:
    print(f"\n... and {remaining} more rows with relative dates")

In [None]:
# Use the first note that has relative dates as the worked example for the
# naive extractor cells below.
row_with_relative = df[has_relative_dates].iloc[0]
print("Testing naive extractor with relative dates:")
note_preview = row_with_relative['note_text'][:200]
print(f"Text: {note_preview}...")

# Entities and absolute dates are read as stored on the row; only the relative
# dates are decoded from JSON here.
# NOTE(review): this assumes `load_data` already parsed 'entities_json' and
# 'dates_json' into lists of dicts - confirm.
entities = row_with_relative['entities_json']
absolute_dates = row_with_relative['dates_json']
relative_dates = json.loads(row_with_relative['relative_dates_json'])

# (heading, items, how many to list); None lists every item.
sections = (
    ("Entities", entities, 3),
    ("Absolute dates", absolute_dates, 3),
    ("Relative dates", relative_dates, None),
)
for heading, items, limit in sections:
    print(f"\n{heading}: {len(items)}")
    shown = items if limit is None else items[:limit]
    for rank, item in enumerate(shown, start=1):
        print(f"  {rank}. {item['value']} (pos: {item['start']})")

In [None]:
# Link entities to absolute dates only.
# NOTE(review): max_distance=400 is presumably in the same units as the
# 'start' offsets (characters) - confirm in naive_extractor_utils.
print("=== NAIVE EXTRACTION WITH ABSOLUTE DATES ONLY ===")
relationships_absolute = naive_extraction(entities, absolute_dates, max_distance=400)

n_found = len(relationships_absolute)
print(f"Found {n_found} relationships:")
for rel in relationships_absolute:
    label, linked_date, gap = rel['entity_label'], rel['date'], rel['distance']
    print(f"  {label} -> {linked_date} (distance: {gap})")

In [None]:
# Link entities to relative dates only, using the same distance cap as the
# absolute-date run so the two outputs can be compared.
print("=== NAIVE EXTRACTION WITH RELATIVE DATES ONLY ===")
relationships_relative = naive_extraction(entities, relative_dates, max_distance=400)

n_found = len(relationships_relative)
print(f"Found {n_found} relationships:")
for rel in relationships_relative:
    label, linked_date, gap = rel['entity_label'], rel['date'], rel['distance']
    print(f"  {label} -> {linked_date} (distance: {gap})")

In [None]:
# Link entities against the concatenation of absolute and relative dates.
# ``all_dates`` is reused by the distance-analysis cell that follows.
print("=== NAIVE EXTRACTION WITH COMBINED DATES ===")
all_dates = absolute_dates + relative_dates
n_absolute, n_relative = len(absolute_dates), len(relative_dates)
print(f"Total dates: {len(all_dates)} (absolute: {n_absolute}, relative: {n_relative})")

relationships_combined = naive_extraction(entities, all_dates, max_distance=400)
n_found = len(relationships_combined)
print(f"Found {n_found} relationships:")
for rel in relationships_combined:
    label, linked_date, gap = rel['entity_label'], rel['date'], rel['distance']
    print(f"  {label} -> {linked_date} (distance: {gap})")

In [None]:
# Debug aid: for every entity, print the absolute difference between its
# 'start' offset and the 'start' offset of every date in ``all_dates``.
print("=== ENTITY-DATE DISTANCE ANALYSIS ===")
for entity in entities:
    entity_value = entity['value']
    entity_pos = entity['start']
    print(f"\nEntity: {entity_value} (pos: {entity_pos})")
    print("Distances to dates:")

    for date in all_dates:
        distance = abs(entity_pos - date['start'])
        # Membership is by dict equality: a date counts as absolute when an
        # equal dict is present in ``absolute_dates``.
        if date in absolute_dates:
            date_type = "absolute"
        else:
            date_type = "relative"
        print(f"  {date['value']} ({date_type}): distance = {distance}")

Test Against Labelled Training Data

In [None]:
# Read the labelled training notes through the shared ``load_data`` helper.
# This rebinds ``df``, replacing the synthetic dataset loaded earlier.
df = load_data("../data/training_dataset.csv")

dataset_shape = df.shape
print(f"Main dataset: {dataset_shape}")

In [None]:
# --- Initialize counters ---
# One (tp, fp, fn) tuple per note; the metrics cell sums them across notes.
abs_results = []
rel_results = []


def _as_annotation_list(raw):
    """Coerce a gold-annotation cell into a list of annotation items.

    Both gold columns go through this helper so they are handled identically.

    Args:
        raw: Cell value; may already be a list/tuple, a JSON-encoded string,
            or a missing value (None / NaN).

    Returns:
        The decoded sequence, or ``[]`` when the value is missing, is not
        valid JSON, or does not decode to a list.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            return []
    # NaN is truthy and not iterable, so `value or []` is not a safe default.
    return raw if isinstance(raw, (list, tuple)) else []


# --- Loop through each note ---
for _, row in df.iterrows():
    text = row['note_text']

    # Decode both gold columns the same way. A still-serialized 'dates_json'
    # must not be iterated character by character, which would empty the gold
    # set and count every prediction as a false positive.
    validated_abs = _as_annotation_list(row.get('dates_json'))
    validated_rel = _as_annotation_list(row.get('relative_dates_json'))

    # --- Normalise gold values ---
    gold_abs_values = {clean_value(d['value']) for d in validated_abs if isinstance(d, dict) and d.get('value')}
    gold_rel_values = {normalise_relative(d['value']) for d in validated_rel if isinstance(d, dict) and d.get('value')}

    # --- Generate predictions ---
    pred_abs = extract_absolute_dates(text)
    pred_rel = extract_relative_dates(text)

    # --- Normalise predicted values ---
    pred_abs_values = {clean_value(p['value']) for p in pred_abs if isinstance(p, dict) and p.get('value')}
    pred_rel_values = {normalise_relative(p['value']) for p in pred_rel if isinstance(p, dict) and p.get('value')}

    # --- Compare sets using same normalisation ---
    # Set-based comparison: a value repeated within one note counts once.
    tp_abs = len(pred_abs_values & gold_abs_values)
    fp_abs = len(pred_abs_values - gold_abs_values)
    fn_abs = len(gold_abs_values - pred_abs_values)

    tp_rel = len(pred_rel_values & gold_rel_values)
    fp_rel = len(pred_rel_values - gold_rel_values)
    fn_rel = len(gold_rel_values - pred_rel_values)

    abs_results.append((tp_abs, fp_abs, fn_abs))
    rel_results.append((tp_rel, fp_rel, fn_rel))

In [None]:
# --- Compute metrics ---
def compute_metrics(results):
    """Pool per-note counts and derive precision, recall and F1 from the totals.

    Args:
        results: Sequence of records indexable as ``(tp, fp, fn)``.

    Returns:
        Tuple ``(precision, recall, f1, tp, fp, fn)``. A ratio whose
        denominator is not positive is reported as ``0``.
    """
    # Sum column 0, then 1, then 2 across all records.
    tp, fp, fn = (sum(record[i] for record in results) for i in range(3))

    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted > 0 else 0
    recall = tp / actual if actual > 0 else 0

    ratio_sum = precision + recall
    if ratio_sum > 0:
        f1 = 2 * precision * recall / ratio_sum
    else:
        f1 = 0
    return precision, recall, f1, tp, fp, fn

In [None]:
# --- Print results ---
# Aggregate the per-note counts for each date type before printing anything.
abs_precision, abs_recall, abs_f1, tp_abs, fp_abs, fn_abs = compute_metrics(abs_results)
rel_precision, rel_recall, rel_f1, tp_rel, fp_rel, fn_rel = compute_metrics(rel_results)

# (banner, raw counts, derived scores) for each date type, in print order.
report = (
    ("=== Absolute Dates ===", (tp_abs, fp_abs, fn_abs), (abs_precision, abs_recall, abs_f1)),
    ("\n=== Relative Dates ===", (tp_rel, fp_rel, fn_rel), (rel_precision, rel_recall, rel_f1)),
)
for banner, (tp, fp, fn), (precision, recall, f1) in report:
    print(banner)
    print(f"TP={tp}, FP={fp}, FN={fn}")
    print(f"Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}")

In [None]:
# --- Debug absolute dates (first 10 rows) ---
# Side-by-side view of gold vs. predicted absolute dates, compared on the raw
# (whitespace-stripped) strings rather than on normalised values.
for _, row in df.head(10).iterrows():
    text = row['note_text']

    # Load gold absolute dates safely: decode JSON strings, and fall back to
    # an empty list for missing (None / NaN) or otherwise non-list values so
    # the comprehension below never iterates a non-iterable.
    abs_data = row.get('dates_json')
    if isinstance(abs_data, str):
        try:
            abs_data = json.loads(abs_data)
        except json.JSONDecodeError:
            abs_data = []
    if not isinstance(abs_data, list):
        abs_data = []

    # Extract validated and predicted values; entries whose 'value' is not a
    # string are skipped instead of failing on `.strip()`.
    gold_abs = {
        d['value'].strip()
        for d in abs_data
        if isinstance(d, dict) and isinstance(d.get('value'), str)
    }
    pred_abs = {
        p['value'].strip()
        for p in extract_absolute_dates(text)
        if isinstance(p, dict) and isinstance(p.get('value'), str)
    }

    # Only show rows where something exists
    if gold_abs or pred_abs:
        print(f"\nDoc {row.get('doc_id', 'N/A')}")
        print("Validated absolute:", gold_abs)
        print("Predicted absolute:", pred_abs)
        print("Overlap:", gold_abs & pred_abs)
        print("-" * 80)

In [None]:
# --- Debug relative date extraction ---
# For the first 10 notes, compare gold and predicted relative dates both as
# raw strings and after ``normalise_relative``.
for _, row in df.head(10).iterrows():
    text = row['note_text']

    # Gold annotations: keep lists as they are, decode JSON strings, and
    # treat anything else (or undecodable JSON) as "no annotations".
    rel_data = row.get('relative_dates_json')
    if not isinstance(rel_data, (str, list)):
        rel_data = []
    elif isinstance(rel_data, str):
        try:
            rel_data = json.loads(rel_data)
        except json.JSONDecodeError:
            rel_data = []

    # --- Raw sets ---
    gold_raw = {d['value'] for d in rel_data if isinstance(d, dict) and 'value' in d}
    pred_raw = {p['value'] for p in extract_relative_dates(text)}

    # --- Skip docs with nothing to inspect ---
    if not (gold_raw or pred_raw):
        continue

    # --- Normalised sets ---
    gold_norm = set(map(normalise_relative, gold_raw))
    pred_norm = set(map(normalise_relative, pred_raw))

    print(f"\nDoc {row.get('doc_id', 'N/A')}")
    print("Validated relative (raw):", gold_raw)
    print("Predicted relative (raw):", pred_raw)
    print("Overlap (raw):", gold_raw & pred_raw)
    print("Overlap (normalised):", gold_norm & pred_norm)