In [3]:
#Imports
import pandas as pd
import sys
import os
import json

# Make the sibling ../utils directory importable.
# abspath() resolves a relative path against the current working directory,
# so the notebook must be started from its own folder for this to point at utils.
utils_path = os.path.abspath(os.path.join('..', 'utils'))
already_on_path = utils_path in sys.path
if not already_on_path:
    # Prepend so these modules win over any same-named installed package.
    sys.path.insert(0, utils_path)

from date_extractor import extract_relative_dates, add_relative_dates, extract_absolute_dates
from naive_extractor import naive_extraction
from utils import load_data

Test Absolute Dates

In [4]:
# Smoke test: one numeric date and one month-name date in a single sentence
test_text = "Patient was seen on 15/06/2025 for follow-up. Next appointment scheduled for January 4th, 2026."
absolute_dates = extract_absolute_dates(test_text)

print("\nSimple test:")
print(f"Text: {test_text}")
print("Results:")
# Each match is a dict exposing 'value', 'start' and 'end'
for match in absolute_dates:
    value, start, end = match['value'], match['start'], match['end']
    print(f"  '{value}' -> (start: {start}, end: {end})")


Simple test:
Text: Patient was seen on 15/06/2025 for follow-up. Next appointment scheduled for January 4th, 2026.
Results:
  'on 15/06/2025' -> (start: 16, end: 31)
  'January 4th, 2026' -> (start: 76, end: 95)


In [5]:
# Test on comprehensive examples
# Fixture covering numeric (slash/dash/ISO), month-name, and in-sentence dates.
# The string is whitespace-sensitive: the start/end offsets printed below are
# character positions into this exact literal (including the leading newline).
# NOTE(review): the recorded run of this cell reports only two matches
# ('on 15/06/2025' and 'January 4th, 2026'), so the bulleted formats and
# 'June 15, 2025' appear to be missed, and the first match includes the
# preceding "on " - confirm against extract_absolute_dates.
test_text = """
Various date formats:
1. Standard formats:
   - 15/06/2025
   - 2025-06-15
   - 15-06-2025
   
2. Month name formats:
   - June 15, 2025
   - 15 June 2025
   - Jun 15, 2025
   
3. Mixed in text:
   The patient was seen on 15/06/2025 and had a follow-up on June 15, 2025.
   Next appointment scheduled for January 4th, 2026.
"""

# Echo the fixture, then list every absolute date found with its character span
print("\nComprehensive test:")
print(f"Text: {test_text}")
absolute_dates = extract_absolute_dates(test_text)
print("\nResults:")
for match in absolute_dates:
    value, start, end = match['value'], match['start'], match['end']
    print(f"  '{value}' -> (start: {start}, end: {end})")


Comprehensive test:
Text: 
Various date formats:
1. Standard formats:
   - 15/06/2025
   - 2025-06-15
   - 15-06-2025
   
2. Month name formats:
   - June 15, 2025
   - 15 June 2025
   - Jun 15, 2025
   
3. Mixed in text:
   The patient was seen on 15/06/2025 and had a follow-up on June 15, 2025.
   Next appointment scheduled for January 4th, 2026.


Results:
  'on 15/06/2025' -> (start: 219, end: 234)
  'January 4th, 2026' -> (start: 305, end: 325)


In [6]:
# Inputs that should not crash the extractor: empty, date-free, invalid and partial dates
edge_cases = [
    "",  # Empty string
    "No dates here",  # No dates
    "Invalid dates: 35/13/2025, 00/00/0000",  # Invalid dates
    "Partial dates: June 2025, 2025",  # Partial dates
]

print("\nTesting edge cases:")
for case_text in edge_cases:
    found = extract_absolute_dates(case_text)
    print(f"\nText: '{case_text}'")
    print(f"Found {len(found)} dates:")
    for match in found:
        value, start, end = match['value'], match['start'], match['end']
        print(f"  '{value}' -> (start: {start}, end: {end})")


Testing edge cases:

Text: ''
Found 0 dates:

Text: 'No dates here'
Found 0 dates:

Text: 'Invalid dates: 35/13/2025, 00/00/0000'
Found 0 dates:

Text: 'Partial dates: June 2025, 2025'
Found 0 dates:


In [8]:
# Run the absolute-date extractor on the first note of the original dataset
print("\nTesting on dataset sample:")
df = pd.read_csv("../data/original.csv")
sample_text = df.iloc[0]['text']
# Only a 200-character preview is printed; extraction runs on the full note
print(f"Text: {sample_text[:200]}...")
dates = extract_absolute_dates(sample_text)
print(f"\nFound {len(dates)} dates:")
for match in dates:
    value, start, end = match['value'], match['start'], match['end']
    print(f"  '{value}' -> (start: {start}, end: {end})")


Testing on dataset sample:
Text: Ultrasound (30nd Jun 2024): no significant findings.imp: asthma

She denies any nausea, vomiting, or diarrhea.
C Patient reports compliance with current medication regimen. Basic metabolic panel withi...

Found 10 dates:
  '30nd Jun 2024' -> (start: 12, end: 25)
  '140' -> (start: 227, end: 233)
  '4.2' -> (start: 242, end: 248)
  '02nd Aug 2024' -> (start: 312, end: 325)
  '12nd Sep 2024' -> (start: 363, end: 376)
  '3.1' -> (start: 384, end: 388)
  '16 Sep' -> (start: 443, end: 449)
  '23rd Oct 2024' -> (start: 588, end: 601)
  '16st Nov 2024' -> (start: 1205, end: 1218)
  '17.12.24' -> (start: 1286, end: 1294)


Test Relative Dates

In [4]:
# Test on one example
# One sentence containing three relative expressions ("last week", "tomorrow", "3 days ago").
# NOTE(review): in the recorded output the results are not ordered by text position
# ('3 days ago' at 100 is listed before 'tomorrow' at 73) - presumably grouped by
# pattern type; confirm before relying on list order downstream.
test_text = "Patient was seen last week for follow-up. Next appointment scheduled for tomorrow. Symptoms started 3 days ago."
relative_dates = extract_relative_dates(test_text)
# Bare last expression so the notebook renders the list of match dicts
relative_dates

[{'id': 'rel_1',
  'value': 'last week',
  'start': 17,
  'end': 26,
  'pattern_type': 'time_unit'},
 {'id': 'rel_2',
  'value': '3 days ago',
  'start': 100,
  'end': 110,
  'pattern_type': 'ago'},
 {'id': 'rel_3',
  'value': 'tomorrow',
  'start': 73,
  'end': 81,
  'pattern_type': 'common'}]

In [6]:
# Test on comprehensive examples to verify all pattern types
# One relative expression per line; the literal is kept exactly as written
# (including trailing spaces) since match offsets are positions into it.
# NOTE(review): the recorded run of this cell reports 6 matches. 'on Monday',
# 'Past few days', 'Several months ago' and 'Next few days' do not appear in the
# results, and 'this week' is reported both alone and inside 'Earlier this week' -
# confirm whether that is intended.
test_text = """
Patient was seen last week for follow-up. 
Next appointment scheduled for tomorrow. 
Symptoms started 3 days ago.
Last visit was on Monday.
Previous checkup was 2 weeks earlier.
Past few days have been difficult.
Several months ago the condition worsened.
Earlier this week the patient improved.
Last visit was productive.
Next few days will be critical.
"""

# Echo the (stripped) fixture, then list each match with the pattern that produced it
print("Testing comprehensive relative date extraction:")
print(f"Text: {test_text.strip()}")
print("\nResults:")

results = extract_relative_dates(test_text)
for match in results:
    value, pattern = match['value'], match['pattern_type']
    print(f"  '{value}' -> (pattern: {pattern})")

print(f"\nTotal patterns found: {len(results)}")

Testing comprehensive relative date extraction:
Text: Patient was seen last week for follow-up. 
Next appointment scheduled for tomorrow. 
Symptoms started 3 days ago.
Last visit was on Monday.
Previous checkup was 2 weeks earlier.
Past few days have been difficult.
Several months ago the condition worsened.
Earlier this week the patient improved.
Last visit was productive.
Next few days will be critical.

Results:
  'last week' -> (pattern: time_unit)
  'this week' -> (pattern: time_unit)
  '3 days ago' -> (pattern: ago)
  '2 weeks earlier' -> (pattern: earlier)
  'tomorrow' -> (pattern: common)
  'Earlier this week' -> (pattern: earlier_period)

Total patterns found: 6


In [7]:
# Load the synthetic notes through the project's load_data helper
df = load_data("../data/synthetic_new.csv")

# Shape is printed as the raw (rows, columns) tuple
print("Main dataset: {}".format(df.shape))

Main dataset: (101, 4)


In [8]:
# Add relative dates using the simplified function
df = add_relative_dates(df)

# Fail fast with a readable message: the following cells index this column directly.
assert 'relative_dates_json' in df.columns, \
    "add_relative_dates did not produce a 'relative_dates_json' column"

print("Added relative_dates_json column")
print(f"Final dataset shape: {df.shape}")

Added relative_dates_json column
Final dataset shape: (101, 5)


In [9]:
# Count how many rows have relative dates
# Decode each row's JSON once and work from the number of entries, so the mask
# does not depend on the empty list being serialised as exactly '[]'.
relative_date_counts = df['relative_dates_json'].apply(
    lambda payload: len(json.loads(payload))
).astype(int)

# Boolean mask aligned with df; reused by later cells to select the matching rows.
has_relative_dates = relative_date_counts > 0
print(f"Rows with relative dates: {has_relative_dates.sum()}")
print(f"Total relative dates found: {relative_date_counts.sum()}")

Rows with relative dates: 1
Total relative dates found: 3


In [10]:
# Get rows that have relative dates
rows_with_relative_dates = df[has_relative_dates]
total_rows = len(rows_with_relative_dates)

print(f"\nExamining {total_rows} rows with relative dates:")

# Detailed listing is capped to avoid too much output
max_rows_shown = 5
preview = rows_with_relative_dates.head(max_rows_shown)

for position, (idx, row) in enumerate(preview.iterrows(), start=1):
    print(f"\n--- Row {position} (Index {idx}) ---")
    print(f"Text: {row['note_text'][:200]}...")

    # Decode the JSON column and list each match with its pattern type
    relative_dates = json.loads(row['relative_dates_json'])
    print(f"Found {len(relative_dates)} relative dates:")
    for rd in relative_dates:
        print(f"  '{rd['value']}' -> (pattern: {rd['pattern_type']})")

# Mention how many matching rows were left out of the preview, if any
remaining = total_rows - max_rows_shown
if remaining > 0:
    print(f"\n... and {remaining} more rows with relative dates")


Examining 1 rows with relative dates:

--- Row 1 (Index 100) ---
Text: CLINIC VISIT (15/06/2025): Patient was Current medications include: lisinopril 10mg daily, metformin 500mg twice daily, and atorvastatin 20mg at bedtime. Cardiovascular: Regular rate and rhythm. Skin:...
Found 3 relative dates:
  'last year' -> (pattern: time_unit)
  'last week' -> (pattern: time_unit)
  '2 years ago' -> (pattern: ago)


In [11]:
# Get the row with relative dates
rows_with_relative = df[has_relative_dates]
# Without this guard an empty selection surfaces as an opaque IndexError from .iloc[0].
assert not rows_with_relative.empty, \
    "No rows with relative dates - nothing to feed to the naive extractor"
row_with_relative = rows_with_relative.iloc[0]
print("Testing naive extractor with relative dates:")
print(f"Text: {row_with_relative['note_text'][:200]}...")

# Get entities and dates
# entities_json / dates_json are used as already-decoded lists of dicts below
# (presumably decoded by load_data - verify); only relative_dates_json is
# decoded here.
entities = row_with_relative['entities_json']
absolute_dates = row_with_relative['dates_json']
relative_dates = json.loads(row_with_relative['relative_dates_json'])

print(f"\nEntities: {len(entities)}")
for rank, entity in enumerate(entities[:3], start=1):  # Show first 3
    print(f"  {rank}. {entity['value']} (pos: {entity['start']})")

print(f"\nAbsolute dates: {len(absolute_dates)}")
for rank, date in enumerate(absolute_dates[:3], start=1):  # Show first 3
    print(f"  {rank}. {date['value']} (pos: {date['start']})")

print(f"\nRelative dates: {len(relative_dates)}")
for rank, date in enumerate(relative_dates, start=1):
    print(f"  {rank}. {date['value']} (pos: {date['start']})")

Testing naive extractor with relative dates:
Text: CLINIC VISIT (15/06/2025): Patient was Current medications include: lisinopril 10mg daily, metformin 500mg twice daily, and atorvastatin 20mg at bedtime. Cardiovascular: Regular rate and rhythm. Skin:...

Entities: 25
  1. Skin problem (pos: 195)
  2. Eruption of skin (pos: 204)
  3. Congenital malformation (pos: 274)

Absolute dates: 1
  1. 15/06/2025 (pos: 14)

Relative dates: 3
  1. last year (pos: 796)
  2. last week (pos: 1368)
  3. 2 years ago (pos: 1176)


In [12]:
# Pair entities with absolute dates only
print("=== NAIVE EXTRACTION WITH ABSOLUTE DATES ONLY ===")
relationships_absolute = naive_extraction(entities, absolute_dates)
print(f"Found {len(relationships_absolute)} relationships:")
# Each relationship dict exposes 'entity_label', 'date' and 'distance'
for link in relationships_absolute:
    label, when, gap = link['entity_label'], link['date'], link['distance']
    print(f"  {label} -> {when} (distance: {gap})")

=== NAIVE EXTRACTION WITH ABSOLUTE DATES ONLY ===
Found 2 relationships:
  Skin problem -> 15/06/2025 (distance: 181)
  Eruption of skin -> 15/06/2025 (distance: 190)


In [None]:
# Pair entities with relative dates only
print("=== NAIVE EXTRACTION WITH RELATIVE DATES ONLY ===")
relationships_relative = naive_extraction(entities, relative_dates)
print(f"Found {len(relationships_relative)} relationships:")
# Each relationship dict exposes 'entity_label', 'date' and 'distance'
for link in relationships_relative:
    label, when, gap = link['entity_label'], link['date'], link['distance']
    print(f"  {label} -> {when} (distance: {gap})")

=== NAIVE EXTRACTION WITH RELATIVE DATES ONLY ===
Found 16 relationships:
  Follow-up (wait and see) -> last year (distance: 142)
  Computed tomography of head -> last year (distance: 97)
  Congenital malformation -> last year (distance: 49)
  Pituitary adenoma -> last year (distance: 18)
  Physical examination procedure -> last year (distance: 11)
  Soft tissue lesion -> last year (distance: 31)
  Inguinal lymphadenopathy -> last year (distance: 100)
  Gastrointestinal hemorrhage -> last year (distance: 168)
  Normal bowel sounds -> last year (distance: 172)
  Hepatosplenomegaly -> 2 years ago (distance: 161)
  Pituitary adenoma -> 2 years ago (distance: 135)
  Acromegaly -> 2 years ago (distance: 11)
  Dietary finding -> last week (distance: 95)
  Visual symptoms -> last week (distance: 60)
  Headache -> last week (distance: 18)
  Normal vital signs -> last week (distance: 66)


In [14]:
# Pair entities with the union of absolute and relative dates
print("=== NAIVE EXTRACTION WITH COMBINED DATES ===")
# Absolute dates come first, followed by the relative ones
all_dates = absolute_dates + relative_dates
n_absolute, n_relative = len(absolute_dates), len(relative_dates)
print(f"Total dates: {len(all_dates)} (absolute: {n_absolute}, relative: {n_relative})")

relationships_combined = naive_extraction(entities, all_dates)
print(f"Found {len(relationships_combined)} relationships:")
for link in relationships_combined:
    label, when, gap = link['entity_label'], link['date'], link['distance']
    print(f"  {label} -> {when} (distance: {gap})")

=== NAIVE EXTRACTION WITH COMBINED DATES ===
Total dates: 4 (absolute: 1, relative: 3)
Found 18 relationships:
  Skin problem -> 15/06/2025 (distance: 181)
  Eruption of skin -> 15/06/2025 (distance: 190)
  Follow-up (wait and see) -> last year (distance: 142)
  Computed tomography of head -> last year (distance: 97)
  Congenital malformation -> last year (distance: 49)
  Pituitary adenoma -> last year (distance: 18)
  Physical examination procedure -> last year (distance: 11)
  Soft tissue lesion -> last year (distance: 31)
  Inguinal lymphadenopathy -> last year (distance: 100)
  Gastrointestinal hemorrhage -> last year (distance: 168)
  Normal bowel sounds -> last year (distance: 172)
  Hepatosplenomegaly -> 2 years ago (distance: 161)
  Pituitary adenoma -> 2 years ago (distance: 135)
  Acromegaly -> 2 years ago (distance: 11)
  Dietary finding -> last week (distance: 95)
  Visual symptoms -> last week (distance: 60)
  Headache -> last week (distance: 18)
  Normal vital signs -> la

In [15]:
# Debug - show all entity-date distances
print("=== ENTITY-DATE DISTANCE ANALYSIS ===")

# Label every date by the list it came from, once, up front. This replaces a
# per-pair `date in absolute_dates` dict-equality scan, which would also mislabel a
# relative date that happens to compare equal to an absolute one. Order matches
# absolute_dates + relative_dates, so the cell no longer needs `all_dates`
# from the previous cell.
labelled_dates = (
    [(date, "absolute") for date in absolute_dates]
    + [(date, "relative") for date in relative_dates]
)

for entity in entities:
    print(f"\nEntity: {entity['value']} (pos: {entity['start']})")
    print("Distances to dates:")

    for date, date_type in labelled_dates:
        # Distance is measured between start offsets (character positions)
        distance = abs(entity['start'] - date['start'])
        print(f"  {date['value']} ({date_type}): distance = {distance}")

=== ENTITY-DATE DISTANCE ANALYSIS ===

Entity: Skin problem (pos: 195)
Distances to dates:
  15/06/2025 (absolute): distance = 181
  last year (relative): distance = 601
  last week (relative): distance = 1173
  2 years ago (relative): distance = 981

Entity: Eruption of skin (pos: 204)
Distances to dates:
  15/06/2025 (absolute): distance = 190
  last year (relative): distance = 592
  last week (relative): distance = 1164
  2 years ago (relative): distance = 972

Entity: Congenital malformation (pos: 274)
Distances to dates:
  15/06/2025 (absolute): distance = 260
  last year (relative): distance = 522
  last week (relative): distance = 1094
  2 years ago (relative): distance = 902

Entity: Murmur (pos: 316)
Distances to dates:
  15/06/2025 (absolute): distance = 302
  last year (relative): distance = 480
  last week (relative): distance = 1052
  2 years ago (relative): distance = 860

Entity: Pericardial friction rub (pos: 325)
Distances to dates:
  15/06/2025 (absolute): distance = 