Imports

In [None]:
#imports
import sys
import os
import json
import pandas as pd

# Make the sibling ``utils`` directory (one level above the current working
# directory) importable so the project helper modules below can be imported.
utils_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'utils'))
# Only register the directory once, so re-running this cell does not stack
# duplicate entries at the front of sys.path.
if sys.path.count(utils_path) == 0:
    sys.path[:0] = [utils_path]

from general_utils import load_data, prepare_all_samples
from naive_extractor_utils import naive_extraction

Data Loading

In [None]:
# Load data
# Read the inference dataset through the project helper ``load_data``; the
# path is relative to the notebook's working directory.
dataset_path = "../data/inference_dataset.csv"
df = load_data(dataset_path)
print(f"Loaded {len(df)} records")

In [None]:
# Inspect df
# The bare last expression becomes the cell's output, giving a quick look at
# the loaded data (assumes load_data returns a pandas DataFrame, so head()
# shows its leading rows - confirm in general_utils).
df.head()

In [None]:
# Prepare all samples
# Turn the loaded data into per-document samples via the project helper.
# The inference loop further down reads the keys 'doc_id', 'entities_list',
# 'dates' and 'relative_dates' from every sample.
samples = prepare_all_samples(df)
print(f"Prepared {len(samples)} samples")
# Debug aid: uncomment the next line to inspect a single prepared sample.
#samples[2]

Naive Inference

In [None]:
# Define max distance (number of characters).
# Passed to naive_extraction as its ``max_distance`` argument in the
# prediction loop; presumably the largest entity-to-date gap the extractor
# will still link - confirm in naive_extractor_utils.
MAX_DISTANCE = 400

In [None]:
#Make predictions
# Run the naive extractor on every sample and flatten its output into one list
# of prediction records (one dict per extracted entity-date relationship).
predictions = []

for sample in samples:
    # Candidate dates handed to the extractor: absolute first, then relative.
    candidate_dates = sample['dates'] + sample['relative_dates']

    # Entity text -> (CUI, preferred name). The entity text itself stands in
    # when an entity carries no 'preferred_name'.
    entity_lookup = {}
    for ent in sample['entities_list']:
        ent_text = ent['value']
        entity_lookup[ent_text] = (ent['cui'], ent.get('preferred_name', ent_text))

    # Date text -> (date id, date type). Relative dates are written second, so
    # they replace an absolute date that has the same text.
    # NOTE(review): both lookups are keyed by text, so repeated texts within one
    # sample keep only the last entry - confirm this is acceptable.
    date_lookup = {}
    for kind, group in (('absolute', sample['dates']), ('relative', sample['relative_dates'])):
        for dt in group:
            dt_text = dt['value']
            date_lookup[dt_text] = (dt['id'], kind)

    # Run naive extraction over all candidate dates.
    extracted = naive_extraction(sample['entities_list'], candidate_dates, max_distance=MAX_DISTANCE)

    # Attach document id and lookup metadata to each extracted relationship.
    for rel in extracted:
        date_id, date_type = date_lookup[rel['date']]
        entity_cui, preferred_name = entity_lookup[rel['entity_label']]

        record = {
            'doc_id': sample['doc_id'],
            'date_id': date_id,
            'date': rel['date'],
            'date_type': date_type,
            'entity_cui': entity_cui,
            'entity_label': rel['entity_label'],
            'entity_preferred_name': preferred_name
        }
        predictions.append(record)

print(f"Total predictions: {len(predictions)}")

In [None]:
# Look at predictions (uncomment the next line to display them)
#predictions

In [None]:
# Save predictions
# Write the collected prediction records to disk as indented JSON.
output_path = '../outputs/naive_predictions.json'

# Create the output directory if it is missing (e.g. on a fresh checkout);
# without this, open() raises FileNotFoundError. exist_ok=True makes the call
# a no-op when the directory is already there, so the cell stays re-runnable.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Explicit encoding keeps the file independent of the platform's default
# locale encoding.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(predictions, f, indent=2)

print("Saved predictions to outputs/naive_predictions.json")