Imports

In [None]:
#Imports
import pandas as pd
import sys
import os
import json

# Make the project's custom helpers importable: resolve the `utils` directory
# that sits one level above the current working directory and, unless it is
# already registered, put it at the front of the module search path.
utils_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'utils'))
if sys.path.count(utils_path) == 0:
    sys.path.insert(0, utils_path)

from post_processing_utils import standardize_date, create_interactive_patient_timeline, plot_all_interactive_patient_timelines, get_patient_timeline_summary

Data Loading

In [None]:
# Read the original dataset from CSV and report how many rows were loaded
df = pd.read_csv(
    "../data/dataset_synthetic1.csv",
)
print(f"Loaded {df.shape[0]} records")

In [None]:
# Read the LLM predictions from JSON and report how many rows were loaded
predictions = pd.read_json(
    "../outputs/predictions/llm_predictions.json",
)
print(f"Loaded {predictions.shape[0]} predictions")

In [None]:
# Attach patient and document metadata to every prediction.
# Left join on doc_id: each prediction row is kept even when the original
# data has no matching document (its metadata columns are then missing).
doc_metadata = df[['patient_id', 'doc_id', 'document_timestamp']]
timeline_df = predictions.merge(doc_metadata, how='left', on='doc_id')

In [None]:
# Sanity-check the merge: count distinct patients, then preview the first rows
unique_patient_count = timeline_df['patient_id'].nunique()
print(f"there are {unique_patient_count} unique patients")
timeline_df.head()

Data Preparation

In [None]:
# Call standardize_date once per row (each row is handed to the helper) and
# store the result in a new 'standardized_date' column.
# NOTE(review): the helper's input expectations and output format live in
# post_processing_utils — confirm there.
timeline_df['standardized_date'] = timeline_df.apply(
    standardize_date,
    axis='columns',
)

In [None]:
# Select the columns used downstream, in a reader-friendly order
# (any column not listed here is dropped from timeline_df).
new_order = [
    'patient_id',
    'doc_id',
    'document_timestamp',
    'date_id',
    'date',
    'date_type',
    'standardized_date',
    'entity_id',
    'entity_label',
    'entity_preferred_name',
]
timeline_df = timeline_df[new_order]
timeline_df.head()

In [None]:
# Drop rows where patient_id is missing (e.g. no match in the left merge) or
# where standardized_date is missing (presumably how standardize_date signals
# a date it could not standardize — confirm in post_processing_utils).
# The result is rebound instead of using inplace=True, so the cell does not
# mutate a frame object that earlier cells produced.
print(f'df length before dropping {len(timeline_df)}')
timeline_df = timeline_df.dropna(subset=['patient_id', 'standardized_date'])
print(f'df length after dropping {len(timeline_df)}')

In [None]:
# Convert patient_id to integer for cleaner grouping
# NOTE(review): this conversion is currently disabled. A later cell strips '/'
# from patient_id when building file names, which suggests some ids may not be
# purely numeric — confirm before re-enabling.
#timeline_df['patient_id'] = timeline_df['patient_id'].astype(int)

In [None]:
# Collapse repeated predictions: rows that share patient, standardized date
# and entity name count as one event, and the first occurrence is retained.
dedup_keys = ['patient_id', 'standardized_date', 'entity_preferred_name']
events_before = len(timeline_df)
print(f"Before deduplication: {events_before} events")
timeline_df = timeline_df.drop_duplicates(subset=dedup_keys, keep='first')
events_after = len(timeline_df)
print(f"After deduplication: {events_after} events")

In [None]:
# Report the size of the prepared timeline table, then preview it
event_count = len(timeline_df)
patient_count = timeline_df['patient_id'].nunique()
print(f"Processed {event_count} events for {patient_count} patients")
timeline_df.head()

Generate Patient Timelines (Single Patient)

In [None]:
# Set patient id
# This value is passed to the timeline helpers as `patient_id` and is also
# embedded in the output file names of the single-patient cells below.
patient_id = 1

In [None]:
# Build the interactive timeline figure for the selected patient and display it
fig = create_interactive_patient_timeline(
    timeline_df,
    patient_id=patient_id,
)
fig.show()

In [None]:
# Save the figure as an HTML file.
# NOTE(review): assumes `fig` is a Plotly figure, where include_plotlyjs=True
# embeds the plotly.js source so the file can be opened offline — confirm.
html_dir = '../outputs/timelines'
# Create the target directory first; writing fails if it does not exist yet
# (e.g. on a fresh checkout).
os.makedirs(html_dir, exist_ok=True)
# Strip slashes from the id so it cannot introduce extra path components
# (same sanitization the all-patients JSON export applies).
safe_patient_id = str(patient_id).replace('/', '')
fig.write_html(f'{html_dir}/patient_{safe_patient_id}_timeline.html', include_plotlyjs=True)

In [None]:
# Build the timeline summary for the selected patient and pretty-print it as JSON
summary = get_patient_timeline_summary(
    timeline_df,
    patient_id=patient_id,
)
summary_json = json.dumps(summary, indent=2)
print(summary_json)

In [None]:
# Save the single-patient timeline summary to a JSON file
single_timeline_dir = '../outputs/timelines/synthetic'
# Create the target directory first; open(..., 'w') does not create missing
# parent directories.
os.makedirs(single_timeline_dir, exist_ok=True)
# Strip slashes from the id so it cannot introduce extra path components
# (same sanitization the all-patients JSON export applies).
safe_patient_id = str(patient_id).replace('/', '')
with open(f'{single_timeline_dir}/patient_{safe_patient_id}_timeline.json', 'w') as f:
    json.dump(summary, f, indent=2)

Generate Patient Timelines (All Patients)

In [None]:
# Plots for all patients
# NOTE(review): this cell is disabled. If re-enabled, it builds the figures for
# every patient and shows each one; the loop variable `fig` would also
# overwrite the single-patient `fig` created earlier in the notebook.
#figs = plot_all_interactive_patient_timelines(timeline_df)

#for fig in figs:
    #fig.show()

In [None]:
# Create JSON timelines for all patients
timelines_dir = "../outputs/timelines/synthetic/"
# Create the output directory up front; open(..., 'w') does not create missing
# parent directories, so a fresh checkout would otherwise fail on first write.
os.makedirs(timelines_dir, exist_ok=True)

# Generate and save JSON timelines for all patients
patient_ids = timeline_df['patient_id'].unique()
print(f"Creating JSON timelines for {len(patient_ids)} patients...")

saved_count = 0
# Loop-specific names are used on purpose: reusing `patient_id` / `summary`
# here would overwrite the values set in the single-patient section, so
# re-running those cells afterwards would silently act on the last patient.
for current_patient_id in patient_ids:
    patient_summary = get_patient_timeline_summary(timeline_df, patient_id=current_patient_id)
    if not patient_summary:
        # The helper returned nothing usable for this patient; no file is written
        print(f"No timeline summary for patient {current_patient_id}; skipped")
        continue

    # Sanitize patient_id for filename by removing slashes
    safe_patient_id = str(current_patient_id).replace('/', '')

    # os.path.join avoids the doubled separator that plain concatenation
    # produces with the trailing '/' in timelines_dir
    output_path = os.path.join(timelines_dir, f'patient_{safe_patient_id}_timeline.json')

    # Save timeline to JSON file
    with open(output_path, 'w') as f:
        json.dump(patient_summary, f, indent=2)
    saved_count += 1
    print(f"Saved timeline for patient {safe_patient_id}")

# Report the real number written instead of claiming every timeline was saved
print(f"Saved {saved_count} of {len(patient_ids)} patient timelines to JSON files!")