Imports

In [1]:
#Imports
import pandas as pd
import sys
import os
import json

# Make the sibling ../utils directory (relative to the current working directory)
# importable so the custom helper module below can be found.
utils_relative = os.path.join(os.getcwd(), '..', 'utils')
utils_path = os.path.abspath(utils_relative)
if sys.path.count(utils_path) == 0:
    # Prepend so this directory is searched before the rest of sys.path.
    sys.path.insert(0, utils_path)

from post_processing_utils import standardize_date, create_interactive_patient_timeline, plot_all_interactive_patient_timelines, get_patient_timeline_summary

Data Loading

In [2]:
# Read the original records table from the data directory and report its size
df = pd.read_csv("../data/data.csv")
record_count = len(df)
print(f"Loaded {record_count} records")

Loaded 101 records


In [3]:
# Load the LLM predictions.
# convert_dates=False: by default read_json tries to auto-parse columns whose
# name looks date-like, which includes the column literally named 'date'.
# That column holds the raw date text (e.g. "30nd Jun 2024", "2 years ago")
# that standardize_date interprets in a later cell, so it must stay as
# untouched strings rather than being silently converted whenever every value
# happens to be parseable.
predictions = pd.read_json("../outputs/llm_predictions.json", convert_dates=False)
print(f"Loaded {len(predictions)} predictions")

Loaded 629 predictions


In [4]:
# Attach patient_id and document_timestamp to every prediction via doc_id.
# A left join keeps all prediction rows; predictions whose doc_id has no match
# in df end up with missing patient_id / document_timestamp.
doc_columns = ['patient_id', 'doc_id', 'document_timestamp']
doc_metadata = df[doc_columns]
timeline_df = pd.merge(left=predictions, right=doc_metadata, how='left', on='doc_id')

In [5]:
# Preview the first five rows of the merged frame
timeline_df.head(n=5)

Unnamed: 0,doc_id,date_id,date,date_type,entity_id,entity_label,entity_preferred_name,patient_id,document_timestamp
0,0,abs_1,30nd Jun 2024,absolute,ent_1,Ultrasound,Ultrasound,1,01/11/2024
1,0,abs_2,02nd Aug 2024,absolute,ent_8,asthma,Asthma,1,01/11/2024
2,0,abs_3,12nd Sep 2024,absolute,ent_10,mass,Mass,1,01/11/2024
3,0,abs_3,12nd Sep 2024,absolute,ent_11,brain,Disorder of brain,1,01/11/2024
4,0,abs_3,12nd Sep 2024,absolute,ent_26,pituitary_adenoma,Pituitary adenoma,1,01/11/2024


Data Preparation

In [6]:
# Pass every row to standardize_date (row-wise apply) and store the result
# in a new 'standardized_date' column.
standardized_dates = timeline_df.apply(standardize_date, axis='columns')
timeline_df['standardized_date'] = standardized_dates

In [7]:
# Re-order columns: document/patient identifiers first, then the date fields,
# then the entity fields.
new_order = [
    'patient_id', 'doc_id', 'document_timestamp',
    'date_id', 'date', 'date_type', 'standardized_date',
    'entity_id', 'entity_label', 'entity_preferred_name',
]
timeline_df = timeline_df[new_order]

In [8]:
# Drop rows where patient_id is missing or the date could not be standardized.
# Reassign the result instead of using inplace=True: the cell then produces a
# new frame from its input rather than mutating an object in place, which is
# the recommended pandas style and gives the same resulting rows.
timeline_df = timeline_df.dropna(subset=['patient_id', 'standardized_date'])

In [9]:
# Cast patient_id to integer so grouping keys and displayed ids are whole numbers
patient_ids = timeline_df['patient_id']
timeline_df['patient_id'] = patient_ids.astype(int)

In [10]:
# Report how many events remain after cleaning, then preview the first rows
event_count = len(timeline_df)
print(f"Processed {event_count} events.")
timeline_df.head(n=5)

Processed 614 events.


Unnamed: 0,patient_id,doc_id,document_timestamp,date_id,date,date_type,standardized_date,entity_id,entity_label,entity_preferred_name
0,1,0,01/11/2024,abs_1,30nd Jun 2024,absolute,2024-06-30,ent_1,Ultrasound,Ultrasound
1,1,0,01/11/2024,abs_2,02nd Aug 2024,absolute,2024-08-02,ent_8,asthma,Asthma
2,1,0,01/11/2024,abs_3,12nd Sep 2024,absolute,2024-09-12,ent_10,mass,Mass
3,1,0,01/11/2024,abs_3,12nd Sep 2024,absolute,2024-09-12,ent_11,brain,Disorder of brain
4,1,0,01/11/2024,abs_3,12nd Sep 2024,absolute,2024-09-12,ent_26,pituitary_adenoma,Pituitary adenoma


Generate Patient Timelines (Single Patient)

In [11]:
# Patient whose timeline is plotted, summarized and saved in the cells below
patient_id: int = 1

In [12]:
# Build the interactive timeline figure for the selected patient and render it
fig = create_interactive_patient_timeline(
    timeline_df,
    patient_id=patient_id,
)
fig.show()

In [13]:
# Fetch the selected patient's timeline summary and pretty-print it as JSON
summary = get_patient_timeline_summary(
    timeline_df,
    patient_id=patient_id,
)
summary_json = json.dumps(summary, indent=2)
print(summary_json)

{
  "patient_id": 1,
  "total_events": 75,
  "date_range": {
    "start": "2022-01-11",
    "end": "2025-11-28"
  },
  "events": [
    {
      "date": "2022-01-11",
      "event": "Acromegaly",
      "date_type": "relative",
      "original_date": "2 years ago"
    },
    {
      "date": "2023-01-11",
      "event": "Pituitary adenoma",
      "date_type": "relative",
      "original_date": "last year"
    },
    {
      "date": "2024-01-04",
      "event": "Headache",
      "date_type": "relative",
      "original_date": "last week"
    },
    {
      "date": "2024-03-11",
      "event": "COPD",
      "date_type": "absolute",
      "original_date": "03-11-2024"
    },
    {
      "date": "2024-04-11",
      "event": "Ibuprofen",
      "date_type": "absolute",
      "original_date": "04/11/2024"
    },
    {
      "date": "2024-05-10",
      "event": "Parkinson's disease",
      "date_type": "absolute",
      "original_date": "05/10/2024"
    },
    {
      "date": "2024-06-11",
      "

In [14]:
# Write the selected patient's summary to the outputs directory as indented JSON
timeline_path = f'../outputs/patient_{patient_id}_timeline.json'
with open(timeline_path, 'w') as timeline_file:
    json.dump(summary, timeline_file, indent=2)

Generate Patient Timelines (All Patients)

In [15]:
# For all patients: build the timeline figures in one call, then render each
# returned figure in order.
# NOTE(review): presumably one figure per patient_id in timeline_df — confirm
# against plot_all_interactive_patient_timelines in post_processing_utils.
figs = plot_all_interactive_patient_timelines(timeline_df)
for fig in figs:  # rebinds the notebook-level name `fig` set in the single-patient cell
    fig.show()

Created 10 timeline plots
