# Pipeline on Synthetic Data (Mimicking the Structure of the Real Data)

In [None]:
import pandas as pd
import numpy as np
import os
import uuid
from datetime import datetime, timedelta

def generate_synthetic_data(phenotype_name, cases_csv='yl_pheno_cases.csv', samples_per_split=200):
    """Write synthetic train/tuning/held_out prediction files for one phenotype.

    For each split a parquet file ``data/<phenotype folder>/<split>.parquet`` is
    written with the columns ``subject_id``, ``prediction_time``,
    ``boolean_value`` and ``split``.

    Args:
        phenotype_name: Value of ``cohort_name`` in the cases CSV. The folder
            name is derived from it by dropping ' Cases' and replacing spaces
            with underscores.
        cases_csv: Path to a CSV with at least ``cohort_name`` and
            ``subject_id`` columns.
        samples_per_split: Number of rows generated for every split.

    Returns:
        None. The function only writes files and prints progress.
    """
    print("--- Generating synthetic data for demonstration ---")
    data_path = 'data'
    phenotype_folder_sanitized = phenotype_name.replace(' Cases', '').replace(' ', '_')
    phenotype_path = os.path.join(data_path, phenotype_folder_sanitized)
    os.makedirs(phenotype_path, exist_ok=True)

    cases_df = pd.read_csv(cases_csv)
    phenotype_cases = cases_df[cases_df['cohort_name'] == phenotype_name]
    case_ids = phenotype_cases['subject_id'].unique()

    # Candidate pool: real case ids (so that some rows match the cohort table)
    # plus 50 random ids that will mostly have no cohort row.
    subject_ids = np.concatenate([
        np.random.choice(case_ids, size=int(len(case_ids) * 0.7), replace=True),
        np.random.randint(1000000, 20000000, size=50),
    ])

    # Sampling without replacement raises ValueError when the pool is smaller
    # than the requested sample, which happens for phenotypes with few cases.
    # Only in that situation fall back to sampling with replacement.
    sample_with_replacement = len(subject_ids) < samples_per_split

    splits = ['train', 'tuning', 'held_out']

    for split in splits:
        split_ids = np.random.choice(
            subject_ids, size=samples_per_split, replace=sample_with_replacement
        )

        # Prediction times fall within the five years before 2022-01-01.
        prediction_times = [
            datetime(2022, 1, 1) - timedelta(days=int(np.random.randint(1, 365*5)))
            for _ in range(samples_per_split)
        ]

        df = pd.DataFrame({
            'subject_id': split_ids,
            'prediction_time': prediction_times,
            'boolean_value': np.random.choice([True, False], size=samples_per_split),
            'split': split
        })

        file_path = os.path.join(phenotype_path, f'{split}.parquet')
        df.to_parquet(file_path)
        print(f"Generated synthetic file: {file_path}")

def generate_time_to_event_labels(phenotype_folder, prediction_df, cohort_info_df):
    """Attach time-to-event labels to the prediction rows of one phenotype.

    Args:
        phenotype_folder: Value of ``phenotype_folder`` selecting the cohort rows.
        prediction_df: Frame with ``subject_id`` and ``prediction_time``.
        cohort_info_df: Frame with ``subject_id``, ``phenotype_folder``,
            ``cohort_start_date`` and ``cohort_end_date`` (datetime columns).

    Returns:
        The left-merged frame with ``event_observed`` and
        ``time_to_event_hours`` added and the cohort helper columns removed.
        Prediction rows without a matching cohort row receive missing values
        for both cohort dates from the merge, so their
        ``time_to_event_hours`` is NaN.
    """
    belongs_to_phenotype = cohort_info_df['phenotype_folder'] == phenotype_folder
    cohort_rows = cohort_info_df.loc[belongs_to_phenotype].copy()

    # Left join keeps every prediction row, matched or not.
    labeled = prediction_df.merge(cohort_rows, on='subject_id', how='left')
    labeled['prediction_time'] = pd.to_datetime(labeled['prediction_time'])

    has_event = labeled['cohort_start_date'].notna()
    labeled['event_observed'] = has_event

    seconds_per_hour = 3600
    hours_to_start = (
        labeled['cohort_start_date'] - labeled['prediction_time']
    ).dt.total_seconds() / seconds_per_hour
    hours_to_end = (
        labeled['cohort_end_date'] - labeled['prediction_time']
    ).dt.total_seconds() / seconds_per_hour

    # Time to cohort start when the event was observed, otherwise time to cohort end.
    labeled['time_to_event_hours'] = np.where(has_event, hours_to_start, hours_to_end)

    helper_columns = ['cohort_name', 'cohort_start_date', 'cohort_end_date', 'phenotype_folder']
    return labeled.drop(columns=helper_columns, errors='ignore')

if __name__ == "__main__":
    # Demo driver: generate synthetic prediction files for every phenotype in
    # the cases CSV, then label each generated split and save the result.
    cases_csv_path = 'yl_pheno_cases.csv'
    if os.path.exists(cases_csv_path):
        cases_df = pd.read_csv(cases_csv_path)

        # Step 1: one set of synthetic split files per cohort name.
        for phenotype_name in cases_df['cohort_name'].unique():
            generate_synthetic_data(phenotype_name)

        # Step 2: derive the folder name used on disk and parse the cohort dates.
        folder_names = cases_df['cohort_name'].str.replace(' Cases', '', regex=False)
        cases_df['phenotype_folder'] = folder_names.str.replace(' ', '_', regex=False)
        for date_column in ('cohort_start_date', 'cohort_end_date'):
            cases_df[date_column] = pd.to_datetime(cases_df[date_column])

        data_path = 'data'
        splits = ['train', 'tuning', 'held_out']

        # Step 3: label every split file that exists for every phenotype.
        for phenotype_folder in cases_df['phenotype_folder'].unique():
            print(f"\n--- Processing phenotype: {phenotype_folder} ---")
            phenotype_dir = os.path.join(data_path, phenotype_folder)

            for split in splits:
                file_path = os.path.join(phenotype_dir, f"{split}.parquet")

                if os.path.exists(file_path):
                    print(f"  - Processing split: {split}")

                    prediction_df = pd.read_parquet(file_path)
                    labeled_df = generate_time_to_event_labels(phenotype_folder, prediction_df, cases_df)

                    print(f"    - Sample result for {split} split:")
                    preview_columns = ['subject_id', 'prediction_time', 'time_to_event_hours', 'event_observed']
                    print(labeled_df[preview_columns].head())

                    output_path = os.path.join(phenotype_dir, f"{split}_with_labels.parquet")
                    labeled_df.to_parquet(output_path)
                    print(f"    - Saved results to {output_path}")
                else:
                    print(f"Skipping {file_path} - file not found after generation.")
    else:
        print(f"Error: The file '{cases_csv_path}' was not found.")
        print("Please ensure 'yl_pheno_cases.csv' is in the same directory as the script.")



# Pipeline on Real Data

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import sys


def _parse_timestamps(values):
    '''Return *values* as datetimes, tolerating ISO-8601 strings of mixed precision.'''
    if pd.api.types.is_datetime64_any_dtype(values):
        return values
    try:
        # Recent pandas infers a single strict format from the first element, so
        # "2020-06-25 10:37:00" next to "2012-08-12 12:44:00.100010" needs this.
        return pd.to_datetime(values, format='ISO8601')
    except (ValueError, TypeError):
        # Fallback for pandas versions that do not know the 'ISO8601' format.
        return pd.to_datetime(values)


def generate_time_to_event_labels(phenotype_folder, prediction_df, cohort_info_df, last_event_df=None):
    '''
    Attach time-to-event labels to the prediction rows of one phenotype.

    phenotype_folder: task name
    cohort_info_df:
        - subject_id
        - phenotype_folder: str
        - cohort_name: str
        - cohort_start_date: datetime
        - cohort_end_date: datetime

    prediction_df:
        - subject_id
        - prediction_time: datetime
        - boolean_value: 0/1
        - split: train/tuning/held_out
    last_event_df (optional):
        - subject_id
        - last_event_time: datetime, or a string that can be parsed as one

    Returns the left-merged frame with two columns added and the cohort helper
    columns removed:
        - event_observed: True when the subject has a cohort_start_date
        - time_to_event_hours: hours from prediction_time to cohort_start_date
          when the event was observed; otherwise hours to the censoring time,
          which is cohort_end_date or, when that is missing, the subject's
          latest last_event_time

    Prints the offending rows and exits with status 1 (SystemExit) if any
    time_to_event_hours is negative or missing.
    '''
    phenotype_cases_df = cohort_info_df[cohort_info_df['phenotype_folder'] == phenotype_folder].copy()

    # Left join keeps every prediction row; rows without a cohort match get
    # missing cohort dates.
    merged_df = pd.merge(prediction_df, phenotype_cases_df, on='subject_id', how='left')

    merged_df['prediction_time'] = pd.to_datetime(merged_df['prediction_time'])

    merged_df['event_observed'] = merged_df['cohort_start_date'].notna()

    # Rows without a cohort match have no cohort_end_date either, so censor
    # them at the subject's last recorded event. The lookup is a Series rather
    # than a merge so that the output schema stays unchanged.
    censor_time = merged_df['cohort_end_date']
    if last_event_df is not None:
        last_event_times = (
            last_event_df
            .assign(last_event_time=_parse_timestamps(last_event_df['last_event_time']))
            .groupby('subject_id')['last_event_time']
            .max()  # one value per subject even if the input has duplicates
        )
        censor_time = censor_time.fillna(merged_df['subject_id'].map(last_event_times))

    merged_df['time_to_event_hours'] = np.where(
        merged_df['event_observed'],
        (merged_df['cohort_start_date'] - merged_df['prediction_time']).dt.total_seconds() / 3600,
        (censor_time - merged_df['prediction_time']).dt.total_seconds() / 3600
    )

    invalid_tte = merged_df[(merged_df['time_to_event_hours'] < 0) | (merged_df['time_to_event_hours'].isna())]
    if not invalid_tte.empty:
        print("False Labeling")
        print(merged_df)
        print(f"Warning: {len(invalid_tte)} negative or missing time to event hours found")
        print(invalid_tte)
        # Non-zero status so that a failed labeling run is not reported as success.
        sys.exit(1)

    return merged_df.drop(columns=['cohort_name', 'cohort_start_date', 'cohort_end_date', 'phenotype_folder'], errors='ignore')

if __name__ == "__main__":
    # Real-data driver: label the prediction splits of the selected phenotypes
    # and write them as tte_<split>.parquet below output_dir.
    cases_csv_path = '/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/task_labels/tte_task_labels/yl_pheno_cases.csv'
    data_path = '/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/task_labels/phenotype_sample'
    last_event_path = '/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/task_labels/tte_task_labels/last_event_times.csv'
    output_dir = '/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/task_labels/tte_task_labels'

    cases_csv_found = os.path.exists(cases_csv_path)
    if cases_csv_found and os.path.exists(data_path):
        # Cohort table: derive the on-disk folder name and parse the dates.
        cases_df = pd.read_csv(cases_csv_path)
        folder_names = cases_df['cohort_name'].str.replace(' Cases', '', regex=False)
        cases_df['phenotype_folder'] = folder_names.str.replace(' ', '_', regex=False)
        for date_column in ('cohort_start_date', 'cohort_end_date'):
            cases_df[date_column] = pd.to_datetime(cases_df[date_column])

        last_event_df = pd.read_csv(last_event_path)

        # NOTE: processing is currently pinned to CLL. The earlier variant listed
        # every sub-directory of data_path except "AMI" and "Ischemic_Stroke":
        # unique_phenotype_folders = [d for d in os.listdir(data_path) if (os.path.isdir(os.path.join(data_path, d)) and d not in ["AMI","Ischemic_Stroke"] ) ]
        unique_phenotype_folders = ["CLL"]
        splits = ['train', 'tuning', 'held_out']

        for phenotype_folder in unique_phenotype_folders:
            print(f"\n--- Processing phenotype: {phenotype_folder} ---")
            phenotype_input_dir = os.path.join(data_path, phenotype_folder)
            phenotype_output_dir = os.path.join(output_dir, phenotype_folder)

            for split in splits:
                file_path = os.path.join(phenotype_input_dir, f"{split}.parquet")

                if os.path.exists(file_path):
                    print(f"  - Processing split: {split}")

                    prediction_df = pd.read_parquet(file_path)
                    labeled_df = generate_time_to_event_labels(phenotype_folder, prediction_df, cases_df, last_event_df)

                    # The output folder is created only once there is something to write.
                    os.makedirs(phenotype_output_dir, exist_ok=True)
                    output_path = os.path.join(phenotype_output_dir, f"tte_{split}.parquet")
                    labeled_df.to_parquet(output_path)
                    print(f"    - Saved results to {output_path}")
                else:
                    print(f"Skipping {file_path} - file not found.")
    elif not cases_csv_found:
        print(f"Error: The file '{cases_csv_path}' was not found.")
        print("Please ensure 'yl_pheno_cases.csv' is in the same directory as the script.")
    else:
        print(f"Error: The folder '{data_path}' was not found.")
        print("Please ensure the root 'data' directory with phenotype subfolders exists.")



In [None]:
import meds_reader
import csv

# Location of the meds_reader database that the rest of this cell reads from.
meds_reader_path = "/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/post_transform_meds_reader"
# Open the subject database; `database.map(...)` is used on it further down.
database = meds_reader.SubjectDatabase(meds_reader_path)

def extract_last(subjects):
    """Return ``(subject_id, time)`` pairs, one per subject.

    ``time`` is the ``time`` attribute of the last element of
    ``subject.events``, or an empty string when the subject has no events.
    """
    last_times = []
    for subject in subjects:
        subject_id = subject.subject_id
        if subject.events:
            last_time = subject.events[-1].time
        else:
            last_time = ""
        last_times.append((subject_id, last_time))
    return last_times

# Apply extract_last across the database; each element of `results` is the
# list of (subject_id, last_event_time) pairs returned by one call.
results = database.map(extract_last)

# Flatten the per-call lists into a single CSV with a header row.
with open("last_event_times.csv", "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["subject_id", "last_event_time"])
    csv_writer.writerows(
        [subj_id, last_time]
        for batch in results
        for subj_id, last_time in batch
    )


In [5]:
import meds_reader
from collections import Counter

# Open the subject database
meds_reader_path = "/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/post_transform_meds_reader"
database = meds_reader.SubjectDatabase(meds_reader_path)

last_event_codes = []

# Walk over every subject id and look at the final entry of its event list
for subject_id in database:
    subject = database[subject_id]
    if not subject.events:
        continue  # subject without events: nothing to record

    last_event = subject.events[-1]
    if last_event.code != "CMS Place of Service/12":
        last_event_codes.append(last_event.code)
        continue

    # Subjects ending on "CMS Place of Service/12" are dumped in full for
    # inspection and are left out of the tally.
    print(subject_id)
    for event in subject.events:
        print(event)
    print()

# Tally the collected codes and keep the ten most frequent
code_counts = Counter(last_event_codes)
top_10 = code_counts.most_common(10)

print("Top 10 last event codes:")
for event_code, n_subjects in top_10:
    print(f"{event_code}: {n_subjects}")


12
Event(time=1963-05-10 00:00:00, code=Ethnicity/Not Hispanic, ...)
Event(time=1963-05-10 00:00:00, code=Gender/F, ...)
Event(time=1963-05-10 00:00:00, code=MEDS_BIRTH, ...)
Event(time=1963-05-10 00:00:00, code=Race/3, ...)
Event(time=2012-08-12 12:44:00.100000, code=Visit/OP, ...)
Event(time=2012-08-12 12:44:00.100010, code=PCORNet/Generic-NI, ...)
Event(time=2020-06-25 00:00:00, code=CPT4/73140, ...)
Event(time=2020-06-25 00:00:00, code=ICD10CM/M19.042, ...)
Event(time=2020-06-25 00:00:00, code=ICD10CM/S69.92XA, ...)
Event(time=2020-06-25 10:06:00.100020, code=ICD10CM/S69.92XA, ...)
Event(time=2020-06-25 10:06:00.100020, code=Visit/ER, ...)
Event(time=2020-06-25 10:37:00, code=LOINC/8310-5//UCUM/[degF], ...)
Event(time=2020-06-25 10:37:00, code=LOINC/8462-4//UCUM/mm[Hg], ...)
Event(time=2020-06-25 10:37:00, code=LOINC/8480-6//UCUM/mm[Hg], ...)
Event(time=2020-06-25 10:37:00, code=LOINC/8867-4//UCUM//min, ...)
Event(time=2020-06-25 10:37:00, code=LOINC/9279-1//UCUM//min, ...)
Event(t

KeyboardInterrupt: 

In [None]:
'''
Top 10 last event codes:
Race/Unknown: 1078408
PCORNet/Generic-NI: 673133
LOINC/63503-7: 569830
Visit/OP: 439523
CMS Place of Service/12: 232716
MEDS_DEATH: 211509
SNOMED/365981007: 198533
Race/UNK: 177163
ICD10CM/L82.00: 80669
ICD10CM/D48.50: 76641
'''


Event(time=2015-12-27 00:00:00, code=Ethnicity/Unknown, ...)
Event(time=2015-12-27 00:00:00, code=Gender/M, ...)
Event(time=2015-12-27 00:00:00, code=MEDS_BIRTH, ...)
Event(time=2015-12-27 00:00:00, code=Race/Unknown, ...)
Event(time=2016-01-12 00:00:00, code=CPT4/93000, ...)
Event(time=2016-01-12 00:00:00, code=ICD10CM/R01.10, ...)
Event(time=2016-01-12 09:26:50.453000, code=ICD10CM/R01.1, ...)
Event(time=2016-01-12 12:00:00, code=Visit/OP, ...)
Event(time=2016-01-12 12:36:30.063000, code=SNOMED/365932005//UCUM/%, ...)
Event(time=2016-01-12 12:36:30.070000, code=LOINC/29463-7//UCUM/[oz_av], ...)
Event(time=2016-01-12 12:50:56.300000, code=LOINC/18686-6, ...)
Event(time=2016-01-12 12:50:56.307000, code=LOINC/8867-4, ...)
Event(time=2016-01-12 12:50:56.310000, code=LOINC/8480-6//UCUM/mm[Hg], ...)
Event(time=2016-01-12 12:50:56.317000, code=LOINC/8462-4//UCUM/mm[Hg], ...)
Event(time=2016-01-12 12:51:20.290000, code=LOINC/42588-4, ...)
Event(time=2016-01-12 12:51:20.290000, code=SNOMED/40