# benchmark load times using pandas vs. PyHealth

## pandas

In [17]:
import pandas as pd
import time

path_to_massive_lab_events_table = "/srv/local/data/MIMIC-IV/2.0/hosp/labevents.csv"
path_to_massive_admissions_table = "/srv/local/data/MIMIC-IV/2.0/hosp/admissions.csv"
path_to_massive_diagnoses_table = "/srv/local/data/MIMIC-IV/2.0/hosp/diagnoses_icd.csv"
path_to_massive_procedures_table = "/srv/local/data/MIMIC-IV/2.0/hosp/procedures_icd.csv"
path_to_massive_prescriptions_table = "/srv/local/data/MIMIC-IV/2.0/hosp/prescriptions.csv"
path_to_massive_patients_table = "/srv/local/data/MIMIC-IV/2.0/hosp/patients.csv"

# Time the file reading
start_time = time.perf_counter()
lab_df = pd.read_csv(path_to_massive_lab_events_table)
adm_df = pd.read_csv(path_to_massive_admissions_table)
diag_df = pd.read_csv(path_to_massive_diagnoses_table)
proc_df = pd.read_csv(path_to_massive_procedures_table)
presc_df = pd.read_csv(path_to_massive_prescriptions_table)
pat_df = pd.read_csv(path_to_massive_patients_table)
end_time = time.perf_counter()
read_time = end_time - start_time

print(f"Read time: {read_time:.2f} seconds")
print(f"Dataset shape: {lab_df.shape}")

  presc_df = pd.read_csv(path_to_massive_prescriptions_table)


Read time: 151.06 seconds
Dataset shape: (124342638, 15)


## pyhealth loading labevents

In [18]:
from pyhealth.datasets import MIMIC4Dataset
start_time = time.perf_counter()
root = "/srv/local/data/MIMIC-IV/2.0"
dataset = MIMIC4Dataset(ehr_root=root, ehr_tables=["diagnoses_icd", "procedures_icd", "prescriptions", "labevents"])
end_time = time.perf_counter()
read_time = end_time - start_time
print(f"Read time: {read_time:.2f} seconds")

Memory usage Starting MIMIC4Dataset init: 76030.8 MB
Initializing MIMIC4EHRDataset with tables: ['diagnoses_icd', 'procedures_icd', 'prescriptions', 'labevents'] (dev mode: False)
Using default EHR config: /home/johnwu3/projects/PyHealth_Branch_Testing/PyHealth/pyhealth/datasets/configs/mimic4_ehr.yaml
Memory usage Before initializing mimic4_ehr: 76030.8 MB
Initializing mimic4_ehr dataset from /srv/local/data/MIMIC-IV/2.0 (dev mode: False)
Scanning table: diagnoses_icd from /srv/local/data/MIMIC-IV/2.0/hosp/diagnoses_icd.csv.gz
Original path does not exist. Using alternative: /srv/local/data/MIMIC-IV/2.0/hosp/diagnoses_icd.csv
Joining with table: /srv/local/data/MIMIC-IV/2.0/hosp/admissions.csv.gz
Original path does not exist. Using alternative: /srv/local/data/MIMIC-IV/2.0/hosp/admissions.csv
Scanning table: procedures_icd from /srv/local/data/MIMIC-IV/2.0/hosp/procedures_icd.csv.gz
Original path does not exist. Using alternative: /srv/local/data/MIMIC-IV/2.0/hosp/procedures_icd.csv
J

### When you actually do any processing: ~11.5 seconds

In [6]:
print(f"Dataset shape: {dataset.stats()}")

Collecting global event dataframe...
Collected dataframe with shape: (28021083, 36)
Dataset: mimic3
Dev mode: False
Number of patients: 46520
Number of events: 28021083
Dataset shape: None


# benchmarking task processing time
## pandas dataframe processing 

In [22]:
import pandas as pd
import time
from datetime import datetime

def benchmark_mimic_processing(adm_df, diag_df, proc_df, presc_df, patients_df, n=1000):
    """
    Benchmark MIMIC-IV data processing for readmission prediction task.
    
    Args:
        adm_df: Admissions dataframe
        diag_df: Diagnoses dataframe  
        proc_df: Procedures dataframe
        presc_df: Prescriptions dataframe
        patients_df: Patients dataframe
        n: Number of patients to process (None = all patients)
    
    Returns:
        DataFrame with processed samples
    """
    print("Starting MIMIC-IV processing benchmark...")
    start_total = time.perf_counter()
    
    # Get patients to process
    if n is None:
        patients_to_process = patients_df['subject_id'].tolist()
        print(f"Processing all {len(patients_to_process)} patients...")
    else:
        patients_to_process = patients_df['subject_id'].head(n).tolist()
        print(f"Processing first {len(patients_to_process)} patients...")
    
    # Filter all dataframes to selected patients
    start_filter = time.perf_counter()
    
    adm_filtered = adm_df[adm_df['subject_id'].isin(patients_to_process)].copy()
    diag_filtered = diag_df[diag_df['subject_id'].isin(patients_to_process)].copy()
    proc_filtered = proc_df[proc_df['subject_id'].isin(patients_to_process)].copy()
    presc_filtered = presc_df[presc_df['subject_id'].isin(patients_to_process)].copy()
    patients_filtered = patients_df[patients_df['subject_id'].isin(patients_to_process)].copy()
    
    end_filter = time.perf_counter()
    print(f"Data filtering time: {end_filter - start_filter:.2f} seconds")
    
    # Convert datetime columns
    start_datetime = time.perf_counter()
    
    adm_filtered['admittime'] = pd.to_datetime(adm_filtered['admittime'])
    adm_filtered['dischtime'] = pd.to_datetime(adm_filtered['dischtime'])
    
    end_datetime = time.perf_counter()
    print(f"Datetime conversion time: {end_datetime - start_datetime:.2f} seconds")
    
    # Process each patient (mimicking the pyhealth task logic)
    start_processing = time.perf_counter()
    
    samples = []
    processed_patients = 0
    
    for subject_id in patients_to_process:
        # Get patient demographics
        patient_demo = patients_filtered[patients_filtered['subject_id'] == subject_id]
        if len(patient_demo) == 0:
            continue
            
        # Skip if under 18 (mimicking the age filter from pyhealth code)
        anchor_age = patient_demo.iloc[0]['anchor_age']
        if anchor_age < 18:
            continue
            
        # Get admissions for this patient, sorted by admit time
        patient_admissions = adm_filtered[
            adm_filtered['subject_id'] == subject_id
        ].sort_values('admittime')
        
        for i, (_, admission) in enumerate(patient_admissions.iterrows()):
            hadm_id = admission['hadm_id']
            admit_time = admission['admittime']
            discharge_time = admission['dischtime']
            
            if pd.isna(discharge_time):
                continue
                
            # Calculate admission duration (skip if <= 12 hours)
            duration_hours = (discharge_time - admit_time).total_seconds() / 3600
            if duration_hours <= 12:
                continue
            
            # Check for readmission within 30 days
            readmission = 0
            if i < len(patient_admissions) - 1:
                next_admission = patient_admissions.iloc[i + 1]
                next_admit_time = next_admission['admittime']
                time_diff_hours = (next_admit_time - discharge_time).total_seconds() / 3600
                
                if time_diff_hours <= 3:  # Skip if too close
                    continue
                    
                readmission = 1 if time_diff_hours < 30 * 24 else 0
            
            # Get events for this admission
            admission_diagnoses = diag_filtered[
                diag_filtered['hadm_id'] == hadm_id
            ]
            admission_procedures = proc_filtered[
                proc_filtered['hadm_id'] == hadm_id
            ]
            admission_prescriptions = presc_filtered[
                presc_filtered['hadm_id'] == hadm_id
            ]
            
            # Create condition codes (mimicking the pyhealth format)
            conditions = []
            if len(admission_diagnoses) > 0:
                conditions = [
                    f"{row['icd_version']}_{row['icd_code']}" 
                    for _, row in admission_diagnoses.iterrows()
                    if pd.notna(row['icd_code'])
                ]
            
            # Create procedure codes
            procedures = []
            if len(admission_procedures) > 0:
                procedures = [
                    f"{row['icd_version']}_{row['icd_code']}" 
                    for _, row in admission_procedures.iterrows()
                    if pd.notna(row['icd_code'])
                ]
            
            # Create drug codes
            drugs = []
            if len(admission_prescriptions) > 0:
                drugs = [
                    f"{row['drug']}" 
                    for _, row in admission_prescriptions.iterrows()
                    if pd.notna(row['drug'])
                ]
            
            # Skip if any category is empty (mimicking pyhealth logic)
            if len(conditions) == 0 or len(procedures) == 0 or len(drugs) == 0:
                continue
            
            samples.append({
                'patient_id': subject_id,
                'admission_id': hadm_id,
                'conditions': conditions,
                'procedures': procedures,
                'drugs': drugs,
                'readmission': readmission,
                'num_conditions': len(conditions),
                'num_procedures': len(procedures),
                'num_drugs': len(drugs)
            })
        
        processed_patients += 1
        if processed_patients % 100 == 0:
            print(f"Processed {processed_patients} patients...")
    
    end_processing = time.perf_counter()
    
    # Create results dataframe
    results_df = pd.DataFrame(samples)
    
    print("="*60)
    print("BENCHMARK RESULTS")
    print("="*60)
    print(f"Data filtering time: {end_filter - start_filter:.2f} seconds")
    print(f"Datetime conversion time: {end_datetime - start_datetime:.2f} seconds")
    print(f"Processing time: {end_processing - start_processing:.2f} seconds")
    print(f"Total time: {end_processing - start_total:.2f} seconds")
    print(f"Processed patients: {processed_patients}")
    print(f"Valid admissions: {len(results_df)}")
    
    if len(results_df) > 0:
        print(f"Readmission rate: {results_df['readmission'].mean():.3f}")
        print(f"Avg conditions per admission: {results_df['num_conditions'].mean():.1f}")
        print(f"Avg procedures per admission: {results_df['num_procedures'].mean():.1f}")
        print(f"Avg drugs per admission: {results_df['num_drugs'].mean():.1f}")
    
    return results_df


results = benchmark_mimic_processing(adm_df, diag_df, proc_df, presc_df, pat_df, n=None)

Starting MIMIC-IV processing benchmark...
Processing all 315460 patients...
Data filtering time: 4.05 seconds
Datetime conversion time: 0.15 seconds
Processed 100 patients...
Processed 200 patients...
Processed 300 patients...
Processed 400 patients...
Processed 500 patients...
Processed 600 patients...
Processed 700 patients...
Processed 800 patients...
Processed 900 patients...
Processed 1000 patients...
Processed 1100 patients...
Processed 1200 patients...
Processed 1300 patients...
Processed 1400 patients...
Processed 1500 patients...
Processed 1600 patients...
Processed 1700 patients...
Processed 1800 patients...
Processed 1900 patients...
Processed 2000 patients...
Processed 2100 patients...
Processed 2200 patients...
Processed 2300 patients...
Processed 2400 patients...
Processed 2500 patients...
Processed 2600 patients...
Processed 2700 patients...
Processed 2800 patients...
Processed 2900 patients...
Processed 3000 patients...
Processed 3100 patients...
Processed 3200 patients

## set_task() operations after reading multiple labevents

In [23]:
from pyhealth.tasks import InHospitalMortalityMIMIC4
start_time = time.perf_counter()
task = InHospitalMortalityMIMIC4()
sample_dataset = dataset.set_task(task, num_workers=4)
end_time = time.perf_counter()
set_task_time = end_time - start_time
print(f"Set task time: {set_task_time:.2f} seconds")


Setting task InHospitalMortalityMIMIC4 for mimic4 base dataset...
Collecting global event dataframe...
Collected dataframe with shape: (147119785, 44)
Generating samples with 4 worker(s)...
Generating samples for InHospitalMortalityMIMIC4 with 4 workers


Collecting samples for InHospitalMortalityMIMIC4 from 4 workers: 100%|██████████| 315460/315460 [17:49<00:00, 294.87it/s]


Label mortality vocab: {0: 0, 1: 1}


Processing samples: 100%|██████████| 236532/236532 [02:07<00:00, 1861.04it/s]

Generated 236532 samples for task InHospitalMortalityMIMIC4





Set task time: 2236.55 seconds


### Cleaned versions for line counting comparisons:

In [None]:
import pandas as pd
import time
from datetime import datetime
import pandas as pd
import time

path_to_massive_lab_events_table = "/srv/local/data/MIMIC-IV/2.0/hosp/labevents.csv"
path_to_massive_admissions_table = "/srv/local/data/MIMIC-IV/2.0/hosp/admissions.csv"
path_to_massive_diagnoses_table = "/srv/local/data/MIMIC-IV/2.0/hosp/diagnoses_icd.csv"
path_to_massive_procedures_table = "/srv/local/data/MIMIC-IV/2.0/hosp/procedures_icd.csv"
path_to_massive_prescriptions_table = "/srv/local/data/MIMIC-IV/2.0/hosp/prescriptions.csv"
path_to_massive_patients_table = "/srv/local/data/MIMIC-IV/2.0/hosp/patients.csv"

lab_df = pd.read_csv(path_to_massive_lab_events_table)
adm_df = pd.read_csv(path_to_massive_admissions_table)
diag_df = pd.read_csv(path_to_massive_diagnoses_table)
proc_df = pd.read_csv(path_to_massive_procedures_table)
presc_df = pd.read_csv(path_to_massive_prescriptions_table)
pat_df = pd.read_csv(path_to_massive_patients_table)

def benchmark_mimic_processing(adm_df, diag_df, proc_df, presc_df, patients_df, n=1000):
    """
    Benchmark MIMIC-IV data processing for readmission prediction task.
    
    Args:
        adm_df: Admissions dataframe
        diag_df: Diagnoses dataframe  
        proc_df: Procedures dataframe
        presc_df: Prescriptions dataframe
        patients_df: Patients dataframe
        n: Number of patients to process (None = all patients)
    
    Returns:
        DataFrame with processed samples
    """
    print("Starting MIMIC-IV processing benchmark...")
    start_total = time.perf_counter()
    
    # Get patients to process
    if n is None:
        patients_to_process = patients_df['subject_id'].tolist()
        print(f"Processing all {len(patients_to_process)} patients...")
    else:
        patients_to_process = patients_df['subject_id'].head(n).tolist()
        print(f"Processing first {len(patients_to_process)} patients...")
    
    adm_filtered = adm_df[adm_df['subject_id'].isin(patients_to_process)].copy()
    diag_filtered = diag_df[diag_df['subject_id'].isin(patients_to_process)].copy()
    proc_filtered = proc_df[proc_df['subject_id'].isin(patients_to_process)].copy()
    presc_filtered = presc_df[presc_df['subject_id'].isin(patients_to_process)].copy()
    patients_filtered = patients_df[patients_df['subject_id'].isin(patients_to_process)].copy()
    
    adm_filtered['admittime'] = pd.to_datetime(adm_filtered['admittime'])
    adm_filtered['dischtime'] = pd.to_datetime(adm_filtered['dischtime'])
    
    # Process each patient (mimicking the pyhealth task logic)
    start_processing = time.perf_counter()
    
    samples = []
    processed_patients = 0
    
    for subject_id in patients_to_process:
        # Get patient demographics
        patient_demo = patients_filtered[patients_filtered['subject_id'] == subject_id]
        if len(patient_demo) == 0:
            continue
            
        # Skip if under 18 (mimicking the age filter from pyhealth code)
        anchor_age = patient_demo.iloc[0]['anchor_age']
        if anchor_age < 18:
            continue
            
        # Get admissions for this patient, sorted by admit time
        patient_admissions = adm_filtered[
            adm_filtered['subject_id'] == subject_id
        ].sort_values('admittime')
        
        for i, (_, admission) in enumerate(patient_admissions.iterrows()):
            hadm_id = admission['hadm_id']
            admit_time = admission['admittime']
            discharge_time = admission['dischtime']
            
            if pd.isna(discharge_time):
                continue
                
            # Calculate admission duration (skip if <= 12 hours)
            duration_hours = (discharge_time - admit_time).total_seconds() / 3600
            if duration_hours <= 12:
                continue
            
            # Check for readmission within 30 days
            readmission = 0
            if i < len(patient_admissions) - 1:
                next_admission = patient_admissions.iloc[i + 1]
                next_admit_time = next_admission['admittime']
                time_diff_hours = (next_admit_time - discharge_time).total_seconds() / 3600
                
                if time_diff_hours <= 3:  # Skip if too close
                    continue
                    
                readmission = 1 if time_diff_hours < 30 * 24 else 0
            
            # Get events for this admission
            admission_diagnoses = diag_filtered[
                diag_filtered['hadm_id'] == hadm_id
            ]
            admission_procedures = proc_filtered[
                proc_filtered['hadm_id'] == hadm_id
            ]
            admission_prescriptions = presc_filtered[
                presc_filtered['hadm_id'] == hadm_id
            ]
            
            # Create condition codes (mimicking the pyhealth format)
            conditions = []
            if len(admission_diagnoses) > 0:
                conditions = [
                    f"{row['icd_version']}_{row['icd_code']}" 
                    for _, row in admission_diagnoses.iterrows()
                    if pd.notna(row['icd_code'])
                ]
            
            # Create procedure codes
            procedures = []
            if len(admission_procedures) > 0:
                procedures = [
                    f"{row['icd_version']}_{row['icd_code']}" 
                    for _, row in admission_procedures.iterrows()
                    if pd.notna(row['icd_code'])
                ]
            
            # Create drug codes
            drugs = []
            if len(admission_prescriptions) > 0:
                drugs = [
                    f"{row['drug']}" 
                    for _, row in admission_prescriptions.iterrows()
                    if pd.notna(row['drug'])
                ]
            
            # Skip if any category is empty (mimicking pyhealth logic)
            if len(conditions) == 0 or len(procedures) == 0 or len(drugs) == 0:
                continue
            
            samples.append({
                'patient_id': subject_id,
                'admission_id': hadm_id,
                'conditions': conditions,
                'procedures': procedures,
                'drugs': drugs,
                'readmission': readmission,
                'num_conditions': len(conditions),
                'num_procedures': len(procedures),
                'num_drugs': len(drugs)
            })
        
        processed_patients += 1
        if processed_patients % 100 == 0:
            print(f"Processed {processed_patients} patients...")
    
    # Create results dataframe
    results_df = pd.DataFrame(samples)

    return results_df


results = benchmark_mimic_processing(adm_df, diag_df, proc_df, presc_df, pat_df, n=None)

In [None]:
from pyhealth.tasks import InHospitalMortalityMIMIC4
from pyhealth.datasets import MIMIC4Dataset
root = "/srv/local/data/MIMIC-IV/2.0"
dataset = MIMIC4Dataset(ehr_root=root, ehr_tables=["diagnoses_icd", "procedures_icd", "prescriptions", "labevents"])
task = InHospitalMortalityMIMIC4()
sample_dataset = dataset.set_task(task, num_workers=4)

