In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import pyCLIF
from datetime import timedelta
from tableone import TableOne
import pyarrow
import waterfall
import warnings
warnings.filterwarnings('ignore')
import glob

✅ Loaded configuration from config.json


In [2]:
# Load the base cohort table; every other c2d2 element table is joined onto it below
cohort = pd.read_parquet('../output/final/c2d2_cohort.parquet')

# Key column shared by the cohort and the element tables
join_key = 'hospitalization_id'

# Collect the remaining parquet files (everything except the cohort itself).
# glob.glob returns matches in filesystem-dependent order, so sort them to keep the
# join order -- and therefore the merged column order -- identical on every run.
parquet_files = sorted(
    f for f in glob.glob('../output/final/*.parquet')
    if 'c2d2_cohort' not in os.path.basename(f)
)

# Left-join each element table so that every cohort row is retained
for file in parquet_files:
    print('joining:',file)
    df = pd.read_parquet(file)
    rows_before = len(cohort)
    cohort = cohort.merge(df, on=join_key, how='left')
    # A left join adds rows whenever the right-hand table repeats a key, which would
    # silently inflate every count in Table 1 -- surface it instead of hiding it.
    # (Pass validate='many_to_one' to merge to turn this into a hard failure.)
    if len(cohort) != rows_before:
        print(
            f'WARNING: row count changed from {rows_before} to {len(cohort)} '
            f'after joining {file}; check for duplicate {join_key} values'
        )


joining: ../output/final\c2d2_diagnostic_tests.parquet
joining: ../output/final\c2d2_interventions_crrt.parquet
joining: ../output/final\c2d2_interventions_rst.parquet
joining: ../output/final\c2d2_medications.parquet
joining: ../output/final\c2d2_objective_assessments.parquet


In [3]:
# List every column available on the merged cohort table
merged_columns = cohort.columns
print('c2d2 elements:', merged_columns)

c2d2 elements: Index(['hospitalization_id', '24hr_icu_in_dttm', 'icu_out_dttm',
       'location_category', 'hospital_id', 'icu_duration_minutes',
       '24hr_icu_out_dttm', 'prism_2hr_before_icu_admission',
       'prism_4hr_after_icu_admission', 'admission_dttm', 'discharge_dttm',
       'all_icu_stays', 'icu_los', 'hosp_los', 'icu_admit_age',
       'hosp_admit_source', 'hosp_disch_disp', 'sex', 'race', 'ethnicity',
       'icu_24hr_creatinine_min', 'icu_24hr_hemoglobin_min',
       'icu_24hr_wbc_min', 'icu_24hr_creatinine_max',
       'icu_24hr_hemoglobin_max', 'icu_24hr_wbc_max', 'icu_any_crrt',
       'icu_any_imv', 'icu_24hr_infusion_norepinephrine_max',
       'icu_24hr_heart_rate_min', 'icu_24hr_map_min', 'icu_24hr_temp_c_min',
       'icu_24hr_heart_rate_max', 'icu_24hr_map_max', 'icu_24hr_temp_c_max'],
      dtype='object')


In [None]:
# Columns of `cohort` to summarise in Table 1 (a subset of the merged table),
# listed in the order they should appear
columns = [
    # Demographics
    'icu_admit_age', 'sex', 'race', 'ethnicity',

    # Vitals
    'icu_24hr_temp_c_max', 'icu_24hr_heart_rate_max', 'icu_24hr_map_min',

    # Vasopressor infusion
    'icu_24hr_infusion_norepinephrine_max',

    # Labs
    'icu_24hr_hemoglobin_min', 'icu_24hr_creatinine_max', 'icu_24hr_wbc_max',

    # Support Devices
    'icu_any_crrt', 'icu_any_imv',

    # Outcomes
    'icu_los', 'hosp_los', 'hosp_disch_disp',
]

# Columns to treat as categorical
categorical = [
    'sex', 'race', 'ethnicity',
    'hosp_disch_disp', 'icu_any_crrt', 'icu_any_imv'
]

# Continuous columns to treat as non-normally distributed.
# icu_admit_age is left out, so it is treated as normally distributed.
# The categorical flags (icu_any_crrt, icu_any_imv) are deliberately NOT listed:
# the non-normal setting is meant for continuous variables, and listing a
# categorical column here is inconsistent with `categorical` above.
# NOTE(review): it may also mislabel those rows in the rendered table -- confirm
# against the installed tableone version.
nonnorm = [
    'icu_los', 'hosp_los',
    'icu_24hr_hemoglobin_min', 'icu_24hr_creatinine_max', 'icu_24hr_wbc_max',
    'icu_24hr_infusion_norepinephrine_max', 'icu_24hr_map_min',
    'icu_24hr_heart_rate_max', 'icu_24hr_temp_c_max',
]

# Create Table 1 over the whole cohort (no grouping variable), with the
# missing-value option switched off
table1 = TableOne(
    cohort,
    columns=columns,
    categorical=categorical,
    nonnormal=nonnorm,
    missing=False
)

# Display Table 1
table1

In [5]:
# Name the export after the span of admission years in the cohort and the site.
# .dt.year becomes float when admission_dttm contains missing values, which would
# put "2020.0"-style years in the filename; drop the gaps and cast back to int.
admission_years = cohort['admission_dttm'].dt.year.dropna().astype(int)
first_year = admission_years.min()
last_year = admission_years.max()
site_name = pyCLIF.helper['site_name']

table1.to_csv(f"../output/t1_c2d2_{first_year}_{last_year}_{site_name}.csv")