In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import pyCLIF
from datetime import timedelta
import pyarrow
import waterfall
import warnings
# Silence every Python warning for the rest of the session. This keeps cell output clean,
# but it also hides deprecation and dtype warnings raised by later cells.
warnings.filterwarnings('ignore')

✅ Loaded configuration from config.json


In [2]:
# Input locations, relative to the notebook's working directory.
mapping_workbook_path = '../mapping/ccm-53-e1045-s002.xlsx'
cohort_parquet_path = '../output/final/c2d2_cohort.parquet'

# CLIF-to-C2D2 mapping workbook (first sheet, pandas defaults).
clif_c2d2_mapping = pd.read_excel(mapping_workbook_path)
# Cohort table; later cells read hospitalization_id and the 24hr ICU window columns from it.
cohort = pd.read_parquet(cohort_parquet_path)

## 🗺️ CLIF labs to C2D2 Mapping

In [3]:
# Keep only the mapping rows that describe diagnostic tests sourced from the CLIF Labs table.
is_diagnostic_test = clif_c2d2_mapping['Domain'] == 'Diagnostic tests'
is_labs_table = clif_c2d2_mapping['CLIF Table'] == 'Labs'
mapper = clif_c2d2_mapping[is_diagnostic_test & is_labs_table]

# Preview the first rows of the filtered mapping.
mapper.head()

Unnamed: 0,Domain,Sub-domain,Concept,Common Data Element,CLIF Table,CLIF mCIDE Crosswalk column,CLIF version,mCIDE,Definition,Coding,...,SOFA,PSOFA,nSOFA,PRISM III,PIM3,Charlson CI,MRC-ICU,SOI count,No SOI score,Potential EHR Datasource
88,Diagnostic tests,Hematology,Hematocrit - high,icu_24hr_hct_max,Labs,lab_category,CLIF 2.2 (Upcoming release),TBD,Patient's hematocrit (highest during first 24 ...,[continuous] \nUnits: percent %,...,,,,,,,,1,,Lab
89,Diagnostic tests,Hematology,Hemoglobin - high,icu_24hr_hgb_max,Labs,lab_category,CLIF-2.1.0 (Live),hemoglobin,Patient's hemoglobin level (highest) captured ...,[continuous] \nUnits: g/dL,...,,,,,,,,0,1.0,Lab
90,Diagnostic tests,Hematology,WBC - high,icu_24_hr_wbc_max,Labs,lab_category,CLIF-2.1.0 (Live),wbc,Patient’s WBC (highest) captured during first ...,[continuous] \nUnits: x109/L,...,,,,,,,,1,,Lab
91,Diagnostic tests,Coagulation,PRISM Prothrombin time - high,prism_pt_max,Labs,lab_category,CLIF 2.2 (Upcoming release),pt,Patient's prothrombin time (highest) captured ...,[continuous] \nUnits: seconds,...,,,,1.0,,,,1,,Lab
92,Diagnostic tests,Coagulation,International Normalized Ratio (INR) - high,icu_24hr_inr_max,Labs,lab_category,CLIF 2.2 (Upcoming release),inr,Patient's INR (highest) captured during first ...,[continuous],...,,,,,,,,0,1.0,Lab


In [4]:
# Columns requested from the CLIF labs table.
labs_required_columns = ['hospitalization_id', 'lab_result_dttm', 'lab_category', 'lab_value_numeric']

# Lab categories to extract.
labs_of_interest = ['wbc', 'hemoglobin', 'creatinine']

# Restrict the load to the cohort's hospitalizations and the selected lab categories.
labs_filters = dict(
    hospitalization_id=cohort['hospitalization_id'].unique().tolist(),
    lab_category=labs_of_interest,
)

labs_cohort = pyCLIF.load_data(
    'clif_labs',
    columns=labs_required_columns,
    filters=labs_filters,
)

# NOTE(review): presumably converts the datetime columns to the site timezone configured in
# pyCLIF.helper['timezone'] -- confirm against the pyCLIF helper.
labs_cohort = pyCLIF.convert_datetime_columns_to_site_tz(
    labs_cohort,
    pyCLIF.helper['timezone'],
)

# Force lab values to numeric; anything that cannot be parsed becomes NaN.
labs_cohort['lab_value_numeric'] = pd.to_numeric(
    labs_cohort['lab_value_numeric'],
    errors='coerce',
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Data loaded successfully from C:/Users/vchaudha/Downloads/rush_parquet_2\clif_labs.parquet




In [5]:
# One row per distinct (hospitalization_id, window start, window end) combination.
icu_window_columns = ['hospitalization_id', '24hr_icu_in_dttm', '24hr_icu_out_dttm']
cohort_trimmed = cohort[icu_window_columns].drop_duplicates()

# Attach the ICU window to every lab row; the inner join drops hospitalizations
# that have no rows in labs_cohort (and lab rows outside the cohort).
labs_joined_df = cohort_trimmed.merge(labs_cohort, how='inner', on='hospitalization_id')

In [6]:
# Keep only lab results whose lab_result_dttm falls inside the ICU 24hr window
# (both window bounds are inclusive).
lab_result_time = labs_joined_df['lab_result_dttm']
on_or_after_window_start = lab_result_time >= labs_joined_df['24hr_icu_in_dttm']
on_or_before_window_end = lab_result_time <= labs_joined_df['24hr_icu_out_dttm']
labs_filtered_df = labs_joined_df[on_or_after_window_start & on_or_before_window_end]

# For each (hospitalization_id, lab_category) pair, take the min and max of lab_value_numeric.
lab_values_by_group = labs_filtered_df.groupby(['hospitalization_id', 'lab_category'])['lab_value_numeric']
lab_min_max = lab_values_by_group.agg(['min', 'max'])

# Pivot lab_category out of the row index: rows are now one per hospitalization_id and
# the columns are (stat, lab_category) pairs.
labs_agg_df = lab_min_max.unstack(level='lab_category')

# Flatten the two-level columns into names of the form icu_24hr_<lab_category>_<stat>.
labs_agg_df.columns = [
    f'icu_24hr_{category}_{statistic}'
    for statistic, category in labs_agg_df.columns
]

In [7]:
# Move the row index back into regular columns, then write the table without a pandas index.
diagnostic_tests_table = labs_agg_df.reset_index()
diagnostic_tests_table.to_parquet('../output/final/c2d2_diagnostic_tests.parquet', index=False)