## Install Packages

In [1]:
# !pip install awswrangler
# !pip install tableone
# !pip install tqdm

## Import Packages

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr
from tqdm import tqdm
from tableone import TableOne

In [3]:
# Load cohort created in the cohort notebook.
cohort = pd.read_csv('initial_cohort_no_covariates.csv')
cohort_pats = cohort['subject_id'].unique()

In [4]:
cohort.shape

(70278, 38)

In [5]:
cohort.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,creat_low_past_7day,creat_low_past_48hr,creat,aki_stage_creat,uo_rt_6hr,uo_rt_12hr,...,anchor_year,anchor_year_group,dod,first_careunit,last_careunit,intime,outtime,los,creat_24,urine_24
0,10004401,29988601,32773003,2144-01-27 22:51:00,1.3,1.4,2.3,1,,,...,2141,2008 - 2010,2144-06-18,Medical Intensive Care Unit (MICU),Trauma SICU (TSICU),2144-01-26 22:28:04,2144-02-06 13:44:15,10.636238,1,1
1,10004401,29988601,32773003,2144-01-27 01:39:00,1.3,1.3,2.1,1,,,...,2141,2008 - 2010,2144-06-18,Medical Intensive Care Unit (MICU),Trauma SICU (TSICU),2144-01-26 22:28:04,2144-02-06 13:44:15,10.636238,1,1
2,10004401,29988601,32773003,2144-01-27 10:36:00,1.3,1.3,2.3,1,,,...,2141,2008 - 2010,2144-06-18,Medical Intensive Care Unit (MICU),Trauma SICU (TSICU),2144-01-26 22:28:04,2144-02-06 13:44:15,10.636238,1,1
3,10004401,29988601,32773003,2144-02-05 02:18:00,1.0,1.5,1.6,1,,,...,2141,2008 - 2010,2144-06-18,Medical Intensive Care Unit (MICU),Trauma SICU (TSICU),2144-01-26 22:28:04,2144-02-06 13:44:15,10.636238,1,1
4,10004401,29988601,32773003,2144-01-28 02:33:00,1.3,1.4,2.3,1,,,...,2141,2008 - 2010,2144-06-18,Medical Intensive Care Unit (MICU),Trauma SICU (TSICU),2144-01-26 22:28:04,2144-02-06 13:44:15,10.636238,1,1


In [6]:
list(cohort)

['subject_id',
 'hadm_id',
 'stay_id',
 'charttime',
 'creat_low_past_7day',
 'creat_low_past_48hr',
 'creat',
 'aki_stage_creat',
 'uo_rt_6hr',
 'uo_rt_12hr',
 'uo_rt_24hr',
 'aki_stage_uo',
 'aki_stage',
 'admittime',
 'dischtime',
 'deathtime',
 'admission_type',
 'admission_location',
 'discharge_location',
 'insurance',
 'language',
 'marital_status',
 'race',
 'edregtime',
 'edouttime',
 'hospital_expire_flag',
 'gender',
 'anchor_age',
 'anchor_year',
 'anchor_year_group',
 'dod',
 'first_careunit',
 'last_careunit',
 'intime',
 'outtime',
 'los',
 'creat_24',
 'urine_24']

In [7]:
cohort['charttime'] = pd.to_datetime(cohort['charttime'])
cohort['intime'] = pd.to_datetime(cohort['intime'])
cohort['outtime'] = pd.to_datetime(cohort['outtime'])

In [8]:
cohort_icu_48 = cohort[(cohort['charttime'] >= cohort['intime']) & (cohort['charttime'] <= (cohort['intime'] + np.timedelta64(2, 'D')))]

In [9]:
cohort_icu_48.shape

(25516, 38)

In [10]:
min_stay = cohort_icu_48.groupby('stay_id')['intime'].min()
cohort_icu_48_first_aki = cohort_icu_48.merge(min_stay)

In [11]:
labs = wr.athena.read_sql_query(f"SELECT * from crrt_cov_labs where subject_id IN {str(tuple(cohort_pats))}", database="mimiciv")

In [12]:
labs.shape

(3821379, 16)

In [19]:
# Drop outpatient labs with null hadm ids. 
# Iterate to avoid memory error
cohort_icu_stay = cohort_icu_48_first_aki[['hadm_id', 'intime', 'outtime']]

In [20]:
labs = labs.dropna(subset=['hadm_id'])

In [24]:
labs_merged = labs.merge(cohort_icu_48_first_aki, on=['hadm_id', 'subject_id'])

In [26]:
labs_merged['charttime'] = pd.to_datetime(labs_merged['charttime_x'])
labs_merged['intime'] = pd.to_datetime(labs_merged['intime'])
labs_merged['outtime'] = pd.to_datetime(labs_merged['outtime'])

In [27]:
labs_filt = labs_merged[(labs_merged['charttime'] >= labs_merged['intime']) & (labs_merged['charttime'] <= labs_merged['outtime'])]

In [28]:
labs_filt.shape

(2361999, 53)

In [29]:
list(labs_merged)

['labevent_id',
 'subject_id',
 'hadm_id',
 'specimen_id',
 'itemid',
 'charttime_x',
 'storetime',
 'value',
 'valuenum',
 'valueuom',
 'ref_range_lower',
 'ref_range_upper',
 'flag',
 'priority',
 'comments',
 'label',
 'stay_id',
 'charttime_y',
 'creat_low_past_7day',
 'creat_low_past_48hr',
 'creat',
 'aki_stage_creat',
 'uo_rt_6hr',
 'uo_rt_12hr',
 'uo_rt_24hr',
 'aki_stage_uo',
 'aki_stage',
 'admittime',
 'dischtime',
 'deathtime',
 'admission_type',
 'admission_location',
 'discharge_location',
 'insurance',
 'language',
 'marital_status',
 'race',
 'edregtime',
 'edouttime',
 'hospital_expire_flag',
 'gender',
 'anchor_age',
 'anchor_year',
 'anchor_year_group',
 'dod',
 'first_careunit',
 'last_careunit',
 'intime',
 'outtime',
 'los',
 'creat_24',
 'urine_24',
 'charttime']

In [30]:
lab_description = pd.pivot_table(labs, values=['valuenum'], index=['hadm_id'], columns=['label'], aggfunc=[np.nanmin, np.nanmax])

In [31]:
lab_description = lab_description.reset_index()

In [32]:
cols = ["_".join(x) for x in list(lab_description)]
cols[0] = 'hadm_id'
lab_description.columns = cols

In [47]:
cohort_icu_labs = cohort_icu_48_first_aki.merge(lab_description, on='hadm_id')

In [48]:
cohort_icu_labs.head()

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,creat_low_past_7day,creat_low_past_48hr,creat,aki_stage_creat,uo_rt_6hr,uo_rt_12hr,...,nanmax_valuenum_Albumin,nanmax_valuenum_Bicarbonate,"nanmax_valuenum_Calculated Bicarbonate, Whole Blood",nanmax_valuenum_Creatinine,nanmax_valuenum_Hematocrit,"nanmax_valuenum_Hematocrit, Calculated",nanmax_valuenum_Potassium,"nanmax_valuenum_Potassium, Whole Blood",nanmax_valuenum_Urea Nitrogen,nanmax_valuenum_pH
0,10004401,29988601,32773003,2144-01-27 22:51:00,1.3,1.4,2.3,1,,,...,3.0,27.0,23.0,2.3,28.700001,,5.5,5.1,107.0,7.45
1,10004401,29988601,32773003,2144-01-27 01:39:00,1.3,1.3,2.1,1,,,...,3.0,27.0,23.0,2.3,28.700001,,5.5,5.1,107.0,7.45
2,10004401,29988601,32773003,2144-01-27 10:36:00,1.3,1.3,2.3,1,,,...,3.0,27.0,23.0,2.3,28.700001,,5.5,5.1,107.0,7.45
3,10004401,29988601,32773003,2144-01-28 02:33:00,1.3,1.4,2.3,1,,,...,3.0,27.0,23.0,2.3,28.700001,,5.5,5.1,107.0,7.45
4,10004401,29988601,32773003,2144-01-27 03:26:00,1.3,1.3,2.1,1,,,...,3.0,27.0,23.0,2.3,28.700001,,5.5,5.1,107.0,7.45


In [49]:
cohort_icu_labs.shape

(25500, 58)

In [50]:
sofa = pd.read_csv('bq-results-20221113-103506-1668335764380.csv')
antib = pd.read_csv('Antibiotics.csv')
sepsis = pd.read_csv('sepsisIII.csv')

In [51]:
cohort_sepsis = cohort_icu_labs.merge(sepsis, on='stay_id', how='left')

In [52]:
cohort_sepsis['sepsis3'].value_counts(dropna=False)

True    18248
NaN      7252
Name: sepsis3, dtype: int64

In [53]:
cohort_sepsis['sepsis3'].fillna(False, inplace=True)

In [54]:
cohort_sepsis.shape

(25500, 60)

In [55]:
sofa_48 = sofa[sofa['hr'] <= 48]

In [56]:
sofa_48_min = dict(sofa_48.groupby(by='stay_id')['hr'].min())

In [57]:
sofa['min_hour'] = sofa['stay_id'].map(sofa_48_min)

In [58]:
sofa = sofa[sofa['min_hour'] == sofa['hr']]

In [59]:
sofa.shape

(76519, 4)

In [60]:
cohort_sofa = cohort_sepsis.merge(sofa, on='stay_id', how='left')

In [61]:
cohort_sofa.shape

(25500, 63)

In [62]:
cohort_sofa.nunique()

subject_id                  10184
hadm_id                     10184
stay_id                     10184
charttime                   25486
creat_low_past_7day           143
                            ...  
suspected_infection_time     6634
sepsis3                         2
hr                              1
sofa_24hours                   17
min_hour                        1
Length: 63, dtype: int64

In [63]:
antib.head()

Unnamed: 0,antibiotic,stay_id,starttime
0,Vancomycin,31326208,2135-06-21T08:00:00
1,Vancomycin,31195904,2152-08-28T20:00:00
2,Vancomycin,38407168,2175-08-25T18:00:00
3,Vancomycin,38407168,2175-08-25T10:00:00
4,Vancomycin,36179968,2111-05-27T20:00:00


In [64]:
antib['antibiotic'] = 1

In [65]:
cohort_stay_intime = cohort_sofa[['stay_id', 'intime']]

In [66]:
antib['starttime'] = pd.to_datetime(antib['starttime'])

In [67]:
antib_intime_stayid = cohort_stay_intime.merge(antib, on='stay_id')

In [68]:
antib_intime_stayid['delta_days'] = (antib_intime_stayid['starttime'] - antib_intime_stayid['intime']) / np.timedelta64(1, 'D')

In [69]:
antib_intime_stayid_48 = antib_intime_stayid[antib_intime_stayid['delta_days'] <= 2]

In [70]:
antib_intime_stayid_48 = antib_intime_stayid_48.drop_duplicates(subset=['stay_id'])

In [71]:
cohort_antib = cohort_sofa.merge(antib_intime_stayid_48, on='stay_id', how='left')

In [72]:
cohort_antib['antibiotic'].fillna(0, inplace=True)

In [73]:
cohort_antib.shape

(25500, 67)

In [74]:
vitals = pd.read_csv('bq-results-20221113-100919-1668334483927.csv')

In [75]:
vitals.shape

(10249430, 8)

In [76]:
vitals.head()

Unnamed: 0,stay_id,charttime,heart_rate,mbp,mbp_ni,temperature,resp_rate,spo2
0,38576832,2162-04-11 15:00:00,75.0,86.5,81.0,,7.0,100.0
1,35962636,2159-09-27 12:00:00,94.0,127.0,127.0,36.67,9.0,100.0
2,31984002,2128-11-25 08:00:00,106.0,104.0,104.0,36.33,17.5,100.0
3,32421099,2156-08-04 20:45:00,122.0,110.0,110.0,,33.0,94.0
4,30682249,2170-03-17 20:00:00,118.0,55.0,,36.56,37.0,95.0


In [77]:
agg_cols = ['heart_rate', 'mbp', 'mbp_ni', 'resp_rate', 'spo2']
vitals_min = vitals.groupby('stay_id')[agg_cols].min().reset_index()
vitals_min.rename(dict(zip(agg_cols, ['min_' + x for x in agg_cols])), axis=1, inplace=True)
vitals_max = vitals.groupby('stay_id')[agg_cols].max().reset_index()
vitals_max.rename(dict(zip(agg_cols, ['max_' + x for x in agg_cols])), axis=1, inplace=True)

In [78]:
cohort_vitals = cohort_antib.merge(vitals_min, on='stay_id', how='left')
cohort_vitals = cohort_vitals.merge(vitals_max, on='stay_id', how='left')

In [79]:
cohort_vitals.nunique()

subject_id             10184
hadm_id                10184
stay_id                10184
charttime              25486
creat_low_past_7day      143
                       ...  
max_heart_rate           172
max_mbp                  327
max_mbp_ni               180
max_resp_rate             85
max_spo2                  18
Length: 77, dtype: int64

In [80]:
cohort_vitals = cohort_vitals.drop_duplicates()

In [81]:
cohort_vitals.shape

(25500, 77)

In [82]:
diagnosis = pd.read_csv('covariates.csv')
diagnosis = diagnosis.drop([x for x in list(diagnosis) if 'Unnamed' in x], axis=1)

In [83]:
cohort_diag = cohort_vitals.merge(diagnosis, on='stay_id', how='left')

In [84]:
cohort_diag.shape

(25500, 81)

In [85]:
cohort_diag.nunique()

subject_id             10184
hadm_id                10184
stay_id                10184
charttime              25486
creat_low_past_7day      143
                       ...  
max_spo2                  18
CKD                        2
DM                         2
Hypertension               2
Heart Failure              2
Length: 81, dtype: int64

In [86]:
cohort_diag.shape

(25500, 81)

In [87]:
list(cohort_diag)

['subject_id',
 'hadm_id',
 'stay_id',
 'charttime',
 'creat_low_past_7day',
 'creat_low_past_48hr',
 'creat',
 'aki_stage_creat',
 'uo_rt_6hr',
 'uo_rt_12hr',
 'uo_rt_24hr',
 'aki_stage_uo',
 'aki_stage',
 'admittime',
 'dischtime',
 'deathtime',
 'admission_type',
 'admission_location',
 'discharge_location',
 'insurance',
 'language',
 'marital_status',
 'race',
 'edregtime',
 'edouttime',
 'hospital_expire_flag',
 'gender',
 'anchor_age',
 'anchor_year',
 'anchor_year_group',
 'dod',
 'first_careunit',
 'last_careunit',
 'intime_x',
 'outtime',
 'los',
 'creat_24',
 'urine_24',
 'nanmin_valuenum_Albumin',
 'nanmin_valuenum_Bicarbonate',
 'nanmin_valuenum_Calculated Bicarbonate, Whole Blood',
 'nanmin_valuenum_Creatinine',
 'nanmin_valuenum_Hematocrit',
 'nanmin_valuenum_Hematocrit, Calculated',
 'nanmin_valuenum_Potassium',
 'nanmin_valuenum_Potassium, Whole Blood',
 'nanmin_valuenum_Urea Nitrogen',
 'nanmin_valuenum_pH',
 'nanmax_valuenum_Albumin',
 'nanmax_valuenum_Bicarbonate',


In [88]:
cohort_diag.to_csv('cohort_with_covariates.csv', index=False)

In [89]:
crrt = pd.read_csv('crrt_positive_cohort.csv')

In [90]:
list(crrt)

['stay_id',
 'charttime',
 'crrt_mode',
 'access_pressure',
 'blood_flow',
 'citrate',
 'current_goal',
 'dialysate_fluid',
 'dialysate_rate',
 'effluent_pressure',
 'filter_pressure',
 'heparin_concentration',
 'heparin_dose',
 'hourly_patient_fluid_removal',
 'prefilter_replacement_rate',
 'postfilter_replacement_rate',
 'replacement_fluid',
 'replacement_rate',
 'return_pressure',
 'ultrafiltrate_output',
 'system_active',
 'clots',
 'clots_increasing',
 'clotted',
 'subject_id',
 'hadm_id',
 'first_careunit',
 'last_careunit',
 'intime',
 'outtime',
 'los',
 'time_delta']

In [91]:
crrt['crrt'] = 1

In [92]:
crrt = crrt[['stay_id', 'crrt']]

In [93]:
cohort_diag_crrt = cohort_diag.merge(crrt, on='stay_id', how='left')

In [94]:
cohort_diag_crrt['crrt'].fillna(0, inplace=True)

In [110]:
columns = ['first_careunit', 'creat_low_past_48hr', 'aki_stage', 'race','min_heart_rate',
 'min_mbp',
 'min_mbp_ni',
 'min_resp_rate',
 'min_spo2',
 'max_heart_rate',
 'max_mbp',
 'max_mbp_ni',
 'max_resp_rate',
 'max_spo2',
 'CKD',
 'DM',
 'Hypertension',
 'Heart Failure',
'nanmin_valuenum_Albumin',
 'nanmin_valuenum_Bicarbonate',
 'nanmin_valuenum_Calculated Bicarbonate, Whole Blood',
 'nanmin_valuenum_Creatinine',
 'nanmin_valuenum_Hematocrit',
 'nanmin_valuenum_Hematocrit, Calculated',
 'nanmin_valuenum_Potassium',
 'nanmin_valuenum_Potassium, Whole Blood',
 'nanmin_valuenum_Urea Nitrogen',
 'nanmin_valuenum_pH',
 'nanmax_valuenum_Albumin',
 'nanmax_valuenum_Bicarbonate',
 'nanmax_valuenum_Calculated Bicarbonate, Whole Blood',
 'nanmax_valuenum_Creatinine',
 'nanmax_valuenum_Hematocrit',
 'nanmax_valuenum_Hematocrit, Calculated',
 'nanmax_valuenum_Potassium',
 'nanmax_valuenum_Potassium, Whole Blood',
 'nanmax_valuenum_Urea Nitrogen',
 'nanmax_valuenum_pH',
 'sepsis3',
 'sofa_24hours',
 'antibiotic',
 'gender']

numerical=[
    'creat_low_past_48hr',
    'min_heart_rate',
    'min_mbp',
 'min_mbp_ni',
 'min_resp_rate',
 'min_spo2',
 'max_heart_rate',
 'max_mbp',
 'max_mbp_ni',
 'max_resp_rate',
 'max_spo2','nanmin_valuenum_Albumin',
 'nanmin_valuenum_Bicarbonate',
 'nanmin_valuenum_Calculated Bicarbonate, Whole Blood',
 'nanmin_valuenum_Creatinine',
 'nanmin_valuenum_Hematocrit',
 'nanmin_valuenum_Hematocrit, Calculated',
 'nanmin_valuenum_Potassium',
 'nanmin_valuenum_Potassium, Whole Blood',
 'nanmin_valuenum_Urea Nitrogen',
 'nanmin_valuenum_pH',
 'nanmax_valuenum_Albumin',
 'nanmax_valuenum_Bicarbonate',
 'nanmax_valuenum_Calculated Bicarbonate, Whole Blood',
 'nanmax_valuenum_Creatinine',
 'nanmax_valuenum_Hematocrit',
 'nanmax_valuenum_Hematocrit, Calculated',
 'nanmax_valuenum_Potassium',
 'nanmax_valuenum_Potassium, Whole Blood',
 'nanmax_valuenum_Urea Nitrogen',
 'nanmax_valuenum_pH',
        'sofa_24hours',]

for col in numerical:
    cohort_diag_crrt[col] = cohort_diag_crrt[col].astype(np.float64)

categorical = [x for x in columns if x not in numerical]

tab = TableOne(cohort_diag_crrt, columns=columns,categorical=categorical, groupby='crrt')

In [114]:
tab.to_csv('tableone.csv')

In [96]:
cohort_diag_crrt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25500 entries, 0 to 25499
Data columns (total 82 columns):
 #   Column                                               Non-Null Count  Dtype         
---  ------                                               --------------  -----         
 0   subject_id                                           25500 non-null  int64         
 1   hadm_id                                              25500 non-null  int64         
 2   stay_id                                              25500 non-null  int64         
 3   charttime                                            25500 non-null  datetime64[ns]
 4   creat_low_past_7day                                  25500 non-null  float64       
 5   creat_low_past_48hr                                  25447 non-null  float64       
 6   creat                                                25500 non-null  float64       
 7   aki_stage_creat                                      25500 non-null  int64         
 

In [97]:
tab.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by crrt,Grouped by crrt,Grouped by crrt,Grouped by crrt
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,0.0,1.0
n,,,25500,22449,3051
"first_careunit, n (%)",Cardiac Vascular Intensive Care Unit (CVICU),0,4966 (19.5),4492 (20.0),474 (15.5)
"first_careunit, n (%)",Coronary Care Unit (CCU),,3514 (13.8),3082 (13.7),432 (14.2)
"first_careunit, n (%)",Medical Intensive Care Unit (MICU),,5676 (22.3),4826 (21.5),850 (27.9)
"first_careunit, n (%)",Medical/Surgical Intensive Care Unit (MICU/SICU),,4498 (17.6),4016 (17.9),482 (15.8)
...,...,...,...,...,...
"creat_low_past_48hr, n (%)",3.6,,167 (0.7),125 (0.6),42 (1.4)
"creat_low_past_48hr, n (%)",3.7,,96 (0.4),76 (0.3),20 (0.7)
"creat_low_past_48hr, n (%)",3.8,,90 (0.4),64 (0.3),26 (0.9)
"creat_low_past_48hr, n (%)",3.9,,93 (0.4),70 (0.3),23 (0.8)


In [98]:
cohort_diag_crrt.nunique()

subject_id             10184
hadm_id                10184
stay_id                10184
charttime              25486
creat_low_past_7day      143
                       ...  
CKD                        2
DM                         2
Hypertension               2
Heart Failure              2
crrt                       2
Length: 82, dtype: int64

In [99]:
first = cohort_diag_crrt.groupby('stay_id')['charttime'].min()
cohort_diag_crrt = cohort_diag_crrt.merge(first)

In [101]:
cohort_diag_crrt.shape

(10192, 82)

In [102]:
cohort_diag_crrt.to_csv('final_cohort.csv', index=False)