In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import shapiro, ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
import scipy.stats as stats

In [2]:
# Load the data 
df = pd.read_csv('sepsis_cleaned.csv')

In [3]:
df.shape

(2145, 91)

In [4]:
# Convert EDTime and HospTime to hours
df['EDTime'] = df['EDTime'] * 24
df['HospTime'] = df['HospTime'] * 24


In [5]:
# Divide data into groups based on EDTime
df['EDTimeGroup'] = pd.cut(df['EDTime'], [0, 6, 12, 24, np.inf], labels=['<6', '6-12', '12-24', '>24'])

In [6]:
# Count number of patients in each EDTimeGroup
edtime_counts = df['EDTimeGroup'].value_counts()

In [7]:
# Calculate median and IQR of Hospital LOS for each group
los_stats = df.groupby('EDTimeGroup')['HospTime'].agg(['median', lambda x: np.percentile(x, 25), lambda x: np.percentile(x, 75)])
los_stats.columns = ['Median', 'IQR_25', 'IQR_75']

In [8]:
# Count number and percentage of deaths in each group
death_counts = df.groupby('EDTimeGroup')['death'].value_counts().unstack().fillna(0)
death_counts['% Mortality'] = death_counts[1] / (death_counts[0] + death_counts[1]) * 100

In [9]:
# Test for significant differences in HospTime and mortality across groups using Kruskal-Wallis test
hosp_time_p = stats.kruskal(*[group['HospTime'] for name, group in df.groupby('EDTimeGroup')]).pvalue
mortality_p = stats.kruskal(*[group['death'] for name, group in df.groupby('EDTimeGroup')]).pvalue

In [39]:
df.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'EDTime', 'HospTime', 'death',
       'gender', 'WHITE', 'WHITE - RUSSIAN', 'WHITE - EASTERN EUROPEAN',
       'WHITE - OTHER EUROPEAN', 'WHITE - BRAZILIAN', 'SOUTH AMERICAN',
       'ASIAN', 'ASIAN - KOREAN', 'ASIAN - CHINESE',
       'ASIAN - SOUTH EAST ASIAN', 'ASIAN - ASIAN INDIAN',
       'AMERICAN INDIAN/ALASKA NATIVE',
       'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'HISPANIC OR LATINO',
       'HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - DOMINICAN',
       'HISPANIC/LATINO - SALVADORAN', 'HISPANIC/LATINO - PUERTO RICAN',
       'HISPANIC/LATINO - COLUMBIAN', 'HISPANIC/LATINO - HONDURAN',
       'HISPANIC/LATINO - CENTRAL AMERICAN', 'HISPANIC/LATINO - CUBAN',
       'HISPANIC/LATINO - MEXICAN', 'PORTUGUESE', 'BLACK/AFRICAN AMERICAN',
       'BLACK/CAPE VERDEAN', 'BLACK/AFRICAN', 'BLACK/CARIBBEAN ISLAND',
       'OTHER', 'UNKNOWN', 'UNABLE TO OBTAIN', 'PATIENT DECLINED TO ANSWER',
       'hospital_expire_flag', 'BMI', 'weight',

In [11]:
# Print results
print('Table 2 Outcomes of patients according to length of stay in ED')
print(f'{edtime_counts["<6"]} patients had EDTime < 6 hours')
print(f'{edtime_counts["6-12"]} patients had 6-12 hours of EDTime')
print(f'{edtime_counts["12-24"]} patients had 12-24 hours of EDTime')
print(f'{edtime_counts[">24"]} patients had EDTime > 24 hours\n')
print('Hospital LOS (median [IQR])')
print(los_stats.to_string())
print('\nMortality, n (%)')
print(death_counts.to_string())

Table 2 Outcomes of patients according to length of stay in ED
1287 patients had EDTime < 6 hours
725 patients had 6-12 hours of EDTime
118 patients had 12-24 hours of EDTime
15 patients had EDTime > 24 hours

Hospital LOS (median [IQR])
                 Median     IQR_25      IQR_75
EDTimeGroup                                   
<6           163.783333  90.933333  266.383333
6-12         139.116667  85.333333  243.433333
12-24        153.150000  80.670833  285.058333
>24           70.366667  58.016667  179.266667

Mortality, n (%)
death           0    1  % Mortality
EDTimeGroup                        
<6           1063  224    17.404817
6-12          639   86    11.862069
12-24         110    8     6.779661
>24            12    3    20.000000


In [19]:
hosp_time_p

0.018449134894641627

In [20]:
mortality_p

0.0004380153345445407

In [21]:
# Load the data 
df_org = pd.read_csv('sepsis.csv')

In [22]:
df_org.columns

Index(['Outtime>Admittime', 'Dischtime  =/= Deathtime', 'First ED Time',
       'First ED', 'Age Group', 'No', 'subject_id', 'hadm_id', 'stay_id',
       'icu_stay_id',
       ...
       'creatinine', 'bilirubin', 'po2', 'fio2', 'fio2 or fio2_ce', 'po2/fio2',
       'epinephrine.1', 'norepinephrine.1', 'dopamine.1', 'dobutamine.1'],
      dtype='object', length=118)

In [23]:
from datetime import datetime, timedelta
df_org['28_day_mortality'] = (( pd.to_datetime(df_org['deathtime']) - pd.to_datetime(df_org['intime'])) <= timedelta(days=28)).astype(int)

In [26]:
# Merge df and df_org on subject_id, stay_id, and hadm_id
merged_df = pd.merge(df, df_org[['subject_id', 'stay_id', 'hadm_id', '28_day_mortality']], 
                      on=['subject_id', 'stay_id', 'hadm_id'], 
                      how='left')
# Rename the 28_day_mortality column to a more descriptive name
merged_df = merged_df.rename(columns={'28_day_mortality': 'mortality_28d'})

In [27]:
merged_df.shape

(2145, 93)

In [28]:
merged_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,EDTime,HospTime,death,gender,WHITE,WHITE - RUSSIAN,WHITE - EASTERN EUROPEAN,...,supplemental_oxygen,temperature,heartrate,resprate,o2sat,sbp,dbp,qsofa,EDTimeGroup,mortality_28d
0,10004322,20356134,36579193,6.116667,140.133333,0,0,1,0,0,...,0,37.277778,111.5,23.5,99.0,107.0,58.0,1,6-12,0
1,10014729,23300884,37887480,4.396389,212.65,0,1,0,0,0,...,0,37.444444,119.5,22.0,100.0,115.0,65.0,1,<6,0
2,10019003,29279905,31254712,4.883333,256.916667,0,1,1,0,0,...,1,36.944444,96.0,21.5,97.5,102.0,52.0,0,<6,0
3,10020944,29974575,39738665,3.4,337.45,0,0,0,0,0,...,1,37.0,80.0,14.0,97.5,105.0,70.0,0,<6,0
4,10030753,21257920,39742415,4.933333,252.75,0,1,1,0,0,...,1,38.5,121.0,26.0,96.0,160.0,83.0,2,<6,0


In [29]:
death_counts_28d = merged_df.groupby('EDTimeGroup')['mortality_28d'].value_counts().unstack().fillna(0)
death_counts_28d['% Mortality'] = death_counts_28d[1] / (death_counts_28d[0] + death_counts_28d[1]) * 100

In [30]:
death_counts_28d_p = stats.kruskal(*[group['mortality_28d'] for name, group in merged_df.groupby('EDTimeGroup')]).pvalue

In [31]:
print('\n28_day_mortality, n (%)')
print(death_counts_28d.to_string())


28_day_mortality, n (%)
mortality_28d     0    1  % Mortality
EDTimeGroup                          
<6             1070  217    16.860917
6-12            639   86    11.862069
12-24           112    6     5.084746
>24              12    3    20.000000


In [32]:
death_counts_28d_p

0.00033094507604277665

In [43]:
counts_iv = df.groupby('EDTimeGroup')['IV'].value_counts().unstack().fillna(0)
counts_iv['% IV'] = counts_iv[1] / (counts_iv[0] + counts_iv[1]) * 100

In [44]:
IV_p = stats.kruskal(*[group['IV'] for name, group in df.groupby('EDTimeGroup')]).pvalue

In [46]:
print('\nIV n (%)')
print(counts_iv.to_string())
print("-----------------------")
print(IV_p)


IV n (%)
IV             0    1       % IV
EDTimeGroup                     
<6           854  433  33.644134
6-12         597  128  17.655172
12-24        100   18  15.254237
>24           14    1   6.666667
-----------------------
1.4071133934279985e-15
