In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import shapiro, ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
import matplotlib.pyplot as plt
import statsmodels.api as sm


In [None]:
# Load the analysis cohort into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('has_icu_stay.csv')

In [None]:
# (rows, columns) of the loaded frame, shown as the cell output.
df.shape

In [None]:
# Show every column and the full contents of each cell. These options are global and
# stay in effect for all later cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Preview the last rows of the frame.
df.tail()

In [None]:
# Split the cohort by outcome flag. Rows whose `death` value is neither 0 nor 1
# (e.g. missing) end up in neither subset.
death_flag = df['death']
survivors = df.loc[death_flag.eq(0)]
non_survivors = df.loc[death_flag.eq(1)]

In [None]:
# Columns analysed as categorical: each is cross-tabulated against `death` and
# summarised as the count / percentage of rows equal to 1.
# NOTE(review): the summary only counts the value 1, so any column here that is not
# coded 0/1 (possibly `gender` or `qsofa`) is only partially summarised — confirm coding.
categorical_variables = [
    'gender',
    'myocardial_infarction', 'congestive_heart_failure', 'peripheral_vascular_disease',
    'cerebrovascular_disease', 'dementia', 'chronic_pulmonary_disease',
    'rheumatic_disease', 'peptic_ulcer_disease', 'mild_liver_disease',
    'diabetes_without_chronic_complication', 'diabetes_with_chronic_complication',
    'hemiplegia_or_paraplegia', 'renal_disease', 'malignancy',
    'moderate_or_severe_liver_disease', 'metastatic_solid_tumor', 'AIDSHIV', 'hypertension',
    'diabetes_mellitus', 'epinephrine', 'norepinephrine', 'dopamine', 'dobutamine', 'tracheostomy',
    'IV', 'NIV', 'HFNC', 'supplemental_oxygen', 'gcs_geq', 'po2_fio2_geq', 'use_of_vasopressors',
    'invasive_ventilation', 'non_invasive_ventilation', 'qsofa'
]


In [None]:
# Columns analysed as continuous: plotted as histograms, compared between survivors and
# non-survivors with a t-test or Mann-Whitney test, and summarised as median / IQR.
numeric_variables = [
    'BMI', 'temperature', 'heartrate', 'resprate', 'sbp', 'dbp',
    'o2sat',  'age',
    'WBC', 'platelet', 'log2_CRP', 'glucose', 'glucose_bg', 'lactate', 'creatinine', 'bilirubin'
]

In [None]:
# Activate the automatic conversion from numpy to R arrays
numpy2ri.activate()

# Import the R "stats" package under its own name. A later cell rebinds the global
# `stats` to `scipy.stats`; if the helper below looked up `stats` at call time,
# re-running the categorical tests after that cell would fail.
r_stats = importr('stats')
stats = r_stats  # original name kept so anything still referring to `stats` here keeps working


def fisher_exact_test(table, workspace=1e6, simulate_p_value=True):
    """Run R's `fisher.test` on a contingency table and return its p-value.

    Parameters
    ----------
    table : array-like
        Contingency table of counts, passed to R via the numpy converter.
    workspace : number
        Forwarded to `fisher.test(workspace=...)` after conversion to float.
    simulate_p_value : bool
        Forwarded to `fisher.test(simulate.p.value=...)`.
        NOTE(review): when R actually simulates, the p-value varies between runs
        unless R's random seed is fixed — confirm whether that matters here.

    Returns
    -------
    The first value of the first component of the R result, which is `p.value`
    according to the R documentation for `fisher.test`.
    """
    result = r_stats.fisher_test(table, workspace=float(workspace), simulate_p_value=bool(simulate_p_value))
    p_value = result[0][0]
    return p_value

In [None]:
# Maps variable name -> p-value; filled by the categorical and the numeric tests below.
p_values = {}

In [None]:
# Test each categorical variable against the outcome and record the p-value.
for variable in categorical_variables:
    # Outcome on the rows, the variable's levels on the columns.
    observed = pd.crosstab(df['death'], df[variable])
    p = chi2_contingency(observed)[1]

    smallest_cell = observed.values.min()
    if smallest_cell >= 5:
        print(f"Chi-squared test p-value for {variable}: {p:.5f}")
    else:
        # At least one observed cell is below 5: replace the chi-squared p-value
        # with the one from Fisher's exact test.
        p = fisher_exact_test(observed.values, workspace=1e6, simulate_p_value=True)
        print(f"Fisher's exact test p-value for {variable}: {p:.5f}")

    p_values[variable] = p

In [None]:
# Summarise every categorical variable as n (%) of rows equal to 1: overall, among
# survivors (death == 0) and among non-survivors (death == 1), plus its p-value.
# The rows are collected first and the frame is built once, instead of growing it
# cell-by-cell with `.loc` (which upcasts counts to float and is slow).
summary_groups = {
    'Total': df,
    'Survivor': df[df['death'] == 0],
    'Non-Survivor': df[df['death'] == 1],
}

summary_rows = []
for col in categorical_variables:
    row = {}
    for label, frame in summary_groups.items():
        # Number of rows in this group where the variable equals 1 (0 if none).
        count = int(frame[col].value_counts().get(1, 0))
        row[f'{label} Count'] = count
        # Percentage of the group; NaN rather than a division error for an empty group.
        row[f'{label} Proportion'] = (count / len(frame) * 100) if len(frame) else float('nan')
    row['P value'] = p_values[col]
    summary_rows.append(row)

# One row per variable; count columns are integers, proportions are percentages.
results_cat_df = pd.DataFrame(summary_rows, index=categorical_variables)

In [None]:
# Overlay the overall, survivor and non-survivor distributions of each numeric variable.
for col in numeric_variables:
    fig, ax = plt.subplots(figsize=(8, 6))

    # Use one set of bin edges (10 equal-width bins over the overall range) for all
    # three histograms; otherwise each call picks its own bins and the overlaid bars
    # are not comparable.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=10)

    # plot the histogram for overall
    df[col].hist(ax=ax, bins=bin_edges, label='Overall')
    # plot the histogram for survivors
    df[df['death'] == 0][col].hist(ax=ax, bins=bin_edges, alpha=0.7, label='Survivors')
    # plot the histogram for non-survivors
    df[df['death'] == 1][col].hist(ax=ax, bins=bin_edges, alpha=0.7, label='Non-Survivors')

    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend()
    # NOTE(review): the file prefix says "no_icu" while the data is loaded from
    # 'has_icu_stay.csv' — confirm the intended output name.
    fig.savefig(f'no_icu_{col}_histogram.png')
    plt.show()
    # Release the figure so a long variable list does not accumulate open figures.
    plt.close(fig)

In [None]:
results_num_df = pd.DataFrame()


def perform_statistical_tests(column, alpha=0.05):
    """Compare one numeric column between survivors and non-survivors.

    Reads the module-level `survivors` and `non_survivors` frames, prints which test
    was used with its p-value, and stores the p-value in the module-level `p_values`
    dict under `column`.

    Missing values are dropped per group before testing; otherwise a single NaN turns
    the p-value into NaN and the variable silently drops out of the significant set.

    Parameters
    ----------
    column : str
        Name of the numeric column to test.
    alpha : float, default 0.05
        Threshold applied to the Shapiro-Wilk p-values when deciding normality.

    Returns
    -------
    float
        The two-sided p-value, or NaN when either group has no non-missing values.
    """
    survivor_values = survivors[column].dropna()
    non_survivor_values = non_survivors[column].dropna()

    if survivor_values.empty or non_survivor_values.empty:
        # Nothing to compare: record NaN instead of raising inside the test.
        print(f"No non-missing values to compare for {column}")
        p = float('nan')
        p_values[column] = p
        return p

    def _looks_normal(sample):
        # Shapiro-Wilk needs at least 3 observations; smaller samples are treated
        # as non-normal so the rank-based test is used.
        return len(sample) >= 3 and shapiro(sample)[1] > alpha

    if _looks_normal(survivor_values) and _looks_normal(non_survivor_values):
        # Normally distributed data: use Student's t-test
        stat, p = ttest_ind(survivor_values, non_survivor_values)
        print(f"Student's t-test p-value for {column}: {p:.5f}")
    else:
        # Non-normally distributed data: use Mann-Whitney test. The alternative is
        # spelled out because its default has differed between SciPy versions.
        stat, p = mannwhitneyu(survivor_values, non_survivor_values, alternative='two-sided')
        print(f"Mann-Whitney test p-value for {column}: {p:.5f}")
    p_values[column] = p
    return p

In [None]:
# Run the two-group comparison for every numeric variable; each call prints the test
# used and stores its p-value in `p_values`.
for variable in numeric_variables:
    perform_statistical_tests(variable)

In [None]:
# Keep the variables (categorical and numeric alike) whose univariate p-value is at
# most 0.05; a NaN p-value fails the comparison and is left out.
significant_predictors = []
for var, p_value in p_values.items():
    if p_value <= 0.05:
        significant_predictors.append(var)

print("\nSignificant predictor variables (p-value ≤ 0.05):")
print(significant_predictors)

In [None]:

def _median_and_iqr(values):
    """Return (median, Q3 - Q1) of a Series."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    return values.median(), q3 - q1


# Summarise every numeric variable as median and IQR: overall, among survivors
# (death == 0) and among non-survivors (death == 1), plus its p-value.
numeric_groups = {
    'Overall': df,
    'Survivor': df[df['death'] == 0],
    'Non-Survivor': df[df['death'] == 1],
}

numeric_rows = []
for col in numeric_variables:
    row = {}
    for label, frame in numeric_groups.items():
        # Each group's IQR comes from that group's own quartiles (the non-survivor
        # IQR was previously computed from the survivor quartiles).
        group_median, group_iqr = _median_and_iqr(frame[col])
        row[f'{label} Median'] = group_median
        row[f'{label} IQR'] = group_iqr
    row['P value'] = p_values[col]
    numeric_rows.append(row)

# One row per variable, built in a single step.
results_num_df = pd.DataFrame(numeric_rows, index=numeric_variables)

In [None]:
# Time columns added to the regression predictors, and the list that collects one
# odds-ratio table per entry of `time_list`.
time_list = ['EDTime']
or_dfs = []

In [None]:
from sklearn.preprocessing import StandardScaler


# Predictors: every variable that was significant in the univariate tests plus the time columns.
var_use = significant_predictors + time_list

# Start from an empty list so that re-running this cell does not leave an older
# result sitting in or_dfs[0] (which is what gets exported later).
or_dfs = []

# NOTE(review): `time_range` is not used inside the loop — every iteration fits the
# same model on `var_use`. With a single entry in `time_list` this makes no difference.
for time_range in time_list:

    X = df[var_use]
    scaler = StandardScaler()
    # Standardise but keep a DataFrame, so the fitted coefficients are labelled with
    # the variable names instead of x1, x2, ...
    X_STD = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    # fit a logistic regression model with all variables
    X_STD = sm.add_constant(X_STD)
    y = df['death']
    logit_model = sm.Logit(y, X_STD)
    result = logit_model.fit(maxiter=100)

    # calculate adjusted odds ratios for each variable (per one standard deviation,
    # since the predictors were standardised)
    ORs = np.exp(result.params)
    # alpha=0.01 gives 99% confidence intervals.
    # NOTE(review): confirm 99% rather than 95% is intended.
    CI = np.exp(result.conf_int(alpha=0.01))
    CI.columns = ['OR_lower', 'OR_upper']
    ORs = pd.concat([ORs, CI], axis=1)
    ORs.columns = ['OR', 'OR_lower', 'OR_upper']
    # The intercept is not a predictor; leave it out of the table.
    ORs = ORs.drop(index='const')
    or_dfs.append(ORs)

    print(ORs)

In [None]:
# Predictor names as a one-column frame, in the same order as they were passed to the model.
var_use_df = pd.DataFrame(var_use, columns=['Variable'])

In [None]:
# Write the odds ratios and both summary tables to one workbook, one sheet each.
# NOTE(review): the file name says "no_icu" while the data is loaded from
# 'has_icu_stay.csv' — confirm the intended output name.
filename = 'odds_ratios_no_icu.xlsx'
with pd.ExcelWriter(filename) as writer:
    # index=False drops the row labels of the odds-ratio table; the variable names
    # are written separately, in model order, on 'Sheet2'.
    or_dfs[0].to_excel(writer, sheet_name='OR', index=False)
    var_use_df.to_excel(writer, sheet_name='Sheet2', index=False)
    # The summary tables keep their index (the variable names).
    results_cat_df.to_excel(writer, sheet_name='CAT', index=True)
    results_num_df.to_excel(writer, sheet_name='NUM', index=True)

## Table 2: Outcomes of patients according to length of stay in ED

In [None]:
# Convert EDTime and HospTime to hours
df['EDTime'] = df['EDTime'] * 24
df['HospTime'] = df['HospTime'] * 24


In [None]:
# Divide data into groups based on EDTime
df['EDTimeGroup'] = pd.cut(df['EDTime'], [0, 6, 12, 24, np.inf], labels=['<6', '6-12', '12-24', '>24'])

In [None]:
# Count number of patients in each EDTimeGroup
edtime_counts = df['EDTimeGroup'].value_counts()

In [None]:
# Calculate median and IQR of Hospital LOS for each group
los_stats = df.groupby('EDTimeGroup')['HospTime'].agg(['median', lambda x: np.percentile(x, 25), lambda x: np.percentile(x, 75)])
los_stats.columns = ['Median', 'IQR_25', 'IQR_75']

In [None]:
# Count number and percentage of deaths in each group
death_counts = df.groupby('EDTimeGroup')['death'].value_counts().unstack().fillna(0)
death_counts['% Mortality'] = death_counts[1] / (death_counts[0] + death_counts[1]) * 100

In [None]:
from scipy.stats import shapiro, ttest_ind, mannwhitneyu, chi2_contingency, fisher_exact
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
import scipy.stats as stats
# Kruskal-Wallis test across the ED-time groups, once for hospital length of stay and
# once for the death flag; only the p-values are kept.
edtime_groups = [members for _, members in df.groupby('EDTimeGroup')]
hosp_time_samples = [members['HospTime'] for members in edtime_groups]
death_samples = [members['death'] for members in edtime_groups]
hosp_time_p = stats.kruskal(*hosp_time_samples).pvalue
mortality_p = stats.kruskal(*death_samples).pvalue

In [None]:
# Second source table; it supplies `deathtime` and `intime` for the 28-day mortality
# flag and is joined back to `df` on subject_id / stay_id / hadm_id.
df_org = pd.read_csv('sepsis.csv')

In [None]:
from datetime import datetime, timedelta

# Flag = 1 when the recorded death time is at most 28 days after `intime`.
# A missing death time gives a missing difference, which fails the comparison -> 0.
death_time = pd.to_datetime(df_org['deathtime'])
in_time = pd.to_datetime(df_org['intime'])
time_to_death = death_time - in_time
df_org['28_day_mortality'] = (time_to_death <= timedelta(days=28)).astype(int)

In [None]:
# Merge df and df_org on subject_id, stay_id, and hadm_id
merged_df = pd.merge(df, df_org[['subject_id', 'stay_id', 'hadm_id', '28_day_mortality']], 
                      on=['subject_id', 'stay_id', 'hadm_id'], 
                      how='left')
# Rename the 28_day_mortality column to a more descriptive name
merged_df = merged_df.rename(columns={'28_day_mortality': 'mortality_28d'})

In [None]:
# Same table as for in-hospital death, but for the 28-day flag: counts of 0 / 1 per
# ED-time group, the percentage of 1s, and a Kruskal-Wallis p-value across the groups.
tally_28d = merged_df.groupby('EDTimeGroup')['mortality_28d'].value_counts()
death_counts_28d = tally_28d.unstack().fillna(0)
group_size_28d = death_counts_28d[0] + death_counts_28d[1]
death_counts_28d['% Mortality'] = death_counts_28d[1] / group_size_28d * 100

samples_28d = [members['mortality_28d'] for _, members in merged_df.groupby('EDTimeGroup')]
death_counts_28d_p = stats.kruskal(*samples_28d).pvalue

In [None]:
# Print results: group sizes, then one section per outcome. Each section shows the
# per-group table followed by the bare Kruskal-Wallis p-value for that outcome.
print('Table 2 Outcomes of patients according to length of stay in ED')
# Group sizes, looked up by the EDTimeGroup labels.
print(f'{edtime_counts["<6"]} patients had EDTime < 6 hours')
print(f'{edtime_counts["6-12"]} patients had 6-12 hours of EDTime')
print(f'{edtime_counts["12-24"]} patients had 12-24 hours of EDTime')
print(f'{edtime_counts[">24"]} patients had EDTime > 24 hours\n')
print('----------------------')
# Hospital length of stay: median and 25th / 75th percentiles per group.
print('Hospital LOS (median [IQR])')
print(los_stats.to_string())
print(hosp_time_p)
print('----------------------')
# Death flag: counts of 0 / 1 and percentage of deaths per group.
print('\nMortality, n (%)')
print(death_counts.to_string())
print(mortality_p)
print('----------------------')
# 28-day mortality flag: same layout as the section above.
print('\n28_day_mortality, n (%)')
print(death_counts_28d.to_string())
print(death_counts_28d_p)