In [68]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from IPython.display import display
from scipy.stats import chi2_contingency, kruskal

pd.options.display.precision = 3

## Load useful DataFrames

In [69]:
# Chloride load: keep rows that have a chloride input value and icu_day <= 3
df = pd.read_csv('chloride.csv')
df = (
    df.set_index(['subject_id', 'hadm_id', 'icustay_id'])
      .filter(['icu_day', 'chloride_input_meq'])
      .dropna(subset=['chloride_input_meq'])
      .query('icu_day <= 3')
)

# Mean chloride load over those rows, one row per
# (subject_id, hadm_id, icustay_id) combination
df_chloride = (
    df.filter(['chloride_input_meq'])
      .groupby(['subject_id', 'hadm_id', 'icustay_id'])
      .mean()
)

# Total SOFA score restricted to rows where day == 1 (admission day)
df = pd.read_csv('sofa_pan.csv').set_index(['subject_id', 'hadm_id', 'icustay_id'])
df = df.filter(['day', 'sofa']).query('day == 1')
df_sofa = df.filter(['sofa'])  # drop the helper 'day' column

# Demographics (ethnicity, gender, age) indexed by subject / admission / ICU-stay ids
df = pd.read_csv('patient_info.csv').set_index(['subject_id', 'hadm_id', 'icustay_id'])
df_dem = df.filter(['ethnicity', 'gender', 'age'])

# ICU mortality flag plus ICU length of stay (LOS) in whole days
df = pd.read_csv('icu_mort.csv').set_index(['subject_id', 'hadm_id', 'icustay_id'])
df = (
    df.filter(['intime', 'outtime', 'in_icu_mort'])
      .dropna(subset=['intime', 'outtime'])  # LOS needs both timestamps
)
icu_duration = pd.to_datetime(df['outtime']) - pd.to_datetime(df['intime'])
df = df.assign(LOS=icu_duration.dt.days)  # .dt.days keeps only the whole-day part
# LOS is kept here because the MODS definition below needs it
df_mort = df.filter(['LOS', 'in_icu_mort'])

# MODS on day 7.
# A stay is flagged (mods = 1) when, on the day-7 SOFA row, more than one organ
# subscore is above 1, OR when the patient died in the ICU with LOS < 7 days.
# Only stays that have a day-7 SOFA row and a row in df_mort appear in df_mods.
sofa_subscores = ['cardiovascular', 'cns', 'coagulation', 'liver', 'renal', 'respiration']
df = pd.read_csv('sofa_pan.csv').set_index(['subject_id', 'hadm_id', 'icustay_id'])
df = df.filter(['day'] + sofa_subscores).query('day == 7')

# Number of organ systems whose subscore exceeds 1 (missing subscores count as 0)
df = df.assign(od_sum=df[sofa_subscores].gt(1).sum(axis=1))
df = df.filter(['od_sum']).join(df_mort, how='inner')

multi_organ_dysfunction = df['od_sum'] > 1
died_before_day_7 = (df['LOS'] < 7) & (df['in_icu_mort'] == 1)
df = df.assign(mods=(multi_organ_dysfunction | died_before_day_7).astype(int))
df_mods = df.filter(['mods'])

# LOS is no longer needed; keep only the mortality flag for the analyses below
df_mort = df_mort.filter(['in_icu_mort'])

# New AKI: binary flag 'new' is 1 where 'label' is negative, 0 otherwise
# (a missing label compares False and therefore maps to 0).
# NOTE(review): assumes a negative 'label' encodes new-onset AKI — confirm
# against the definition used to build new_aki.csv.
df = pd.read_csv('new_aki.csv')
df['new'] = (df['label'] < 0).astype(int)  # vectorised; also safe on an empty frame
df_aki = df.filter(['icustay_id', 'new'])

## Define useful functions

In [70]:
def run_kruskal(df_neg, df_pos):
    """Display summary statistics and a Kruskal-Wallis H test for two groups.

    The `describe()` tables of both groups are shown side by side under the
    column labels '(-)' and '(+)', so each argument must contain exactly one
    column. The test result is printed; nothing is returned.

    @param df_neg: Single-column DataFrame containing the negative samples
    @param df_pos: Single-column DataFrame containing the positive samples
    """
    summary = pd.concat([df_neg.describe(), df_pos.describe()], axis=1)
    summary.columns = ['(-)', '(+)']
    display(summary)
    # Flatten to 1-D before testing. SciPy versions whose kruskal accepts an
    # `axis` argument treat an (n, 1) frame as column-wise input and return
    # length-1 arrays instead of scalars; 1-D input is unambiguous everywhere.
    samples_neg = np.asarray(df_neg).ravel()
    samples_pos = np.asarray(df_pos).ravel()
    print(kruskal(samples_neg, samples_pos))

def run_chi2(index, columns):
    """Cross-tabulate two variables and print a chi-square independence test.

    The contingency table is displayed first, followed by the test statistic,
    the degrees of freedom and the p-value. Nothing is returned.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    print ("Chi-square test of independence with Yates' continuity correction")
    contingency = pd.crosstab(index, columns)
    display(contingency)
    # chi2_contingency is called with its default correction setting
    # (correction=True, i.e. Yates' correction); expected counts are unused.
    statistic, p_value, deg_freedom, _ = chi2_contingency(contingency)
    print ("X-squared = %s" % statistic)
    print ("dof       = %s" % deg_freedom)
    print ("p-value   = %s" % p_value)

def run_glm(df):
    """Fit a binomial GLM and display the odds ratios of its predictors.

    The last column of `df` is the outcome; every other column is a predictor.
    An intercept column is added via `sm.add_constant`. The full model summary
    is printed, then a table of exponentiated coefficients with their
    confidence bounds is displayed. Nothing is returned.

    @param df: DataFrame with columns [exog1, exog2, ..., endog]
    """
    outcome_name = df.columns[-1]
    predictor_names = df.columns[:-1]

    design = sm.add_constant(df[predictor_names])
    # Binomial family: same model as sm.Logit(endog, exog)
    model = sm.GLM(df[outcome_name], design, family=sm.families.Binomial())
    result = model.fit()
    print (result.summary())

    coef_table = pd.concat([result.params, result.conf_int()], axis=1)
    coef_table = coef_table.iloc[1:]  # drop the first row (the added constant)
    coef_table.columns = ['Odds Ratio', '2.5%', '97.5%']
    # Exponentiate log-odds coefficients and bounds to get odds ratios
    display(np.exp(coef_table))
    

# 2a(1): Chloride input amount and Mortality

In [71]:
# Chloride load + day-1 SOFA + ICU mortality. Inner joins keep only stays
# present in all three frames. in_icu_mort must stay the last column because
# run_glm treats the last column as the outcome.
df_combined = df_chloride.join(df_sofa, how='inner')
df_combined = df_combined.join(df_mort, how='inner')

## - Univariate association

In [72]:
# Chloride load for in_icu_mort == 0 (-) versus in_icu_mort == 1 (+)
chloride_neg = df_combined.query('in_icu_mort == 0').filter(['chloride_input_meq'])
chloride_pos = df_combined.query('in_icu_mort == 1').filter(['chloride_input_meq'])
run_kruskal(chloride_neg, chloride_pos)

Unnamed: 0,(-),(+)
count,41688.0,3572.0
mean,205.583,286.807
std,166.307,277.53
min,0.036,0.051
25%,89.041,100.829
50%,172.608,214.369
75%,279.972,382.966
max,6742.801,3893.103


KruskalResult(statistic=247.5003629164961, pvalue=9.107388395717572e-56)


## - Independent association adjusted for day 1 SOFA

In [73]:
# Binomial GLM of in_icu_mort (last column) on chloride load and day-1 SOFA
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:            in_icu_mort   No. Observations:                45260
Model:                            GLM   Df Residuals:                    45257
Model Family:                Binomial   Df Model:                            2
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -10965.
Date:                Wed, 13 Mar 2019   Deviance:                       21930.
Time:                        14:37:50   Pearson chi2:                 4.66e+04
No. Iterations:                     6                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -3.9643      0.039   -102.167      0.000      -4.040      -3.888
chloride_input_meq     0.001

Unnamed: 0,Odds Ratio,2.5%,97.5%
chloride_input_meq,1.001,1.001,1.001
sofa,1.293,1.28,1.306


# 2a(2): Chloride input amount and MODS on Day 7

In [74]:
# Left join keeps every stay that has chloride + day-1 SOFA data; stays with
# no row in df_mods are treated as mods = 0.
df_combined = (
    df_chloride
    .join(df_sofa, how='inner')
    .join(df_mods, how='left')
    .fillna({'mods': 0})
)

## - Univariate association

In [75]:
# Chloride load for mods == 0 (-) versus mods == 1 (+)
chloride_neg = df_combined.query('mods == 0').filter(['chloride_input_meq'])
chloride_pos = df_combined.query('mods == 1').filter(['chloride_input_meq'])
run_kruskal(chloride_neg, chloride_pos)

Unnamed: 0,(-),(+)
count,46526.0,1843.0
mean,207.16,278.512
std,176.135,230.175
min,0.031,0.885
25%,86.24,106.553
50%,171.135,217.413
75%,280.944,389.504
max,6742.801,1997.461


KruskalResult(statistic=163.34662172803073, pvalue=2.1013458664139814e-37)


## - Independent association adjusted for day 1 SOFA

In [76]:
# Binomial GLM of mods (last column) on chloride load and day-1 SOFA
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                   mods   No. Observations:                48369
Model:                            GLM   Df Residuals:                    48366
Model Family:                Binomial   Df Model:                            2
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -6775.2
Date:                Wed, 13 Mar 2019   Deviance:                       13550.
Time:                        14:37:52   Pearson chi2:                 4.35e+04
No. Iterations:                     7                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -4.8340      0.052    -93.304      0.000      -4.936      -4.732
chloride_input_meq     0.000

Unnamed: 0,Odds Ratio,2.5%,97.5%
chloride_input_meq,1.0,1.0,1.001
sofa,1.321,1.305,1.338


# 2a(3): Chloride input amount and NEW AKI

In [77]:
# df_aki is keyed by a plain icustay_id column, so the index is flattened for
# the merge and restored afterwards. 'new' ends up as the last column, which
# run_glm uses as the outcome.
chloride_sofa = df_chloride.join(df_sofa, how='inner').reset_index()
df_combined = (
    chloride_sofa
    .merge(df_aki, on='icustay_id')
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)

## - Univariate association

In [78]:
# Chloride load for new == 0 (-) versus new == 1 (+)
chloride_neg = df_combined.query('new == 0').filter(['chloride_input_meq'])
chloride_pos = df_combined.query('new == 1').filter(['chloride_input_meq'])
run_kruskal(chloride_neg, chloride_pos)

Unnamed: 0,(-),(+)
count,16819.0,31550.0
mean,199.663,215.325
std,175.412,180.67
min,0.031,0.282
25%,81.985,89.795
50%,162.69,178.13
75%,267.193,292.685
max,3150.2,6742.801


KruskalResult(statistic=119.95090365136603, pvalue=6.484588899544113e-28)


## - Independent association adjusted for day 1 SOFA

In [79]:
# Binomial GLM of new (last column) on chloride load and day-1 SOFA
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                48369
Model:                            GLM   Df Residuals:                    48366
Model Family:                Binomial   Df Model:                            2
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -29872.
Date:                Wed, 13 Mar 2019   Deviance:                       59743.
Time:                        14:37:55   Pearson chi2:                 4.92e+04
No. Iterations:                     4                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -0.0617      0.019     -3.203      0.001      -0.100      -0.024
chloride_input_meq  6.208e-0

Unnamed: 0,Odds Ratio,2.5%,97.5%
chloride_input_meq,1.0,1.0,1.0
sofa,1.205,1.196,1.214
