In [54]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from IPython.display import display
from tableone import TableOne
from scipy.stats import chi2_contingency, kruskal

# Render floats in displayed DataFrames with 3 digits after the decimal point.
pd.set_option('display.precision', 3)

## Load useful DataFrames

In [55]:
# First ICU stay id of each adult patient; used further down to restrict the
# AKI table to this cohort.
df = pd.read_csv('icu_first_18.csv')
icu = df['icustay_id'].tolist()

In [56]:
# Admission demographics, keyed by (subject_id, hadm_id).
df = pd.read_csv('adm_demographics.csv')
df = df.set_index(['subject_id', 'hadm_id'])
df_demographics = df.filter(['age', 'gender', 'ethnicity'])

# Admission demographics for adjustments: age plus one indicator column per
# gender and ethnicity value.
# The indicator dtype is pinned to uint8 (the historical get_dummies default):
# newer pandas emits bool indicators, and a frame mixing bool with float columns
# is cast to object when handed to statsmodels, which then refuses to fit.
df_demo_adj = pd.concat([df['age'],
                         pd.get_dummies(df['gender'], dtype=np.uint8),
                         pd.get_dummies(df['ethnicity'], dtype=np.uint8)], axis=1)
# Dropping 'M' and 'OTHER' leaves them as the reference categories.
df_demo_adj = df_demo_adj.drop(['M', 'OTHER'], axis=1)

In [82]:
# Average daily chloride load of first 7 days:
# total chloride input over ICU days 1..7 divided by the last ICU day that has
# a recorded (non-missing) chloride input within that window.
df = (
    pd.read_csv('chloride.csv')
      .set_index(['subject_id', 'hadm_id', 'icustay_id'])
      .filter(['icu_day', 'chloride_input_meq'])
      .dropna(subset=['chloride_input_meq'])
      .query('icu_day <= 7')
      .groupby(['subject_id', 'hadm_id', 'icustay_id'])
      .agg({'icu_day': 'max', 'chloride_input_meq': 'sum'})
)
df['daily_chl_load'] = df['chloride_input_meq'].div(df['icu_day'])
df_chl_load = df.filter(['daily_chl_load'])

In [89]:
# SOFA score on admission (day 1)
df = pd.read_csv('sofa_pan.csv')
df = df.drop(columns=df.columns[[0]])  # discard the first column of the file
df = df[df['day'] == 1]

# Total score per ICU stay.
df_sofa = df.filter(['subject_id', 'hadm_id', 'icustay_id', 'sofa'])

# Per-component sub-scores; a missing sub-score is treated as 0.
df_sofa_sub = df.filter(['subject_id', 'hadm_id', 'icustay_id',
                         'respiration', 'coagulation', 'liver',
                         'cns', 'renal', 'cardiovascular']).fillna(0)

In [90]:
# New AKI
# A stay is flagged as new AKI (new == 1) when its `difference` value is negative.
# NOTE(review): what `difference` measures is defined by new_aki.csv, not here.
df = pd.read_csv('new_aki.csv')
# Vectorised comparison instead of a row-wise apply: same 0/1 values (a missing
# `difference` compares False and is flagged 0), without a Python call per row,
# and it still yields a proper column when the file has no rows.
df['new'] = (df['difference'] < 0).astype('int64')
df_aki = df.filter(['icustay_id', 'new'])
# Restrict to the cohort of first ICU stays collected in `icu`.
df_aki = df_aki[df_aki['icustay_id'].isin(icu)]

## Define useful functions

In [91]:
def run_kruskal(df_neg, df_pos):
    """Display results of a Kruskal-Wallis H test.

    Shows the ``describe()`` summaries of both groups side by side (columns
    labelled '(-)' and '(+)'), then prints the test result. Nothing is returned.

    @param df_neg: DataFrame containing the negative samples (one column)
    @param df_pos: DataFrame containing the positive samples (one column)
    """
    summary = pd.concat([df_neg.describe(), df_pos.describe()], axis=1)
    summary.columns = ['(-)', '(+)']
    display(summary)
    # kruskal expects 1-D samples. A single-column DataFrame is 2-D (n, 1), which
    # depending on the SciPy version is rejected or produces array-valued
    # results, so flatten each group before testing.
    sample_neg = np.asarray(df_neg).ravel()
    sample_pos = np.asarray(df_pos).ravel()
    print(kruskal(sample_neg, sample_pos))

def run_chi2(index, columns):
    """Cross-tabulate two groupings and report a chi-square independence test.

    The contingency table is displayed, followed by the test statistic, the
    degrees of freedom and the p-value.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    print("Chi-square test of independence with Yates' continuity correction")
    contingency = pd.crosstab(index, columns)
    display(contingency)
    # correction=True is the library default (Yates' correction), spelled out here.
    outcome = chi2_contingency(contingency, correction=True)
    statistic, p_value, degrees = outcome[0], outcome[1], outcome[2]
    print("X-squared = %s" % statistic)
    print("dof       = %s" % degrees)
    print("p-value   = %s" % p_value)

def run_glm(df):
    """Display results (odds ratios) of a GLM regression.

    Fits a binomial-family GLM of the last column on all preceding columns
    plus an intercept, prints the model summary, then displays the
    exponentiated coefficients and confidence bounds for every covariate
    (intercept excluded).

    @param df: DataFrame with columns [exog1, exog2, ..., endog]
    """
    endog = df[df.columns[-1]]
    # Cast covariates to float: a frame mixing bool indicator columns with float
    # columns would otherwise reach statsmodels as an object array and be rejected.
    exog = sm.add_constant(df[df.columns[:-1]].astype(float))
    glm = sm.GLM(endog, exog, family=sm.families.Binomial())  # == sm.Logit(endog, exog)
    result = glm.fit()
    print(result.summary())

    odds = pd.concat([result.params, result.conf_int()], axis=1)
    # Remove the intercept by label, not by position: add_constant does not add a
    # 'const' column when the data already holds a constant column, in which case
    # slicing off the first row would silently discard a real covariate.
    odds = odds.drop('const', errors='ignore')
    odds.columns = ['Odds Ratio', '2.5%', '97.5%']
    display(np.exp(odds))
    

# Daily Chloride Load and New AKI

In [92]:
# Join chloride load, admission SOFA total and demographic covariates, then
# attach the new-AKI flag last so that 'new' ends up as the final column.
df_combined = (
    df_chl_load.reset_index()
               .merge(df_sofa, on=['subject_id', 'hadm_id', 'icustay_id'])
               .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
               .merge(df_aki, on=['icustay_id'])
)

In [93]:
# Compare the daily chloride load between stays without (0) and with (1) new AKI.
chl_load_no_aki = df_combined.loc[df_combined['new'] == 0].filter(['daily_chl_load'])
chl_load_new_aki = df_combined.loc[df_combined['new'] == 1].filter(['daily_chl_load'])
run_kruskal(chl_load_no_aki, chl_load_new_aki)

Unnamed: 0,(-),(+)
count,25573.0,18652.0
mean,192.793,189.362
std,172.423,144.791
min,0.036,0.257
25%,77.0,92.633
50%,154.767,164.247
75%,259.105,251.496
max,3150.2,5066.143


KruskalResult(statistic=45.16459025253761, pvalue=1.8115105240429043e-11)


## Adjust with age, race, gender, _SOFA_

In [94]:
# Move the id columns into the index so only covariates and the outcome remain
# as columns. The indexed frame is passed straight to run_glm instead of being
# bound back to df_combined, so re-running this cell does not raise a KeyError
# from the id columns already living in the index.
run_glm(df_combined.set_index(['subject_id', 'hadm_id', 'icustay_id']))

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                44225
Model:                            GLM   Df Residuals:                    44216
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -29970.
Date:                Mon, 08 Apr 2019   Deviance:                       59941.
Time:                        15:07:18   Pearson chi2:                 4.42e+04
No. Iterations:                     4                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.2236      0.049     -4.596      0.000      -0.319      -0.128
daily_chl_load -8.473e-05   6.13e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
sofa,0.989,0.983,0.995
age,1.004,1.003,1.005
F,0.934,0.899,0.971
ASIAN,0.79,0.691,0.903
BLACK,0.564,0.519,0.613
HISPANIC,0.684,0.609,0.769
WHITE,0.765,0.724,0.809


## Adjust with age, race, gender, _SOFA subscore_

In [95]:
# Same model as before, but adjusting for the individual SOFA sub-scores instead
# of the total score. The new-AKI flag is merged last so that 'new' is the final
# column, and the id columns are moved into the index before fitting.
df_combined = (
    df_chl_load.reset_index()
               .merge(df_sofa_sub, on=['subject_id', 'hadm_id', 'icustay_id'])
               .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
               .merge(df_aki, on=['icustay_id'])
               .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                44225
Model:                            GLM   Df Residuals:                    44211
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -27770.
Date:                Mon, 08 Apr 2019   Deviance:                       55540.
Time:                        15:08:06   Pearson chi2:                 4.33e+04
No. Iterations:                     4                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.4714      0.051     -9.156      0.000      -0.572      -0.371
daily_chl_load    -0.0007    6.9e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,0.999,0.999,0.999
respiration,1.256,1.235,1.277
coagulation,1.148,1.117,1.179
liver,1.015,0.984,1.048
cns,0.976,0.955,0.996
renal,0.546,0.534,0.559
cardiovascular,1.165,1.141,1.189
age,1.007,1.006,1.008
F,0.891,0.855,0.928
ASIAN,0.918,0.798,1.056
