In [30]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from IPython.display import display
from tableone import TableOne
from scipy.stats import chi2_contingency, kruskal

# Render floats in displayed DataFrames with 3 digits of precision.
pd.set_option('display.precision', 3)

## Load useful DataFrames

In [31]:
# Collect the first ICU stay id of each adult patient into a plain list
# (used later to restrict the AKI table with `.isin(icu)`).
df = pd.read_csv('icu_first_18.csv')
icu = list(df['icustay_id'])

In [32]:
# Admission demographics, indexed by (subject_id, hadm_id)
df = pd.read_csv('adm_demographics.csv')
df = df.set_index(['subject_id', 'hadm_id'])
df_demographics = df.filter(['age', 'gender', 'ethnicity'])

# Admission demographics for adjustments: age plus one indicator column per
# gender / ethnicity level.
# The dummy dtype is pinned to uint8 (0/1). Newer pandas releases default to
# boolean dummies, which combined with the float `age` column yield an
# object-dtype design matrix that statsmodels refuses to fit.
df_demo_adj = pd.concat([df['age'],
                         pd.get_dummies(df['gender'], dtype=np.uint8),
                         pd.get_dummies(df['ethnicity'], dtype=np.uint8)], axis=1)
# Drop one level per categorical ('M' for gender, 'OTHER' for ethnicity) so
# the remaining indicators are interpreted relative to those levels.
df_demo_adj = df_demo_adj.drop(['M', 'OTHER'], axis=1)

In [33]:
# Average daily chloride load over the first 7 ICU days.
# Per ICU stay: total recorded chloride input on days <= 7, divided by the
# highest icu_day (<= 7) that has a recorded input. Note the denominator is the
# max day number, not the count of days with records.
stay_keys = ['subject_id', 'hadm_id', 'icustay_id']
df = (
    pd.read_csv('chloride.csv')
    .set_index(stay_keys)
    .filter(['icu_day', 'chloride_input_meq'])
    .dropna(subset=['chloride_input_meq'])   # ignore days without a recorded input
    .query('icu_day <= 7')
    .groupby(stay_keys)
    .agg({'icu_day': 'max', 'chloride_input_meq': 'sum'})
)
df['daily_chl_load'] = df['chloride_input_meq'] / df['icu_day']
df_chl_load = df.filter(['daily_chl_load'])

In [34]:
# SOFA score on admission (day 1)
sofa_id_cols = ['subject_id', 'hadm_id', 'icustay_id']
sofa_organ_cols = ['respiration', 'coagulation', 'liver', 'cns', 'renal', 'cardiovascular']

df = pd.read_csv('sofa_pan.csv')
# Discard the first column of the CSV
# (presumably a saved row index -- TODO confirm against the file).
df = df.drop(columns=df.columns[:1])
df = df.query('day == 1')

# Total SOFA score per ICU stay.
df_sofa = df.filter(sofa_id_cols + ['sofa'])

# All six organ sub-scores; missing values are replaced by 0.
df_sofa_sub = df.filter(sofa_id_cols + sofa_organ_cols).fillna(0)

# Same sub-scores with the renal component left out.
non_renal_cols = [col for col in sofa_organ_cols if col != 'renal']
df_sofa_sub_renal = df.filter(sofa_id_cols + non_renal_cols).fillna(0)

In [35]:
# New AKI
df = pd.read_csv('new_aki.csv')
# Outcome flag: 1 when `dif` is negative, 0 otherwise (missing `dif` also maps to 0).
# Vectorized instead of a row-wise apply: same values and int64 dtype, but
# much faster and well-defined on an empty frame.
# NOTE(review): presumably a negative `dif` means the AKI stage worsened after
# the first measurement -- confirm against how new_aki.csv was generated.
df['new'] = (df['dif'] < 0).astype('int64')
df_aki = df.filter(['icustay_id', 'first', 'new'])
# Keep only ICU stays listed in `icu` (first ICU stay of adult patients).
df_aki = df_aki[df_aki['icustay_id'].isin(icu)]

In [36]:
# Patients whose first AKI stage is exactly 0: keep only the stay id and the outcome flag.
df_aki_0 = df_aki.loc[df_aki['first'] == 0, ['icustay_id', 'new']]

In [37]:
# Patients whose first AKI stage is 0, 1 or 2 (patients who already arrive at
# stage 3 are excluded): keep only the stay id and the outcome flag.
df_aki_012 = df_aki.loc[df_aki['first'] < 3, ['icustay_id', 'new']]

## Define useful functions

In [38]:
def run_kruskal(df_neg, df_pos):
    """Show descriptive statistics for two groups and print a Kruskal-Wallis H test.

    The two `describe()` summaries are shown side by side under the column
    labels '(-)' and '(+)', so each input is expected to have a single column.

    @param df_neg: DataFrame containing the negative samples
    @param df_pos: DataFrame containing the positive samples
    """
    described = [group.describe() for group in (df_neg, df_pos)]
    side_by_side = pd.concat(described, axis=1)
    side_by_side.columns = ['(-)', '(+)']
    display(side_by_side)

    h_test = kruskal(df_neg, df_pos)
    print(h_test)

def run_chi2(index, columns):
    """Cross-tabulate two variables and print a chi-square test of independence.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    print("Chi-square test of independence with Yates' continuity correction")
    contingency = pd.crosstab(index, columns)
    display(contingency)

    # chi2_contingency is called with its default correction=True (Yates' correction).
    statistic, p_value, dof, _expected = chi2_contingency(contingency)
    report = (
        ("X-squared = %s", statistic),
        ("dof       = %s", dof),
        ("p-value   = %s", p_value),
    )
    for template, value in report:
        print(template % value)

def run_glm(df):
    """Fit a binomial GLM (logistic regression) and display its odds ratios.

    The last column of `df` is the outcome; every other column is a predictor.
    An intercept is added automatically. The statsmodels summary is printed,
    then a table of odds ratios with their 95% confidence bounds
    (exponentiated coefficients, intercept excluded) is displayed.

    @param df: DataFrame with columns [exog1, exog2, ..., endog]
    """
    endog = df[df.columns[-1]]
    # Cast predictors to float so that boolean indicator columns (the default
    # get_dummies output on newer pandas) do not produce an object-dtype
    # design matrix, which statsmodels rejects.
    predictors = df[df.columns[:-1]].astype(float)
    exog = sm.add_constant(predictors)
    glm = sm.GLM(endog, exog, family=sm.families.Binomial())  # == sm.Logit(endog, exog)
    result = glm.fit()
    print(result.summary())

    odds = pd.concat([result.params, result.conf_int()], axis=1)
    # Remove the intercept by label rather than by position: add_constant()
    # does not add a 'const' column when the data already contain a constant
    # column, in which case slicing off the first row would silently discard
    # a real predictor.
    odds = odds.drop('const', errors='ignore')
    odds.columns = ['Odds Ratio', '2.5%', '97.5%']
    display(np.exp(odds))
    

# Daily Chloride Load and New AKI in patients whose first AKI stage is 0

In [39]:
# Chloride load + total SOFA + demographic adjusters + outcome for patients
# whose first AKI stage is 0. All joins are inner joins (merge default); the
# outcome column `new` ends up last, as run_glm() expects.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_0, on=['icustay_id'])
)

In [40]:
# Compare the daily chloride load between stays without (new == 0) and with
# (new == 1) the new-AKI flag.
chl_without_new_aki = df_combined.loc[df_combined['new'] == 0, ['daily_chl_load']]
chl_with_new_aki = df_combined.loc[df_combined['new'] == 1, ['daily_chl_load']]
run_kruskal(chl_without_new_aki, chl_with_new_aki)

Unnamed: 0,(-),(+)
count,11032.0,5939.0
mean,198.546,206.544
std,158.862,162.206
min,0.036,1.485
25%,94.016,109.642
50%,168.061,181.429
75%,261.8,268.092
max,3150.2,5066.143


KruskalResult(statistic=38.002374032398414, pvalue=7.065860191745454e-10)


## Adjust with age, race, gender, _SOFA_

In [41]:
# Move the identifier columns into the index so that run_glm() only sees the
# predictors followed by the outcome column.
# Guarded so that re-running this cell is harmless: without the check a second
# execution raises KeyError because the id columns are already in the index.
glm_id_keys = ['subject_id', 'hadm_id', 'icustay_id']
if list(df_combined.index.names) != glm_id_keys:
    df_combined = df_combined.set_index(glm_id_keys)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                16971
Model:                            GLM   Df Residuals:                    16962
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -10784.
Date:                Thu, 11 Apr 2019   Deviance:                       21568.
Time:                        12:19:30   Pearson chi2:                 1.70e+04
No. Iterations:                     5                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.8327      0.077    -10.797      0.000      -0.984      -0.682
daily_chl_load  4.055e-05      0.000    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
sofa,1.145,1.128,1.161
age,1.001,0.999,1.003
F,0.984,0.922,1.05
ASIAN,1.003,0.821,1.224
BLACK,0.745,0.646,0.859
HISPANIC,0.848,0.708,1.016
WHITE,0.776,0.706,0.853


## Adjust with age, race, gender, _SOFA subscore_

In [42]:
# Chloride load + SOFA organ sub-scores + demographic adjusters + outcome for
# patients whose first AKI stage is 0. All joins are inner joins (merge
# default); the outcome column `new` ends up last, as run_glm() expects.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_0, on=['icustay_id'])
)

In [43]:
# Move the identifier columns into the index so that run_glm() only sees the
# predictors followed by the outcome column.
# Guarded so that re-running this cell is harmless: without the check a second
# execution raises KeyError because the id columns are already in the index.
glm_id_keys = ['subject_id', 'hadm_id', 'icustay_id']
if list(df_combined.index.names) != glm_id_keys:
    df_combined = df_combined.set_index(glm_id_keys)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                16971
Model:                            GLM   Df Residuals:                    16957
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -10636.
Date:                Thu, 11 Apr 2019   Deviance:                       21272.
Time:                        12:19:30   Pearson chi2:                 1.70e+04
No. Iterations:                     5                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.8648      0.078    -11.058      0.000      -1.018      -0.711
daily_chl_load -1.982e-05      0.000    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.28,1.242,1.32
coagulation,1.214,1.162,1.268
liver,1.009,0.953,1.069
cns,0.974,0.94,1.009
renal,0.81,0.754,0.869
cardiovascular,1.229,1.185,1.275
age,1.001,0.999,1.003
F,0.965,0.904,1.031
ASIAN,1.062,0.868,1.299


## Adjust with age, race, gender, _SOFA subscore without renal_

In [44]:
# Chloride load + non-renal SOFA sub-scores + demographic adjusters + outcome
# for patients whose first AKI stage is 0, indexed by the identifier columns
# so that run_glm() only sees predictors followed by the outcome.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub_renal, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_0, on=['icustay_id'])
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                16971
Model:                            GLM   Df Residuals:                    16958
Model Family:                Binomial   Df Model:                           12
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -10654.
Date:                Thu, 11 Apr 2019   Deviance:                       21309.
Time:                        12:19:30   Pearson chi2:                 1.70e+04
No. Iterations:                     5                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.8706      0.078    -11.148      0.000      -1.024      -0.718
daily_chl_load -2.058e-05      0.000    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.281,1.243,1.321
coagulation,1.212,1.16,1.266
liver,1.002,0.947,1.061
cns,0.975,0.941,1.01
cardiovascular,1.223,1.179,1.269
age,1.001,0.999,1.002
F,0.984,0.921,1.051
ASIAN,1.061,0.867,1.298
BLACK,0.802,0.694,0.927


# Daily Chloride Load and New AKI, excluding patients whose first AKI stage is 3

In [45]:
# Chloride load + total SOFA + demographic adjusters + outcome for patients
# whose first AKI stage is below 3. All joins are inner joins (merge default);
# the outcome column `new` ends up last, as run_glm() expects.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_012, on=['icustay_id'])
)

In [46]:
# Compare the daily chloride load between stays without (new == 0) and with
# (new == 1) the new-AKI flag.
chl_without_new_aki = df_combined.loc[df_combined['new'] == 0, ['daily_chl_load']]
chl_with_new_aki = df_combined.loc[df_combined['new'] == 1, ['daily_chl_load']]
run_kruskal(chl_without_new_aki, chl_with_new_aki)

Unnamed: 0,(-),(+)
count,16374.0,16671.0
mean,194.77,188.681
std,162.946,145.931
min,0.036,0.257
25%,86.625,91.236
50%,162.669,163.5
75%,258.104,251.805
max,3150.2,5066.143


KruskalResult(statistic=0.2516365332061708, pvalue=0.6159250942448803)


## Adjust with age, race, gender, _SOFA_

In [47]:
# Move the identifier columns into the index so that run_glm() only sees the
# predictors followed by the outcome column.
# Guarded so that re-running this cell is harmless: without the check a second
# execution raises KeyError because the id columns are already in the index.
glm_id_keys = ['subject_id', 'hadm_id', 'icustay_id']
if list(df_combined.index.names) != glm_id_keys:
    df_combined = df_combined.set_index(glm_id_keys)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                33045
Model:                            GLM   Df Residuals:                    33036
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -22570.
Date:                Thu, 11 Apr 2019   Deviance:                       45139.
Time:                        12:19:31   Pearson chi2:                 3.31e+04
No. Iterations:                     4                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.3363      0.056     -6.020      0.000      -0.446      -0.227
daily_chl_load    -0.0004   7.46e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,0.999,1.0
sofa,1.087,1.078,1.097
age,1.006,1.005,1.008
F,0.947,0.906,0.99
ASIAN,0.774,0.665,0.902
BLACK,0.683,0.619,0.753
HISPANIC,0.726,0.635,0.829
WHITE,0.754,0.706,0.804


## Adjust with age, race, gender, _SOFA subscore_

In [48]:
# Chloride load + SOFA organ sub-scores + demographic adjusters + outcome for
# patients whose first AKI stage is below 3. All joins are inner joins (merge
# default); the outcome column `new` ends up last, as run_glm() expects.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_012, on=['icustay_id'])
)

In [49]:
# Move the identifier columns into the index so that run_glm() only sees the
# predictors followed by the outcome column.
# Guarded so that re-running this cell is harmless: without the check a second
# execution raises KeyError because the id columns are already in the index.
glm_id_keys = ['subject_id', 'hadm_id', 'icustay_id']
if list(df_combined.index.names) != glm_id_keys:
    df_combined = df_combined.set_index(glm_id_keys)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                33045
Model:                            GLM   Df Residuals:                    33031
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -22330.
Date:                Thu, 11 Apr 2019   Deviance:                       44660.
Time:                        12:19:31   Pearson chi2:                 3.31e+04
No. Iterations:                     4                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.4369      0.057     -7.714      0.000      -0.548      -0.326
daily_chl_load    -0.0004   7.59e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,0.999,1.0
respiration,1.206,1.183,1.229
coagulation,1.163,1.129,1.199
liver,1.006,0.97,1.044
cns,1.004,0.981,1.027
renal,0.844,0.818,0.87
cardiovascular,1.122,1.096,1.149
age,1.008,1.007,1.009
F,0.921,0.88,0.963
ASIAN,0.823,0.706,0.96


## Adjust with age, race, gender, _SOFA subscore without renal_

In [50]:
# Chloride load + non-renal SOFA sub-scores + demographic adjusters + outcome
# for patients whose first AKI stage is below 3, indexed by the identifier
# columns so that run_glm() only sees predictors followed by the outcome.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub_renal, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_012, on=['icustay_id'])
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                33045
Model:                            GLM   Df Residuals:                    33032
Model Family:                Binomial   Df Model:                           12
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -22387.
Date:                Thu, 11 Apr 2019   Deviance:                       44775.
Time:                        12:19:32   Pearson chi2:                 3.31e+04
No. Iterations:                     4                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.4041      0.056     -7.157      0.000      -0.515      -0.293
daily_chl_load    -0.0004   7.57e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,0.999,1.0
respiration,1.206,1.184,1.23
coagulation,1.159,1.125,1.195
liver,0.989,0.953,1.025
cns,1.004,0.981,1.027
cardiovascular,1.104,1.079,1.129
age,1.007,1.005,1.008
F,0.947,0.906,0.991
ASIAN,0.82,0.703,0.956
BLACK,0.739,0.67,0.815
