In [49]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from IPython.display import display
from tableone import TableOne
from scipy.stats import chi2_contingency, kruskal

pd.options.display.precision = 3

## Load useful DataFrames

In [50]:
# First ICU stay id of each adult patient, kept as a plain list of ids.
df = pd.read_csv('icu_first_18.csv')
icu = list(df['icustay_id'])

In [51]:
# Admission demographics, indexed by (subject_id, hadm_id)
df = pd.read_csv('adm_demographics.csv')
df = df.set_index(['subject_id', 'hadm_id'])
df_demographics = df.filter(['age', 'gender', 'ethnicity'])

# Admission demographics for adjustments: age plus indicator columns for
# gender and ethnicity.
# The dummy dtype is pinned to uint8 (the historical get_dummies default).
# Newer pandas releases return bool dummies instead; concatenated with the
# float `age` column those yield an object-typed design matrix, which the
# regression step cannot consume.
df_demo_adj = pd.concat([df['age'],
                         pd.get_dummies(df['gender'], dtype=np.uint8),
                         pd.get_dummies(df['ethnicity'], dtype=np.uint8)], axis=1)
# Drop one level per categorical variable ('M' for gender, 'OTHER' for
# ethnicity) so they act as the reference categories.
df_demo_adj = df_demo_adj.drop(['M', 'OTHER'], axis=1)

In [52]:
# Average daily chloride load of first 7 days
# Per ICU stay: sum the recorded chloride input over rows with icu_day <= 7
# and keep the highest icu_day among those rows.
df = (
    pd.read_csv('chloride.csv')
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
    .filter(['icu_day', 'chloride_input_meq'])
    .dropna(subset=['chloride_input_meq'])
    .query('icu_day <= 7')
    .groupby(['subject_id', 'hadm_id', 'icustay_id'])
    .agg({'icu_day': 'max', 'chloride_input_meq': 'sum'})
)
# Daily load = total chloride / highest icu_day that still has a recorded input.
# NOTE(review): assumes icu_day starts at 1; a value of 0 would divide by zero.
df['daily_chl_load'] = df['chloride_input_meq'].div(df['icu_day'])
df_chl_load = df[['daily_chl_load']]

In [53]:
# SOFA score on admission (day 1)
df = pd.read_csv('sofa_pan.csv')
df = df.iloc[:, 1:]        # discard the leading column of the CSV
df = df[df['day'] == 1]    # keep day-1 rows only

# Total SOFA score per ICU stay
df_sofa = df.filter(items=['subject_id', 'hadm_id', 'icustay_id', 'sofa'])

# All six SOFA component scores; a missing component is counted as 0
df_sofa_sub = df.filter(items=['subject_id', 'hadm_id', 'icustay_id',
                               'respiration', 'coagulation', 'liver',
                               'cns', 'renal', 'cardiovascular']).fillna(0)

# Same components WITHOUT the renal one (despite the variable name)
df_sofa_sub_renal = df.filter(items=['subject_id', 'hadm_id', 'icustay_id',
                                     'respiration', 'coagulation', 'liver',
                                     'cns', 'cardiovascular']).fillna(0)

In [54]:
# New AKI
df = pd.read_csv('new_aki.csv')
# Flag a stay as new AKI (1) when `dif` is negative, otherwise 0. A missing
# `dif` compares False and therefore maps to 0, exactly as the previous
# row-wise lambda did; the vectorised form avoids a Python call per row.
# NOTE(review): the meaning of `dif` is not visible here - presumably a
# difference between AKI stages; confirm against the query that builds the CSV.
df['new'] = (df['dif'] < 0).astype(int)
df_aki = df.filter(['icustay_id','first', 'new'])
# Restrict to the ICU stays listed in `icu`
df_aki = df_aki[df_aki['icustay_id'].isin(icu)]

In [55]:
# New-AKI flag for patients whose first AKI stage is exactly 0
df_aki_0 = df_aki.loc[df_aki['first'] == 0, ['icustay_id', 'new']]

In [56]:
# New-AKI flag for patients whose first AKI stage is 0, 1 or 2
# (patients who already arrive with stage 3 are excluded)
df_aki_012 = df_aki.loc[df_aki['first'] < 3, ['icustay_id', 'new']]

In [57]:
# Inspect the renal SOFA component of patients whose first AKI stage is 0
x = df_aki.merge(df_sofa_sub, on='icustay_id')
x.loc[x['first'] == 0, ['icustay_id', 'first', 'renal']]

Unnamed: 0,icustay_id,first,renal
1,200003,0.0,0.0
5,200010,0.0,0.0
7,200012,0.0,0.0
15,200030,0.0,0.0
16,200033,0.0,0.0
17,200034,0.0,0.0
18,200035,0.0,0.0
23,200041,0.0,0.0
25,200045,0.0,0.0
26,200047,0.0,1.0


## Define useful functions

In [58]:
def run_kruskal(df_neg, df_pos):
    """Display summary statistics and a Kruskal-Wallis H test for two groups.

    The ``describe()`` summaries of both samples are shown side by side under
    the column labels ``(-)`` and ``(+)``; the test result is then printed.

    @param df_neg: DataFrame containing the negative samples (one column)
    @param df_pos: DataFrame containing the positive samples (one column)
    """
    summary = pd.concat([df_neg.describe(), df_pos.describe()], axis=1)
    summary.columns = ['(-)', '(+)']
    display(summary)
    # Hand kruskal flat 1-D samples instead of (n, 1) frames: depending on the
    # SciPy release, 2-D input is either rejected as not one-dimensional or
    # produces length-1 arrays instead of scalar statistic / p-value.
    print (kruskal(np.asarray(df_neg).ravel(), np.asarray(df_pos).ravel()))

def run_chi2(index, columns):
    """Cross-tabulate two variables and report a chi-square independence test.

    The contingency table is displayed first, followed by the test statistic,
    the degrees of freedom and the p-value.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    print ("Chi-square test of independence with Yates' continuity correction")
    contingency = pd.crosstab(index, columns)
    display(contingency)
    # chi2_contingency is called with its default correction=True (Yates'
    # correction); the expected frequencies it also returns are not reported.
    statistic, p_value, deg_freedom, _ = chi2_contingency(contingency)
    print ("X-squared = %s" % statistic)
    print ("dof       = %s" % deg_freedom)
    print ("p-value   = %s" % p_value)

def run_glm(df):
    """Fit a binomial GLM (logistic regression) and display odds ratios.

    Prints the statsmodels fit summary, then displays exp(coefficient)
    together with the bounds of its confidence interval (labelled 2.5% and
    97.5%) for every regressor except the intercept.

    @param df: DataFrame with columns [exog1, exog2, ..., endog]
    """
    endog = df[df.columns[-1]]
    # Cast the regressors to float so that indicator columns stored as bool
    # (the get_dummies default in recent pandas) do not turn the design matrix
    # into an object array.
    exog = sm.add_constant(df[df.columns[:-1]].astype(float))
    glm = sm.GLM(endog, exog, family=sm.families.Binomial())  # == sm.Logit(endog, exog)
    result = glm.fit()
    print (result.summary())

    odds = pd.concat([result.params, result.conf_int()], axis=1)
    # Remove the intercept by label instead of slicing off the first row:
    # add_constant adds no 'const' column when the data already contains a
    # constant, in which case a positional slice would silently discard the
    # first real regressor.
    odds = odds.drop('const', errors='ignore')
    odds.columns = ['Odds Ratio', '2.5%', '97.5%']
    display(np.exp(odds))
    

# Daily Chloride Load and New AKI in patients with first AKI stage 0

In [59]:
# Chloride load + total SOFA + demographic adjusters, joined to the new-AKI
# flag of the first-stage-0 cohort (pandas' default inner joins throughout).
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_0, on=['icustay_id'])
)

In [60]:
# Daily chloride load: patients without new AKI (new == 0) vs. with (new == 1)
run_kruskal(
    df_combined.loc[df_combined['new'] == 0, ['daily_chl_load']],
    df_combined.loc[df_combined['new'] == 1, ['daily_chl_load']],
)

Unnamed: 0,(-),(+)
count,4978.0,7737.0
mean,197.305,209.303
std,171.169,154.24
min,0.036,0.654
25%,90.561,110.623
50%,163.774,184.6
75%,259.032,275.187
max,3893.103,5066.143


KruskalResult(statistic=64.08819516737633, pvalue=1.1897227235024969e-15)


## Adjust with age, race, gender, _SOFA_

In [61]:
# Move the identifier columns into the index so run_glm only sees the
# regressors followed by the outcome column. The guard keeps the cell
# re-runnable: set_index raises KeyError once the ids are already the index.
if list(df_combined.index.names) != ['subject_id', 'hadm_id', 'icustay_id']:
    df_combined = df_combined.set_index(['subject_id','hadm_id','icustay_id'])
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                12715
Model:                            GLM   Df Residuals:                    12706
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -8250.6
Date:                Tue, 16 Apr 2019   Deviance:                       16501.
Time:                        13:42:54   Pearson chi2:                 1.28e+04
No. Iterations:                     4   Covariance Type:             nonrobust
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.3916      0.086     -4.551      0.000      -0.560      -0.223
daily_chl_load     0.0003      0.000    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.001
sofa,1.201,1.179,1.224
age,1.006,1.004,1.008
F,0.896,0.833,0.965
ASIAN,1.262,1.003,1.588
BLACK,1.18,1.004,1.387
HISPANIC,1.184,0.968,1.447
WHITE,0.986,0.886,1.097


## Adjust with age, race, gender, _SOFA subscore_

In [62]:
# Chloride load + all SOFA component scores + demographic adjusters, joined to
# the new-AKI flag of the first-stage-0 cohort (default inner joins).
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_0, on=['icustay_id'])
)

In [63]:
# Move the identifier columns into the index so run_glm only sees the
# regressors followed by the outcome column. The guard keeps the cell
# re-runnable: set_index raises KeyError once the ids are already the index.
if list(df_combined.index.names) != ['subject_id', 'hadm_id', 'icustay_id']:
    df_combined = df_combined.set_index(['subject_id','hadm_id','icustay_id'])
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                12715
Model:                            GLM   Df Residuals:                    12701
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -8161.6
Date:                Tue, 16 Apr 2019   Deviance:                       16323.
Time:                        13:42:54   Pearson chi2:                 1.27e+04
No. Iterations:                     4   Covariance Type:             nonrobust
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.3851      0.087     -4.439      0.000      -0.555      -0.215
daily_chl_load     0.0002      0.000    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.313,1.262,1.366
coagulation,1.19,1.129,1.254
liver,1.048,0.979,1.122
cns,1.124,1.077,1.173
renal,0.764,0.699,0.835
cardiovascular,1.337,1.276,1.401
age,1.005,1.003,1.007
F,0.853,0.791,0.919
ASIAN,1.294,1.027,1.631


## Adjust with age, race, gender, _SOFA subscore without renal_

In [64]:
# Chloride load + non-renal SOFA component scores + demographic adjusters,
# joined to the new-AKI flag of the first-stage-0 cohort (default inner
# joins), with the identifier columns moved into the index before the fit.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub_renal, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_0, on=['icustay_id'])
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                12715
Model:                            GLM   Df Residuals:                    12702
Model Family:                Binomial   Df Model:                           12
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -8179.7
Date:                Tue, 16 Apr 2019   Deviance:                       16359.
Time:                        13:42:54   Pearson chi2:                 1.28e+04
No. Iterations:                     4   Covariance Type:             nonrobust
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.4087      0.087     -4.719      0.000      -0.579      -0.239
daily_chl_load     0.0002      0.000    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.313,1.262,1.366
coagulation,1.187,1.126,1.251
liver,1.04,0.972,1.113
cns,1.125,1.078,1.175
cardiovascular,1.332,1.271,1.395
age,1.005,1.003,1.007
F,0.876,0.813,0.943
ASIAN,1.297,1.029,1.634
BLACK,1.243,1.057,1.463


# Daily Chloride Load and New AKI, excluding patients with first AKI stage 3

In [65]:
# Chloride load + total SOFA + demographic adjusters, joined to the new-AKI
# flag of the first-stage-0/1/2 cohort (pandas' default inner joins).
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_012, on=['icustay_id'])
)

In [66]:
# Daily chloride load: patients without new AKI (new == 0) vs. with (new == 1)
run_kruskal(
    df_combined.loc[df_combined['new'] == 0, ['daily_chl_load']],
    df_combined.loc[df_combined['new'] == 1, ['daily_chl_load']],
)

Unnamed: 0,(-),(+)
count,8379.0,21290.0
mean,189.843,189.849
std,158.65,143.713
min,0.036,0.168
25%,86.252,90.906
50%,159.907,164.564
75%,251.47,254.578
max,3893.103,5066.143


KruskalResult(statistic=6.749168882626833, pvalue=0.009379136435744196)


## Adjust with age, race, gender, _SOFA_

In [67]:
# Move the identifier columns into the index so run_glm only sees the
# regressors followed by the outcome column. The guard keeps the cell
# re-runnable: set_index raises KeyError once the ids are already the index.
if list(df_combined.index.names) != ['subject_id', 'hadm_id', 'icustay_id']:
    df_combined = df_combined.set_index(['subject_id','hadm_id','icustay_id'])
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                29669
Model:                            GLM   Df Residuals:                    29660
Model Family:                Binomial   Df Model:                            8
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -17393.
Date:                Tue, 16 Apr 2019   Deviance:                       34785.
Time:                        13:42:55   Pearson chi2:                 2.98e+04
No. Iterations:                     5   Covariance Type:             nonrobust
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.0096      0.064     -0.151      0.880      -0.134       0.115
daily_chl_load   9.61e-05   9.06e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
sofa,1.077,1.065,1.088
age,1.012,1.01,1.013
F,0.857,0.814,0.903
ASIAN,1.118,0.934,1.339
BLACK,1.138,1.015,1.275
HISPANIC,1.178,1.009,1.374
WHITE,1.005,0.933,1.083


## Adjust with age, race, gender, _SOFA subscore_

In [68]:
# Chloride load + all SOFA component scores + demographic adjusters, joined to
# the new-AKI flag of the first-stage-0/1/2 cohort (default inner joins).
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_012, on=['icustay_id'])
)

In [69]:
# Move the identifier columns into the index so run_glm only sees the
# regressors followed by the outcome column. The guard keeps the cell
# re-runnable: set_index raises KeyError once the ids are already the index.
if list(df_combined.index.names) != ['subject_id', 'hadm_id', 'icustay_id']:
    df_combined = df_combined.set_index(['subject_id','hadm_id','icustay_id'])
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                29669
Model:                            GLM   Df Residuals:                    29655
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -17331.
Date:                Tue, 16 Apr 2019   Deviance:                       34662.
Time:                        13:42:55   Pearson chi2:                 2.98e+04
No. Iterations:                     5   Covariance Type:             nonrobust
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.0431      0.064     -0.675      0.500      -0.168       0.082
daily_chl_load -4.275e-06   9.02e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.091,1.067,1.116
coagulation,1.112,1.072,1.153
liver,1.043,0.998,1.09
cns,1.124,1.092,1.157
renal,0.878,0.845,0.913
cardiovascular,1.117,1.087,1.148
age,1.012,1.011,1.014
F,0.828,0.786,0.873
ASIAN,1.123,0.938,1.346


## Adjust with age, race, gender, _SOFA subscore without renal_

In [70]:
# Chloride load + non-renal SOFA component scores + demographic adjusters,
# joined to the new-AKI flag of the first-stage-0/1/2 cohort (default inner
# joins), with the identifier columns moved into the index before the fit.
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa_sub_renal, on=['subject_id', 'hadm_id', 'icustay_id'])
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki_012, on=['icustay_id'])
    .set_index(['subject_id', 'hadm_id', 'icustay_id'])
)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                29669
Model:                            GLM   Df Residuals:                    29656
Model Family:                Binomial   Df Model:                           12
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -17352.
Date:                Tue, 16 Apr 2019   Deviance:                       34704.
Time:                        13:42:56   Pearson chi2:                 2.98e+04
No. Iterations:                     5   Covariance Type:             nonrobust
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.0332      0.064     -0.520      0.603      -0.158       0.092
daily_chl_load   3.22e-05   9.05e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.091,1.067,1.116
coagulation,1.109,1.07,1.149
liver,1.031,0.987,1.077
cns,1.124,1.092,1.157
cardiovascular,1.107,1.077,1.138
age,1.012,1.01,1.013
F,0.847,0.804,0.892
ASIAN,1.121,0.936,1.343
BLACK,1.17,1.043,1.312
