In [54]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from IPython.display import display
from tableone import TableOne
from scipy.stats import chi2_contingency, kruskal

pd.options.display.precision = 3

## Load useful DataFrames

In [55]:
# ICU stay ids read from 'icu_first_18.csv' (adult patients' first ICU stay),
# kept as a plain Python list for later membership filtering.
df = pd.read_csv('icu_first_18.csv')
icu = df['icustay_id'].tolist()

In [56]:
# Admission demographics, indexed by (subject_id, hadm_id)
df = pd.read_csv('adm_demographics.csv')
df = df.set_index(['subject_id', 'hadm_id'])
df_demographics = df.filter(['age', 'gender', 'ethnicity'])

# Admission demographics for adjustments: age plus 0/1 indicator columns for
# gender and ethnicity.
# dtype=int is explicit because the default indicator dtype of get_dummies
# differs between pandas versions (bool in pandas >= 2.0); integer 0/1 columns
# keep the combined frame purely numeric for the regression further down.
df_demo_adj = pd.concat([df['age'],
                         pd.get_dummies(df['gender'], dtype=int),
                         pd.get_dummies(df['ethnicity'], dtype=int)], axis=1)
# Drop one level per factor ('M' and 'OTHER'); the remaining indicators are
# then expressed relative to those omitted levels.
df_demo_adj = df_demo_adj.drop(['M', 'OTHER'], axis=1)

In [57]:
# Average daily chloride load over ICU days <= 3, one row per ICU stay.
chloride_keys = ['subject_id', 'hadm_id', 'icustay_id']
df = (
    pd.read_csv('chloride.csv')
    .set_index(chloride_keys)
    .filter(['icu_day', 'chloride_input_meq'])
    .dropna(subset=['chloride_input_meq'])   # ignore days without a recorded input
    .query('icu_day <= 3')
    .groupby(chloride_keys)
    .agg({'icu_day': 'max', 'chloride_input_meq': 'sum'})
)
# Total input divided by the last ICU day that has a record (not by the number
# of recorded days).
# NOTE(review): assumes icu_day starts at 1 so the divisor is never 0 - confirm.
df['daily_chl_load'] = df['chloride_input_meq'] / df['icu_day']
df_chl_load = df[['daily_chl_load']]

In [74]:
# SOFA component scores on day 1, with missing components counted as 0.
sofa_columns = ['subject_id', 'hadm_id', 'icustay_id',
                'respiration', 'coagulation', 'liver',
                'cns', 'renal', 'cardiovascular']
df = pd.read_csv('sofa_pan.csv')
# The first column of the file is discarded by position.
# NOTE(review): presumably a saved row index - confirm against the CSV.
df = df.drop(columns=df.columns[[0]]).query('day == 1')
df_sofa = df.filter(sofa_columns).fillna(0)
df_sofa

Unnamed: 0,subject_id,hadm_id,icustay_id,respiration,coagulation,liver,cns,renal,cardiovascular
0,3,145834,211552,0.0,0.0,0.0,0.0,3.0,4.0
6,4,185777,294638,0.0,0.0,1.0,0.0,0.0,1.0
8,6,107064,228232,0.0,0.0,0.0,0.0,4.0,0.0
12,9,150750,220597,3.0,0.0,0.0,0.0,1.0,1.0
18,11,194540,229441,0.0,0.0,0.0,1.0,0.0,1.0
20,12,112213,232669,0.0,2.0,0.0,0.0,1.0,0.0
28,13,143045,263738,0.0,1.0,0.0,0.0,0.0,0.0
32,17,161087,257980,2.0,0.0,0.0,0.0,0.0,0.0
33,17,194023,277042,1.0,1.0,0.0,4.0,0.0,1.0
36,18,188822,298129,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# New AKI outcome flag, restricted to the ICU stays listed in `icu`.
df = pd.read_csv('new_aki.csv')
# 1 where 'difference' is negative, 0 otherwise (a missing value compares
# False and therefore also yields 0).
df['new'] = (df['difference'] < 0).astype('int64')
in_selected_stays = df['icustay_id'].isin(icu)
df_aki = df.loc[in_selected_stays, ['icustay_id', 'new']]

## Define useful functions

In [60]:
def run_kruskal(df_neg, df_pos):
    """Display summary statistics and a Kruskal-Wallis H test for two groups.

    Shows a side-by-side ``describe()`` table with columns '(-)' and '(+)',
    then prints the result of ``scipy.stats.kruskal``.

    Each argument may be a Series or a single-column DataFrame. A
    single-column DataFrame is reduced to its only column first, so the test
    always receives one-dimensional samples and reports scalar values.

    @param df_neg: Series or single-column DataFrame with the negative samples
    @param df_pos: Series or single-column DataFrame with the positive samples
    """
    def as_series(samples):
        # A one-column DataFrame is two-dimensional; hand back its column.
        if isinstance(samples, pd.DataFrame) and samples.shape[1] == 1:
            return samples.iloc[:, 0]
        return samples

    neg, pos = as_series(df_neg), as_series(df_pos)
    summary = pd.concat([neg.describe(), pos.describe()], axis=1)
    summary.columns = ['(-)', '(+)']
    display(summary)
    print (kruskal(neg, pos))

def run_chi2(index, columns):
    """Display results of a chi-square test of independence.

    Cross-tabulates the two inputs, displays the contingency table and prints
    the test statistic, degrees of freedom and p-value.

    The header only mentions Yates' continuity correction for a 2x2 table:
    ``chi2_contingency`` is called with its default ``correction=True``, which
    applies the correction only when there is one degree of freedom.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    c_table = pd.crosstab(index, columns)

    header = "Chi-square test of independence"
    if c_table.shape == (2, 2):  # dof == 1, the only case where Yates applies
        header += " with Yates' continuity correction"
    print (header)
    display(c_table)

    chi2, p, dof, expected = chi2_contingency(c_table)  # correction=True (default)
    print ("X-squared = %s" % chi2)
    print ("dof       = %s" % dof)
    print ("p-value   = %s" % p)

def run_glm(df):
    """Display results (odds ratios) of a binomial GLM (logistic) regression.

    Prints the fitted model summary, then displays a table of exponentiated
    coefficients with their 95% confidence bounds for every predictor except
    the intercept.

    @param df: DataFrame with columns [exog1, exog2, ..., endog]; the last
               column is the outcome, all preceding columns are predictors
    """
    endog = df[df.columns[-1]]
    # Cast predictors to float so indicator columns stored as bool (or other
    # non-float numeric dtypes) form a purely numeric design matrix.
    exog = sm.add_constant(df[df.columns[:-1]].astype(float))
    glm = sm.GLM(endog, exog, family=sm.families.Binomial())  # == sm.Logit(endog, exog)
    result = glm.fit()
    print (result.summary())

    odds = pd.concat([result.params, result.conf_int()], axis=1)
    # Remove the intercept by label, not by position: add_constant may leave
    # the data unchanged when it already contains a constant column, in which
    # case the first row would be a real predictor.
    odds = odds.drop('const', errors='ignore')
    odds.columns = ['Odds Ratio', '2.5%', '97.5%']
    display(np.exp(odds))
    

# Daily Chloride Load and New AKI

In [79]:
# Assemble the modelling table: chloride load + SOFA components (per ICU stay),
# adjusted demographics (per admission), and finally the AKI outcome flag.
# All merges use the default join, so only stays present in every table remain.
stay_keys = ['subject_id', 'hadm_id', 'icustay_id']
df_combined = (
    df_chl_load.reset_index()
    .merge(df_sofa, on=stay_keys)
    .merge(df_demo_adj.reset_index(), on=['subject_id', 'hadm_id'])
    .merge(df_aki, on=['icustay_id'])
)
df_combined

Unnamed: 0,subject_id,hadm_id,icustay_id,daily_chl_load,respiration,coagulation,liver,cns,renal,cardiovascular,age,F,ASIAN,BLACK,HISPANIC,WHITE,new
0,3,145834,211552,742.665,0.0,0.0,0.0,0.0,3.0,4.0,76.575,0,0,0,0,1,0
1,4,185777,294638,150.150,0.0,0.0,1.0,0.0,0.0,1.0,47.877,1,0,0,0,1,0
2,6,107064,228232,274.120,0.0,0.0,0.0,0.0,4.0,0.0,65.984,1,0,0,0,1,0
3,9,150750,220597,41.580,3.0,0.0,0.0,0.0,1.0,1.0,41.816,0,0,0,0,0,1
4,11,194540,229441,42.675,0.0,0.0,0.0,1.0,0.0,1.0,50.181,1,0,0,0,1,0
5,12,112213,232669,139.921,0.0,2.0,0.0,0.0,1.0,0.0,72.419,0,0,0,0,1,1
6,13,143045,263738,61.643,0.0,1.0,0.0,0.0,0.0,0.0,39.890,1,0,0,0,1,0
7,17,161087,257980,218.000,2.0,0.0,0.0,0.0,0.0,0.0,47.849,1,0,0,0,1,0
8,17,194023,277042,151.500,1.0,1.0,0.0,4.0,0.0,1.0,47.485,1,0,0,0,1,0
9,18,188822,298129,80.850,0.0,0.0,0.0,0.0,0.0,0.0,50.874,0,0,0,0,1,0


In [80]:
# Compare the daily chloride load between stays without (new == 0) and
# with (new == 1) the outcome flag.
chl_load_neg = df_combined.loc[df_combined['new'] == 0, ['daily_chl_load']]
chl_load_pos = df_combined.loc[df_combined['new'] == 1, ['daily_chl_load']]
run_kruskal(chl_load_neg, chl_load_pos)

Unnamed: 0,(-),(+)
count,25530.0,18560.0
mean,205.792,218.432
std,184.597,168.704
min,0.036,0.257
25%,78.502,103.95
50%,163.625,189.384
75%,278.0,294.616
max,3150.2,6742.801


KruskalResult(statistic=258.9141388335734, pvalue=2.9594418703360235e-58)


In [81]:
# Move the identifier columns into the index so that only predictors and the
# outcome (last column) remain as columns for the regression.
id_columns = ['subject_id', 'hadm_id', 'icustay_id']
df_combined = df_combined.set_index(id_columns)
run_glm(df_combined)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    new   No. Observations:                44090
Model:                            GLM   Df Residuals:                    44076
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -27725.
Date:                Mon, 08 Apr 2019   Deviance:                       55450.
Time:                        10:55:37   Pearson chi2:                 4.31e+04
No. Iterations:                     4                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.6303      0.051    -12.343      0.000      -0.730      -0.530
daily_chl_load -4.847e-05   6.01e-05    

Unnamed: 0,Odds Ratio,2.5%,97.5%
daily_chl_load,1.0,1.0,1.0
respiration,1.25,1.229,1.271
coagulation,1.139,1.109,1.17
liver,1.012,0.981,1.045
cns,0.971,0.951,0.991
renal,0.551,0.538,0.563
cardiovascular,1.153,1.13,1.177
age,1.008,1.007,1.009
F,0.895,0.859,0.932
ASIAN,0.917,0.796,1.055
