# Aim 3: Identify the characteristics of the more susceptible group

1. Divide patients into two groups: those who are hyperchloremic between 24 and 48 h and those who are not.
2. Look at the variables from the first 24 h.
3. Perform univariate analysis.

In [81]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from IPython.display import display
from scipy.stats import chi2_contingency, kruskal
from tableone import TableOne

# Show 3 digits of precision when pandas renders floating-point values.
pd.options.display.precision = 3

## Load useful DataFrames

In [102]:
# Target group: the first ICU visit of each adult patient
df = pd.read_csv('icu_first_18.csv')
df = df.drop(columns=df.columns[0])  # discard the first column of the file

# ICU stay ids of the target group; used below to filter the other tables
icu = list(df['icustay_id'])

# Max chloride, restricted to the target ICU stays; rows without a value are dropped
df = pd.read_csv('chloride.csv')
df = df[df['icustay_id'].isin(icu)]
df = df.set_index(['subject_id','hadm_id'])
df = df.filter(['icustay_id','icu_day','chloride_max'])
df = df.dropna(subset = ['chloride_max'])

# Chloride >= 110 in the second day: chl_110 is 1 when chloride_max >= 110, else 0.
# query() hands back a subset of df, so take an explicit copy before adding the
# new column; assigning into the bare subset triggers SettingWithCopyWarning.
df_day2max = df.query('icu_day == 2').copy()
df_day2max['chl_110'] = (df_day2max['chloride_max'] >= 110).astype(int)

# Demographics, indexed by (subject_id, hadm_id)
df = pd.read_csv('adm_demographics.csv').set_index(['subject_id', 'hadm_id'])
df_demo = df.filter(items=['insurance','ethnicity','age','gender'])

# Weight records of the target ICU stays
df = pd.read_csv('weight.csv')
df = df.loc[df['icustay_id'].isin(icu)]
df_wt = df.filter(items=['icustay_id','day','weight'])

# GCS (Glasgow coma scale) records of the target ICU stays
df = pd.read_csv('gcs_pan.csv')
df = df.loc[df['icustay_id'].isin(icu)]
df_gcs = df.filter(items=['icustay_id','day','mingcs'])

# First-day vitals, merged with weight and GCS on (icustay_id, day)
df = pd.read_csv('vitals_pan.csv')
df = df.loc[df['icustay_id'].isin(icu)]
df = df.set_index(['subject_id','hadm_id','icustay_id'])
# NOTE(review): merge() defaults to an inner join, so stays without a matching
# weight or GCS row for the same day are dropped here — confirm this is intended.
df_vitals = (
    df.filter(items=['day', 'heartrate_max','sysbp_min','diasbp_min','resprate_max','spo2_min','tempc_max'])
      .reset_index()
      .merge(df_wt, on=['icustay_id','day'])
      .merge(df_gcs, on=['icustay_id','day'])
      .query('day == 1')
)

# Medication from the prescription table (columns at positions 0 and 3 are discarded)
df_med = pd.read_csv('medication.csv')
df_med = df_med.drop(columns=df_med.columns[[0, 3]])

# Attach every medication row to the target ICU stay of the same admission
df = pd.read_csv('icu_first_18.csv')
df = df.drop(columns=df.columns[0])
df = df_med.merge(df, on=['subject_id','hadm_id'])

# A drug is flagged as taken when its startdate lies strictly between the
# stay's intime and outtime.
startdate = pd.to_datetime(df['startdate'])
df['taken'] = (pd.to_datetime(df['intime']) < startdate) & (startdate < pd.to_datetime(df['outtime']))
df = df.filter(items=['subject_id','hadm_id','icustay_id','drug','taken'])
df = df.set_index(['subject_id','hadm_id','icustay_id'])
df_med = df.loc[df['taken']]

# Disabled: pivot into one 0/1 indicator column per drug
#df_med = df_med.groupby(['subject_id','hadm_id','icustay_id','drug']).any()
#df_med = df_med['taken'].unstack()
#df_med = (df_med * 1).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Table 1. Demographics of Study Population

In [97]:
# Re-index the day-2 chloride flags by (subject_id, hadm_id) so they line up
# with df_demo, then keep only the admissions present in both tables.
df_day2max = df_day2max.reset_index().set_index(['subject_id','hadm_id'])
df_combined = df_demo.join(df_day2max, how='inner')

columns = ['age', 'gender', 'ethnicity']
categorical = ['gender', 'ethnicity']
groupby = 'chl_110'

# Demographics summarised per hyperchloremia group, with p-values
display(TableOne(
    df_combined,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    labels={'age': 'Age', 'gender': 'Gender', 'ethnicity': 'Ethnicity',
            'chl_110': 'Hyperchloremia (>=110)'},
    pval=True,
    isnull=False,
    label_suffix=True,
))

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110)
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,pval,ptest
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
n,,24452,7293,,
"Age, mean (SD)",,64.4 (16.7),66.0 (17.7),<0.001,Two Sample T-test
"Gender, n (%)",F,10415 (42.6),3646 (50.0),<0.001,Chi-squared
"Gender, n (%)",M,14037 (57.4),3647 (50.0),,
"Ethnicity, n (%)",ASIAN,496 (2.0),226 (3.1),<0.001,Chi-squared
"Ethnicity, n (%)",BLACK,2254 (9.2),733 (10.1),,
"Ethnicity, n (%)",HISPANIC,767 (3.1),240 (3.3),,
"Ethnicity, n (%)",OTHER,3314 (13.6),1020 (14.0),,
"Ethnicity, n (%)",WHITE,17621 (72.1),5074 (69.6),,


In [84]:
# Put both tables on the same (subject_id, hadm_id, icustay_id) index and keep
# only the stays present in both.
stay_keys = ['subject_id','hadm_id','icustay_id']
df_day2max = df_day2max.reset_index().set_index(stay_keys)
df_vitals = df_vitals.reset_index().set_index(stay_keys)
df_combined = df_vitals.join(df_day2max, how='inner')

columns = ['heartrate_max','sysbp_min','diasbp_min','resprate_max','spo2_min','tempc_max','weight','mingcs']
groupby = 'chl_110'

# First-day vitals, weight and GCS summarised per hyperchloremia group;
# none of the variables is treated as categorical.
display(TableOne(
    df_combined,
    columns,
    categorical=[],
    groupby=groupby,
    labels={'chl_110': 'Hyperchloremia (>=110)'},
    pval=True,
    isnull=False,
    label_suffix=True,
))

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110)
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,pval,ptest
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
n,,8146,2126,,
"heartrate_max, mean (SD)",,103.2 (19.7),107.2 (22.1),<0.001,Two Sample T-test
"sysbp_min, mean (SD)",,92.0 (17.4),88.6 (17.9),<0.001,Two Sample T-test
"diasbp_min, mean (SD)",,43.8 (11.7),41.7 (11.6),<0.001,Two Sample T-test
"resprate_max, mean (SD)",,27.4 (6.4),27.8 (6.5),0.007,Two Sample T-test
"spo2_min, mean (SD)",,91.5 (7.7),91.7 (8.6),0.288,Two Sample T-test
"tempc_max, mean (SD)",,37.5 (0.8),37.6 (0.8),0.044,Two Sample T-test
"weight, mean (SD)",,82.4 (24.4),75.3 (20.3),<0.001,Two Sample T-test
"mingcs, mean (SD)",,12.8 (3.4),11.7 (3.8),<0.001,Two Sample T-test


In [105]:
# Index the day-2 chloride flags by ICU stay and attach them to the medication rows
df_day2max = df_day2max.reset_index().set_index(['subject_id','hadm_id','icustay_id'])
df_combined = df_med.join(df_day2max, how='inner')

columns = ['drug']
categorical = ['drug']
groupby = 'chl_110'

# NOTE(review): df_med appears to hold one row per medication record, so n in
# this table may count records rather than patients — confirm before interpreting.
display(TableOne(
    df_combined,
    columns,
    categorical,
    groupby=groupby,
    labels={'chl_110': 'Hyperchloremia (>=110)'},
    pval=True,
    isnull=False,
    label_suffix=True,
))

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110),Grouped by Hyperchloremia (>=110)
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,pval,ptest
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
n,,8329,3596,,
"drug, n (%)",dopamine,1117 (13.4),415 (11.5),<0.001,Chi-squared
"drug, n (%)",epinephrine,860 (10.3),260 (7.2),,
"drug, n (%)",norepinephrine,4441 (53.3),2158 (60.0),,
"drug, n (%)",vasopressin,1911 (22.9),763 (21.2),,


## Define useful functions

In [86]:
def run_kruskal(df_neg, df_pos):
    """Display summary statistics and the result of a Kruskal-Wallis H test.

    The summary table has one column per group, labelled '(-)' and '(+)'.
    The test itself is run on flat 1-D arrays with missing values removed,
    so it uses the same observations that describe() reports.

    @param df_neg: Series or single-column DataFrame containing the negative samples
    @param df_pos: Series or single-column DataFrame containing the positive samples
    """
    summary = pd.concat([df_neg.describe(), df_pos.describe()], axis=1)
    summary.columns = ['(-)', '(+)']
    display(summary)

    def _as_sample(values):
        # kruskal expects one-dimensional samples; a single-column DataFrame
        # is (n, 1), so flatten it and drop missing values.
        flat = np.asarray(values).ravel()
        return flat[~pd.isnull(flat)]

    print (kruskal(_as_sample(df_neg), _as_sample(df_pos)))

# Univariate association

## 1. age

In [90]:
# Join demographics with the day-2 chloride flags on (subject_id, hadm_id)
df_day2max = df_day2max.reset_index().set_index(['subject_id','hadm_id'])
df_combined = df_demo.join(df_day2max, how='inner')
df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,insurance,ethnicity,age,gender,icustay_id,icu_day,chloride_max,chl_110
subject_id,hadm_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,145834,Medicare,WHITE,76.575,M,211552,2,112.0,1
4,185777,Private,WHITE,47.877,F,294638,2,106.0,0
6,107064,Medicare,WHITE,65.984,F,228232,2,102.0,0
9,150750,Medicaid,OTHER,41.816,M,220597,2,103.0,0
12,112213,Medicare,WHITE,72.419,M,232669,2,112.0,1
13,143045,Medicaid,WHITE,39.890,F,263738,2,110.0,1
17,194023,Private,WHITE,47.485,F,277042,2,108.0,0
21,109451,Medicare,WHITE,87.496,M,217847,2,103.0,0
21,111970,Medicare,WHITE,87.882,M,216859,2,105.0,0
26,197661,Medicare,OTHER,72.052,M,244882,2,104.0,0


In [91]:
# Age of the non-hyperchloremic (chl_110 == 0) vs hyperchloremic (chl_110 == 1) group
age_neg = df_combined.query('chl_110 == 0').filter(['age'])
age_pos = df_combined.query('chl_110 == 1').filter(['age'])
run_kruskal(age_neg, age_pos)

Unnamed: 0,(-),(+)
count,24452.0,7293.0
mean,64.383,66.036
std,16.744,17.652
min,18.008,18.033
25%,53.528,54.337
50%,65.97,68.447
75%,77.647,80.195
max,91.4,91.4


KruskalResult(statistic=74.90376458296488, pvalue=4.942267490848014e-18)


## 2. gender

In [None]:
def run_chi2(index, columns):
    """Display results of a chi-square test of independence.

    Prints a header, shows the contingency table of the two inputs, then
    prints the test statistic, the degrees of freedom and the p-value.
    chi2_contingency applies Yates' continuity correction only when the table
    has one degree of freedom (2x2), so the header mentions the correction
    only in that case.

    @param index: Values to group by in the rows
    @param columns: Values to group by in the columns
    """
    c_table = pd.crosstab(index, columns)
    n_rows, n_cols = c_table.shape
    if (n_rows - 1) * (n_cols - 1) == 1:
        print ("Chi-square test of independence with Yates' continuity correction")
    else:
        print ("Chi-square test of independence")
    display(c_table)
    chi2, p, dof, expected = chi2_contingency(c_table)  # correction=True by default
    print ("X-squared = %s" % chi2)
    print ("dof       = %s" % dof)
    print ("p-value   = %s" % p)

In [None]:
# Test whether gender and the hyperchloremia flag (chl_110) are independent.
run_chi2(df_combined['gender'], df_combined['chl_110'])

## 3. ethnicity

In [None]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2
# contingency table
def run_table(table, prob=0.95):
    """Run a chi-square test of independence on a contingency table and print
    two interpretations of the result.

    The first interpretation compares the test statistic with the critical
    value of the chi-square distribution; the second compares the p-value
    with the significance level. Both are always printed.

    @param table: Contingency table of observed counts
    @param prob: Probability used for the critical value; the significance
                 level is 1 - prob (default 0.95)
    """
    stat, p, dof, expected = chi2_contingency(table)
    print('dof=%d' % dof)
    print(expected)
    # interpret test-statistic
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    if abs(stat) >= critical:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')
    # interpret p-value; kept outside the branch above so it is reported for
    # every table, not only when the statistic fails to reject H0
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')