In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import copy

import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook', font_scale=1.2)

import matplotlib.style
matplotlib.style.use('ggplot')
%matplotlib inline

from IPython.display import display

In [2]:
# Settings used by the connection cell that follows.
schema_name = 'mimiciii'
dbname = 'mimic'
sqluser = 'mimic'

# No cursor exists yet; the connection cell tests this before closing anything.
cur = None

In [3]:
# Re-running this cell: release the previous cursor/connection before opening new ones.
if cur:
    cur.close()
    con.close()

# `host` is a directory path, i.e. the local PostgreSQL Unix-socket location.
con = psycopg2.connect(
    dbname=dbname,
    user=sqluser,
    host='/var/run/postgresql',
)
cur = con.cursor()
# Make unqualified table names resolve against the configured schema.
cur.execute('SET search_path to ' + schema_name)

In [4]:
# Read every row and column of `icu_features` into a DataFrame.
query = """
SELECT * FROM icu_features
"""
ef = pd.read_sql_query(sql=query, con=con)
# Preview the first rows.
ef.head()

Unnamed: 0,icustay_id,hadm_id,subject_id,age,gender,height,weight,ethnicity,insurance,filter_vaso,...,pc_bronch,pc_cath,pc_echo,pc_pressor,pc_rhc,pc_thora,pc_vent,passed_filters,use_record,bmi
0,200001,152234,55973,22290 days 19:06:12,F,167.851667,27.669135,ASIAN - ASIAN INDIAN,Medicare,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,9.820741
1,200003,163557,27513,17625 days 19:50:04,M,177.8,78.224998,WHITE,Private,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.744692
2,200006,189514,10950,19736 days 11:28:14,M,165.1,82.400002,OTHER,Medicaid,False,...,,,,,,,,,,30.229647
3,200007,129310,20707,15818 days 10:03:37,M,177.8,126.0,WHITE,Private,False,...,,,,,,,,,,39.857223
4,200009,129607,29904,17353 days 10:34:32,F,160.02,85.833331,WHITE,Private,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,33.520264


In [10]:
len(ef)

61532

In [11]:
# Boolean mask of rows whose `use_record` flag equals 1 (missing values compare False).
F = ef['use_record'].eq(1)

In [12]:
# Keep only the rows selected by the mask `F`.
# Take an explicit copy: later cells add columns to `ef` (e.g. `has_echo`,
# `mortality`), and assigning into a bare `.loc` slice of the query result
# triggers SettingWithCopy behaviour instead of reliably updating `ef`.
ef = ef.loc[F].copy()
len(ef)

9320

In [13]:
ef.ea_lv_systolic.value_counts()

 0.0    2297
-1.0     308
 3.0     220
 1.0     183
 2.0     147
-2.0      71
-3.0      30
Name: ea_lv_systolic, dtype: int64

In [14]:
# True where `ea_key` is present (presumably: the stay has an echo annotation -- confirm).
# `notnull()` works for any column dtype, whereas np.isnan raises on object-dtype columns.
ef['has_echo'] = ef['ea_key'].notnull()

In [16]:
# Replace missing `ea_lv_systolic` values with the sentinel -99 so they form
# their own group in the groupby tables.
# Assign the result back to the frame: an in-place fillna on `ef.ea_lv_systolic`
# operates on a temporary column object and is not guaranteed to update `ef`
# (it does not under pandas Copy-on-Write).
ef['ea_lv_systolic'] = ef['ea_lv_systolic'].fillna(value=-99)

In [19]:
# Number of rows per (has_echo, ea_lv_systolic) combination.
# `size()` is the built-in group count; no need to run Python `len` per group via apply.
ef.groupby(['has_echo', 'ea_lv_systolic']).size()

has_echo  ea_lv_systolic
False     -99.0             5634
True      -99.0              430
          -3.0                30
          -2.0                71
          -1.0               308
           0.0              2297
           1.0               183
           2.0               147
           3.0               220
dtype: int64

In [32]:
ef.groupby('has_echo').apply(lambda x: x.gender.value_counts())

gender,M,F
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2935,2699
True,1920,1766


In [33]:
ef.groupby('has_echo').apply(lambda x: x.gender.value_counts(normalize=True))

gender,M,F
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.520944,0.479056
True,0.52089,0.47911


In [24]:
ef.groupby(['has_echo', 'ea_lv_systolic']).apply(lambda x: x.gender.value_counts()).unstack(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,F,M
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,2699,2935
True,-99.0,175,255
True,-3.0,9,21
True,-2.0,30,41
True,-1.0,169,139
True,0.0,1152,1145
True,1.0,76,107
True,2.0,67,80
True,3.0,88,132


In [25]:
ef.groupby(['has_echo', 'ea_lv_systolic']).apply(lambda x: x.gender.value_counts(normalize=True)).unstack(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,F,M
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,0.479056,0.520944
True,-99.0,0.406977,0.593023
True,-3.0,0.3,0.7
True,-2.0,0.422535,0.577465
True,-1.0,0.548701,0.451299
True,0.0,0.501524,0.498476
True,1.0,0.415301,0.584699
True,2.0,0.455782,0.544218
True,3.0,0.4,0.6


In [38]:
def subgroup(g):
    """Display how column ``g`` of ``ef`` is distributed across echo groups.

    Four tables are displayed, in this order:

    1. counts of each ``g`` value per ``has_echo`` group;
    2. the same as proportions within each group;
    3. counts per (``has_echo``, ``ea_lv_systolic``) group, one column per ``g`` value;
    4. the same as proportions within each group.

    NOTE: a later cell defines another function with the same name; once that
    cell has run, this version is shadowed.

    Parameters
    ----------
    g : str
        Name of the column of the module-level frame ``ef`` to tabulate.
    """
    def _tally(keys, as_fraction):
        # value_counts of column `g` inside every group defined by `keys`.
        return ef.groupby(keys).apply(
            lambda x: x[g].value_counts(normalize=as_fraction))

    echo_only = 'has_echo'
    echo_and_lv = ['has_echo', 'ea_lv_systolic']

    display(_tally(echo_only, False))
    display(_tally(echo_only, True))
    # Level 2 of the result index holds the values of `g`; pivot it into columns.
    display(_tally(echo_and_lv, False).unstack(2))
    display(_tally(echo_and_lv, True).unstack(2))
    
    
    

In [35]:
subgroup('gender')

gender,M,F
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2935,2699
True,1920,1766


gender,M,F
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.520944,0.479056
True,0.52089,0.47911


Unnamed: 0_level_0,Unnamed: 1_level_0,F,M
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,2699,2935
True,-99.0,175,255
True,-3.0,9,21
True,-2.0,30,41
True,-1.0,169,139
True,0.0,1152,1145
True,1.0,76,107
True,2.0,67,80
True,3.0,88,132


Unnamed: 0_level_0,Unnamed: 1_level_0,F,M
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,0.479056,0.520944
True,-99.0,0.406977,0.593023
True,-3.0,0.3,0.7
True,-2.0,0.422535,0.577465
True,-1.0,0.548701,0.451299
True,0.0,0.501524,0.498476
True,1.0,0.415301,0.584699
True,2.0,0.455782,0.544218
True,3.0,0.4,0.6


In [37]:
subgroup('ex_congestive_heart_failure')

ex_congestive_heart_failure,0,1
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4369,1265
True,2369,1317


ex_congestive_heart_failure,0,1
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.77547,0.22453
True,0.642702,0.357298


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,4369,1265
True,-99.0,230,200
True,-3.0,17,13
True,-2.0,31,40
True,-1.0,229,79
True,0.0,1639,658
True,1.0,90,93
True,2.0,58,89
True,3.0,75,145


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,0.77547,0.22453
True,-99.0,0.534884,0.465116
True,-3.0,0.566667,0.433333
True,-2.0,0.43662,0.56338
True,-1.0,0.743506,0.256494
True,0.0,0.713539,0.286461
True,1.0,0.491803,0.508197
True,2.0,0.394558,0.605442
True,3.0,0.340909,0.659091


In [169]:
def subgroup(g):
    """Show how the ``mortality`` outcome is distributed across groups of ``ef``.

    Displays two tables and prints one overall test result:

    1. outcome counts per group (columns ``False`` / ``True``);
    2. per-group outcome proportions plus ``fold_change`` (the group's ``True``
       proportion divided by the ``True`` proportion over all rows). When there
       are more than two groups, each group is also tested against all other
       groups pooled (Fisher's exact test), which adds the columns ``p-value``,
       ``significant @ p=0.05`` and ``support`` (the group's row count);
    3. Fisher's exact test when there are exactly two groups, or a chi-squared
       contingency test when there are more than two.

    The per-group p-values are not adjusted for multiple comparisons.

    NOTE: this reuses the name of the descriptive ``subgroup`` helper defined
    in an earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    g : str or list of str
        Column name(s) of the module-level frame ``ef`` to group by.

    Returns
    -------
    None
        Results are displayed / printed only.
    """
    from scipy.stats import fisher_exact, chi2_contingency

    # Build the groups x outcome contingency table directly. Compared with
    # groupby.apply(value_counts) this always yields a DataFrame, keeps the
    # counts as integers (absent combinations become 0, not NaN), and the
    # reindex guarantees both outcome columns exist, in a fixed order, even
    # when one outcome never occurs.
    counts = (
        ef.groupby(g)['mortality']
        .value_counts()
        .unstack(-1, fill_value=0)
        .reindex(columns=[False, True], fill_value=0)
    )
    display(counts)

    # Column totals as a plain array: [n_False, n_True].
    outcome_totals = counts.values.sum(axis=0)
    overall_rate = float(outcome_totals[1]) / outcome_totals.sum()

    proportions = counts.div(counts.sum(axis=1), axis=0)
    proportions['fold_change'] = proportions[True] / overall_rate

    if len(proportions) > 2:
        # One-vs-rest Fisher test: this group's counts against everyone else's.
        pval_array = np.zeros(len(proportions))
        for i, this in enumerate(counts.values):
            rest = outcome_totals - this
            _, pval = fisher_exact([this, rest])
            pval_array[i] = pval
        proportions['p-value'] = pval_array
        proportions['significant @ p=0.05'] = (pval_array < 0.05)
        proportions['support'] = counts.sum(axis=1)

    display(proportions)

    n_groups = len(counts)
    if n_groups == 2:
        _, pval = fisher_exact(counts.values)
        print('Fisher\'s exact p-value:', pval)
    elif n_groups > 2:
        # Reduce over the raw ndarray so the check yields a single bool on
        # every pandas version.
        if (counts.values < 5).any():
            print('Warning! Chi2 not advised if any count < 5')
        _, pval, _, _ = chi2_contingency(counts.values)
        print('Chi2 p-value:', pval)
    
def subgroup_fluids(g):
    """Compare ``fb_day1_balance_ml`` between the two ``mortality`` outcomes.

    For the whole of ``ef`` (when ``g`` is empty) or for every group of
    ``ef.groupby(g)``, rows with ``fb_day1_balance_truncated == 1`` are left
    out, missing balances are dropped, and the ``mortality == False`` values
    are compared with the ``mortality == True`` values using a t-test with
    unequal variances. The resulting table is displayed; nothing is returned.

    Table columns: ``fluid_day1_diff`` (mean of the False group minus mean of
    the True group), ``p-value``, ``significant @ p=0.05`` and ``support``
    (number of values that entered the test).

    Parameters
    ----------
    g : str or list of str
        Column name(s) of ``ef`` to group by; an empty value means no grouping.
    """
    from scipy.stats import ttest_ind

    def _balances(frame, outcome_mask):
        # Balances for one outcome, skipping truncated rows and missing values.
        not_truncated = ~(frame.fb_day1_balance_truncated == 1)
        selected = frame.fb_day1_balance_ml.loc[outcome_mask & not_truncated]
        return selected.dropna()

    def fluid_analysis(x):
        survivors = _balances(x, ~x.mortality)
        deceased = _balances(x, x.mortality)
        _, pval = ttest_ind(survivors, deceased, equal_var=False)
        return pd.Series({'fluid_day1_diff': survivors.mean() - deceased.mean(),
                          'p-value': pval,
                          'significant @ p=0.05': (pval < 0.05),
                          'support': len(survivors) + len(deceased)})

    # With no grouping keys, analyse the whole frame as a single-row table.
    table = (pd.DataFrame([fluid_analysis(ef)]) if len(g) == 0
             else ef.groupby(g).apply(fluid_analysis))
    display(table)
    
    

What we want to understand:

+ _p(mortality | $lv_{mild}$, fluids) = p($lv_{mild}$, fluids | mortality)p(mortality)/p($lv_{mild}$, fluids)_
+ Is mortality conditionally independent of fluids and $lv_{mild}$ given some other variable?
  + find _c_ s.t. _p(mortality, $lv_{mild}$, fluids | c) = p(mortality | c)p($lv_{mild}$, fluids | c)_

In [170]:
# Elapsed time between `intime` and `dod` for every row.
tod = ef['dod'] - ef['intime']
# Outcome flag: True when that interval is shorter than 30 days. Rows where the
# interval is missing compare False (presumably patients with no recorded
# date of death -- confirm).
ef['mortality'] = (tod < np.timedelta64(30, 'D'))

In [171]:
subgroup_fluids([])

Unnamed: 0,fluid_day1_diff,p-value,significant @ p=0.05,support
0,-686.52083,1.1038850000000001e-28,True,8329


In [172]:
subgroup('has_echo')

mortality,False,True
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4103,1531
True,2597,1089


mortality,False,True,fold_change
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.728257,0.271743,0.966658
True,0.704558,0.295442,1.050962


Fisher's exact p-value: 0.0133556149149


In [173]:
subgroup_fluids('has_echo')

Unnamed: 0_level_0,fluid_day1_diff,p-value,significant @ p=0.05,support
has_echo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,-803.860637,8.311518000000001e-25,True,5022
True,-504.635392,5.009536e-07,True,3307


In [174]:
subgroup(['has_echo', 'ea_lv_systolic'])

Unnamed: 0_level_0,mortality,False,True
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,4103,1531
True,-99.0,295,135
True,-3.0,19,11
True,-2.0,38,33
True,-1.0,184,124
True,0.0,1689,608
True,1.0,137,46
True,2.0,98,49
True,3.0,137,83


Unnamed: 0_level_0,mortality,False,True,fold_change,p-value,significant @ p=0.05,support
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,-99.0,0.728257,0.271743,0.966658,0.013356,True,5634
True,-99.0,0.686047,0.313953,1.116812,0.124196,False,430
True,-3.0,0.633333,0.366667,1.304326,0.310949,False,30
True,-2.0,0.535211,0.464789,1.653371,0.001252,True,71
True,-1.0,0.597403,0.402597,1.43214,3e-06,True,308
True,0.0,0.735307,0.264693,0.94158,0.044952,True,2297
True,1.0,0.748634,0.251366,0.894173,0.406485,False,183
True,2.0,0.666667,0.333333,1.185751,0.165373,False,147
True,3.0,0.622727,0.377273,1.342054,0.001804,True,220


Chi2 p-value: 2.70890008027e-09


In [175]:
subgroup_fluids(['has_echo', 'ea_lv_systolic'])

Unnamed: 0_level_0,Unnamed: 1_level_0,fluid_day1_diff,p-value,significant @ p=0.05,support
has_echo,ea_lv_systolic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,-99.0,-803.860637,8.311518000000001e-25,True,5022
True,-99.0,-124.767457,0.628046,False,383
True,-3.0,-517.441453,0.6336798,False,25
True,-2.0,-371.720469,0.5338452,False,59
True,-1.0,-992.137347,0.002947838,True,268
True,0.0,-472.823157,0.0003236973,True,2088
True,1.0,268.645489,0.5314346,False,162
True,2.0,-751.05642,0.1987013,False,132
True,3.0,-1191.935608,0.003506012,True,190


In [176]:
# Replace missing `ea_rv_cavity` values with the sentinel -99 so they form
# their own group in the groupby tables.
# Assign the result back to the frame: an in-place fillna on `ef.ea_rv_cavity`
# operates on a temporary column object and is not guaranteed to update `ef`
# (it does not under pandas Copy-on-Write).
ef['ea_rv_cavity'] = ef['ea_rv_cavity'].fillna(value=-99)

In [177]:
subgroup(['has_echo', 'ea_rv_cavity'])

Unnamed: 0_level_0,Unnamed: 1_level_0,False,True
has_echo,ea_rv_cavity,Unnamed: 2_level_1,Unnamed: 3_level_1
False,-99.0,4103.0,1531.0
True,-99.0,199.0,106.0
True,-50.0,0.0,1.0
True,-3.0,115.0,48.0
True,-2.0,135.0,92.0
True,-1.0,5.0,4.0
True,0.0,1811.0,679.0
True,1.0,221.0,90.0
True,2.0,111.0,69.0


Unnamed: 0_level_0,Unnamed: 1_level_0,False,True,fold_change,p-value,significant @ p=0.05,support
has_echo,ea_rv_cavity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,-99.0,0.728257,0.271743,0.966658,0.013356,True,5634.0
True,-99.0,0.652459,0.347541,1.236291,0.009595,True,305.0
True,-50.0,0.0,1.0,3.557252,0.281116,False,1.0
True,-3.0,0.705521,0.294479,1.047534,0.725222,False,163.0
True,-2.0,0.594714,0.405286,1.441706,4.9e-05,True,227.0
True,-1.0,0.555556,0.444444,1.581001,0.279519,False,9.0
True,0.0,0.727309,0.272691,0.97003,0.285731,False,2490.0
True,1.0,0.710611,0.289389,1.02943,0.74851,False,311.0
True,2.0,0.616667,0.383333,1.363613,0.003247,True,180.0


Chi2 p-value: 2.47318228119e-06
