In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd

import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook', font_scale=1.2)

import matplotlib.style
matplotlib.style.use('ggplot')
%matplotlib inline

from IPython.display import display

In [2]:
import pymc3 as pm
import theano
import theano.tensor as T
from scipy.stats.stats import pearsonr

import sklearn.model_selection

In [3]:
# create a database connection
sqluser = 'mimic'
dbname = 'mimic'
schema_name = 'mimiciii'

cur = None

In [4]:
if cur: 
    cur.close()
    con.close()

con = psycopg2.connect(dbname = dbname, user = sqluser)
cur = con.cursor()
cur.execute('SET search_path to ' + schema_name)

To recap, variables of interest include: 

+ Echo parameters (LV systolic, RV size, pulmonary hypertension)
+ Fluids (day 1: missing term = 1 only if missing on day 1, day1-2: missing term = 1 if missing on day 1 or 2, day1-3: missing term = 1 if missing on day 1, day 2 or day 3)
+ Interactions: fluids x echo variables, fluids x CHF, echo x CHF
+ Covariates: Age, gender, BMI, BMI missing, mech vent, CHF, MDRD, Cr missing, apache score, lactate, lactate missing, ICU type, pressors indicator variable

Because of collinearity: 

+ Use only 1 fluid term per model
+ Use only 1 echo type variable per model
+ Use forward selection for interaction terms

In [5]:
query = """
SELECT * FROM icu_features
"""
ef = pd.read_sql_query(query, con)
ef.head()

Unnamed: 0,icustay_id,hadm_id,subject_id,age,gender,height,weight,ethnicity,insurance,filter_vaso,...,pc_bronch,pc_cath,pc_echo,pc_pressor,pc_rhc,pc_thora,pc_vent,passed_filters,use_record,bmi
0,200001,152234,55973,22290 days 19:06:12,F,167.851667,27.669135,ASIAN - ASIAN INDIAN,Medicare,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,9.820741
1,200003,163557,27513,17625 days 19:50:04,M,177.8,78.224998,WHITE,Private,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.744692
2,200006,189514,10950,19736 days 11:28:14,M,165.1,82.400002,OTHER,Medicaid,False,...,,,,,,,,,,30.229647
3,200007,129310,20707,15818 days 10:03:37,M,177.8,126.0,WHITE,Private,False,...,,,,,,,,,,39.857223
4,200009,129607,29904,17353 days 10:34:32,F,160.02,85.833331,WHITE,Private,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,33.520264


In [6]:
len(ef)

61532

In [7]:
F = (ef['use_record'] == 1)

In [8]:
ef = ef.loc[F]
len(ef)

9320

# Preprocessing

## Add MDRD to table

In [10]:
def mdrd(data):
    creat = data.lab_creatinine
    age = data.age/np.timedelta64('1', 'Y')
    age.loc[age<0] = age.max()
    gender = data.gender.apply(lambda x: 0.742 if x == 'F' else 1)
    ethnicity = data.ethnicity.apply(lambda x: 1.212 if 'AFRICAN AMERICAN' in x else 1)
    return 175*creat**(-1.154) * age**(-0.203) * gender * ethnicity

In [11]:
value = mdrd(ef)
ef['mdrd'] = value
discrete = pd.cut(value, [0, 15, 30, 44, 60, 90, 1000000], labels=[5, 4, 3, 2, 1, 0], include_lowest=True)
ef['mdrd_discrete'] = discrete
discrete.value_counts()

0    2524
1    2068
2    1439
4    1280
3    1258
5     503
dtype: int64

In [12]:
discrete = pd.cut(value, [0, 30, 60, 1000000], labels=[2, 1, 0], include_lowest=True)
ef['mdrd_3'] = discrete
discrete.value_counts()

0    4592
1    2697
2    1783
dtype: int64

## Build dataframe

In [14]:
def normalize(x):
    x = x.copy()
    I = ~np.isnan(x)
    x[I] = (x[I] - x[I].mean())/x[I].std()
    return x

In [15]:
X = pd.DataFrame()

# build the features table
age = ef.age/np.timedelta64('1', 'Y') # time in years
age_masked = age < 0 # if age is less than 0, then the person is above 89 (90?)
age = normalize(age)
age[age_masked] = 0
X['age'] = age
X['age_over_90'] = age_masked.astype(float)

gender = ef.gender
gender = (gender == 'F').astype(float)
X['gender'] = gender

bmi = ef.bmi
bmi_missing = np.isnan(bmi)
bmi = normalize(bmi)
bmi[bmi_missing] = 0
X['bmi'] = bmi
X['bmi_missing'] = bmi_missing.astype(float)

mech_vent = ef.vf_first_day_vent.astype(float)
X['mech_vent'] = mech_vent

chf = ef.ex_congestive_heart_failure.astype(float)
X['CHF'] = chf

mdrd = ef.mdrd
mdrd_missing = np.isnan(mdrd)
mdrd = normalize(mdrd)
mdrd[mdrd_missing] = 0
X['mdrd'] = mdrd

# MDRD missing is already largely covered by BMI missing
# the only other missing values come from missing creatinine lab
# so add that
creatinine_missing = np.isnan(ef.lab_creatinine)
X['creatinine_missing'] = creatinine_missing.astype(float)

apsiii = ef.apsiii
#apsiii_missing = np.isnan(apsiii)
apsiii = normalize(apsiii)
#apsiii[apsiii_missing] = 0
X['apsiii'] = apsiii
#X['apsiii_missing'] = apsiii_missing.astype(float)

no_echo = np.isnan(ef.ea_key)
#X['no_echo'] = no_echo.astype(float)

lvsys = ef.ea_lv_systolic
lv_missing = ((np.isnan(lvsys) | (lvsys < -1)) & ~no_echo).astype(float)
lv_normal = (lvsys == 0).astype(float)
lv_hyperdynamic = (lvsys == -1).astype(float)
lv_mild_depressed = (lvsys == 1).astype(float)
lv_mod_depressed = (lvsys == 2).astype(float)
lv_sev_depressed = (lvsys == 3).astype(float)
X['lv_missing'] = lv_missing
X['lv_normal'] = lv_normal
X['lv_hyper'] = lv_hyperdynamic
X['lv_mild'] = lv_mild_depressed
X['lv_moderate'] = lv_mod_depressed
X['lv_severe'] = lv_sev_depressed

tvpulm = ef.ea_tv_pulm_htn
tv_missing = ((np.isnan(tvpulm) | (tvpulm < -2)) & ~no_echo).astype(float)
tv_normal = (tvpulm == 0).astype(float)
tv_abnormal = ((tvpulm == -2) | (tvpulm > 0)).astype(float)
X['plm_htn_missing'] = tv_missing
X['plm_htn_normal'] = tv_normal
X['plm_htn_abnormal'] = tv_abnormal

rvcavity = ef.ea_rv_cavity
rvcav_missing = ((np.isnan(rvcavity) | (rvcavity < -2)) & ~no_echo).astype(float)
rvcav_normal = (rvcavity == 0).astype(float)
rvcav_small = (rvcavity == -1).astype(float)
rvcav_dilated = ((rvcavity == -2) | (rvcavity > 0)).astype(float)
X['rvcav_missing'] = rvcav_missing
X['rvcav_normal'] = rvcav_normal
X['rvcav_small'] = rvcav_small
X['rvcav_dilated'] = rvcav_dilated

lactate = ef.lab_lactate
lactate_missing = np.isnan(lactate)
lactate = normalize(lactate)
lactate[lactate_missing] = 0
X['lactate'] = lactate
X['lactate_missing'] = lactate_missing.astype(float)

## which ICU was treating the patient
sicu = ef.st_sicu.astype(float)
sicu[np.isnan(sicu)] = 0
nsicu = ef.st_nsicu.astype(float)
nsicu[np.isnan(nsicu)] = 0
micu = ef.st_micu.astype(float)
micu[np.isnan(micu)] = 0
X['sicu'] = sicu
X['nsicu'] = nsicu
X['micu'] = micu

## on vasopressors
on_vaso = ef.filter_vaso.astype(float)
X['on_vaso'] = on_vaso

## fluids
fluid_day1 = ef.fb_day1_balance_ml.values
f1_missing = np.isnan(fluid_day1)
fluid_day1[~f1_missing] = normalize(fluid_day1[~f1_missing])
fluid_day1[f1_missing] = 0
X['fluid_day1'] = fluid_day1
X['fluid_day1_missing'] = f1_missing.astype(float)

fluid_day2 = ef.fb_day2_balance_ml.values
f2_missing = np.isnan(fluid_day1) | np.isnan(fluid_day2)
fluid_day2[~f2_missing] = normalize(fluid_day2[~f2_missing])
fluid_day2[f2_missing] = 0
X['fluid_day2'] = fluid_day2
X['fluid_day2_missing'] = f2_missing.astype(float)

fluid_day3 = ef.fb_day3_balance_ml.values
f3_missing = np.isnan(fluid_day1) | np.isnan(fluid_day2) | np.isnan(fluid_day3)
fluid_day3[~f3_missing] = normalize(fluid_day3[~f3_missing])
fluid_day3[f3_missing] = 0
X['fluid_day3'] = fluid_day3
X['fluid_day3_missing'] = f3_missing.astype(float)

y = (ef.dod - ef.intime) < np.timedelta64(30, 'D')

X.shape, y.shape

((9320, 35), (9320,))

In [16]:
X.head()

Unnamed: 0,age,age_over_90,gender,bmi,bmi_missing,mech_vent,CHF,mdrd,creatinine_missing,apsiii,...,sicu,nsicu,micu,on_vaso,fluid_day1,fluid_day1_missing,fluid_day2,fluid_day2_missing,fluid_day3,fluid_day3_missing
1,0.100055,0.0,0.0,-0.002727,0.0,1.0,0.0,0.436365,0.0,-0.204515,...,0.0,1.0,0.0,0.0,0.175646,0.0,-0.202604,0.0,0.0,1.0
16,0.450966,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.364843,...,0.0,0.0,1.0,0.0,0.188791,0.0,0.0,1.0,0.0,1.0
20,0.164522,0.0,0.0,0.044261,0.0,0.0,0.0,0.404471,0.0,0.145859,...,0.0,0.0,1.0,0.0,-0.727927,0.0,-0.921635,0.0,-1.18932,0.0
22,0.305922,0.0,0.0,-0.088287,0.0,0.0,0.0,0.729504,0.0,-1.036653,...,1.0,0.0,0.0,0.0,-0.331343,0.0,-0.240897,0.0,-0.055252,0.0
25,0.39089,0.0,0.0,0.0,1.0,0.0,0.0,-0.400352,0.0,2.204305,...,1.0,0.0,1.0,0.0,-0.327334,0.0,-0.135571,0.0,-0.077264,0.0


In [17]:
echo_features = {
    'lv': ['lv_missing', 'lv_normal', 'lv_hyper', 'lv_mild', 'lv_moderate', 'lv_severe'], 
    'plm_htn': ['plm_htn_missing', 'plm_htn_normal', 'plm_htn_abnormal'], 
    'rvcav': ['rvcav_missing', 'rvcav_normal', 'rvcav_small', 'rvcav_dilated'], 
}
fluid_features = {
    'fluid_day1': ['fluid_day1', 'fluid_day1_missing'], 
    'fluid_day2': ['fluid_day2', 'fluid_day2_missing'], 
    'fluid_day3': ['fluid_day3', 'fluid_day3_missing'],
}
# Age, gender, BMI, BMI missing, mech vent, CHF, MDRD, Cr missing, apache score, lactate, 
#lactate missing, ICU type, pressors indicator variable
covariates = ['age', 'age_over_90', # age
    "gender", # gender
    "bmi", "bmi_missing", # bmi
    "mech_vent", # mech_vent
    "CHF", # CHF
    "mdrd", "creatinine_missing", # MDRD
    "apsiii", # apache score
    "lactate", "lactate_missing", # lactate
    "sicu", "nsicu", "micu", # icu type
    "on_vaso", # pressors indicator variable
]

In [92]:
def make_data(fluid_fs, echo_fs): 
    
    Xf = X[fluid_features[fluid_fs]]
    Xe = X[echo_features[echo_fs]]
    Xc = X[covariates]

    return pd.concat([Xf, Xe, Xc], axis = 1)

def make_interactions(fluid_fs, echo_fs): 
    
    Xf = X[fluid_features[fluid_fs]]
    Xe = X[echo_features[echo_fs]]
    Xc = X[covariates]

    # Interactions: fluids x echo variables, fluids x CHF, echo x CHF
    # Fluids x echo variables
    Xfe = {}
    for cf in Xf.columns: 
        for ce in Xe.columns: 
            Xfe["{}*{}".format(cf, ce)] = Xf[cf]*Xe[ce]
    Xfe = pd.DataFrame(Xfe)

    # Fluids x CHF
    Xfc = {}
    for cf in Xf.columns:
        Xfc["{}*CHF".format(cf)] = Xf[cf] * Xc['CHF']
    Xfc = pd.DataFrame(Xfc)

    # Echo x CHF
    Xec = {}
    for ce in Xe.columns: 
        Xec["{}*CHF".format(ce)] = Xe[ce] * Xc['CHF']
    Xec = pd.DataFrame(Xec)

    return pd.concat([Xfe, Xfc, Xec], axis = 1)

ytr, yte = sklearn.model_selection.train_test_split(y, test_size = 0.25, stratify = y)

# Covariates only

In [93]:
import statsmodels.api as sm

In [94]:
Xc = X[covariates]

logit = sm.Logit(ytr, Xc.loc[ytr.index])
res = logit.fit()

print(res.summary2())

ypr = res.predict(Xc.loc[yte.index])
print('test auroc:', sklearn.metrics.roc_auc_score(yte, ypr))

Optimization terminated successfully.
         Current function value: 0.505710
         Iterations 6
                          Results: Logit
Model:               Logit            Pseudo R-squared: 0.149      
Dependent Variable:  y                AIC:              7101.8205  
Date:                2017-05-18 17:21 BIC:              7211.4562  
No. Observations:    6990             Log-Likelihood:   -3534.9    
Df Model:            15               LL-Null:          -4152.1    
Df Residuals:        6974             LLR p-value:      6.8050e-254
Converged:           1.0000           Scale:            1.0000     
No. Iterations:      6.0000                                        
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
age                 1.0220   0.1774   5.7621 0.0000  0.6744  1.3697
age_over_90         0.7216   0.1222   5.9

# Fluid + echo, linear

In [109]:
linear = {}

for fluid_fs in ['fluid_day1', 'fluid_day2', 'fluid_day3']: 
    for echo_fs in echo_features.keys(): 

        X_ = make_data(fluid_fs, echo_fs)

        logit = sm.Logit(ytr, X_.loc[ytr.index])
        res = logit.fit()

        print(res.summary2(title = '{}:{}'.format(fluid_fs, echo_fs)))

        ypr = res.predict(X_.loc[yte.index])
        auroc = sklearn.metrics.roc_auc_score(yte, ypr)
        print('test auroc:', auroc)
        
        linear[(fluid_fs, echo_fs)] = (auroc, res.aic, res.bic)
        
linear = pd.DataFrame(linear).transpose()
linear.columns = ['auroc', 'aic', 'bic']
linear        

Optimization terminated successfully.
         Current function value: 0.502318
         Iterations 6
                         fluid_day1:rvcav
Model:               Logit            Pseudo R-squared: 0.154      
Dependent Variable:  y                AIC:              7066.4073  
Date:                2017-05-18 17:27 BIC:              7217.1565  
No. Observations:    6990             Log-Likelihood:   -3511.2    
Df Model:            21               LL-Null:          -4152.1    
Df Residuals:        6968             LLR p-value:      1.9214e-258
Converged:           1.0000           Scale:            1.0000     
No. Iterations:      6.0000                                        
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
fluid_day1         -0.0891   0.0339  -2.6304 0.0085 -0.1556 -0.0227
fluid_day1_missing  0.4647   0.1196   3.

Unnamed: 0,Unnamed: 1,auroc,aic,bic
fluid_day1,lv,0.734457,7064.505305,7228.958965
fluid_day1,plm_htn,0.737266,7041.794594,7185.691547
fluid_day1,rvcav,0.734741,7066.407332,7217.156521
fluid_day2,lv,0.734287,7083.430052,7247.883712
fluid_day2,plm_htn,0.73704,7061.155964,7205.052917
fluid_day2,rvcav,0.734389,7085.483432,7236.23262
fluid_day3,lv,0.739802,7067.572983,7232.026643
fluid_day3,plm_htn,0.74238,7045.851476,7189.748429
fluid_day3,rvcav,0.739762,7069.855431,7220.60462


# Fluid + echo, interactions

In [113]:
ixn = {}

for fluid_fs in ['fluid_day1', 'fluid_day2', 'fluid_day3']: 
    for echo_fs in echo_features.keys(): 

        Xd = make_data(fluid_fs, echo_fs)
        Xi = make_interactions(fluid_fs, echo_fs)
        X_ = pd.concat([Xd, Xi], axis = 1)

        try:
        
            logit = sm.Logit(ytr, X_.loc[ytr.index])
            res = logit.fit(maxiter = 100)

            print(res.summary2(title = '{}:{}'.format(fluid_fs, echo_fs)))
            
            ypr = res.predict(X_.loc[yte.index])
            auroc = sklearn.metrics.roc_auc_score(yte, ypr)
            print('test auroc:', auroc)

            ixn[(fluid_fs, echo_fs)] = (auroc, res.aic, res.bic)
            
        except np.linalg.LinAlgError:
            print('{}:{} model failed to fit'.format(fluid_fs, echo_fs))
            continue
        
        ixn[(fluid_fs, echo_fs)] = 
        
ixn = pd.DataFrame(ixn).transpose()
ixn.columns = ['auroc', 'aic', 'bic']
ixn

         Current function value: inf
         Iterations: 100
fluid_day1:rvcav model failed to fit
Optimization terminated successfully.
         Current function value: 0.497688
         Iterations 6
                                 fluid_day1:plm_htn
Model:                    Logit                  Pseudo R-squared:       0.162      
Dependent Variable:       y                      AIC:                    7021.6843  
Date:                     2017-05-18 17:30       BIC:                    7240.9558  
No. Observations:         6990                   Log-Likelihood:         -3478.8    
Df Model:                 31                     LL-Null:                -4152.1    
Df Residuals:             6958                   LLR p-value:            1.2758e-263
Converged:                1.0000                 Scale:                  1.0000     
No. Iterations:           6.0000                                                    
-------------------------------------------------------------------

Unnamed: 0,Unnamed: 1,auroc,aic,bic
fluid_day1,lv,0.729703,7053.542627,7355.041003
fluid_day1,plm_htn,0.737627,7021.684291,7240.955838
fluid_day2,lv,0.730709,7077.358336,7378.856713
fluid_day2,plm_htn,0.734473,7052.326037,7271.597584
fluid_day3,lv,0.737872,7069.774561,7371.272938
fluid_day3,plm_htn,0.740979,7031.318835,7250.590381


Test adding interaction term of interest

In [90]:
for fluid_fs in ['fluid_day1', 'fluid_day2', 'fluid_day3']: 
    for echo_fs in echo_features.keys(): 

        X_ = make_data_(fluid_fs, echo_fs)

        logit = sm.Logit(ytr, X_.loc[ytr.index])
        res = logit.fit()

        print(res.summary2(title = '{}:{}'.format(fluid_fs, echo_fs)))

        ypr = res.predict(X_.loc[yte.index])
        print('test auroc:', sklearn.metrics.roc_auc_score(yte, ypr))
        
        break
    break

Optimization terminated successfully.
         Current function value: 0.504642
         Iterations 7
                         fluid_day1:rvcav
Model:               Logit            Pseudo R-squared: 0.150      
Dependent Variable:  y                AIC:              7098.8953  
Date:                2017-05-18 17:19 BIC:              7249.6445  
No. Observations:    6990             Log-Likelihood:   -3527.4    
Df Model:            21               LL-Null:          -4152.1    
Df Residuals:        6968             LLR p-value:      1.7082e-251
Converged:           1.0000           Scale:            1.0000     
No. Iterations:      7.0000                                        
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
fluid_day1         -0.0882   0.0336  -2.6247 0.0087 -0.1540 -0.0223
fluid_day1_missing  0.2628   0.1254   2.

# Add interaction terms, one at a time

In [116]:
iixn = {}

for fluid_fs in ['fluid_day1', 'fluid_day2', 'fluid_day3']: 
    for echo_fs in ['lv', 'plm_htn']: 

        Xd = make_data(fluid_fs, echo_fs)
        Xi = make_interactions(fluid_fs, echo_fs)
        
        for it in Xi.columns:
            X_ = Xd.copy()
            X_[it] = Xi[it]
            
            try:

                logit = sm.Logit(ytr, X_.loc[ytr.index])
                res = logit.fit(maxiter = 100)

                print(res.summary2(title = '{}:{}:{}'.format(fluid_fs, echo_fs, it)))

                ypr = res.predict(X_.loc[yte.index])
                auroc = sklearn.metrics.roc_auc_score(yte, ypr)
                print('test auroc:', auroc)

                iixn[(fluid_fs, echo_fs, it)] = (auroc, res.aic, res.bic)
            
            except np.linalg.LinAlgError:
                print('{}:{}:{} model failed to fit'.format(fluid_fs, echo_fs, it))
                continue
                
iixn = pd.DataFrame(iixn).transpose()
iixn.columns = ['auroc', 'aic', 'bic']
iixn                

Optimization terminated successfully.
         Current function value: 0.501788
         Iterations 6
                 fluid_day1:lv:fluid_day1*lv_hyper
Model:                Logit            Pseudo R-squared: 0.155      
Dependent Variable:   y                AIC:              7064.9927  
Date:                 2017-05-18 17:32 BIC:              7236.2986  
No. Observations:     6990             Log-Likelihood:   -3507.5    
Df Model:             24               LL-Null:          -4152.1    
Df Residuals:         6965             LLR p-value:      2.3197e-257
Converged:            1.0000           Scale:            1.0000     
No. Iterations:       6.0000                                        
--------------------------------------------------------------------
                     Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
--------------------------------------------------------------------
fluid_day1          -0.0805   0.0345  -2.3335 0.0196 -0.1480 -0.0129
fluid_day1_missing 

Unnamed: 0,Unnamed: 1,Unnamed: 2,auroc,aic,bic
fluid_day1,lv,fluid_day1*CHF,0.734628,7064.832360,7236.138256
fluid_day1,lv,fluid_day1*lv_hyper,0.734224,7064.992658,7236.298554
fluid_day1,lv,fluid_day1*lv_mild,0.734631,7065.402993,7236.708889
fluid_day1,lv,fluid_day1*lv_missing,0.732968,7058.082483,7229.388379
fluid_day1,lv,fluid_day1*lv_moderate,0.734319,7065.446510,7236.752406
fluid_day1,lv,fluid_day1*lv_normal,0.733859,7055.908874,7227.214769
fluid_day1,lv,fluid_day1*lv_severe,0.733645,7064.562038,7235.867933
fluid_day1,lv,fluid_day1_missing*CHF,0.734365,7064.797514,7236.103410
fluid_day1,lv,fluid_day1_missing*lv_hyper,0.734325,7065.992474,7237.298370
fluid_day1,lv,fluid_day1_missing*lv_mild,0.734296,7066.367822,7237.673717


In [117]:
iixn.to_csv("../data/iixn.csv")

# Specific models

Build more specific models

In [133]:
fluid_fs = 'fluid_day1'
echo_fs = 'lv'

Xd = make_data(fluid_fs, echo_fs)
Xi = make_interactions(fluid_fs, echo_fs)
Xi = Xi[['fluid_day1*lv_hyper', 'fluid_day1*lv_missing', 'fluid_day1*lv_normal']]
X_ = pd.concat([Xd, Xi], axis = 1)

logit = sm.Logit(ytr, X_.loc[ytr.index])
res = logit.fit(maxiter = 100)

print(res.summary2(title = '{}:{}'.format(fluid_fs, echo_fs)))

ypr = res.predict(X_.loc[yte.index])
auroc = sklearn.metrics.roc_auc_score(yte, ypr)
print('test auroc:', auroc)

Optimization terminated successfully.
         Current function value: 0.499999
         Iterations 6
                            fluid_day1:lv
Model:                Logit             Pseudo R-squared:  0.158      
Dependent Variable:   y                 AIC:               7043.9830  
Date:                 2017-05-18 18:02  BIC:               7228.9934  
No. Observations:     6990              Log-Likelihood:    -3495.0    
Df Model:             26                LL-Null:           -4152.1    
Df Residuals:         6963              LLR p-value:       5.8267e-261
Converged:            1.0000            Scale:             1.0000     
No. Iterations:       6.0000                                          
----------------------------------------------------------------------
                       Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------------
fluid_day1             0.0317   0.0418   0.7582 0.4483 -0.0502  0.1135
flui

In [134]:
fluid_fs = 'fluid_day2'
echo_fs = 'lv'

Xd = make_data(fluid_fs, echo_fs)
Xi = make_interactions(fluid_fs, echo_fs)
Xi = Xi[['fluid_day2*lv_hyper', 'fluid_day2*lv_missing', 'fluid_day2*lv_normal']]
X_ = pd.concat([Xd, Xi], axis = 1)

logit = sm.Logit(ytr, X_.loc[ytr.index])
res = logit.fit(maxiter = 100)

print(res.summary2(title = '{}:{}'.format(fluid_fs, echo_fs)))

ypr = res.predict(X_.loc[yte.index])
auroc = sklearn.metrics.roc_auc_score(yte, ypr)
print('test auroc:', auroc)

Optimization terminated successfully.
         Current function value: 0.502380
         Iterations 6
                            fluid_day2:lv
Model:                Logit             Pseudo R-squared:  0.154      
Dependent Variable:   y                 AIC:               7077.2723  
Date:                 2017-05-18 18:03  BIC:               7262.2827  
No. Observations:     6990              Log-Likelihood:    -3511.6    
Df Model:             26                LL-Null:           -4152.1    
Df Residuals:         6963              LLR p-value:       7.2544e-254
Converged:            1.0000            Scale:             1.0000     
No. Iterations:       6.0000                                          
----------------------------------------------------------------------
                       Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------------
fluid_day2             0.0683   0.0455   1.5025 0.1330 -0.0208  0.1574
flui

In [136]:
fluid_fs = 'fluid_day3'
echo_fs = 'lv'

Xd = make_data(fluid_fs, echo_fs)
Xi = make_interactions(fluid_fs, echo_fs)
Xi = Xi[['fluid_day3*lv_mild']]
X_ = pd.concat([Xd, Xi], axis = 1)

logit = sm.Logit(ytr, X_.loc[ytr.index])
res = logit.fit(maxiter = 100)

print(res.summary2(title = '{}:{}'.format(fluid_fs, echo_fs)))

ypr = res.predict(X_.loc[yte.index])
auroc = sklearn.metrics.roc_auc_score(yte, ypr)
print('test auroc:', auroc)

Optimization terminated successfully.
         Current function value: 0.501837
         Iterations 6
                           fluid_day3:lv
Model:               Logit            Pseudo R-squared: 0.155      
Dependent Variable:  y                AIC:              7065.6845  
Date:                2017-05-18 18:05 BIC:              7236.9904  
No. Observations:    6990             Log-Likelihood:   -3507.8    
Df Model:            24               LL-Null:          -4152.1    
Df Residuals:        6965             LLR p-value:      3.2592e-257
Converged:           1.0000           Scale:            1.0000     
No. Iterations:      6.0000                                        
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
fluid_day3          0.0580   0.0399   1.4539 0.1460 -0.0202  0.1362
fluid_day3_missing -0.2580   0.0659  -3.9