# Model development using EMR data only (Strategy 4)
1. Summary statistics
2. Feature selection (to add)
3. Model development
4. Hyperparameter tuning (to add)
5. Evaluation of the final model and error analysis (to add)

In [1]:
import numpy as np
import pandas as pd
import utils

import copy, math, os, pickle, time 
import scipy.stats as ss

from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline

# To show all columns in a dataframe
pd.options.display.max_info_columns=250
pd.options.display.max_columns=500

# To make pretty plots
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-ticks')
sns.set_style('ticks')
plt.rcParams['figure.figsize'] = (6, 4)
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16

%matplotlib inline

### Load and prepare the data

In [2]:
df_train = pd.read_csv("../data/emr-filled-train-S4.csv")
df_train.drop(columns=["starttime", "endtime"], inplace=True)
print(df_train.shape)
df_train.head()

(2963, 66)


Unnamed: 0,stay_id,admission_location,insurance,language,ethnicity,marital_status,gender,age,hours_in_hosp_before_intubation,weight,height,co2_total_max,co2_total_min,ph_max,ph_min,lactate_max,lactate_min,pao2fio2ratio,heart_rate_max,heart_rate_min,mbp_ni_max,mbp_ni_min,mbp_arterial_max,mbp_arterial_min,resp_rate_max,resp_rate_min,spo2_max,spo2_min,temp_max,temp_min,glucose_max,glucose_min,epinephrine,vasopressin,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_min,peep_max,peep_min,plateau_pressure_max,plateau_pressure_min,rrt,sinus_rhythm,neuroblocker,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,SOFA,apsiii,over72h,alive96h
0,38450498,EMERGENCY ROOM,Medicare,ENGLISH,WHITE,SINGLE,M,50,2,95.4,169.099032,37.0,29.0,7.51,7.42,1.3,1.3,176.0,112.0,50.0,95.0,55.0,116.0,61.0,31.0,4.0,100.0,90.0,38.17,36.56,169.0,78.0,1,1,1,1,1,1,6,100.0,40.0,12.0,5.0,23.0,14.0,0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,60,1,1
1,35319936,TRANSFER FROM HOSPITAL,Medicare,ENGLISH,OTHER,SINGLE,M,40,49,99.8,169.099032,29.0,22.0,7.53,7.15,1.4,1.4,250.0,130.0,52.0,75.0,56.0,113.0,58.0,40.0,15.0,100.0,92.0,39.28,36.0,330.0,63.0,1,1,1,1,1,1,6,100.0,40.0,14.4,0.0,28.0,25.0,0,1.0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,5,77,1,1
2,39141494,PHYSICIAN REFERRAL,Medicaid,ENGLISH,WHITE,SINGLE,M,59,37,52.1,169.099032,27.0,24.0,7.37,7.36,2.2,1.1,224.0,94.0,68.0,106.0,62.0,113.0,55.0,33.0,13.0,100.0,90.0,38.72,36.83,860.0,88.0,1,1,1,1,1,1,6,70.0,35.0,5.0,0.0,22.0,15.0,0,1.0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,5,27,0,1
3,34129457,EMERGENCY ROOM,Medicare,ENGLISH,WHITE,MARRIED,M,73,344,67.3,169.099032,21.0,20.0,7.35,7.33,2.2,1.1,80.0,136.0,60.0,96.0,47.0,113.0,55.0,35.0,12.0,100.0,92.0,38.56,36.44,121.0,93.0,1,1,1,1,1,1,6,45.0,35.0,5.0,5.0,22.0,15.0,0,1.0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,7,52,1,1
4,38836976,EMERGENCY ROOM,Medicare,ENGLISH,WHITE,SINGLE,F,30,87,102.1,173.0,26.0,23.0,7.37,7.24,2.2,1.1,142.0,102.0,71.0,74.0,51.0,113.0,55.0,26.0,12.0,100.0,95.0,37.67,36.89,115.0,110.0,1,1,1,1,1,1,6,100.0,40.0,24.0,5.0,30.0,20.0,0,1.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,5,27,0,1


**Summary statistics**

In [3]:
df_train.describe()

Unnamed: 0,stay_id,age,hours_in_hosp_before_intubation,weight,height,co2_total_max,co2_total_min,ph_max,ph_min,lactate_max,lactate_min,pao2fio2ratio,heart_rate_max,heart_rate_min,mbp_ni_max,mbp_ni_min,mbp_arterial_max,mbp_arterial_min,resp_rate_max,resp_rate_min,spo2_max,spo2_min,temp_max,temp_min,glucose_max,glucose_min,epinephrine,vasopressin,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_min,peep_max,peep_min,plateau_pressure_max,plateau_pressure_min,rrt,sinus_rhythm,neuroblocker,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,SOFA,apsiii,over72h,alive96h
count,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0,2963.0
mean,35008870.0,63.79244,80.024975,83.793971,169.085239,29.391158,21.721228,7.450175,7.289015,3.198046,1.306149,253.223856,118.368883,63.905839,101.666892,54.009787,121.292946,52.483834,32.941951,9.299696,99.926088,88.521093,38.175366,36.13839,2249.355383,93.02565,1.0,1.0,1.0,1.0,1.0,1.0,6.0,75.20891,38.36787,9.616372,2.662774,23.837023,15.534931,0.147823,0.603105,0.096524,0.314884,0.193723,0.031387,0.306109,0.039825,0.187648,0.255484,0.078637,0.083361,0.242322,0.119811,0.089436,0.053662,0.013162,8.383058,67.061762,0.516031,0.915626
std,2873116.0,16.516083,264.242538,24.628087,8.649546,6.577221,6.234982,0.069163,0.113495,3.019015,1.014579,114.659821,22.905199,14.471056,21.726806,11.889468,39.958958,15.140875,8.161354,3.818461,0.525775,11.923335,0.84994,1.069951,44951.706952,30.308096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.688564,7.322074,4.615771,2.949842,53.591823,4.257685,0.354985,0.489336,0.295358,0.464548,0.395281,0.174391,0.460953,0.19558,0.390496,0.436207,0.269216,0.276474,0.42856,0.324795,0.285421,0.225387,0.113989,4.135869,27.889272,0.499827,0.277995
min,30004890.0,18.0,0.0,1.3,122.0,11.0,0.0,7.1,6.38,0.4,0.2,35.0,50.0,1.0,33.0,11.0,1.0,1.0,15.0,1.0,79.0,1.0,32.8,15.0,68.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,25.0,21.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0
25%,32535390.0,54.0,2.0,67.05,165.0,25.0,18.0,7.41,7.22,1.5,0.9,172.5,103.0,54.0,87.0,48.0,105.0,52.0,27.0,7.0,100.0,87.0,37.5,36.11,154.0,74.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,50.0,35.0,6.0,0.0,19.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,46.0,0.0,1.0
50%,35029440.0,66.0,6.0,80.8,169.099032,29.0,22.0,7.45,7.31,2.2,1.1,245.0,117.0,63.0,101.0,54.0,113.0,55.0,32.0,9.0,100.0,92.0,38.1,36.44,196.0,91.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,70.0,40.0,9.0,0.0,22.0,15.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,64.0,1.0,1.0
75%,37488830.0,76.0,62.0,96.0,173.0,32.0,25.0,7.49,7.37,3.6,1.4,320.0,133.0,73.0,113.0,60.0,120.0,58.0,37.0,12.0,100.0,95.0,38.8,36.67,269.0,109.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,40.0,12.0,5.0,26.0,18.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,85.0,1.0,1.0
max,39997980.0,97.0,8942.0,264.0,198.0,65.0,52.0,7.93,7.61,29.1,17.5,1353.333333,247.0,125.0,220.0,107.0,299.0,241.0,69.0,31.0,100.0,100.0,43.06,37.83,999999.0,421.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,100.0,52.0,26.0,2910.0,39.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23.0,189.0,1.0,1.0


**Drop constant variables**

In [4]:
df_train = df_train.loc[:, df_train.apply(pd.Series.nunique) != 1]

### Feature selection

In [5]:
df_train["group"] = df_train.over72h.astype(str) + df_train.alive96h.astype(str)

In [6]:
# features = ryo_features

X_train, y_train = utils.get_X_and_y(df_train, label="group")
print(X_train.shape, y_train.shape)

preprocessor = utils.define_preprocessor(X_train.columns)

(2963, 56) (2963,)


In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train = le.fit_transform(y_train)

le.classes_

array(['00', '01', '10', '11'], dtype=object)

In [12]:
clfs = (
#     LogisticRegression(max_iter=1000),
#     KNeighborsClassifier(),
#     SVC(),
#     DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
)

for clf in clfs:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', clf)])
    utils.benchmark_cv_score(pipe, X_train, y_train)

________________________________________________________________________________

Model training: 
RandomForestClassifier()


ValueError: multiclass format is not supported

In [13]:
clf = RandomForestClassifier()

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', clf)])

In [14]:
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(pipe, X_train, y_train, n_jobs=-1, cv=10)

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

print("precision:", precision_score(y_train, y_pred, average=None))
print("recall:", recall_score(y_train, y_pred, average=None))
print("f1 score:", f1_score(y_train, y_pred, average=None))

precision: [0.81818182 0.76070901 0.         0.80678851]
recall: [0.31034483 0.83671812 0.         0.8340081 ]
f1 score: [0.45       0.79690522 0.         0.82017253]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import cross_val_predict

y_proba = cross_val_predict(pipe, X_train, y_train, n_jobs=-1, cv=10, method="predict_proba")

In [None]:
y_proba[1].shape

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

print("Over 72h")
print("precision:", precision_score(y_train[:,0], y_pred[:,0]))
print("recall:", recall_score(y_train[:,0], y_pred[:,0]))
print("f1 score:", f1_score(y_train[:,0], y_pred[:,0]))
print()
print("Alive @ 96h")
print("precision:", precision_score(y_train[:,1], y_pred[:,1], pos_label=0))
print("recall:", recall_score(y_train[:,1], y_pred[:,1], pos_label=0))
print("f1 score:", f1_score(y_train[:,1], y_pred[:,1], pos_label=0))

In [None]:
print("precision:", precision_score(y_train[:,1], np.ones_like(y_train[:,1])))
print("recall:", recall_score(y_train[:,1], np.ones_like(y_train[:,1])))
print("f1 score:", f1_score(y_train[:,1], np.ones_like(y_train[:,1])))

In [None]:
class_names = ("MV <= 72 hours", "MV > 72 hours")
# class_names = ("Bad outcome", "Good outcome")

clfs = (
    LogisticRegression(max_iter=1000),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
)

for clf in clfs:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', clf)])
    utils.benchmark_cv_score(pipe, X_train, y_train, class_names)

### Hyperparameter tuning

In [None]:
# class DictDist():
#     def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
#     def rvs(self, n):
#         a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
#         out = []
#         for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
#         return out
    
# class Choice():
#     def __init__(self, options): self.options = options
#     def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [None]:
# N = 15
# SEED = 1443
# RF_dist = DictDist({
#     'n_estimators': ss.randint(50, 500),
#     'max_depth': ss.randint(2, 10),
#     'min_samples_split': ss.randint(2, 75),
#     'min_samples_leaf': ss.randint(1, 50),
# })
# np.random.seed(SEED)
# RF_hyperparams_list = RF_dist.rvs(N)