In [1]:
import numpy as np
import pandas as pd
import utils

import copy, math, os, pickle, time 

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, GammaRegressor
from sklearn.svm import LinearSVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from sklearn.pipeline import Pipeline

from scipy.stats import pearsonr, spearmanr, kendalltau

# To show all columns in a dataframe
pd.options.display.max_info_columns=250
pd.options.display.max_columns=500

# To make pretty plots
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-ticks')
sns.set_style('ticks')
plt.rcParams['figure.figsize'] = (6, 4)
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16

%matplotlib inline

In [2]:
df_train = pd.read_csv("../data/mimic-ft98-clustered-S0-train.csv")
df_train.drop(columns=["starttime", "endtime"], inplace=True)

label = "log_duration"

print(df_train.shape)
df_train.head()

(10121, 107)


Unnamed: 0,stay_id,admission_location,insurance,language,ethnicity,marital_status,gender,age,hours_in_hosp_before_intubation,weight,height,co2_total_max,co2_total_avg,co2_total_min,ph_max,ph_avg,ph_min,lactate_max,lactate_avg,lactate_min,pao2fio2ratio,heart_rate_max,heart_rate_avg,heart_rate_min,mbp_max,mbp_avg,mbp_min,mbp_ni_max,mbp_ni_avg,mbp_ni_min,resp_rate_max,resp_rate_avg,resp_rate_min,temp_max,temp_avg,temp_min,spo2_max,spo2_avg,spo2_min,glucose_max,glucose_avg,glucose_min,vasopressin,epinephrine,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_avg,fio2_min,peep_max,peep_avg,peep_min,plateau_pressure_max,plateau_pressure_avg,plateau_pressure_min,rrt,sinus_rhythm,neuroblocker,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,SOFA,respiration,coagulation,liver,cardiovascular,cns,renal,apsiii,hr_score,mbp_score,temp_score,resp_rate_score,pao2_aado2_score,hematocrit_score,wbc_score,creatinine_score,uo_score,bun_score,sodium_score,albumin_score,bilirubin_score,glucose_score,acidbase_score,gcs_score,duration,log_duration,over72h,alive96h,pc1,pc2,pc3,cluster
0,38910812,EMERGENCY ROOM,Other,ENGLISH,UNKNOWN,SINGLE,M,56,17,77.0,,19.0,18.0,17.0,7.33,7.305,7.28,7.4,6.95,6.5,108.0,82.0,72.44,65.0,100.0,73.72,57.0,63.0,63.0,63.0,26.0,22.8,20.0,37.06,36.551667,36.0,98.0,94.84,92.0,136.0,102.4,62.0,0,0,0,0,1,0,1,50.0,50.0,50.0,6.0,5.6,5.0,16.0,16.0,16.0,0,1.0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,14,3.0,2.0,3.0,1.0,4,1.0,118,5.0,15.0,0.0,6.0,2.0,3.0,0.0,4.0,8.0,11.0,2.0,0.0,8.0,0.0,6.0,48.0,75.033333,4.317932,1,0,60.593486,-5.996556,-2.941236,4
1,38388229,EMERGENCY ROOM,Other,ENGLISH,BLACK/AFRICAN AMERICAN,MARRIED,M,81,45,95.5,180.0,23.0,22.5,22.0,7.44,7.435,7.43,,,,210.0,110.0,89.333333,54.0,103.0,83.269231,71.0,91.0,80.555556,71.0,33.0,23.94,16.0,38.61,37.426667,36.67,100.0,98.666667,96.0,205.0,162.666667,109.0,0,0,0,0,1,0,1,50.0,42.5,40.0,5.0,5.0,5.0,16.0,15.333333,15.0,0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,5,,1.0,0.0,0.0,3,1.0,60,5.0,7.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,33.0,213.633333,5.364261,1,1,-0.342315,-19.28656,3.687725,3
2,31753166,TRANSFER FROM HOSPITAL,Medicare,ENGLISH,WHITE,MARRIED,M,91,73,79.5,175.0,27.0,26.333333,26.0,7.49,7.46,7.43,,,,300.0,74.0,61.65625,60.0,128.0,72.3625,45.0,95.0,73.681818,45.0,38.0,19.234375,14.0,38.0,37.13,36.22,100.0,98.15625,94.0,72.0,71.5,71.0,0,0,0,0,0,0,0,100.0,48.75,30.0,10.0,5.7,5.0,24.0,20.916667,20.0,0,0.0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,7,2.0,0.0,,0.0,3,2.0,72,0.0,10.0,0.0,6.0,5.0,3.0,0.0,7.0,4.0,11.0,0.0,,,0.0,2.0,24.0,90.416667,4.504429,1,1,7.868908,-4.134309,-5.318351,3
3,30003299,EMERGENCY ROOM,Other,ENGLISH,WHITE,SINGLE,M,26,1,120.0,178.0,29.0,24.888889,21.0,7.4,7.335556,7.27,4.0,2.777778,1.5,280.0,133.0,119.5,101.0,122.0,93.071429,70.0,,,,18.0,17.105263,12.0,37.44,36.971667,36.39,100.0,98.555556,96.0,185.0,152.166667,130.0,0,0,0,0,0,0,0,50.0,48.333333,40.0,5.0,5.0,5.0,25.0,23.6,22.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.0,0.0,,0.0,3,0.0,48,7.0,7.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,,,0.0,12.0,15.0,154.183333,5.038142,1,1,-18.294562,-7.29107,3.911806,2
4,31166711,EMERGENCY ROOM,Other,ENGLISH,WHITE,SINGLE,M,42,77,97.6,183.0,32.0,20.75,15.0,7.22,7.1565,7.0,6.4,4.485,2.2,72.0,150.0,128.5,113.0,88.0,67.607143,47.0,,,,35.0,16.017857,10.0,39.8,38.15,37.3,100.0,90.62069,78.0,173.0,120.421053,77.0,1,1,0,1,1,0,4,100.0,100.0,100.0,16.0,12.769231,10.0,32.0,26.5,21.0,1,0.0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,14,4.0,1.0,2.0,4.0,1,2.0,92,13.0,15.0,0.0,9.0,0.0,3.0,0.0,7.0,5.0,7.0,2.0,11.0,5.0,0.0,12.0,3.0,420.283333,6.040929,1,1,19.069108,25.24142,6.178101,2


In [3]:
# features = ["acidbase_score",
#             "cns",
#             "co2_total_avg",
#             "gcs_score",
#             "height",
# #             "malignant_cancer",
# #             "paraplegia",
#             "peep_avg",
#             "peep_min",
#             "resp_rate_min",
#             "temp_avg",
#             "uo_score", 
#             "apsiii",
#             "temp_max",
#             "SOFA",
#             "plateau_pressure_max",
#             "fio2_min",
#             "cardiovascular",
#             "neuroblocker"
#            ]

# # All eICU features
# features = ['ph_max', 'spo2_min',
#        'heart_rate_min', 'heart_rate_max', 'resp_rate_min', 'resp_rate_max',
#        'temp_min', 'temp_max', 'glucose_max', 'glucose_min', 'co2_total_max',
#        'co2_total_min', 'mbp_max', 'mbp_ni_min', 'apsiii', 'peep_max',
#        'peep_min', 'co2_total_avg', 'fio2_min', 'plateau_pressure_max',
#        'height', 'peep_avg', 'temp_avg', 'hr_score', 'mbp_score', 'temp_score',
#        'resp_rate_score', 'pao2_aado2_score', 'hematocrit_score', 'wbc_score',
#        'creatinine_score', 'uo_score', 'bun_score', 'sodium_score',
#        'albumin_score', 'bilirubin_score', 'glucose_score', 'acidbase_score',
#        'gcs_score', 'SOFA', 'respiration', 'coagulation', 'liver',
#        'cardiovascular', 'cns', 'renal', 
#            ]

features = None

X_train, y_train = utils.get_X_and_y(df_train, features=features, label=label)
preprocessor = utils.define_preprocessor(X_train.columns)

features = X_train.columns

In [4]:
for ft in features:
    X_train, y_train = utils.get_X_and_y(df_train, features=features, label=label)
    preprocessor = utils.define_preprocessor(X_train[[ft]])
    reg = GradientBoostingRegressor()

    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', reg)])
    y_pred = utils.benchmark_cv(pipe, X_train, y_train, head="reg")
    print("%s : %.3f" % (ft, pearsonr(df_train.log_duration, y_pred)[0]))

ValueError: The truth value of a Index is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

### Preprocessing

In [5]:
cat_features = df_train.columns[1:7]
num_features = df_train.columns[7:-8]

len(num_features), len(cat_features)

(92, 6)

In [6]:
df_train[cat_features].isna().sum()

admission_location       0
insurance                0
language                 0
ethnicity                0
marital_status        1213
gender                   0
dtype: int64

In [7]:
from sklearn.preprocessing import OneHotEncoder

df_train.marital_status.fillna("TINDER", inplace=True)
onehot = OneHotEncoder()
onehot.fit(df_train[cat_features])
onehot.categories_

[array(['AMBULATORY SURGERY TRANSFER', 'CLINIC REFERRAL', 'EMERGENCY ROOM',
        'INFORMATION NOT AVAILABLE', 'PACU', 'PHYSICIAN REFERRAL',
        'PROCEDURE SITE', 'TRANSFER FROM HOSPITAL',
        'TRANSFER FROM SKILLED NURSING FACILITY', 'WALK-IN/SELF REFERRAL'],
       dtype=object),
 array(['Medicaid', 'Medicare', 'Other'], dtype=object),
 array(['?', 'ENGLISH'], dtype=object),
 array(['AMERICAN INDIAN/ALASKA NATIVE', 'ASIAN', 'BLACK/AFRICAN AMERICAN',
        'HISPANIC/LATINO', 'OTHER', 'UNABLE TO OBTAIN', 'UNKNOWN', 'WHITE'],
       dtype=object),
 array(['DIVORCED', 'MARRIED', 'SINGLE', 'TINDER', 'WIDOWED'], dtype=object),
 array(['F', 'M'], dtype=object)]

In [8]:
cat_features_ = [cat.lower() for arr in onehot.categories_ for cat in list(arr)]
feature_names = pd.Index(num_features.tolist() + cat_features_)
len(num_features), len(cat_features_), len(feature_names)

(92, 30, 122)

### Using `SelectFromModel`

In [9]:
from sklearn.feature_selection import SelectFromModel

X_train = preprocessor.fit_transform(X_train)
reg = GradientBoostingRegressor()
reg = reg.fit(X_train, y_train)
reg.feature_importances_ 

array([9.80684268e-03, 1.37590310e-02, 3.48263830e-03, 3.34366504e-02,
       4.65218455e-03, 5.42129376e-03, 1.04462179e-02, 2.03384996e-03,
       7.73778452e-03, 1.17256365e-03, 1.48732319e-03, 5.54681106e-03,
       1.22067179e-02, 7.85376811e-03, 6.53125510e-03, 3.40015556e-03,
       5.20464614e-04, 3.74719819e-03, 4.00442077e-03, 1.92240700e-03,
       1.49154534e-03, 1.73515533e-03, 3.10133168e-04, 2.16176538e-03,
       2.21438567e-02, 9.27428286e-03, 1.16463429e-03, 2.09314849e-02,
       1.56443963e-03, 9.21518481e-04, 3.70613241e-03, 3.00008213e-03,
       7.33689355e-03, 1.12161658e-03, 2.29481099e-03, 0.00000000e+00,
       2.73967858e-03, 0.00000000e+00, 0.00000000e+00, 3.40337058e-04,
       2.02753563e-04, 0.00000000e+00, 3.96945389e-04, 3.12553638e-03,
       2.97098006e-03, 1.14209631e-02, 5.92979969e-02, 3.12720324e-02,
       2.25596161e-02, 1.68299763e-02, 1.11224799e-03, 2.83309998e-05,
       1.11638109e-06, 4.90744494e-03, 1.26236462e-03, 2.16722575e-03,
      

In [10]:
len(reg.feature_importances_)

122

In [11]:
df_imp = pd.DataFrame(zip(feature_names, reg.feature_importances_), columns=["feature", "importance"])

In [12]:
df_imp.sort_values(by="importance", ascending=False).head(20)

Unnamed: 0,feature,importance
91,gcs_score,0.26554
90,acidbase_score,0.140638
73,cns,0.119035
46,peep_avg,0.059298
75,apsiii,0.056191
3,height,0.033437
47,peep_min,0.031272
48,plateau_pressure_max,0.02256
24,resp_rate_avg,0.022144
27,temp_avg,0.020931


In [14]:
model = SelectFromModel(reg, prefit=True, max_features=122)
X_new = model.transform(X_train)
X_new.shape 

(10121, 18)

In [15]:
feature_names[model.get_support()]
# X_train.columns[model.get_support()]

Index(['age', 'hours_in_hosp_before_intubation', 'height', 'co2_total_min',
       'lactate_min', 'resp_rate_avg', 'resp_rate_min', 'temp_avg', 'peep_max',
       'peep_avg', 'peep_min', 'plateau_pressure_max', 'plateau_pressure_avg',
       'paraplegia', 'cns', 'apsiii', 'acidbase_score', 'gcs_score'],
      dtype='object')

In [18]:
for max_features in range(1,20):
    model = SelectFromModel(reg, prefit=True, max_features=max_features)
    X_new = model.transform(X_train)
#     df_new = pd.DataFrame(X_new, columns = X_train.columns[model.get_support()])
    df_new = pd.DataFrame(X_new, columns = feature_names[model.get_support()])
    print(df_new.shape)
    print(df_new.columns)
    print()
    
    preprocessor = utils.define_preprocessor(df_new.columns)
    reg_new = GradientBoostingRegressor()

    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', reg_new)])
    y_pred = utils.benchmark_cv(pipe, df_new, y_train, head="reg")
    print("%d : %.3f" % (max_features, pearsonr(y_train, y_pred)[0]))

(10121, 1)
Index(['gcs_score'], dtype='object')

________________________________________________________________________________

Model training: 
train time: 0.725s
1 : 0.389
(10121, 2)
Index(['acidbase_score', 'gcs_score'], dtype='object')

________________________________________________________________________________

Model training: 
train time: 1.174s
2 : 0.460
(10121, 3)
Index(['cns', 'acidbase_score', 'gcs_score'], dtype='object')

________________________________________________________________________________

Model training: 
train time: 1.415s
3 : 0.465
(10121, 4)
Index(['peep_avg', 'cns', 'acidbase_score', 'gcs_score'], dtype='object')

________________________________________________________________________________

Model training: 
train time: 2.579s
4 : 0.494
(10121, 5)
Index(['peep_avg', 'cns', 'apsiii', 'acidbase_score', 'gcs_score'], dtype='object')

________________________________________________________________________________

Model training: 
train time: 3.542

### Using `SequentialFeatureSelector`

In [36]:
from sklearn.feature_selection import SequentialFeatureSelector
from time import time

tic = time()
reg = GradientBoostingRegressor()
sfs_forward = SequentialFeatureSelector(reg, n_features_to_select=12,
                                        direction='forward').fit(X_train, y_train)
toc = time()

print("Features selected by forward sequential selection: \n"
      f"{X_train.columns[sfs_forward.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

Features selected by forward sequential selection: 
Index(['height', 'co2_total_min', 'resp_rate_min', 'temp_avg', 'epinephrine',
       'peep_avg', 'plateau_pressure_avg', 'paraplegia', 'apsiii', 'mbp_score',
       'acidbase_score', 'gcs_score'],
      dtype='object')
Done in 4899.456s


Features selected by forward sequential selection: 
Index(['height', 'co2_total_min', 'peep_avg', 'acidbase_score', 'gcs_score'], dtype='object')
Done in 523.710s
