# Model6_final

The purpose of this notebook / model experiment is to explore additional approaches to improve model performance

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
df_ts_agg = pd.read_csv("../proData/df_ts_agg.csv")
df_ts_agg = df_ts_agg.set_index("PATIENT_ID")

df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

with "remove_multicollinearity" set as true, some linear kernal-based model performance is expected to be improved

In [3]:
exp_physionet = setup(data = df_ts_agg, target = 'ihd', session_id=123,
                      categorical_features = ["GCS_min", "GCS_max"],
                      #bin_numeric_features = ["GCS_min", "GCS_max", "GCS_med"],
                      normalize = True, 
                      transformation = True, 
                      #ignore_low_variance = True,
                      remove_multicollinearity = True, multicollinearity_threshold = 0.9,
                      fix_imbalance = True
                     ) 

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,108
6,Categorical Features,7
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

min(Recall, Precision) is used for comparing, as suggested by the original physionet 2012 challenge

In [4]:
compare_models(turbo = False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.8703,0.8299,0.3043,0.5624,0.3927,0.3276,0.348,1.8842
1,CatBoost Classifier,0.8703,0.8365,0.2912,0.5687,0.3817,0.3181,0.3419,16.6224
2,Extreme Gradient Boosting,0.8678,0.8313,0.3067,0.5444,0.3904,0.323,0.3406,3.5728
3,Extra Trees Classifier,0.8632,0.8208,0.1908,0.5267,0.2777,0.22,0.256,0.2769
4,Gradient Boosting Classifier,0.8603,0.8225,0.3507,0.499,0.4099,0.3337,0.3413,5.9079
5,Random Forest Classifier,0.8392,0.77,0.2476,0.3809,0.2969,0.2118,0.2193,0.1154
6,MLP Classifier,0.8389,0.7592,0.3453,0.4097,0.3718,0.2807,0.2835,3.892
7,Ada Boost Classifier,0.8346,0.7823,0.4389,0.4084,0.4218,0.3257,0.3267,1.2983
8,SVM - Radial Kernel,0.8149,0.7767,0.4249,0.3609,0.3895,0.2816,0.2833,8.5899
9,Decision Tree Classifier,0.7964,0.6051,0.3404,0.2959,0.3146,0.1964,0.198,0.2323


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## create, tune, evaluate, predict adaboost model

In [5]:
ada = create_model("ada")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8643,0.8318,0.6053,0.5,0.5476,0.4686,0.4716
1,0.825,0.7943,0.4615,0.3913,0.4235,0.3212,0.3227
2,0.8286,0.7701,0.4103,0.3902,0.4,0.3001,0.3002
3,0.8429,0.799,0.4615,0.439,0.45,0.3584,0.3586
4,0.8143,0.7766,0.4359,0.3617,0.3953,0.2868,0.2885
5,0.8286,0.7754,0.3333,0.3714,0.3514,0.2529,0.2534
6,0.8357,0.7199,0.4103,0.4103,0.4103,0.3148,0.3148
7,0.8393,0.7727,0.359,0.4118,0.3836,0.2917,0.2926
8,0.8143,0.7371,0.359,0.3415,0.35,0.2417,0.2418
9,0.853,0.8463,0.5526,0.4667,0.506,0.4204,0.4225


In [18]:
tuned_ada = tune_model(ada, optimize = "Accuracy", choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8571,0.8184,0.6053,0.4792,0.5349,0.4518,0.4561
1,0.8429,0.8026,0.5641,0.449,0.5,0.4082,0.4119
2,0.8036,0.7871,0.4615,0.3462,0.3956,0.2812,0.2853
3,0.8607,0.847,0.5385,0.5,0.5185,0.4372,0.4376
4,0.8143,0.79,0.4359,0.3617,0.3953,0.2868,0.2885
5,0.7964,0.804,0.359,0.3043,0.3294,0.2104,0.2114
6,0.8214,0.7786,0.3846,0.3659,0.375,0.2709,0.271
7,0.8429,0.793,0.4359,0.4359,0.4359,0.3446,0.3446
8,0.825,0.7697,0.359,0.3684,0.3636,0.2622,0.2622
9,0.8459,0.8322,0.4737,0.439,0.4557,0.3661,0.3664


In [19]:
evaluate_model(tuned_ada)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [20]:
predict_model(tuned_ada)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.8493,0.7915,0.4036,0.4497,0.4254,0.339,0.3396


Unnamed: 0,Age,Height,Weight_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,NIDiasABP_med,NIMAP_min,...,GCS_max_7.0,GCS_max_8.0,GCS_max_9.0,GCS_max_nan,MechVent_min_0,MechVent_max_1,MechVent_med_0.0,ihd,Label,Score
0,0.477499,-1.215611,0.925603,-0.511354,0.753785,-0.510431,0.160269,-0.915253,-1.733592,-0.360001,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0.4824
1,-0.556735,-0.031642,-0.694846,0.963869,1.427515,1.920192,1.464024,-0.820052,-0.179077,0.425147,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.4845
2,1.769352,0.035305,-0.633653,0.434389,-1.360105,0.149794,-0.498416,-0.004823,0.006606,1.090112,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,0.5001
3,-1.189793,-0.031642,0.178626,0.963869,0.616506,0.103027,0.517847,0.721668,1.594774,0.185996,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.4732
4,-0.057864,-0.031642,-2.314832,-0.862434,-1.360105,-1.641198,-1.738028,0.721668,-0.568123,1.019298,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0.4810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.725754,-0.364897,0.055875,-0.295979,-0.320230,-0.447804,-0.668665,0.603954,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0,0.4968
1197,-0.505915,-0.031642,-0.478512,0.963869,-0.827196,0.704247,-0.606748,-0.004823,0.483916,-0.240703,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.4693
1198,0.406102,-0.031642,-0.819713,0.963869,2.408744,1.190597,2.268197,-0.632149,0.272510,-0.795156,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0,0.4990
1199,-2.067770,-0.031642,0.439906,-1.206565,1.025611,0.501095,1.106328,1.170347,0.846665,2.004292,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0.4945


## explore blend models, both hard and soft voting

In [24]:
blend_hard = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8679,0.0,0.6053,0.5111,0.5542,0.4773,0.4796
1,0.8536,0.0,0.5385,0.4773,0.506,0.4204,0.4215
2,0.85,0.0,0.4359,0.4595,0.4474,0.3607,0.3608
3,0.8714,0.0,0.4872,0.5429,0.5135,0.4397,0.4405
4,0.8536,0.0,0.5385,0.4773,0.506,0.4204,0.4215
5,0.8357,0.0,0.2821,0.3793,0.3235,0.2323,0.2356
6,0.8571,0.0,0.5641,0.4889,0.5238,0.4403,0.4418
7,0.8321,0.0,0.4615,0.4091,0.4337,0.3356,0.3365
8,0.875,0.0,0.4615,0.5625,0.507,0.4363,0.4391
9,0.871,0.0,0.4737,0.5294,0.5,0.4262,0.4271


In [25]:
evaluate_model(blend_hard)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [26]:
predict_model(blend_hard)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8468,0,0.4096,0.4416,0.425,0.3368,0.3371


Unnamed: 0,Age,Height,Weight_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,NIDiasABP_med,NIMAP_min,...,GCS_max_6.0,GCS_max_7.0,GCS_max_8.0,GCS_max_9.0,GCS_max_nan,MechVent_min_0,MechVent_max_1,MechVent_med_0.0,ihd,Label
0,0.477499,-1.215611,0.925603,-0.511354,0.753785,-0.510431,0.160269,-0.915253,-1.733592,-0.360001,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0
1,-0.556735,-0.031642,-0.694846,0.963869,1.427515,1.920192,1.464024,-0.820052,-0.179077,0.425147,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0
2,1.769352,0.035305,-0.633653,0.434389,-1.360105,0.149794,-0.498416,-0.004823,0.006606,1.090112,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0
3,-1.189793,-0.031642,0.178626,0.963869,0.616506,0.103027,0.517847,0.721668,1.594774,0.185996,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0
4,-0.057864,-0.031642,-2.314832,-0.862434,-1.360105,-1.641198,-1.738028,0.721668,-0.568123,1.019298,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.725754,-0.364897,0.055875,-0.295979,-0.320230,-0.447804,-0.668665,0.603954,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0
1197,-0.505915,-0.031642,-0.478512,0.963869,-0.827196,0.704247,-0.606748,-0.004823,0.483916,-0.240703,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0
1198,0.406102,-0.031642,-0.819713,0.963869,2.408744,1.190597,2.268197,-0.632149,0.272510,-0.795156,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0
1199,-2.067770,-0.031642,0.439906,-1.206565,1.025611,0.501095,1.106328,1.170347,0.846665,2.004292,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0


In [21]:
blend_soft = blend_models(method = "soft")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8536,0.8706,0.6316,0.4706,0.5393,0.4545,0.4614
1,0.8393,0.8054,0.4872,0.4318,0.4578,0.3639,0.3648
2,0.8393,0.8175,0.3846,0.4167,0.4,0.3074,0.3077
3,0.8607,0.8441,0.5385,0.5,0.5185,0.4372,0.4376
4,0.8321,0.8037,0.4615,0.4091,0.4337,0.3356,0.3365
5,0.8393,0.8403,0.2821,0.3929,0.3284,0.2399,0.2441
6,0.8464,0.8232,0.5128,0.4545,0.4819,0.3922,0.3931
7,0.8571,0.8136,0.4359,0.4857,0.4595,0.3774,0.3782
8,0.8714,0.8087,0.4359,0.5484,0.4857,0.4133,0.4169
9,0.8566,0.8456,0.5,0.475,0.4872,0.4039,0.4041


In [22]:
evaluate_model(blend_soft)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [23]:
predict_model(blend_soft)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8385,0.8312,0.4518,0.4213,0.436,0.3419,0.3422


Unnamed: 0,Age,Height,Weight_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,NIDiasABP_med,NIMAP_min,...,GCS_max_7.0,GCS_max_8.0,GCS_max_9.0,GCS_max_nan,MechVent_min_0,MechVent_max_1,MechVent_med_0.0,ihd,Label,Score
0,0.477499,-1.215611,0.925603,-0.511354,0.753785,-0.510431,0.160269,-0.915253,-1.733592,-0.360001,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0.1759
1,-0.556735,-0.031642,-0.694846,0.963869,1.427515,1.920192,1.464024,-0.820052,-0.179077,0.425147,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.3760
2,1.769352,0.035305,-0.633653,0.434389,-1.360105,0.149794,-0.498416,-0.004823,0.006606,1.090112,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0,0.3177
3,-1.189793,-0.031642,0.178626,0.963869,0.616506,0.103027,0.517847,0.721668,1.594774,0.185996,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.0442
4,-0.057864,-0.031642,-2.314832,-0.862434,-1.360105,-1.641198,-1.738028,0.721668,-0.568123,1.019298,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0.2035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.725754,-0.364897,0.055875,-0.295979,-0.320230,-0.447804,-0.668665,0.603954,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0,0.4883
1197,-0.505915,-0.031642,-0.478512,0.963869,-0.827196,0.704247,-0.606748,-0.004823,0.483916,-0.240703,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.1064
1198,0.406102,-0.031642,-0.819713,0.963869,2.408744,1.190597,2.268197,-0.632149,0.272510,-0.795156,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0,0.4527
1199,-2.067770,-0.031642,0.439906,-1.206565,1.025611,0.501095,1.106328,1.170347,0.846665,2.004292,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0.2599


## conclusion

with additional binning / categorizing certain numeric values, either a single model, e.g. adaboost, or blend models has increased train or test min(recall, precision) to 0.48 or 0.42

Key potential next steps:
* further exploring binning / categorizing
* handling missing data, especially for certain variables with large missingness, e.g. assign distinct numeric value for missingness as missingness itself would be useful information for prediction