# Model4_standardize

The purpose of this notebook / model experiment is to improve upon the imbalance-fixed model obtained in model2_fix_imbalance.ipynb by standardizing the variables (normalization + transformation).

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
# Read the per-patient table and index it by patient ID in one chained
# expression, so the name is bound once instead of being reassigned.
df_ts_agg = (
    pd.read_csv("../proData/df_ts_agg.csv")
    .set_index("PATIENT_ID")
)

# Bare last expression so the notebook renders the frame.
df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

With `normalize` and `transformation` set to `True`, the performance of some linear and kernel-based models is expected to improve.

In [3]:
# Configure the PyCaret classification experiment on the per-patient frame,
# predicting the `ihd` column.
exp_physionet = setup(
    data=df_ts_agg,
    target="ihd",
    session_id=123,  # fixed session id so reruns use the same seed
    # Explicitly type the two GCS columns as numeric instead of relying on
    # PyCaret's automatic type inference.
    numeric_features=["GCS_min", "GCS_max"],
    # The two preprocessing switches this experiment is about.
    normalize=True,
    transformation=True,
    # Options left disabled in this experiment:
    # ignore_low_variance=True,
    # remove_multicollinearity=True, multicollinearity_threshold=0.9,
    fix_imbalance=True,  # carried over from the earlier imbalance-fix experiment
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,110
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

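min(Recall, Precision) is used for comparing models, as suggested by the original PhysioNet 2012 challenge. Note that the table below is sorted by Accuracy, so min(Recall, Precision) has to be read off the Recall and Prec. columns for each model.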

In [9]:
# turbo=False also includes the slower estimators that PyCaret skips by default.
compare_models(turbo=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Extra Trees Classifier,0.8678,0.8231,0.2372,0.5764,0.3312,0.2715,0.3067,0.2762
1,Light Gradient Boosting Machine,0.8667,0.8365,0.3071,0.5396,0.3885,0.3204,0.3378,2.4954
2,Extreme Gradient Boosting,0.865,0.8325,0.3198,0.5277,0.3958,0.3251,0.3392,4.2819
3,CatBoost Classifier,0.8649,0.8321,0.3068,0.5237,0.3849,0.3152,0.3304,18.4005
4,Gradient Boosting Classifier,0.8546,0.8258,0.3867,0.4711,0.4219,0.3403,0.3438,7.7028
5,Random Forest Classifier,0.8485,0.7698,0.3016,0.4416,0.3567,0.2743,0.2817,0.1156
6,MLP Classifier,0.8314,0.7508,0.369,0.3888,0.3748,0.2784,0.2803,3.9559
7,SVM - Radial Kernel,0.8285,0.7811,0.3865,0.3848,0.3846,0.2853,0.2858,8.1245
8,Ada Boost Classifier,0.8192,0.7826,0.4459,0.3752,0.4058,0.3008,0.3031,1.649
9,Decision Tree Classifier,0.7746,0.6131,0.3897,0.2773,0.3228,0.1925,0.1972,0.3979


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)

## create, tune, evaluate, predict gradient boosting classifier

In [4]:
# Train a gradient boosting classifier (PyCaret id "gbc") inside the current
# experiment; the output table lists one row of metrics per cross-validation fold.
gbc = create_model("gbc")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8929,0.8942,0.5526,0.6176,0.5833,0.5221,0.5231
1,0.8286,0.7919,0.3077,0.3636,0.3333,0.2358,0.2368
2,0.8643,0.8104,0.4103,0.5161,0.4571,0.3807,0.384
3,0.8571,0.8507,0.3846,0.4839,0.4286,0.3482,0.3512
4,0.8393,0.8135,0.4872,0.4318,0.4578,0.3639,0.3648
5,0.8464,0.8493,0.3333,0.4333,0.3768,0.2909,0.2942
6,0.8429,0.792,0.3846,0.4286,0.4054,0.3152,0.3158
7,0.8786,0.8363,0.4615,0.5806,0.5143,0.4459,0.4498
8,0.8357,0.7836,0.2821,0.3793,0.3235,0.2323,0.2356
9,0.8602,0.8365,0.2632,0.4762,0.339,0.268,0.2828


In [9]:
# Search gbc's hyperparameters, optimizing F1. With choose_better=True, PyCaret
# (per its docs) hands back the untuned estimator when tuning does not improve
# the optimized metric, so `tuned_gbc` is not guaranteed to differ from `gbc`.
tuned_gbc = tune_model(
    gbc,
    optimize="F1",
    choose_better=True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8464,0.8593,0.4474,0.4359,0.4416,0.3525,0.3526
1,0.8214,0.7852,0.4103,0.3721,0.3902,0.2859,0.2864
2,0.8107,0.8208,0.359,0.3333,0.3457,0.2352,0.2354
3,0.8679,0.8572,0.4615,0.5294,0.4932,0.4176,0.4189
4,0.825,0.8289,0.4872,0.3958,0.4368,0.3345,0.337
5,0.825,0.8292,0.3846,0.375,0.3797,0.2779,0.2779
6,0.8321,0.7971,0.4103,0.4,0.4051,0.3074,0.3074
7,0.8321,0.7778,0.3846,0.3947,0.3896,0.2923,0.2924
8,0.8571,0.7919,0.3846,0.4839,0.4286,0.3482,0.3512
9,0.8566,0.8679,0.4211,0.4706,0.4444,0.3624,0.3632


In [10]:
# Open PyCaret's interactive plot selector for the tuned model; this renders an
# ipywidgets control rather than a static figure.
evaluate_model(tuned_gbc)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [11]:
# Called without `data`, so (per PyCaret docs) the model is scored on the
# hold-out split created by setup(). The returned frame carries the transformed
# features plus the predicted Label and its Score.
predict_model(tuned_gbc)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8485,0.8154,0.3675,0.442,0.4013,0.3154,0.3172


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label,Score
0,0.477499,-1.215611,0.547075,0.925603,0.944387,-1.152209,0.672376,-0.511354,0.753785,-0.510431,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.0605
1,-0.556735,-0.031642,-0.550466,-0.694846,-0.656055,1.181913,0.672376,0.963869,1.427515,1.920192,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.3409
2,1.769352,0.035305,-0.619652,-0.633653,-0.724077,0.669598,0.672376,0.434389,-1.360105,0.149794,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0,0.3330
3,-1.189793,-0.031642,0.179759,0.178626,0.175316,1.341102,0.672376,0.963869,0.616506,0.103027,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0,0.0219
4,-0.057864,-0.031642,-2.213802,-2.314832,-2.271104,0.288248,-1.528718,-0.862434,-1.360105,-1.641198,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.1116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.625997,-0.725754,-0.686850,0.483924,-0.551305,-0.364897,0.055875,-0.295979,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0,0.4648
1197,-0.505915,-0.031642,-0.361671,-0.478512,-0.440575,1.181913,0.672376,0.963869,-0.827196,0.704247,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.0260
1198,0.406102,-0.031642,-0.677094,-0.819713,-0.780473,1.181913,0.672376,0.963869,2.408744,1.190597,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,0.2266
1199,-2.067770,-0.031642,0.575774,0.439906,0.472466,-0.142452,-0.551305,-1.206565,1.025611,0.501095,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0,0.1668


## explore blend models, both hard and soft voting

In [12]:
# Build a voting ensemble with default arguments (hard voting, per the section
# header). AUC is reported as 0 in the output below, presumably because hard
# voting yields no class probabilities -- confirm against the PyCaret docs.
blend_hard = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8679,0.0,0.6842,0.5098,0.5843,0.5077,0.5155
1,0.8179,0.0,0.5128,0.3846,0.4396,0.3335,0.3384
2,0.8107,0.0,0.4103,0.3478,0.3765,0.2658,0.267
3,0.85,0.0,0.5641,0.4681,0.5116,0.4239,0.4265
4,0.8071,0.0,0.5641,0.3729,0.449,0.3379,0.3486
5,0.8429,0.0,0.4359,0.4359,0.4359,0.3446,0.3446
6,0.8429,0.0,0.5128,0.4444,0.4762,0.3843,0.3857
7,0.8571,0.0,0.5385,0.4884,0.5122,0.4287,0.4294
8,0.8393,0.0,0.4872,0.4318,0.4578,0.3639,0.3648
9,0.8566,0.0,0.5,0.475,0.4872,0.4039,0.4041


In [13]:
# Open the interactive plot selector for the hard-voting ensemble.
evaluate_model(blend_hard)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [14]:
# Score the hard-voting ensemble with no explicit `data` (hold-out split, per
# PyCaret docs). The result has a Label column but no Score column.
predict_model(blend_hard)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8393,0,0.4337,0.4211,0.4273,0.3339,0.3339


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label
0,0.477499,-1.215611,0.547075,0.925603,0.944387,-1.152209,0.672376,-0.511354,0.753785,-0.510431,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0
1,-0.556735,-0.031642,-0.550466,-0.694846,-0.656055,1.181913,0.672376,0.963869,1.427515,1.920192,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0
2,1.769352,0.035305,-0.619652,-0.633653,-0.724077,0.669598,0.672376,0.434389,-1.360105,0.149794,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0
3,-1.189793,-0.031642,0.179759,0.178626,0.175316,1.341102,0.672376,0.963869,0.616506,0.103027,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0
4,-0.057864,-0.031642,-2.213802,-2.314832,-2.271104,0.288248,-1.528718,-0.862434,-1.360105,-1.641198,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.625997,-0.725754,-0.686850,0.483924,-0.551305,-0.364897,0.055875,-0.295979,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,1
1197,-0.505915,-0.031642,-0.361671,-0.478512,-0.440575,1.181913,0.672376,0.963869,-0.827196,0.704247,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0
1198,0.406102,-0.031642,-0.677094,-0.819713,-0.780473,1.181913,0.672376,0.963869,2.408744,1.190597,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0
1199,-2.067770,-0.031642,0.575774,0.439906,0.472466,-0.142452,-0.551305,-1.206565,1.025611,0.501095,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0


In [15]:
# Same voting ensemble as before, but with soft voting, so the ensemble
# produces class scores and AUC is populated in the output table.
blend_soft = blend_models(method="soft")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8536,0.8781,0.7105,0.4737,0.5684,0.4845,0.4989
1,0.8071,0.7884,0.5128,0.3636,0.4255,0.3137,0.3204
2,0.8143,0.7917,0.359,0.3415,0.35,0.2417,0.2418
3,0.85,0.8429,0.6154,0.4706,0.5333,0.4459,0.4516
4,0.825,0.8021,0.5641,0.4074,0.4731,0.3714,0.3785
5,0.825,0.8369,0.3333,0.3611,0.3467,0.2458,0.2461
6,0.85,0.8197,0.5128,0.4651,0.4878,0.4002,0.4008
7,0.8429,0.8194,0.5385,0.4468,0.4884,0.3965,0.3989
8,0.8393,0.8065,0.4615,0.4286,0.4444,0.3506,0.351
9,0.8423,0.8407,0.4211,0.4211,0.4211,0.3298,0.3298


In [16]:
# Open the interactive plot selector for the soft-voting ensemble.
evaluate_model(blend_soft)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [17]:
# Score the soft-voting ensemble with no explicit `data` (hold-out split, per
# PyCaret docs). The result includes both the Label and Score columns.
predict_model(blend_soft)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8285,0.818,0.4157,0.3876,0.4012,0.3012,0.3015


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label,Score
0,0.477499,-1.215611,0.547075,0.925603,0.944387,-1.152209,0.672376,-0.511354,0.753785,-0.510431,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.1476
1,-0.556735,-0.031642,-0.550466,-0.694846,-0.656055,1.181913,0.672376,0.963869,1.427515,1.920192,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.4248
2,1.769352,0.035305,-0.619652,-0.633653,-0.724077,0.669598,0.672376,0.434389,-1.360105,0.149794,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0,0.4440
3,-1.189793,-0.031642,0.179759,0.178626,0.175316,1.341102,0.672376,0.963869,0.616506,0.103027,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0,0.0550
4,-0.057864,-0.031642,-2.213802,-2.314832,-2.271104,0.288248,-1.528718,-0.862434,-1.360105,-1.641198,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.4130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.625997,-0.725754,-0.686850,0.483924,-0.551305,-0.364897,0.055875,-0.295979,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,1,0.5185
1197,-0.505915,-0.031642,-0.361671,-0.478512,-0.440575,1.181913,0.672376,0.963869,-0.827196,0.704247,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.1041
1198,0.406102,-0.031642,-0.677094,-0.819713,-0.780473,1.181913,0.672376,0.963869,2.408744,1.190597,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,0.2854
1199,-2.067770,-0.031642,0.575774,0.439906,0.472466,-0.142452,-0.551305,-1.206565,1.025611,0.501095,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0,0.2974


## conclusion

Similar to the previous multicollinearity-removal experiment, adding normalization / transformation on top of the imbalance-fixed model increased min(Recall, Precision) to about 0.44 (train) or 0.42 (test), using either a single model (e.g. the gradient boosting classifier) or blended models.