# Model2_fix_imbalance

The purpose of this notebook / model experiment is to improve upon the baseline model obtained in model1_base.ipynb by correcting the class imbalance in the label.

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
# Read the aggregated CSV and index the rows by PATIENT_ID in a single chained expression.
df_ts_agg = pd.read_csv("../proData/df_ts_agg.csv").set_index("PATIENT_ID")

# Bare last expression so the notebook renders the (truncated) frame.
df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

With `fix_imbalance` set to `True`, model performance is expected to improve.

In [3]:
# Initialise the PyCaret classification experiment on df_ts_agg with 'ihd' as the target.
# session_id pins the random seed; GCS_min / GCS_max are explicitly declared numeric;
# fix_imbalance=True is the one change this experiment is testing.
exp_physionet = setup(
    data=df_ts_agg,
    target='ihd',
    session_id=123,
    numeric_features=["GCS_min", "GCS_max"],
    fix_imbalance=True,
    # Preprocessing options deliberately left disabled in this experiment:
    # normalize=True,
    # transformation=True,
    # ignore_low_variance=True,
    # remove_multicollinearity=True, multicollinearity_threshold=0.9,
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,110
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

min(Recall, Precision) is used as the comparison metric, as suggested by the original PhysioNet 2012 challenge.

In [4]:
# Train and compare the candidate classifiers in one call.
# turbo=False presumably also includes the slower estimators (the RBF SVM and Gaussian
# Process rows appear in the grid below) — confirm against the PyCaret docs.
# NOTE(review): no `sort` argument is passed and the grid below is ordered by Accuracy,
# not by min(Recall, Precision); read the Recall and Prec. columns directly.
compare_models(turbo = False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.8678,0.8332,0.312,0.5387,0.3938,0.326,0.3418,25.1453
1,Light Gradient Boosting Machine,0.8675,0.8343,0.2914,0.5438,0.377,0.3108,0.3308,2.6607
2,Extra Trees Classifier,0.866,0.8292,0.2219,0.5362,0.3087,0.2503,0.2817,0.2789
3,Extreme Gradient Boosting,0.865,0.8259,0.3016,0.5344,0.3835,0.3139,0.3314,4.5616
4,SVM - Radial Kernel,0.8614,0.5025,0.0,0.0,0.0,0.0,0.0,16.8055
5,Gaussian Process Classifier,0.8614,0.5,0.0,0.0,0.0,0.0,0.0,16.4107
6,Gradient Boosting Classifier,0.8578,0.8273,0.361,0.4809,0.409,0.331,0.3369,8.0791
7,Random Forest Classifier,0.8467,0.7762,0.2834,0.4275,0.3379,0.2557,0.264,0.1603
8,Ada Boost Classifier,0.8324,0.7674,0.3862,0.3953,0.3878,0.2914,0.2929,1.814
9,Naive Bayes,0.7928,0.7375,0.4459,0.324,0.3735,0.2541,0.2594,0.0099


<catboost.core.CatBoostClassifier at 0x2b5e31e23da0>

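With the imbalanced label fixed, the base AdaBoost classifier achieves min(recall, precision) = 0.39 in the comparison above (recall 0.3862, precision 0.3953), the highest among the models listed, so it is selected for further tuning.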

## create ada boost model

In [4]:
# Fit an AdaBoost classifier ("ada"); no hyperparameters are passed, so PyCaret's
# defaults apply. The rows in the grid below are presumably one per cross-validation fold.
ada = create_model("ada")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8321,0.7951,0.2895,0.3548,0.3188,0.2242,0.2258
1,0.8214,0.7223,0.4359,0.3778,0.4048,0.3003,0.3014
2,0.8286,0.7684,0.4615,0.4,0.4286,0.3283,0.3295
3,0.8321,0.7899,0.3846,0.3947,0.3896,0.2923,0.2924
4,0.8179,0.7869,0.5128,0.3846,0.4396,0.3335,0.3384
5,0.8429,0.7747,0.359,0.4242,0.3889,0.2994,0.3008
6,0.8393,0.7414,0.3846,0.4167,0.4,0.3074,0.3077
7,0.8464,0.7693,0.3333,0.4333,0.3768,0.2909,0.2942
8,0.8429,0.7495,0.359,0.4242,0.3889,0.2994,0.3008
9,0.8208,0.7769,0.3421,0.3421,0.3421,0.2384,0.2384


## tune ada boost model

In [19]:
# Tune the AdaBoost model's hyperparameters, optimising for Recall.
# choose_better=True presumably falls back to the untuned model when tuning does not
# improve the optimised metric — confirm against the PyCaret docs.
tuned_ada = tune_model(ada, optimize = "Recall", choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.85,0.8126,0.3947,0.4412,0.4167,0.3309,0.3316
1,0.8214,0.738,0.4103,0.3721,0.3902,0.2859,0.2864
2,0.8179,0.7585,0.4359,0.3696,0.4,0.2935,0.2949
3,0.8536,0.8037,0.4359,0.4722,0.4533,0.369,0.3693
4,0.8286,0.8014,0.4359,0.3953,0.4146,0.3145,0.315
5,0.8393,0.775,0.3846,0.4167,0.4,0.3074,0.3077
6,0.825,0.7272,0.3846,0.375,0.3797,0.2779,0.2779
7,0.8429,0.7618,0.4359,0.4359,0.4359,0.3446,0.3446
8,0.8393,0.7772,0.3333,0.4062,0.3662,0.2752,0.277
9,0.8208,0.7984,0.4211,0.3636,0.3902,0.2859,0.2869


The tuned AdaBoost model offers a slightly higher min(recall, precision) = 0.40.

## evaluate tuned adaboost model

In [10]:
# Open PyCaret's interactive plot selector for the tuned AdaBoost model
# (renders as an ipywidget with a "Plot Type" toggle, see the output below).
evaluate_model(tuned_ada)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## predict label on unseen data, using tuned adaboost model

In [11]:
# Score the tuned AdaBoost model. No data argument is passed, so this presumably uses
# the hold-out split created by setup() — confirm against the PyCaret docs.
# The returned frame appends Label (prediction) and Score columns next to the true 'ihd'.
predict_model(tuned_ada)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.8301,0.7933,0.3916,0.3869,0.3892,0.2906,0.2906


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label,Score
0,74.0,160.000000,89.300000,103.00000,102.600000,3.0,15.0,11.5,79.0,100.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.4791
1,57.0,169.677895,68.000000,68.00000,68.000000,14.0,15.0,15.0,89.0,161.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.4962
2,90.0,170.200000,66.900000,69.00000,66.900000,11.0,15.0,14.0,50.0,113.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,1,0.5065
3,43.0,169.677895,81.083572,84.04358,83.243402,15.0,15.0,15.0,77.0,112.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0,0.4734
4,66.0,169.677895,45.000000,45.00000,45.000000,9.0,11.0,10.0,50.0,81.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.4969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,66.800000,67.50000,67.500000,10.0,14.0,12.0,69.0,104.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,1,0.5051
1197,58.0,169.677895,71.100000,71.60000,71.600000,14.0,15.0,15.0,57.0,126.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.4714
1198,73.0,169.677895,66.000000,66.00000,66.000000,14.0,15.0,15.0,104.0,139.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,0.4885
1199,19.0,169.677895,90.000000,90.00000,90.000000,7.0,14.0,8.0,83.0,121.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0,0.4918


On unseen data, the tuned AdaBoost model achieves min(recall, precision) = 0.39.

## explore blend models, both hard and soft voting

In [12]:
# Build a voting ensemble with blend_models' default arguments (hard voting, going by
# the section heading and variable name); no estimator list is passed, so PyCaret's
# default selection of models is blended.
# NOTE(review): AUC is reported as 0.0 in the grid below, presumably because hard voting
# produces no class probabilities — confirm.
blend_hard = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8679,0.0,0.5263,0.5128,0.5195,0.4429,0.4429
1,0.8357,0.0,0.4103,0.4103,0.4103,0.3148,0.3148
2,0.8286,0.0,0.359,0.3784,0.3684,0.2693,0.2695
3,0.8857,0.0,0.4615,0.6207,0.5294,0.466,0.4726
4,0.8643,0.0,0.359,0.5185,0.4242,0.3502,0.3578
5,0.8536,0.0,0.2821,0.4583,0.3492,0.2719,0.2821
6,0.8464,0.0,0.4615,0.45,0.4557,0.3663,0.3664
7,0.8714,0.0,0.4103,0.5517,0.4706,0.3992,0.4049
8,0.8429,0.0,0.3333,0.4194,0.3714,0.283,0.2854
9,0.871,0.0,0.3684,0.5385,0.4375,0.3675,0.376


In [13]:
# Open PyCaret's interactive plot selector for the hard-voting blend (ipywidget output).
evaluate_model(blend_hard)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [14]:
# Score the hard-voting blend; with no data argument this presumably uses the hold-out
# split from setup() — confirm. The output has a Label column but no Score column.
predict_model(blend_hard)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8518,0,0.3434,0.4524,0.3904,0.3078,0.3116


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label
0,74.0,160.000000,89.300000,103.00000,102.600000,3.0,15.0,11.5,79.0,100.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0
1,57.0,169.677895,68.000000,68.00000,68.000000,14.0,15.0,15.0,89.0,161.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0
2,90.0,170.200000,66.900000,69.00000,66.900000,11.0,15.0,14.0,50.0,113.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0
3,43.0,169.677895,81.083572,84.04358,83.243402,15.0,15.0,15.0,77.0,112.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0
4,66.0,169.677895,45.000000,45.00000,45.000000,9.0,11.0,10.0,50.0,81.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,66.800000,67.50000,67.500000,10.0,14.0,12.0,69.0,104.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0
1197,58.0,169.677895,71.100000,71.60000,71.600000,14.0,15.0,15.0,57.0,126.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0
1198,73.0,169.677895,66.000000,66.00000,66.000000,14.0,15.0,15.0,104.0,139.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0
1199,19.0,169.677895,90.000000,90.00000,90.000000,7.0,14.0,8.0,83.0,121.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0


In [15]:
# Build a voting ensemble using soft voting; unlike the hard-voting run, the grid below
# reports non-zero AUC values. No estimator list is passed, so PyCaret's default
# selection of models is blended.
blend_soft = blend_models(method = "soft")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8607,0.8598,0.5526,0.4884,0.5185,0.4375,0.4386
1,0.85,0.8063,0.359,0.4516,0.4,0.3156,0.3183
2,0.8357,0.8083,0.3077,0.3871,0.3429,0.2504,0.2525
3,0.8643,0.856,0.4615,0.5143,0.4865,0.4086,0.4094
4,0.8607,0.8007,0.4359,0.5,0.4658,0.3861,0.3873
5,0.8429,0.8399,0.3077,0.4138,0.3529,0.2657,0.2695
6,0.85,0.8043,0.4615,0.4615,0.4615,0.3744,0.3744
7,0.85,0.8081,0.359,0.4516,0.4,0.3156,0.3183
8,0.8536,0.8317,0.3846,0.4688,0.4225,0.3396,0.3418
9,0.871,0.8293,0.3421,0.5417,0.4194,0.3509,0.3627


In [16]:
# Open PyCaret's interactive plot selector for the soft-voting blend (ipywidget output).
evaluate_model(blend_soft)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [17]:
# Score the soft-voting blend; with no data argument this presumably uses the hold-out
# split from setup() — confirm. The output appends both Label and Score columns.
predict_model(blend_soft)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8518,0.8153,0.4096,0.4595,0.4331,0.3482,0.349


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label,Score
0,74.0,160.000000,89.300000,103.00000,102.600000,3.0,15.0,11.5,79.0,100.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.0703
1,57.0,169.677895,68.000000,68.00000,68.000000,14.0,15.0,15.0,89.0,161.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.3986
2,90.0,170.200000,66.900000,69.00000,66.900000,11.0,15.0,14.0,50.0,113.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,1,0.5052
3,43.0,169.677895,81.083572,84.04358,83.243402,15.0,15.0,15.0,77.0,112.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0,0.0650
4,66.0,169.677895,45.000000,45.00000,45.000000,9.0,11.0,10.0,50.0,81.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.2535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,66.800000,67.50000,67.500000,10.0,14.0,12.0,69.0,104.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0,0.4747
1197,58.0,169.677895,71.100000,71.60000,71.600000,14.0,15.0,15.0,57.0,126.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.1072
1198,73.0,169.677895,66.000000,66.00000,66.000000,14.0,15.0,15.0,104.0,139.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,0.2291
1199,19.0,169.677895,90.000000,90.00000,90.000000,7.0,14.0,8.0,83.0,121.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0,0.2009


## conclusion

By fixing the imbalanced label, either a single model (e.g. AdaBoost) or a blend of models increases the train or test min(recall, precision) to 0.40 or 0.41.