### Import libraries

In [50]:
from IPython.core.display import display
from src.utils.preprocessing import preprocessing_pipeline
from src.utils.get_data import import_data, split_experts
from src.utils.train import hyperparameter_tuning_cv
from src.utils.config import *
import pandas as pd
import numpy as np

In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [52]:
# Data directory, relative to the notebook's working directory.
DATA_PATH = '../../../data'

# Pooled data set: coarse segmentation, with the user-feature and expert
# columns dropped by the loader.
X_coarse, y_coarse = import_data(
    DATA_PATH,
    segmentation_type='coarse',
    drop_user_features=True,
    drop_expert=True,
)

In [53]:
# Same load as the pooled data, but keeping the expert column so the rows
# can later be modelled for each expert separately.
X_e, y_e = import_data(
    DATA_PATH,
    segmentation_type='coarse',
    drop_user_features=True,
    drop_expert=False,
)

In [54]:
# Preview the first rows of the pooled features and labels.
display(X_coarse.head(), y_coarse.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,EEPD50_100,EEPD100_150,EEPD150_200,EEPD200_250,EEPD250_300,EEPD300_350,EEPD350_400,EEPD400_450,EEPD450_500,EEPD500_550,...,MFCC_std11,MFCC_std12,Crest_Factor,Cough_Length,PSD_225-425,PSD_450-550,PSD_1325-1600,PSD_1600-2000,PSD_2500-2900,PSD_3100-3700
subject,file_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
008ba489-31ad-44d8-856b-fcf72369dc46,0,8.0,8.0,7.0,6.0,7.0,9.0,7.0,7.0,7.0,8.0,...,14.163379,10.952018,8.36548,1.036563,0.314239,0.027049,0.015518,0.030041,0.04709,0.009558
008ba489-31ad-44d8-856b-fcf72369dc46,1,2.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,...,11.684959,9.106066,9.348207,0.378625,0.09355,0.10778,0.013997,0.008304,0.045378,0.007341
008c1c9e-aeef-40c5-846c-24f1b964f884,0,4.0,4.0,4.0,4.0,6.0,6.0,5.0,5.0,5.0,5.0,...,13.539921,8.786539,5.222938,0.765625,0.160032,0.034572,0.017697,0.029626,0.027728,0.047036
008c1c9e-aeef-40c5-846c-24f1b964f884,1,3.0,5.0,5.0,6.0,4.0,5.0,6.0,5.0,5.0,4.0,...,12.846291,7.0351,4.402501,0.794125,0.050611,0.050448,0.036759,0.036765,0.032501,0.050735
00bf9f83-2e8f-47cf-a4f2-97f2beceebc1,0,6.0,6.0,6.0,7.0,6.0,7.0,6.0,6.0,8.0,6.0,...,14.501091,9.551842,4.234178,0.86025,0.264409,0.05078,0.020884,0.008068,0.002585,0.004144


Unnamed: 0_level_0,Unnamed: 1_level_0,Label
subject,file_id,Unnamed: 2_level_1
008ba489-31ad-44d8-856b-fcf72369dc46,0,1.0
008ba489-31ad-44d8-856b-fcf72369dc46,1,1.0
008c1c9e-aeef-40c5-846c-24f1b964f884,0,1.0
008c1c9e-aeef-40c5-846c-24f1b964f884,1,1.0
00bf9f83-2e8f-47cf-a4f2-97f2beceebc1,0,1.0


### Preprocessing

In [55]:
# NOTE: rebinds X_coarse, so re-running this cell feeds already-preprocessed
# data back into the pipeline; re-run the load cell first.
X_coarse = preprocessing_pipeline(
    X_tr=X_coarse,
    stop=None,
    dummy=False,
)

In [56]:
# Set the raw `Expert` column aside, preprocess the features, then write the
# column back so the rows can still be split by expert afterwards.
# NOTE(review): the column is re-attached by position (`.values`), which
# assumes preprocessing_pipeline keeps the number and order of rows — confirm.
# NOTE: X_e is rebound here, so this cell is not safe to re-run without
# re-running the load cell first.
expert = X_e['Expert'].copy()
X_e = preprocessing_pipeline(X_e, stop=None, dummy=False)
X_e['Expert'] = expert.values

### Split expert models

In [57]:
# One (features, labels) pair per expert, unpacked in expert order 1, 2, 3.
(
    X_e_1, y_e_1,
    X_e_2, y_e_2,
    X_e_3, y_e_3,
) = split_experts(X_e, y_e)

### Grid search

In [58]:
# Column of the tuning-result tables used to pick the best parameter combination.
DECISION_METRIC: str = 'roc_auc_score'
# Passed as `cv_k` to every hyperparameter_tuning_cv call
# (presumably the number of cross-validation folds — confirm in src.utils.train).
CV_K: int = 10

#### 1. Logistic regression

In [59]:
# Tune logistic regression on the pooled data; the displayed table has one
# row per parameter combination.
log_results = hyperparameter_tuning_cv(
    model='logistic',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=LOGISTIC_PARAMS,
)

display(log_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.448957,0.603175,0.000163,0.606589,-0.843521
10000,False,0.170428,0.53122,5.9e-05,0.728488,-0.154657
100000,True,0.448957,0.603175,0.000163,0.606589,-0.843521
100000,False,0.170428,0.53122,5.9e-05,0.728488,-0.154657


In [60]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame.
# The metric column is renumbered 0..n-1 so that idxmax() yields a row
# position that iloc can use.
best_log = log_results.iloc[
    [log_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_log)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.448957,0.603175,0.000163,0.606589,-0.843521


In [61]:
# Repeat the logistic-regression search on each expert's own subset.
log_results_1, log_results_2, log_results_3 = (
    hyperparameter_tuning_cv(model='logistic', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=LOGISTIC_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [62]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_log_1, best_log_2, best_log_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (log_results_1, log_results_2, log_results_3)
)

display(best_log_1, best_log_2, best_log_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.369138,0.656366,0.001356,0.721173,-1.055053


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.301329,0.554153,0.001046,0.601596,-1.404462


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,False,0.608526,0.630547,0.000693,0.631143,-0.473886


#### 2. Linear Discriminant Analysis

In [63]:
# Tune linear discriminant analysis on the pooled data.
lda_results = hyperparameter_tuning_cv(
    model='lda',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=LDA_PARAMS,
)

display(lda_results)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.450221,0.603876,0.000158,0.605329,-0.842842
False,0.167864,0.530537,5.8e-05,0.728295,-0.153934


In [64]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame.
# Selection is positional (iloc): the index labels here are booleans, which
# a label-based lookup with a list would treat as a mask.
best_lda = lda_results.iloc[
    [lda_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_lda)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.450221,0.603876,0.000158,0.605329,-0.842842


In [65]:
# Repeat the LDA search on each expert's own subset.
lda_results_1, lda_results_2, lda_results_3 = (
    hyperparameter_tuning_cv(model='lda', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=LDA_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [66]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_lda_1, best_lda_2, best_lda_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (lda_results_1, lda_results_2, lda_results_3)
)

display(best_lda_1, best_lda_2, best_lda_3)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.380967,0.672532,0.001921,0.707166,-1.083197


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.298166,0.549586,0.001523,0.589096,-1.445085


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.622405,0.628547,0.000326,0.628,-0.486653


#### 3. K-nearest Neighbors

In [67]:
# Tune k-nearest neighbours on the pooled data.
knn_results = hyperparameter_tuning_cv(
    model='knn',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=KNN_PARAMS,
)

display(knn_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.45014,0.602536,0.000179,0.596899,-0.854378
1,False,0.412175,0.597416,0.000141,0.680329,-0.624469
2,True,0.432639,0.600448,0.000343,0.643895,-0.768486
2,False,0.221385,0.541712,9.3e-05,0.724322,-0.229833
3,True,0.450193,0.589365,0.000127,0.532267,-0.880041
3,False,0.349757,0.573669,0.000126,0.699031,-0.501081
4,True,0.442436,0.592503,0.000206,0.578876,-0.901119
4,False,0.200617,0.536073,0.000159,0.723256,-0.219708
5,True,0.444783,0.576675,0.000128,0.496512,-0.868263
5,False,0.286918,0.551759,0.000271,0.703779,-0.429832


In [68]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame;
# the metric column is renumbered 0..n-1 so idxmax() is a position for iloc.
best_knn = knn_results.iloc[
    [knn_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_knn)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.45014,0.602536,0.000179,0.596899,-0.854378


In [69]:
# Repeat the k-nearest-neighbours search on each expert's own subset.
knn_results_1, knn_results_2, knn_results_3 = (
    hyperparameter_tuning_cv(model='knn', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=KNN_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [70]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_knn_1, best_knn_2, best_knn_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (knn_results_1, knn_results_2, knn_results_3)
)

display(best_knn_1, best_knn_2, best_knn_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.363404,0.659157,0.001144,0.680456,-1.20615


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,True,0.310349,0.56934,0.000527,0.658511,-1.197215


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,False,0.617849,0.643908,0.000272,0.644571,-0.416249


#### 4. Support Vector Classifier

In [71]:
# Tune the support vector classifier on the pooled data.
svc_results = hyperparameter_tuning_cv(
    model='svc',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=SVC_PARAMS,
)

display(svc_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.453609,0.604691,0.000203,0.59593,-0.848945
linear,0.1,False,0.0,0.5,0.0,0.731008,0.0
rbf,0.1,True,0.153029,0.531673,3.1e-05,0.736143,-0.080774
rbf,0.1,False,0.017757,0.503277,1e-05,0.731589,-0.008534
linear,0.01,True,0.453609,0.604691,0.000203,0.59593,-0.848945
linear,0.01,False,0.0,0.5,0.0,0.731008,0.0
rbf,0.01,True,0.441761,0.603406,0.000194,0.631395,-0.797979
rbf,0.01,False,0.002215,0.500356,1e-06,0.731008,-0.001545


In [72]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame;
# the metric column is renumbered 0..n-1 so idxmax() is a position for iloc.
best_svc = svc_results.iloc[
    [svc_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_svc)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.453609,0.604691,0.000203,0.59593,-0.848945


In [73]:
# Repeat the support-vector search on each expert's own subset.
svc_results_1, svc_results_2, svc_results_3 = (
    hyperparameter_tuning_cv(model='svc', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=SVC_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [74]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_svc_1, best_svc_2, best_svc_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (svc_results_1, svc_results_2, svc_results_3)
)

display(best_svc_1, best_svc_2, best_svc_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.365848,0.654488,0.001308,0.714332,-1.085648


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.310932,0.561231,0.000906,0.582713,-1.411778


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rbf,0.1,True,0.568022,0.651697,0.000602,0.655429,-0.254891


#### 5. Naive Bayes Classifier

In [75]:
# Tune the naive Bayes classifier on the pooled data.
nb_results = hyperparameter_tuning_cv(
    model='naive_bayes',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=NAIVE_BAYES_PARAMS,
)

display(nb_results)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.099235,0.513705,0.000656,0.68469,-0.223863
False,0.08854,0.512813,0.000655,0.713372,-0.183421


In [76]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame.
# Selection is positional (iloc): the index labels here are booleans, which
# a label-based lookup with a list would treat as a mask.
best_nb = nb_results.iloc[
    [nb_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_nb)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.099235,0.513705,0.000656,0.68469,-0.223863


In [77]:
# Repeat the naive Bayes search on each expert's own subset.
nb_results_1, nb_results_2, nb_results_3 = (
    hyperparameter_tuning_cv(model='naive_bayes', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=NAIVE_BAYES_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [78]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_nb_1, best_nb_2, best_nb_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (nb_results_1, nb_results_2, nb_results_3)
)

display(best_nb_1, best_nb_2, best_nb_3)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.307222,0.598367,0.001093,0.689251,-1.281494


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.294965,0.529018,0.0008,0.471809,-1.550282


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.185035,0.521429,0.000791,0.541143,-0.233189


#### 6. Decision Tree

In [79]:
# Tune the decision tree on the pooled data.
dt_results = hyperparameter_tuning_cv(
    model='decision_tree',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=DECISION_TREE_PARAMS,
)

display(dt_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,True,0.404446,0.549765,0.000187,0.52064,-1.0206
3,False,0.050187,0.504344,0.000123,0.72374,-0.095173
5,True,0.414617,0.556365,0.00035,0.520252,-1.011049
5,False,0.183122,0.522103,0.000378,0.703295,-0.350121
7,True,0.401059,0.554158,0.000214,0.552422,-1.046656
7,False,0.245407,0.53027,0.000374,0.685174,-0.521899


In [80]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame;
# the metric column is renumbered 0..n-1 so idxmax() is a position for iloc.
best_dt = dt_results.iloc[
    [dt_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_dt)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,True,0.414617,0.556365,0.00035,0.520252,-1.011049


In [81]:
# Repeat the decision-tree search on each expert's own subset.
dt_results_1, dt_results_2, dt_results_3 = (
    hyperparameter_tuning_cv(model='decision_tree', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=DECISION_TREE_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [82]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_dt_1, best_dt_2, best_dt_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (dt_results_1, dt_results_2, dt_results_3)
)

display(best_dt_1, best_dt_2, best_dt_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,True,0.346085,0.636091,0.001636,0.704235,-1.164205


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,True,0.266734,0.51595,0.001397,0.553191,-1.591528


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,True,0.55514,0.60377,0.000614,0.609429,-0.524916


#### 7. Random Forest

In [83]:
# Tune the random forest on the pooled data.
rf_results = hyperparameter_tuning_cv(
    model='random_forest',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=RANDOM_FOREST_PARAMS,
)

display(rf_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,True,0.401601,0.555579,0.000544,0.554748,-1.024678
3,3,False,0.030998,0.504816,1.6e-05,0.73062,-0.026458
3,5,True,0.418825,0.575176,9.6e-05,0.58188,-0.957104
3,5,False,0.01674,0.502386,2.1e-05,0.730523,-0.01645
3,7,True,0.408777,0.570216,0.000457,0.587984,-0.956403
3,7,False,0.007183,0.500631,3e-06,0.730233,-0.011231
5,3,True,0.396608,0.558124,0.000315,0.576357,-1.012512
5,3,False,0.102598,0.5126,7.6e-05,0.721899,-0.149415
5,5,True,0.409355,0.569925,0.000217,0.586919,-0.964463
5,5,False,0.072439,0.509329,5.3e-05,0.725775,-0.095444


In [84]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame;
# the metric column is renumbered 0..n-1 so idxmax() is a position for iloc.
best_rf = rf_results.iloc[
    [rf_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_rf)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,5,True,0.418825,0.575176,9.6e-05,0.58188,-0.957104


In [85]:
# Repeat the random-forest search on each expert's own subset.
rf_results_1, rf_results_2, rf_results_3 = (
    hyperparameter_tuning_cv(model='random_forest', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=RANDOM_FOREST_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [86]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_rf_1, best_rf_2, best_rf_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (rf_results_1, rf_results_2, rf_results_3)
)

display(best_rf_1, best_rf_2, best_rf_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,7,True,0.356501,0.640505,0.000773,0.734528,-1.015553


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,5,True,0.283418,0.533518,0.001045,0.557979,-1.525398


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,7,True,0.599263,0.62764,0.000418,0.629429,-0.473155


#### 8. Gradient Boosting

In [87]:
# Tune gradient boosting on the pooled data.
gb_results = hyperparameter_tuning_cv(
    model='gradient_boosting',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=GRADIENT_BOOSTING_PARAMS,
)

display(gb_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,True,0.412617,0.565204,0.0002414382,0.560174,-0.999682
3,3,False,0.0,0.5,0.0,0.731008,0.0
3,5,True,0.418072,0.570939,0.0001649323,0.56812,-0.981431
3,5,False,0.0,0.499934,3.968191e-08,0.730911,-0.000756
3,7,True,0.421102,0.577398,0.0001206199,0.584205,-0.944169
3,7,False,0.0,0.499802,1.798146e-07,0.730717,-0.002278
5,3,True,0.410343,0.568616,0.0002744055,0.580233,-0.979452
5,3,False,0.0,0.499936,3.736449e-08,0.730911,-0.000777
5,5,True,0.414335,0.576274,0.0002732199,0.598256,-0.931343
5,5,False,0.007154,0.501021,3.155267e-06,0.730814,-0.006866


In [88]:
# Keep the row with the highest DECISION_METRIC as a one-row DataFrame;
# the metric column is renumbered 0..n-1 so idxmax() is a position for iloc.
best_gb = gb_results.iloc[
    [gb_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_gb)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7,7,True,0.409033,0.579215,0.000345,0.621221,-0.870232


In [89]:
# Repeat the gradient-boosting search on each expert's own subset.
gb_results_1, gb_results_2, gb_results_3 = (
    hyperparameter_tuning_cv(model='gradient_boosting', data=expert_X, labels=expert_y,
                             cv_k=CV_K, params=GRADIENT_BOOSTING_PARAMS)
    for expert_X, expert_y in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
)

In [90]:
# For every expert, keep the row with the highest DECISION_METRIC
# (selected by position, returned as a one-row DataFrame).
best_gb_1, best_gb_2, best_gb_3 = (
    cv_table.iloc[[cv_table[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for cv_table in (gb_results_1, gb_results_2, gb_results_3)
)
display(best_gb_1, best_gb_2, best_gb_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,7,True,0.373081,0.652145,0.001677,0.749511,-0.911941


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,7,True,0.290834,0.541158,0.0008,0.577128,-1.487318


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,7,True,0.598296,0.629824,0.000776,0.631143,-0.466773


### Results

In [91]:
# Disabled on purpose: uncomment a line to show the best pooled-data
# configuration of that model.
# display('logistic', best_log)
# display('lda', best_lda)
# display('knn', best_knn)
# display('svc', best_svc)
# display('naive_bayes', best_nb)
# display('decision_tree', best_dt)
# display('random_forest', best_rf)
# display('gradient_boosting', best_gb)

In [92]:
# Share of the rows in X_e that belongs to each expert subset (order: 1, 2, 3).
# Despite its name this array holds the weights, not an averaged value.
weighted_average = np.array([len(X_e_1), len(X_e_2), len(X_e_3)]) / len(X_e)


def _expert_weighted_auc(*per_expert_best):
    """Return the size-weighted sum of the per-expert best ROC AUC values.

    Each argument is the one-row "best configuration" frame of one expert,
    passed in expert order so that it lines up with ``weighted_average``.
    """
    aucs = pd.concat(list(per_expert_best))['roc_auc_score'].values
    return np.sum(aucs * weighted_average)


e_log = _expert_weighted_auc(best_log_1, best_log_2, best_log_3)
e_lda = _expert_weighted_auc(best_lda_1, best_lda_2, best_lda_3)
e_knn = _expert_weighted_auc(best_knn_1, best_knn_2, best_knn_3)
e_svc = _expert_weighted_auc(best_svc_1, best_svc_2, best_svc_3)
e_nb = _expert_weighted_auc(best_nb_1, best_nb_2, best_nb_3)
e_dt = _expert_weighted_auc(best_dt_1, best_dt_2, best_dt_3)
e_rf = _expert_weighted_auc(best_rf_1, best_rf_2, best_rf_3)
e_gb = _expert_weighted_auc(best_gb_1, best_gb_2, best_gb_3)

In [93]:
# Best configuration per expert for one model at a time; uncomment a line to
# inspect that model. Rows are ordered Expert 1, Expert 2, Expert 3.
# display('logistic', pd.concat([best_log_1,best_log_2,best_log_3]))
# display('lda', pd.concat([best_lda_1,best_lda_2,best_lda_3]))
# display('knn', pd.concat([best_knn_1,best_knn_2,best_knn_3]))
display(
    'svc',
    pd.concat([best_svc_1, best_svc_2, best_svc_3], axis=0),
)
# display('naive_bayes', pd.concat([best_nb_1,best_nb_2,best_nb_3]))
# display('decision_tree', pd.concat([best_dt_1,best_dt_2,best_dt_3]))
# display('random_forest', pd.concat([best_rf_1,best_rf_2,best_rf_3]))
# display('gradient_boosting', pd.concat([best_gb_1,best_gb_2,best_gb_3]))

'svc'

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.365848,0.654488,0.001308,0.714332,-1.085648
linear,0.1,True,0.310932,0.561231,0.000906,0.582713,-1.411778
rbf,0.1,True,0.568022,0.651697,0.000602,0.655429,-0.254891


In [94]:
# One summary row per model: ROC AUC of the best pooled configuration next to
# the expert-weighted ROC AUC.
model_summary = [
    ('logistic', best_log, e_log),
    ('lda', best_lda, e_lda),
    ('knn', best_knn, e_knn),
    ('svc', best_svc, e_svc),
    ('naive_bayes', best_nb, e_nb),
    ('decision_tree', best_dt, e_dt),
    ('random_forest', best_rf, e_rf),
    ('gradient_boosting', best_gb, e_gb),
]

results = pd.DataFrame(data={
    'models': [label for label, _, _ in model_summary],
    'auc_best': [pooled_best.roc_auc_score.values[0]
                 for _, pooled_best, _ in model_summary],
    'auc_expert_weighted': [expert_score for _, _, expert_score in model_summary],
})

display(results)

Unnamed: 0,models,auc_best,auc_expert_weighted
0,logistic,0.603175,0.610414
1,lda,0.603876,0.612877
2,knn,0.602536,0.621299
3,svc,0.604691,0.619601
4,naive_bayes,0.513705,0.547049
5,decision_tree,0.556365,0.58141
6,random_forest,0.575176,0.597206
7,gradient_boosting,0.579215,0.604185


### Conclusions


In [95]:
# Write the summary table to a pickle file in the current working directory.
results.to_pickle(path="results_coarse_no_metadata.pkl")

In [96]:
# Average of each score column across the eight models. `numeric_only=True`
# skips the text `models` column; without it pandas >= 2.0 raises a TypeError
# instead of silently dropping that column.
results.mean(axis=0, numeric_only=True)

auc_best               0.579842
auc_expert_weighted    0.599255
dtype: float64

In [97]:
# Row of the model with the highest pooled AUC. `idxmax` returns an index
# label, which is what `.loc` expects; `argmax` returns a position and only
# matched here because `results` has the default 0..n-1 index.
results.loc[results['auc_best'].idxmax()]

models                      svc
auc_best               0.604691
auc_expert_weighted    0.619601
Name: 3, dtype: object

In [98]:
# Row of the model with the highest expert-weighted AUC. `idxmax` returns an
# index label, which is what `.loc` expects; `argmax` returns a position and
# only matched here because `results` has the default 0..n-1 index.
results.loc[results['auc_expert_weighted'].idxmax()]

models                      knn
auc_best               0.602536
auc_expert_weighted    0.621299
Name: 2, dtype: object