### Import libraries

In [34]:
from IPython.core.display import display
from src.utils.preprocessing import preprocessing_pipeline
from src.utils.get_data import import_data, split_experts
from src.utils.train import hyperparameter_tuning_cv
from src.utils.config import *
import pandas as pd
import numpy as np

In [35]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [36]:
# Location of the data directory, relative to this notebook.
DATA_PATH = '../../../data'

# Coarse-segmentation features and labels, loaded with drop_expert=True.
X_coarse, y_coarse = import_data(
    DATA_PATH,
    segmentation_type='coarse',
    drop_user_features=False,
    drop_expert=True,
)

In [37]:
# Same coarse load, but with drop_expert=False so the 'Expert' column is still
# available for the per-expert split further down.
X_e, y_e = import_data(
    DATA_PATH,
    segmentation_type='coarse',
    drop_user_features=False,
    drop_expert=False,
)

In [38]:
# Preview the first rows of the coarse features, then of their labels.
display(X_coarse.head(), y_coarse.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,EEPD50_100,EEPD100_150,EEPD150_200,EEPD200_250,EEPD250_300,EEPD300_350,EEPD350_400,EEPD400_450,EEPD450_500,EEPD500_550,...,PSD_225-425,PSD_450-550,PSD_1325-1600,PSD_1600-2000,PSD_2500-2900,PSD_3100-3700,Age,Gender,Resp_Condition,Symptoms
subject,file_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
008ba489-31ad-44d8-856b-fcf72369dc46,0,8.0,8.0,7.0,6.0,7.0,9.0,7.0,7.0,7.0,8.0,...,0.314239,0.027049,0.015518,0.030041,0.04709,0.009558,28.0,1.0,0.0,0.0
008ba489-31ad-44d8-856b-fcf72369dc46,1,2.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,...,0.09355,0.10778,0.013997,0.008304,0.045378,0.007341,28.0,1.0,0.0,0.0
008c1c9e-aeef-40c5-846c-24f1b964f884,0,4.0,4.0,4.0,4.0,6.0,6.0,5.0,5.0,5.0,5.0,...,0.160032,0.034572,0.017697,0.029626,0.027728,0.047036,44.0,0.0,0.0,0.0
008c1c9e-aeef-40c5-846c-24f1b964f884,1,3.0,5.0,5.0,6.0,4.0,5.0,6.0,5.0,5.0,4.0,...,0.050611,0.050448,0.036759,0.036765,0.032501,0.050735,44.0,0.0,0.0,0.0
00bf9f83-2e8f-47cf-a4f2-97f2beceebc1,0,6.0,6.0,6.0,7.0,6.0,7.0,6.0,6.0,8.0,6.0,...,0.264409,0.05078,0.020884,0.008068,0.002585,0.004144,37.0,0.0,1.0,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Label
subject,file_id,Unnamed: 2_level_1
008ba489-31ad-44d8-856b-fcf72369dc46,0,1.0
008ba489-31ad-44d8-856b-fcf72369dc46,1,1.0
008c1c9e-aeef-40c5-846c-24f1b964f884,0,1.0
008c1c9e-aeef-40c5-846c-24f1b964f884,1,1.0
00bf9f83-2e8f-47cf-a4f2-97f2beceebc1,0,1.0


### Preprocessing

In [39]:
# Rebinds X_coarse to the pipeline output, so re-running this cell on its own
# feeds already-preprocessed data back into preprocessing_pipeline.
X_coarse = preprocessing_pipeline(X_coarse)

In [40]:
# Set the 'Expert' column aside before preprocessing and put it back afterwards;
# it is needed later to split the rows per expert.
expert = X_e['Expert'].copy()
X_e = preprocessing_pipeline(X_e)
# NOTE(review): `.values` assigns by position, not by index label — this assumes
# preprocessing_pipeline keeps the number and order of rows. TODO confirm.
X_e['Expert'] = expert.values

### Split expert models

In [41]:
# Unpack the six objects returned by split_experts: features and labels for
# expert 1, expert 2 and expert 3, in that order.
(X_e_1, y_e_1,
 X_e_2, y_e_2,
 X_e_3, y_e_3) = split_experts(X_e, y_e)

### Grid search

In [42]:
# Column of the tuning results used to pick the best row for every model.
DECISION_METRIC = 'roc_auc_score'
# Passed as cv_k to hyperparameter_tuning_cv (presumably the number of
# cross-validation folds — confirm in src.utils.train).
CV_K = 10

#### 1. Logistic regression

In [43]:
# Tune logistic regression over LOGISTIC_PARAMS on the full coarse data set.
log_results = hyperparameter_tuning_cv(
    model='logistic',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=LOGISTIC_PARAMS,
)

display(log_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.428893,0.596055,0.000133,0.637791,-0.795674
10000,False,0.190028,0.535026,8.7e-05,0.726938,-0.183461
100000,True,0.428893,0.596055,0.000133,0.637791,-0.795674
100000,False,0.190028,0.535026,8.7e-05,0.726938,-0.183461


In [44]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_log = log_results.iloc[
    [log_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_log)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.428893,0.596055,0.000133,0.637791,-0.795674


In [45]:
# Repeat the logistic-regression tuning on each expert's subset (1, 2, 3).
log_results_1, log_results_2, log_results_3 = [
    hyperparameter_tuning_cv(model='logistic', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=LOGISTIC_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [46]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_log_1, best_log_2, best_log_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (log_results_1, log_results_2, log_results_3)
]

display(best_log_1, best_log_2, best_log_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.364467,0.640867,0.000753,0.759283,-0.872209


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,True,0.266119,0.532979,0.000405,0.642819,-1.320197


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_iter,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,False,0.617514,0.641445,0.000357,0.642857,-0.426392


#### 2. Linear Discriminant Analysis

In [47]:
# Tune linear discriminant analysis over LDA_PARAMS on the full coarse data set.
lda_results = hyperparameter_tuning_cv(
    model='lda',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=LDA_PARAMS,
)

display(lda_results)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.428824,0.596361,0.000176,0.639341,-0.790511
False,0.194535,0.536379,4.1e-05,0.727422,-0.183448


In [48]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_lda = lda_results.iloc[
    [lda_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_lda)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.428824,0.596361,0.000176,0.639341,-0.790511


In [49]:
# Repeat the LDA tuning on each expert's subset (1, 2, 3).
lda_results_1, lda_results_2, lda_results_3 = [
    hyperparameter_tuning_cv(model='lda', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=LDA_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [50]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_lda_1, best_lda_2, best_lda_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (lda_results_1, lda_results_2, lda_results_3)
]

display(best_lda_1, best_lda_2, best_lda_3)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.3739,0.653304,0.000844,0.7443,-0.946999


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.262352,0.53026,0.000713,0.642287,-1.326226


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.612285,0.63845,0.00036,0.64,-0.436563


#### 3. K-nearest Neighbors

In [51]:
# Tune k-nearest neighbours over KNN_PARAMS on the full coarse data set.
knn_results = hyperparameter_tuning_cv(
    model='knn',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=KNN_PARAMS,
)

display(knn_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.468548,0.619924,0.0002,0.612791,-0.783883
1,False,0.443651,0.618595,7.3e-05,0.694671,-0.551187
2,True,0.446534,0.611984,0.000293,0.654651,-0.717203
2,False,0.237103,0.547534,2.9e-05,0.727616,-0.218966
3,True,0.463438,0.604908,0.000138,0.550388,-0.827344
3,False,0.35525,0.575409,0.000206,0.697674,-0.512557
4,True,0.455686,0.606087,0.000178,0.592636,-0.844881
4,False,0.220397,0.541376,3.6e-05,0.724322,-0.228755
5,True,0.457991,0.593137,8.2e-05,0.514438,-0.816936
5,False,0.298973,0.555685,0.000163,0.703198,-0.442241


In [52]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_knn = knn_results.iloc[
    [knn_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_knn)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.468548,0.619924,0.0002,0.612791,-0.783883


In [53]:
# Repeat the KNN tuning on each expert's subset (1, 2, 3).
knn_results_1, knn_results_2, knn_results_3 = [
    hyperparameter_tuning_cv(model='knn', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=KNN_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [54]:
# Full KNN grids for experts 1, 2 and 3, rendered one after another.
display(*(knn_results_1, knn_results_2, knn_results_3))

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.367133,0.659582,0.001366,0.691857,-1.161129
1,False,0.400402,0.652262,0.000771,0.818893,-0.456615
2,True,0.380025,0.661933,0.001472,0.738111,-0.959942
2,False,0.276758,0.579811,0.000205,0.857003,-0.07714
3,True,0.335296,0.639032,0.000628,0.592508,-1.442038
3,False,0.311753,0.597135,0.001123,0.825733,-0.386194
4,True,0.343695,0.644055,0.001222,0.634528,-1.368099
4,False,0.232462,0.563138,0.000409,0.852443,-0.107926
5,True,0.326485,0.633212,0.000284,0.54886,-1.470596
5,False,0.272671,0.577158,0.000489,0.840065,-0.245835


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.328676,0.58135,0.0003960881,0.613564,-1.305274
1,False,0.298203,0.575845,0.0007982115,0.757181,-0.651486
2,True,0.317945,0.577239,0.0006041645,0.667021,-1.149476
2,False,0.063006,0.506914,6.667467e-05,0.809309,-0.135137
3,True,0.312659,0.556285,0.0004495218,0.519415,-1.472783
3,False,0.15782,0.52184,0.000246217,0.78484,-0.395469
4,True,0.30942,0.557752,0.0003271568,0.56516,-1.452731
4,False,0.05077,0.507084,2.937483e-05,0.814628,-0.082014
5,True,0.314975,0.552768,0.0005167644,0.464362,-1.419365
5,False,0.092258,0.510814,0.000246464,0.802128,-0.215378


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.645243,0.655031,0.000229,0.654571,-0.382123
1,False,0.639512,0.656636,0.000233,0.656857,-0.372366
2,True,0.540837,0.632592,0.00038,0.64,-0.319298
2,False,0.498572,0.621683,0.00038,0.630286,-0.278626
3,True,0.63923,0.642514,0.000358,0.641429,-0.431136
3,False,0.620053,0.643324,0.000468,0.643429,-0.420846
4,True,0.573813,0.634285,0.000413,0.639429,-0.395144
4,False,0.519111,0.621569,0.000272,0.628857,-0.34283
5,True,0.632202,0.637702,0.000275,0.637429,-0.450162
5,False,0.606441,0.642106,0.000341,0.644286,-0.410275


In [55]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_knn_1, best_knn_2, best_knn_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (knn_results_1, knn_results_2, knn_results_3)
]

display(best_knn_1, best_knn_2, best_knn_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,True,0.380025,0.661933,0.001472,0.738111,-0.959942


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True,0.328676,0.58135,0.000396,0.613564,-1.305274


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,False,0.639512,0.656636,0.000233,0.656857,-0.372366


#### 4. Support Vector Classifier

In [56]:
# Tune the support vector classifier over SVC_PARAMS on the full coarse data set.
svc_results = hyperparameter_tuning_cv(
    model='svc',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=SVC_PARAMS,
)

display(svc_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.424076,0.592715,0.000236,0.636628,-0.80458
linear,0.1,False,0.0,0.5,0.0,0.731008,0.0
rbf,0.1,True,0.138128,0.531521,2.5e-05,0.741376,-0.027409
rbf,0.1,False,0.011451,0.502106,4e-06,0.731395,-0.005394
linear,0.01,True,0.424076,0.592715,0.000236,0.636628,-0.80458
linear,0.01,False,0.0,0.5,0.0,0.731008,0.0
rbf,0.01,True,0.434941,0.606863,0.000248,0.665504,-0.68398
rbf,0.01,False,0.006488,0.501369,2e-06,0.731492,-0.000926


In [57]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_svc = svc_results.iloc[
    [svc_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_svc)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rbf,0.01,True,0.434941,0.606863,0.000248,0.665504,-0.68398


In [58]:
# Repeat the SVC tuning on each expert's subset (1, 2, 3).
svc_results_1, svc_results_2, svc_results_3 = [
    hyperparameter_tuning_cv(model='svc', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=SVC_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [59]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_svc_1, best_svc_2, best_svc_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (svc_results_1, svc_results_2, svc_results_3)
]

display(best_svc_1, best_svc_2, best_svc_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.374145,0.649842,0.000417,0.758958,-0.866928


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.1,True,0.263785,0.5304,0.000282,0.637766,-1.342262


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
kernel,gamma,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rbf,0.01,False,0.63429,0.663262,0.000192,0.664857,-0.332676


#### 5. Naive Bayes Classifier

In [60]:
# Tune the naive Bayes classifier over NAIVE_BAYES_PARAMS on the full coarse data set.
nb_results = hyperparameter_tuning_cv(
    model='naive_bayes',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=NAIVE_BAYES_PARAMS,
)

display(nb_results)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.097475,0.514818,0.000866,0.688178,-0.220509
False,0.09187,0.514923,0.000948,0.714244,-0.178692


In [61]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_nb = nb_results.iloc[
    [nb_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_nb)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.09187,0.514923,0.000948,0.714244,-0.178692


In [62]:
# Repeat the naive Bayes tuning on each expert's subset (1, 2, 3).
nb_results_1, nb_results_2, nb_results_3 = [
    hyperparameter_tuning_cv(model='naive_bayes', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=NAIVE_BAYES_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [63]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_nb_1, best_nb_2, best_nb_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (nb_results_1, nb_results_2, nb_results_3)
]

display(best_nb_1, best_nb_2, best_nb_3)

Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
True,0.301947,0.594561,0.000978,0.727687,-1.095128


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.294778,0.532091,0.001164,0.50133,-1.537017


Unnamed: 0_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
oversampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.18167,0.520906,0.001138,0.540286,-0.237237


#### 6. Decision Tree

In [64]:
# Tune the decision tree over DECISION_TREE_PARAMS on the full coarse data set.
dt_results = hyperparameter_tuning_cv(
    model='decision_tree',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=DECISION_TREE_PARAMS,
)

display(dt_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,True,0.351964,0.5294,0.0003,0.550291,-1.00851
3,False,0.064378,0.508511,0.000115,0.725872,-0.090567
5,True,0.374372,0.557682,0.000213,0.612403,-0.895144
5,False,0.197162,0.527772,0.00036,0.707752,-0.330389
7,True,0.385628,0.563532,0.000274,0.617248,-0.905391
7,False,0.263464,0.538929,0.000207,0.690698,-0.495519


In [65]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_dt = dt_results.iloc[
    [dt_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_dt)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,True,0.385628,0.563532,0.000274,0.617248,-0.905391


In [66]:
# Repeat the decision-tree tuning on each expert's subset (1, 2, 3).
dt_results_1, dt_results_2, dt_results_3 = [
    hyperparameter_tuning_cv(model='decision_tree', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=DECISION_TREE_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [67]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_dt_1, best_dt_2, best_dt_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (dt_results_1, dt_results_2, dt_results_3)
]

display(best_dt_1, best_dt_2, best_dt_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,True,0.342055,0.631774,0.001776,0.694463,-1.194066


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,True,0.245803,0.522685,0.001978,0.646277,-1.25654


Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,True,0.537808,0.607115,0.000258,0.613143,-0.479072


#### 7. Random Forest

In [68]:
# Tune the random forest over RANDOM_FOREST_PARAMS on the full coarse data set.
rf_results = hyperparameter_tuning_cv(
    model='random_forest',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=RANDOM_FOREST_PARAMS,
)

display(rf_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,True,0.395191,0.554517,0.000348,0.566667,-1.033337
3,3,False,0.036141,0.505414,3.3e-05,0.730136,-0.033573
3,5,True,0.413601,0.574476,0.000263,0.592733,-0.942366
3,5,False,0.014301,0.502764,6e-06,0.731686,-0.005174
3,7,True,0.396937,0.565995,0.000377,0.600678,-0.946844
3,7,False,0.002974,0.500484,2e-06,0.731008,-0.002058
5,3,True,0.394964,0.563284,0.000183,0.59593,-0.960309
5,3,False,0.110989,0.512183,0.0001,0.717926,-0.187325
5,5,True,0.404304,0.573236,0.000369,0.611337,-0.907718
5,5,False,0.072997,0.509164,8.9e-05,0.725484,-0.098061


In [69]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_rf = rf_results.iloc[
    [rf_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_rf)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,7,True,0.412767,0.581815,0.000253,0.621221,-0.863286


In [70]:
# Repeat the random-forest tuning on each expert's subset (1, 2, 3).
rf_results_1, rf_results_2, rf_results_3 = [
    hyperparameter_tuning_cv(model='random_forest', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=RANDOM_FOREST_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [71]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_rf_1, best_rf_2, best_rf_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (rf_results_1, rf_results_2, rf_results_3)
]

display(best_rf_1, best_rf_2, best_rf_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,7,True,0.382291,0.655406,0.000614,0.767427,-0.813035


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7,5,True,0.281542,0.549249,0.0011,0.669415,-1.172626


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7,7,False,0.608261,0.637357,0.000452,0.637714,-0.437479


#### 8. Gradient Boosting

In [72]:
# Tune gradient boosting over GRADIENT_BOOSTING_PARAMS on the full coarse data set.
gb_results = hyperparameter_tuning_cv(
    model='gradient_boosting',
    data=X_coarse,
    labels=y_coarse['Label'],
    cv_k=CV_K,
    params=GRADIENT_BOOSTING_PARAMS,
)

display(gb_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,True,0.378887,0.556529,0.0003579595,0.605233,-0.937375
3,3,False,0.0,0.5,0.0,0.731008,0.0
3,5,True,0.388957,0.564662,0.0004669334,0.613275,-0.907505
3,5,False,0.0,0.499934,3.968191e-08,0.730911,-0.000756
3,7,True,0.39681,0.571216,0.0006020635,0.619089,-0.885249
3,7,False,0.0,0.5,0.0,0.731008,0.0
5,3,True,0.3787,0.569376,0.0003151276,0.641376,-0.793353
5,3,False,0.0,0.5,0.0,0.731008,0.0
5,5,True,0.3796,0.57076,0.0002076803,0.648062,-0.777301
5,5,False,0.005705,0.500778,1.445306e-06,0.730814,-0.005752


In [73]:
# Keep the single grid row with the highest DECISION_METRIC as a one-row frame
# (the first row wins ties).
best_gb = gb_results.iloc[
    [gb_results[DECISION_METRIC].reset_index(drop=True).idxmax()]
]

display(best_gb)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7,5,True,0.398834,0.587663,0.000523,0.670446,-0.673514


In [74]:
# Repeat the gradient-boosting tuning on each expert's subset (1, 2, 3).
gb_results_1, gb_results_2, gb_results_3 = [
    hyperparameter_tuning_cv(model='gradient_boosting', data=X_sub, labels=y_sub,
                             cv_k=CV_K, params=GRADIENT_BOOSTING_PARAMS)
    for X_sub, y_sub in ((X_e_1, y_e_1), (X_e_2, y_e_2), (X_e_3, y_e_3))
]

In [75]:
# Per expert, keep the grid row with the highest DECISION_METRIC
# (one-row frames; the first row wins ties).
best_gb_1, best_gb_2, best_gb_3 = [
    grid.iloc[[grid[DECISION_METRIC].reset_index(drop=True).idxmax()]]
    for grid in (gb_results_1, gb_results_2, gb_results_3)
]
display(best_gb_1, best_gb_2, best_gb_3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,True,0.380664,0.667278,0.000725,0.715309,-1.057728


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7,3,True,0.258692,0.546872,0.00045,0.718883,-0.893166


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
max_depth,n_estimators,oversampling,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,7,True,0.626931,0.65105,0.000379,0.652,-0.388518


### Results

In [76]:
# Optional inspection (disabled): uncomment a line to show the best overall
# configuration found for that model family.
# display('logistic', best_log)
# display('lda', best_lda)
# display('knn', best_knn)
# display('svc', best_svc)
# display('naive_bayes', best_nb)
# display('decision_tree', best_dt)
# display('random_forest', best_rf)
# display('gradient_boosting', best_gb)

In [77]:
# Weight of each expert = its share of the rows across the three expert subsets.
# Normalising by the summed subset sizes (instead of len(X_e)) guarantees the
# weights add up to 1 even if split_experts leaves some rows of X_e unassigned;
# when the three subsets partition X_e exactly, the values are unchanged.
expert_sizes = np.array([len(X_e_1), len(X_e_2), len(X_e_3)])
weighted_average = expert_sizes / expert_sizes.sum()


def _expert_weighted_auc(best_frames, weights):
    """Return the weighted mean ROC AUC of the per-expert best configurations.

    Parameters
    ----------
    best_frames : list of pandas.DataFrame
        One single-row frame per expert (expert 1, 2, 3 in order), each with a
        ``roc_auc_score`` column.
    weights : numpy.ndarray
        One weight per expert, in the same order as ``best_frames``.

    Returns
    -------
    float
        Sum of each expert's ROC AUC multiplied by that expert's weight.
    """
    auc_per_expert = pd.concat(best_frames).roc_auc_score.values
    return np.sum(auc_per_expert * weights)


e_log = _expert_weighted_auc([best_log_1, best_log_2, best_log_3], weighted_average)
e_lda = _expert_weighted_auc([best_lda_1, best_lda_2, best_lda_3], weighted_average)
e_knn = _expert_weighted_auc([best_knn_1, best_knn_2, best_knn_3], weighted_average)
e_svc = _expert_weighted_auc([best_svc_1, best_svc_2, best_svc_3], weighted_average)
e_nb = _expert_weighted_auc([best_nb_1, best_nb_2, best_nb_3], weighted_average)
e_dt = _expert_weighted_auc([best_dt_1, best_dt_2, best_dt_3], weighted_average)
e_rf = _expert_weighted_auc([best_rf_1, best_rf_2, best_rf_3], weighted_average)
e_gb = _expert_weighted_auc([best_gb_1, best_gb_2, best_gb_3], weighted_average)

In [78]:
# Per-expert best rows for one model family at a time.
# Rows are in expert order: first Expert 1, second Expert 2, third Expert 3.
# Uncomment a line to inspect another model family.
# display('logistic', pd.concat([best_log_1,best_log_2,best_log_3]))
# display('lda', pd.concat([best_lda_1,best_lda_2,best_lda_3]))
display('knn', pd.concat([best_knn_1, best_knn_2, best_knn_3]))
# display('svc', pd.concat([best_svc_1,best_svc_2,best_svc_3]))
# display('naive_bayes', pd.concat([best_nb_1,best_nb_2,best_nb_3]))
# display('decision_tree', pd.concat([best_dt_1,best_dt_2,best_dt_3]))
# display('random_forest', pd.concat([best_rf_1,best_rf_2,best_rf_3]))
# display('gradient_boosting', pd.concat([best_gb_1,best_gb_2,best_gb_3]))

'knn'

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,roc_auc_score,roc_auc_score_var,accuracy_score,explained_variance_score
n_neighbors,oversampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,True,0.380025,0.661933,0.001472,0.738111,-0.959942
1,True,0.328676,0.58135,0.000396,0.613564,-1.305274
1,False,0.639512,0.656636,0.000233,0.656857,-0.372366


In [79]:
# One row per model family: ROC AUC of the best configuration on the full
# coarse data next to the expert-weighted ROC AUC of the per-expert bests.
results = pd.DataFrame({
    'models': ['logistic', 'lda', 'knn', 'svc', 'naive_bayes', 'decision_tree',
               'random_forest', 'gradient_boosting'],
    'auc_best': [
        best.roc_auc_score.values[0]
        for best in (best_log, best_lda, best_knn, best_svc,
                     best_nb, best_dt, best_rf, best_gb)
    ],
    'auc_expert_weighted': [e_log, e_lda, e_knn, e_svc, e_nb, e_dt, e_rf, e_gb],
})

display(results)

Unnamed: 0,models,auc_best,auc_expert_weighted
0,logistic,0.596055,0.601797
1,lda,0.596361,0.603486
2,knn,0.619924,0.630809
3,svc,0.606863,0.610919
4,naive_bayes,0.514923,0.546859
5,decision_tree,0.563532,0.583713
6,random_forest,0.581815,0.610652
7,gradient_boosting,0.587663,0.617955


### Conclusions


In [84]:
# Persist the summary table as a pickle. The path is relative, so the file is
# written to the kernel's current working directory.
results.to_pickle("results_coarse_metadata.pkl")

In [81]:
# Column-wise mean of the two AUC columns. numeric_only=True skips the string
# 'models' column, which pandas >= 2.0 would otherwise try to average and raise
# a TypeError on.
results.mean(axis=0, numeric_only=True)

auc_best               0.583392
auc_expert_weighted    0.600774
dtype: float64

In [82]:
# Row of the model with the highest overall AUC. idxmax() returns the index
# label that .loc expects; argmax() returns a position and only matched here
# because `results` has a default 0..n-1 index.
results.loc[results['auc_best'].idxmax()]

models                      knn
auc_best               0.619924
auc_expert_weighted    0.630809
Name: 2, dtype: object

In [83]:
# Row of the model with the highest expert-weighted AUC. idxmax() returns the
# index label that .loc expects; argmax() returns a position and only matched
# here because `results` has a default 0..n-1 index.
results.loc[results['auc_expert_weighted'].idxmax()]

models                      knn
auc_best               0.619924
auc_expert_weighted    0.630809
Name: 2, dtype: object