In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt

# !pip install scikit-optimize==0.8
import skopt
from skopt import dump, load
print(skopt.__version__)

import sklearn
print(sklearn.__version__)

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

0.8.1
0.23.2


In [2]:
seed = 3832 # set seed number for reproducible results

In [3]:
ex = pd.read_csv("./examination_bert_result_df.csv")
te = pd.read_csv("./treatment_effect_bert_result_df.csv")
ph = pd.read_csv("./past_history_bert_result_df.csv")

ph = ph.loc[:,["id", "past_history_pred_0", "past_history_pred_1", "label"]].set_index('id')
ex = ex.loc[:,["id", "examination_pred_0", "examination_pred_1", "label"]].set_index('id')
te = te.loc[:,["id", "treatment_effect_pred_0", "treatment_effect_pred_1", "label"]].set_index('id')

df = ph.combine_first(ex)
df = df.combine_first(te)
df

Unnamed: 0_level_0,examination_pred_0,examination_pred_1,label,past_history_pred_0,past_history_pred_1,treatment_effect_pred_0,treatment_effect_pred_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59194424,-3.807,4.08200,1,-5.430,4.883,-4.990,4.980
22235170,-3.992,4.20700,1,-5.360,4.793,-5.035,5.004
131182223,-3.979,4.22700,1,-4.875,4.270,-4.914,4.930
9453533,-4.000,4.24200,1,-5.090,4.500,-5.125,5.040
58102385,-4.195,4.45000,1,-5.375,4.820,-5.055,4.938
...,...,...,...,...,...,...,...
58439794,-4.105,4.38300,0,-5.350,4.793,-5.035,5.004
63772231,-4.188,4.46500,1,-4.660,4.074,-5.082,5.043
55819857,-3.870,4.11700,1,-5.190,4.650,-5.094,5.010
111502342,-4.117,4.39000,1,-4.800,4.246,-5.030,5.023


In [4]:
tmp = df.isna().sum()
tmp[tmp!=0]

Series([], dtype: int64)

In [5]:
df = df.loc[:,["past_history_pred_0", "past_history_pred_1", "treatment_effect_pred_0", "treatment_effect_pred_1", "examination_pred_0", "examination_pred_1", "label"]]
X = df.loc[:,["past_history_pred_0", "past_history_pred_1", "treatment_effect_pred_0", "treatment_effect_pred_1", "examination_pred_0", "examination_pred_1"]]
y = df.loc[:,"label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, stratify = y, random_state = seed)

X_train

Unnamed: 0_level_0,past_history_pred_0,past_history_pred_1,treatment_effect_pred_0,treatment_effect_pred_1,examination_pred_0,examination_pred_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
36963276,-5.367,4.810,-5.016,4.996,-4.0200,4.2700
114857723,2.701,-2.875,-5.055,5.027,1.0750,-0.8600
130748375,-5.387,4.832,-5.040,5.000,-4.0470,4.2900
48163040,-5.273,4.727,-5.047,4.996,0.1318,0.1439
32328364,-5.098,4.543,-5.040,4.980,-3.8440,4.0740
...,...,...,...,...,...,...
66865562,-5.030,4.438,-5.094,4.926,-4.0900,4.3550
9453533,-5.090,4.500,-5.125,5.040,-4.0000,4.2420
36280227,-5.355,4.816,-5.040,5.004,-1.8680,2.0180
31609767,-5.348,4.810,-5.090,5.040,-4.1900,4.4200


In [6]:
# imputer = KNNImputer(n_neighbors=10)
scaler = MinMaxScaler()
augmentation = ADASYN(random_state=seed)
model = XGBClassifier(tree_method='gpu_hist', 
                      use_label_encoder=False, 
                      eval_metric='logloss',
                      random_state =seed
                     )

pipe = Pipeline(steps=[
#     ('imputer', imputer),
    ('scaler', scaler),
    ('augmentation', augmentation),
    ('model', model)
    ])

search_space = {
    'model__max_depth': Integer(low = 3, high = 20, prior='uniform'),
    'model__n_estimators': Integer(low = 100, high = 1000, prior='uniform'), #
    'model__learning_rate': Real(low = 0.001, high = 0.2, prior='log-uniform'),
    'model__gamma': Real(low = 0.1, high = 1.0, prior='log-uniform'),
    'model__scale_pos_weight': Real(low = 0.1, high = 10, prior='log-uniform'),
    'model__colsample_bytree': Real(low = 0.4, high = 1.0),
    'model__min_child_weight': Integer(low = 50, high = 200, prior='uniform'),
}

opt = BayesSearchCV(
    pipe,
    [(search_space, 40)],
    n_iter=5,
#     cv=5,
    n_jobs=1,
    random_state = seed
)

# callback handler
def on_step(optim_result):
    score = opt.best_score_
    print("best score: %s" % score)
    if score >= 0.98:
        print('Interrupting!')
        return True

In [7]:
opt.fit(X_train, y_train, callback=on_step)
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))

best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best score: 0.9102167182662538
best sco

In [8]:
opt.cv_results_

defaultdict(list,
            {'split0_test_score': [0.9076923076923077,
              0.9076923076923077,
              0.09230769230769231,
              0.09230769230769231,
              0.9076923076923077,
              0.9076923076923077,
              0.09230769230769231,
              0.09230769230769231,
              0.9076923076923077,
              0.09230769230769231,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.09230769230769231,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
              0.9076923076923077,
   

In [9]:
opt.best_estimator_

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('augmentation', ADASYN(random_state=3832)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.9769312026861221,
                               eval_metric='logloss', gamma=0.4253134578365801,
                               gpu_id=0, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.13946303889907458,
                               max_delta_step=0, max_depth=13,
                               min_child_weight=102, missing=nan,
                               monotone_constraints='()', n_estimators=491,
                               n_jobs=20, num_parallel_tree=1,
                               random_state=3832, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=4

In [10]:
opt.optimizer_results_

[          fun: -0.9102167182662538
     func_vals: array([-0.91021672, -0.91021672, -0.08978328, -0.08978328, -0.91021672,
        -0.91021672, -0.08978328, -0.08978328, -0.91021672, -0.08978328,
        -0.91021672, -0.91021672, -0.91021672, -0.91021672, -0.89164087,
        -0.91021672, -0.91021672, -0.91021672, -0.91021672, -0.08978328,
        -0.91021672, -0.91021672, -0.91021672, -0.89164087, -0.91021672,
        -0.91021672, -0.91021672, -0.91021672, -0.91021672, -0.91021672,
        -0.91021672, -0.91021672, -0.91021672, -0.91021672, -0.91021672,
        -0.91021672, -0.91021672, -0.91021672, -0.91021672, -0.91021672])
        models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
                          n_restarts_optimizer=2, noise='gaussian',
                          normalize_y=True, random_state=238102835), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1, 1, 1], nu=2.5) 

In [11]:
opt.best_params_

OrderedDict([('model__colsample_bytree', 0.9769312026861221),
             ('model__gamma', 0.4253134578365801),
             ('model__learning_rate', 0.13946303889907458),
             ('model__max_depth', 13),
             ('model__min_child_weight', 102),
             ('model__n_estimators', 491),
             ('model__scale_pos_weight', 4.103218442077472)])

In [12]:
# Check the classification result of each XLM-RoBERTa Model 
from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score,
    precision_score, 
    recall_score, 
    f1_score,
    cohen_kappa_score
)
y_test_pred  = opt.predict(X_test)
y_test_prob = opt.predict_proba(X_test)

print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print("-------------------------")
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
auroc = roc_auc_score(y_test, y_test_prob[:, 1])
kappa = cohen_kappa_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)

print(f"Accuracy: ", '%.4f' % accuracy)
print(f"F1 score: ", '%.4f' % f1)
print(f"AUROC: ", '%.4f' % auroc)
print("Kappa : ", '%.4f' % kappa)
print(f"Recall: ", '%.4f' % recall)
print(f"Precision: ", '%.4f' % precision)

Confusion Matrix
[[ 0  7]
 [ 0 74]]
-------------------------
Accuracy:  0.9136
F1 score:  0.9548
AUROC:  0.7346
Kappa :  0.0000
Recall:  1.0000
Precision:  0.9136


In [13]:
# Save HPO results
dump(opt, 'result.pkl')

# Training ratio vs. Accuracy, F1 score

In [14]:
# Load HPO results
res_loaded = load('result.pkl')
res_loaded.best_params_

OrderedDict([('model__colsample_bytree', 0.9769312026861221),
             ('model__gamma', 0.4253134578365801),
             ('model__learning_rate', 0.13946303889907458),
             ('model__max_depth', 13),
             ('model__min_child_weight', 102),
             ('model__n_estimators', 491),
             ('model__scale_pos_weight', 4.103218442077472)])

In [15]:
# Applying Optimized hyperparameters on the XGBoost Classifier model

model = XGBClassifier(tree_method='gpu_hist', 
                      use_label_encoder=False, 
                      eval_metric='logloss',
                      random_state=seed
                     )

for n, v in res_loaded.best_params_.items():
    setattr(model, re.sub("model__", "", n), v)
    
model

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9769312026861221,
              eval_metric='logloss', gamma=0.4253134578365801, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.13946303889907458, max_delta_step=None,
              max_depth=13, min_child_weight=102, missing=nan,
              monotone_constraints=None, n_estimators=491, n_jobs=None,
              num_parallel_tree=None, random_state=3832, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=4.103218442077472,
              subsample=None, tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=None, verbosity=None)

In [16]:
pipe = Pipeline(steps=[
#     ('imputer', imputer),
    ('scaler', scaler),
    ('augmentation', augmentation),
    ('model', model)
    ])

In [17]:
df = pd.DataFrame({
    'train_set_ratio':[0],
    'Accuracy':[0],
    'F1-score':[0]
    })

for i in range(1,11):
    X_train_sample = X_train.sample(frac=1, random_state=seed)[:int(len(X_train)*i/10)]
    y_train_sample = pd.merge(X_train_sample, y_train, on="id", how="left").loc[:,"label"]
    pipe.fit(X_train_sample, y_train_sample)
    pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    tmp_df = pd.DataFrame({
        'train_set_ratio':[i/10],
        'Accuracy':[accuracy],
        'F1-score':[f1],
        })
    df = df.append(tmp_df, ignore_index = True)
    
df

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 6

In [None]:
X = df["train_set_ratio"]
y1 = df["Accuracy"]
y2 = df["F1-score"]

plt.plot(X, y1, '-o', label='ACC')
plt.plot(X, y2, '-o', label='F1')
plt.xlim([0.0, 1.1])      
plt.ylim([0.0, 1])
plt.legend()
plt.show()