In [14]:
import pandas as pd
# !pip install scikit-optimize==0.8
import skopt
print(skopt.__version__)

from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

0.8.0


In [15]:
# Load the three BERT result tables.
ex = pd.read_csv("./examination_bert_result_df.csv")
te = pd.read_csv("./treatment_effect_bert_result_df.csv")
ph = pd.read_csv("./past_history_bert_result_df.csv")


def _predictions_by_id(frame, prefix):
    """Keep the id, the two `<prefix>_pred_*` columns and the label, indexed by id."""
    wanted = ["id", f"{prefix}_pred_0", f"{prefix}_pred_1", "label"]
    return frame[wanted].set_index("id")


ph = _predictions_by_id(ph, "past_history")
ex = _predictions_by_id(ex, "examination")
te = _predictions_by_id(te, "treatment_effect")

# Union of all ids; where tables overlap, values from `ph` win, then `ex`, then `te`.
df = ph.combine_first(ex).combine_first(te)
df

Unnamed: 0_level_0,examination_pred_0,examination_pred_1,label,past_history_pred_0,past_history_pred_1,treatment_effect_pred_0,treatment_effect_pred_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
83,,,0.0,4.145,-4.207,3.898,-4.207
86,,,1.0,-3.523,3.867,-2.310,2.115
164,,,0.0,4.047,-4.008,2.584,-2.740
173,,,1.0,-0.557,0.778,-2.440,2.297
182,2.861,-2.828,0.0,,,,
...,...,...,...,...,...,...,...
76037,,,0.0,4.242,-4.332,2.955,-3.172
76139,,,0.0,4.168,-4.190,3.898,-4.220
76247,,,0.0,4.230,-4.355,3.760,-4.113
76381,,,0.0,4.227,-4.402,3.525,-3.934


In [16]:
# Number of missing values per column; show only the columns that have any.
tmp = df.isnull().sum()
tmp.loc[tmp.ne(0)]

examination_pred_0         1685
examination_pred_1         1685
past_history_pred_0         176
past_history_pred_1         176
treatment_effect_pred_0     176
treatment_effect_pred_1     176
dtype: int64

In [17]:
# Feature columns in a fixed, explicit order (the merged frame lists its
# columns alphabetically, so the order is re-imposed here).
feature_cols = [
    "past_history_pred_0", "past_history_pred_1",
    "treatment_effect_pred_0", "treatment_effect_pred_1",
    "examination_pred_0", "examination_pred_1",
]

df = df.loc[:, feature_cols + ["label"]]
X = df.loc[:, feature_cols]
y = df.loc[:, "label"]

# Stratified 80/20 split. `random_state` is fixed so that the partition (and
# therefore every score computed below) is identical on each
# Restart & Run All; without it the split changes on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=1234
)

X_train

Unnamed: 0_level_0,past_history_pred_0,past_history_pred_1,treatment_effect_pred_0,treatment_effect_pred_1,examination_pred_0,examination_pred_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
35715,4.1800,-4.32000,3.861,-4.190,,
53349,4.1880,-4.36000,2.541,-2.764,,
11065,4.1700,-4.19500,3.889,-4.230,3.115,-3.006
43146,-2.0020,2.30300,-2.334,2.186,,
17227,4.1760,-4.34000,3.256,-3.535,,
...,...,...,...,...,...,...
4618,-3.0500,3.54700,-2.521,2.418,,
183,,,,,2.861,-2.828
12813,4.0620,-4.14000,3.102,-3.418,,
72727,4.1170,-4.20000,3.193,-3.470,,


In [40]:
seed = 1234
# Number of hyper-parameter settings the Bayesian search evaluates.
n_search_iter = 40

# Pipeline stages: impute missing features, scale to [0, 1], oversample
# with ADASYN, then fit the gradient-boosted classifier.
# imputer = IterativeImputer(random_state=seed)  # alternative imputer, not used
imputer = KNNImputer(n_neighbors=10)
scaler = MinMaxScaler()
augmentation = ADASYN(random_state=seed)
model = XGBClassifier(
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=seed,
)

pipe = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('augmentation', augmentation),
    ('model', model),
])

# Search space over the XGBoost hyper-parameters; the `model__` prefix routes
# each value to the 'model' step of the pipeline.
search_space = {
    'model__max_depth': Integer(low=3, high=20, prior='uniform'),
    'model__n_estimators': Integer(low=100, high=1000, prior='uniform'),
    'model__learning_rate': Real(low=0.001, high=0.2, prior='log-uniform'),
    'model__gamma': Real(low=0.1, high=1.0, prior='log-uniform'),
    'model__scale_pos_weight': Real(low=0.1, high=10, prior='log-uniform'),
    'model__colsample_bytree': Real(low=0.4, high=1.0),
    'model__min_child_weight': Integer(low=50, high=200, prior='uniform'),
}

# The iteration budget is given once, through `n_iter`. Passing the space as
# a `(space, n)` tuple together with a different `n_iter` is misleading,
# because the per-space count silently takes precedence over `n_iter`.
opt = BayesSearchCV(
    pipe,
    search_space,
    n_iter=n_search_iter,
    cv=5,
    n_jobs=1,
    random_state=seed,
)

# callback handler
def on_step(optim_result, target_score=0.98):
    """Report search progress and request an early stop once the score is good enough.

    The score is read from the optimizer result handed to the callback rather
    than from a module-level search object, so the function does not depend on
    a global being defined or on attributes of a search that is still fitting.

    Parameters
    ----------
    optim_result : object with a ``fun`` attribute
        Result passed in by the search after each step. The search minimises
        the negated cross-validation score, so ``-optim_result.fun`` is the
        best score seen so far for the search space being optimised.
    target_score : float, default 0.98
        Score at or above which the search is asked to stop.

    Returns
    -------
    bool
        True to interrupt the search, False to let it continue.
    """
    score = -optim_result.fun
    print("best score: %s" % score)
    if score >= target_score:
        print('Interrupting!')
        return True
    return False

In [41]:
# Run the hyper-parameter search on the training split, reporting progress
# through the callback after every step.
opt.fit(X_train, y_train, callback=on_step)

# Best cross-validated score found during the search.
val_score = opt.best_score_
print("val. score: %s" % val_score)

# Score of the search object on the held-out test split.
test_score = opt.score(X_test, y_test)
print("test score: %s" % test_score)

best score: 0.7206329884357882
best score: 0.7206329884357882
best score: 0.8143639683505782
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9038344491783323
best score: 0.9050517346317711
best score: 0.9050517346317711
best score: 0.9062690200852099
best score: 0.9062690200852099
best score: 0.9099208764455264
best score: 0.9099208764455264
best score: 0.9099208764455264
best score: 0.9099208764455264
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9123554473524041
best score: 0.9129640900791236
best sco

In [42]:
# Cross-validation results recorded by the search (e.g. per-split test scores).
opt.cv_results_

defaultdict(list,
            {'split0_test_score': [0.7203647416413373,
              0.7203647416413373,
              0.8206686930091185,
              0.8996960486322189,
              0.7203647416413373,
              0.8297872340425532,
              0.9057750759878419,
              0.8328267477203647,
              0.7203647416413373,
              0.9057750759878419,
              0.8996960486322189,
              0.9118541033434651,
              0.9118541033434651,
              0.9118541033434651,
              0.9088145896656535,
              0.9027355623100304,
              0.9027355623100304,
              0.8996960486322189,
              0.8206686930091185,
              0.9179331306990881,
              0.9179331306990881,
              0.8996960486322189,
              0.9118541033434651,
              0.9118541033434651,
              0.9118541033434651,
              0.9179331306990881,
              0.8996960486322189,
              0.9148936170212766,
         

In [43]:
# Pipeline carrying the best hyper-parameter setting found by the search.
opt.best_estimator_

Pipeline(steps=[('imputer', KNNImputer(n_neighbors=10)),
                ('scaler', MinMaxScaler()),
                ('augmentation', ADASYN(random_state=1234)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.4, eval_metric='logloss',
                               gamma=0.1, gpu_id=0, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.0202020720953368,
                               max_delta_step=0, max_depth=15,
                               min_child_weight=89, missing=nan,
                               monotone_constraints='()', n_estimators=904,
                               n_jobs=20, num_parallel_tree=1,
                               random_state=1234, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=0.85813

In [44]:
# Raw optimizer result(s); the objective values are the negated CV scores.
opt.optimizer_results_

[          fun: -0.9129640900791236
     func_vals: array([-0.72063299, -0.72063299, -0.81436397, -0.90383445, -0.72063299,
        -0.84175289, -0.87766281, -0.82653682, -0.72063299, -0.89957395,
        -0.89470481, -0.90505173, -0.89835666, -0.90626902, -0.90200852,
        -0.90992088, -0.90931223, -0.89835666, -0.76628119, -0.91235545,
        -0.90931223, -0.90079124, -0.90444309, -0.91052952, -0.89896531,
        -0.9117468 , -0.8874011 , -0.90992088, -0.88618381, -0.90018259,
        -0.90200852, -0.91296409, -0.90566038, -0.91113816, -0.90383445,
        -0.70724285, -0.89409617, -0.9117468 , -0.90809495, -0.90322581])
        models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
                          n_restarts_optimizer=2, noise='gaussian',
                          normalize_y=True, random_state=822569775), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1, 1, 1], nu=2.5) 

In [45]:
# Best hyper-parameter setting found by the search.
opt.best_params_

OrderedDict([('model__colsample_bytree', 0.4),
             ('model__gamma', 0.1),
             ('model__learning_rate', 0.0202020720953368),
             ('model__max_depth', 15),
             ('model__min_child_weight', 89),
             ('model__n_estimators', 904),
             ('model__scale_pos_weight', 0.8581331506977758)])

In [47]:
# Evaluate the tuned pipeline on the held-out test split.
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

y_test_pred = opt.predict(X_test)        # predicted class labels
y_test_prob = opt.predict_proba(X_test)  # predicted class probabilities

print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print("-------------------------")

accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
# AUROC is computed from the probability column at index 1.
auroc = roc_auc_score(y_test, y_test_prob[:, 1])
kappa = cohen_kappa_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)

# Each label keeps its trailing space; print() adds its own separator, so the
# value is preceded by two spaces.
for metric_label, metric_value in (
    ("Accuracy: ", accuracy),
    ("F1 score: ", f1),
    ("AUROC: ", auroc),
    ("Kappa : ", kappa),
    ("Recall: ", recall),
    ("Precision: ", precision),
):
    print(metric_label, '%.4f' % metric_value)

Confusion Matrix
[[270  26]
 [  7 108]]
-------------------------
Accuracy:  0.9197
F1 score:  0.8675
AUROC:  0.9715
Kappa :  0.8104
Recall:  0.9391
Precision:  0.8060
