# Bayesian Optimization for Hyperparameter Tuning

In [7]:
# load library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import zipfile
import urllib.request
import io
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
warnings.filterwarnings('ignore')
#!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

In [2]:
# load processed data
train_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train/all_feat/train_latest.csv')
valid_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/valid/all_feat/valid_latest.csv')

In [3]:
# separate response and features
X_train = train_latest.drop(['id','subject_id','admittime','dischtime','readmitted_within_30days'], axis=1)
y_train = train_latest["readmitted_within_30days"]
X_test = valid_latest.drop(['id','subject_id','admittime','dischtime','readmitted_within_30days'], axis=1)
y_test = valid_latest["readmitted_within_30days"]
#from adjust_imbl import adjust_imbl
#X_train, y_train = adjust_imbl(X_train, y_train)

In [43]:
# logistic regression
def lr_objt_fun(C):
    lr = LogisticRegression(C=C, penalty=penalty, solver='liblinear', random_state=42)
    score = cross_val_score(lr, X_train, y_train, cv=5, scoring='roc_auc').mean()
    return score
pbounds = {'C': (0.001, 100)}
penalties = ['l1', 'l2']
best_score = -1
best_params = None
for penalty in penalties:
    optimizer = BayesianOptimization(f=lr_objt_fun, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=10, n_iter=10)
    if optimizer.max['target'] > best_score:
        best_score = optimizer.max['target']
        best_params = {'C': optimizer.max['params']['C'], 'penalty': penalty}
print(f"Best result: {best_params} with score: {best_score}")

|   iter    |  target   |     C     |
-------------------------------------
| [0m1        [0m | [0m0.7373   [0m | [0m37.45    [0m |
| [0m2        [0m | [0m0.7368   [0m | [0m95.07    [0m |
| [0m3        [0m | [0m0.7369   [0m | [0m73.2     [0m |
| [0m4        [0m | [0m0.7369   [0m | [0m59.87    [0m |
| [95m5        [0m | [95m0.7382   [0m | [95m15.6     [0m |
| [95m6        [0m | [95m0.7382   [0m | [95m15.6     [0m |
| [95m7        [0m | [95m0.7396   [0m | [95m5.809    [0m |
| [0m8        [0m | [0m0.7368   [0m | [0m86.62    [0m |
| [0m9        [0m | [0m0.7369   [0m | [0m60.11    [0m |
| [0m10       [0m | [0m0.7369   [0m | [0m70.81    [0m |
| [0m11       [0m | [0m0.5061   [0m | [0m0.001    [0m |
| [0m12       [0m | [0m0.7388   [0m | [0m10.11    [0m |
| [0m13       [0m | [0m0.7375   [0m | [0m29.35    [0m |
| [0m14       [0m | [0m0.7371   [0m | [0m45.63    [0m |
| [0m15       [0m | [0m0.7377   [0m | [0m2

In [44]:
def lr_objt_fun(args):
    C, penalty = args
    lr = LogisticRegression(C=C, penalty=penalty, solver='liblinear', random_state=42)
    score = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy').mean()
    return -score  
space = [
    hp.loguniform('C', np.log(0.001), np.log(100)),  
    hp.choice('penalty', ['l1','l2']) 
]
trials = Trials()
best = fmin(fn=lr_objt_fun, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
print(best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [15:38<00:00,  9.39s/trial, best loss: -0.849746130607094]
{'C': 0.5487068708926907, 'penalty': 0}
