In [1]:
# !pip install hyperopt

In [2]:
import hyperopt

# Show the installed hyperopt version as the cell output.
hyperopt.__version__

'0.2.7'

## 1. 입력 변수명과 입력값의 검색 공간(Search Space) 설정
- -10 ~ 10까지 1간격을 가지는 입력 변수 x와 -15 ~ 15까지 1간격으로 입력 변수 y 설정.

In [3]:
from hyperopt import hp

# Search space for the toy problem: x is sampled with hp.quniform over [-10, 10]
# and y over [-15, 15], both with a quantisation step of 1.
search_space = dict(
    x=hp.quniform('x', -10, 10, 1),
    y=hp.quniform('y', -15, 15, 1),
)

## 2. 목적 함수(Objective Function) 설정
- 목적 함수를 생성.
- 변숫값과 변수 검색 공간을 가지는 딕셔너리를 인자로 받고, 특정 값을 반환

In [4]:
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Evaluate the toy objective ``x**2 - 20*y`` for one sampled point.

    Args:
        search_space: Mapping holding the sampled values under the keys
            ``'x'`` and ``'y'``.

    Returns:
        The plain numeric value ``x**2 - 20*y`` (not wrapped in a
        ``{'loss': ..., 'status': ...}`` dict).
    """
    x_val, y_val = search_space['x'], search_space['y']
    return x_val**2 - 20 * y_val

## 3. 목적 함수의 반환 최솟값을 가지는 최적 입력값을 유추

In [5]:
from hyperopt import fmin, tpe, Trials
import numpy as np

# Trials object in which fmin() stores the tried inputs and their results.
trial_val = Trials()

# Search for the inputs that minimise the objective function, using TPE and
# only 5 evaluations (max_evals=5). rstate receives a generator seeded with 0.
best_01 = fmin(
    fn=objective_func,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=trial_val,
    rstate=np.random.default_rng(seed=0),
)

print('best:', best_01)

100%|████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1029.48trial/s, best loss: -224.0]
best: {'x': -4.0, 'y': 12.0}


In [6]:
# Start from a fresh Trials object so this run's history is recorded separately.
trial_val = Trials()

# Same search as before, re-tested with max_evals raised to 20.
best_02 = fmin(
    fn=objective_func,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trial_val,
    rstate=np.random.default_rng(seed=0),
)

print('best:', best_02)

100%|███████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 849.44trial/s, best loss: -296.0]
best: {'x': 2.0, 'y': 15.0}


In [7]:
# The Trials object passed to fmin() keeps the objective function's return values
# in its `results` attribute, as a Python list.
# Each element is a dict of the form {'loss': <function return value>, 'status': <return status>}.
trial_val.results

[{'loss': -64.0, 'status': 'ok'},
 {'loss': -184.0, 'status': 'ok'},
 {'loss': 56.0, 'status': 'ok'},
 {'loss': -224.0, 'status': 'ok'},
 {'loss': 61.0, 'status': 'ok'},
 {'loss': -296.0, 'status': 'ok'},
 {'loss': -40.0, 'status': 'ok'},
 {'loss': 281.0, 'status': 'ok'},
 {'loss': 64.0, 'status': 'ok'},
 {'loss': 100.0, 'status': 'ok'},
 {'loss': 60.0, 'status': 'ok'},
 {'loss': -39.0, 'status': 'ok'},
 {'loss': 1.0, 'status': 'ok'},
 {'loss': -164.0, 'status': 'ok'},
 {'loss': 21.0, 'status': 'ok'},
 {'loss': -56.0, 'status': 'ok'},
 {'loss': 284.0, 'status': 'ok'},
 {'loss': 176.0, 'status': 'ok'},
 {'loss': -171.0, 'status': 'ok'},
 {'loss': 0.0, 'status': 'ok'}]

In [8]:
# The `vals` attribute of the Trials object is a dict of the form
# {'<input variable name>': [value used in each evaluation, ...]}.
trial_val.vals

{'x': [-6.0,
  -4.0,
  4.0,
  -4.0,
  9.0,
  2.0,
  10.0,
  -9.0,
  -8.0,
  -0.0,
  -0.0,
  1.0,
  9.0,
  6.0,
  9.0,
  2.0,
  -2.0,
  -4.0,
  7.0,
  -0.0],
 'y': [5.0,
  10.0,
  -2.0,
  12.0,
  1.0,
  15.0,
  7.0,
  -10.0,
  0.0,
  -5.0,
  -3.0,
  2.0,
  4.0,
  10.0,
  3.0,
  3.0,
  -14.0,
  -8.0,
  11.0,
  -0.0]}

In [9]:
import pandas as pd

# Collect the value stored under 'loss' in each per-trial result dict.
losses = [trial_result['loss'] for trial_result in trial_val.results]

# Put the tried inputs and their losses side by side, one row per trial.
result_df = pd.DataFrame(
    {
        'x': trial_val.vals['x'],
        'y': trial_val.vals['y'],
        'losses': losses,
    }
)
result_df

Unnamed: 0,x,y,losses
0,-6.0,5.0,-64.0
1,-4.0,10.0,-184.0
2,4.0,-2.0,56.0
3,-4.0,12.0,-224.0
4,9.0,1.0,61.0
5,2.0,15.0,-296.0
6,10.0,7.0,-40.0
7,-9.0,-10.0,281.0
8,-8.0,0.0,64.0
9,-0.0,-5.0,100.0


# HyperOpt를 이용한 XGBoost 하이퍼 파라미터 최적화

In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset and combine features and label in one DataFrame,
# with the label appended as the last column ('target').
dataset = load_breast_cancer()
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names).assign(
    target=dataset.target
)

# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [11]:
# Hold out 20% of the data as the test set.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split the training portion again: 10% of it becomes a separate evaluation set.
tr_X, eval_X, tr_y, eval_y = train_test_split(
    train_X, train_y, test_size=0.1, random_state=42
)

In [12]:
from hyperopt import hp

# Hyperparameter ranges explored for the XGBoost classifier.
# max_depth and min_child_weight are sampled with hp.quniform (step 1);
# learning_rate and colsample_bytree are sampled with hp.uniform.
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [13]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Score one sampled hyperparameter set with 3-fold cross-validation.

    Builds an ``XGBClassifier`` (100 estimators, ``logloss`` eval metric) from
    the sampled values and cross-validates it on the module-level ``train_X`` /
    ``train_y``.

    Args:
        search_space: Mapping with the sampled ``'max_depth'``,
            ``'min_child_weight'``, ``'learning_rate'`` and
            ``'colsample_bytree'`` values.

    Returns:
        ``{'loss': <negated mean CV accuracy>, 'status': STATUS_OK}``. The
        accuracy is negated because the value is used as a loss.
    """
    # max_depth and min_child_weight are cast to int before being handed to
    # the classifier; the other two values are passed through unchanged.
    sampled_params = {
        'max_depth': int(search_space['max_depth']),
        'min_child_weight': int(search_space['min_child_weight']),
        'learning_rate': search_space['learning_rate'],
        'colsample_bytree': search_space['colsample_bytree'],
    }
    model = XGBClassifier(n_estimators=100, eval_metric='logloss', **sampled_params)

    fold_scores = cross_val_score(model, train_X, train_y, scoring='accuracy', cv=3)
    mean_accuracy = np.mean(fold_scores)

    return {'loss': -1 * mean_accuracy, 'status': STATUS_OK}

In [14]:
from hyperopt import fmin, tpe, Trials

# Fresh Trials object for the XGBoost hyperparameter search.
trial_val = Trials()

# Run TPE for 50 evaluations over xgb_search_space; rstate receives a
# generator seeded with 42.
best = fmin(
    fn=objective_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
    rstate=np.random.default_rng(seed=42),
)

print('best:', best)

100%|███████████████████████████████████████████████████████████| 50/50 [00:08<00:00,  6.01trial/s, best loss: -0.9692111072382943]
best: {'colsample_bytree': 0.5405126265520063, 'learning_rate': 0.09666667836341278, 'max_depth': 17.0, 'min_child_weight': 2.0}


In [15]:
# Report the best hyperparameters: the two float-valued ones rounded to
# 5 decimals, the two integer-like ones cast to int.
print(
    f"colsample_bytree:{round(best['colsample_bytree'], 5)}, "
    f"learning_rate:{round(best['learning_rate'], 5)}, "
    f"max_depth:{int(best['max_depth'])}, "
    f"min_child_weight:{int(best['min_child_weight'])}"
)

colsample_bytree:0.54051, learning_rate:0.09667, max_depth:17, min_child_weight:2


In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Accuracy, precision, recall and F1 are computed from ``pred``; ROC-AUC is
    computed from ``pred_proba`` when it is supplied.

    Args:
        y_test: True labels.
        pred: Predicted labels. Required; the ``None`` default is kept only so
            the signature stays backward-compatible.
        pred_proba: Scores passed to ``roc_auc_score``. When ``None`` the AUC
            is left out of the report instead of failing.

    Raises:
        ValueError: If ``pred`` is ``None``.
    """
    if pred is None:
        raise ValueError('pred (predicted labels) must be provided')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC-AUC is optional and is computed before anything is printed, so a
    # failure here does not leave a half-printed report.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print('오차 행렬')
    print(confusion)

    # Built from separate pieces rather than a backslash-continued literal,
    # which leaked the next line's indentation into the printed text.
    report = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        report += ', AUC:{0:.4f}'.format(roc_auc)
    print(report)

In [17]:
# Final model: rebuild the classifier from the best hyperparameters found by
# HyperOpt, with 400 estimators instead of the 100 used during the search.
# eval_metric is set on the estimator instead of being passed to fit(): the
# fit() keyword has been deprecated since XGBoost 1.6 and is no longer accepted
# by recent releases, and the constructor form matches how objective_func
# builds its classifier.
xgb_wrapper = XGBClassifier(n_estimators=400,
                            learning_rate=round(best['learning_rate'], 5),
                            max_depth=int(best['max_depth']),
                            min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5),
                            eval_metric='logloss'
                           )

# The metric is evaluated on both the fitting split and the evaluation split.
# No early stopping is configured.
evals = [(tr_X, tr_y), (eval_X, eval_y)]
xgb_wrapper.fit(tr_X, tr_y, eval_set=evals)

# Predicted labels, plus the second predict_proba column for ROC-AUC.
predict = xgb_wrapper.predict(test_X)
predict_proba = xgb_wrapper.predict_proba(test_X)[:, 1]

get_clf_eval(test_y, predict, predict_proba)

[0]	validation_0-logloss:0.58435	validation_1-logloss:0.57797
[1]	validation_0-logloss:0.52147	validation_1-logloss:0.51568
[2]	validation_0-logloss:0.46774	validation_1-logloss:0.46933
[3]	validation_0-logloss:0.42275	validation_1-logloss:0.42527
[4]	validation_0-logloss:0.38273	validation_1-logloss:0.38669
[5]	validation_0-logloss:0.34965	validation_1-logloss:0.35668
[6]	validation_0-logloss:0.32007	validation_1-logloss:0.32855
[7]	validation_0-logloss:0.29380	validation_1-logloss:0.30205
[8]	validation_0-logloss:0.27030	validation_1-logloss:0.28639
[9]	validation_0-logloss:0.24987	validation_1-logloss:0.26752
[10]	validation_0-logloss:0.23184	validation_1-logloss:0.25137
[11]	validation_0-logloss:0.21514	validation_1-logloss:0.23513
[12]	validation_0-logloss:0.20001	validation_1-logloss:0.22281
[13]	validation_0-logloss:0.18657	validation_1-logloss:0.20910
[14]	validation_0-logloss:0.17419	validation_1-logloss:0.19819
[15]	validation_0-logloss:0.16250	validation_1-logloss:0.18605
[1