## XGBoost-Optunaのサンプルコード

In [1]:
%load_ext lab_black

In [2]:
# ライブラリーのインポート
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# XGBoost
import xgboost as xgb

# BayesianOptimization
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_predict

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [3]:
print(xgb.__version__)

1.2.0


In [4]:
# データセットの読込み
boston = load_boston()

# 説明変数の格納
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# 目的変数の追加
df["MEDV"] = boston.target

# データの中身を確認
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### 前処理

In [5]:
# ランダムシード値
RANDOM_STATE = 10

# 学習データと評価データの割合
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0 : df.shape[1] - 1],
    df.iloc[:, df.shape[1] - 1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# trainのデータセットの2割をモデル学習時のバリデーションデータとして利用する
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [6]:
def xgb_regressor(
    max_depth,
    min_child_weight,
    gamma,
    subsample,
    colsample_bytree,
    reg_alpha,
    n_estimators,
    reg_lambda,
    learning_rate,
):

    params = {
        "max_depth": int(max_depth),
        "min_child_weight": int(min_child_weight),
        "gamma": gamma,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "reg_alpha": reg_alpha,
        "n_estimators": int(n_estimators),
        "reg_lambda": reg_lambda,
        "learning_rate": learning_rate,
    }

    model = xgb.XGBRegressor(
        **params,
        early_stopping_rounds=50,
        eval_set=[(x_valid, y_valid)],
        eval_metric="mae",
        silent=False,
        n_jobs=-1
    )

    y_pred_cv = cross_val_predict(model, x_train, y_train, cv=5, n_jobs=-1)
    mae_cv = mean_absolute_error(y_train, y_pred_cv)

    return -mae_cv

In [7]:
# ベイズ最適化で探索するパラメータ空間を定義する
xgb_bo = BayesianOptimization(
    xgb_regressor,
    {
        "max_depth": (3, 8),
        "min_child_weight": (1, 5),
        "gamma": (1e-8, 1.0),
        "subsample": (0.6, 1),
        "colsample_bytree": (0.6, 1),
        "reg_alpha": (1e-5, 100),
        "n_estimators": (1000, 2000),
        "reg_lambda": (1e-5, 100),
        "learning_rate": (0.1, 0.3),
    },
)

In [8]:
%time
# ベイズ最適化を実行（scoreが最大となるようにパラメータを探索していく）
# init_point：初期に探索する点数
# acq:獲得関数。EIは(expected improvement)
xgb_bo.maximize(init_points=5, n_iter=100, acq="ei")
# 最もスコアのよかったパラメータの値を取得する。
optimized_params = xgb_bo.max["params"]

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 27.2 µs
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.303   [0m | [0m 0.8974  [0m | [0m 0.0654  [0m | [0m 0.119   [0m | [0m 5.541   [0m | [0m 3.959   [0m | [0m 1.49e+03[0m | [0m 18.68   [0m | [0m 5.196   [0m | [0m 0.8107  [0m |
| [0m 2       [0m | [0m-2.397   [0m | [0m 0.7468  [0m | [0m 0.1571  [0m | [0m 0.2487  [0m | [0m 4.596   [0m | [0m 2.102   [0m | [0m 1.408e+0[0m | [0m 12.46   [0m | [0m 16.95   [0m | [0m 0.881   [0m |
| [0m 3       [0m | [0m-2.585   [0m | [0m 0.7897  [0m | [0m 0.7274  [0m | [0m 0.1518  [0m | [0m 5.887   [0m | [0m 3.203   [0m | [0m 1.468e+0[0m | [0m 46.87   [0m | [0m 89.51   [0m | [0m 0.7469  [0m |
| 

In [9]:
# 整数のパラメータは変換
optimized_params["max_depth"] = int(optimized_params["max_depth"])
optimized_params["min_child_weight"] = int(optimized_params["min_child_weight"])
optimized_params["n_estimators"] = int(optimized_params["n_estimators"])

In [10]:
# 調整したパラメータで精度検証する
opt_model = xgb.XGBRegressor()
opt_model.set_params(**optimized_params)
opt_model.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7009024088952355,
             gamma=0.7975812745061872, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.22924195776645184,
             max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1516, n_jobs=0,
             num_parallel_tree=1, random_state=0, reg_alpha=2.924291883639843,
             reg_lambda=0.46145508105653066, scale_pos_weight=1,
             subsample=0.9781739909452981, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [11]:
y_pred_train = opt_model.predict(x_train)
y_pred_valid = opt_model.predict(x_valid)
y_pred_test = opt_model.predict(x_test)

In [12]:
# 学習モデルの評価（RMSEを計算）
print("RMSE(train data):", round(np.sqrt(mean_squared_error(y_train, y_pred_train)), 3))
print("RMSE(valid data):", round(np.sqrt(mean_squared_error(y_valid, y_pred_valid)), 3))
print("RMSE(test data):", round(np.sqrt(mean_squared_error(y_test, y_pred_test)), 3))

RMSE(train data): 0.557
RMSE(valid data): 2.928
RMSE(test data): 3.461


In [13]:
# 評価
def calculate_scores(true, pred):
    """全ての評価指標を計算する

    Parameters
    ----------
    true (np.array)       : 実測値
    pred (np.array)       : 予測値

    Returns
    -------
    scores (pd.DataFrame) : 各評価指標を纏めた結果

    """
    scores = {}
    scores = pd.DataFrame(
        {
            "R2": r2_score(true, pred),
            "MAE": mean_absolute_error(true, pred),
            "MSE": mean_squared_error(true, pred),
            "RMSE": np.sqrt(mean_squared_error(true, pred)),
        },
        index=["scores"],
    )
    return scores

In [14]:
scores = calculate_scores(y_test, y_pred_test)
print(scores)

              R2       MAE        MSE      RMSE
scores  0.885479  2.515454  11.976702  3.460737
