## XGBoost-Optunaのサンプルコード

In [1]:
# Load the lab_black IPython extension.
# NOTE(review): presumably this auto-formats code cells with Black and needs
# the extension package installed in the kernel environment -- confirm.
%load_ext lab_black

In [2]:
# ライブラリーのインポート
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# XGBoost
import xgboost as xgb

# Optuna
import optuna
from sklearn.model_selection import cross_val_score

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [3]:
# Print the installed XGBoost version so the run environment is recorded.
print(xgb.__version__)

1.2.0


In [4]:
# Load the Boston housing dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release.
boston = load_boston()

# Build a single frame: the explanatory variables under their feature names,
# with the target appended as the last column, "MEDV".
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    MEDV=boston.target
)

# Preview the first rows
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### 前処理

In [5]:
# Random seed
RANDOM_STATE = 10

# Fraction of the rows held out as evaluation (test) data
TEST_SIZE = 0.2

# The target is the last column of df; every column before it is a feature.
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Create the training and evaluation splits
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [6]:
def objective(trial):
    """Optuna objective: cross-validated negative MAE of an XGBoost regressor.

    Samples one set of XGBoost hyperparameters from ``trial``, builds an
    ``xgb.XGBRegressor`` with them and scores it with 5-fold cross-validation
    on the module-level ``x_train`` / ``y_train``.

    Parameters
    ----------
    trial : Optuna trial object used to sample the hyperparameters

    Returns
    -------
    neg_mae (float) : mean of the per-fold "neg_mean_absolute_error" scores.
                      The value is the negated MAE, so higher is better and
                      the study must be created with direction="maximize".

    """
    eta = trial.suggest_loguniform("eta", 1e-8, 1.0)
    gamma = trial.suggest_loguniform("gamma", 1e-8, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 10)
    min_child_weight = trial.suggest_loguniform("min_child_weight", 1, 40)
    max_delta_step = trial.suggest_loguniform("max_delta_step", 1e-8, 1.0)
    # XGBoost documents the valid range of subsample as (0, 1]. A lower bound
    # of 0.0 lets a trial train on (almost) no rows, so start the range at a
    # strictly positive value instead.
    subsample = trial.suggest_uniform("subsample", 0.1, 1.0)
    reg_lambda = trial.suggest_uniform("reg_lambda", 0.0, 1000.0)
    reg_alpha = trial.suggest_uniform("reg_alpha", 0.0, 1000.0)

    model = xgb.XGBRegressor(
        eta=eta,
        gamma=gamma,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        max_delta_step=max_delta_step,
        subsample=subsample,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
    )

    fold_scores = cross_val_score(
        model, x_train, y_train, cv=5, scoring="neg_mean_absolute_error"
    )
    # The scorer returns the *negated* MAE for each fold; average over folds.
    neg_mae = fold_scores.mean()

    return neg_mae

In [7]:
%%time
# optunaで最適値を見つける
# 注：cross_val_scoreの出力は全て高いほど良い
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[32m[I 2021-05-28 14:53:28,751][0m A new study created in memory with name: no-name-b62c491f-8991-4a10-bfae-f3e80a6f079a[0m
[32m[I 2021-05-28 14:53:28,924][0m Trial 0 finished with value: -21.343863422670484 and parameters: {'eta': 2.0868349067101036e-06, 'gamma': 0.012989525114987101, 'max_depth': 6, 'min_child_weight': 8.364310147778452, 'max_delta_step': 0.0035993934500215424, 'subsample': 0.28338800958767707, 'reg_lambda': 476.07974679616626, 'reg_alpha': 704.4893367224554}. Best is trial 0 with value: -21.343863422670484.[0m
[32m[I 2021-05-28 14:53:29,067][0m Trial 1 finished with value: -21.343864197530866 and parameters: {'eta': 5.651746392881348e-07, 'gamma': 0.001088276205966229, 'max_depth': 2, 'min_child_weight': 22.67929447375698, 'max_delta_step': 2.82972785590224e-05, 'subsample': 0.9161398936243222, 'reg_lambda': 122.88232665112075, 'reg_alpha': 324.936222717748}. Best is trial 0 with value: -21.343863422670484.[0m
[32m[I 2021-05-28 14:53:29,209][0m Trial 2 fi

CPU times: user 8min 20s, sys: 24.1 s, total: 8min 44s
Wall time: 6min 57s


In [8]:
# Rebuild the regressor from the best hyperparameters found by the study.
# The keys of study.best_params are exactly the names suggested in objective()
# (eta, gamma, max_depth, min_child_weight, max_delta_step, subsample,
# reg_lambda, reg_alpha), which are also the XGBRegressor keyword arguments,
# so the dict is unpacked directly.
best_params = study.best_params
optimised_model = xgb.XGBRegressor(**best_params)

optimised_model.fit(x_train, y_train)

# XGBoost inference on the evaluation data
y_pred = optimised_model.predict(x_test)

In [9]:
# Evaluation
def calculate_scores(true, pred):
    """Compute all evaluation metrics for a set of predictions.

    Parameters
    ----------
    true (np.array)       : observed (ground-truth) values
    pred (np.array)       : predicted values

    Returns
    -------
    scores (pd.DataFrame) : one-row frame (index "scores") with the columns
                            "R2", "MAE", "MSE" and "RMSE"

    """
    # Compute MSE once; RMSE is derived from it as its square root.
    mse = mean_squared_error(true, pred)
    return pd.DataFrame(
        {
            "R2": r2_score(true, pred),
            "MAE": mean_absolute_error(true, pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
        },
        index=["scores"],
    )

In [10]:
# Score the test-set predictions (y_pred) against the true test targets
# (y_test) and print the resulting one-row metric table.
scores = calculate_scores(y_test, y_pred)
print(scores)

              R2       MAE        MSE      RMSE
scores  0.818378  2.868973  18.994179  4.358231
