## XGBoost-Optunaのサンプルコード（train, valid, test使用）

In [1]:
%load_ext lab_black

In [2]:
# ライブラリーのインポート
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly

%matplotlib inline

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# XGBoost
import xgboost as xgb

# Optuna
import optuna
from optuna.samplers import TPESampler

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=pd.core.common.SettingWithCopyWarning)

In [3]:
print(xgb.__version__)

1.3.3


In [4]:
print(optuna.__version__)

2.8.0


In [5]:
print(plotly.__version__)

5.1.0


In [6]:
# データセットの読込み
boston = load_boston()

# 説明変数の格納
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# 目的変数の追加
df["MEDV"] = boston.target

# データの中身を確認
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### 前処理

In [7]:
# ランダムシード値
RANDOM_STATE = 10

# 学習データと評価データの割合
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0 : df.shape[1] - 1],
    df.iloc[:, df.shape[1] - 1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# trainのデータセットの2割をモデル学習時のバリデーションデータとして利用する
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [8]:
def objective(trial):

    param = {
        "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1, 40),
        "max_delta_step": trial.suggest_loguniform("max_delta_step", 1e-8, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.0, 1.0),
        "reg_lambda": trial.suggest_uniform("reg_lambda", 0.0, 1000.0),
        "reg_alpha": trial.suggest_uniform("reg_alpha", 0.0, 1000.0),
    }

    model = xgb.XGBRegressor(**param)

    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=False,
    )

    preds = model.predict(x_valid)
    mae = mean_absolute_error(y_valid, preds)

    return mae

In [9]:
%%time
# optunaで最適値を見つける
# create_studyメソッドの引数"sampler"にサンプラーと乱数シードを指定
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=RANDOM_STATE))
study.optimize(objective, n_trials=500)

[32m[I 2021-07-10 10:42:36,411][0m A new study created in memory with name: no-name-9cd152a0-e5ea-4243-b96d-8de46bafe39a[0m
[32m[I 2021-07-10 10:42:36,492][0m Trial 0 finished with value: 20.835659417988342 and parameters: {'eta': 0.014810344004555135, 'gamma': 1.4656004675652718e-08, 'max_depth': 6, 'min_child_weight': 15.83538918896983, 'max_delta_step': 9.728728830009641e-05, 'subsample': 0.22479664553084766, 'reg_lambda': 198.06286475962398, 'reg_alpha': 760.5307121989588}. Best is trial 0 with value: 20.835659417988342.[0m
[32m[I 2021-07-10 10:42:36,528][0m Trial 1 finished with value: 20.835802469135803 and parameters: {'eta': 2.2536511574969237e-07, 'gamma': 5.090008568091192e-08, 'max_depth': 7, 'min_child_weight': 33.681648241632445, 'max_delta_step': 1.075439863800637e-08, 'subsample': 0.5121922633857766, 'reg_lambda': 812.6209616521135, 'reg_alpha': 612.5260668293881}. Best is trial 0 with value: 20.835659417988342.[0m
[32m[I 2021-07-10 10:42:36,592][0m Trial 2 fi

CPU times: user 2min 19s, sys: 4.35 s, total: 2min 23s
Wall time: 1min 14s


In [10]:
best_params = study.best_params
best_params["random_state"] = RANDOM_STATE
best_params

{'eta': 0.7843941884609489,
 'gamma': 0.7110281809327414,
 'max_depth': 6,
 'min_child_weight': 3.004719486966463,
 'max_delta_step': 0.5195440186714264,
 'subsample': 0.34386629839407834,
 'reg_lambda': 33.59723365766873,
 'reg_alpha': 2.021344013552837,
 'random_state': 10}

In [11]:
# チューニングしたハイパーパラメーターをフィット
optimised_model = xgb.XGBRegressor(**(best_params))

optimised_model.fit(x_train, y_train)

# XGBoost推論
y_pred = optimised_model.predict(x_test)

In [12]:
# 評価
def calculate_scores(true, pred):
    """全ての評価指標を計算する

    Parameters
    ----------
    true (np.array)       : 実測値
    pred (np.array)       : 予測値

    Returns
    -------
    scores (pd.DataFrame) : 各評価指標を纏めた結果

    """
    scores = {}
    scores = pd.DataFrame(
        {
            "R2": r2_score(true, pred),
            "MAE": mean_absolute_error(true, pred),
            "MSE": mean_squared_error(true, pred),
            "RMSE": np.sqrt(mean_squared_error(true, pred)),
        },
        index=["scores"],
    )
    return scores

In [13]:
scores = calculate_scores(y_test, y_pred)
print(scores)

              R2       MAE       MSE      RMSE
scores  0.768409  3.302701  24.21993  4.921375


### 最適化結果の可視化

In [None]:
# 各パラメータにおける目的変数の値をヒートマップで表示
# optuna.visualization.plot_contour(study)

In [None]:
# どのパラメータが効いていたかを可視化
# optuna.visualization.plot_param_importances(study)

In [None]:
# 最適化の履歴を可視化
# optuna.visualization.plot_optimization_history(study)

In [None]:
# パラメータの組合せと目的変数の結果を可視化
# optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# 各パラメータの値と目的変数の結果を可視化
# optuna.visualization.plot_slice(study)