## CatBoost-Optunaのサンプルコード（train, valid, test使用）

In [1]:
%load_ext lab_black

In [2]:
# ライブラリーのインポート
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# CatBoost
import catboost as cb
from catboost import CatBoost, Pool

# Optuna
import optuna
from optuna.samplers import CmaEsSampler

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=pd.core.common.SettingWithCopyWarning)

In [3]:
print(cb.__version__)

0.26


In [4]:
print(optuna.__version__)

2.8.0


In [5]:
# データセットの読込み
boston = load_boston()

# 説明変数の格納
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# 目的変数の追加
df["MEDV"] = boston.target

# データの中身を確認
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### 前処理

In [6]:
# ランダムシード値
RANDOM_STATE = 10

# 学習データと評価データの割合
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0 : df.shape[1] - 1],
    df.iloc[:, df.shape[1] - 1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# trainのデータセットの2割をモデル学習時のバリデーションデータとして利用する
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [7]:
def objective(trial):

    param = {
        "iterations": trial.suggest_int("iterations", 50, 300),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.01, 100.00
        ),
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
    }

    model = cb.CatBoostRegressor(**param)

    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    preds = model.predict(x_valid)
    mae = mean_absolute_error(y_valid, preds)

    return mae

In [8]:
%%time
# optunaで最適値を見つける
# create_studyメソッドの引数"sampler"にサンプラーと乱数シードを指定
study = optuna.create_study(direction='minimize', sampler=CmaEsSampler(seed=RANDOM_STATE))
study.optimize(objective, n_trials=50)

[32m[I 2021-07-08 06:36:21,829][0m A new study created in memory with name: no-name-813537b1-bb41-4745-ac5f-8c8c4fc977ba[0m
[32m[I 2021-07-08 06:36:23,030][0m Trial 0 finished with value: 1.9714684778872467 and parameters: {'iterations': 243, 'depth': 4, 'learning_rate': 0.08629294202140579, 'random_strength': 75, 'bagging_temperature': 0.986343187233007, 'od_type': 'IncToDec', 'od_wait': 41}. Best is trial 0 with value: 1.9714684778872467.[0m
[32m[I 2021-07-08 06:36:25,402][0m Trial 1 finished with value: 2.402267557615796 and parameters: {'iterations': 175, 'depth': 8, 'learning_rate': 0.028966079322940607, 'random_strength': 49, 'bagging_temperature': 0.7370362034389614, 'od_type': 'IncToDec', 'od_wait': 30}. Best is trial 0 with value: 1.9714684778872467.[0m
[32m[I 2021-07-08 06:36:27,044][0m Trial 2 finished with value: 2.72270832175003 and parameters: {'iterations': 175, 'depth': 7, 'learning_rate': 0.015613370417884475, 'random_strength': 50, 'bagging_temperature': 0.

CPU times: user 46 s, sys: 14.4 s, total: 1min
Wall time: 1min 6s


In [9]:
best_params = study.best_params
best_params["random_state"] = RANDOM_STATE
best_params

{'iterations': 175,
 'depth': 6,
 'learning_rate': 0.13067010288294736,
 'random_strength': 50,
 'bagging_temperature': 0.9120335759330601,
 'od_type': 'IncToDec',
 'od_wait': 30,
 'random_state': 10}

In [10]:
# チューニングしたハイパーパラメーターをフィット
optimised_model = cb.CatBoostRegressor(**(best_params))

optimised_model.fit(x_train, y_train)

# CatBoost推論
y_pred = optimised_model.predict(x_test)

0:	learn: 8.8301258	total: 2.43ms	remaining: 422ms
1:	learn: 8.5327045	total: 5.12ms	remaining: 443ms
2:	learn: 8.2697181	total: 7.25ms	remaining: 416ms
3:	learn: 8.0616444	total: 9.64ms	remaining: 412ms
4:	learn: 7.8322588	total: 11.8ms	remaining: 400ms
5:	learn: 7.6807961	total: 14.6ms	remaining: 410ms
6:	learn: 7.4992949	total: 16.9ms	remaining: 406ms
7:	learn: 7.3574938	total: 25.7ms	remaining: 536ms
8:	learn: 7.2153343	total: 27.7ms	remaining: 510ms
9:	learn: 7.0755669	total: 29.7ms	remaining: 490ms
10:	learn: 6.7450340	total: 32.9ms	remaining: 490ms
11:	learn: 6.5124649	total: 35.2ms	remaining: 478ms
12:	learn: 6.4420287	total: 37.9ms	remaining: 472ms
13:	learn: 6.3740807	total: 40.8ms	remaining: 469ms
14:	learn: 6.3369836	total: 43.5ms	remaining: 464ms
15:	learn: 6.0935660	total: 45.8ms	remaining: 455ms
16:	learn: 5.9557708	total: 49.1ms	remaining: 456ms
17:	learn: 5.8544576	total: 51.6ms	remaining: 450ms
18:	learn: 5.8054721	total: 54.7ms	remaining: 449ms
19:	learn: 5.7025170	t

In [11]:
# 評価
def calculate_scores(true, pred):
    """全ての評価指標を計算する

    Parameters
    ----------
    true (np.array)       : 実測値
    pred (np.array)       : 予測値

    Returns
    -------
    scores (pd.DataFrame) : 各評価指標を纏めた結果

    """
    scores = {}
    scores = pd.DataFrame(
        {
            "R2": r2_score(true, pred),
            "MAE": mean_absolute_error(true, pred),
            "MSE": mean_squared_error(true, pred),
            "RMSE": np.sqrt(mean_squared_error(true, pred)),
        },
        index=["scores"],
    )
    return scores

In [12]:
scores = calculate_scores(y_test, y_pred)
print(scores)

              R2     MAE        MSE      RMSE
scores  0.860679  2.6623  14.570245  3.817099
