# Prepare

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): none of the visible cells read from the mounted drive —
# confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the breast-cancer dataset that ships with scikit-learn.
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()

In [4]:
# Split the loaded dataset into the feature matrix (x) and the target vector (t).
x, t = dataset.data, dataset.target

In [5]:
# Sanity check: features and targets must have the same number of rows.
x.shape, t.shape

((569, 30), (569,))

# Data Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the remaining 80% is kept
# together as train+validation data. The seed is fixed for reproducibility.
x_train_val, x_test, t_train_val, t_test = train_test_split(
    x, t,
    test_size=0.2,
    random_state=1,
)

In [7]:
# Split train+validation again: validation : train = 30 : 70.
x_train, x_val, t_train, t_val = train_test_split(
    x_train_val, t_train_val,
    test_size=0.3,
    random_state=1,
)

In [8]:
# Confirm the sizes of the train / validation / test partitions.
x_train.shape, x_val.shape, x_test.shape

((318, 30), (137, 30), (114, 30))

# 手動調整

In [9]:
# Baseline decision tree: no hyperparameters set apart from a fixed seed.
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=0)

In [10]:
# Train the baseline tree on the training split only.
dtree.fit(x_train, t_train)

In [11]:
# Compare the score on the training split with the score on the validation
# split; a large gap between the two suggests overfitting.
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  1.0
validation score :  0.927007299270073


In [12]:
# Define the model with manually chosen hyperparameters: the depth is capped
# and a node needs at least 30 samples before it may be split.
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=30,
    random_state=0,
)

# Train on the training split.
dtree.fit(x_train, t_train)

In [13]:
# Score the manually tuned tree on the training and validation splits.
print('train score : ', dtree.score(x_train, t_train))
print('validation score : ', dtree.score(x_val, t_val))

train score :  0.9308176100628931
validation score :  0.9562043795620438


In [14]:
# Score the manually tuned tree on the held-out test split.
# NOTE(review): the same test split is scored again after each search below;
# picking a method based on these numbers would leak test information into
# model selection — confirm this is for illustration only.
print('test score :', dtree.score(x_test, t_test))

test score : 0.9298245614035088


# グリッドサーチ

In [15]:
# GridSearchCV クラスのインポート
from sklearn.model_selection import GridSearchCV

In [16]:
# Define the algorithm used for training.
estimator = DecisionTreeClassifier(random_state=0)

In [17]:
# Hyperparameters to search and the candidate values for each one.
param_grid = [
    {
        'max_depth': [3, 20, 50],
        'min_samples_split': [3, 20, 30],
    },
]

In [18]:
# Number of folds the data is split into for cross-validation.
cv = 5

In [19]:
# Define the grid-search model: each combination in param_grid is evaluated
# with cv-fold cross-validation. Training-fold scores are not recorded.
tuned_model = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=cv,
    return_train_score=False,
)

In [20]:
# Train and validate: the search runs on the combined train+validation data.
tuned_model.fit(x_train_val, t_train_val)

In [21]:
# Inspect the validation results (transposed: one column per parameter combination).
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.0048,0.003733,0.003816,0.004474,0.004517,0.004437,0.004821,0.004419,0.00539
std_fit_time,0.001349,0.000099,0.000205,0.000234,0.000153,0.000379,0.000427,0.00021,0.000577
mean_score_time,0.000603,0.000464,0.000477,0.000489,0.000482,0.000462,0.0005,0.000468,0.000654
std_score_time,0.000137,0.000021,0.000029,0.000013,0.00005,0.000022,0.00003,0.000014,0.000126
param_max_depth,3,3,3,20,20,20,50,50,50
param_min_samples_split,3,20,30,3,20,30,3,20,30
params,"{'max_depth': 3, 'min_samples_split': 3}","{'max_depth': 3, 'min_samples_split': 20}","{'max_depth': 3, 'min_samples_split': 30}","{'max_depth': 20, 'min_samples_split': 3}","{'max_depth': 20, 'min_samples_split': 20}","{'max_depth': 20, 'min_samples_split': 30}","{'max_depth': 50, 'min_samples_split': 3}","{'max_depth': 50, 'min_samples_split': 20}","{'max_depth': 50, 'min_samples_split': 30}"
split0_test_score,0.923077,0.912088,0.912088,0.956044,0.912088,0.912088,0.956044,0.912088,0.912088
split1_test_score,0.901099,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.934066,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [22]:
# Second, narrower grid search.
cv = 5
estimator = DecisionTreeClassifier(random_state=0)
param_grid = [
    {
        'max_depth': [5, 10, 15],
        'min_samples_split': [10, 12, 15],
    },
]

In [23]:
# Define the grid-search model over the narrowed grid.
tuned_model = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=cv,
    return_train_score=False,
)

# Train the model on the combined train+validation data.
tuned_model.fit(x_train_val, t_train_val)

In [24]:
# Inspect the search results (transposed: one column per parameter combination).
pd.DataFrame(tuned_model.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.005303,0.004581,0.004408,0.004726,0.004543,0.004691,0.004517,0.00449,0.00452
std_fit_time,0.001581,0.000214,0.000261,0.000637,0.000225,0.000483,0.000164,0.0002,0.000219
mean_score_time,0.000519,0.000491,0.000493,0.000541,0.000492,0.000517,0.000493,0.000493,0.000524
std_score_time,0.000066,0.000027,0.000033,0.000103,0.000018,0.000028,0.000026,0.00004,0.000074
param_max_depth,5,5,5,10,10,10,15,15,15
param_min_samples_split,10,12,15,10,12,15,10,12,15
params,"{'max_depth': 5, 'min_samples_split': 10}","{'max_depth': 5, 'min_samples_split': 12}","{'max_depth': 5, 'min_samples_split': 15}","{'max_depth': 10, 'min_samples_split': 10}","{'max_depth': 10, 'min_samples_split': 12}","{'max_depth': 10, 'min_samples_split': 15}","{'max_depth': 15, 'min_samples_split': 10}","{'max_depth': 15, 'min_samples_split': 12}","{'max_depth': 15, 'min_samples_split': 15}"
split0_test_score,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088
split1_test_score,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [25]:
# Show the hyperparameters that achieved the best score in the search.
tuned_model.best_params_

{'max_depth': 5, 'min_samples_split': 10}

In [26]:
# Take over the best model found by the search.
best_model = tuned_model.best_estimator_

# Evaluate it: first on the data used for the search, then on the held-out test split.
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


# ランダムサーチ

ランダムサーチは、同じ試行回数でもグリッドサーチより広い範囲を探索できるため、良いハイパーパラメータを見つけやすいとされる。
まずランダムサーチで探索範囲を絞り込み、その後グリッドサーチで細かく探索すると良い。

In [27]:
# RandomizedSearchCV クラスのインポート
from sklearn.model_selection import RandomizedSearchCV

In [28]:
# Algorithm used for training.
estimator = DecisionTreeClassifier(random_state=0)

In [29]:
# Quick check of what range(start, stop, step) yields (stop is exclusive).
list(range(1, 10, 2))

[1, 3, 5, 7, 9]

In [30]:
# Candidate values the randomized search samples from:
# odd depths 5..99 and every min_samples_split from 2 to 49.
param_distributions = {
    'max_depth': [*range(5, 100, 2)],
    'min_samples_split': [*range(2, 50)],
}

In [31]:
# Number of parameter settings to try in the randomized search.
n_iter = 100

In [32]:
# Number of cross-validation folds.
cv = 5

In [33]:
# Define the randomized-search model: n_iter settings are drawn from
# param_distributions and each is evaluated with cv-fold cross-validation.
# The seed is fixed so that the sampled settings are reproducible.
tuned_model = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    n_iter=n_iter,
    cv=cv,
    return_train_score=False,
    random_state=0,
)

In [34]:
# Train and validate on the combined train+validation data.
tuned_model.fit(x_train_val, t_train_val)

In [35]:
# Inspect the search results, best-ranked setting first (transposed for readability).
pd.DataFrame(tuned_model.cv_results_).sort_values('rank_test_score').T

Unnamed: 0,47,77,82,90,42,19,28,12,11,62,...,40,41,98,50,55,58,60,67,31,99
mean_fit_time,0.011841,0.009509,0.006535,0.006845,0.022843,0.011811,0.01858,0.01331,0.016439,0.006582,...,0.016444,0.01813,0.006485,0.013074,0.006323,0.00598,0.006226,0.006504,0.014219,0.007626
std_fit_time,0.002993,0.002987,0.000277,0.000646,0.004437,0.007271,0.010488,0.005889,0.011425,0.000763,...,0.003955,0.003233,0.000419,0.004056,0.000404,0.000312,0.000548,0.000434,0.003784,0.001497
mean_score_time,0.000872,0.002336,0.000874,0.000865,0.003655,0.001989,0.001136,0.001983,0.00281,0.000763,...,0.003128,0.002274,0.000985,0.003581,0.000855,0.000746,0.000804,0.000913,0.001182,0.00153
std_score_time,0.000075,0.002856,0.000045,0.00005,0.004315,0.002424,0.000459,0.001315,0.003114,0.00011,...,0.002796,0.002851,0.000128,0.003449,0.000059,0.00008,0.000035,0.000056,0.000836,0.000638
param_min_samples_split,10,10,4,4,7,9,11,2,8,7,...,49,31,45,27,43,36,36,47,44,39
param_max_depth,23,65,95,39,15,37,7,87,29,7,...,87,23,19,99,27,27,47,75,95,87
params,"{'min_samples_split': 10, 'max_depth': 23}","{'min_samples_split': 10, 'max_depth': 65}","{'min_samples_split': 4, 'max_depth': 95}","{'min_samples_split': 4, 'max_depth': 39}","{'min_samples_split': 7, 'max_depth': 15}","{'min_samples_split': 9, 'max_depth': 37}","{'min_samples_split': 11, 'max_depth': 7}","{'min_samples_split': 2, 'max_depth': 87}","{'min_samples_split': 8, 'max_depth': 29}","{'min_samples_split': 7, 'max_depth': 7}",...,"{'min_samples_split': 49, 'max_depth': 87}","{'min_samples_split': 31, 'max_depth': 23}","{'min_samples_split': 45, 'max_depth': 19}","{'min_samples_split': 27, 'max_depth': 99}","{'min_samples_split': 43, 'max_depth': 27}","{'min_samples_split': 36, 'max_depth': 27}","{'min_samples_split': 36, 'max_depth': 47}","{'min_samples_split': 47, 'max_depth': 75}","{'min_samples_split': 44, 'max_depth': 95}","{'min_samples_split': 39, 'max_depth': 87}"
split0_test_score,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.967033,0.956044,0.967033,0.967033,...,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088
split1_test_score,0.912088,0.912088,0.912088,0.912088,0.912088,0.912088,0.901099,0.912088,0.912088,0.912088,...,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099,0.901099
split2_test_score,0.923077,0.923077,0.912088,0.912088,0.912088,0.912088,0.923077,0.923077,0.912088,0.912088,...,0.945055,0.934066,0.945055,0.934066,0.945055,0.945055,0.945055,0.945055,0.945055,0.945055


In [36]:
# Show the hyperparameters that achieved the best score in the search.
tuned_model.best_params_

{'min_samples_split': 10, 'max_depth': 23}

In [37]:
# Take over the best model found by the search.
best_model = tuned_model.best_estimator_

In [38]:
# Evaluate the model: first on the data used for the search, then on the held-out test split.
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193


# ベイズ最適化

In [39]:
# optuna のインストール
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.3-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.3 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [40]:
import optuna

In [41]:
from sklearn.model_selection import cross_val_score

In [42]:
def objective(trial, x, t, cv, random_state=0):
    """Optuna objective: mean cross-validated score of a decision tree.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x: Feature matrix forwarded to ``cross_val_score``.
        t: Target labels forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        random_state: Seed for the decision tree. Defaults to 0, the value
            used by the other estimators in this notebook, so that the same
            hyperparameters always produce the same score.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # 1. Search range for each hyperparameter (bounds are inclusive).
    max_depth = trial.suggest_int('max_depth', 2, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)

    # 2. Algorithm used for training. The seed is fixed: an unseeded tree can
    #    differ from run to run, which would make trial values irreproducible.
    estimator = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )

    # 3. Run cross-validation and report the parameters being evaluated.
    print('Current_params : ', trial.params)
    accuracy = cross_val_score(estimator, x, t, cv=cv).mean()
    return accuracy

In [43]:
# Create the study object; the objective value is maximized.
# NOTE(review): RandomSampler draws every trial independently at random, so
# this is random search rather than Bayesian optimization. A model-based
# sampler such as optuna.samplers.TPESampler(seed=0) would match the section
# title — confirm which one is intended.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=0),
)

[I 2024-05-13 03:37:16,382] A new study created in memory with name: no-name-26efb845-6c40-4810-b3e5-b27a93b519e5


In [44]:
# K for K-fold cross-validation.
cv = 5
# Optimize the objective over 10 trials. The lambda supplies the data and cv,
# so Optuna only has to pass in the trial.
study.optimize(lambda trial: objective(trial, x_train_val, t_train_val, cv), n_trials=10)

# Show the best trial of the study.
print(study.best_trial)

[I 2024-05-13 03:38:14,255] Trial 0 finished with value: 0.9208791208791209 and parameters: {'max_depth': 56, 'min_samples_split': 72}. Best is trial 0 with value: 0.9208791208791209.
[I 2024-05-13 03:38:14,329] Trial 1 finished with value: 0.9208791208791209 and parameters: {'max_depth': 61, 'min_samples_split': 55}. Best is trial 0 with value: 0.9208791208791209.
[I 2024-05-13 03:38:14,373] Trial 2 finished with value: 0.9208791208791209 and parameters: {'max_depth': 43, 'min_samples_split': 65}. Best is trial 0 with value: 0.9208791208791209.


Current_params :  {'max_depth': 56, 'min_samples_split': 72}
Current_params :  {'max_depth': 61, 'min_samples_split': 55}
Current_params :  {'max_depth': 43, 'min_samples_split': 65}
Current_params :  {'max_depth': 45, 'min_samples_split': 90}


[I 2024-05-13 03:38:14,442] Trial 3 finished with value: 0.9208791208791209 and parameters: {'max_depth': 45, 'min_samples_split': 90}. Best is trial 0 with value: 0.9208791208791209.
[I 2024-05-13 03:38:14,477] Trial 4 finished with value: 0.9208791208791209 and parameters: {'max_depth': 97, 'min_samples_split': 39}. Best is trial 0 with value: 0.9208791208791209.
[I 2024-05-13 03:38:14,507] Trial 5 finished with value: 0.9208791208791209 and parameters: {'max_depth': 80, 'min_samples_split': 54}. Best is trial 0 with value: 0.9208791208791209.
[I 2024-05-13 03:38:14,536] Trial 6 finished with value: 0.9186813186813187 and parameters: {'max_depth': 58, 'min_samples_split': 93}. Best is trial 0 with value: 0.9208791208791209.
[I 2024-05-13 03:38:14,568] Trial 7 finished with value: 0.945054945054945 and parameters: {'max_depth': 9, 'min_samples_split': 10}. Best is trial 7 with value: 0.945054945054945.
[I 2024-05-13 03:38:14,600] Trial 8 finished with value: 0.9208791208791209 and par

Current_params :  {'max_depth': 97, 'min_samples_split': 39}
Current_params :  {'max_depth': 80, 'min_samples_split': 54}
Current_params :  {'max_depth': 58, 'min_samples_split': 93}
Current_params :  {'max_depth': 9, 'min_samples_split': 10}
Current_params :  {'max_depth': 4, 'min_samples_split': 84}
Current_params :  {'max_depth': 79, 'min_samples_split': 88}
FrozenTrial(number=7, state=TrialState.COMPLETE, values=[0.945054945054945], datetime_start=datetime.datetime(2024, 5, 13, 3, 38, 14, 538054), datetime_complete=datetime.datetime(2024, 5, 13, 3, 38, 14, 568113), params={'max_depth': 9, 'min_samples_split': 10}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=100, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=100, log=False, low=2, step=1)}, trial_id=7, value=None)


In [45]:
# Show the hyperparameters of the best trial.
study.best_params

{'max_depth': 9, 'min_samples_split': 10}

In [46]:
# Define a model with the best hyperparameters found by the study.
# random_state is fixed, as for the estimators in the grid and randomized
# searches, so that the refit tree and the scores below are reproducible.
best_model = DecisionTreeClassifier(**study.best_params, random_state=0)

# Train on the combined train+validation data.
best_model.fit(x_train_val, t_train_val)

# Evaluate: first on the data used for training, then on the held-out test split.
print(best_model.score(x_train_val, t_train_val))
print(best_model.score(x_test, t_test))

0.9934065934065934
0.956140350877193
