## グリッドサーチ
- 探索するパラメータの候補値をすべて組み合わせ、総当たりで調べる手法

In [36]:
# Candidate values for two example parameters, used below to illustrate grid search.
param_a = [step / 5 for step in range(6)]  # 0.0, 0.2, 0.4, 0.6, 0.8, 1.0
param_b = [10 ** exponent for exponent in range(4)]  # 1, 10, 100, 1000

In [37]:
# Enumerate every combination: param_a varies slowest, param_b fastest.
for a, b in [(va, vb) for va in param_a for vb in param_b]:
    print(f'a = {a}, b = {b}')

a = 0.0, b = 1
a = 0.0, b = 10
a = 0.0, b = 100
a = 0.0, b = 1000
a = 0.2, b = 1
a = 0.2, b = 10
a = 0.2, b = 100
a = 0.2, b = 1000
a = 0.4, b = 1
a = 0.4, b = 10
a = 0.4, b = 100
a = 0.4, b = 1000
a = 0.6, b = 1
a = 0.6, b = 10
a = 0.6, b = 100
a = 0.6, b = 1000
a = 0.8, b = 1
a = 0.8, b = 10
a = 0.8, b = 100
a = 0.8, b = 1000
a = 1.0, b = 1
a = 1.0, b = 10
a = 1.0, b = 100
a = 1.0, b = 1000


### データの準備

In [38]:
from pandas import DataFrame
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset bundled with scikit-learn.
breast_cancer = load_breast_cancer()

# Keep only the first 10 feature columns; the target vector is used as-is.
X = breast_cancer.data[:, :10]
y = breast_cancer.target

# Display names (Japanese) for the 10 retained feature columns.
columns = [
    '半径',
    'テクスチャ',
    '周囲の長さ',
    '面積',
    'なめらかさ',
    'コンパクト性',
    'へこみ',
    'へこみの数',
    '対称性',
    'フラクタル次元',
]

# X already holds exactly 10 columns, so it is passed to the DataFrame directly.
df = DataFrame(data=X, columns=columns)
df['目的変数'] = y

In [48]:
# Narrow the features down to two columns and extract features/target as arrays.
X = df.loc[:, ['面積', 'へこみ']].values
y = df.loc[:, '目的変数'].values

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

### グリッドサーチで学習

In [41]:
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [42]:
# Import GridSearchCV to use scikit-learn's grid search.
# "CV" stands for cross validation: each parameter combination is evaluated
# by cross-validation during the search.
from sklearn.model_selection import GridSearchCV

# Parameter ranges to search:
#   max_depth    is 1 or 2
#   n_estimators takes the integers from 10 to 30 in steps of 5
param_grid = {
    'max_depth': [1, 2],
    'n_estimators': [10, 15, 20, 25, 30],
}

gs = GridSearchCV(
    estimator=RandomForestClassifier(criterion='gini', random_state=42),  # random forest
    param_grid=param_grid,
    scoring='accuracy',
    # Use StratifiedKFold for the cross-validation splits.
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    # Also record training-fold scores so they can be compared with test scores.
    return_train_score=True,
)

In [43]:
# Run the grid search on the training split.
gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=F

### 探索結果の確認

In [44]:
# The best-performing parameter combination is stored in the best_params_ attribute
gs.best_params_

{'max_depth': 2, 'n_estimators': 20}

In [45]:
# Check the score for that combination (here, the mean accuracy from stratified 10-fold cross-validation)
gs.best_score_

0.9095477386934674

In [46]:
# The cv_results_ attribute holds the detailed search results; convert it to
# a DataFrame so it can be displayed as a table.
df_grid_result = DataFrame(gs.cv_results_)
# First rows with every recorded column.
display(df_grid_result.head())
# Compact view: the searched parameters next to the mean train/test scores.
display(
    df_grid_result.loc[
        :,
        ['param_max_depth', 'param_n_estimators', 'mean_train_score', 'mean_test_score'],
    ]
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.009473,0.003762,0.001552,0.000226,1,10,"{'max_depth': 1, 'n_estimators': 10}",0.775,0.85,0.875,...,0.891061,0.879888,0.891061,0.874302,0.902235,0.882682,0.874302,0.902778,0.886926,0.009623
1,0.011095,0.000756,0.001964,0.000392,1,15,"{'max_depth': 1, 'n_estimators': 15}",0.775,0.85,0.9,...,0.893855,0.879888,0.891061,0.879888,0.893855,0.885475,0.899441,0.897222,0.89056,0.006642
2,0.01301,0.000908,0.002038,0.000112,1,20,"{'max_depth': 1, 'n_estimators': 20}",0.775,0.85,0.9,...,0.893855,0.882682,0.888268,0.879888,0.891061,0.882682,0.891061,0.897222,0.887767,0.00537
3,0.016676,0.000543,0.002641,0.000336,1,25,"{'max_depth': 1, 'n_estimators': 25}",0.775,0.85,0.9,...,0.899441,0.879888,0.891061,0.879888,0.893855,0.882682,0.896648,0.897222,0.89084,0.00738
4,0.018541,0.000735,0.002851,0.000308,1,30,"{'max_depth': 1, 'n_estimators': 30}",0.775,0.85,0.9,...,0.893855,0.888268,0.891061,0.879888,0.893855,0.885475,0.899441,0.894444,0.890282,0.005203


Unnamed: 0,param_max_depth,param_n_estimators,mean_train_score,mean_test_score
0,1,10,0.886926,0.871859
1,1,15,0.89056,0.869347
2,1,20,0.887767,0.871859
3,1,25,0.89084,0.876884
4,1,30,0.890282,0.88191
5,2,10,0.918484,0.894472
6,2,15,0.919603,0.907035
7,2,20,0.923785,0.909548
8,2,25,0.923509,0.904523
9,2,30,0.92435,0.904523


### 最も良かった学習モデルを取り出す

In [47]:
# The best model found by the search is stored in the best_estimator_ attribute
clf = gs.best_estimator_
print(clf)
# Compute the accuracy on the test data using the score method
clf.score(X_test, y_test)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


0.9298245614035088