### パラメータチューニング（グリッドサーチ）

### データの準備

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset bundled with scikit-learn.
dataset = load_breast_cancer()

# Features become a DataFrame with named columns; the target becomes a Series named 'y'.
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.Series(data=dataset.target, name='y')

# Preview the first rows with the target joined on as the last column.
display(X.join(y).head(n=5))

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


### チューニングに使用するパイプライン準備

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# PCA (Principal Component Analysis): feature extraction by projecting the data onto the directions of greatest variance
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# Pipeline definition: standardize -> PCA -> logistic regression.
# PCA's n_components and the classifier's C are not set here, so they start
# at their library defaults.
pipe_logistic = Pipeline(
    steps=[
        # Standardize the features.
        ('scl', StandardScaler()),
        # Feature extraction with PCA.
        ('pca', PCA(random_state=1)),
        # Classifier; solver='lbfgs' selects the L-BFGS (quasi-Newton) optimizer.
        ('est', LogisticRegression(solver='lbfgs', random_state=1)),
    ]
)

### パラメータチューニングの準備

In [13]:
# Candidate values to search over; keys use the "<pipeline step>__<parameter>" form.
param_grid_logistic = {
    # Number of dimensions PCA reduces the data to.
    'pca__n_components': [5, 7, 9],
    # Inverse regularization strength of the logistic regression
    # (smaller C means a stronger penalty).
    'est__C': [0.1, 1.0, 10.0, 100.0],
}

### グリッドサーチの実行

In [19]:
# Echo the search space before starting the search.
print(f'探索空間:{param_grid_logistic}')

# Build the grid search: the first argument is the estimator to tune,
# the second is the parameter grid to explore.
gs = GridSearchCV(
    pipe_logistic,
    param_grid_logistic,
    # Metric used to score each parameter combination.
    scoring='f1',
    # Number of cross-validation folds.
    cv=3,
    # Do not record scores on the training folds.
    return_train_score=False,
)

# Run the search; as the last expression, the fitted object's repr is shown.
gs.fit(X, y)

探索空間:{'pca__n_components': [5, 7, 9], 'est__C': [0.1, 1.0, 10.0, 100.0]}


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scl',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None, random_state=1,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('est',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                 

### ベストモデルで予測

In [24]:
# Show the predicted labels and class probabilities for the first 10 rows.
# NOTE: X is the same data the search was fitted on, so these are in-sample predictions.
print(
    gs.predict(X)[:10],
    gs.predict_proba(X)[:10],
    sep='\n',
)

[0 0 0 0 0 0 0 0 0 0]
[[9.99999991e-01 9.16949126e-09]
 [9.99858219e-01 1.41781006e-04]
 [9.99999722e-01 2.78322042e-07]
 [9.98062858e-01 1.93714165e-03]
 [9.99903186e-01 9.68143032e-05]
 [8.92844735e-01 1.07155265e-01]
 [9.99929334e-01 7.06662552e-05]
 [9.80585010e-01 1.94149897e-02]
 [9.98459255e-01 1.54074505e-03]
 [9.99355274e-01 6.44725734e-04]]


### グリッドサーチの探索結果確認

In [26]:
# Tabulate the per-candidate cross-validation results and show the first 5 rows.
pd.DataFrame(data=gs.cv_results_).head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_est__C,param_pca__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.016323,0.00431,0.006866,0.002708,0.1,5,"{'est__C': 0.1, 'pca__n_components': 5}",0.979424,0.97541,0.978903,0.97791,0.001783,5
1,0.006722,0.001077,0.001819,0.000191,0.1,7,"{'est__C': 0.1, 'pca__n_components': 7}",0.983471,0.97541,0.983051,0.98064,0.003707,2
2,0.006878,0.000612,0.001965,0.000256,0.1,9,"{'est__C': 0.1, 'pca__n_components': 9}",0.983471,0.971429,0.983193,0.979358,0.005615,4
3,0.006494,0.00043,0.002264,0.000414,1.0,5,"{'est__C': 1.0, 'pca__n_components': 5}",0.983333,0.983471,0.974359,0.980398,0.00426,3
4,0.007075,0.00045,0.001781,0.000138,1.0,7,"{'est__C': 1.0, 'pca__n_components': 7}",0.983333,0.971193,0.974359,0.976299,0.005146,6
