## ランダムサーチ
- パラメータをランダムに選択し、良さそうなパラメータを探す方法

In [1]:
import numpy
# Seed NumPy's legacy global random generator so the draws below are reproducible.
numpy.random.seed(seed=0)

In [2]:
# Draw and print 20 random (a, b) pairs.
for _ in range(20):
    # a is a float from [0.0, 1.0); b is an integer from 1 to 1000 inclusive
    # (the upper bound of randint is exclusive). The tuple is evaluated left to
    # right, so a is drawn before b on every iteration.
    a, b = numpy.random.random(), numpy.random.randint(1, 1001)
    print(f'a = {a}, b = {b}')

a = 0.5488135039273248, b = 630
a = 0.8442657485810173, b = 764
a = 0.5448831829968969, b = 10
a = 0.6235636967859723, b = 755
a = 0.4375872112626925, b = 71
a = 0.05671297731744318, b = 397
a = 0.3834415188257777, b = 487
a = 0.8121687287754932, b = 175
a = 0.5680445610939323, b = 678
a = 0.8360787635373775, b = 73
a = 0.08712929970154071, b = 116
a = 0.36824153984054797, b = 710
a = 0.7781567509498505, b = 432
a = 0.8700872583584364, b = 100
a = 0.7991585642167236, b = 756
a = 0.5204774795512048, b = 148
a = 0.11827442586893322, b = 289
a = 0.5820197920751071, b = 698
a = 0.9446689170495839, b = 544
a = 0.10590760718779213, b = 152


### データの準備

In [3]:
from pandas import DataFrame
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()

# Keep only the first ten feature columns, plus the class labels.
X = breast_cancer.data[:, :10]
y = breast_cancer.target

# Japanese display names for the ten retained feature columns.
columns = [
    '半径', 'テクスチャ', '周囲の長さ', '面積', 'なめらかさ',
    'コンパクト性', 'へこみ', 'へこみの数', '対称性', 'フラクタル次元',
]

# X already holds exactly ten columns, so it is passed through as-is.
df = DataFrame(X, columns=columns)
# Append the class label as the last column.
df['目的変数'] = y

In [4]:
# Restrict the predictors to two columns, 面積 (area) and へこみ (concavity),
# and take the label column as the target; both become plain NumPy arrays.
X = df.loc[:, ['面積', 'へこみ']].values
y = df.loc[:, '目的変数'].values

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

### ランダムサーチで学習

In [6]:
import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# scikit-learn のランダムサーチを利用するために、RandomizedSearchCV を import
from sklearn.model_selection import RandomizedSearchCV

# パラメータの範囲（パラメータの分布とも呼ばれる）を与えるために、scipy というライブラリから randint を import
from scipy.stats import randint

# Search space for the randomized search, given as scipy.stats distributions.
#
# randint(low, high) does not return a number: it builds a frozen discrete
# uniform distribution over the integers low <= k < high (the upper bound is
# exclusive). Values are drawn from it later via its rvs() method.
#
#   max_depth    : 1 or 2
#   n_estimators : any integer from 10 to 30 inclusive
param_dist = {'max_depth': randint(1, 3), 'n_estimators': randint(10, 31)}

# Configure a randomized hyper-parameter search over a random forest classifier.
rs = RandomizedSearchCV(
    estimator=RandomForestClassifier(criterion='gini', random_state=42),  # model to tune
    param_distributions=param_dist,  # distributions the candidates are drawn from
    n_iter=10,  # number of candidate settings to try
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),  # stratified 10-fold CV
    return_train_score=True,  # also record training-fold scores in cv_results_
    random_state=42,
)

In [7]:
# Run the randomized search on the training split only; the test split is kept for the final check.
rs.fit(X_train, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
                   error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_...
                                                    random_state=42, verbo

### 探索結果の確認

In [8]:
# The best-performing parameter combination is stored in the best_params_ attribute
rs.best_params_

{'max_depth': 2, 'n_estimators': 17}

In [10]:
# Check the score for that setting (here: mean accuracy over stratified 10-fold cross-validation)
rs.best_score_

0.907035175879397

In [11]:
# cv_results_ carries the per-candidate details of the search; turn it into a
# DataFrame, preview the first rows, then show only the sampled parameters
# next to their mean train / test scores.
df_random_result = DataFrame(rs.cv_results_)
display(df_random_result.head())
display(
    df_random_result.loc[
        :,
        ['param_max_depth', 'param_n_estimators', 'mean_train_score', 'mean_test_score'],
    ]
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.018265,0.001302,0.002774,0.0003,1,29,"{'max_depth': 1, 'n_estimators': 29}",0.775,0.85,0.9,...,0.896648,0.891061,0.899441,0.879888,0.893855,0.885475,0.896648,0.894444,0.892238,0.005772
1,0.014755,0.000605,0.002273,8.7e-05,1,24,"{'max_depth': 1, 'n_estimators': 24}",0.775,0.85,0.9,...,0.893855,0.879888,0.888268,0.877095,0.893855,0.882682,0.899441,0.9,0.888883,0.007451
2,0.010435,0.00037,0.001795,0.000283,1,17,"{'max_depth': 1, 'n_estimators': 17}",0.775,0.85,0.9,...,0.893855,0.879888,0.891061,0.877095,0.893855,0.885475,0.899441,0.897222,0.890281,0.007126
3,0.018082,0.000489,0.002727,0.000154,1,30,"{'max_depth': 1, 'n_estimators': 30}",0.775,0.85,0.9,...,0.893855,0.888268,0.891061,0.879888,0.893855,0.885475,0.899441,0.894444,0.890282,0.005203
4,0.016686,0.000188,0.002562,0.000107,1,28,"{'max_depth': 1, 'n_estimators': 28}",0.775,0.85,0.9,...,0.893855,0.888268,0.891061,0.879888,0.893855,0.885475,0.896648,0.894444,0.890003,0.004759


Unnamed: 0,param_max_depth,param_n_estimators,mean_train_score,mean_test_score
0,1,29,0.892238,0.88191
1,1,24,0.888883,0.874372
2,1,17,0.890281,0.869347
3,1,30,0.890282,0.88191
4,1,28,0.890003,0.88191
5,1,20,0.887767,0.871859
6,1,30,0.890282,0.88191
7,2,17,0.921556,0.907035
8,2,12,0.91653,0.899497
9,2,30,0.92435,0.904523


### 最も良かった学習モデルを取り出す

In [14]:
# The best model found by the search is stored in the best_estimator_ attribute
clf = rs.best_estimator_
print(clf)
# Use the score method to compute the accuracy on the held-out test split
clf.score(X_test, y_test)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=17,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


0.9181286549707602