### ランダムフォレスト
分類・回帰で使用することができる。  
各ツリーの予測の多数決が結果となる。

#### ディシジョンツリー
教師あり学習アルゴリズム　　
もしなになにだったらという質問を繰り返し、最終な予測を導き出す。

#### アンサンブル学習
弱学習器を複数用意して、その結果を結合することで予測精度を高める  
代表的な手法として、バギングとブースティングがある   
バギング・・・独立に弱学習器を作って多数決をとる  
ブースティング・・・一つずつ弱学習器を作成していく。先に作った弱学習器の弱点を補うように次の弱学習器を作成する　　

In [1]:
import numpy as np
import pandas as pd
np.random.seed(1)

from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

df_X = pd.DataFrame(data=data.data, columns=data.feature_names,)
df_X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [2]:
df_Y = pd.DataFrame(data=data.target)
df_Y = df_Y.rename(columns={0: "判定（0: 悪性/ 1: 良性）"})
df_Y.head()

Unnamed: 0,判定（0: 悪性/ 1: 良性）
0,0
1,0
2,0
3,0
4,0


In [3]:
X = df_X.values
Y = df_Y.values.ravel()

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)


In [4]:
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
# n_estimators=決定木の数、乱数の種の設定
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(X_train, Y_train)

RandomForestClassifier(n_estimators=10, random_state=0)

In [5]:
from sklearn.metrics import confusion_matrix
Y_pred = clf.predict(X_test)
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

array([[ 57,   6],
       [  7, 101]], dtype=int64)

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

print('正解率：{}'.format(accuracy_score(y_true=Y_test, y_pred=Y_pred)))
print('適合率：{}'.format(precision_score(y_true=Y_test, y_pred=Y_pred)))
print('再現率：{}'.format(recall_score(y_true=Y_test, y_pred=Y_pred)))
print('F1スコア：{}'.format(f1_score(y_true=Y_test, y_pred=Y_pred)))

正解率：0.9239766081871345
適合率：0.9439252336448598
再現率：0.9351851851851852
F1スコア：0.9395348837209302


In [8]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [11]:
# グリッドサーチでパラメータを探す

# グリッドサーチでパラメータを取得
from sklearn.model_selection import GridSearchCV

# サーチするハイパーパラメータのグリッド
param_grid = {
    'max_depth':[1,2,3],
    'n_estimators':[5,10,20],
    'max_features':[1,3,10]
}
# モデルsvdに作成したグリッド、交差検証戦略は5分化交差検証
grid_search = GridSearchCV(clf, param_grid, cv=5)

# 訓練用と評価用データに分ける
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# データを用いてモデルを構築
# cv=5なので訓練用と評価用データが5分割され、交差検証される
grid_search.fit(X_train, Y_train)
# 正解率の出力
grid_search.score(X_test, Y_test)

0.9440559440559441

In [12]:
grid_search.best_params_

{'max_depth': 3, 'max_features': 10, 'n_estimators': 10}