# Chapter3-2 機械学習
## 3-2-1 scikit-learn

In [None]:
import sklearn

## 3-2-2 データセットの準備

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast cancer dataset via scikit-learn's load_breast_cancer().
# NOTE(review): "brest" is a misspelling of "breast"; the name is kept because later cells reference it.
brest_cancer = load_breast_cancer()

In [None]:
# Inspect the type of the object returned by load_breast_cancer()
type(brest_cancer)

In [None]:
# Pull the 'data' entry (inputs, x) and the 'target' entry (labels, t) out of the dataset
x, t = brest_cancer['data'], brest_cancer['target']

In [None]:
# Check the container type and the dimensions of the inputs
type(x), x.shape

In [None]:
# Check the container type and the dimensions of the labels
type(t), t.shape

In [None]:
# Show the first 3 rows
x[:3]

In [None]:
# Display the label array
t

In [None]:
# Number of samples carrying label 0 and label 1, in that order
tuple(len(t[t == label]) for label in (0, 1))

In [None]:
import numpy as np
# Swap the labels: samples labelled 0 become 1, everything else becomes 0.
# (In scikit-learn's breast cancer dataset 0 = malignant and 1 = benign, so this
# makes malignant the positive class for precision/recall.)
# The flip is derived from the original dataset labels rather than from `t` itself,
# so re-running this cell does not flip the labels back.
t = np.where(brest_cancer['target'] == 0, 1, 0)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Histogram of the label values
plt.hist(t, bins=3)

### 3-2-2-1 訓練データとテストデータ

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Random split that holds out 20% of the samples as the test set (fixed seed)
x_train, x_test, t_train, t_test = train_test_split(
    x,
    t,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Shapes of the training and test input arrays
x_train.shape, x_test.shape

In [None]:
# Shapes of the training and test label arrays
t_train.shape, t_test.shape

## 3-2-3 モデルの訓練

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Instantiate the model as clf
clf = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the model parameters on the training data
clf.fit(x_train, t_train)

In [None]:
# Compute predictions for the test data (used during validation)
y_test = clf.predict(x_test)

## 3-2-4 モデルの検証

### 3-2-4-1 正解率

In [None]:
# Accuracy on the test data
clf.score(x_test, t_test)

In [None]:
# Accuracy on the training data
clf.score(x_train, t_train)

### 3-2-4-2 混同行列

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the true test labels against the predictions
matrix = confusion_matrix(y_true=t_test, y_pred=y_test)
matrix

In [None]:
import seaborn as sns

In [None]:
# Show the confusion matrix as a heatmap.
# fmt='d' prints the integer counts as-is; the default '.2g' would render 100+ as 1e+02.
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Prediction')
# The semicolon belongs on the cell's last expression: it suppresses the Text(...) repr.
plt.ylabel('Target');

### 3-2-4-3 適合率

In [None]:
from sklearn.metrics import precision_score

In [None]:
# Precision of the test-set predictions
precision_score(t_test, y_test)

### 3-2-4-4 再現率

In [None]:
from sklearn.metrics import recall_score

In [None]:
# Recall of the test-set predictions
recall_score(t_test, y_test)

### 3-2-4-5 F 値

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 score of the test-set predictions
f1_score(t_test, y_test)

## 3-2-5 再現率を高めるための工夫

In [None]:
# Number of samples carrying label 0 and label 1, in that order
tuple(len(t[t == label]) for label in (0, 1))

In [None]:
# Class weights inversely proportional to class frequency (rarer class -> larger weight).
# The frequencies are taken from the training labels only, so that the held-out
# test labels do not influence how the model is configured.
weight = {
    label: len(t_train) / len(t_train[t_train == label])
    for label in (0, 1)
}

In [None]:
# Inspect the computed class weights
weight

In [None]:
# Re-create the classifier, this time passing the class weights
clf = DecisionTreeClassifier(random_state=0, class_weight=weight)

In [None]:
# Train the model
clf.fit(x_train, t_train)

In [None]:
# Accuracy on the test data
clf.score(x_test, t_test)

In [None]:
# Compute the predictions for the test data
y_test = clf.predict(x_test)

In [None]:
# Check precision (precision_score) and recall (recall_score)
precision_score(t_test, y_test), recall_score(t_test, y_test)

### 3-2-5-2 ハイパーパラメータのチューニング

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyperparameter values to search over
params = dict(
    max_depth=list(range(2, 10)),
    criterion=["gini", "entropy"],
)

In [None]:
# If scoring is omitted, accuracy is used; here recall is requested instead
# cv is the number of cross-validation folds
clf_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=0, class_weight=weight),
    params,
    cv=5,
    scoring='recall',
)

In [None]:
# Train with every candidate hyperparameter combination
clf_grid.fit(x_train, t_train)

In [None]:
# The best-performing hyperparameter combination
clf_grid.best_params_

In [None]:
# Take over the trained model that has the best hyperparameters
clf = clf_grid.best_estimator_
clf

In [None]:
# Accuracy on the test data
clf.score(x_test, t_test)

In [None]:
# Compute the predictions for the test data
y_test = clf.predict(x_test)

In [None]:
# Precision, recall and F1 score
precision_score(t_test, y_test), recall_score(t_test, y_test), f1_score(t_test, y_test)

In [None]:
# Confusion matrix of the true test labels against the predictions
matrix = confusion_matrix(y_true=t_test, y_pred=y_test)
matrix

In [None]:
# Visualize the confusion matrix as a heatmap.
# fmt='d' prints the integer counts as-is; the default '.2g' would render 100+ as 1e+02.
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Prediction')
# The semicolon belongs on the cell's last expression: it suppresses the Text(...) repr.
plt.ylabel('Target');

## 3-2-6 訓練済みモデルの保存

In [None]:
import joblib

In [None]:
# Save the model under the file name brest_cancer.pkl
# NOTE(review): "brest" looks like a typo for "breast"; the load cell uses the same file name, so rename both together.
joblib.dump(clf, 'brest_cancer.pkl')

In [None]:
# Load the saved model back from disk
# NOTE: joblib files are pickle-based; only load files that come from a trusted source.
clf_load = joblib.load('brest_cancer.pkl')

In [None]:
# Inference using the loaded trained model
clf_load.predict(x_test)