# 機械学習をやってみる

## この章で扱うもの

- 前章で作った各種データを再利用
- 機械学習を行ってみる
- 学習結果の評価を行う
- 機械学習の改善方法


## この章で取り扱う手順

- 学習データとテストデータの分割
- sklearnを用いて学習
- 評価
- 交差検証
- グリッドサーチ
- 各種モデルを試す
- 評価結果の再確認


In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_pickle("df.db")

# 学習データとテストデータの分割

http://qiita.com/terapyon/items/8f8d3518ee8eeb4f96b2

In [4]:
df.head()

Unnamed: 0,報告数,流行,増加,平均気温(℃),最高気温(℃),最低気温(℃),平均湿度(％),最小相対湿度(％),平均現地気圧(hPa),降水量の合計(mm),日照時間(時間),平均風速(m/s)
2014-01-01,0.178571,0,0,9.8,13.7,3.9,54.0,37.0,1005.3,0.0,9.2,5.3
2014-01-02,0.178571,0,0,8.0,12.9,4.4,41.0,26.0,1011.3,0.0,9.1,3.0
2014-01-03,0.178571,0,0,5.9,9.9,2.7,43.0,32.0,1014.9,0.0,4.1,1.6
2014-01-04,0.178571,0,0,6.7,11.5,2.1,47.0,29.0,1009.5,0.0,5.9,2.4
2014-01-05,0.178571,0,0,4.4,6.9,2.3,40.0,28.0,1016.6,0.0,1.1,2.5


In [5]:
X = df.iloc[:, 3:]

In [6]:
X.head()

Unnamed: 0,平均気温(℃),最高気温(℃),最低気温(℃),平均湿度(％),最小相対湿度(％),平均現地気圧(hPa),降水量の合計(mm),日照時間(時間),平均風速(m/s)
2014-01-01,9.8,13.7,3.9,54.0,37.0,1005.3,0.0,9.2,5.3
2014-01-02,8.0,12.9,4.4,41.0,26.0,1011.3,0.0,9.1,3.0
2014-01-03,5.9,9.9,2.7,43.0,32.0,1014.9,0.0,4.1,1.6
2014-01-04,6.7,11.5,2.1,47.0,29.0,1009.5,0.0,5.9,2.4
2014-01-05,4.4,6.9,2.3,40.0,28.0,1016.6,0.0,1.1,2.5


In [7]:
y = df['流行']

In [8]:
y.head()

2014-01-01    0
2014-01-02    0
2014-01-03    0
2014-01-04    0
2014-01-05    0
Name: 流行, dtype: int32

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

# ロジスティック回帰

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
clf = LogisticRegression()

In [13]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
y_train_pred = clf.predict(X_train)

## 正答率を確認

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
accuracy_score(y_train, y_train_pred)

0.86986301369863017

In [17]:
y_val_pred = clf.predict(X_val)

In [18]:
accuracy_score(y_val, y_val_pred)

0.8954545454545455

## 混同行列

In [19]:
from sklearn.metrics import confusion_matrix

In [20]:
cm = confusion_matrix(y_val, y_val_pred)

In [21]:
cm

array([[184,   9],
       [ 14,  13]])

In [22]:
cm_t = confusion_matrix(y_train, y_train_pred)
cm_t

array([[664,  50],
       [ 64,  98]])

### 混同行列の評価

```
[[TN, FP],
 [FN, TP]]
 ```

# 適合率(precision)・再現率(recall)・F値(f1-score)

## 適合率
 
- 0(今回の場合は、流行していない) と予測したもののうち、実際に 0 だった割合 (184 / (184+14) = 0.93)
- 1(今回の場合は、流行している) と予測したもののうち、実際に 1 だった割合 (13 / (13+9) = 0.59)
 
## 再現率
 
- 実際に 0(流行していない) であるもののうち、0 と正しく予測できた割合 (184 / (184+9) = 0.95)
- 実際に 1(流行している) であるもののうち、1 と正しく予測できた割合 (13 / (13+14) = 0.48)
 
 
## F値
 
`2 / (1/適合率 + 1/再現率) = 2 * 適合率 * 再現率 / (適合率 + 再現率)`
 
- 0のF値 `2 * 0.93 * 0.95 / (0.93 + 0.95) = 0.94`
- 1のF値 `2 * 0.59 * 0.48 / (0.59 + 0.48) = 0.53`

In [23]:
from sklearn.metrics import classification_report

In [24]:
print(classification_report(y_val, y_val_pred))

             precision    recall  f1-score   support

          0       0.93      0.95      0.94       193
          1       0.59      0.48      0.53        27

avg / total       0.89      0.90      0.89       220



## レポート関係を関数化し再利用可能にする

In [25]:
def report(y, pred):
    """Print the accuracy, confusion matrix and classification report.

    Each metric is computed and printed in turn, in that order, using the
    ``sklearn.metrics`` functions imported earlier in the notebook.

    Args:
        y: Ground-truth labels.
        pred: Predicted labels to compare against ``y``.
    """
    # Compute-then-print one metric at a time so the output order stays
    # accuracy -> confusion matrix -> per-class precision/recall/F1 table.
    for metric in (accuracy_score, confusion_matrix, classification_report):
        print(metric(y, pred))

In [26]:
report(y_train, y_train_pred)

0.869863013699
[[664  50]
 [ 64  98]]
             precision    recall  f1-score   support

          0       0.91      0.93      0.92       714
          1       0.66      0.60      0.63       162

avg / total       0.87      0.87      0.87       876



## 学習から評価までを関数化


In [27]:
def fit_to_pred(clf, X_train, X_val, y_train, y_val):
    """Fit ``clf`` on the training split and print reports for both splits.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Features used for fitting and for the first report.
        X_val: Held-out features used for the second report.
        y_train: Labels matching ``X_train``.
        y_val: Labels matching ``X_val``.

    Returns:
        The same ``clf`` object, after it has been fitted.
    """
    clf.fit(X_train, y_train)

    # Evaluate on the training split first, then on the held-out split;
    # each pass predicts, prints its header line, and prints the report.
    splits = (
        ("y_train_pred: ", X_train, y_train),
        ("y_val_pred: ", X_val, y_val),
    )
    for header, features, truth in splits:
        predicted = clf.predict(features)
        print(header)
        report(truth, predicted)

    return clf

In [28]:
clf = LogisticRegression()
fit_to_pred(clf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.869863013699
[[664  50]
 [ 64  98]]
             precision    recall  f1-score   support

          0       0.91      0.93      0.92       714
          1       0.66      0.60      0.63       162

avg / total       0.87      0.87      0.87       876

y_val_pred: 
0.895454545455
[[184   9]
 [ 14  13]]
             precision    recall  f1-score   support

          0       0.93      0.95      0.94       193
          1       0.59      0.48      0.53        27

avg / total       0.89      0.90      0.89       220



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# その他の機械学習アルゴリズム

- サポートベクターマシン SVC
- カーネルSVM
- 決定木 DecisionTreeClassifier
- ランダムフォレスト RandomForestClassifier
- k近傍

In [29]:
from sklearn.svm import SVC

In [30]:
svc = SVC(kernel="linear")
fit_to_pred(svc, X_train, X_val, y_train, y_val)

y_train_pred: 
0.876712328767
[[658  56]
 [ 52 110]]
             precision    recall  f1-score   support

          0       0.93      0.92      0.92       714
          1       0.66      0.68      0.67       162

avg / total       0.88      0.88      0.88       876

y_val_pred: 
0.890909090909
[[182  11]
 [ 13  14]]
             precision    recall  f1-score   support

          0       0.93      0.94      0.94       193
          1       0.56      0.52      0.54        27

avg / total       0.89      0.89      0.89       220



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [31]:
k_svc = SVC(kernel="rbf")
fit_to_pred(k_svc, X_train, X_val, y_train, y_val)

y_train_pred: 
0.996575342466
[[714   0]
 [  3 159]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       714
          1       1.00      0.98      0.99       162

avg / total       1.00      1.00      1.00       876

y_val_pred: 
0.877272727273
[[193   0]
 [ 27   0]]
             precision    recall  f1-score   support

          0       0.88      1.00      0.93       193
          1       0.00      0.00      0.00        27

avg / total       0.77      0.88      0.82       220



  'precision', 'predicted', average, warn_for)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
tree = DecisionTreeClassifier(max_depth=2)
fit_to_pred(tree, X_train, X_val, y_train, y_val)

y_train_pred: 
0.875570776256
[[653  61]
 [ 48 114]]
             precision    recall  f1-score   support

          0       0.93      0.91      0.92       714
          1       0.65      0.70      0.68       162

avg / total       0.88      0.88      0.88       876

y_val_pred: 
0.877272727273
[[181  12]
 [ 15  12]]
             precision    recall  f1-score   support

          0       0.92      0.94      0.93       193
          1       0.50      0.44      0.47        27

avg / total       0.87      0.88      0.87       220



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
rf = RandomForestClassifier()
fit_to_pred(rf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.988584474886
[[712   2]
 [  8 154]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       714
          1       0.99      0.95      0.97       162

avg / total       0.99      0.99      0.99       876

y_val_pred: 
0.877272727273
[[182  11]
 [ 16  11]]
             precision    recall  f1-score   support

          0       0.92      0.94      0.93       193
          1       0.50      0.41      0.45        27

avg / total       0.87      0.88      0.87       220



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
knn = KNeighborsClassifier()
fit_to_pred(knn, X_train, X_val, y_train, y_val)

y_train_pred: 
0.891552511416
[[673  41]
 [ 54 108]]
             precision    recall  f1-score   support

          0       0.93      0.94      0.93       714
          1       0.72      0.67      0.69       162

avg / total       0.89      0.89      0.89       876

y_val_pred: 
0.863636363636
[[182  11]
 [ 19   8]]
             precision    recall  f1-score   support

          0       0.91      0.94      0.92       193
          1       0.42      0.30      0.35        27

avg / total       0.85      0.86      0.85       220



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# 交差検証(クロスバリデーション)

In [38]:
from sklearn.model_selection import cross_val_score

In [39]:
from sklearn.model_selection import KFold

In [40]:
cv = KFold(5, shuffle=True)

In [41]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv)

array([ 0.80454545,  0.89954338,  0.88584475,  0.86757991,  0.89041096])

In [42]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv)

array([ 0.85909091,  0.80821918,  0.82191781,  0.83105023,  0.84018265])

In [43]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv)

array([ 0.83181818,  0.85388128,  0.84018265,  0.87214612,  0.8173516 ])

## F1-score で評価

In [44]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv, scoring="f1")

array([ 0.59459459,  0.66666667,  0.65060241,  0.55882353,  0.58333333])

In [45]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv, scoring="f1")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


array([ 0.        ,  0.08695652,  0.0625    ,  0.        ,  0.06060606])

In [46]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv, scoring="f1")

array([ 0.47457627,  0.49122807,  0.54320988,  0.52173913,  0.47887324])

# グリッドサーチ

In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
param_grid = {'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]}

In [49]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)

In [50]:
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1)

In [51]:
grid_search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    4.7s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [52]:
grid_search.best_score_

0.87226277372262773

In [53]:
grid_search.best_params_

{'max_depth': 2, 'n_estimators': 40}

In [54]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    5.0s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=1)

In [55]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.633797541829
{'max_depth': 3, 'n_estimators': 10}


In [56]:
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
cross_val_score(rf, X, y, cv=cv, scoring="f1")

array([ 0.65789474,  0.54054054,  0.69230769,  0.61290323,  0.64516129])

# 最終確認

In [57]:
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
fit_to_pred(rf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.883561643836
[[657  57]
 [ 45 117]]
             precision    recall  f1-score   support

          0       0.94      0.92      0.93       714
          1       0.67      0.72      0.70       162

avg / total       0.89      0.88      0.89       876

y_val_pred: 
0.881818181818
[[182  11]
 [ 15  12]]
             precision    recall  f1-score   support

          0       0.92      0.94      0.93       193
          1       0.52      0.44      0.48        27

avg / total       0.87      0.88      0.88       220



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [58]:
rf.predict(X_val)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [59]:
from sklearn.externals import joblib

In [60]:
joblib.dump(rf, "clf_rf.db")

['clf_rf.db']