## 9.1 アヤメのデータ

In [None]:
import statsmodels.api as sm
# Fetch the iris data set from R's `datasets` package through statsmodels
# and keep only its DataFrame payload.
iris_dataset = sm.datasets.get_rdataset('iris', 'datasets')
my_data = iris_dataset.data
# Preview the first rows.
my_data.head()

In [None]:
# Summary statistics of the columns of my_data.
my_data.describe()
# (remaining output omitted)

## 9.2 木による分類

In [None]:
import graphviz
import pandas as pd
import statsmodels.api as sm
from sklearn import tree

# Load iris; the first four columns are the features, Species is the label.
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X = my_data.iloc[:, :4]
y = my_data['Species']

# Decision tree limited to depth 2, with a fixed seed.
my_model = tree.DecisionTreeClassifier(random_state=0, max_depth=2)
my_model.fit(X, y)

In [None]:
# Rendering options for the DOT export of the fitted tree.
export_options = dict(
    out_file=None,                  # return the DOT source instead of writing a file
    feature_names=X.columns,        # variable names
    class_names=my_model.classes_,  # category names
    filled=True,                    # colour the nodes
)
my_dot = tree.export_graphviz(my_model, **export_options)
graphviz.Source(my_dot)

In [None]:
# Two new flowers to classify. The frame carries the same column names the
# model was fitted with; a frame with default integer column labels makes
# scikit-learn warn that X "does not have valid feature names".
my_test = pd.DataFrame([[5.0, 3.5, 1.5, 0.5],
                        [6.5, 3.0, 5.0, 2.0]],
                       columns=X.columns)
# Predicted class label for each row.
my_model.predict(my_test)

In [None]:
# Per-class probabilities for the test rows, one column per class label.
my_proba = my_model.predict_proba(my_test)
pd.DataFrame(data=my_proba, columns=my_model.classes_)

## 9.3 正解率

In [None]:
import graphviz
import pandas as pd
import statsmodels.api as sm
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV, LeaveOneOut

# Load iris; the first four columns are the features, Species is the label.
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X = my_data.iloc[:, :4]
y = my_data['Species']

# Fit a depth-2 tree and predict the data it was trained on.
my_model = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
my_model.fit(X, y)
y_ = my_model.predict(X)

# Confusion matrix of true labels against predicted labels.
confusion_matrix(y, y_)

In [None]:
# Score of the model on the data it was trained on.
my_model.score(X, y)
# Alternatively: the fraction of predictions that match the labels.
y_ = my_model.predict(X)
(y == y_).mean()


In [None]:
# Leave-one-out cross-validation of my_model; the mean of the per-fold scores.
cross_val_score(my_model, X, y, cv=LeaveOneOut()).mean()

In [None]:
# Candidate tree depths 1..10, each evaluated with leave-one-out CV.
depth_grid = {'max_depth': range(1, 11)}
my_search = GridSearchCV(tree.DecisionTreeClassifier(random_state=0),
                         depth_grid,
                         cv=LeaveOneOut(),
                         n_jobs=-1)
my_search.fit(X, y)
my_search.best_params_, my_search.best_score_

In [None]:
# Hyper-parameter grid searched exhaustively with leave-one-out CV.
my_params = dict(
    max_depth=range(2, 6),
    min_samples_split=[2, 20],
    min_samples_leaf=range(1, 8))

base_tree = tree.DecisionTreeClassifier(min_impurity_decrease=0.01,
                                        random_state=0)
my_search = GridSearchCV(estimator=base_tree,
                         param_grid=my_params,
                         cv=LeaveOneOut(),
                         n_jobs=-1)
my_search.fit(X, y)
my_search.best_params_, my_search.best_score_

# One row per parameter combination, with its mean validation score.
tmp = my_search.cv_results_
my_results = pd.DataFrame(tmp['params'])
my_results['Accuracy'] = tmp['mean_test_score']
# Rows that reach the maximum validation accuracy.
is_best = my_results['Accuracy'].eq(my_results['Accuracy'].max())
my_results[is_best]

In [None]:
# Draw the tree that the grid search refitted with the best parameters.
my_model = my_search.best_estimator_
my_dot = tree.export_graphviz(
    my_model,
    out_file=None,                  # return the DOT source as a string
    filled=True,                    # colour the nodes
    class_names=my_model.classes_,  # category names
    feature_names=X.columns)        # variable names
graphviz.Source(my_dot)

## 9.4 複数の木を使う方法

In [None]:
import pandas as pd
import statsmodels.api as sm
import warnings
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.preprocessing import LabelEncoder

# Load iris; the first four columns are the features, Species is the label.
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species
# Replace the species names with integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Tune max_features of a random forest with leave-one-out CV.
# random_state is fixed so that reruns give the same result, as is done for
# the decision trees elsewhere in this notebook.
my_search = GridSearchCV(RandomForestClassifier(random_state=0),
                         param_grid={'max_features': [2, 3, 4]},
                         cv=LeaveOneOut(),
                         n_jobs=-1).fit(X, y)
my_search.best_params_

# Mean validation score for each candidate value of max_features.
my_search.cv_results_['mean_test_score']

In [None]:
# Grid-search XGBoost hyper-parameters with 5-fold cross-validation.
# Warnings are silenced only inside the `with` block: catch_warnings()
# restores the previous filter state on exit, even if the search raises,
# instead of permanently changing the global filters.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    my_search = GridSearchCV(
        xgboost.XGBClassifier(eval_metric='mlogloss'),
        param_grid={'n_estimators'    : [50, 100, 150],
                    'max_depth'       : [1, 2, 3],
                    'learning_rate'   : [0.3, 0.4],
                    'gamma'           : [0],
                    'colsample_bytree': [0.6, 0.8],
                    'min_child_weight': [1],
                    'subsample'       : [0.5, 0.75, 1]},
        cv=5, # 5-fold cross-validation
        n_jobs=1).fit(X, y) # deliberately n_jobs=1, not -1

my_search.best_params_

my_search.best_score_

In [None]:
# Fit a random forest on all the data and plot how much each feature
# contributes. random_state is fixed so the plotted importances are the same
# on every run.
my_model = RandomForestClassifier(random_state=0).fit(X, y)
tmp = pd.Series(my_model.feature_importances_, index=X.columns)
# Horizontal bars, sorted in ascending order of importance.
tmp.sort_values().plot(kind='barh')

## 9.5 欠損のあるデータでの学習

In [None]:
import numpy as np
import statsmodels.api as sm
import warnings
import xgboost
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.pipeline import Pipeline

my_data = sm.datasets.get_rdataset('iris', 'datasets').data

# Knock out one value in every ten rows of two columns:
# row positions 0, 10, 20, ... of Petal.Length and 1, 11, 21, ... of Petal.Width.
# The masks are built from row positions, so the assignment does not depend on
# the index labels being 0..n-1 (as the per-element my_data[col][i] lookup did).
n = len(my_data)
row_pos = np.arange(n)
my_data.loc[row_pos % 10 == 0, 'Petal.Length'] = np.nan
my_data.loc[row_pos % 10 == 1, 'Petal.Width'] = np.nan

my_data.describe() # variables whose count is 135 have 150-135=15 missing values.
# (remaining output omitted)

# The first four columns are the features, Species is the label.
X, y = my_data.iloc[:, 0:4], my_data.Species

In [None]:
# Step 1: fill missing values with the column median.
imputer_step = ('imputer', SimpleImputer(strategy='median'))
# Step 2: decision tree with a fixed seed.
tree_step = ('tree', tree.DecisionTreeClassifier(random_state=0))
my_pipeline = Pipeline(steps=[imputer_step, tree_step])

# Leave-one-out cross-validation of the whole pipeline.
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(), n_jobs=-1)
my_scores.mean()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the species names with integer codes before handing them to XGBoost.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 5-fold cross-validation of XGBoost on X as it is (missing values included,
# no imputation step). Warnings are silenced only inside the `with` block;
# catch_warnings() restores the previous filter state on exit, even on error.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    my_scores = cross_val_score(
        xgboost.XGBClassifier(eval_metric='mlogloss'), X, y, cv=5)

my_scores.mean()

## 9.6 他の分類手法

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# Load iris; the first four columns are the features, Species is the label.
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X = my_data.iloc[:, :4]
y = my_data['Species']

# k-nearest-neighbours with default settings, scored by leave-one-out CV.
knn = KNeighborsClassifier()
my_scores = cross_val_score(estimator=knn, X=X, y=y, cv=LeaveOneOut())
my_scores.mean()

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load iris; the first four columns are the features, Species is the label.
my_data = sm.datasets.get_rdataset('iris', 'datasets').data
X, y = my_data.iloc[:, 0:4], my_data.Species

# random_state is fixed so the network's random initialisation, and therefore
# the reported score, is the same on every run.
my_pipeline = Pipeline([('sc',  StandardScaler()),              # standardisation
                        ('mlp', MLPClassifier(max_iter=1000,    # neural network
                                              random_state=0))])
# Leave-one-out cross-validation of the whole pipeline.
my_scores = cross_val_score(my_pipeline, X, y, cv=LeaveOneOut(), n_jobs=-1)
my_scores.mean()