## モデルの比較をしてみた

In [1]:
# データの準備
import pandas as pd
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()
print(dataset.keys())
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.Series(dataset.target, name='y')

# データの確認
print('---------------------------------------')
print(f'X shape: {X.shape}')
print('---------------------------------------')
print(y.value_counts())
print('---------------------------------------')
print('y=0 means Marignant(悪性), y=1 means Benign(良性):')
print('---------------------------------------')
X.join(y).head()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
---------------------------------------
X shape: (569, 30)
---------------------------------------
1    357
0    212
Name: y, dtype: int64
---------------------------------------
y=0 means Marignant(悪性), y=1 means Benign(良性):
---------------------------------------


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


### パイプラインを使用し各モデルを一気に比較
- ols  :Ordinary Least Squares（最小2乗回帰）
- ridge:Ridge Regression（リッジ回帰）
- Logistic:Logistic Regression（ロジスティック回帰）
- knn  :KNeighborsClassifier（K近傍法）
- rsvc :Support Vector Machine（サポートベクターマシン）
- lsvc :Linear Support Vector Machine（）
- tree :Decision Tree（決定木）
- rf   :Random Forest（ランダムフォレスト）
- gbr1 :Gradient Boosting（勾配ブースティングr1）
- gbr2 :Gradient Boosting(勾配ブースティングr2)
- mlp  :multilayer perceptron（多層パーセプトロン）

In [2]:
import numpy as np
# 前処理
from sklearn.preprocessing import StandardScaler
# モデル
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression #線形回帰とリッジ回帰とロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier#K近傍法
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier #決定木
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #ランダムフォレストと勾配ブースティング
from sklearn.neural_network import MLPClassifier
# 分割と評価
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

# ホールドアウト法
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# パイプラインディキショナリーにセット
pipelines = {
    'ols': Pipeline([('scl', StandardScaler()),
                     ('est', LinearRegression())]),
    'ridge': Pipeline([('scl', StandardScaler()),
                     ('est', Ridge(random_state=0))]),
    'Logistic': Pipeline([('scl', StandardScaler()),
                     ('est', LogisticRegression(random_state=0))]),
    'knn': Pipeline([('scl', StandardScaler()),
                     ('est', KNeighborsClassifier())]),
    'rsvc': Pipeline([('scl', StandardScaler()),
                     ('est', SVC(C=1.0, kernel='rbf', class_weight='balanced', random_state=0))]),
    'lsvc': Pipeline([('scl', StandardScaler()),
                     ('est', LinearSVC(C=1.0, class_weight='balanced', random_state=0))]),
    'tree': Pipeline([('scl', StandardScaler()),
                     ('est', DecisionTreeClassifier(random_state=0))]),
    'rf': Pipeline([('scl', StandardScaler()),
                     ('est', RandomForestClassifier(random_state=0))]),
    'gbr1': Pipeline([('scl', StandardScaler()),
                     ('est', GradientBoostingClassifier(n_estimators=100, random_state=0))]),
    'gbr2': Pipeline([('scl', StandardScaler()),
                     ('est', GradientBoostingClassifier(n_estimators=250, random_state=0))]),
    'mlp': Pipeline([('scl', StandardScaler()),
                     ('est', MLPClassifier(hidden_layer_sizes=(3,3), max_iter=1000, random_state=0))])
}

In [3]:
# 学習と評価
accuracy_scores = {}

# まずはホールドアウト方で正解率を計算してみる
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    if pipeline.predict(X_train).dtype != 'float64':    
        accuracy_scores[(pipe_name, 'accuracy:train')] = accuracy_score(y_train, pipeline.predict(X_train))
        accuracy_scores[(pipe_name, 'accuracy:test')] = accuracy_score(y_test, pipeline.predict(X_test))
    else:
#         回帰の場合完全一致は難しいので計算できない
        accuracy_scores[(pipe_name, 'accuracy:train')] = '回帰'
        accuracy_scores[(pipe_name, 'accuracy:test')] = '回帰'

pd.Series(accuracy_scores).unstack()



Unnamed: 0,accuracy:test,accuracy:train
Logistic,0.982456,0.991209
gbr1,0.964912,1
gbr2,0.964912,1
knn,0.95614,0.982418
lsvc,0.973684,0.986813
mlp,0.964912,0.995604
ols,回帰,回帰
rf,0.95614,0.997802
ridge,回帰,回帰
rsvc,0.973684,0.982418


In [4]:
# ホールド・アウト法による評価
pd.Series(accuracy_scores).unstack()

# Cross_varidationによる評価をしてみる
# パイプラインは同じものを使用できる
cross_val_scores = {}
for pipe_name, pipeline in pipelines.items():
#     estでLinerRegressionを指定,Xは訓練データの指定（dfで入れたばかりのやつ）,yも左記と同じ,cvはデータの分割回数, scoringもしてくれる
    cv_results = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
    print('------------------')
    print('algorithm:', pipe_name)
#     分割した一個一個の結果
    print('cv_results:', cv_results)
#     平均と標準偏差
    print('avg +- std_dev', cv_results.mean(), '+-', cv_results.std())
# リッジ回帰2が一番良いとわかる

------------------
algorithm: ols
cv_results: [0.62359509 0.69896145 0.7559333  0.77302059 0.67192043]
avg +- std_dev 0.7046861734644287 +- 0.05473205568276265
------------------
algorithm: ridge
cv_results: [0.64100171 0.69807302 0.75274316 0.78308758 0.67094702]
avg +- std_dev 0.7091704994037773 +- 0.05214112447666963
------------------
algorithm: Logistic
cv_results: [0.92571059 0.88856589 0.88631791 0.88631791 0.96210597]
avg +- std_dev 0.9098036539651344 +- 0.030142216440528075
------------------
algorithm: knn
cv_results: [0.85142119 0.81427649 0.92421194 0.81052985 0.84842388]
avg +- std_dev 0.8497726670098107 +- 0.04085515847786738




------------------
algorithm: rsvc
cv_results: [0.85142119 0.85142119 0.96210597 0.84842388 0.92421194]
avg +- std_dev 0.8875168322597082 +- 0.04699812828103602
------------------
algorithm: lsvc
cv_results: [0.77713178 0.81427649 0.88631791 0.88631791 0.92421194]
avg +- std_dev 0.8576512043839262 +- 0.05373757432137294
------------------
algorithm: tree
cv_results: [0.59140827 0.66569767 0.62105969 0.77263581 0.58316566]
avg +- std_dev 0.6467934220308933 +- 0.06924147703970085
------------------
algorithm: rf
cv_results: [0.66569767 0.81427649 0.88631791 0.84842388 0.84842388]
avg +- std_dev 0.8126279641674335 +- 0.07692090756374717
------------------
algorithm: gbr1
cv_results: [0.70284238 0.73998708 0.88631791 0.92421194 0.92421194]
avg +- std_dev 0.8355142482803799 +- 0.09491347368267773
------------------
algorithm: gbr2
cv_results: [0.77713178 0.66569767 0.88631791 0.96210597 0.92421194]
avg +- std_dev 0.8430930544507355 +- 0.10810434338435487




------------------
algorithm: mlp
cv_results: [0.81427649 0.88856589 0.84842388 0.92421194 0.96210597]
avg +- std_dev 0.887516832259708 +- 0.0525425127075221


