In [1]:
# coding: UTF-8

import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from stacking import StackingClassifier

def _report_macro_scores(label, y_true, y_pred):
    """Print ``label`` followed by macro-averaged precision, recall and F1.

    Args:
        label: Text printed on its own line before the score line.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
    """
    print(label)
    # The fourth returned element (support) is not part of the report.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro")
    print("p:{0:.4f} r:{1:.4f} f1:{2:.4f}".format(precision, recall, f1))


def _fit_and_report(label, clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print its test-split scores."""
    clf.fit(X_train, y_train)
    _report_macro_scores(label, y_test, clf.predict(X_test))


def main(random_state=None):
    """Compare base classifiers, voting ensembles and stacking on noisy digits.

    Uniform noise scaled by 15 is added to the digits data, 80% of the
    samples are held out for testing, and every model is fitted on the
    remaining 20%.  For each model a label line and a
    ``p:... r:... f1:...`` line (macro averages) are printed.

    Args:
        random_state: Optional int seed.  When ``None`` (the default) every
            call is made exactly as before, so the run depends on the global
            NumPy RNG state.  When an int is given it seeds the noise, the
            train/test split and every estimator that is passed a
            ``random_state``.  NOTE(review): ``StackingClassifier`` comes
            from the local ``stacking`` module, which is not visible here;
            any randomness inside it is not controlled by this argument.
    """
    # Empty when unseeded so that constructors receive exactly the original
    # arguments in the default case.
    seeded = {} if random_state is None else {"random_state": random_state}

    digits = load_digits()
    if random_state is None:
        uniform = np.random.random  # global RNG, as in the original code
    else:
        uniform = np.random.RandomState(random_state).random_sample
    noised_data = digits.data + uniform(digits.data.shape) * 15

    X_train, X_test, y_train, y_test = train_test_split(
        noised_data, digits.target, test_size=0.8, **seeded)

    estimators = [
        ("svm", SVC(C=5, gamma=0.001, probability=True, **seeded)),
        ("lr", LogisticRegression(**seeded)),
        ("knn", KNN(n_jobs=-1)),
        ("nb", GNB()),
        ("rfc", RFC(n_estimators=500, n_jobs=-1, **seeded)),
        ("bgg", BaggingClassifier(n_estimators=300, n_jobs=-1, **seeded)),
        ("mlp", MLPClassifier(hidden_layer_sizes=(40, 20), max_iter=1000,
                              **seeded)),
        ("xgb", XGBClassifier(n_estimators=300, n_jobs=-1, **seeded)),
    ]

    # Each base estimator on its own.
    for name, clf in estimators:
        _fit_and_report(name, clf, X_train, X_test, y_train, y_test)

    # Hard (majority) and soft (probability-averaged) voting ensembles.
    for voting in ("hard", "soft"):
        voter = VotingClassifier(estimators, voting=voting)
        _fit_and_report("{0} voting".format(voting), voter,
                        X_train, X_test, y_train, y_test)

    # Only the part from here on was newly added: stacking.
    stacker = StackingClassifier(
        estimators, RFC(n_estimators=2000, n_jobs=-1, **seeded))
    _fit_and_report("stacking", stacker, X_train, X_test, y_train, y_test)


if __name__ == "__main__":
    main()

svm
p:0.8866 r:0.8847 f1:0.8841
lr
p:0.7364 r:0.7325 f1:0.7337
knn
p:0.8419 r:0.8402 f1:0.8381
nb
p:0.8467 r:0.8437 f1:0.8438
rfc
p:0.8525 r:0.8516 f1:0.8499
bgg
p:0.7675 r:0.7599 f1:0.7571
mlp
p:0.7015 r:0.6976 f1:0.6966
hard voting
p:0.8759 r:0.8758 f1:0.8748
soft voting
p:0.8741 r:0.8740 f1:0.8732
stacking
p:0.8970 r:0.8962 f1:0.8957


In [19]:
# Load the digits data and add uniform random noise scaled by 15 to it.
digits = load_digits()
noised_data = digits.data + 15 * np.random.random(digits.data.shape)

# test_size=0.8: most of the data is held out, so models train on 20%.
X_train, X_test, y_train, y_test = train_test_split(
    noised_data,
    digits.target,
    test_size=0.8,
)

# Base classifiers compared individually and reused by the ensembles below.
svm = SVC(C=5, gamma=0.001, probability=True)
lr = LogisticRegression()
knn = KNN(n_jobs=-1)
nb = GNB()
rfc = RFC(n_jobs=-1, n_estimators=500)
bgg = BaggingClassifier(n_jobs=-1, n_estimators=300)
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(40, 20))
xgb = XGBClassifier(n_jobs=-1, n_estimators=300)

# (name, estimator) pairs, in the order they are evaluated and reported.
estimators = [
    ("svm", svm),
    ("lr", lr),
    ("knn", knn),
    ("nb", nb),
    ("rfc", rfc),
    ("bgg", bgg),
    ("mlp", mlp),
    ("xgb", xgb),
]
estimators

[('svm', SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)),
 ('lr',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('knn',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
             weights='uniform')),
 ('nb', GaussianNB(priors=None)),
 ('rfc',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight

In [20]:
# Fit every base classifier on the training split and print its
# macro-averaged precision / recall / F1 on the held-out split.
for name, clf in estimators:
    preds = clf.fit(X_train, y_train).predict(X_test)
    print(name)
    # The score call returns four values; the format string uses the first
    # three (precision, recall, F1) and ignores the last.
    print(
        "p:{0:.4f} r:{1:.4f} f1:{2:.4f}".format(
            *precision_recall_fscore_support(y_test, preds, average="macro")
        )
    )

svm
p:0.8602 r:0.8449 f1:0.8441
lr
p:0.7413 r:0.7360 f1:0.7356
knn
p:0.8547 r:0.8522 f1:0.8500
nb
p:0.8173 r:0.8119 f1:0.8117
rfc
p:0.8650 r:0.8633 f1:0.8617
bgg
p:0.8016 r:0.8002 f1:0.8002
mlp
p:0.7239 r:0.7162 f1:0.7162
xgb
p:0.7865 r:0.7819 f1:0.7813


In [21]:
# Evaluate a VotingClassifier over the same estimators in both voting modes.
# NOTE(review): despite its name, `vc_hard` also holds the soft-voting model
# on the second iteration; the name is kept because other cells may read it.
for v in ("hard", "soft"):
    vc_hard = VotingClassifier(estimators, voting=v)
    preds = vc_hard.fit(X_train, y_train).predict(X_test)
    print(v, "voting")
    print(
        "p:{0:.4f} r:{1:.4f} f1:{2:.4f}".format(
            *precision_recall_fscore_support(y_test, preds, average="macro")
        )
    )

hard voting
p:0.8757 r:0.8743 f1:0.8728
soft voting
p:0.8688 r:0.8679 f1:0.8668


In [22]:
# Stacking: the estimators plus a 2000-tree random forest are handed to
# StackingClassifier (local `stacking` module; the forest is presumably the
# meta-learner -- TODO confirm against that module).
stcl = StackingClassifier(estimators, RFC(n_jobs=-1, n_estimators=2000))
stcl.fit(X_train, y_train)
preds = stcl.predict(X_test)
print("stacking")
print(
    "p:{0:.4f} r:{1:.4f} f1:{2:.4f}".format(
        *precision_recall_fscore_support(y_test, preds, average="macro")
    )
)

stacking
p:0.8813 r:0.8796 f1:0.8787
