In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import Image
%matplotlib inline
# Render axis labels and tick marks in white, then apply the seaborn grid
# style/context followed by matplotlib's dark background style.
custom_style = dict.fromkeys(
    ('axes.labelcolor', 'xtick.color', 'ytick.color'),
    'white',
)
sns.set_style("darkgrid", rc=custom_style)
sns.set_context("notebook")
plt.style.use('dark_background')

In [3]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

In [109]:
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """Ensemble classifier that combines several classifiers by voting.

    Parameters
    ----------
    classifiers : list
        Estimators to combine. ``fit`` clones each one and fits the clone;
        the objects passed in are not fitted themselves.
    vote : {'classlabel', 'probability'}, default='classlabel'
        'classlabel' predicts the (weighted) most frequent label among the
        members' predictions. 'probability' predicts the argmax of the
        (weighted) average of the members' ``predict_proba`` outputs.
    weights : sequence of numbers or None, default=None
        One weight per classifier. ``None`` weights all members equally.
    """

    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        # Name -> estimator mapping (names come from sklearn's
        # _name_estimators); get_params uses it to expose member parameters.
        self.name_classifiers = {key: value for key, value
                                 in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """Fit a clone of every member classifier on (X, y).

        Raises
        ------
        ValueError
            If ``vote`` is not 'probability' or 'classlabel', or if
            ``weights`` is given with a length different from the number of
            classifiers.

        Returns
        -------
        self
        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'; "
                             "got (vote=%r)" % (self.vote,))
        if (self.weights is not None
                and len(self.weights) != len(self.classifiers)):
            raise ValueError('Number of classifiers and weights must be '
                             'equal; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Encode the class labels so that they start at 0; predict relies on
        # this because it maps np.argmax indices back to labels.
        self.labelenc_ = LabelEncoder()
        self.labelenc_.fit(y)
        self.classes_ = self.labelenc_.classes_
        encoded_y = self.labelenc_.transform(y)
        self.classifiers_ = [clone(clf).fit(X, encoded_y)
                             for clf in self.classifiers]
        return self

    def predict(self, X):
        """Predict class labels for X using the configured voting scheme.

        Returns the predictions expressed in the original label values
        (the internal 0-based encoding is inverted before returning).
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:
            # After the transpose each row holds the encoded predictions of
            # all fitted classifiers for one sample.
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T
            # Per row: (weighted) count of votes for each encoded label, then
            # pick the label index with the largest total.
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map the winning encoded indices back to the original class labels.
        return self.labelenc_.inverse_transform(maj_vote)

    def predict_proba(self, X):
        """Return the (weighted) average of the members' predict_proba(X)."""
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        # Axis 0 indexes the classifiers, so averaging over it combines the
        # members while keeping the per-sample, per-class layout.
        return np.average(probas, axis=0, weights=self.weights)

    def get_params(self, deep=True):
        """Return estimator parameters.

        With ``deep=False`` this is the standard BaseEstimator result. With
        ``deep=True`` the result maps each member name to its estimator and,
        additionally, every member parameter under the key
        ``'<member name>__<parameter>'`` (double underscore, the separator
        scikit-learn's ``set_params`` splits on).
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        out = self.name_classifiers.copy()
        for name, step in self.name_classifiers.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out

In [91]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [92]:
iris = datasets.load_iris()
# Skip the first 50 rows and keep only feature columns 1 and 2. Per the
# scikit-learn iris documentation this should leave the versicolor/virginica
# samples with sepal width and petal length -- verify if the dataset changes.
X = iris.data[50:, [1, 2]]
y = iris.target[50:]

In [93]:
# Re-encode the remaining class labels as consecutive integers from 0.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [94]:
# NOTE(review): `y_a` is not defined in any visible cell, so this line raises
# NameError on a fresh kernel (Restart & Run All). It looks like a leftover
# sanity check comparing `y` against another label array -- define `y_a`
# explicitly or drop this cell. TODO confirm the intended definition.
y_a - y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [95]:
# Stratified 50/50 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=1,
    stratify=y,
)

In [96]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [97]:
# Three base learners that are later combined into the voting ensemble.
clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and KNN are wrapped with a StandardScaler; the decision
# tree (clf2) is used on the unscaled features.
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']

# '\n' (backslash-n) prints the intended blank line after the heading; the
# previous text used a yen sign, which was printed literally.
print('10-fold cross validation: \n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    # 10-fold cross-validated ROC AUC on the training split only.
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print('ROC AUC: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), label))

10-fold cross validation: ¥n
ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]


In [193]:
# Combine the three base models into one majority-vote ensemble.
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
# Add the ensemble's label only once, so re-running this cell does not keep
# appending duplicates to clf_labels.
if 'Majority voting' not in clf_labels:
    clf_labels += ['Majority voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]


In [194]:
# 10-fold cross-validated ROC AUC of the voting ensemble on the training split.
score = cross_val_score(mv_clf, X_train, y_train, scoring='roc_auc', cv=10)

[[[0.50594038 0.49405962]
  [0.49977475 0.50022525]
  [0.49419935 0.50580065]
  [0.49671018 0.50328982]
  [0.50738072 0.49261928]
  [0.50088188 0.49911812]]

 [[0.91304348 0.08695652]
  [0.04761905 0.95238095]
  [0.04761905 0.95238095]
  [0.04761905 0.95238095]
  [0.91304348 0.08695652]
  [0.91304348 0.08695652]]

 [[1.         0.        ]
  [0.         1.        ]
  [0.         1.        ]
  [0.         1.        ]
  [1.         0.        ]
  [1.         0.        ]]]
[[[0.49612677 0.50387323]
  [0.49913759 0.50086241]
  [0.50379832 0.49620168]
  [0.50290337 0.49709663]
  [0.50756363 0.49243637]
  [0.5015891  0.4984109 ]]

 [[0.04761905 0.95238095]
  [0.04761905 0.95238095]
  [0.04761905 0.95238095]
  [0.91304348 0.08695652]
  [0.91304348 0.08695652]
  [0.91304348 0.08695652]]

 [[0.         1.        ]
  [0.         1.        ]
  [0.         1.        ]
  [1.         0.        ]
  [1.         0.        ]
  [1.         0.        ]]]
[[[0.49941775 0.50058225]
  [0.5034838  0.4965162 ]


In [188]:
# Report mean and standard deviation of the 10-fold ROC AUC for every model,
# including the ensemble.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10)
    print('ROC AUC: %0.2f (+/- %0.2f) [%s]' % (np.mean(scores), np.std(scores), label))

ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
ROC AUC: 0.94 (+/- 0.13) [Majority voting]


In [192]:
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """Ensemble classifier that combines several classifiers by voting.

    Parameters
    ----------
    classifiers : list
        Estimators to combine. ``fit`` clones each one and fits the clone;
        the objects passed in are not fitted themselves.
    vote : {'classlabel', 'probability'}, default='classlabel'
        'classlabel' predicts the (weighted) most frequent label among the
        members' predictions. 'probability' predicts the argmax of the
        (weighted) average of the members' ``predict_proba`` outputs.
    weights : sequence of numbers or None, default=None
        One weight per classifier. ``None`` weights all members equally.

    Attributes set by ``predict`` (label voting only)
    -------------------------------------------------
    predictions_ : array
        The members' encoded predictions from the most recent ``predict``
        call, kept for inspection.
    """

    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        # Name -> estimator mapping (names come from sklearn's
        # _name_estimators); get_params uses it to expose member parameters.
        self.name_classifiers = {key: value for key, value
                                 in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """Fit a clone of every member classifier on (X, y) and return self."""
        # Encode the class labels so that they start at 0; predict relies on
        # this because it maps np.argmax indices back to labels.
        self.labelenc_ = LabelEncoder()
        self.labelenc_.fit(y)
        self.classes_ = self.labelenc_.classes_
        encoded_y = self.labelenc_.transform(y)
        self.classifiers_ = [clone(clf).fit(X, encoded_y)
                             for clf in self.classifiers]
        return self

    def predict(self, X):
        """Predict class labels for X using the configured voting scheme.

        Returns the predictions expressed in the original label values
        (the internal 0-based encoding is inverted before returning).
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:
            # After the transpose each row holds the encoded predictions of
            # all fitted classifiers for one sample.
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T
            self.predictions_ = predictions
            # Per row: (weighted) count of votes for each encoded label, then
            # pick the label index with the largest total.
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map the winning encoded indices back to the original class labels.
        return self.labelenc_.inverse_transform(maj_vote)

    def predict_proba(self, X):
        """Return the (weighted) average of the members' predict_proba(X)."""
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        # Axis 0 indexes the classifiers, so averaging over it combines the
        # members while keeping the per-sample, per-class layout.
        return np.average(probas, axis=0, weights=self.weights)

    def get_params(self, deep=True):
        """Return estimator parameters.

        With ``deep=False`` this is the standard BaseEstimator result. With
        ``deep=True`` the result maps each member name to its estimator and,
        additionally, every member parameter under the key
        ``'<member name>__<parameter>'`` (double underscore, the separator
        scikit-learn's ``set_params`` splits on).
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        out = self.name_classifiers.copy()
        for name, step in self.name_classifiers.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out
  