add DCS_LA

yzhao062 committed Aug 2, 2019
1 parent abecbad commit 66ab5de
Showing 9 changed files with 483 additions and 51 deletions.
3 changes: 2 additions & 1 deletion CHANGES.txt
@@ -12,4 +12,5 @@ v<0.0.5>, <07/28/2019> -- Add Stacking (meta ensembling).
v<0.0.6>, <07/29/2019> -- Enable Appveyor integration.
v<0.0.6>, <07/29/2019> -- Update requirements file.
v<0.0.6>, <07/29/2019> -- Add simple outlier detector combination methods.
v<0.0.6>, <07/30/2019> -- Add LSCP.
v<0.0.7>, <08/02/2019> -- Add DCS_LA.
2 changes: 1 addition & 1 deletion README.rst
@@ -227,7 +227,7 @@ General Purpose Maximization: simple combination by taking the maximum scor
General Purpose Median: take the median value across all scores/prediction results N/A [#Zhou2012Ensemble]_
General Purpose Majority Vote & Weighted Majority Vote N/A [#Zhou2012Ensemble]_
Classification SimpleClassifierAggregator: combining classifiers by general purpose methods above N/A N/A
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 [#Woods1997Combination]_ (work-in-progress)
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 [#Woods1997Combination]_
Classification DES: Dynamic Ensemble Selection (From dynamic classifier selection to dynamic ensemble selection) 2008 [#Ko2008From]_ (work-in-progress)
Classification Stacking (meta ensembling): use a meta learner to learn the base classifier results N/A [#Gorman2016Kaggle]_
Clustering Clusterer Ensemble: combine the results of multiple clustering results by relabeling 2006 [#Zhou2006Clusterer]_
202 changes: 202 additions & 0 deletions combo/models/classifier_dcs.py
@@ -0,0 +1,202 @@
# -*- coding: utf-8 -*-
"""Stacking (meta ensembling). See http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
for more information.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

import warnings
import numpy as np

from sklearn.neighbors import KDTree
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets

from ..utils.utility import check_parameter

from .base import BaseAggregator


class DCS_LA(BaseAggregator):
"""Dynamic Classifier Selection (DCS) is an established combination
framework for classification tasks. The technique was first proposed by Ho
et al. in 1994 :cite:`ho1994decision` and then extended, under the name
DCS Local Accuracy, by Woods et al. in 1997 :cite:`woods1997combination`
to select the most accurate base classifier in a local region.
The motivation behind this approach is that base classifiers often make
distinct errors and offer a degree of complementarity. Consequently,
selectively combining base classifiers can result in a performance
improvement over generic ensembles which use the majority vote of all
base classifiers. Concretely, for each test sample, DCS_LA finds its
``local_region_size`` nearest training points, measures each base
classifier's accuracy on them, and delegates the prediction to the most
locally accurate classifier. See :cite:`woods1997combination` for details.

Parameters
----------
base_estimators: list or numpy array (n_estimators,)
A list of base classifiers.
local_region_size : int, optional (default=30)
    Number of nearest training points used to define the local region
    of a test sample.
threshold : float in (0, 1), optional (default=None)
Cut-off value to convert scores into binary labels.
pre_fitted : bool, optional (default=None)
    Whether the base classifiers are trained. If True, the `fit`
    process may be skipped. Not yet supported by DCS_LA.
"""

def __init__(self, base_estimators, local_region_size=30, threshold=None,
pre_fitted=None):

super(DCS_LA, self).__init__(
base_estimators=base_estimators, pre_fitted=pre_fitted)

# validate input parameters
if not isinstance(local_region_size, int):
raise ValueError('local_region_size must be an integer variable')
check_parameter(local_region_size, low=1, include_left=True,
                param_name='local_region_size')
self.local_region_size = local_region_size

if threshold is not None:
warnings.warn(
    "DCS_LA does not support the threshold option. "
    "Please set the threshold in the base classifiers directly.")

if pre_fitted is not None:
warnings.warn("Stacking does not support pre_fitted option.")

def fit(self, X, y):
"""Fit classifier.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
y : numpy array of shape (n_samples,), optional (default=None)
The ground truth of the input samples (labels).
"""

# validate inputs X and y (check_X_y already validates X)
X, y = check_X_y(X, y)
check_classification_targets(y)
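# store the number of unique classes seen during fit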
self._classes = len(np.unique(y))
n_samples = X.shape[0]

# save the training ground truth for local accuracy evaluation
self.y_train_ = y

# build a KDTree on the training data for nearest neighbor queries
self.tree_ = KDTree(X)

self.y_train_predicted_ = np.zeros(
[n_samples, self.n_base_estimators_])

# train all base classifiers on X and record their predictions on the
# training samples, which are reused for local accuracy estimation
for i, clf in enumerate(self.base_estimators):
clf.fit(X, y)
self.y_train_predicted_[:, i] = clf.predict(X)
clf.fitted_ = True

self.fitted_ = True

return self

def predict(self, X):
"""Predict the class labels for the provided data.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
Returns
-------
labels : numpy array of shape (n_samples,)
Class labels for each data sample.
"""
return self._predict_internal(X, predict_proba=False)

def predict_proba(self, X):
"""Return probability estimates for the test data X.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
Returns
-------
p : numpy array of shape (n_samples,)
The class probabilities of the input samples.
Classes are ordered by lexicographic order.
"""
return self._predict_internal(X, predict_proba=True)

def _predict_internal(self, X, predict_proba):
"""Internal function for predict and predict_proba
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
predict_proba : bool
if True, return the result of predict_proba
Returns
-------
"""
check_is_fitted(self, ['fitted_'])
X = check_array(X)
n_samples = X.shape[0]

# Find neighbors for all test instances
_, ind_arr = self.tree_.query(X, k=self.local_region_size)
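# ind_arr[i, :] holds the indices of the local_region_size nearest
# training points, which define the local region of test sample i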

if predict_proba:
y_predicted = np.zeros([n_samples, self._classes])
else:
y_predicted = np.zeros([n_samples, ])

# For each test sample
for i in range(n_samples):
test_sample = X[i, :].reshape(1, -1)
train_inds = ind_arr[i, :]

# ground truth labels of the local region
y_train_sample = self.y_train_[train_inds]

clf_performance = np.zeros([self.n_base_estimators_, ])

for j, clf in enumerate(self.base_estimators):
y_train_clf = self.y_train_predicted_[train_inds, j]
clf_performance[j] = accuracy_score(y_train_sample,
y_train_clf)
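# clf_performance[j] is classifier j's local accuracy: its training
# accuracy restricted to the local region of this test sample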

# select the best clf; multiple classifiers may tie for the
# highest local accuracy
select_clf_inds = np.argwhere(
    clf_performance == np.amax(clf_performance)).ravel()

# break ties by taking the last candidate
best_clf_ind = select_clf_inds[-1]

# make prediction
if predict_proba:
y_predicted[i] = self.base_estimators[
best_clf_ind].predict_proba(test_sample)
else:
y_predicted[i] = self.base_estimators[best_clf_ind].predict(
test_sample)

return y_predicted
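
A minimal usage sketch of the new class (the dataset and base classifiers
below are illustrative choices mirroring the unit test, not requirements of
the API):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from combo.models.classifier_dcs import DCS_LA

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# any list of sklearn-style classifiers can serve as base estimators
clf = DCS_LA([DecisionTreeClassifier(random_state=42),
              KNeighborsClassifier()],
             local_region_size=30)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)         # best local classifier per sample
y_proba = clf.predict_proba(X_test)  # shape (n_samples, n_classes)
print('Accuracy:', accuracy_score(y_test, y_pred))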
105 changes: 105 additions & 0 deletions combo/test/test_classifier_dcs.py
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-

import os
import sys

import unittest

import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import load_breast_cancer
# noinspection PyProtectedMember
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils.testing import assert_less_equal
from sklearn.utils.testing import assert_true

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from combo.models.classifier_dcs import DCS_LA
from combo.utils.data import evaluate_print


class TestDCS_LA(unittest.TestCase):
def setUp(self):
self.roc_floor = 0.9
self.accuracy_floor = 0.9

random_state = 42
X, y = load_breast_cancer(return_X_y=True)

self.X_train, self.X_test, self.y_train, self.y_test = \
train_test_split(X, y, test_size=0.4, random_state=random_state)

classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]

self.clf = DCS_LA(classifiers, local_region_size=30)
self.clf.fit(self.X_train, self.y_train)

def test_parameters(self):
assert_true(hasattr(self.clf, 'base_estimators') and
self.clf.base_estimators is not None)

def test_train_scores(self):
y_train_predicted = self.clf.predict(self.X_train)
assert_equal(len(y_train_predicted), self.X_train.shape[0])

# check performance
assert_greater(accuracy_score(self.y_train, y_train_predicted),
self.accuracy_floor)

def test_prediction_scores(self):
y_test_predicted = self.clf.predict(self.X_test)
assert_equal(len(y_test_predicted), self.X_test.shape[0])

# check performance
assert_greater(accuracy_score(self.y_test, y_test_predicted),
self.accuracy_floor)

# test the utility function
evaluate_print('DCS_LA', self.y_test, y_test_predicted)

def test_prediction_proba(self):
y_test_predicted = self.clf.predict_proba(self.X_test)
assert_greater_equal(y_test_predicted.min(), 0)
assert_less_equal(y_test_predicted.max(), 1)

# check performance
assert_greater(roc_auc_score(self.y_test, y_test_predicted[:, 1]),
self.roc_floor)

# check shape integrity
n_classes = len(np.unique(self.y_train))
assert_equal(y_test_predicted.shape, (self.X_test.shape[0], n_classes))

# check probability sum is 1
y_test_predicted_sum = np.sum(y_test_predicted, axis=1)
assert_allclose(np.ones([self.X_test.shape[0], ]),
y_test_predicted_sum)

def tearDown(self):
pass


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions docs/api.rst
@@ -13,6 +13,7 @@ Classifier Combination
combination methods, e.g., average, median, and majority vote.
* :class:`combo.models.classifier_stacking.Stacking`: Stacking (meta ensembling). Check this `introductory
article by Kaggle <http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/>`_.
* :class:`combo.models.classifier_dcs.DCS_LA`: Dynamic classifier selection (DCS) by local accuracy.


----
8 changes: 8 additions & 0 deletions docs/combo.models.rst
@@ -11,6 +11,14 @@ combo.models.classifier\_comb module
:show-inheritance:
:inherited-members:

combo.models.classifier\_dcs module
-----------------------------------

.. automodule:: combo.models.classifier_dcs
:members:
:undoc-members:
:show-inheritance:
:inherited-members:

combo.models.classifier\_stacking module
----------------------------------------
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -171,7 +171,7 @@ General Purpose Maximization: simple combination by taking the maximum scor
General Purpose Median: take the median value across all scores/prediction results N/A :cite:`a-zhou2012ensemble` :mod:`combo.models.score_comb.median`
General Purpose Majority Vote & Weighted Majority Vote N/A :cite:`a-zhou2012ensemble` :mod:`combo.models.score_comb.majority_vote`
Classification SimpleClassifierAggregator: combining classifiers by general purpose methods above N/A N/A :class:`combo.models.classifier_comb.SimpleClassifierAggregator`
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 :cite:`a-woods1997combination` (WIP)
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 :cite:`a-woods1997combination` :class:`combo.models.classifier_dcs.DCS_LA`
Classification DES: Dynamic Ensemble Selection (From dynamic classifier selection to dynamic ensemble selection) 2008 :cite:`a-ko2008dynamic` (WIP)
Classification Stacking (meta ensembling): use a meta learner to learn the base classifier results N/A :cite:`a-gorman2016kaggle` :class:`combo.models.classifier_stacking.Stacking`
Clustering Clusterer Ensemble: combine the results of multiple clustering results by relabeling 2006 :cite:`a-zhou2006clusterer` :class:`combo.models.cluster_comb.ClustererEnsemble`