add DCS_LA

yzhao062 committed Aug 2, 2019
1 parent abecbad commit 66ab5de
Showing 9 changed files with 483 additions and 51 deletions.
3 changes: 2 additions & 1 deletion CHANGES.txt
@@ -12,4 +12,5 @@ v<0.0.5>, <07/28/2019> -- Add Stacking (meta ensembling).
v<0.0.6>, <07/29/2019> -- Enable Appveyor integration.
v<0.0.6>, <07/29/2019> -- Update requirements file.
v<0.0.6>, <07/29/2019> -- Add simple outlier detector combination methods.
v<0.0.6>, <07/30/2019> -- Add LSCP.
v<0.0.7>, <08/02/2019> -- Add DCS_LA.
2 changes: 1 addition & 1 deletion README.rst
@@ -227,7 +227,7 @@ General Purpose Maximization: simple combination by taking the maximum scor
General Purpose Median: take the median value across all scores/prediction results N/A [#Zhou2012Ensemble]_
General Purpose Majority Vote & Weighted Majority Vote N/A [#Zhou2012Ensemble]_
Classification SimpleClassifierAggregator: combining classifiers by general purpose methods above N/A N/A
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 [#Woods1997Combination]_ (work-in-progress)
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 [#Woods1997Combination]_
Classification DES: Dynamic Ensemble Selection (From dynamic classifier selection to dynamic ensemble selection) 2008 [#Ko2008From]_ (work-in-progress)
Classification Stacking (meta ensembling): use a meta learner to learn the base classifier results N/A [#Gorman2016Kaggle]_
Clustering Clusterer Ensemble: combine the results of multiple clustering results by relabeling 2006 [#Zhou2006Clusterer]_
202 changes: 202 additions & 0 deletions combo/models/classifier_dcs.py
@@ -0,0 +1,202 @@
# -*- coding: utf-8 -*-
"""Stacking (meta ensembling). See http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
for more information.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

import warnings
import numpy as np

from sklearn.neighbors import KDTree
from sklearn.metrics import accuracy_score
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets

from ..utils.utility import check_parameter

from .base import BaseAggregator


class DCS_LA(BaseAggregator):
"""Dynamic Classifier Selection (DCS) is an established combination
framework for classification tasks. The technique was first proposed by Ho
et al. in 1994 :cite:`ho1994decision` and then extended, under the name
DCS Local Accuracy, by Woods et al. in 1997 :cite:`woods1997combination`
to select the most accurate base classifier in a local region.
The motivation behind this approach is that base classifiers often make
distinct errors and offer a degree of complementarity. Consequently,
selectively combining base classifiers can result in a performance
improvement over generic ensembles which use the majority vote of all
base classifiers. Concretely, for each test sample, DCS_LA finds its
``local_region_size`` nearest training points, measures each base
classifier's accuracy on them, and delegates the prediction to the most
locally accurate classifier. See :cite:`woods1997combination` for details.

Parameters
----------
base_estimators: list or numpy array (n_estimators,)
A list of base classifiers.
local_region_size : int, optional (default=30)
    Number of nearest training points used to define the local region
    of a test sample.
threshold : float in (0, 1), optional (default=None)
Cut-off value to convert scores into binary labels.
pre_fitted : bool, optional (default=None)
    Whether the base classifiers are trained. If True, the `fit`
    process may be skipped. Not yet supported by DCS_LA.
"""

def __init__(self, base_estimators, local_region_size=30, threshold=None,
pre_fitted=None):

super(DCS_LA, self).__init__(
base_estimators=base_estimators, pre_fitted=pre_fitted)

# validate input parameters
if not isinstance(local_region_size, int):
raise ValueError('local_region_size must be an integer variable')
check_parameter(local_region_size, low=1, include_left=True,
                param_name='local_region_size')
self.local_region_size = local_region_size

if threshold is not None:
warnings.warn(
    "DCS_LA does not support the threshold option. "
    "Please set the threshold in the base classifiers directly.")

if pre_fitted is not None:
warnings.warn("Stacking does not support pre_fitted option.")

def fit(self, X, y):
"""Fit classifier.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
y : numpy array of shape (n_samples,), optional (default=None)
The ground truth of the input samples (labels).
"""

# validate inputs X and y (check_X_y already validates X)
X, y = check_X_y(X, y)
check_classification_targets(y)
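# store the number of unique classes seen during fit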
self._classes = len(np.unique(y))
n_samples = X.shape[0]

# save the training ground truth for local accuracy evaluation
self.y_train_ = y

# build a KDTree on the training data for nearest neighbor queries
self.tree_ = KDTree(X)

self.y_train_predicted_ = np.zeros(
[n_samples, self.n_base_estimators_])

# train all base classifiers on X and record their predictions on the
# training samples, which are reused for local accuracy estimation
for i, clf in enumerate(self.base_estimators):
clf.fit(X, y)
self.y_train_predicted_[:, i] = clf.predict(X)
clf.fitted_ = True

self.fitted_ = True

return self

def predict(self, X):
"""Predict the class labels for the provided data.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
Returns
-------
labels : numpy array of shape (n_samples,)
Class labels for each data sample.
"""
return self._predict_internal(X, predict_proba=False)

def predict_proba(self, X):
"""Return probability estimates for the test data X.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
Returns
-------
p : numpy array of shape (n_samples,)
The class probabilities of the input samples.
Classes are ordered by lexicographic order.
"""
return self._predict_internal(X, predict_proba=True)

def _predict_internal(self, X, predict_proba):
"""Internal function for predict and predict_proba
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
predict_proba : bool
if True, return the result of predict_proba
Returns
-------
"""
check_is_fitted(self, ['fitted_'])
X = check_array(X)
n_samples = X.shape[0]

# Find neighbors for all test instances
_, ind_arr = self.tree_.query(X, k=self.local_region_size)
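# ind_arr[i, :] holds the indices of the local_region_size nearest
# training points, which define the local region of test sample i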

if predict_proba:
y_predicted = np.zeros([n_samples, self._classes])
else:
y_predicted = np.zeros([n_samples, ])

# For each test sample
for i in range(n_samples):
test_sample = X[i, :].reshape(1, -1)
train_inds = ind_arr[i, :]

# ground truth labels of the local region
y_train_sample = self.y_train_[train_inds]

clf_performance = np.zeros([self.n_base_estimators_, ])

for j, clf in enumerate(self.base_estimators):
y_train_clf = self.y_train_predicted_[train_inds, j]
clf_performance[j] = accuracy_score(y_train_sample,
y_train_clf)
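# clf_performance[j] is classifier j's local accuracy: its training
# accuracy restricted to the local region of this test sample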

# select the best clf; multiple classifiers may tie for the
# highest local accuracy
select_clf_inds = np.argwhere(
    clf_performance == np.amax(clf_performance)).ravel()

# break ties by taking the last candidate
best_clf_ind = select_clf_inds[-1]

# make prediction
if predict_proba:
y_predicted[i] = self.base_estimators[
best_clf_ind].predict_proba(test_sample)
else:
y_predicted[i] = self.base_estimators[best_clf_ind].predict(
test_sample)

return y_predicted
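
A minimal usage sketch of the new class (the dataset and base classifiers
below are illustrative choices mirroring the unit test, not requirements of
the API):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from combo.models.classifier_dcs import DCS_LA

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# any list of sklearn-style classifiers can serve as base estimators
clf = DCS_LA([DecisionTreeClassifier(random_state=42),
              KNeighborsClassifier()],
             local_region_size=30)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)         # best local classifier per sample
y_proba = clf.predict_proba(X_test)  # shape (n_samples, n_classes)
print('Accuracy:', accuracy_score(y_test, y_pred))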
105 changes: 105 additions & 0 deletions combo/test/test_classifier_dcs.py
@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-

import os
import sys

import unittest

import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import load_breast_cancer
# noinspection PyProtectedMember
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils.testing import assert_less_equal
from sklearn.utils.testing import assert_true

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from combo.models.classifier_dcs import DCS_LA
from combo.utils.data import evaluate_print


class TestDCS_LA(unittest.TestCase):
def setUp(self):
self.roc_floor = 0.9
self.accuracy_floor = 0.9

random_state = 42
X, y = load_breast_cancer(return_X_y=True)

self.X_train, self.X_test, self.y_train, self.y_test = \
train_test_split(X, y, test_size=0.4, random_state=random_state)

classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]

self.clf = DCS_LA(classifiers, local_region_size=30)
self.clf.fit(self.X_train, self.y_train)

def test_parameters(self):
assert_true(hasattr(self.clf, 'base_estimators') and
self.clf.base_estimators is not None)

def test_train_scores(self):
y_train_predicted = self.clf.predict(self.X_train)
assert_equal(len(y_train_predicted), self.X_train.shape[0])

# check performance
assert_greater(accuracy_score(self.y_train, y_train_predicted),
self.accuracy_floor)

def test_prediction_scores(self):
y_test_predicted = self.clf.predict(self.X_test)
assert_equal(len(y_test_predicted), self.X_test.shape[0])

# check performance
assert_greater(accuracy_score(self.y_test, y_test_predicted),
self.accuracy_floor)

# test the utility function
evaluate_print('DCS_LA', self.y_test, y_test_predicted)

def test_prediction_proba(self):
y_test_predicted = self.clf.predict_proba(self.X_test)
assert_greater_equal(y_test_predicted.min(), 0)
assert_less_equal(y_test_predicted.max(), 1)

# check performance
assert_greater(roc_auc_score(self.y_test, y_test_predicted[:, 1]),
self.roc_floor)

# check shape integrity
n_classes = len(np.unique(self.y_train))
assert_equal(y_test_predicted.shape, (self.X_test.shape[0], n_classes))

# check probability sum is 1
y_test_predicted_sum = np.sum(y_test_predicted, axis=1)
assert_allclose(np.ones([self.X_test.shape[0], ]),
y_test_predicted_sum)

def tearDown(self):
pass


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions docs/api.rst
@@ -13,6 +13,7 @@ Classifier Combination
combination methods, e.g., average, median, and majority vote.
* :class:`combo.models.classifier_stacking.Stacking`: Stacking (meta ensembling). Check this `introductory
article by Kaggle <http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/>`_.
* :class:`combo.models.classifier_dcs.DCS_LA`: Dynamic classifier selection (DCS) by local accuracy.


----
8 changes: 8 additions & 0 deletions docs/combo.models.rst
@@ -11,6 +11,14 @@ combo.models.classifier\_comb module
:show-inheritance:
:inherited-members:

combo.models.classifier\_dcs module
-----------------------------------

.. automodule:: combo.models.classifier_dcs
:members:
:undoc-members:
:show-inheritance:
:inherited-members:

combo.models.classifier\_stacking module
----------------------------------------
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -171,7 +171,7 @@ General Purpose Maximization: simple combination by taking the maximum scor
General Purpose Median: take the median value across all scores/prediction results N/A :cite:`a-zhou2012ensemble` :mod:`combo.models.score_comb.median`
General Purpose Majority Vote & Weighted Majority Vote N/A :cite:`a-zhou2012ensemble` :mod:`combo.models.score_comb.majority_vote`
Classification SimpleClassifierAggregator: combining classifiers by general purpose methods above N/A N/A :class:`combo.models.classifier_comb.SimpleClassifierAggregator`
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 :cite:`a-woods1997combination` (WIP)
Classification DCS: Dynamic Classifier Selection (Combination of multiple classifiers using local accuracy estimates) 1997 :cite:`a-woods1997combination` :class:`combo.models.classifier_dcs.DCS_LA`
Classification DES: Dynamic Ensemble Selection (From dynamic classifier selection to dynamic ensemble selection) 2008 :cite:`a-ko2008dynamic` (WIP)
Classification Stacking (meta ensembling): use a meta learner to learn the base classifier results N/A :cite:`a-gorman2016kaggle` :class:`combo.models.classifier_stacking.Stacking`
Clustering Clusterer Ensemble: combine the results of multiple clustering results by relabeling 2006 :cite:`a-zhou2006clusterer` :class:`combo.models.cluster_comb.ClustererEnsemble`