add Stacking
yzhao062 committed Jul 28, 2019
1 parent 53e2ee5 commit cafc970
Showing 6 changed files with 189 additions and 34 deletions.
2 changes: 1 addition & 1 deletion CHANGES.txt
@@ -8,4 +8,4 @@ v<0.0.4>, <07/17/2019> -- Update unit test and clustering algorithms.
v<0.0.4>, <07/17/2019> -- Update documentation.
v<0.0.4>, <07/21/2019> -- Add code maintainability.
v<0.0.5>, <07/27/2019> -- Add median combination and score_to_proba function.
v<0.0.5>, <07/28/2019> -- Add Stacking (meta learner).
v<0.0.5>, <07/28/2019> -- Add Stacking (meta ensembling).
72 changes: 59 additions & 13 deletions README.rst
Expand Up @@ -97,6 +97,7 @@ combo is featured for:
* `Proposed Algorithms <#proposed-algorithms>`_
* `Quick Start for Classifier Combination <#quick-start-for-classifier-combination>`_
* `Quick Start for Clustering Combination <#quick-start-for-clustering-combination>`_
* `An Example of Stacking <#an-example-of-stacking>`_
* `Development Status <#development-status>`_


@@ -177,7 +178,7 @@ Some of the methods are task-specific:

1. SimpleClassifierAggregator: combining classifiers by (i) (weighted) averaging, (ii) maximization, (iii) median, and (iv) (weighted) majority vote
2. Dynamic Classifier Selection & Dynamic Ensemble Selection [#Ko2008From]_ (work-in-progress)
3. Stacking (meta learner): build an additional classifier to learn base estimator weights [#Gorman2016Kaggle]_
3. Stacking (meta ensembling): build an additional classifier to learn base estimator weights [#Gorman2016Kaggle]_


* **Cluster combination**: combine and align unsupervised clustering results
@@ -209,8 +210,6 @@ demonstrates the basic API of predicting with multiple classifiers.
.. code-block:: python
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from combo.models.classifier_comb import SimpleClassifierAggregator
random_state = 42  # fixed seed so the example is reproducible
# initialize a group of classifiers
classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]
#. Initialize an aggregator class and pass in combination methods
#. Initialize, fit, predict, and evaluate with a simple aggregator (average)

.. code-block:: python
# combine by averaging
from combo.models.classifier_comb import SimpleClassifierAggregator
clf = SimpleClassifierAggregator(classifiers, method='average')
clf.fit(X_train, y_train)
#. Predict by SimpleClassifierAggregator and then evaluate

.. code-block:: python
y_test_predicted = clf.predict(X_test)
evaluate_print('Combination by avg |', y_test, y_test_predicted)
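The other combination strategies listed under Proposed Algorithms can be swapped in with the same API. A minimal sketch follows; the method strings ('maximization', 'median') are assumptions inferred from that list, not confirmed API constants.

.. code-block:: python
# hedged sketch: method strings are assumed from the algorithm list above
clf_max = SimpleClassifierAggregator(classifiers, method='maximization')
clf_max.fit(X_train, y_train)
evaluate_print('Combination by max |', y_test, clf_max.predict(X_test))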
#. See a sample output of classifier_comb_example.py

.. code-block:: python
@@ -263,7 +257,7 @@ Quick Start for Clustering Combination
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

`"examples/cluster_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/cluster_comb_example.py>`_
demonstrates the basic API of combining multiple base clustering estimators. **It is noted that the API across all other algorithms are consistent/similar**.
demonstrates the basic API of combining multiple base clustering estimators.

#. Initialize a group of clustering methods as base estimators

@@ -301,6 +295,58 @@
-----


An Example of Stacking
^^^^^^^^^^^^^^^^^^^^^^

`"examples/stacking_example.py" <https://github.com/yzhao062/combo/blob/master/examples/stacking_example.py>`_
demonstrates the basic API of stacking (meta ensembling).


#. Initialize a group of classifiers as base estimators

.. code-block:: python
# initialize a group of classifiers
classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]
#. Initialize, fit, predict, and evaluate with Stacking

.. code-block:: python
from combo.models.stacking import Stacking
clf = Stacking(base_clfs=classifiers, n_folds=4, shuffle_data=False,
keep_original=True, use_proba=False, random_state=random_state)
clf.fit(X_train, y_train)
y_test_predict = clf.predict(X_test)
evaluate_print('Stacking | ', y_test, y_test_predict)
#. See a sample output of stacking_example.py

.. code-block:: python
Decision Tree | Accuracy:0.9386, ROC:0.9383, F1:0.9521
Logistic Regression | Accuracy:0.9649, ROC:0.9615, F1:0.973
K Neighbors | Accuracy:0.9561, ROC:0.9519, F1:0.9662
Gradient Boosting | Accuracy:0.9605, ROC:0.9524, F1:0.9699
Random Forest | Accuracy:0.9605, ROC:0.961, F1:0.9693
Stacking | Accuracy:0.9868, ROC:0.9841, F1:0.9899
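For intuition about the lift in the numbers above: the heart of stacking, training the meta classifier on the base estimators' out-of-fold predictions, can be sketched in plain scikit-learn. This is an illustrative reconstruction, not combo's internal code.

.. code-block:: python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# each base classifier contributes one column of out-of-fold predictions
meta_features = np.column_stack(
    [cross_val_predict(c, X_train, y_train, cv=4) for c in classifiers])
# keep_original=True above keeps the raw features next to the meta-features
meta_train = np.hstack([X_train, meta_features])
meta_clf = LogisticRegression()  # the documented default meta_clf
meta_clf.fit(meta_train, y_train)
# at test time the base classifiers, refit on all of X_train, would first
# produce the test-set meta-features before calling meta_clf.predict

Because each meta-feature comes from folds the base model never saw during fitting, the meta classifier learns estimator weights without overfitting to the training predictions.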
-----


Development Status
^^^^^^^^^^^^^^^^^^

8 changes: 4 additions & 4 deletions combo/models/stacking.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
"""Stacking (Meta-learner). See http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
"""Stacking (meta ensembling). See http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
for more information.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
@@ -82,7 +82,7 @@ def split_datasets(X, y, n_folds=3, shuffle_data=False, random_state=None):


class Stacking(BaseClassifierAggregator):
"""Meta learner, also known as stacking. See
"""Meta ensembling, also known as stacking. See
http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/
for more information.
@@ -92,7 +92,7 @@ class Stacking(BaseClassifierAggregator):
A list of base classifiers.
meta_clf : object, optional (default=LogisticRegression)
The meta learner to make the final prediction
The meta classifier to make the final prediction.
n_folds : int, optional (default=2)
The number of splits of the training sample.
@@ -211,7 +211,7 @@ def fit(self, X, y):
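# X_new_comb / y_new_comb hold the meta-features built from the base
# classifiers' out-of-fold predictions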
X_new_comb = new_features
y_new_comb = y_new

# train the meta learner
# train the meta classifier
self.meta_clf.fit(X_new_comb, y_new_comb)
self.fitted_ = True

77 changes: 62 additions & 15 deletions docs/example.rst
@@ -2,10 +2,12 @@ Quick Start
===========


-----


Quick Start for Classifier Combination
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


`"examples/classifier_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/classifier_comb_example.py>`_
demonstrates the basic API of predicting with multiple classifiers. **It is noted that the API across all other algorithms is consistent/similar**.

@@ -14,8 +16,6 @@ demonstrates the basic API of predicting with multiple classifiers.
.. code-block:: python
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from combo.models.classifier_comb import SimpleClassifierAggregator
random_state = 42  # fixed seed so the example is reproducible
# initialize a group of classifiers
classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]
#. Initialize an aggregator class and pass in combination methods
#. Initialize, fit, predict, and evaluate with a simple aggregator (average)

.. code-block:: python
# combine by averaging
from combo.models.classifier_comb import SimpleClassifierAggregator
clf = SimpleClassifierAggregator(classifiers, method='average')
clf.fit(X_train, y_train)
#. Predict by SimpleClassifierAggregator and then evaluate

.. code-block:: python
y_test_predicted = clf.predict(X_test)
evaluate_print('Combination by avg |', y_test, y_test_predicted)
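Weighted voting is also advertised in the algorithm list. A minimal sketch follows, with the caveat that the weights keyword and its expected shape are assumptions rather than confirmed API.

.. code-block:: python
import numpy as np
# hedged sketch: `weights` is an assumed keyword based on the
# "(weighted) average" method named in the algorithm list
clf_weighted = SimpleClassifierAggregator(
    classifiers, method='average',
    weights=np.array([0.1, 0.3, 0.2, 0.2, 0.2]))
clf_weighted.fit(X_train, y_train)
evaluate_print('Weighted avg |', y_test, clf_weighted.predict(X_test))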
#. See a sample output of classifier_comb_example.py

.. code-block:: python
@@ -64,11 +59,11 @@ demonstrates the basic API of predicting with multiple classifiers.
-----


Quick Start for Cluster Combination
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Quick Start for Clustering Combination
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

`"examples/cluster_comb_example.py" <https://github.com/yzhao062/combo/blob/master/examples/cluster_comb_example.py>`_
demonstrates the basic API of combining multiple base clustering estimators. **It is noted that the API across all other algorithms are consistent/similar**.
demonstrates the basic API of combining multiple base clustering estimators.

#. Initialize a group of clustering methods as base estimators

@@ -102,3 +97,55 @@
aligned_labels = clf.aligned_labels_
predicted_labels = clf.labels_
-----


An Example of Stacking
^^^^^^^^^^^^^^^^^^^^^^

`"examples/stacking_example.py" <https://github.com/yzhao062/combo/blob/master/examples/stacking_example.py>`_
demonstrates the basic API of stacking (meta ensembling).


#. Initialize a group of classifiers as base estimators

.. code-block:: python
# initialize a group of classifiers
classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]
#. Initialize, fit, predict, and evaluate with Stacking

.. code-block:: python
from combo.models.stacking import Stacking
clf = Stacking(base_clfs=classifiers, n_folds=4, shuffle_data=False,
keep_original=True, use_proba=False, random_state=random_state)
clf.fit(X_train, y_train)
y_test_predict = clf.predict(X_test)
evaluate_print('Stacking | ', y_test, y_test_predict)
#. See a sample output of stacking_example.py

.. code-block:: python
Decision Tree | Accuracy:0.9386, ROC:0.9383, F1:0.9521
Logistic Regression | Accuracy:0.9649, ROC:0.9615, F1:0.973
K Neighbors | Accuracy:0.9561, ROC:0.9519, F1:0.9662
Gradient Boosting | Accuracy:0.9605, ROC:0.9524, F1:0.9699
Random Forest | Accuracy:0.9605, ROC:0.961, F1:0.9693
Stacking | Accuracy:0.9868, ROC:0.9841, F1:0.9899
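The meta classifier is also configurable through the documented meta_clf parameter. A minimal sketch, assuming any scikit-learn-style classifier is accepted:

.. code-block:: python
from sklearn.ensemble import RandomForestClassifier
# swap the default LogisticRegression meta classifier for a random forest;
# assumes meta_clf takes any scikit-learn-compatible classifier
clf_rf_meta = Stacking(base_clfs=classifiers,
                       meta_clf=RandomForestClassifier(random_state=random_state),
                       n_folds=4, random_state=random_state)
clf_rf_meta.fit(X_train, y_train)
evaluate_print('Stacking (RF meta) | ', y_test, clf_rf_meta.predict(X_test))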
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -132,7 +132,7 @@ Some of the methods are task-specific:

1. SimpleClassifierAggregator (:class:`combo.models.classifier_comb.SimpleClassifierAggregator`): combining classifiers by (i) (weighted) averaging, (ii) maximization, (iii) median, and (iv) (weighted) majority vote
2. Dynamic Classifier Selection & Dynamic Ensemble Selection :cite:`a-ko2008dynamic` (work-in-progress)
3. Stacking (meta learner :class:`combo.models.stacking.Stacking`): build an additional classifier to learn base estimator weights :cite:`a-gorman2016kaggle`
3. Stacking (meta ensembling :class:`combo.models.stacking.Stacking`): build an additional classifier to learn base estimator weights :cite:`a-gorman2016kaggle`


* **Cluster combination**: combine and align unsupervised clustering results
62 changes: 62 additions & 0 deletions examples/stacking_example.py
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
"""Example of Stacking (meta ensembling)
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause


import os
import sys

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

from combo.models.stacking import Stacking
from combo.utils.data import evaluate_print

import warnings

warnings.filterwarnings("ignore")

if __name__ == "__main__":
# Load the breast cancer dataset and set a fixed random seed
random_state = 42
X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
random_state=random_state)

# initialize a group of base classifiers
classifiers = [DecisionTreeClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
RandomForestClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state)]
clf_names = ['DT', 'LR', 'KNN', 'RF', 'GBDT']

# evaluate individual classifiers
for i, clf in enumerate(classifiers):
clf.fit(X_train, y_train)
y_test_predict = clf.predict(X_test)
evaluate_print(clf_names[i] + ' | ', y_test, y_test_predict)

print()
# build a Stacking model and evaluate
clf = Stacking(base_clfs=classifiers, n_folds=4, shuffle_data=False,
keep_original=True, use_proba=False,
random_state=random_state)

clf.fit(X_train, y_train)
y_test_predict = clf.predict(X_test)
evaluate_print('Stacking | ', y_test, y_test_predict)
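A natural variation on this example, sketched under the assumption that the use_proba flag set above switches the meta-features from predicted labels to class probabilities:

.. code-block:: python
# hedged sketch reusing the objects above: feed the meta classifier class
# probabilities instead of hard labels via the use_proba flag
clf_proba = Stacking(base_clfs=classifiers, n_folds=4, use_proba=True,
                     random_state=random_state)
clf_proba.fit(X_train, y_train)
evaluate_print('Stacking (proba) | ', y_test, clf_proba.predict(X_test))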
