Changing CBE to SEA.

w4k2 · Dec 24, 2019 · 60fdee2 · 60fdee2
1 parent 1dbf6e4
commit 60fdee2
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 29 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 metex.py
+example.png
+example.py
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/doc/api.rst b/doc/api.rst
@@ -63,7 +63,7 @@ API
    :toctree: generated/
    :template: class.rst
 
-   ensembles.ChunkBasedEnsemble
+   ensembles.SEA
    ensembles.OnlineBagging
    ensembles.OOB
    ensembles.UOB

diff --git a/doc/ensembles.rst b/doc/ensembles.rst
@@ -35,20 +35,20 @@ equal to the size of the chunk.
 Chunk-Based Ensemble
 --------------------
 
-The ``ChunkBasedEnsemble`` class implements a basic multi classifier approach for data stream classification. This model takes the base classifier as the ``base_estimator`` parameter and the pool size as the ``n_estimators``. A single base classifier is trained on each observed data chunk and added to the ensemble. If the fixed pool size is exceeded, the oldest model is removed. The final decision is obtained by accumulating the supports of base classifiers.
+The ``SEA`` class implements a basic multi-classifier approach for data stream classification. This model takes the base classifier as the ``base_estimator`` parameter and the pool size as the ``n_estimators``. A single base classifier is trained on each observed data chunk and added to the ensemble. If the fixed pool size is exceeded, the model that scores lowest under the ``metric`` parameter (``accuracy_score`` by default) on the current chunk is removed. The final decision is obtained by accumulating the supports of base classifiers.
 
 **Example**
 
 .. code-block:: python
 
   from strlearn.evaluators import TestThenTrain
   from strlearn.streams import StreamGenerator
-  from strlearn.ensembles import ChunkBasedEnsemble
+  from strlearn.ensembles import SEA
 
   from sklearn.naive_bayes import GaussianNB
 
   stream = StreamGenerator()
-  clf = ChunkBasedEnsemble(base_estimator=GaussianNB(), n_estimators=5)
+  clf = SEA(base_estimator=GaussianNB(), n_estimators=5)
   evaluator = TestThenTrain()
 
   evaluator.process(stream, clf)

diff --git a/doc/evaluators.rst b/doc/evaluators.rst
@@ -37,13 +37,13 @@ three-dimensional array of shape (n_classifiers, n_chunks, n_metrics).
 .. code-block:: python
 
   from strlearn.evaluators import TestThenTrain
-  from strlearn.ensembles import ChunkBasedEnsemble
+  from strlearn.ensembles import SEA
   from strlearn.utils.metrics import bac, f_score
   from strlearn.streams import StreamGenerator
   from sklearn.naive_bayes import GaussianNB
 
   stream = StreamGenerator(chunk_size=200, n_chunks=250)
-  clf = ChunkBasedEnsemble(base_estimator=GaussianNB())
+  clf = SEA(base_estimator=GaussianNB())
   evaluator = TestThenTrain(metrics=(bac, f_score))
 
   evaluator.process(stream, clf)
@@ -55,15 +55,15 @@ three-dimensional array of shape (n_classifiers, n_chunks, n_metrics).
 .. code-block:: python
 
   from strlearn.evaluators import TestThenTrain
-  from strlearn.ensembles import ChunkBasedEnsemble
+  from strlearn.ensembles import SEA
   from strlearn.utils.metrics import bac, f_score
   from strlearn.streams import StreamGenerator
   from sklearn.naive_bayes import GaussianNB
   from sklearn.tree import DecisionTreeClassifier
 
   stream = StreamGenerator(chunk_size=200, n_chunks=250)
-  clf1 = ChunkBasedEnsemble(base_estimator=GaussianNB())
-  clf2 = ChunkBasedEnsemble(base_estimator=DecisionTreeClassifier())
+  clf1 = SEA(base_estimator=GaussianNB())
+  clf2 = SEA(base_estimator=DecisionTreeClassifier())
   clfs = (clf1, clf2)
   evaluator = TestThenTrain(metrics=(bac, f_score))
 
@@ -96,13 +96,13 @@ the instance of ``StreamGenerator`` class.
 .. code-block:: python
 
   from strlearn.evaluators import Prequential
-  from strlearn.ensembles import ChunkBasedEnsemble
+  from strlearn.ensembles import SEA
   from strlearn.utils.metrics import bac, f_score
   from strlearn.streams import StreamGenerator
   from sklearn.naive_bayes import GaussianNB
 
   stream = StreamGenerator()
-  clf = ChunkBasedEnsemble(base_estimator=GaussianNB())
+  clf = SEA(base_estimator=GaussianNB())
   evaluator = Prequential(metrics=(bac, f_score))
 
   evaluator.process(stream, clf, interval=100)
@@ -114,15 +114,15 @@ the instance of ``StreamGenerator`` class.
 .. code-block:: python
 
   from strlearn.evaluators import Prequential
-  from strlearn.ensembles import ChunkBasedEnsemble
+  from strlearn.ensembles import SEA
   from strlearn.utils.metrics import bac, f_score
   from strlearn.streams import StreamGenerator
   from sklearn.naive_bayes import GaussianNB
   from sklearn.tree import DecisionTreeClassifier
 
   stream = StreamGenerator(chunk_size=200, n_chunks=250)
-  clf1 = ChunkBasedEnsemble(base_estimator=GaussianNB())
-  clf2 = ChunkBasedEnsemble(base_estimator=DecisionTreeClassifier())
+  clf1 = SEA(base_estimator=GaussianNB())
+  clf2 = SEA(base_estimator=DecisionTreeClassifier())
   clfs = (clf1, clf2)
   evaluator = Prequential(metrics=(bac, f_score))
 

diff --git a/strlearn/ensembles/ChunkBasedEnsemble.py → strlearn/ensembles/SEA.py b/strlearn/ensembles/ChunkBasedEnsemble.py → strlearn/ensembles/SEA.py
@@ -1,12 +1,13 @@
 """Chunk based ensemble."""
 
 from sklearn.base import ClassifierMixin, clone
+from sklearn.metrics import accuracy_score
 from sklearn.ensemble import BaseEnsemble
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 import numpy as np
 
 
-class ChunkBasedEnsemble(ClassifierMixin, BaseEnsemble):
+class SEA(ClassifierMixin, BaseEnsemble):
     """
     Chunk based ensemble classifier.
 
@@ -15,7 +16,7 @@ class ChunkBasedEnsemble(ClassifierMixin, BaseEnsemble):
 
     Parameters
     ----------
-    
+
     n_estimators : integer, optional (default=10)
         The maximum number of estimators trained using consecutive data chunks
         and maintained in the ensemble.
@@ -31,7 +32,7 @@ class ChunkBasedEnsemble(ClassifierMixin, BaseEnsemble):
     --------
     >>> import strlearn as sl
     >>> stream = sl.streams.StreamGenerator()
-    >>> clf = sl.ensembles.ChunkBasedEnsemble()
+    >>> clf = sl.ensembles.SEA()
     >>> evaluator = sl.evaluators.TestThenTrain()
     >>> evaluator.process(stream, clf)
     >>> print(evaluator.scores_)
@@ -45,15 +46,15 @@ class ChunkBasedEnsemble(ClassifierMixin, BaseEnsemble):
     [0.935      0.93569212 0.93540766 0.93569212 0.93467337]]
     """
 
-    def __init__(self, base_estimator=None, n_estimators=10):
+    def __init__(self, base_estimator=None, n_estimators=10, metric=accuracy_score):
         """Initialization."""
         self.base_estimator = base_estimator
         self.n_estimators = n_estimators
+        self.metric = metric
 
     def fit(self, X, y):
         """Fitting."""
         self.partial_fit(X, y)
-
         return self
 
     def partial_fit(self, X, y, classes=None):
@@ -73,11 +74,14 @@ def partial_fit(self, X, y, classes=None):
         if self.classes_ is None:
             self.classes_, _ = np.unique(y, return_inverse=True)
 
+        # Append new estimator
         self.ensemble_.append(clone(self.base_estimator).fit(self.X_, self.y_))
 
+        # Remove the worst when ensemble becomes too large
         if len(self.ensemble_) > self.n_estimators:
-            del self.ensemble_[0]
-
+            del self.ensemble_[
+                np.argmin([self.metric(y, clf.predict(X)) for clf in self.ensemble_])
+            ]
         return self
 
     def ensemble_support_matrix(self, X):

diff --git a/strlearn/ensembles/__init__.py b/strlearn/ensembles/__init__.py
@@ -1,4 +1,4 @@
-from .ChunkBasedEnsemble import ChunkBasedEnsemble
+from .SEA import SEA
 from .WAE import WAE
 from .OnlineBagging import OnlineBagging
 from .OOB import OOB

diff --git a/strlearn/tests/test_ensembles.py b/strlearn/tests/test_ensembles.py
@@ -8,14 +8,17 @@
 
 sys.path.insert(0, "../..")
 
+
 def get_stream():
     return sl.streams.StreamGenerator(n_chunks=10, n_features=10)
 
+
 def get_different_stream():
     return sl.streams.StreamGenerator(n_chunks=10, n_features=4)
 
+
 def test_ensembles_fit():
-    clf1 = sl.ensembles.ChunkBasedEnsemble(GaussianNB())
+    clf1 = sl.ensembles.SEA(GaussianNB())
     clf2 = sl.ensembles.WAE(GaussianNB())
     clf3 = sl.ensembles.OOB(GaussianNB())
     clf4 = sl.ensembles.OnlineBagging(GaussianNB())
@@ -35,7 +38,7 @@ def test_ensembles_fit():
 def test_features():
     "Bare CBE"
     clfs = [
-        sl.ensembles.ChunkBasedEnsemble(GaussianNB()),
+        sl.ensembles.SEA(GaussianNB()),
         sl.ensembles.OOB(GaussianNB()),
         sl.ensembles.UOB(GaussianNB()),
         sl.ensembles.WAE(GaussianNB()),
@@ -52,10 +55,11 @@ def test_features():
         with pytest.raises(ValueError):
             clf.partial_fit(X_b, y_b)
 
+
 def test_pred():
     """Pred error"""
     clfs = [
-        sl.ensembles.ChunkBasedEnsemble(GaussianNB()),
+        sl.ensembles.SEA(GaussianNB()),
         sl.ensembles.OOB(GaussianNB()),
         sl.ensembles.UOB(GaussianNB()),
         sl.ensembles.WAE(GaussianNB()),
@@ -72,13 +76,15 @@ def test_pred():
         with pytest.raises(ValueError):
             clf.predict(X_b)
 
-def test_CBE():
-    "Bare CBE"
+
+def test_SEA():
+    "Bare SEA"
     stream = get_stream()
-    clf = sl.ensembles.ChunkBasedEnsemble(GaussianNB(), n_estimators=5)
+    clf = sl.ensembles.SEA(GaussianNB(), n_estimators=5)
     evaluator = sl.evaluators.TestThenTrain()
     evaluator.process(stream, clf)
 
+
 def test_WAE():
     """Bare WAE."""
     stream = get_stream()
@@ -94,20 +100,23 @@ def test_OOB():
     evaluator = sl.evaluators.TestThenTrain()
     evaluator.process(stream, clf)
 
+
 def test_OB():
     """Bare WAE."""
     stream = get_stream()
     clf = sl.ensembles.OnlineBagging(GaussianNB())
     evaluator = sl.evaluators.TestThenTrain()
     evaluator.process(stream, clf)
 
+
 def test_UOB():
     """Bare WAE."""
     stream = get_stream()
     clf = sl.ensembles.UOB(GaussianNB())
     evaluator = sl.evaluators.TestThenTrain()
     evaluator.process(stream, clf)
 
+
 def test_pp_WAE():
     """Post pruned WAE."""
     stream = get_stream()
@@ -119,7 +128,9 @@ def test_pp_WAE():
 def test_WAE_wcm1():
     """Various weight computation methods of WAE."""
     stream = get_stream()
-    clf = sl.ensembles.WAE(GaussianNB(), weight_calculation_method="same_for_each", n_estimators=5)
+    clf = sl.ensembles.WAE(
+        GaussianNB(), weight_calculation_method="same_for_each", n_estimators=5
+    )
     evaluator = sl.evaluators.TestThenTrain()
     evaluator.process(stream, clf)