Skip to content

Commit

Permalink
Revert "ENH iforest's score_samples uses chunks for fixed-memory computation (scikit-learn#13283)"
Browse files Browse the repository at this point in the history

This reverts commit bce9351.
  • Loading branch information
Xing committed Apr 28, 2019
1 parent b43d7d0 commit 33946b7
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 101 deletions.
4 changes: 0 additions & 4 deletions doc/whats_new/v0.21.rst
Expand Up @@ -168,10 +168,6 @@ Support for Python 3.4 and below has been officially dropped.
by avoiding keeping in memory each tree prediction. :issue:`13260` by
`Nicolas Goix`_.

- |Efficiency| :class:`ensemble.IsolationForest` now uses chunks of data at
prediction step, thus capping the memory usage. :issue:`13283` by
`Nicolas Goix`_.

- |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
the gradients would be incorrectly computed in multiclass classification
problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.
Expand Down
85 changes: 22 additions & 63 deletions sklearn/ensemble/iforest.py
Expand Up @@ -9,14 +9,9 @@
from warnings import warn

from ..tree import ExtraTreeRegressor
from ..utils import (
check_random_state,
check_array,
gen_batches,
get_chunk_n_rows,
)
from ..utils import check_random_state, check_array
from ..utils.fixes import _joblib_parallel_args
from ..utils.validation import check_is_fitted, _num_samples
from ..utils.validation import check_is_fitted
from ..base import OutlierMixin

from .bagging import BaseBagging
Expand Down Expand Up @@ -393,69 +388,21 @@ def score_samples(self, X):
"match the input. Model n_features is {0} and "
"input n_features is {1}."
"".format(self.n_features_, X.shape[1]))
n_samples = X.shape[0]

# Take the opposite of the scores as bigger is better (here less
# abnormal)
return -self._compute_chunked_score_samples(X)

@property
def threshold_(self):
    """Deprecated offset used by the old decision_function behaviour.

    Only available when ``behaviour='old'``; accessing it emits a
    DeprecationWarning and returns the fitted ``_threshold_`` value.
    """
    if self.behaviour == 'old':
        warn("threshold_ attribute is deprecated in 0.20 and will"
             " be removed in 0.22.", DeprecationWarning)
        return self._threshold_
    raise AttributeError("threshold_ attribute does not exist when "
                         "behaviour != 'old'")

def _compute_chunked_score_samples(self, X):
    """Compute the anomaly score of each sample in X, one chunk at a time.

    Scores are computed on row slices of X so that the temporary memory
    used at any moment stays within the working_memory budget
    (``sklearn.get_config()['working_memory']``).

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    scores : ndarray, shape (n_samples,)
        The (unnegated) anomaly score of each input sample.
    """
    n_samples = _num_samples(X)

    # Features are subsampled per tree only when the forest was fitted
    # with max_features strictly smaller than n_features.
    subsample_features = self._max_features != X.shape[1]

    # We get as many rows as possible within our working_memory budget
    # (defined by sklearn.get_config()['working_memory']) to store
    # self._max_features in each row during computation.
    #
    # Note:
    #  - this will get at least 1 row, even if 1 row of score will
    #    exceed working_memory.
    #  - this does only account for temporary memory usage while loading
    #    the data needed to compute the scores -- the returned scores
    #    themselves are 1D.
    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
                                    max_n_rows=n_samples)
    slices = gen_batches(n_samples, chunk_n_rows)

    scores = np.zeros(n_samples, order="f")
    for sl in slices:
        # compute score on the slices of test samples:
        scores[sl] = self._compute_score_samples(X[sl], subsample_features)

    return scores

def _compute_score_samples(self, X, subsample_features):
"""Compute the score of each samples in X going through the extra trees.
Parameters
----------
X : array-like or sparse matrix
subsample_features : bool,
whether features should be subsampled
"""
n_samples = X.shape[0]

depths = np.zeros(n_samples, order="f")

for tree, features in zip(self.estimators_, self.estimators_features_):
X_subset = X[:, features] if subsample_features else X

if subsample_features:
X_subset = X[:, features]
else:
X_subset = X
leaves_index = tree.apply(X_subset)
node_indicator = tree.decision_path(X_subset)
n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
Expand All @@ -471,7 +418,19 @@ def _compute_score_samples(self, X, subsample_features):
/ (len(self.estimators_)
* _average_path_length([self.max_samples_]))
)
return scores

# Take the opposite of the scores as bigger is better (here less
# abnormal)
return -scores

@property
def threshold_(self):
    """Deprecated offset used by the old decision_function behaviour.

    Only available when ``behaviour='old'``; accessing it emits a
    DeprecationWarning and returns the fitted ``_threshold_`` value.
    """
    if self.behaviour == 'old':
        warn("threshold_ attribute is deprecated in 0.20 and will"
             " be removed in 0.22.", DeprecationWarning)
        return self._threshold_
    raise AttributeError("threshold_ attribute does not exist when "
                         "behaviour != 'old'")


def _average_path_length(n_samples_leaf):
Expand Down
34 changes: 0 additions & 34 deletions sklearn/ensemble/tests/test_iforest.py
Expand Up @@ -29,7 +29,6 @@
from sklearn.metrics import roc_auc_score

from scipy.sparse import csc_matrix, csr_matrix
from unittest.mock import Mock, patch

rng = check_random_state(0)

Expand Down Expand Up @@ -326,36 +325,3 @@ def test_behaviour_param():
clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
assert_array_equal(clf1.decision_function([[2., 2.]]),
clf2.decision_function([[2., 2.]]))


# Mock get_chunk_n_rows so score computation actually uses more than one
# chunk (here one chunk = 3 rows), exercising the chunked code path.
# NOTE(review): decorator order matters — @patch must be outermost so the
# mock is injected as the first test argument (mocked_get_chunk).
@patch(
    "sklearn.ensemble.iforest.get_chunk_n_rows",
    side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize(
    "contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
@pytest.mark.filterwarnings("ignore:threshold_ attribute")
def test_iforest_chunks_works1(
    mocked_get_chunk, contamination, n_predict_calls
):
    # Run the full fit/predict smoke test, then check that the chunk-size
    # helper was consulted the expected number of times for this
    # contamination setting.
    test_iforest_works(contamination)
    assert mocked_get_chunk.call_count == n_predict_calls


# idem with chunk_size = 5 rows
@patch(
"sklearn.ensemble.iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize(
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
@pytest.mark.filterwarnings("ignore:threshold_ attribute")
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls
):
test_iforest_works(contamination)
assert mocked_get_chunk.call_count == n_predict_calls

0 comments on commit 33946b7

Please sign in to comment.