diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index a0c26370b2025..39a6e18e61c3f 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -168,10 +168,6 @@ Support for Python 3.4 and below has been officially dropped.
   by avoiding keeping in memory each tree prediction. :issue:`13260` by
   `Nicolas Goix`_.
 
-- |Efficiency| :class:`ensemble.IsolationForest` now uses chunks of data at
-  prediction step, thus capping the memory usage. :issue:`13283` by
-  `Nicolas Goix`_.
-
 - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
   the gradients would be incorrectly computed in multiclass classification
   problems. :issue:`12715` by :user:`Nicolas Hug`.
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index 8a1bd36259e48..0373bf56e845a 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -9,14 +9,9 @@
 from warnings import warn
 
 from ..tree import ExtraTreeRegressor
-from ..utils import (
-    check_random_state,
-    check_array,
-    gen_batches,
-    get_chunk_n_rows,
-)
+from ..utils import check_random_state, check_array
 from ..utils.fixes import _joblib_parallel_args
-from ..utils.validation import check_is_fitted, _num_samples
+from ..utils.validation import check_is_fitted
 from ..base import OutlierMixin
 from .bagging import BaseBagging
 
@@ -393,69 +388,21 @@ def score_samples(self, X):
                              "match the input. Model n_features is {0} and "
                              "input n_features is {1}."
                              "".format(self.n_features_, X.shape[1]))
+        n_samples = X.shape[0]
 
-        # Take the opposite of the scores as bigger is better (here less
-        # abnormal)
-        return -self._compute_chunked_score_samples(X)
-
-    @property
-    def threshold_(self):
-        if self.behaviour != 'old':
-            raise AttributeError("threshold_ attribute does not exist when "
-                                 "behaviour != 'old'")
-        warn("threshold_ attribute is deprecated in 0.20 and will"
-             " be removed in 0.22.", DeprecationWarning)
-        return self._threshold_
-
-    def _compute_chunked_score_samples(self, X):
-
-        n_samples = _num_samples(X)
+        n_samples_leaf = np.zeros(n_samples, order="f")
+        depths = np.zeros(n_samples, order="f")
 
         if self._max_features == X.shape[1]:
             subsample_features = False
         else:
             subsample_features = True
 
-        # We get as many rows as possible within our working_memory budget
-        # (defined by sklearn.get_config()['working_memory']) to store
-        # self._max_features in each row during computation.
-        #
-        # Note:
-        #  - this will get at least 1 row, even if 1 row of score will
-        #    exceed working_memory.
-        #  - this does only account for temporary memory usage while loading
-        #    the data needed to compute the scores -- the returned scores
-        #    themselves are 1D.
-
-        chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
-                                        max_n_rows=n_samples)
-        slices = gen_batches(n_samples, chunk_n_rows)
-
-        scores = np.zeros(n_samples, order="f")
-
-        for sl in slices:
-            # compute score on the slices of test samples:
-            scores[sl] = self._compute_score_samples(X[sl], subsample_features)
-
-        return scores
-
-    def _compute_score_samples(self, X, subsample_features):
-        """Compute the score of each samples in X going through the extra trees.
-
-        Parameters
-        ----------
-        X : array-like or sparse matrix
-
-        subsample_features : bool,
-            whether features should be subsampled
-        """
-        n_samples = X.shape[0]
-
-        depths = np.zeros(n_samples, order="f")
-
         for tree, features in zip(self.estimators_, self.estimators_features_):
-            X_subset = X[:, features] if subsample_features else X
-
+            if subsample_features:
+                X_subset = X[:, features]
+            else:
+                X_subset = X
             leaves_index = tree.apply(X_subset)
             node_indicator = tree.decision_path(X_subset)
             n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
@@ -471,7 +418,19 @@ def _compute_score_samples(self, X, subsample_features):
             / (len(self.estimators_)
                * _average_path_length([self.max_samples_]))
         )
-        return scores
+
+        # Take the opposite of the scores as bigger is better (here less
+        # abnormal)
+        return -scores
+
+    @property
+    def threshold_(self):
+        if self.behaviour != 'old':
+            raise AttributeError("threshold_ attribute does not exist when "
+                                 "behaviour != 'old'")
+        warn("threshold_ attribute is deprecated in 0.20 and will"
+             " be removed in 0.22.", DeprecationWarning)
+        return self._threshold_
 
 
 def _average_path_length(n_samples_leaf):
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 67ba2d7f933e3..e33547a44e41a 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -29,7 +29,6 @@
 from sklearn.metrics import roc_auc_score
 
 from scipy.sparse import csc_matrix, csr_matrix
-from unittest.mock import Mock, patch
 
 rng = check_random_state(0)
 
@@ -326,36 +325,3 @@ def test_behaviour_param():
     clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
     assert_array_equal(clf1.decision_function([[2., 2.]]),
                        clf2.decision_function([[2., 2.]]))
-
-
-# mock get_chunk_n_rows to actually test more than one chunk (here one
-# chunk = 3 rows:
-@patch(
-    "sklearn.ensemble.iforest.get_chunk_n_rows",
-    side_effect=Mock(**{"return_value": 3}),
-)
-@pytest.mark.parametrize(
-    "contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
-)
-@pytest.mark.filterwarnings("ignore:threshold_ attribute")
-def test_iforest_chunks_works1(
-    mocked_get_chunk, contamination, n_predict_calls
-):
-    test_iforest_works(contamination)
-    assert mocked_get_chunk.call_count == n_predict_calls
-
-
-# idem with chunk_size = 5 rows
-@patch(
-    "sklearn.ensemble.iforest.get_chunk_n_rows",
-    side_effect=Mock(**{"return_value": 10}),
-)
-@pytest.mark.parametrize(
-    "contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
-)
-@pytest.mark.filterwarnings("ignore:threshold_ attribute")
-def test_iforest_chunks_works2(
-    mocked_get_chunk, contamination, n_predict_calls
-):
-    test_iforest_works(contamination)
-    assert mocked_get_chunk.call_count == n_predict_calls
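
Note on the scoring path this diff restores (illustrative, not part of the patch): the non-chunked code computes, for each sample, the average path length E[h(x)] over the trees, normalises it by the expected path length c(n) of an unsuccessful binary-search-tree lookup (the role of `_average_path_length` in iforest.py), and `score_samples` returns the opposite of `2 ** (-E[h(x)] / c(n))`, following Liu et al. (2008). The sketch below is a minimal stand-alone rendering of that formula under those assumptions; the helper names `average_path_length` and `anomaly_scores` are invented for the example and do not exist in sklearn.

```python
import numpy as np


def average_path_length(n_samples_leaf):
    """c(n): expected path length of an unsuccessful search in a binary
    search tree, used to normalise isolation-forest depths."""
    n = np.asarray(n_samples_leaf, dtype=np.float64)
    out = np.zeros_like(n)   # c(n) = 0 for n <= 1
    out[n == 2] = 1.0        # c(2) = 1
    mask = n > 2
    # c(n) = 2 * H(n - 1) - 2 * (n - 1) / n, with H(i) ~ ln(i) + gamma
    out[mask] = (2.0 * (np.log(n[mask] - 1.0) + np.euler_gamma)
                 - 2.0 * (n[mask] - 1.0) / n[mask])
    return out


def anomaly_scores(avg_depths, max_samples):
    """Map averaged tree depths E[h(x)] to scores in (0, 1]; values near 1
    flag anomalies (sklearn's score_samples returns the opposite sign)."""
    c = average_path_length([max_samples])
    return 2.0 ** (-np.asarray(avg_depths, dtype=np.float64) / c)


# Shallow average depths produce scores close to 1 (more anomalous):
print(anomaly_scores([4.2, 9.7], max_samples=256))  # ~[0.75, 0.52]
```

This also shows why the revert should be behaviour-preserving: the chunked version only changed how rows were batched through this computation, not the score formula itself.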