Skip to content

Commit

Permalink
Revert "ENH iforest's score_samples uses chunks for fixed-memory computation (scikit-learn#13283)"
Browse files Browse the repository at this point in the history

This reverts commit bce9351.
  • Loading branch information
Xing committed Apr 28, 2019
1 parent b43d7d0 commit 33946b7
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 101 deletions.
4 changes: 0 additions & 4 deletions doc/whats_new/v0.21.rst
Expand Up @@ -168,10 +168,6 @@ Support for Python 3.4 and below has been officially dropped.
by avoiding keeping in memory each tree prediction. :issue:`13260` by
`Nicolas Goix`_.

- |Efficiency| :class:`ensemble.IsolationForest` now uses chunks of data at
prediction step, thus capping the memory usage. :issue:`13283` by
`Nicolas Goix`_.

- |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
the gradients would be incorrectly computed in multiclass classification
problems. :issue:`12715` by :user:`Nicolas Hug<NicolasHug>`.
Expand Down
85 changes: 22 additions & 63 deletions sklearn/ensemble/iforest.py
Expand Up @@ -9,14 +9,9 @@
from warnings import warn

from ..tree import ExtraTreeRegressor
from ..utils import (
check_random_state,
check_array,
gen_batches,
get_chunk_n_rows,
)
from ..utils import check_random_state, check_array
from ..utils.fixes import _joblib_parallel_args
from ..utils.validation import check_is_fitted, _num_samples
from ..utils.validation import check_is_fitted
from ..base import OutlierMixin

from .bagging import BaseBagging
Expand Down Expand Up @@ -393,69 +388,21 @@ def score_samples(self, X):
"match the input. Model n_features is {0} and "
"input n_features is {1}."
"".format(self.n_features_, X.shape[1]))
n_samples = X.shape[0]

# Take the opposite of the scores as bigger is better (here less
# abnormal)
return -self._compute_chunked_score_samples(X)

@property
def threshold_(self):
    """Deprecated offset used by the old decision_function behaviour.

    Only available when ``behaviour='old'``; accessing it emits a
    DeprecationWarning and returns the fitted ``_threshold_`` value.
    """
    if self.behaviour == 'old':
        warn("threshold_ attribute is deprecated in 0.20 and will"
             " be removed in 0.22.", DeprecationWarning)
        return self._threshold_
    raise AttributeError("threshold_ attribute does not exist when "
                         "behaviour != 'old'")

def _compute_chunked_score_samples(self, X):
    """Compute the anomaly score of each sample in X, one chunk at a time.

    Scores are computed on row slices of X so that the temporary memory
    used at any moment stays within the working_memory budget
    (``sklearn.get_config()['working_memory']``).

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    scores : ndarray, shape (n_samples,)
        The (unnegated) anomaly score of each input sample.
    """
    n_samples = _num_samples(X)

    # Features are subsampled per tree only when the forest was fitted
    # with max_features strictly smaller than n_features.
    subsample_features = self._max_features != X.shape[1]

    # We get as many rows as possible within our working_memory budget
    # (defined by sklearn.get_config()['working_memory']) to store
    # self._max_features in each row during computation.
    #
    # Note:
    #  - this will get at least 1 row, even if 1 row of score will
    #    exceed working_memory.
    #  - this does only account for temporary memory usage while loading
    #    the data needed to compute the scores -- the returned scores
    #    themselves are 1D.
    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
                                    max_n_rows=n_samples)
    slices = gen_batches(n_samples, chunk_n_rows)

    scores = np.zeros(n_samples, order="f")
    for sl in slices:
        # compute score on the slices of test samples:
        scores[sl] = self._compute_score_samples(X[sl], subsample_features)

    return scores

def _compute_score_samples(self, X, subsample_features):
"""Compute the score of each samples in X going through the extra trees.
Parameters
----------
X : array-like or sparse matrix
subsample_features : bool,
whether features should be subsampled
"""
n_samples = X.shape[0]

depths = np.zeros(n_samples, order="f")

for tree, features in zip(self.estimators_, self.estimators_features_):
X_subset = X[:, features] if subsample_features else X

if subsample_features:
X_subset = X[:, features]
else:
X_subset = X
leaves_index = tree.apply(X_subset)
node_indicator = tree.decision_path(X_subset)
n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
Expand All @@ -471,7 +418,19 @@ def _compute_score_samples(self, X, subsample_features):
/ (len(self.estimators_)
* _average_path_length([self.max_samples_]))
)
return scores

# Take the opposite of the scores as bigger is better (here less
# abnormal)
return -scores

@property
def threshold_(self):
    """Deprecated offset used by the old decision_function behaviour.

    Only available when ``behaviour='old'``; accessing it emits a
    DeprecationWarning and returns the fitted ``_threshold_`` value.
    """
    if self.behaviour == 'old':
        warn("threshold_ attribute is deprecated in 0.20 and will"
             " be removed in 0.22.", DeprecationWarning)
        return self._threshold_
    raise AttributeError("threshold_ attribute does not exist when "
                         "behaviour != 'old'")


def _average_path_length(n_samples_leaf):
Expand Down
34 changes: 0 additions & 34 deletions sklearn/ensemble/tests/test_iforest.py
Expand Up @@ -29,7 +29,6 @@
from sklearn.metrics import roc_auc_score

from scipy.sparse import csc_matrix, csr_matrix
from unittest.mock import Mock, patch

rng = check_random_state(0)

Expand Down Expand Up @@ -326,36 +325,3 @@ def test_behaviour_param():
clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
assert_array_equal(clf1.decision_function([[2., 2.]]),
clf2.decision_function([[2., 2.]]))


# Mock get_chunk_n_rows so score computation actually uses more than one
# chunk (here one chunk = 3 rows), exercising the chunked code path.
# NOTE(review): decorator order matters — @patch must be outermost so the
# mock is injected as the first test argument (mocked_get_chunk).
@patch(
    "sklearn.ensemble.iforest.get_chunk_n_rows",
    side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize(
    "contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
@pytest.mark.filterwarnings("ignore:threshold_ attribute")
def test_iforest_chunks_works1(
    mocked_get_chunk, contamination, n_predict_calls
):
    # Run the full fit/predict smoke test, then check that the chunk-size
    # helper was consulted the expected number of times for this
    # contamination setting.
    test_iforest_works(contamination)
    assert mocked_get_chunk.call_count == n_predict_calls


# idem with chunk_size = 5 rows
@patch(
"sklearn.ensemble.iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize(
"contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]
)
@pytest.mark.filterwarnings("ignore:threshold_ attribute")
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls
):
test_iforest_works(contamination)
assert mocked_get_chunk.call_count == n_predict_calls

0 comments on commit 33946b7

Please sign in to comment.