From 75b044867230946b29b1b13ff8e82543f183dbd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 31 Aug 2017 01:05:48 +0200 Subject: [PATCH 01/10] DOC improve check_memory related docstrings (#9649) --- sklearn/cluster/hierarchical.py | 4 ++-- sklearn/linear_model/randomized_l1.py | 4 ++-- sklearn/pipeline.py | 4 ++-- sklearn/utils/validation.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 3a61b4f8770e4..966ed5e2cc121 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -609,7 +609,7 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): "manhattan", "cosine", or 'precomputed'. If linkage is "ward", only "euclidean" is accepted. - memory : joblib.Memory-like or string, optional + memory : None, str or object with the joblib.Memory interface, optional Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. @@ -769,7 +769,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): "manhattan", "cosine", or 'precomputed'. If linkage is "ward", only "euclidean" is accepted. - memory : joblib.Memory-like or string, optional + memory : None, str or object with the joblib.Memory interface, optional Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py index 8f3692dc8675b..1b8cb567b661a 100644 --- a/sklearn/linear_model/randomized_l1.py +++ b/sklearn/linear_model/randomized_l1.py @@ -278,7 +278,7 @@ class RandomizedLasso(BaseRandomizedLinearModel): - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' - memory : Instance of sklearn.externals.joblib.Memory or string, optional \ + memory : None, str or object with the joblib.Memory interface, optional \ (default=None) Used for internal caching. By default, no caching is done. If a string is given, it is the path to the caching directory. @@ -472,7 +472,7 @@ class RandomizedLogisticRegression(BaseRandomizedLinearModel): - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' - memory : Instance of sklearn.externals.joblib.Memory or string, optional \ + memory : None, str or object with the joblib.Memory interface, optional \ (default=None) Used for internal caching. By default, no caching is done. If a string is given, it is the path to the caching directory. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 43a3b09e42e44..1c22210cbfb22 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -52,7 +52,7 @@ class Pipeline(_BaseComposition): chained, in the order in which they are chained, with the last object an estimator. - memory : joblib.Memory-like or string, optional + memory : None, str or object with the joblib.Memory interface, optional Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of @@ -528,7 +528,7 @@ def make_pipeline(*steps, **kwargs): ---------- *steps : list of estimators, - memory : joblib.Memory-like or string, optional + memory : None, str or object with the joblib.Memory interface, optional Used to cache the fitted transformers of the pipeline. 
By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 7f89bfc89f9da..5847b540d7b6c 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -165,7 +165,7 @@ def check_memory(memory): Parameters ---------- - memory : joblib.Memory-like or string or None + memory : None, str or object with the joblib.Memory interface Returns ------- From 4889a67942713777e0e250eda9a3e019d84d1950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 31 Aug 2017 10:28:09 +0200 Subject: [PATCH 02/10] MAINT remove unused imports --- sklearn/linear_model/tests/test_ransac.py | 2 -- sklearn/pipeline.py | 2 +- sklearn/tests/test_multioutput.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 41255f0c45fa4..7146ed1a129b2 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -1,5 +1,3 @@ -from scipy import sparse - import numpy as np from scipy import sparse diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1c22210cbfb22..66da9dffeb066 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -15,7 +15,7 @@ from scipy import sparse from .base import clone, TransformerMixin -from .externals.joblib import Parallel, delayed, Memory +from .externals.joblib import Parallel, delayed from .externals import six from .utils.metaestimators import if_delegate_has_method from .utils import Bunch diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index da8be05f29f75..26981d20fc633 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -15,7 +15,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn import datasets from sklearn.base import clone -from sklearn.datasets import fetch_mldata from sklearn.datasets import make_classification from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier from sklearn.exceptions import NotFittedError From d6a42354145c92cf88093cbcc70b13f639319c38 Mon Sep 17 00:00:00 2001 From: felix Date: Fri, 1 Sep 2017 07:11:00 +0100 Subject: [PATCH 03/10] DOC fix a glitch in pca docstring (#9664) --- sklearn/decomposition/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index c0f1eb77b5f56..171774321cec0 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -220,7 +220,7 @@ class PCA(_BasePCA): mean_ : array, shape (n_features,) Per-feature empirical mean, estimated from the training set. - Equal to `X.mean(axis=1)`. + Equal to `X.mean(axis=0)`. n_components_ : int The estimated number of components. When n_components is set From 9b5561148f56a3934da9882a52f1978d7aa5bc75 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 1 Sep 2017 04:29:05 -0400 Subject: [PATCH 04/10] [MRG] Figure improvements (#9648) * Example plots render poorly in dev * flake8 + bias_variance * title padding * misc ensemble variance plotting don't use rcParams to set size of a single figure, put legend outside of plot * semisupervised plotting fixes use explicit kwargs in subplots_adjust, change hspace, don't change aspect ratio of imshow. 
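As a minimal sketch of the legend-placement pattern these fixes adopt (the figsize, legend coordinates, and curve are taken from the plot_bias_variance changes below; everything else is an illustrative stand-in):

    import matplotlib.pyplot as plt
    import numpy as np

    x = np.linspace(-5, 5, 100)
    # Size this one figure explicitly instead of mutating rcParams.
    plt.figure(figsize=(10, 8))
    plt.plot(x, np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2), label="$f(x)$")
    # Anchor the legend outside the axes so it cannot cover the data,
    # then shrink the plotting area to leave room for it on the right.
    plt.legend(loc=(1.1, .5))
    plt.subplots_adjust(right=.75)
    plt.show()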
--- examples/ensemble/plot_bias_variance.py | 15 +++++++++++---- ...ot_label_propagation_digits_active_learning.py | 10 ++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/examples/ensemble/plot_bias_variance.py b/examples/ensemble/plot_bias_variance.py index 8d88f99df1668..0f0a2478472c3 100644 --- a/examples/ensemble/plot_bias_variance.py +++ b/examples/ensemble/plot_bias_variance.py @@ -88,12 +88,14 @@ n_estimators = len(estimators) + # Generate data def f(x): x = x.ravel() return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2) + def generate(n_samples, noise, n_repeat=1): X = np.random.rand(n_samples) * 10 - 5 X = np.sort(X) @@ -110,6 +112,7 @@ def generate(n_samples, noise, n_repeat=1): return X, y + X_train = [] y_train = [] @@ -120,6 +123,8 @@ def generate(n_samples, noise, n_repeat=1): X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat) +plt.figure(figsize=(10, 8)) + # Loop over estimators to compare for n, (name, estimator) in enumerate(estimators): # Compute predictions @@ -166,8 +171,8 @@ def generate(n_samples, noise, n_repeat=1): plt.xlim([-5, 5]) plt.title(name) - if n == 0: - plt.legend(loc="upper left", prop={"size": 11}) + if n == n_estimators - 1: + plt.legend(loc=(1.1, .5)) plt.subplot(2, n_estimators, n_estimators + n + 1) plt.plot(X_test, y_error, "r", label="$error(x)$") @@ -178,7 +183,9 @@ def generate(n_samples, noise, n_repeat=1): plt.xlim([-5, 5]) plt.ylim([0, 0.1]) - if n == 0: - plt.legend(loc="upper left", prop={"size": 11}) + if n == n_estimators - 1: + + plt.legend(loc=(1.1, .5)) +plt.subplots_adjust(right=.75) plt.show() diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py index 5c8543937beba..f46b7ece7cd78 100644 --- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py +++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py @@ -65,7 +65,8 @@ print("Iteration %i %s" % (i, 70 * "_")) print("Label Spreading model: %d labeled & %d unlabeled (%d total)" - % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)) + % (n_labeled_points, n_total_samples - n_labeled_points, + n_total_samples)) print(classification_report(true_labels, predicted_labels)) @@ -95,7 +96,7 @@ # for more than 5 iterations, visualize the gain only on the first 5 if i < 5: sub = f.add_subplot(5, 5, index + 1 + (5 * i)) - sub.imshow(image, cmap=plt.cm.gray_r) + sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none') sub.set_title("predict: %i\ntrue: %i" % ( lp_model.transduction_[image_index], y[image_index]), size=10) sub.axis('off') @@ -108,6 +109,7 @@ n_labeled_points += len(uncertainty_index) f.suptitle("Active learning with Label Propagation.\nRows show 5 most " - "uncertain labels to learn with the next model.") -plt.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45) + "uncertain labels to learn with the next model.", y=1.15) +plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2, + hspace=0.85) plt.show() From b1b77ffbf310a83123e643f831bde8de25ba53c3 Mon Sep 17 00:00:00 2001 From: pasbi Date: Fri, 1 Sep 2017 11:10:01 +0200 Subject: [PATCH 05/10] Improve y parameter documentation for transformers (#9578) --- sklearn/decomposition/dict_learning.py | 12 +++++++++--- sklearn/decomposition/factor_analysis.py | 4 ++++ sklearn/decomposition/fastica_.py | 4 ++++ sklearn/decomposition/incremental_pca.py | 4 +++- sklearn/decomposition/nmf.py | 4 ++++ 
sklearn/decomposition/online_lda.py | 6 ++++++ sklearn/decomposition/pca.py | 10 ++++++++++ sklearn/decomposition/sparse_pca.py | 4 ++++ sklearn/decomposition/truncated_svd.py | 4 ++++ sklearn/manifold/isomap.py | 4 ++++ sklearn/manifold/locally_linear.py | 4 ++++ sklearn/manifold/mds.py | 4 ++++ sklearn/manifold/spectral_embedding_.py | 6 ++++++ sklearn/manifold/t_sne.py | 4 ++++ 14 files changed, 70 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 62cd2cd2aa101..7510efe508202 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -927,9 +927,9 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples in the number of samples - and n_features is the number of features. + X : Ignored. + + y : Ignored. Returns ------- @@ -1081,6 +1081,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- self : object @@ -1251,6 +1253,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- self : object @@ -1284,6 +1288,8 @@ def partial_fit(self, X, y=None, iter_offset=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + iter_offset : integer, optional The number of iteration on data batches that has been performed before this call to partial_fit. This is optional: diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index 4440ee90bd84a..1619d8e4da639 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -149,6 +149,8 @@ def fit(self, X, y=None): X : array-like, shape (n_samples, n_features) Training data. + y : Ignored. + Returns ------- self @@ -338,6 +340,8 @@ def score(self, X, y=None): X : array, shape (n_samples, n_features) The data + y : Ignored. + Returns ------- ll : float diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index fcc11ff643a5e..4af514bc327b2 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -509,6 +509,8 @@ def fit_transform(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- X_new : array-like, shape (n_samples, n_components) @@ -524,6 +526,8 @@ def fit(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- self diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index f0604001fab53..45828513bf95f 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -158,7 +158,7 @@ def fit(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. - y : Passthrough for ``Pipeline`` compatibility. + y : Ignored. Returns ------- @@ -199,6 +199,8 @@ def partial_fit(self, X, y=None, check_input=True): check_input : bool Run check_array on X. + y : Ignored. 
+ Returns ------- self : object diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 153731cb83651..a8a744d7ff5e1 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1211,6 +1211,8 @@ def fit_transform(self, X, y=None, W=None, H=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed + y : Ignored. + W : array-like, shape (n_samples, n_components) If init='custom', it is used as initial guess for the solution. @@ -1249,6 +1251,8 @@ def fit(self, X, y=None, **params): X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed + y : Ignored. + Returns ------- self diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index e9743c69422fb..84293145a1c61 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -473,6 +473,8 @@ def partial_fit(self, X, y=None): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. + y : Ignored. + Returns ------- self @@ -515,6 +517,8 @@ def fit(self, X, y=None): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. + y : Ignored. + Returns ------- self @@ -714,6 +718,8 @@ def score(self, X, y=None): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. + y : Ignored. + Returns ------- score : float diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 171774321cec0..bf167e4ae1b3c 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -319,6 +319,8 @@ def fit(self, X, y=None): Training data, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- self : object @@ -336,6 +338,8 @@ def fit_transform(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- X_new : array-like, shape (n_samples, n_components) @@ -550,6 +554,8 @@ def score(self, X, y=None): X : array, shape(n_samples, n_features) The data. + y : Ignored. + Returns ------- ll : float @@ -676,6 +682,8 @@ def fit(self, X, y=None): Training data, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- self : object @@ -762,6 +770,8 @@ def fit_transform(self, X, y=None): New data, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 47c03a80278b9..e0bd0debd04b5 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -107,6 +107,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. + Returns ------- self : object @@ -275,6 +277,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored. 
+ Returns ------- self : object diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index 87b8b45e1543a..14925db8e6e0e 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -132,6 +132,8 @@ def fit(self, X, y=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. + y : Ignored. + Returns ------- self : object @@ -148,6 +150,8 @@ def fit_transform(self, X, y=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. + y : Ignored. + Returns ------- X_new : array, shape (n_samples, n_components) diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index 1f6d0ae0dc0b1..6de1bfe7cdfb9 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -157,6 +157,8 @@ def fit(self, X, y=None): numpy array, precomputed tree, or NearestNeighbors object. + y: Ignored. + Returns ------- self : returns an instance of self. @@ -173,6 +175,8 @@ def fit_transform(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y: Ignored. + Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index e8705cff359a6..0cfeb04889907 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -652,6 +652,8 @@ def fit(self, X, y=None): X : array-like of shape [n_samples, n_features] training set. + y: Ignored. + Returns ------- self : returns an instance of self. @@ -667,6 +669,8 @@ def fit_transform(self, X, y=None): X : array-like of shape [n_samples, n_features] training set. + y: Ignored. + Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/manifold/mds.py b/sklearn/manifold/mds.py index 5f7327ef4dc84..c21a58689e8bc 100644 --- a/sklearn/manifold/mds.py +++ b/sklearn/manifold/mds.py @@ -379,6 +379,8 @@ def fit(self, X, y=None, init=None): Input data. If ``dissimilarity=='precomputed'``, the input should be the dissimilarity matrix. + y: Ignored. + init : ndarray, shape (n_samples,), optional, default: None Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly @@ -397,6 +399,8 @@ def fit_transform(self, X, y=None, init=None): Input data. If ``dissimilarity=='precomputed'``, the input should be the dissimilarity matrix. + y: Ignored. + init : ndarray, shape (n_samples,), optional, default: None Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index a330b7da7f856..7b64870aa4906 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -428,6 +428,8 @@ def _get_affinity_matrix(self, X, Y=None): Interpret X as precomputed adjacency graph computed from samples. + Y: Ignored. + Returns ------- affinity_matrix, shape (n_samples, n_samples) @@ -474,6 +476,8 @@ def fit(self, X, y=None): Interpret X as precomputed adjacency graph computed from samples. + Y: Ignored. + Returns ------- self : object @@ -514,6 +518,8 @@ def fit_transform(self, X, y=None): Interpret X as precomputed adjacency graph computed from samples. + Y: Ignored. 
+ Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py index 163e8340f7b29..83c0b363fb5a7 100644 --- a/sklearn/manifold/t_sne.py +++ b/sklearn/manifold/t_sne.py @@ -851,6 +851,8 @@ def fit_transform(self, X, y=None): If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. + y : Ignored. + Returns ------- X_new : array, shape (n_samples, n_components) @@ -870,6 +872,8 @@ def fit(self, X, y=None): matrix. Otherwise it contains a sample per row. If the method is 'exact', X may be a sparse matrix of type 'csr', 'csc' or 'coo'. + + y : Ignored. """ self.fit_transform(X) return self From ecc96be8c5e831fd0a12f3274ed4a31dabcbffe6 Mon Sep 17 00:00:00 2001 From: Pravar D Mahajan Date: Fri, 1 Sep 2017 05:26:19 -0400 Subject: [PATCH 06/10] [MRG] Raise exception on providing complex data to estimators (#9551) * Modifies model_selection.cross_validate docstring (#9534) - Fixes rendering of docstring examples - Instead of importing cross_val_score in example, cross_validate is imported * raise error on complex data input to estimators * Raise exception on providing complex data to estimators * adding checks to check_estimator for complex data * removing some unnecessary parts * autopep8 changes * removing ipdb, restoring some autopep8 fixes * removing ipdb, restoring some autopep8 fixes * adding documentation for complex data handling * adding one line explanation for each test case --- sklearn/utils/estimator_checks.py | 11 +++++++ sklearn/utils/tests/test_validation.py | 40 ++++++++++++++++++++++++++ sklearn/utils/validation.py | 28 +++++++++++++++++- 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 81f0d88e3f02b..3e7cb198a9d12 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -76,6 +76,7 @@ def _yield_non_meta_checks(name, estimator): yield check_sample_weights_pandas_series yield check_sample_weights_list yield check_estimators_fit_returns_self + yield check_complex_data # Check that all estimator yield informative messages when # trained on empty datasets @@ -458,6 +459,16 @@ def check_dtype_object(name, estimator_orig): assert_raises_regex(TypeError, msg, estimator.fit, X, y) +def check_complex_data(name, estimator_orig): + # check that estimators raise an exception on providing complex data + X = np.random.sample(10) + 1j * np.random.sample(10) + X = X.reshape(-1, 1) + y = np.random.sample(10) + 1j * np.random.sample(10) + estimator = clone(estimator_orig) + assert_raises_regex(ValueError, "Complex data not supported", + estimator.fit, X, y) + + @ignore_warnings def check_dict_unchanged(name, estimator_orig): # this estimator raises diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 6bebad884d835..dcfaa81178b79 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -437,6 +437,46 @@ def test_check_array_min_samples_and_features_messages(): assert_array_equal(y, y_checked) +def test_check_array_complex_data_error(): + # np array + X = np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]) + assert_raises_regexp( + ValueError, "Complex data not supported", check_array, X) + + # list of lists + X = [[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]] + assert_raises_regexp( + ValueError, "Complex data not supported", check_array, X) + + # tuple 
of tuples
+    X = ((1 + 2j, 3 + 4j, 5 + 7j), (2 + 3j, 4 + 5j, 6 + 7j))
+    assert_raises_regexp(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # list of np arrays
+    X = [np.array([1 + 2j, 3 + 4j, 5 + 7j]),
+         np.array([2 + 3j, 4 + 5j, 6 + 7j])]
+    assert_raises_regexp(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # tuple of np arrays
+    X = (np.array([1 + 2j, 3 + 4j, 5 + 7j]),
+         np.array([2 + 3j, 4 + 5j, 6 + 7j]))
+    assert_raises_regexp(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # dataframe
+    X = MockDataFrame(
+        np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]))
+    assert_raises_regexp(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # sparse matrix
+    X = sp.coo_matrix([[0, 1 + 2j], [0, 0]])
+    assert_raises_regexp(
+        ValueError, "Complex data not supported", check_array, X)
+
+
 def test_has_fit_parameter():
     assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight"))
     assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight"))
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 5847b540d7b6c..080c30fcf9b2c 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -13,6 +13,7 @@
 
 import numpy as np
 import scipy.sparse as sp
+from numpy.core.numeric import ComplexWarning
 
 from ..externals import six
 from ..utils.fixes import signature
@@ -307,6 +308,13 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
     return spmatrix
 
 
+def _ensure_no_complex_data(array):
+    if hasattr(array, 'dtype') and array.dtype is not None \
+            and hasattr(array.dtype, 'kind') and array.dtype.kind == "c":
+        raise ValueError("Complex data not supported\n"
+                         "{}\n".format(array))
+
+
 def check_array(array, accept_sparse=False, dtype="numeric", order=None,
                 copy=False, force_all_finite=True, ensure_2d=True,
                 allow_nd=False, ensure_min_samples=1, ensure_min_features=1,
@@ -427,10 +435,28 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None,
     context = " by %s" % estimator_name if estimator is not None else ""
 
     if sp.issparse(array):
+        _ensure_no_complex_data(array)
         array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
                                       force_all_finite)
     else:
-        array = np.array(array, dtype=dtype, order=order, copy=copy)
+        # If np.array(..) gives a ComplexWarning, then we convert the
+        # warning to an error. This is needed because specifying a
+        # non-complex dtype to the function converts complex to real dtype,
+        # thereby passing the test made in the lines following the scope
+        # of the warnings context manager.
+        with warnings.catch_warnings():
+            try:
+                warnings.simplefilter('error', ComplexWarning)
+                array = np.array(array, dtype=dtype, order=order, copy=copy)
+            except ComplexWarning:
+                raise ValueError("Complex data not supported\n"
+                                 "{}\n".format(array))
+
+        # It is possible that np.array(..) gave no warning. This happens
+        # when no dtype conversion happened, for example when dtype=None.
+        # The result is that np.array(..) produces an array of complex dtype
+        # and we need to catch and raise an exception for such cases.
+ _ensure_no_complex_data(array) if ensure_2d: if array.ndim == 1: From 846313b57009dd9f8340f5f7004bc2bf1119b709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 1 Sep 2017 12:53:59 +0200 Subject: [PATCH 07/10] [MRG+1] Deprecate sklearn.utils.testing.raises and remove it from tests (#9660) --- sklearn/datasets/tests/test_lfw.py | 18 +++-- .../datasets/tests/test_svmlight_format.py | 20 ++---- .../tests/test_gaussian_process.py | 5 +- sklearn/linear_model/tests/test_logistic.py | 5 +- sklearn/linear_model/tests/test_sgd.py | 67 ++++++++----------- sklearn/linear_model/tests/test_theil_sen.py | 18 ++--- sklearn/svm/tests/test_bounds.py | 8 +-- sklearn/tree/tests/test_tree.py | 4 +- sklearn/utils/testing.py | 11 ++- 9 files changed, 69 insertions(+), 87 deletions(-) diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py index 3e5875a060be1..ac6395c4958be 100644 --- a/sklearn/datasets/tests/test_lfw.py +++ b/sklearn/datasets/tests/test_lfw.py @@ -28,7 +28,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import SkipTest -from sklearn.utils.testing import raises +from sklearn.utils.testing import assert_raises SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_") @@ -110,10 +110,9 @@ def teardown_module(): shutil.rmtree(SCIKIT_LEARN_EMPTY_DATA) -@raises(IOError) def test_load_empty_lfw_people(): - fetch_lfw_people(data_home=SCIKIT_LEARN_EMPTY_DATA, - download_if_missing=False) + assert_raises(IOError, fetch_lfw_people, data_home=SCIKIT_LEARN_EMPTY_DATA, + download_if_missing=False) def test_load_fake_lfw_people(): @@ -148,16 +147,15 @@ def test_load_fake_lfw_people(): 'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez']) -@raises(ValueError) def test_load_fake_lfw_people_too_restrictive(): - fetch_lfw_people(data_home=SCIKIT_LEARN_DATA, min_faces_per_person=100, - download_if_missing=False) + assert_raises(ValueError, fetch_lfw_people, data_home=SCIKIT_LEARN_DATA, + min_faces_per_person=100, download_if_missing=False) -@raises(IOError) def test_load_empty_lfw_pairs(): - fetch_lfw_pairs(data_home=SCIKIT_LEARN_EMPTY_DATA, - download_if_missing=False) + assert_raises(IOError, fetch_lfw_pairs, + data_home=SCIKIT_LEARN_EMPTY_DATA, + download_if_missing=False) def test_load_fake_lfw_pairs(): diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index d688dc798237b..2e3b7982476b0 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -15,7 +15,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import raises from sklearn.utils.testing import assert_in from sklearn.utils.fixes import sp_version @@ -138,20 +137,17 @@ def test_load_compressed(): assert_array_equal(y, ybz) -@raises(ValueError) def test_load_invalid_file(): - load_svmlight_file(invalidfile) + assert_raises(ValueError, load_svmlight_file, invalidfile) -@raises(ValueError) def test_load_invalid_order_file(): - load_svmlight_file(invalidfile2) + assert_raises(ValueError, load_svmlight_file, invalidfile2) -@raises(ValueError) def test_load_zero_based(): f = BytesIO(b("-1 4:1.\n1 0:1\n")) - load_svmlight_file(f, zero_based=False) + assert_raises(ValueError, load_svmlight_file, f, zero_based=False) def test_load_zero_based_auto(): @@ 
-186,21 +182,19 @@ def test_load_with_qid(): assert_array_equal(X.toarray(), [[.53, .12], [.13, .1], [.87, .12]]) -@raises(ValueError) def test_load_invalid_file2(): - load_svmlight_files([datafile, invalidfile, datafile]) + assert_raises(ValueError, load_svmlight_files, + [datafile, invalidfile, datafile]) -@raises(TypeError) def test_not_a_filename(): # in python 3 integers are valid file opening arguments (taken as unix # file descriptors) - load_svmlight_file(.42) + assert_raises(TypeError, load_svmlight_file, .42) -@raises(IOError) def test_invalid_filename(): - load_svmlight_file("trou pic nic douille") + assert_raises(IOError, load_svmlight_file, "trou pic nic douille") def test_dump(): diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index 860e3f290f3ea..37d872fc99fb5 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -11,7 +11,7 @@ from sklearn.gaussian_process import regression_models as regression from sklearn.gaussian_process import correlation_models as correlation from sklearn.datasets import make_regression -from sklearn.utils.testing import assert_greater, assert_true, raises +from sklearn.utils.testing import assert_greater, assert_true, assert_raises f = lambda x: x * np.sin(x) @@ -95,10 +95,9 @@ def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential, assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) -@raises(ValueError) def test_wrong_number_of_outputs(): gp = GaussianProcess() - gp.fit([[1, 2, 3], [4, 5, 6]], [1, 2, 3]) + assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3]) def test_more_builtin_correlation_models(random_start=1): diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 94eb3ea3d2dcb..ea4300df01100 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -17,7 +17,6 @@ from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import raises from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model.logistic import ( @@ -249,13 +248,13 @@ def test_write_parameters(): assert_array_almost_equal(clf.decision_function(X), 0) -@raises(ValueError) def test_nan(): # Test proper NaN handling. # Regression test for Issue #252: fit used to go into an infinite loop. 
Xnan = np.array(X, dtype=np.float64) Xnan[0, 1] = np.nan - LogisticRegression(random_state=0).fit(Xnan, Y1) + logistic = LogisticRegression(random_state=0) + assert_raises(ValueError, logistic.fit, Xnan, Y1) def test_consistency_path(): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index f033a4f6021b2..d4552a9934cf1 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_less -from sklearn.utils.testing import raises from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false, assert_true from sklearn.utils.testing import assert_equal @@ -266,11 +265,11 @@ def test_late_onset_averaging_reached(self): decimal=16) assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) - @raises(ValueError) def test_sgd_bad_alpha_for_optimal_learning_rate(self): # Check whether expected ValueError on bad alpha, i.e. 0 # since alpha is used to compute the optimal learning rate - self.factory(alpha=0, learning_rate="optimal") + assert_raises(ValueError, self.factory, + alpha=0, learning_rate="optimal") class DenseSGDClassifierTestCase(unittest.TestCase, CommonTest): @@ -287,63 +286,56 @@ def test_sgd(self): # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) assert_array_equal(clf.predict(T), true_result) - @raises(ValueError) def test_sgd_bad_l1_ratio(self): # Check whether expected ValueError on bad l1_ratio - self.factory(l1_ratio=1.1) + assert_raises(ValueError, self.factory, l1_ratio=1.1) - @raises(ValueError) def test_sgd_bad_learning_rate_schedule(self): # Check whether expected ValueError on bad learning_rate - self.factory(learning_rate="") + assert_raises(ValueError, self.factory, learning_rate="") - @raises(ValueError) def test_sgd_bad_eta0(self): # Check whether expected ValueError on bad eta0 - self.factory(eta0=0, learning_rate="constant") + assert_raises(ValueError, self.factory, eta0=0, + learning_rate="constant") - @raises(ValueError) def test_sgd_bad_alpha(self): # Check whether expected ValueError on bad alpha - self.factory(alpha=-.1) + assert_raises(ValueError, self.factory, alpha=-.1) - @raises(ValueError) def test_sgd_bad_penalty(self): # Check whether expected ValueError on bad penalty - self.factory(penalty='foobar', l1_ratio=0.85) + assert_raises(ValueError, self.factory, penalty='foobar', + l1_ratio=0.85) - @raises(ValueError) def test_sgd_bad_loss(self): # Check whether expected ValueError on bad loss - self.factory(loss="foobar") + assert_raises(ValueError, self.factory, loss="foobar") - @raises(ValueError) def test_sgd_max_iter_param(self): # Test parameter validity check - self.factory(max_iter=-10000) + assert_raises(ValueError, self.factory, max_iter=-10000) - @raises(ValueError) def test_sgd_shuffle_param(self): # Test parameter validity check - self.factory(shuffle="false") + assert_raises(ValueError, self.factory, shuffle="false") - @raises(TypeError) def test_argument_coef(self): # Checks coef_init not allowed as model argument (only fit) - # Provided coef_ does not match dataset. 
- self.factory(coef_init=np.zeros((3,))).fit(X, Y) + # Provided coef_ does not match dataset + assert_raises(TypeError, self.factory, coef_init=np.zeros((3,))) - @raises(ValueError) def test_provide_coef(self): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. - self.factory().fit(X, Y, coef_init=np.zeros((3,))) + assert_raises(ValueError, self.factory().fit, + X, Y, coef_init=np.zeros((3,))) - @raises(ValueError) def test_set_intercept(self): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. - self.factory().fit(X, Y, intercept_init=np.zeros((3,))) + assert_raises(ValueError, self.factory().fit, + X, Y, intercept_init=np.zeros((3,))) def test_set_intercept_binary(self): # Checks intercept_ shape for the warm starts in binary case @@ -386,10 +378,10 @@ def test_set_intercept_to_intercept(self): clf = self.factory().fit(X, Y) self.factory().fit(X, Y, intercept_init=clf.intercept_) - @raises(ValueError) def test_sgd_at_least_two_labels(self): # Target must have at least two labels - self.factory(alpha=0.01, max_iter=20).fit(X2, np.ones(9)) + clf = self.factory(alpha=0.01, max_iter=20) + assert_raises(ValueError, clf.fit, X2, np.ones(9)) def test_partial_fit_weight_class_balanced(self): # partial_fit with class_weight='balanced' not supported""" @@ -607,17 +599,15 @@ def test_equal_class_weight(self): # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) - @raises(ValueError) def test_wrong_class_weight_label(self): # ValueError due to not existing class label. clf = self.factory(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) - clf.fit(X, Y) + assert_raises(ValueError, clf.fit, X, Y) - @raises(ValueError) def test_wrong_class_weight_format(self): # ValueError due to wrong class_weight argument type. 
clf = self.factory(alpha=0.1, max_iter=1000, class_weight=[0.5]) - clf.fit(X, Y) + assert_raises(ValueError, clf.fit, X, Y) def test_weights_multiplied(self): # Tests that class_weight and sample_weight are multiplicative @@ -700,18 +690,16 @@ def test_sample_weights(self): # the prediction on this point should shift assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) - @raises(ValueError) def test_wrong_sample_weights(self): # Test if ValueError is raised if sample_weight has wrong shape clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long - clf.fit(X, Y, sample_weight=np.arange(7)) + assert_raises(ValueError, clf.fit, X, Y, sample_weight=np.arange(7)) - @raises(ValueError) def test_partial_fit_exception(self): clf = self.factory(alpha=0.01) # classes was not specified - clf.partial_fit(X3, Y3) + assert_raises(ValueError, clf.partial_fit, X3, Y3) def test_partial_fit_binary(self): third = X.shape[0] // 3 @@ -851,15 +839,14 @@ def test_sgd(self): clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) assert_equal(clf.coef_[0], clf.coef_[1]) - @raises(ValueError) def test_sgd_bad_penalty(self): # Check whether expected ValueError on bad penalty - self.factory(penalty='foobar', l1_ratio=0.85) + assert_raises(ValueError, self.factory, + penalty='foobar', l1_ratio=0.85) - @raises(ValueError) def test_sgd_bad_loss(self): # Check whether expected ValueError on bad loss - self.factory(loss="foobar") + assert_raises(ValueError, self.factory, loss="foobar") def test_sgd_averaged_computed_correctly(self): # Tests the average regressor matches the naive implementation diff --git a/sklearn/linear_model/tests/test_theil_sen.py b/sklearn/linear_model/tests/test_theil_sen.py index 279beb8014e95..3a2b1f9dc006f 100644 --- a/sklearn/linear_model/tests/test_theil_sen.py +++ b/sklearn/linear_model/tests/test_theil_sen.py @@ -20,7 +20,7 @@ from sklearn.linear_model.theil_sen import _spatial_median, _breakdown_point from sklearn.linear_model.theil_sen import _modified_weiszfeld_step from sklearn.utils.testing import ( - assert_almost_equal, assert_greater, assert_less, raises, + assert_almost_equal, assert_greater, assert_less, assert_raises, ) @@ -202,31 +202,31 @@ def test_calc_breakdown_point(): assert_less(np.abs(bp - 1 + 1 / (np.sqrt(2))), 1.e-6) -@raises(ValueError) def test_checksubparams_negative_subpopulation(): X, y, w, c = gen_toy_problem_1d() - TheilSenRegressor(max_subpopulation=-1, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(max_subpopulation=-1, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) -@raises(ValueError) def test_checksubparams_too_few_subsamples(): X, y, w, c = gen_toy_problem_1d() - TheilSenRegressor(n_subsamples=1, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(n_subsamples=1, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) -@raises(ValueError) def test_checksubparams_too_many_subsamples(): X, y, w, c = gen_toy_problem_1d() - TheilSenRegressor(n_subsamples=101, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(n_subsamples=101, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) -@raises(ValueError) def test_checksubparams_n_subsamples_if_less_samples_than_features(): random_state = np.random.RandomState(0) n_samples, n_features = 10, 20 X = random_state.normal(size=(n_samples, n_features)) y = random_state.normal(size=n_samples) - TheilSenRegressor(n_subsamples=9, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(n_subsamples=9, random_state=0) + 
assert_raises(ValueError, theil_sen.fit, X, y) def test_subpopulation(): diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 583c413bc5c11..e46dbb92df44a 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -5,7 +5,7 @@ from sklearn.svm import LinearSVC from sklearn.linear_model.logistic import LogisticRegression -from sklearn.utils.testing import assert_true, raises +from sklearn.utils.testing import assert_true, assert_raises from sklearn.utils.testing import assert_raise_message @@ -63,13 +63,11 @@ def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): (np.asarray(clf.intercept_) != 0).any()) -@raises(ValueError) def test_ill_posed_min_c(): X = [[0, 0], [0, 0]] y = [0, 1] - l1_min_c(X, y) + assert_raises(ValueError, l1_min_c, X, y) -@raises(ValueError) def test_unsupported_loss(): - l1_min_c(dense_X, Y1, 'l1') + assert_raises(ValueError, l1_min_c, dense_X, Y1, 'l1') diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 97eee80ecff71..71ee8fa2bcb61 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -30,7 +30,6 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import raises from sklearn.utils.testing import ignore_warnings from sklearn.utils.validation import check_random_state @@ -394,11 +393,10 @@ def test_importances(): clf2.feature_importances_) -@raises(ValueError) def test_importances_raises(): # Check if variable importance before fit raises ValueError. clf = DecisionTreeClassifier() - clf.feature_importances_ + assert_raises(ValueError, getattr, clf, 'feature_importances_') def test_importances_gini_equal_mse(): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 4e7f7ea3e98a3..c5467f199697f 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -45,8 +45,17 @@ import sklearn from sklearn.base import BaseEstimator from sklearn.externals import joblib +from sklearn.utils import deprecated -from nose.tools import raises +try: + from nose.tools import raises as _nose_raises + deprecation_message = ( + 'sklearn.utils.testing.raises has been deprecated in version 0.20 ' + 'and will be removed in 0.22. Please use ' + 'sklearn.utils.testing.assert_raises instead.') + raises = deprecated(deprecation_message)(_nose_raises) +except ImportError: + pass from nose import with_setup from numpy.testing import assert_almost_equal From 6e01feff14e2016253b5ada96964a1b5b6145128 Mon Sep 17 00:00:00 2001 From: Minghui Liu Date: Fri, 1 Sep 2017 13:13:12 +0200 Subject: [PATCH 08/10] OPTIM make GaussianProcessRegressor faster with return_std=True --- doc/whats_new.rst | 8 ++++++++ sklearn/gaussian_process/gpr.py | 17 ++++++++++++----- sklearn/gaussian_process/tests/test_gpr.py | 22 +++++++++++++++++++++- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0ca707ce2cbbf..258d6acc11aa8 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -43,6 +43,14 @@ Classifiers and regressors Enhancements ............ +Classifiers and regressors + +- In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict`` + is faster when using ``return_std=True`` in particular more when called + several times in a row. :issue:`9234` by :user:`andrewww ` + and :user:`Minghui Liu `. 
+ + Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py index 4f9ff9cee7911..c92ca7f68f368 100644 --- a/sklearn/gaussian_process/gpr.py +++ b/sklearn/gaussian_process/gpr.py @@ -245,6 +245,8 @@ def obj_func(theta, eval_gradient=True): K[np.diag_indices_from(K)] += self.alpha try: self.L_ = cholesky(K, lower=True) # Line 2 + # self.L_ changed, self._K_inv needs to be recomputed + self._K_inv = None except np.linalg.LinAlgError as exc: exc.args = ("The kernel, %s, is not returning a " "positive definite matrix. Try gradually " @@ -320,13 +322,18 @@ def predict(self, X, return_std=False, return_cov=False): y_cov = self.kernel_(X) - K_trans.dot(v) # Line 6 return y_mean, y_cov elif return_std: - # compute inverse K_inv of K based on its Cholesky - # decomposition L and its inverse L_inv - L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0])) - K_inv = L_inv.dot(L_inv.T) + # cache result of K_inv computation + if self._K_inv is None: + # compute inverse K_inv of K based on its Cholesky + # decomposition L and its inverse L_inv + L_inv = solve_triangular(self.L_.T, + np.eye(self.L_.shape[0])) + self._K_inv = L_inv.dot(L_inv.T) + # Compute variance of predictive distribution y_var = self.kernel_.diag(X) - y_var -= np.einsum("ij,ij->i", np.dot(K_trans, K_inv), K_trans) + y_var -= np.einsum("ij,ij->i", + np.dot(K_trans, self._K_inv), K_trans) # Check if any of the variances is negative because of # numerical issues. If yes: set the variance to 0. diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index b645a6be18e22..602b2b88ae9c9 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -15,11 +15,13 @@ from sklearn.utils.testing \ import (assert_true, assert_greater, assert_array_less, assert_almost_equal, assert_equal, assert_raise_message, - assert_array_almost_equal) + assert_array_almost_equal, assert_array_equal) def f(x): return x * np.sin(x) + + X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T y = f(X).ravel() @@ -344,3 +346,21 @@ def test_no_fit_default_predict(): assert_array_almost_equal(y_std1, y_std2) assert_array_almost_equal(y_cov1, y_cov2) + + +def test_K_inv_reset(): + y2 = f(X2).ravel() + for kernel in kernels: + # Test that self._K_inv is reset after a new fit + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_true(hasattr(gpr, '_K_inv')) + assert_true(gpr._K_inv is None) + gpr.predict(X, return_std=True) + assert_true(gpr._K_inv is not None) + gpr.fit(X2, y2) + assert_true(gpr._K_inv is None) + gpr.predict(X2, return_std=True) + gpr2 = GaussianProcessRegressor(kernel=kernel).fit(X2, y2) + gpr2.predict(X2, return_std=True) + # the value of K_inv should be independent of the first fit + assert_array_equal(gpr._K_inv, gpr2._K_inv) From deaa96452a981e3e54dc302fc14cb1c83cb2e399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 1 Sep 2017 14:19:19 +0200 Subject: [PATCH 09/10] Fix test_validation.py --- sklearn/utils/tests/test_validation.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index dcfaa81178b79..37a0eb859f565 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -438,42 +438,41 @@ def 
test_check_array_min_samples_and_features_messages(): def test_check_array_complex_data_error(): - # np array X = np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]) - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) # list of lists X = [[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]] - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) # tuple of tuples X = ((1 + 2j, 3 + 4j, 5 + 7j), (2 + 3j, 4 + 5j, 6 + 7j)) - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) # list of np arrays X = [np.array([1 + 2j, 3 + 4j, 5 + 7j]), np.array([2 + 3j, 4 + 5j, 6 + 7j])] - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) # tuple of np arrays X = (np.array([1 + 2j, 3 + 4j, 5 + 7j]), np.array([2 + 3j, 4 + 5j, 6 + 7j])) - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) # dataframe X = MockDataFrame( np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]])) - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) # sparse matrix X = sp.coo_matrix([[0, 1 + 2j], [0, 0]]) - assert_raises_regexp( + assert_raises_regex( ValueError, "Complex data not supported", check_array, X) From 233a3e53478aaeb4999728d19402580d6302c726 Mon Sep 17 00:00:00 2001 From: RAKOTOARISON Herilalaina Date: Sun, 3 Sep 2017 00:54:35 +0200 Subject: [PATCH 10/10] ENH Add named_estimator_ for votingClassifier (#9168) --- doc/whats_new.rst | 4 ++++ sklearn/ensemble/tests/test_voting_classifier.py | 7 +++++++ sklearn/ensemble/voting_classifier.py | 14 +++++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 258d6acc11aa8..88aa6cd7c0404 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -50,6 +50,10 @@ Classifiers and regressors several times in a row. :issue:`9234` by :user:`andrewww ` and :user:`Minghui Liu `. +- Add `named_estimators_` parameter in + :class:`sklearn.ensemble.voting_classifier` to access fitted + estimators. :issue:`9157` by :user:`Herilalaina Rakotoarison `. 
+ Model evaluation and meta-estimators diff --git a/sklearn/ensemble/tests/test_voting_classifier.py b/sklearn/ensemble/tests/test_voting_classifier.py index 023be79912d12..22665384ed7ce 100644 --- a/sklearn/ensemble/tests/test_voting_classifier.py +++ b/sklearn/ensemble/tests/test_voting_classifier.py @@ -296,7 +296,14 @@ def test_set_params(): clf3 = GaussianNB() eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft', weights=[1, 2]) + assert_true('lr' in eclf1.named_estimators) + assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1]) + assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr']) eclf1.fit(X, y) + assert_true('lr' in eclf1.named_estimators_) + assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0]) + assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr']) + eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft', weights=[1, 2]) eclf2.set_params(nb=clf2).fit(X, y) diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py index ad6c0125dd664..26bc8e66df01a 100644 --- a/sklearn/ensemble/voting_classifier.py +++ b/sklearn/ensemble/voting_classifier.py @@ -21,6 +21,7 @@ from ..externals.joblib import Parallel, delayed from ..utils.validation import has_fit_parameter, check_is_fitted from ..utils.metaestimators import _BaseComposition +from ..utils import Bunch def _parallel_fit_estimator(estimator, X, y, sample_weight=None): @@ -75,6 +76,11 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin): The collection of fitted sub-estimators as defined in ``estimators`` that are not `None`. + named_estimators_ : Bunch object, a dictionary with attribute access + Attribute to access any fitted sub-estimators by name. + + .. versionadded:: 0.20 + classes_ : array-like, shape = [n_predictions] The classes labels. @@ -94,6 +100,9 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin): >>> eclf1 = eclf1.fit(X, y) >>> print(eclf1.predict(X)) [1 1 1 2 2 2] + >>> np.array_equal(eclf1.named_estimators_.lr.predict(X), + ... eclf1.named_estimators_['lr'].predict(X)) + True >>> eclf2 = VotingClassifier(estimators=[ ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], ... voting='soft') @@ -122,7 +131,7 @@ def __init__(self, estimators, voting='hard', weights=None, n_jobs=1, @property def named_estimators(self): - return dict(self.estimators) + return Bunch(**dict(self.estimators)) def fit(self, X, y, sample_weight=None): """ Fit the estimators. @@ -188,6 +197,9 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight) for clf in clfs if clf is not None) + self.named_estimators_ = Bunch(**dict()) + for k, e in zip(self.estimators, self.estimators_): + self.named_estimators_[k[0]] = e return self @property
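A quick usage sketch of the new attribute (a hypothetical snippet assuming a build with this patch applied; the synthetic data and choice of sub-estimators are illustrative, mirroring the doctest above):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    X, y = make_classification(random_state=0)
    eclf = VotingClassifier(estimators=[('lr', LogisticRegression()),
                                        ('gnb', GaussianNB())])
    eclf = eclf.fit(X, y)
    # named_estimators_ is a Bunch, so the *fitted* sub-estimators are
    # reachable both by attribute and by key, and line up with estimators_.
    assert eclf.named_estimators_.lr is eclf.named_estimators_['lr']
    assert eclf.named_estimators_.lr is eclf.estimators_[0]
    print(np.array_equal(eclf.named_estimators_.lr.predict(X),
                         eclf.estimators_[0].predict(X)))  # True

Returning a Bunch rather than a plain dict keeps dict-style lookup working while adding attribute access, which is why the test in this patch checks both forms.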