diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e03bc74..d0c583f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,6 +30,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ ubuntu-latest, macOS-latest, windows-latest ] python-version: [ '3.8', '3.9', '3.10' ] diff --git a/pyproject.toml b/pyproject.toml index cb94c1e..113f86e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "tsml" -version = "0.0.6" +version = "0.0.7" description = "A toolkit for time series machine learning algorithms." authors = [ {name = "Matthew Middlehurst", email = "m.middlehurst@uea.ac.uk"}, @@ -38,6 +38,7 @@ dependencies = [ "numba>=0.55", "numpy>=1.21.0", "scikit-learn>=1.0.2", + "pandas", ] [project.optional-dependencies] diff --git a/tsml/__init__.py b/tsml/__init__.py index 31ae2af..e1e3ccb 100644 --- a/tsml/__init__.py +++ b/tsml/__init__.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- """tsml.""" -__version__ = "0.0.6" +__version__ = "0.0.7" diff --git a/tsml/dummy/_dummy.py b/tsml/dummy/_dummy.py index 9cb4035..a068658 100644 --- a/tsml/dummy/_dummy.py +++ b/tsml/dummy/_dummy.py @@ -10,7 +10,7 @@ from sklearn.dummy import DummyRegressor as SklearnDummyRegressor from sklearn.utils import check_random_state from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted +from sklearn.utils.validation import _num_samples, check_is_fitted from tsml.base import BaseTimeSeriesEstimator @@ -85,8 +85,11 @@ class prior probabilities. 0.5 """ - def __init__(self, strategy="prior", random_state=None, constant=None): + def __init__( + self, strategy="prior", validate=False, random_state=None, constant=None + ): self.strategy = strategy + self.validate = validate self.random_state = random_state self.constant = constant @@ -94,25 +97,28 @@ def __init__(self, strategy="prior", random_state=None, constant=None): def fit(self, X, y): """""" - X, y = self._validate_data(X=X, y=y, ensure_min_series_length=1) + if self.validate: + X, y = self._validate_data(X=X, y=y, ensure_min_series_length=1) - check_classification_targets(y) + check_classification_targets(y) - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, classVal in enumerate(self.classes_): - self.class_dictionary_[classVal] = index + self.classes_ = np.unique(np.asarray(y)) - if self.n_classes_ == 1: - return self + if self.validate: + self.n_classes_ = self.classes_.shape[0] + self.class_dictionary_ = {} + for index, classVal in enumerate(self.classes_): + self.class_dictionary_[classVal] = index - self._clf = SklearnDummyClassifier( + if self.n_classes_ == 1: + return self + + self.clf_ = SklearnDummyClassifier( strategy=self.strategy, random_state=self.random_state, constant=self.constant, ) - self._clf.fit(None, y) + self.clf_.fit(None, y) return self @@ -120,30 +126,36 @@ def predict(self, X) -> np.ndarray: """""" check_is_fitted(self) - X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) + if self.validate: + # treat case of single class seen in fit + if self.n_classes_ == 1: + return np.repeat( + list(self.class_dictionary_.keys()), X.shape[0], axis=0 + ) - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) + X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) - return self._clf.predict(np.zeros(X.shape)) + return self.clf_.predict(np.zeros((_num_samples(X), 2))) def predict_proba(self, X) -> np.ndarray: """""" check_is_fitted(self) - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) + if self.validate: + # treat case of single class seen in fit + if self.n_classes_ == 1: + return np.repeat([[1]], X.shape[0], axis=0) - X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) + X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) - return self._clf.predict_proba(np.zeros(X.shape)) + return self.clf_.predict_proba(np.zeros((_num_samples(X), 2))) def _more_tags(self): return { "X_types": ["3darray", "2darray", "np_list"], "equal_length_only": False, + "no_validation": not self.validate, + "allow_nan": True, } @@ -199,8 +211,9 @@ class DummyRegressor(RegressorMixin, BaseTimeSeriesEstimator): -0.07184048625633688 """ - def __init__(self, strategy="mean", constant=None, quantile=None): + def __init__(self, strategy="mean", validate=False, constant=None, quantile=None): self.strategy = strategy + self.validate = validate self.constant = constant self.quantile = quantile @@ -208,12 +221,13 @@ def __init__(self, strategy="mean", constant=None, quantile=None): def fit(self, X, y): """""" - _, y = self._validate_data(X=X, y=y, ensure_min_series_length=1) + if self.validate: + _, y = self._validate_data(X=X, y=y, ensure_min_series_length=1) - self._reg = SklearnDummyRegressor( + self.reg_ = SklearnDummyRegressor( strategy=self.strategy, constant=self.constant, quantile=self.quantile ) - self._reg.fit(None, y) + self.reg_.fit(None, y) return self @@ -221,14 +235,17 @@ def predict(self, X): """""" check_is_fitted(self) - X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) + if self.validate: + X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) - return self._reg.predict(np.zeros(X.shape)) + return self.reg_.predict(np.zeros((_num_samples(X), 2))) def _more_tags(self): return { "X_types": ["3darray", "2darray", "np_list"], "equal_length_only": False, + "no_validation": not self.validate, + "allow_nan": True, } @@ -257,8 +274,11 @@ class DummyClusterer(ClusterMixin, BaseTimeSeriesEstimator): 0.2087729039422543 """ - def __init__(self, strategy="single", n_clusters=2, random_state=None): + def __init__( + self, strategy="single", validate=False, n_clusters=2, random_state=None + ): self.strategy = strategy + self.validate = validate self.n_clusters = n_clusters self.random_state = random_state @@ -266,7 +286,8 @@ def __init__(self, strategy="single", n_clusters=2, random_state=None): def fit(self, X, y=None): """""" - X = self._validate_data(X=X, ensure_min_series_length=1) + if self.validate: + X = self._validate_data(X=X, ensure_min_series_length=1) if self.strategy == "single": self.labels_ = np.zeros(len(X), dtype=np.int32) @@ -284,15 +305,16 @@ def predict(self, X): """""" check_is_fitted(self) - X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) + if self.validate: + X = self._validate_data(X=X, reset=False, ensure_min_series_length=1) if self.strategy == "single": - return np.zeros(len(X), dtype=np.int32) + return np.zeros(_num_samples(X), dtype=np.int32) elif self.strategy == "unique": - return np.arange(len(X), dtype=np.int32) + return np.arange(_num_samples(X), dtype=np.int32) elif self.strategy == "random": rng = check_random_state(self.random_state) - return rng.randint(self.n_clusters, size=len(X), dtype=np.int32) + return rng.randint(self.n_clusters, size=_num_samples(X), dtype=np.int32) else: raise ValueError(f"Unknown strategy {self.strategy}") @@ -300,4 +322,6 @@ def _more_tags(self): return { "X_types": ["3darray", "2darray", "np_list"], "equal_length_only": False, + "no_validation": not self.validate, + "allow_nan": True, } diff --git a/tsml/interval_based/_base.py b/tsml/interval_based/_base.py index a455ab1..9e594de 100644 --- a/tsml/interval_based/_base.py +++ b/tsml/interval_based/_base.py @@ -214,6 +214,8 @@ def fit(self, X, y): X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) X = self._convert_X(X) + rng = check_random_state(self.random_state) + self.n_instances_, self.n_dims_, self.series_length_ = X.shape if is_classifier(self): check_classification_targets(y) @@ -260,9 +262,7 @@ def fit(self, X, y): self._series_transformers = [None] # clone series_transformers if it is a transformer and transform the input data elif is_transformer(self.series_transformers): - t = _clone_estimator( - self.series_transformers, random_state=self.random_state - ) + t = _clone_estimator(self.series_transformers, random_state=rng) Xt = [t.fit_transform(X, y)] self._series_transformers = [t] # clone each series_transformers transformer and include the base series if None @@ -276,7 +276,7 @@ def fit(self, X, y): Xt.append(X) self._series_transformers.append(None) elif is_transformer(transformer): - t = _clone_estimator(transformer, random_state=self.random_state) + t = _clone_estimator(transformer, random_state=rng) Xt.append(t.fit_transform(X, y)) self._series_transformers.append(t) else: @@ -458,7 +458,8 @@ def fit(self, X, y): # single transformer or function for all series_transformers if is_transformer(self.interval_features): self._interval_transformer = [True] * len(Xt) - self._interval_features = [[self.interval_features]] * len(Xt) + transformer = _clone_estimator(self.interval_features, random_state=rng) + self._interval_features = [[transformer]] * len(Xt) elif callable(self.interval_features): self._interval_function = [True] * len(Xt) self._interval_features = [[self.interval_features]] * len(Xt) @@ -491,6 +492,7 @@ def fit(self, X, y): for method in feature: if is_transformer(method): self._interval_transformer[i] = True + feature = _clone_estimator(feature, random_state=rng) elif callable(method): self._interval_function[i] = True else: @@ -503,6 +505,7 @@ def fit(self, X, y): self._interval_features.append(feature) elif is_transformer(feature): self._interval_transformer[i] = True + feature = _clone_estimator(feature, random_state=rng) self._interval_features.append([feature]) elif callable(feature): self._interval_function[i] = True @@ -1030,4 +1033,3 @@ def _predict_for_estimator(self, Xt, estimator, intervals, predict_proba=False): return estimator.predict_proba(interval_features) else: return estimator.predict(interval_features) - diff --git a/tsml/interval_based/_interval_pipelines.py b/tsml/interval_based/_interval_pipelines.py index cc29379..9f2f750 100644 --- a/tsml/interval_based/_interval_pipelines.py +++ b/tsml/interval_based/_interval_pipelines.py @@ -99,6 +99,7 @@ def fit(self, X, y): X, y = self._validate_data( X=X, y=y, ensure_min_samples=2, ensure_min_series_length=3 ) + X = self._convert_X(X) self.n_instances_, self.n_dims_, self.series_length_ = X.shape self.classes_ = np.unique(y) @@ -152,6 +153,7 @@ def predict(self, X) -> np.ndarray: check_is_fitted(self) X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) + X = self._convert_X(X) return self._estimator.predict(self._transformer.transform(X)) @@ -171,6 +173,7 @@ def predict_proba(self, X) -> np.ndarray: check_is_fitted(self) X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) + X = self._convert_X(X) m = getattr(self._estimator, "predict_proba", None) if callable(m): @@ -311,6 +314,7 @@ def fit(self, X, y): X, y = self._validate_data( X=X, y=y, ensure_min_samples=2, ensure_min_series_length=3 ) + X = self._convert_X(X) self.n_instances_, self.n_dims_, self.series_length_ = X.shape @@ -359,6 +363,7 @@ def predict(self, X) -> np.ndarray: check_is_fitted(self) X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) + X = self._convert_X(X) return self._estimator.predict(self._transformer.transform(X)) @@ -489,6 +494,7 @@ def fit(self, X, y): X, y = self._validate_data( X=X, y=y, ensure_min_samples=2, ensure_min_series_length=7 ) + X = self._convert_X(X) self.n_instances_, self.n_dims_, self.series_length_ = X.shape self.classes_ = np.unique(y) @@ -542,6 +548,7 @@ def predict(self, X) -> np.ndarray: check_is_fitted(self) X = self._validate_data(X=X, reset=False, ensure_min_series_length=7) + X = self._convert_X(X) return self._estimator.predict(self._transformer.transform(X)) @@ -561,6 +568,7 @@ def predict_proba(self, X) -> np.ndarray: check_is_fitted(self) X = self._validate_data(X=X, reset=False, ensure_min_series_length=7) + X = self._convert_X(X) m = getattr(self._estimator, "predict_proba", None) if callable(m): diff --git a/tsml/tests/_sklearn_checks.py b/tsml/tests/_sklearn_checks.py index ec01e37..93324d5 100644 --- a/tsml/tests/_sklearn_checks.py +++ b/tsml/tests/_sklearn_checks.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Patched estimator checks originating from scikit-learn""" +"""Patched estimator checks originating from scikit-learn.""" __author__ = ["MatthewMiddlehurst"] @@ -10,103 +10,51 @@ import joblib import numpy as np -from numpy.testing import ( - assert_array_almost_equal, - assert_array_equal, - assert_array_less, -) +from numpy.testing import assert_array_almost_equal, assert_array_equal from scipy.stats import rankdata from sklearn import clone -from sklearn.base import is_classifier -from sklearn.datasets import make_multilabel_classification, make_regression from sklearn.exceptions import DataConversionWarning, NotFittedError -from sklearn.metrics.pairwise import linear_kernel, pairwise_distances, rbf_kernel -from sklearn.model_selection import ShuffleSplit, train_test_split +from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import scale -from sklearn.utils import IS_PYPY, shuffle +from sklearn.utils import shuffle from sklearn.utils._testing import ( SkipTest, - _get_args, assert_allclose, - assert_allclose_dense_sparse, - assert_raise_message, create_memmap_backed_data, ignore_warnings, raises, set_random_state, ) -from sklearn.utils.estimator_checks import ( - _choose_check_classifiers_labels, - _enforce_estimator_tags_y, - _is_pairwise_metric, - _is_public_parameter, - _NotAnArray, - _regression_dataset, - check_estimators_data_not_an_array, -) -from sklearn.utils.metaestimators import _safe_split -from sklearn.utils.validation import _num_samples, check_is_fitted, has_fit_parameter +from sklearn.utils.estimator_checks import _is_public_parameter, _NotAnArray +from sklearn.utils.validation import _num_samples, check_is_fitted import tsml.utils.testing as test_utils from tsml.utils._tags import _DEFAULT_TAGS, _safe_tags -def _enforce_estimator_tags_X(estimator, X, kernel=linear_kernel): - # Estimators with `1darray` in `X_types` tag only accept - # X of shape (`n_samples`,) - if "1darray" in _safe_tags(estimator, key="X_types"): - X = X[:, 0] - # Estimators with a `requires_positive_X` tag only accept - # strictly positive data - if _safe_tags(estimator, key="requires_positive_X"): - X = X - X.min() - if "categorical" in _safe_tags(estimator, key="X_types"): - X = (X - X.min()).astype(np.int32) - - if estimator.__class__.__name__ == "SkewedChi2Sampler": - # SkewedChi2Sampler requires X > -skewdness in transform - X = X - X.min() - - # Pairwise estimators only accept - # X of shape (`n_samples`, `n_samples`) - if _is_pairwise_metric(estimator): - X = pairwise_distances(X, metric="euclidean") - elif _safe_tags(estimator, key="pairwise"): - X = kernel(X, X) - return X - - @ignore_warnings(category=FutureWarning) def check_supervised_y_no_nan(name, estimator_orig): - """ - Checks that the Estimator targets are not NaN. + """Check that the Estimator targets are not NaN. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - estimator = clone(estimator_orig) X, _ = test_utils.generate_3d_test_data() + estimator = clone(estimator_orig) + for value in [np.nan, np.inf]: y = np.full(10, value) - y = _enforce_estimator_tags_y(estimator, y) - module_name = estimator.__module__ - if module_name.startswith("sklearn.") and not ( - "test_" in module_name or module_name.endswith("_testing") - ): - # In scikit-learn we want the error message to mention the input - # name and be specific about the kind of unexpected value. - if np.isinf(value): - match = ( - r"Input (y|Y) contains infinity or a value too large for" - r" dtype\('float64'\)." - ) - else: - match = r"Input (y|Y) contains NaN." + if np.isinf(value): + match = ( + r"Input (y|Y) contains infinity or a value too large for" + r" dtype\('float64'\)." + ) else: - # Do not impose a particular error message to third-party libraries. - match = None + match = r"Input (y|Y) contains NaN." + err_msg = ( f"Estimator {name} should have raised error on fitting array y with inf" " value." @@ -116,253 +64,96 @@ def check_supervised_y_no_nan(name, estimator_orig): @ignore_warnings(category=FutureWarning) -def check_sample_weights_not_an_array(name, estimator_orig): - """ - check that estimators will accept a 'sample_weight' parameter of type _NotAnArray - in the 'fit' function. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - estimator = clone(estimator_orig) - X = _NotAnArray(_enforce_estimator_tags_X(estimator_orig, X)) - y = _NotAnArray(y) - weights = _NotAnArray([1] * 12) - if _safe_tags(estimator, key="multioutput_only"): - y = _NotAnArray(y.data.reshape(-1, 1)) - estimator.fit(X, y, sample_weight=weights) - - -@ignore_warnings(category=FutureWarning) -def check_sample_weights_list(name, estimator_orig): - """ - check that estimators will accept a 'sample_weight' parameter of - type list in the 'fit' function. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - estimator = clone(estimator_orig) - rnd = np.random.RandomState(0) - n_samples = 30 - X = _enforce_estimator_tags_X(estimator_orig, X) - y = _enforce_estimator_tags_y(estimator, y) - sample_weight = [3] * n_samples - # Test that estimators don't raise any exception - estimator.fit(X, y, sample_weight=sample_weight) - - -@ignore_warnings(category=FutureWarning) -def check_sample_weights_shape(name, estimator_orig): - """ - check that estimators raise an error if sample_weight - shape mismatches the input - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - - estimator.fit(X, y, sample_weight=np.ones(len(y))) - - with raises(ValueError): - estimator.fit(X, y, sample_weight=np.ones(2 * len(y))) - - with raises(ValueError): - estimator.fit(X, y, sample_weight=np.ones((len(y), 2))) - - -@ignore_warnings(category=FutureWarning) -def check_sample_weights_invariance(name, estimator_orig, kind="ones"): - """ - For kind="ones" check that the estimators yield same results for - unit weights and no weights - For kind="zeros" check that setting sample_weight to 0 is equivalent - to removing corresponding samples. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X1, y1 = test_utils.generate_3d_test_data() - - estimator1 = clone(estimator_orig) - estimator2 = clone(estimator_orig) - set_random_state(estimator1, random_state=0) - set_random_state(estimator2, random_state=0) - - if kind == "ones": - X2 = X1 - y2 = y1 - sw2 = np.ones(shape=len(y1)) - err_msg = ( - f"For {name} sample_weight=None is not equivalent to sample_weight=ones" - ) - elif kind == "zeros": - # Construct a dataset that is very different to (X, y) if weights - # are disregarded, but identical to (X, y) given weights. - X2 = np.vstack([X1, X1 + 1]) - y2 = np.hstack([y1, 3 - y1]) - sw2 = np.ones(shape=len(y1) * 2) - sw2[len(y1) :] = 0 - X2, y2, sw2 = shuffle(X2, y2, sw2, random_state=0) - - err_msg = ( - f"For {name}, a zero sample_weight is not equivalent to removing the sample" - ) - else: # pragma: no cover - raise ValueError - - y1 = _enforce_estimator_tags_y(estimator1, y1) - y2 = _enforce_estimator_tags_y(estimator2, y2) - - estimator1.fit(X1, y=y1, sample_weight=None) - estimator2.fit(X2, y=y2, sample_weight=sw2) - - for method in ["predict", "predict_proba", "decision_function", "transform"]: - if hasattr(estimator_orig, method): - X_pred1 = getattr(estimator1, method)(X1) - X_pred2 = getattr(estimator2, method)(X1) - assert_allclose_dense_sparse(X_pred1, X_pred2, err_msg=err_msg) - - -def check_sample_weights_not_overwritten(name, estimator_orig): - """ - check that estimators don't override the passed sample_weight parameter - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - estimator = clone(estimator_orig) - set_random_state(estimator, random_state=0) - - y = _enforce_estimator_tags_y(estimator, y) - - sample_weight_original = np.ones(y.shape[0]) - sample_weight_original[0] = 10.0 - - sample_weight_fit = sample_weight_original.copy() - - estimator.fit(X, y, sample_weight=sample_weight_fit) - - err_msg = f"{name} overwrote the original `sample_weight` given during fit" - assert_allclose(sample_weight_fit, sample_weight_original, err_msg=err_msg) - - -@ignore_warnings(category=(FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): - """ - check that estimators treat dtype object as numeric if possible + """Check that estimators treat dtype object as numeric if possible. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - - X = _enforce_estimator_tags_X(estimator_orig, X) X = X.astype(object) - tags = _safe_tags(estimator_orig) + estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) estimator.fit(X, y) - if hasattr(estimator, "predict"): - estimator.predict(X) - if hasattr(estimator, "transform"): - estimator.transform(X) + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + ]: + if hasattr(estimator, method): + getattr(estimator, method)(X) with raises(Exception, match="Unknown label type", may_pass=True): estimator.fit(X, y.astype(object)) - if "string" not in tags["X_types"]: - X[0, 0] = {"foo": "bar"} - msg = "argument must be a string.* number" - with raises(TypeError, match=msg): - estimator.fit(X, y) - else: - # Estimators supporting string will not call np.asarray to convert the - # data to numeric and therefore, the error will not be raised. - # Checking for each element dtype in the input array will be costly. - # Refer to #11401 for full discussion. + X[0, 0] = {"foo": "bar"} + msg = "argument must be a string.* number" + with raises(TypeError, match=msg): estimator.fit(X, y) +@ignore_warnings(category=FutureWarning) def check_complex_data(name, estimator_orig): - """ + """Check that estimators raise an exception on complex data. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - rng = np.random.RandomState() + X, y = test_utils.generate_3d_test_data() + # check that estimators raise an exception on providing complex data - X = rng.uniform(size=10) + 1j * rng.uniform(size=10) - X = X.reshape(-1, 1) + X = X + 1j + y = y + 1j - # Something both valid for classification and regression - y = rng.randint(low=0, high=2, size=10) + 1j estimator = clone(estimator_orig) - set_random_state(estimator, random_state=0) + with raises(ValueError, match="Complex data not supported"): estimator.fit(X, y) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_dict_unchanged(name, estimator_orig): - """ + """Check that estimator dict is not modified by predict/transform methods. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) - estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 - - if hasattr(estimator, "n_best"): - estimator.n_best = 1 - - set_random_state(estimator, 1) estimator.fit(X, y) - for method in ["predict", "transform", "decision_function", "predict_proba"]: + + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + ]: if hasattr(estimator, method): dict_before = estimator.__dict__.copy() getattr(estimator, method)(X) - assert estimator.__dict__ == dict_before, ( - "Estimator changes __dict__ during %s" % method - ) + assert ( + estimator.__dict__ == dict_before + ), f"Estimator changes __dict__ during {method}" @ignore_warnings(category=FutureWarning) def check_dont_overwrite_parameters(name, estimator_orig): - """ - check that fit method only changes or sets private attributes + """Check that fit method only changes or sets private attributes. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() estimator = clone(estimator_orig) - X = _enforce_estimator_tags_X(estimator_orig, X) - y = _enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 - - set_random_state(estimator, 1) dict_before_fit = estimator.__dict__.copy() estimator.fit(X, y) - dict_after_fit = estimator.__dict__ public_keys_after_fit = [ @@ -375,11 +166,9 @@ def check_dont_overwrite_parameters(name, estimator_orig): # check that fit doesn't add any public attribute assert not attrs_added_by_fit, ( - "Estimator adds public attribute(s) during" - " the fit method." - " Estimators are only allowed to add private attributes" - " either started with _ or ended" - " with _ but %s added" % ", ".join(attrs_added_by_fit) + "Estimator adds public attribute(s) during the fit method. Estimators are only " + "allowed to add private attributes either started with _ or ended with _ but " + f"{', '.join(attrs_added_by_fit)} were added." ) # check that fit doesn't change any public attribute @@ -390,40 +179,34 @@ def check_dont_overwrite_parameters(name, estimator_orig): ] assert not attrs_changed_by_fit, ( - "Estimator changes public attribute(s) during" - " the fit method. Estimators are only allowed" - " to change attributes started" - " or ended with _, but" - " %s changed" % ", ".join(attrs_changed_by_fit) + "Estimator changes public attribute(s) during the fit method. Estimators are " + "only allowed to change attributes started or ended with _, but " + f"{', '.join(attrs_changed_by_fit)} were changed." ) @ignore_warnings(category=FutureWarning) def check_fit3d_predict1d(name, estimator_orig): - """ - check by fitting a 3d array and predicting with a 1d array + """Check by fitting a 3d array and predicting with a 1d array. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with a similar name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 - - set_random_state(estimator, 1) estimator.fit(X, y) - for method in ["predict", "transform", "decision_function", "predict_proba"]: + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + ]: if hasattr(estimator, method): - assert_raise_message( - ValueError, "Reshape your data", getattr(estimator, method), X[0][0] - ) + with raises(ValueError, match="Reshape your data"): + getattr(estimator, method)(X[0][0]) @ignore_warnings(category=FutureWarning) @@ -431,26 +214,22 @@ def check_methods_subset_invariance(name, estimator_orig): """Check smaller batches of data for predict methods does not impact results. Check that method gives invariant results if applied on mini batches or the whole - set + set. - Modified version of the scikit-learn 1.2.1 function with the name for time series - data. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - set_random_state(estimator, 1) estimator.fit(X, y) for method in [ "predict", - "transform", - "decision_function", - "score_samples", "predict_proba", + "decision_function", + "transform", ]: msg = f"{method} of {name} is not invariant when applied to a subset." @@ -465,29 +244,23 @@ def check_methods_subset_invariance(name, estimator_orig): result_by_batch = list(map(lambda x: x[0], result_by_batch)) assert_allclose( - np.ravel(result_full), np.ravel(result_by_batch), atol=1e-7, err_msg=msg + np.ravel(result_full), np.ravel(result_by_batch), err_msg=msg ) @ignore_warnings(category=FutureWarning) def check_methods_sample_order_invariance(name, estimator_orig): - """ - check that method gives invariant results if applied - on a subset with different sample order + """Test sample order invariance. + + Check that method gives invariant results if applied on a subset with different + sample order. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 2 - set_random_state(estimator, 1) estimator.fit(X, y) @@ -495,26 +268,24 @@ def check_methods_sample_order_invariance(name, estimator_orig): for method in [ "predict", - "transform", - "decision_function", - "score_samples", "predict_proba", + "decision_function", + "transform", ]: msg = ( - "{method} of {name} is not invariant when applied to a dataset" + f"{method} of {name} is not invariant when applied to a dataset " "with different sample order." - ).format(method=method, name=name) + ) if hasattr(estimator, method): - assert_allclose_dense_sparse( + assert_allclose( getattr(estimator, method)(X)[idx], getattr(estimator, method)(X[idx]), - atol=1e-9, err_msg=msg, ) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_fit3d_1sample(name, estimator_orig): """Check for fitting an estimator with only 1 sample. @@ -522,16 +293,12 @@ def check_fit3d_1sample(name, estimator_orig): returns an informative message. The error message should either mention the number of samples or the number of classes. - Modified version of the scikit-learn 1.2.1 function with the name for time series - data. + Modified version of the scikit-learn 1.2.1 function with a similar name for time + series data. """ X, y = test_utils.generate_3d_test_data(n_samples=1) - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - - set_random_state(estimator, 1) msg = [ "1 sample", @@ -545,58 +312,51 @@ def check_fit3d_1sample(name, estimator_orig): estimator.fit(X, y) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_fit3d_1feature(name, estimator_orig): """Check for fitting an estimator with only 1 series length. Check fitting a 3d array with only 1 feature either works or returns informative message - Modified version of the scikit-learn 1.2.1 function with the name for time series - data. + Modified version of the scikit-learn 1.2.1 function with a similar name for time + series data. """ X, y = test_utils.generate_3d_test_data(series_length=1) - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - - y = _enforce_estimator_tags_y(estimator, y) - set_random_state(estimator, 1) msg = ["1 series length", "series length 1", "series length = 1", "series length=1"] with raises(ValueError, match=msg, may_pass=True): estimator.fit(X, y) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_fit1d(name, estimator_orig): """Check fitting 1d X array raises a ValueError. - Modified version of the scikit-learn 1.2.1 function with the name for time series - data. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ rnd = np.random.RandomState() - X = 3 * rnd.uniform(size=(20)) + X = 3 * rnd.uniform(size=10) y = X.astype(int) + estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - set_random_state(estimator, 1) with raises(ValueError): estimator.fit(X, y) @ignore_warnings(category=FutureWarning) def check_transformer_general(name, transformer, readonly_memmap=False): - """ + """Check transformer adheres to sklearn-like conventions. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(transformer, X) - if readonly_memmap: X, y = create_memmap_backed_data([X, y]) @@ -605,48 +365,27 @@ def check_transformer_general(name, transformer, readonly_memmap=False): @ignore_warnings(category=FutureWarning) def check_transformer_data_not_an_array(name, transformer): - """ + """Check transformer works with non-array input. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(transformer, X) this_X = _NotAnArray(X) this_y = _NotAnArray(np.asarray(y)) + _check_transformer(name, transformer, this_X, this_y) # try the same with some list _check_transformer(name, transformer, X.tolist(), y.tolist()) -@ignore_warnings(category=FutureWarning) -def check_transformers_unfitted(name, transformer): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, _ = test_utils.generate_3d_test_data() - - transformer = clone(transformer) - with raises( - (AttributeError, ValueError), - err_msg=( - "The unfitted " - f"transformer {name} does not raise an error when " - "transform is called. Perhaps use " - "check_is_fitted in transform." - ), - ): - transformer.transform(X) - - def _check_transformer(name, transformer_orig, X, y): n_samples, n_dimensions, n_features = np.asarray(X).shape transformer = clone(transformer_orig) set_random_state(transformer) # fit - transformer.fit(X, y) # fit_transform method should work on non fitted estimator transformer_clone = clone(transformer) @@ -659,115 +398,109 @@ def _check_transformer(name, transformer_orig, X, y): # check for consistent n_samples assert X_pred.shape[0] == n_samples - if hasattr(transformer, "transform"): - X_pred2 = transformer.transform(X) - X_pred3 = transformer.fit_transform(X, y=y) - - if _safe_tags(transformer_orig, key="non_deterministic"): - msg = name + " is non deterministic" - raise SkipTest(msg) - if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): - for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): - assert_allclose_dense_sparse( - x_pred, - x_pred2, - atol=1e-2, - err_msg="fit_transform and transform outcomes not consistent in %s" - % transformer, - ) - assert_allclose_dense_sparse( - x_pred, - x_pred3, - atol=1e-2, - err_msg="consecutive fit_transform outcomes not consistent in %s" - % transformer, - ) - else: - assert_allclose_dense_sparse( - X_pred, - X_pred2, + assert hasattr(transformer, "transform") + + X_pred2 = transformer.transform(X) + X_pred3 = transformer.fit_transform(X, y=y) + + if _safe_tags(transformer_orig, key="non_deterministic"): + msg = name + " is non deterministic" + raise SkipTest(msg) + + if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): + for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): + assert_allclose( + x_pred, + x_pred2, + atol=1e-2, err_msg="fit_transform and transform outcomes not consistent in %s" % transformer, - atol=1e-2, ) - assert_allclose_dense_sparse( - X_pred, - X_pred3, + assert_allclose( + x_pred, + x_pred3, atol=1e-2, err_msg="consecutive fit_transform outcomes not consistent in %s" % transformer, ) - assert _num_samples(X_pred2) == n_samples - assert _num_samples(X_pred3) == n_samples - - # raises error on malformed input for transform - if ( - hasattr(X, "shape") - and not _safe_tags(transformer, key="stateless") - and X.ndim == 2 - and X.shape[1] > 1 + else: + assert_allclose( + X_pred, + X_pred2, + err_msg="fit_transform and transform outcomes not consistent in %s" + % transformer, + atol=1e-2, + ) + assert_allclose( + X_pred, + X_pred3, + atol=1e-2, + err_msg="consecutive fit_transform outcomes not consistent in %s" + % transformer, + ) + assert _num_samples(X_pred2) == n_samples + assert _num_samples(X_pred3) == n_samples + + # raises error on malformed input for transform + if ( + hasattr(X, "shape") + and _safe_tags(transformer, key="requires_fit") + and X.ndim == 3 + and X.shape[2] > 1 + ): + # If it's not an array, it does not have a 'T' property + with raises( + ValueError, + err_msg=( + f"The transformer {name} does not raise an error " + "when the number of features in transform is different from " + "the number of features in fit." + ), ): - # If it's not an array, it does not have a 'T' property - with raises( - ValueError, - err_msg=( - f"The transformer {name} does not raise an error " - "when the number of features in transform is different from " - "the number of features in fit." - ), - ): - transformer.transform(X[:, :-1]) + transformer.transform(X[:, :, -1]) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_pipeline_consistency(name, estimator_orig): - """ + """Check estimators and pipelines created from an estimator give same results. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - if _safe_tags(estimator_orig, key="non_deterministic"): - msg = name + " is non deterministic" - raise SkipTest(msg) - - # check that make_pipeline(est) gives same score as est - X = _enforce_estimator_tags_X(estimator_orig, X, kernel=rbf_kernel) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) pipeline = make_pipeline(estimator) + estimator.fit(X, y) pipeline.fit(X, y) + # check that make_pipeline(est) gives same score as est funcs = ["score", "fit_transform"] - for func_name in funcs: func = getattr(estimator, func_name, None) if func is not None: func_pipeline = getattr(pipeline, func_name) result = func(X, y) result_pipe = func_pipeline(X, y) - assert_allclose_dense_sparse(result, result_pipe) + assert_allclose(result, result_pipe) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_fit_score_takes_y(name, estimator_orig): - """ - check that all estimators accept an optional y in fit and score so they can be used - in pipelines + """Check that all estimators accept an optional y in fit and score. + + This is so they can be used in pipelines. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - set_random_state(estimator) - funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"] - for func_name in funcs: + for func_name in ["fit", "score", "fit_predict", "fit_transform"]: func = getattr(estimator, func_name, None) if func is not None: func(X, y) @@ -783,44 +516,46 @@ def check_fit_score_takes_y(name, estimator_orig): ) -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_estimators_dtypes(name, estimator_orig): - """ + """Check that estimators work with different dtypes. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() X_train_32 = X.astype(np.float32) - X_train_32 = _enforce_estimator_tags_X(estimator_orig, X_train_32) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) - y = _enforce_estimator_tags_y(estimator_orig, y) - - methods = ["predict", "transform", "decision_function", "predict_proba"] for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]: estimator = clone(estimator_orig) - set_random_state(estimator, 1) + estimator.fit(X_train, y) - for method in methods: + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + ]: if hasattr(estimator, method): getattr(estimator, method)(X_train) +@ignore_warnings(category=FutureWarning) def check_transformer_preserve_dtypes(name, transformer_orig): - """ - check that dtype are preserved meaning if input X is of some dtype + """Check that dtype are preserved meaning if input X is of some dtype. + X_transformed should be from the same dtype. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(transformer_orig, X) - for dtype in _safe_tags(transformer_orig, key="preserves_dtype"): X_cast = X.astype(dtype) transformer = clone(transformer_orig) @@ -845,281 +580,110 @@ def check_transformer_preserve_dtypes(name, transformer_orig): def check_estimators_empty_data_messages(name, estimator_orig): """Check the error message for estimators trained on empty data. - Modified version of the scikit-learn 1.2.1 function with the name for time series - data. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - e = clone(estimator_orig) + estimator = clone(estimator_orig) X_zero_samples = np.empty(0).reshape((0, 1, 8)) - msg = ["0 sample\(s\)", "n_samples=0", "n_samples = 0"] + msg = ["0 sample\(s\)", "n_samples=0", "n_samples = 0"] # noqa: W605 with raises(ValueError, match=msg): - e.fit(X_zero_samples, []) + estimator.fit(X_zero_samples, []) X_zero_features = np.empty(0).reshape((12, 1, 0)) # the following y should be accepted by both classifiers and regressors # and ignored by unsupervised models - y = _enforce_estimator_tags_y(e, np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])) + y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) msg = ["0 series length", "series length 0", "series length=0", "series length = 0"] with raises(ValueError, match=msg): - e.fit(X_zero_features, y) + estimator.fit(X_zero_features, y) @ignore_warnings(category=FutureWarning) def check_estimators_nan_inf(name, estimator_orig): - """ - Checks that Estimator X's do not contain NaN or inf. + """Check that Estimator X's do not contain NaN or inf. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() + X_nan = X.copy() + X_nan[0, 0, 0] = np.nan + X_inf = X.copy() + X_inf[0, 0, 0] = np.inf - rnd = np.random.RandomState(0) - X_train_finite = _enforce_estimator_tags_X(estimator_orig, X) - X_train_nan = rnd.uniform(size=(10, 1, 3)) - X_train_nan[0, 0, 0] = np.nan - X_train_inf = rnd.uniform(size=(10, 1, 3)) - X_train_inf[0, 0, 0] = np.inf - y = _enforce_estimator_tags_y(estimator_orig, y) error_string_fit = f"Estimator {name} doesn't check for NaN and inf in fit." - error_string_predict = f"Estimator {name} doesn't check for NaN and inf in predict." - error_string_transform = ( - f"Estimator {name} doesn't check for NaN and inf in transform." - ) - for X_train in [X_train_nan, X_train_inf]: - # catch deprecation warnings - with ignore_warnings(category=FutureWarning): - estimator = clone(estimator_orig) - set_random_state(estimator, 1) - # try to fit - with raises(ValueError, match=["inf", "NaN"], err_msg=error_string_fit): - estimator.fit(X_train, y) - # actually fit - estimator.fit(X_train_finite, y) - - # predict - if hasattr(estimator, "predict"): - with raises( - ValueError, - match=["inf", "NaN"], - err_msg=error_string_predict, - ): - estimator.predict(X_train) + for X_train in [X_nan, X_inf]: + estimator = clone(estimator_orig) + + # try to fit + with raises(ValueError, match=["inf", "NaN"], err_msg=error_string_fit): + estimator.fit(X_train, y) + # actually fit + estimator.fit(X, y) - # transform - if hasattr(estimator, "transform"): + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + ]: + if hasattr(estimator, method): with raises( ValueError, match=["inf", "NaN"], - err_msg=error_string_transform, + err_msg=( + f"Estimator {name} doesn't check for NaN and inf in {method}." + ), ): - estimator.transform(X_train) + getattr(estimator, method)(X_train) -@ignore_warnings -def check_nonsquare_error(name, estimator_orig): - """ - Test that error is thrown when non-square data provided. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - estimator = clone(estimator_orig) - - with raises( - ValueError, - err_msg=( - f"The pairwise estimator {name} does not raise an error on non-square data" - ), - ): - estimator.fit(X, y) - - -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_estimators_pickle(name, estimator_orig): - """ - Test that we can pickle all estimators + """Test that we can pickle all estimators. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - check_methods = ["predict", "transform", "decision_function", "predict_proba"] - - X = _enforce_estimator_tags_X(estimator_orig, X, kernel=rbf_kernel) - - tags = _safe_tags(estimator_orig) # include NaN values when the estimator should deal with them - if tags["allow_nan"]: + if _safe_tags(estimator_orig, key="allow_nan"): # set randomly 10 elements to np.nan - rng = np.random.RandomState(42) + rng = np.random.RandomState() mask = rng.choice(X.size, 10, replace=False) X.reshape(-1)[mask] = np.nan estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - - set_random_state(estimator) estimator.fit(X, y) # pickle and unpickle! pickled_estimator = pickle.dumps(estimator) - module_name = estimator.__module__ - if module_name.startswith("sklearn.") and not ( - "test_" in module_name or module_name.endswith("_testing") - ): - # strict check for sklearn estimators that are not implemented in test - # modules. - assert b"version" in pickled_estimator unpickled_estimator = pickle.loads(pickled_estimator) result = dict() - for method in check_methods: + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + ]: if hasattr(estimator, method): result[method] = getattr(estimator, method)(X) for method in result: unpickled_result = getattr(unpickled_estimator, method)(X) - assert_allclose_dense_sparse(result[method], unpickled_result) - - -@ignore_warnings(category=FutureWarning) -def check_estimators_partial_fit_n_features(name, estimator_orig): - """ - check if number of features changes between calls to partial_fit. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - if not hasattr(estimator_orig, "partial_fit"): - return - estimator = clone(estimator_orig) - X = _enforce_estimator_tags_X(estimator_orig, X) - y = _enforce_estimator_tags_y(estimator_orig, y) - - try: - if is_classifier(estimator): - classes = np.unique(y) - estimator.partial_fit(X, y, classes=classes) - else: - estimator.partial_fit(X, y) - except NotImplementedError: - return - - with raises( - ValueError, - err_msg=( - f"The estimator {name} does not raise an error when the " - "number of features changes between calls to partial_fit." - ), - ): - estimator.partial_fit(X[:, :-1], y) - - -@ignore_warnings(category=FutureWarning) -def check_classifier_multioutput(name, estimator): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - n_samples, n_labels, n_classes = 42, 5, 3 - tags = _safe_tags(estimator) - estimator = clone(estimator) - X, y = make_multilabel_classification( - random_state=42, n_samples=n_samples, n_labels=n_labels, n_classes=n_classes - ) - X = X.reshape(n_samples, 1, -1) - estimator.fit(X, y) - y_pred = estimator.predict(X) - - assert y_pred.shape == (n_samples, n_classes), ( - "The shape of the prediction for multioutput data is " - "incorrect. Expected {}, got {}.".format((n_samples, n_labels), y_pred.shape) - ) - assert y_pred.dtype.kind == "i" - - if hasattr(estimator, "decision_function"): - decision = estimator.decision_function(X) - assert isinstance(decision, np.ndarray) - assert decision.shape == (n_samples, n_classes), ( - "The shape of the decision function output for " - "multioutput data is incorrect. Expected {}, got {}.".format( - (n_samples, n_classes), decision.shape - ) - ) - - dec_pred = (decision > 0).astype(int) - dec_exp = estimator.classes_[dec_pred] - assert_array_equal(dec_exp, y_pred) - - if hasattr(estimator, "predict_proba"): - y_prob = estimator.predict_proba(X) - - if isinstance(y_prob, list) and not tags["poor_score"]: - for i in range(n_classes): - assert y_prob[i].shape == (n_samples, 2), ( - "The shape of the probability for multioutput data is" - " incorrect. Expected {}, got {}.".format( - (n_samples, 2), y_prob[i].shape - ) - ) - assert_array_equal( - np.argmax(y_prob[i], axis=1).astype(int), y_pred[:, i] - ) - elif not tags["poor_score"]: - assert y_prob.shape == (n_samples, n_classes), ( - "The shape of the probability for multioutput data is" - " incorrect. Expected {}, got {}.".format( - (n_samples, n_classes), y_prob.shape - ) - ) - assert_array_equal(y_prob.round().astype(int), y_pred) - - if hasattr(estimator, "decision_function") and hasattr(estimator, "predict_proba"): - for i in range(n_classes): - y_proba = estimator.predict_proba(X)[:, i] - y_decision = estimator.decision_function(X) - assert_array_equal(rankdata(y_proba), rankdata(y_decision[:, i])) - - -@ignore_warnings(category=FutureWarning) -def check_regressor_multioutput(name, estimator): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - estimator = clone(estimator) - n_samples = n_features = 10 - - if not _is_pairwise_metric(estimator): - n_samples = n_samples + 1 - - X, y = make_regression( - random_state=42, n_targets=5, n_samples=n_samples, n_features=n_features - ) - X = X.reshape(n_samples, 1, -1) - X = _enforce_estimator_tags_X(estimator, X) - - estimator.fit(X, y) - y_pred = estimator.predict(X) - - assert y_pred.dtype == np.dtype("float64"), ( - "Multioutput predictions by a regressor are expected to be" - " floating-point precision. Got {} instead".format(y_pred.dtype) - ) - assert y_pred.shape == y.shape, ( - "The shape of the prediction for multioutput data is incorrect." - " Expected {}, got {}." - ) + assert_allclose(result[method], unpickled_result) @ignore_warnings(category=FutureWarning) def check_clustering(name, clusterer_orig, readonly_memmap=False): - """ + """Check clusterer adheres to sklearn-like conventions. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() @@ -1171,17 +735,16 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): assert labels_sorted[0] in [0, -1] # Labels should be less than n_clusters - 1 if hasattr(clusterer, "n_clusters"): - n_clusters = getattr(clusterer, "n_clusters") - assert n_clusters - 1 >= labels_sorted[-1] + assert clusterer.n_clusters - 1 >= labels_sorted[-1] # else labels should be less than max(labels_) which is necessarily true @ignore_warnings(category=FutureWarning) def check_clusterer_compute_labels_predict(name, clusterer_orig): - """ - Check that predict is invariant of compute_labels. + """Check that predict is invariant of compute_labels. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() @@ -1189,7 +752,6 @@ def check_clusterer_compute_labels_predict(name, clusterer_orig): set_random_state(clusterer) if hasattr(clusterer, "compute_labels"): - # MiniBatchKMeans X_pred1 = clusterer.fit(X).predict(X) clusterer.set_params(compute_labels=False) X_pred2 = clusterer.fit(X).predict(X) @@ -1198,80 +760,46 @@ def check_clusterer_compute_labels_predict(name, clusterer_orig): @ignore_warnings(category=FutureWarning) def check_classifiers_one_label(name, classifier_orig): - """ + """Check classifier outputs suitable error or correct output for single class input. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - error_string_fit = "Classifier can't train when only one class is present." - error_string_predict = "Classifier can't predict when only one class is present." - rnd = np.random.RandomState(0) - X_train = rnd.uniform(size=(10, 1, 8)) - X_test = rnd.uniform(size=(10, 1, 8)) + X_train, _ = test_utils.generate_3d_test_data() + X_test, _ = test_utils.generate_3d_test_data() y = np.ones(10) - # catch deprecation warnings - with ignore_warnings(category=FutureWarning): - classifier = clone(classifier_orig) - with raises( - ValueError, match="class", may_pass=True, err_msg=error_string_fit - ) as cm: - classifier.fit(X_train, y) - - if cm.raised_and_matched: - # ValueError was raised with proper error message - return - - assert_array_equal(classifier.predict(X_test), y, err_msg=error_string_predict) + classifier = clone(classifier_orig) -@ignore_warnings(category=FutureWarning) -def check_classifiers_one_label_sample_weights(name, classifier_orig): - """ - Check that classifiers accepting sample_weight fit or throws a ValueError with - an explicit message if the problem is reduced to one class. + with raises( + ValueError, + match="single class", + may_pass=True, + err_msg="Classifier can't train when only one class is present.", + ) as cm: + classifier.fit(X_train, y) + + if cm.raised_and_matched: + # ValueError was raised with proper error message + return - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - error_fit = ( - f"{name} failed when fitted on one label after sample_weight trimming. Error " - "message is not explicit, it should have 'class'." + assert_array_equal( + classifier.predict(X_test), + y, + "Classifier can't predict when only one class is present.", ) - error_predict = f"{name} prediction results should only output the remaining class." - rnd = np.random.RandomState(0) - # X should be square for test on SVC with precomputed kernel - X_train = rnd.uniform(size=(10, 1, 8)) - X_test = rnd.uniform(size=(10, 1, 8)) - y = np.arange(10) % 2 - sample_weight = y.copy() # select a single class - classifier = clone(classifier_orig) - if has_fit_parameter(classifier, "sample_weight"): - match = [r"\bclass(es)?\b", error_predict] - err_type, err_msg = (AssertionError, ValueError), error_fit - else: - match = r"\bsample_weight\b" - err_type, err_msg = (TypeError, ValueError), None - - with raises(err_type, match=match, may_pass=True, err_msg=err_msg) as cm: - classifier.fit(X_train, y, sample_weight=sample_weight) - if cm.raised_and_matched: - # raise the proper error type with the proper error message - return - # for estimators that do not fail, they should be able to predict the only - # class remaining during fit - assert_array_equal( - classifier.predict(X_test), np.ones(10), err_msg=error_predict - ) - -@ignore_warnings # Warnings are raised by decision function +@ignore_warnings(category=FutureWarning) def check_classifiers_train( name, classifier_orig, readonly_memmap=False, X_dtype="float64" ): - """ + """Check classifier adheres to sklearn-like conventions. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - X_m, y_m = test_utils.generate_3d_test_data(n_samples=15, n_labels=3) + X_m, y_m = test_utils.generate_3d_test_data(n_labels=3) X_m = X_m.astype(X_dtype) # generate binary problem from multi-class one @@ -1283,18 +811,14 @@ def check_classifiers_train( if readonly_memmap: X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) - problems = [(X_b, y_b)] + problems = [(X_b, y_b), (X_m, y_m)] tags = _safe_tags(classifier_orig) - if not tags["binary_only"]: - problems.append((X_m, y_m)) for X, y in problems: classes = np.unique(y) n_classes = len(classes) n_samples, _, n_features = X.shape classifier = clone(classifier_orig) - X = _enforce_estimator_tags_X(classifier, X) - y = _enforce_estimator_tags_y(classifier, y) # raises error on malformed input for fit if not tags["no_validation"]: @@ -1319,57 +843,34 @@ def check_classifiers_train( assert y_pred.shape == (n_samples,) # raises error on malformed input for predict - msg_pairwise = ( - "The classifier {} does not raise an error when shape of X in " - " {} is not equal to (n_test_samples, n_training_samples)" - ) msg = ( "The classifier {} does not raise an error when the number of " "features in {} is different from the number of features in " "fit." ) - # if not tags["no_validation"]: - # if tags["pairwise"]: - # with raises( - # ValueError, - # err_msg=msg_pairwise.format(name, "predict"), - # ): - # classifier.predict(X.reshape(-1, 1)) - # else: - # with raises(ValueError, err_msg=msg.format(name, "predict")): - # classifier.predict(X.T) + if not tags["no_validation"]: + with raises(ValueError, err_msg=msg.format(name, "predict")): + classifier.predict(X.T) + if hasattr(classifier, "decision_function"): - try: - # decision_function agrees with predict - decision = classifier.decision_function(X) - if n_classes == 2: - if not tags["multioutput_only"]: - assert decision.shape == (n_samples,) - else: - assert decision.shape == (n_samples, 1) - dec_pred = (decision.ravel() > 0).astype(int) - assert_array_equal(dec_pred, y_pred) - else: - assert decision.shape == (n_samples, n_classes) - assert_array_equal(np.argmax(decision, axis=1), y_pred) - - # raises error on malformed input for decision_function - if not tags["no_validation"]: - if tags["pairwise"]: - with raises( - ValueError, - err_msg=msg_pairwise.format(name, "decision_function"), - ): - classifier.decision_function(X.reshape(-1, 1)) - else: - with raises( - ValueError, - err_msg=msg.format(name, "decision_function"), - ): - classifier.decision_function(X.T) - except NotImplementedError: - pass + # decision_function agrees with predict + decision = classifier.decision_function(X) + if n_classes == 2: + assert decision.shape == (n_samples,) + dec_pred = (decision.ravel() > 0).astype(int) + assert_array_equal(dec_pred, y_pred) + else: + assert decision.shape == (n_samples, n_classes) + assert_array_equal(np.argmax(decision, axis=1), y_pred) + + # raises error on malformed input for decision_function + if not tags["no_validation"]: + with raises( + ValueError, + err_msg=msg.format(name, "decision_function"), + ): + classifier.decision_function(X.T) if hasattr(classifier, "predict_proba"): # predict_proba agrees with predict @@ -1378,339 +879,78 @@ def check_classifiers_train( assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) - # if not tags["no_validation"]: - # # raises error on malformed input for predict_proba - # if tags["pairwise"]: - # with raises( - # ValueError, - # err_msg=msg_pairwise.format(name, "predict_proba"), - # ): - # classifier.predict_proba(X.reshape(-1, 1)) - # else: - # with raises( - # ValueError, - # err_msg=msg.format(name, "predict_proba"), - # ): - # classifier.predict_proba(X.T) - if hasattr(classifier, "predict_log_proba"): - # predict_log_proba is a transformation of predict_proba - y_log_prob = classifier.predict_log_proba(X) - assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9) - assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob)) - - -@ignore_warnings(category=FutureWarning) -def check_classifiers_multilabel_representation_invariance(name, classifier_orig): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - n_samples, test_size, n_outputs = 20, 10, 4 - X, y = make_multilabel_classification( - n_samples=n_samples, - n_features=8, - n_classes=n_outputs, - n_labels=3, - length=15, - allow_unlabeled=True, - ) - X = X.reshape(X.shape[0], 1, -1) - X = scale(X) - - X_train, X_test = X[:-test_size], X[-test_size:] - (y_train,) = y[:-test_size] - - y_train_list_of_lists = y_train.tolist() - y_train_list_of_arrays = list(y_train) - - classifier = clone(classifier_orig) - set_random_state(classifier) - - y_pred = classifier.fit(X_train, y_train).predict(X_test) - - y_pred_list_of_lists = classifier.fit(X_train, y_train_list_of_lists).predict( - X_test - ) - - y_pred_list_of_arrays = classifier.fit(X_train, y_train_list_of_arrays).predict( - X_test - ) - - assert_array_equal(y_pred, y_pred_list_of_arrays) - assert_array_equal(y_pred, y_pred_list_of_lists) - - assert y_pred.dtype == y_pred_list_of_arrays.dtype - assert y_pred.dtype == y_pred_list_of_lists.dtype - assert type(y_pred) == type(y_pred_list_of_arrays) - assert type(y_pred) == type(y_pred_list_of_lists) - - -@ignore_warnings(category=FutureWarning) -def check_classifiers_multilabel_output_format_predict(name, classifier_orig): - """ - Check the output of the `predict` method for classifiers supporting - multilabel-indicator targets. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - classifier = clone(classifier_orig) - set_random_state(classifier) - - n_samples, test_size, n_outputs = 30, 10, 4 - X, y = make_multilabel_classification( - n_samples=n_samples, - n_features=2, - n_classes=n_outputs, - n_labels=3, - length=15, - allow_unlabeled=True, - ) - X = X.reshape(X.shape[0], 1, -1) - X = scale(X) - - X_train, X_test = X[:-test_size], X[-test_size:] - y_train, y_test = y[:-test_size], y[-test_size:] - classifier.fit(X_train, y_train) - - response_method_name = "predict" - predict_method = getattr(classifier, response_method_name, None) - if predict_method is None: - raise SkipTest(f"{name} does not have a {response_method_name} method.") - - y_pred = predict_method(X_test) - - # y_pred.shape -> y_test.shape with the same dtype - assert isinstance(y_pred, np.ndarray), ( - f"{name}.predict is expected to output a NumPy array. Got " - f"{type(y_pred)} instead." - ) - assert y_pred.shape == y_test.shape, ( - f"{name}.predict outputs a NumPy array of shape {y_pred.shape} " - f"instead of {y_test.shape}." - ) - assert y_pred.dtype == y_test.dtype, ( - f"{name}.predict does not output the same dtype than the targets. " - f"Got {y_pred.dtype} instead of {y_test.dtype}." - ) - - -@ignore_warnings(category=FutureWarning) -def check_classifiers_multilabel_output_format_predict_proba(name, classifier_orig): - """ - Check the output of the `predict_proba` method for classifiers supporting - multilabel-indicator targets. - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - classifier = clone(classifier_orig) - set_random_state(classifier) - - n_samples, test_size, n_outputs = 30, 10, 4 - X, y = make_multilabel_classification( - n_samples=n_samples, - n_features=2, - n_classes=n_outputs, - n_labels=3, - length=15, - allow_unlabeled=True, - ) - X = X.reshape(X.shape[0], 1, -1) - X = scale(X) - - X_train, X_test = X[:-test_size], X[-test_size:] - y_train = y[:-test_size] - classifier.fit(X_train, y_train) - - response_method_name = "predict_proba" - predict_proba_method = getattr(classifier, response_method_name, None) - if predict_proba_method is None: - raise SkipTest(f"{name} does not have a {response_method_name} method.") - - y_pred = predict_proba_method(X_test) - - # y_pred.shape -> 2 possibilities: - # - list of length n_outputs of shape (n_samples, 2); - # - ndarray of shape (n_samples, n_outputs). - # dtype should be floating - if isinstance(y_pred, list): - assert len(y_pred) == n_outputs, ( - f"When {name}.predict_proba returns a list, the list should " - "be of length n_outputs and contain NumPy arrays. Got length " - f"of {len(y_pred)} instead of {n_outputs}." - ) - for pred in y_pred: - assert pred.shape == (test_size, 2), ( - f"When {name}.predict_proba returns a list, this list " - "should contain NumPy arrays of shape (n_samples, 2). Got " - f"NumPy arrays of shape {pred.shape} instead of " - f"{(test_size, 2)}." - ) - assert pred.dtype.kind == "f", ( - f"When {name}.predict_proba returns a list, it should " - "contain NumPy arrays with floating dtype. Got " - f"{pred.dtype} instead." - ) - # check that we have the correct probabilities - err_msg = ( - f"When {name}.predict_proba returns a list, each NumPy " - "array should contain probabilities for each class and " - "thus each row should sum to 1 (or close to 1 due to " - "numerical errors)." - ) - assert_allclose(pred.sum(axis=1), 1, err_msg=err_msg) - elif isinstance(y_pred, np.ndarray): - assert y_pred.shape == (test_size, n_outputs), ( - f"When {name}.predict_proba returns a NumPy array, the " - f"expected shape is (n_samples, n_outputs). Got {y_pred.shape}" - f" instead of {(test_size, n_outputs)}." - ) - assert y_pred.dtype.kind == "f", ( - f"When {name}.predict_proba returns a NumPy array, the " - f"expected data type is floating. Got {y_pred.dtype} instead." - ) - err_msg = ( - f"When {name}.predict_proba returns a NumPy array, this array " - "is expected to provide probabilities of the positive class " - "and should therefore contain values between 0 and 1." - ) - assert_array_less(0, y_pred, err_msg=err_msg) - assert_array_less(y_pred, 1, err_msg=err_msg) - else: - raise ValueError( - f"Unknown returned type {type(y_pred)} by {name}." - "predict_proba. A list or a Numpy array is expected." - ) - - -@ignore_warnings(category=FutureWarning) -def check_classifiers_multilabel_output_format_decision_function(name, classifier_orig): - """ - Check the output of the `decision_function` method for classifiers supporting - multilabel-indicator targets. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - classifier = clone(classifier_orig) - set_random_state(classifier) - - n_samples, test_size, n_outputs = 30, 10, 4 - X, y = make_multilabel_classification( - n_samples=n_samples, - n_features=2, - n_classes=n_outputs, - n_labels=3, - length=15, - allow_unlabeled=True, - ) - X = X.reshape(X.shape[0], 1, -1) - X = scale(X) - - X_train, X_test = X[:-test_size], X[-test_size:] - y_train = y[:-test_size] - classifier.fit(X_train, y_train) - - response_method_name = "decision_function" - decision_function_method = getattr(classifier, response_method_name, None) - if decision_function_method is None: - raise SkipTest(f"{name} does not have a {response_method_name} method.") - - y_pred = decision_function_method(X_test) - - # y_pred.shape -> y_test.shape with floating dtype - assert isinstance(y_pred, np.ndarray), ( - f"{name}.decision_function is expected to output a NumPy array." - f" Got {type(y_pred)} instead." - ) - assert y_pred.shape == (test_size, n_outputs), ( - f"{name}.decision_function is expected to provide a NumPy array " - f"of shape (n_samples, n_outputs). Got {y_pred.shape} instead of " - f"{(test_size, n_outputs)}." - ) - assert y_pred.dtype.kind == "f", ( - f"{name}.decision_function is expected to output a floating dtype." - f" Got {y_pred.dtype} instead." - ) - - -@ignore_warnings(category=FutureWarning) -def check_get_feature_names_out_error(name, estimator_orig): - """ - Check the error raised by get_feature_names_out when called before fit. - - Unfitted estimators with get_feature_names_out should raise a NotFittedError. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - estimator = clone(estimator_orig) - err_msg = ( - f"Estimator {name} should have raised a NotFitted error when fit is called" - " before get_feature_names_out" - ) - with raises(NotFittedError, err_msg=err_msg): - estimator.get_feature_names_out() + if not tags["no_validation"]: + # raises error on malformed input for predict_proba + with raises( + ValueError, + err_msg=msg.format(name, "predict_proba"), + ): + classifier.predict_proba(X.T) @ignore_warnings(category=FutureWarning) def check_estimators_fit_returns_self(name, estimator_orig, readonly_memmap=False): - """ - Check if self is returned when calling fit. + """Check if self is returned when calling fit. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) if readonly_memmap: X, y = create_memmap_backed_data([X, y]) - set_random_state(estimator) assert estimator.fit(X, y) is estimator -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_estimators_unfitted(name, estimator_orig): - """ - Check that predict raises an exception in an unfitted estimator. + """Check that predict raises an exception in an unfitted estimator. Unfitted estimators should raise a NotFittedError. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() estimator = clone(estimator_orig) for method in ( - "decision_function", "predict", "predict_proba", - "predict_log_proba", + "decision_function", + "transform", ): if hasattr(estimator, method): - with raises(NotFittedError): + with raises( + NotFittedError, + err_msg=( + f"The unfitted estimator {name} does not raise an error when " + f"{method} is called. Perhaps use check_is_fitted." + ), + ): getattr(estimator, method)(X) @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig): - """ + """Check that when a 2D y is given, a DataConversionWarning is raised. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - tags = _safe_tags(estimator_orig) - X = _enforce_estimator_tags_X(estimator_orig, X) - y = _enforce_estimator_tags_y(estimator_orig, y) estimator = clone(estimator_orig) set_random_state(estimator) + # fit estimator.fit(X, y) y_pred = estimator.predict(X) - set_random_state(estimator) # Check that when a 2D y is given, a DataConversionWarning is # raised with warnings.catch_warnings(record=True) as w: @@ -1721,20 +961,22 @@ def check_supervised_y_2d(name, estimator_orig): msg = "expected 1 DataConversionWarning, got: %s" % ", ".join( [str(w_x) for w_x in w] ) - if not tags["multioutput"]: - # check that we warned if we don't support multi-output - assert len(w) > 0, msg - assert ( - "DataConversionWarning('A column-vector y" - " was passed when a 1d array was expected" in msg - ) + + assert len(w) > 0, msg + assert ( + "DataConversionWarning('A column-vector y" + " was passed when a 1d array was expected" in msg + ) + assert_allclose(y_pred.ravel(), y_pred_2d.ravel()) +@ignore_warnings(category=FutureWarning) def check_classifiers_classes(name, classifier_orig): - """ + """Check classifier can handle binary and multiclass data with different y types. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X_multiclass, y_multiclass = test_utils.generate_3d_test_data(n_labels=3) X_multiclass, y_multiclass = shuffle(X_multiclass, y_multiclass, random_state=7) @@ -1742,37 +984,30 @@ def check_classifiers_classes(name, classifier_orig): X_binary = X_multiclass[y_multiclass != 2] y_binary = y_multiclass[y_multiclass != 2] - X_multiclass = _enforce_estimator_tags_X(classifier_orig, X_multiclass) - X_binary = _enforce_estimator_tags_X(classifier_orig, X_binary) - labels_multiclass = ["one", "two", "three"] labels_binary = ["one", "two"] y_names_multiclass = np.take(labels_multiclass, y_multiclass) y_names_binary = np.take(labels_binary, y_binary) - problems = [(X_binary, y_binary, y_names_binary)] - if not _safe_tags(classifier_orig, key="binary_only"): - problems.append((X_multiclass, y_multiclass, y_names_multiclass)) + problems = [ + (X_binary, y_names_binary), + (X_multiclass, y_names_multiclass), + ] - for X, y, y_names in problems: + for X, y_names in problems: for y_names_i in [y_names, y_names.astype("O")]: - y_ = _choose_check_classifiers_labels(name, y, y_names_i) - check_classifiers_predictions(X, y_, name, classifier_orig) + _check_classifiers_predictions(X, y_names_i, name, classifier_orig) labels_binary = [-1, 1] y_names_binary = np.take(labels_binary, y_binary) - y_binary = _choose_check_classifiers_labels(name, y_binary, y_names_binary) - check_classifiers_predictions(X_binary, y_binary, name, classifier_orig) + _check_classifiers_predictions(X_binary, y_names_binary, name, classifier_orig) @ignore_warnings -def check_classifiers_predictions(X, y, name, classifier_orig): +def _check_classifiers_predictions(X, y, name, classifier_orig): classes = np.unique(y) classifier = clone(classifier_orig) - if name == "BernoulliNB": - X = X > X.mean() - set_random_state(classifier) classifier.fit(X, y) y_pred = classifier.predict(X) @@ -1827,14 +1062,13 @@ def check_classifiers_predictions(X, y, name, classifier_orig): @ignore_warnings(category=FutureWarning) def check_regressors_int(name, regressor_orig): - """ + """Check that regressor labels return the same result as float or int labels. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(regressor_orig, X) - y = _enforce_estimator_tags_y(regressor_orig, y) # separate estimators to control random seeds regressor_1 = clone(regressor_orig) regressor_2 = clone(regressor_orig) @@ -1853,118 +1087,70 @@ def check_regressors_int(name, regressor_orig): def check_regressors_train( name, regressor_orig, readonly_memmap=False, X_dtype=np.float64 ): - """ + """Check regressor adheres to sklearn-like conventions. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - X, y = test_utils.generate_3d_test_data() + X, y = test_utils.generate_3d_test_data(regression_target=True) X = X.astype(X_dtype) y = scale(y) # X is already scaled + regressor = clone(regressor_orig) - X = _enforce_estimator_tags_X(regressor, X) - y = _enforce_estimator_tags_y(regressor, y) if readonly_memmap: X, y = create_memmap_backed_data([X, y]) # raises error on malformed input for fit - with raises( - ValueError, - err_msg=( - f"The classifier {name} does not raise an error when " - "incorrect/malformed input data for fit is passed. The number of " - "training examples is not the same as the number of labels. Perhaps " - "use check_X_y in fit." - ), - ): - regressor.fit(X, y[:-1]) + if not _safe_tags(regressor_orig, key="no_validation"): + with raises( + ValueError, + err_msg=( + f"The regressor {name} does not raise an error when " + "incorrect/malformed input data for fit is passed. The number of " + "training examples is not the same as the number of labels. Perhaps " + "use check_X_y in fit." + ), + ): + regressor.fit(X, y[:-1]) # fit - set_random_state(regressor) regressor.fit(X, y) regressor.fit(X.tolist(), y.tolist()) y_pred = regressor.predict(X) assert y_pred.shape == y.shape -@ignore_warnings +@ignore_warnings(category=FutureWarning) def check_regressors_no_decision_function(name, regressor_orig): - """ - check that regressors don't have a decision_function, predict_proba, or - predict_log_proba method. + """Check that regressors don't have a decision_function or predict_proba. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - X, y = test_utils.generate_3d_test_data() + X, y = test_utils.generate_3d_test_data(regression_target=True) regressor = clone(regressor_orig) - X = _enforce_estimator_tags_X(regressor_orig, X) - y = _enforce_estimator_tags_y(regressor, y) - regressor.fit(X, y) - funcs = ["decision_function", "predict_proba", "predict_log_proba"] - for func_name in funcs: - assert not hasattr(regressor, func_name) - - -@ignore_warnings(category=FutureWarning) -def check_class_weight_classifiers(name, classifier_orig): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - if _safe_tags(classifier_orig, key="binary_only"): - problems = [2] - else: - problems = [2, 3] - - for n_classes in problems: - # create a very noisy dataset - X, y = test_utils.generate_3d_test_data(n_samples=15, n_labels=n_classes) - rng = np.random.RandomState(0) - X += 20 * rng.uniform(size=X.shape) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5, random_state=0 - ) - - # can't use gram_if_pairwise() here, setting up gram matrix manually - if _safe_tags(classifier_orig, key="pairwise"): - X_test = rbf_kernel(X_test, X_train) - X_train = rbf_kernel(X_train, X_train) - - n_classes = len(np.unique(y_train)) - - if n_classes == 2: - class_weight = {0: 1000, 1: 0.0001} - else: - y[-1] = 2 - class_weight = {0: 1000, 1: 0.0001, 2: 0.0001} - - classifier = clone(classifier_orig).set_params(class_weight=class_weight) - - set_random_state(classifier) - classifier.fit(X_train, y_train) - y_pred = classifier.predict(X_test) - - if not _safe_tags(classifier_orig, key="poor_score"): - assert np.mean(y_pred == 0) > 0.75 + for method in [ + "decision_function", + "predict_proba", + ]: + assert not hasattr(regressor, method) @ignore_warnings(category=FutureWarning) def check_estimators_overwrite_params(name, estimator_orig): - """ + """Check estimators do not overwrite parameters during fit. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X, kernel=rbf_kernel) estimator = clone(estimator_orig) - y = _enforce_estimator_tags_y(estimator, y) - - set_random_state(estimator) # Make a physical copy of the original estimator parameters before fitting. params = estimator.get_params() @@ -1985,144 +1171,88 @@ def check_estimators_overwrite_params(name, estimator_orig): # is possible RandomState instance but in this check we explicitly # fixed the random_state params recursively to be integer seeds. assert joblib.hash(new_value) == joblib.hash(original_value), ( - "Estimator %s should not change or mutate " - " the parameter %s from %s to %s during fit." - % (name, param_name, original_value, new_value) + f"Estimator {name} should not change or mutate the parameter {param_name} " + f"from {original_value} to {new_value} during fit." ) @ignore_warnings(category=FutureWarning) -def check_no_attributes_set_in_init(name, estimator_orig): - """ +def check_estimator_data_not_an_array(name, estimator_orig): + """Check non-array data input runs without error and produces similar output. - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - try: - # Clone fails if the estimator does not store - # all parameters as an attribute during init - estimator = clone(estimator_orig) - except AttributeError: - raise AttributeError( - f"Estimator {name} should store all parameters as an attribute during init." - ) - - if hasattr(type(estimator).__init__, "deprecated_original"): - return - - init_params = _get_args(type(estimator).__init__) - if IS_PYPY: - # __init__ signature has additional objects in PyPy - for key in ["obj"]: - if key in init_params: - init_params.remove(key) - parents_init_params = [ - param - for params_parent in (_get_args(parent) for parent in type(estimator).__mro__) - for param in params_parent - ] - - # Test for no setting apart from parameters during init - invalid_attr = set(vars(estimator)) - set(init_params) - set(parents_init_params) - assert not invalid_attr, ( - "Estimator %s should not set any attribute apart" - " from parameters during init. Found attributes %s." - % (name, sorted(invalid_attr)) - ) - - -@ignore_warnings(category=FutureWarning) -def check_classifier_data_not_an_array(name, estimator_orig): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with similar names for time + series data. """ X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) - y = _enforce_estimator_tags_y(estimator_orig, y) - for obj_type in ["NotAnArray"]: - check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type) + for obj_type in ["NotAnArray", "PandasDataframe"]: + _check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type) @ignore_warnings(category=FutureWarning) -def check_regressor_data_not_an_array(name, estimator_orig): - """ - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X, y = test_utils.generate_3d_test_data() - - X = _enforce_estimator_tags_X(estimator_orig, X) - y = _enforce_estimator_tags_y(estimator_orig, y) - for obj_type in ["NotAnArray"]: - check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type) - - -@ignore_warnings(category=FutureWarning) -def check_non_transformer_estimators_n_iter(name, estimator_orig): - """ - Test that estimators that are not transformers with a parameter - max_iter, return the attribute of n_iter_ at least 1. - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - estimator = clone(estimator_orig) - if hasattr(estimator, "max_iter"): - X, y = test_utils.generate_3d_test_data() - y = _enforce_estimator_tags_y(estimator, y) - - set_random_state(estimator, 0) - - X = _enforce_estimator_tags_X(estimator_orig, X) - - estimator.fit(X, y) - - assert np.all(estimator.n_iter_ >= 1) - +def _check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type): + estimator_1 = clone(estimator_orig) + estimator_2 = clone(estimator_orig) + set_random_state(estimator_1) + set_random_state(estimator_2) + + if obj_type not in ["NotAnArray", "PandasDataframe"]: + raise ValueError("Data type {0} not supported".format(obj_type)) + + if obj_type == "NotAnArray": + y_ = _NotAnArray(np.asarray(y)) + X_ = _NotAnArray(np.asarray(X)) + else: + # Here pandas objects (Series and DataFrame) are tested explicitly + # because some estimators may handle them (especially their indexing) + # specially. + try: + import pandas as pd -@ignore_warnings(category=FutureWarning) -def check_transformer_n_iter(name, estimator_orig): - """ - Test that transformers with a parameter max_iter, return the - attribute of n_iter_ at least 1. + y_ = np.asarray(y) + if y_.ndim == 1: + y_ = pd.Series(y_) + else: + y_ = pd.DataFrame(y_) + X_ = pd.DataFrame(np.asarray(X.reshape((X.shape[0], -1)))) - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - estimator = clone(estimator_orig) - if hasattr(estimator, "max_iter"): - X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator_orig, X) - set_random_state(estimator, 0) - estimator.fit(X, y) + except ImportError: + raise SkipTest( + "pandas is not installed: not checking estimators for pandas objects." + ) - assert estimator.n_iter_ >= 1 + # fit + estimator_1.fit(X_, y_) + pred1 = estimator_1.predict(X_) + estimator_2.fit(X, y) + pred2 = estimator_2.predict(X) + assert_allclose(pred1, pred2, atol=1e-2, err_msg=name) @ignore_warnings(category=FutureWarning) -def check_classifiers_regression_target(name, estimator_orig): - """ - Check if classifier throws an exception when fed regression targets +def check_classifiers_regression_target(name, classifier_orig): + """Check if classifier throws an exception when fed regression targets. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - X, y = _regression_dataset() - X.reshape((X.shape[0], 1, -1)) + X, y = test_utils.generate_3d_test_data(regression_target=True) - X = _enforce_estimator_tags_X(estimator_orig, X) - e = clone(estimator_orig) + classifier = clone(classifier_orig) msg = "Unknown label type: " - if not _safe_tags(e, key="no_validation"): - with raises(ValueError, match=msg): - e.fit(X, y) + with raises(ValueError, match=msg): + classifier.fit(X, y) @ignore_warnings(category=FutureWarning) def check_decision_proba_consistency(name, estimator_orig): - """ + """Check estimators with both decision_function and predict_proba methods. + Check whether an estimator having both decision_function and predict_proba methods has outputs with perfect rank correlation. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ X, y = test_utils.generate_3d_test_data() @@ -2157,22 +1287,10 @@ def check_decision_proba_consistency(name, estimator_orig): assert_array_equal(sorted_idx, np.arange(len(sorted_idx))) -def check_fit_non_negative(name, estimator_orig): - """ - Check that proper warning is raised for negative X - when tag requires_positive_X is present - - Modified version of the scikit-learn 1.2.1 function with the name for time series. - """ - X = np.array([[[-1.0, 1], [-1.0, 1]]]) - y = np.array([1, 2]) - estimator = clone(estimator_orig) - with raises(ValueError): - estimator.fit(X, y) - - +@ignore_warnings(category=FutureWarning) def check_fit_idempotent(name, estimator_orig): - """ + """Check that estimator can be fit multiple times with the same results. + Check that est.fit(X) is the same as est.fit(X).fit(X). Ideally we would check that the estimated parameters during training (e.g. coefs_) are the same, but having a universal comparison function for those @@ -2180,27 +1298,19 @@ def check_fit_idempotent(name, estimator_orig): predict(), predict_proba(), decision_function() and transform() return the same results. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - check_methods = ["predict", "transform", "decision_function", "predict_proba"] - rng = np.random.RandomState(0) + X_train, y_train = test_utils.generate_3d_test_data() + X_test, y_test = test_utils.generate_3d_test_data() estimator = clone(estimator_orig) - set_random_state(estimator) - if "warm_start" in estimator.get_params().keys(): - estimator.set_params(warm_start=False) - - X, y = test_utils.generate_3d_test_data(n_samples=15) - X = _enforce_estimator_tags_X(estimator, X) - y = _enforce_estimator_tags_y(estimator, y) - - train, test = next(ShuffleSplit(test_size=0.2, random_state=rng).split(X)) - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) + set_random_state(estimator, 1) # Fit for the first time estimator.fit(X_train, y_train) + check_methods = ["predict", "transform", "decision_function", "predict_proba"] result = { method: getattr(estimator, method)(X_test) for method in check_methods @@ -2208,42 +1318,34 @@ def check_fit_idempotent(name, estimator_orig): } # Fit again - set_random_state(estimator) estimator.fit(X_train, y_train) for method in check_methods: if hasattr(estimator, method): new_result = getattr(estimator, method)(X_test) - if np.issubdtype(new_result.dtype, np.floating): - tol = 2 * np.finfo(new_result.dtype).eps - else: - tol = 2 * np.finfo(np.float64).eps - assert_allclose_dense_sparse( + + assert_allclose( result[method], new_result, - atol=max(tol, 1e-9), - rtol=max(tol, 1e-7), err_msg="Idempotency check failed for method {}".format(method), ) +@ignore_warnings(category=FutureWarning) def check_fit_check_is_fitted(name, estimator_orig): - """ + """Check check_is_fitted works as expected. + Make sure that estimator doesn't pass check_is_fitted before calling fit and that passes check_is_fitted once it's fit. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - estimator = clone(estimator_orig) - set_random_state(estimator) - if "warm_start" in estimator.get_params(): - estimator.set_params(warm_start=False) + X, y = test_utils.generate_3d_test_data() - X, y = test_utils.generate_3d_test_data(n_samples=15) - X = _enforce_estimator_tags_X(estimator, X) - y = _enforce_estimator_tags_y(estimator, y) + estimator = clone(estimator_orig) - if not _safe_tags(estimator).get("stateless", False): + if _safe_tags(estimator, key="requires_fit"): # stateless estimators (such as FunctionTransformer) are always "fit"! try: check_is_fitted(estimator) @@ -2253,6 +1355,7 @@ def check_fit_check_is_fitted(name, estimator_orig): ) except NotFittedError: pass + estimator.fit(X, y) try: check_is_fitted(estimator) @@ -2262,21 +1365,19 @@ def check_fit_check_is_fitted(name, estimator_orig): ) from e +@ignore_warnings(category=FutureWarning) def check_n_features_in(name, estimator_orig): - """ + """Check n_features_in_ attribute. + Make sure that n_features_in_ attribute doesn't exist until fit is called, and that its value is correct. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - estimator = clone(estimator_orig) - set_random_state(estimator) - if "warm_start" in estimator.get_params(): - estimator.set_params(warm_start=False) - X, y = test_utils.generate_3d_test_data() - X = _enforce_estimator_tags_X(estimator, X) - y = _enforce_estimator_tags_y(estimator, y) + + estimator = clone(estimator_orig) assert not hasattr(estimator, "n_features_in_") estimator.fit(X, y) @@ -2284,38 +1385,36 @@ def check_n_features_in(name, estimator_orig): assert estimator.n_features_in_ == (X.shape[1], X.shape[2], X.shape[2]) +@ignore_warnings(category=FutureWarning) def check_requires_y_none(name, estimator_orig): - """ + """Check that estimators requiring y fail gracefully. + Make sure that an estimator with requires_y=True fails gracefully when - given y=None + given y=None. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ - estimator = clone(estimator_orig) - set_random_state(estimator) + X, _ = test_utils.generate_3d_test_data() - X, _ = test_utils.generate_3d_test_data(n_samples=15) - X = _enforce_estimator_tags_X(estimator, X) + estimator = clone(estimator_orig) - expected_err_msgs = ( + msg = [ "requires y to be passed, but the target y is None", "Expected array-like (array or non-string sequence), got None", "y should be a 1d array", - ) + ] - try: + with raises(ValueError, match=msg): estimator.fit(X, None) - except ValueError as ve: - if not any(msg in str(ve) for msg in expected_err_msgs): - raise ve +@ignore_warnings(category=FutureWarning) def check_estimator_get_tags_default_keys(name, estimator_orig): - """ - check that if _get_tags is implemented, it contains all keys from - _DEFAULT_KEYS + """Check that if _get_tags is implemented, it contains all keys from _DEFAULT_KEYS. - Modified version of the scikit-learn 1.2.1 function with the name for time series. + Modified version of the scikit-learn 1.2.1 function with the same name for time + series data. """ estimator = clone(estimator_orig) if not hasattr(estimator, "_get_tags"): @@ -2324,8 +1423,8 @@ def check_estimator_get_tags_default_keys(name, estimator_orig): tags_keys = set(estimator._get_tags().keys()) default_tags_keys = set(_DEFAULT_TAGS.keys()) assert tags_keys.intersection(default_tags_keys) == default_tags_keys, ( - f"{name}._get_tags() is missing entries for the following default tags" - f": {default_tags_keys - tags_keys.intersection(default_tags_keys)}" + f"{name}._get_tags() is missing entries for the following default tags: " + f"{default_tags_keys - tags_keys.intersection(default_tags_keys)}" ) diff --git a/tsml/tests/estimator_checks.py b/tsml/tests/estimator_checks.py index b436737..f23d51c 100644 --- a/tsml/tests/estimator_checks.py +++ b/tsml/tests/estimator_checks.py @@ -8,12 +8,13 @@ from sklearn.base import is_classifier, is_regressor from sklearn.exceptions import SkipTestWarning +from sklearn.utils._testing import ignore_warnings from sklearn.utils.estimator_checks import ( check_get_params_invariance, + check_no_attributes_set_in_init, check_parameters_default_constructible, check_set_params, ) -from sklearn.utils.validation import has_fit_parameter import tsml.tests._sklearn_checks as patched_checks from tsml.utils._tags import _safe_tags @@ -55,14 +56,13 @@ def _yield_checks(estimator): """sklearn""" tags = _safe_tags(estimator) - yield patched_checks.check_no_attributes_set_in_init + yield check_no_attributes_set_in_init yield patched_checks.check_estimators_dtypes yield patched_checks.check_fit_score_takes_y yield patched_checks.check_estimators_fit_returns_self yield partial( patched_checks.check_estimators_fit_returns_self, readonly_memmap=True ) - yield patched_checks.check_pipeline_consistency yield patched_checks.check_estimators_overwrite_params yield patched_checks.check_estimators_pickle yield patched_checks.check_estimator_get_tags_default_keys @@ -73,19 +73,10 @@ def _yield_checks(estimator): yield check_set_params yield patched_checks.check_dict_unchanged yield patched_checks.check_dont_overwrite_parameters - yield patched_checks.check_fit_idempotent yield patched_checks.check_fit_check_is_fitted - if has_fit_parameter(estimator, "sample_weight"): - yield patched_checks.check_sample_weights_not_an_array - yield patched_checks.check_sample_weights_list - if not tags["pairwise"]: - yield patched_checks.check_sample_weights_shape - yield patched_checks.check_sample_weights_not_overwritten - yield partial(patched_checks.check_sample_weights_invariance, kind="ones") - yield partial(patched_checks.check_sample_weights_invariance, kind="zeros") - if not tags["no_validation"]: + yield check_estimator_input_types yield patched_checks.check_complex_data yield patched_checks.check_dtype_object yield patched_checks.check_estimators_empty_data_messages @@ -99,81 +90,61 @@ def _yield_checks(estimator): yield patched_checks.check_estimators_nan_inf if not tags["non_deterministic"]: + yield patched_checks.check_pipeline_consistency + yield patched_checks.check_fit_idempotent yield patched_checks.check_methods_sample_order_invariance yield patched_checks.check_methods_subset_invariance - if tags["requires_positive_X"]: - yield patched_checks.check_fit_non_negative - - if tags["pairwise"]: - yield patched_checks.check_nonsquare_error - if not tags["univariate_only"]: - _yield_multivariate_checks(estimator) + yield check_estimator_handles_multivariate_data + yield check_fit3d_predict2d + else: + yield check_estimator_cannot_handle_multivariate_data if not tags["equal_length_only"]: - _yield_unequal_length_checks(estimator) + yield check_estimator_handles_unequal_data + yield check_n_features_unequal + else: + yield check_estimator_cannot_handle_unequal_data def _yield_classifier_checks(classifier): tags = _safe_tags(classifier) - yield patched_checks.check_classifier_data_not_an_array + yield patched_checks.check_estimator_data_not_an_array yield patched_checks.check_classifiers_one_label - yield patched_checks.check_classifiers_one_label_sample_weights yield patched_checks.check_classifiers_classes - yield patched_checks.check_estimators_partial_fit_n_features yield patched_checks.check_classifiers_train yield partial(patched_checks.check_classifiers_train, readonly_memmap=True) yield partial( patched_checks.check_classifiers_train, readonly_memmap=True, X_dtype="float32" ) - yield patched_checks.check_classifiers_regression_target - yield patched_checks.check_non_transformer_estimators_n_iter yield patched_checks.check_decision_proba_consistency - if tags["multioutput"]: - yield patched_checks.check_classifier_multioutput - - if tags["multilabel"]: - yield patched_checks.check_classifiers_multilabel_representation_invariance - yield patched_checks.check_classifiers_multilabel_output_format_predict - yield patched_checks.check_classifiers_multilabel_output_format_predict_proba - yield patched_checks.check_classifiers_multilabel_output_format_decision_function - if not tags["no_validation"]: + yield patched_checks.check_classifiers_regression_target yield patched_checks.check_supervised_y_no_nan - if not tags["multioutput_only"]: - yield patched_checks.check_supervised_y_2d + yield patched_checks.check_supervised_y_2d if tags["requires_fit"]: yield patched_checks.check_estimators_unfitted - if "class_weight" in classifier.get_params().keys(): - yield patched_checks.check_class_weight_classifiers - def _yield_regressor_checks(regressor): tags = _safe_tags(regressor) + yield patched_checks.check_estimator_data_not_an_array yield patched_checks.check_regressors_train yield partial(patched_checks.check_regressors_train, readonly_memmap=True) yield partial( patched_checks.check_regressors_train, readonly_memmap=True, X_dtype="float32" ) - yield patched_checks.check_regressor_data_not_an_array - yield patched_checks.check_estimators_partial_fit_n_features yield patched_checks.check_regressors_no_decision_function yield patched_checks.check_regressors_int - yield patched_checks.check_non_transformer_estimators_n_iter - - if tags["multioutput"]: - yield patched_checks.check_regressor_multioutput if not tags["no_validation"]: yield patched_checks.check_supervised_y_no_nan - if not tags["multioutput_only"]: - yield patched_checks.check_supervised_y_2d + yield patched_checks.check_supervised_y_2d if tags["requires_fit"]: yield patched_checks.check_estimators_unfitted @@ -184,7 +155,6 @@ def _yield_transformer_checks(transformer): yield patched_checks.check_transformer_general yield partial(patched_checks.check_transformer_general, readonly_memmap=True) - yield patched_checks.check_transformer_n_iter if not tags["no_validation"]: yield patched_checks.check_transformer_data_not_an_array @@ -192,23 +162,46 @@ def _yield_transformer_checks(transformer): if tags["preserves_dtype"]: yield patched_checks.check_transformer_preserve_dtypes - if not tags["stateless"]: - yield patched_checks.check_transformers_unfitted + if tags["requires_fit"]: + yield patched_checks.check_estimators_unfitted def _yield_clustering_checks(clusterer): yield patched_checks.check_clusterer_compute_labels_predict yield patched_checks.check_clustering yield partial(patched_checks.check_clustering, readonly_memmap=True) - yield patched_checks.check_estimators_partial_fit_n_features - if not hasattr(clusterer, "transform"): - yield patched_checks.check_non_transformer_estimators_n_iter + +def check_estimator_input_types(name, estimator_orig): + pass + + +@ignore_warnings(category=FutureWarning) +def check_fit3d_predict2d(name, estimator_orig): + pass + + +@ignore_warnings(category=FutureWarning) +def check_estimator_cannot_handle_multivariate_data(name, estimator_orig): + pass + + +@ignore_warnings(category=FutureWarning) +def check_estimator_handles_multivariate_data(name, estimator_orig): + pass + + +@ignore_warnings(category=FutureWarning) +def check_estimator_cannot_handle_unequal_data(name, estimator_orig): + pass -def _yield_multivariate_checks(estimator): +@ignore_warnings(category=FutureWarning) +def check_estimator_handles_unequal_data(name, estimator_orig): pass -def _yield_unequal_length_checks(estimator): +@ignore_warnings(category=FutureWarning) +def check_n_features_unequal(name, estimator_orig): pass + diff --git a/tsml/transformations/_ar_coefficient.py b/tsml/transformations/_ar_coefficient.py index 268932b..1d84e8b 100644 --- a/tsml/transformations/_ar_coefficient.py +++ b/tsml/transformations/_ar_coefficient.py @@ -50,4 +50,4 @@ def transform(self, X, y=None): return Xt def _more_tags(self): - return {"stateless": True, "optional_dependency": True} + return {"requires_fit": False, "optional_dependency": True} diff --git a/tsml/transformations/_catch22.py b/tsml/transformations/_catch22.py index a62735a..eddc5b4 100644 --- a/tsml/transformations/_catch22.py +++ b/tsml/transformations/_catch22.py @@ -16,6 +16,11 @@ from sklearn.utils.fixes import delayed from tsml.base import BaseTimeSeriesEstimator +from tsml.utils.numba_functions.general import ( + z_normalise_series, + z_normalise_series_with_mean, +) +from tsml.utils.numba_functions.stats import mean, numba_max, numba_min from tsml.utils.validation import _check_optional_dependency, check_n_jobs feature_names = [ @@ -242,26 +247,26 @@ def _transform_case(self, X, f_idx, features): if feature == 0 or feature == 1 or feature == 11: if smin is None: - smin = np.min(series) + smin = numba_min(series) if smax is None: - smax = np.max(series) + smax = numba_max(series) args = [series, smin, smax] elif feature == 2 or feature == 22: if smean is None: - smean = np.mean(series) + smean = mean(series) args = [series, smean] elif feature == 3 or feature == 4: if self.outlier_norm: if smean is None: - smean = np.mean(series) + smean = mean(series) if outlier_series is None: - outlier_series = _normalise_series(series, smean) + outlier_series = z_normalise_series_with_mean(series, smean) args = [outlier_series] else: args = [series] elif feature == 7 or feature == 8: if smean is None: - smean = np.mean(series) + smean = mean(series) if fft is None: nfft = int( np.power(2, np.ceil(np.log(len(series)) / np.log(2))) @@ -270,7 +275,7 @@ def _transform_case(self, X, f_idx, features): args = [series, fft] elif feature == 5 or feature == 6 or feature == 12: if smean is None: - smean = np.mean(series) + smean = mean(series) if fft is None: nfft = int( np.power(2, np.ceil(np.log(len(series)) / np.log(2))) @@ -281,7 +286,7 @@ def _transform_case(self, X, f_idx, features): args = [ac] elif feature == 16 or feature == 17 or feature == 20: if smean is None: - smean = np.mean(series) + smean = mean(series) if fft is None: nfft = int( np.power(2, np.ceil(np.log(len(series)) / np.log(2))) @@ -303,7 +308,7 @@ def _transform_case(self, X, f_idx, features): return c22 def _more_tags(self): - return {"X_types": ["np_list", "3darray"], "stateless": True} + return {"X_types": ["np_list", "3darray"], "requires_fit": False} @staticmethod def _DN_HistogramMode_5(X, smin, smax): @@ -1172,14 +1177,6 @@ def _spline_fit(X): return y_out -@njit(fastmath=True, cache=True) -def _normalise_series(X, mean): - std = np.std(X) - if std > 0: - return (X - mean) / std - return X - - def _verify_features(features, catch24): if isinstance(features, str): if features == "all": @@ -1417,10 +1414,7 @@ def _transform_case(self, X, f_idx, features): series = list(X[i]) if self.outlier_norm and (3 in f_idx or 4 in f_idx): - outlier_series = np.array(series) - outlier_series = list( - _normalise_series(outlier_series, np.mean(outlier_series)) - ) + outlier_series = list(z_normalise_series(X[i])) for n, feature in enumerate(f_idx): f_count += 1 @@ -1441,6 +1435,6 @@ def _transform_case(self, X, f_idx, features): def _more_tags(self): return { "X_types": ["np_list", "3darray"], - "stateless": True, + "requires_fit": False, "optional_dependency": True, } diff --git a/tsml/transformations/_function_transformer.py b/tsml/transformations/_function_transformer.py index 4424223..4b0c375 100644 --- a/tsml/transformations/_function_transformer.py +++ b/tsml/transformations/_function_transformer.py @@ -102,6 +102,6 @@ def transform(self, X): def _more_tags(self): return { "no_validation": not self.validate, - "stateless": True, + "requires_fit": False, "X_types": ["3darray", "2darray", "np_list"], } diff --git a/tsml/transformations/_periodogram.py b/tsml/transformations/_periodogram.py index d108679..1a33f55 100644 --- a/tsml/transformations/_periodogram.py +++ b/tsml/transformations/_periodogram.py @@ -64,4 +64,4 @@ def transform(self, X, y=None): return Xt def _more_tags(self): - return {"stateless": True, "optional_dependency": True} + return {"requires_fit": False, "optional_dependency": True} diff --git a/tsml/transformations/_shapelet_transform.py b/tsml/transformations/_shapelet_transform.py index 23ba49c..50a555e 100644 --- a/tsml/transformations/_shapelet_transform.py +++ b/tsml/transformations/_shapelet_transform.py @@ -183,6 +183,7 @@ def fit(self, X, y=None): This estimator. """ X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) + X = self._convert_X(X) self.n_instances_, self.n_dims_, self.series_length_ = X.shape self.classes_, self._class_counts = np.unique(y, return_counts=True) @@ -335,6 +336,7 @@ def transform(self, X, y=None): check_is_fitted(self) X = self._validate_data(X=X, reset=True) + X = self._convert_X(X) output = np.zeros((len(X), len(self.shapelets_))) diff --git a/tsml/transformations/_summary_features.py b/tsml/transformations/_summary_features.py index 98088d2..1441c2a 100644 --- a/tsml/transformations/_summary_features.py +++ b/tsml/transformations/_summary_features.py @@ -87,4 +87,4 @@ def transform(self, X, y=None): return Xt def _more_tags(self): - return {"stateless": True} + return {"requires_fit": False} diff --git a/tsml/utils/_tags.py b/tsml/utils/_tags.py index 93c6a4f..3f68026 100644 --- a/tsml/utils/_tags.py +++ b/tsml/utils/_tags.py @@ -11,23 +11,14 @@ _DEFAULT_TAGS = { # sklearn tags "non_deterministic": False, - "requires_positive_X": False, - "requires_positive_y": False, "X_types": ["3darray"], - "poor_score": False, "no_validation": False, - "multioutput": False, "allow_nan": False, - "stateless": False, - "multilabel": False, - "_skip_test": False, - "_xfail_checks": False, - "multioutput_only": False, - "binary_only": False, - "requires_fit": True, "preserves_dtype": [np.float64], + "requires_fit": True, "requires_y": False, - "pairwise": False, + "_skip_test": False, + "_xfail_checks": False, # tsml tags "optional_dependency": False, "univariate_only": False, diff --git a/tsml/utils/numba_functions/general.py b/tsml/utils/numba_functions/general.py index 1b5b5b9..b338a08 100644 --- a/tsml/utils/numba_functions/general.py +++ b/tsml/utils/numba_functions/general.py @@ -135,6 +135,38 @@ def first_order_differences_3d(X: np.ndarray) -> np.ndarray: return X[:, :, 1:] - X[:, :, :-1] +@njit(fastmath=True, cache=True) +def z_normalise_series_with_mean(X: np.ndarray, series_mean: float) -> np.ndarray: + """Numba series normalization function for a 1d numpy array with mean. + + Parameters + ---------- + X : 1d numpy array + A 1d numpy array of values + series_mean : float + The mean of the series + + Returns + ------- + arr : 1d numpy array + The normalised series + + Examples + -------- + >>> import numpy as np + >>> from tsml.utils.numba_functions.general import z_normalise_series_with_mean + >>> from tsml.utils.numba_functions.stats import mean + >>> X = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4]) + >>> X_norm = z_normalise_series_with_mean(X, mean(X)) + """ + s = stats.std(X) + if s > 0: + arr = (X - series_mean) / s + else: + arr = X - series_mean + return arr + + @njit(fastmath=True, cache=True) def z_normalise_series(X: np.ndarray) -> np.ndarray: """Numba series normalization function for a 1d numpy array. diff --git a/tsml/utils/testing.py b/tsml/utils/testing.py index 776d4d9..6a8c456 100644 --- a/tsml/utils/testing.py +++ b/tsml/utils/testing.py @@ -113,6 +113,7 @@ def generate_3d_test_data( n_channels: int = 1, series_length: int = 12, n_labels: int = 2, + regression_target: bool = False, random_state: Union[int, None] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Randomly generate 3D data for testing. @@ -129,6 +130,8 @@ def generate_3d_test_data( The number of features/series length to generate. n_labels : int The number of unique labels to generate. + regression_target : bool + If True, the target will be a float, otherwise an int. random_state : int or None Seed for random number generation. @@ -152,11 +155,17 @@ def generate_3d_test_data( rng = np.random.RandomState(random_state) X = n_labels * rng.uniform(size=(n_samples, n_channels, series_length)) y = X[:, 0, 0].astype(int) + for i in range(n_labels): if len(y) > i: X[i, 0, 0] = i y[i] = i X = X * (y[:, None, None] + 1) + + if regression_target: + y = y.astype(np.float32) + y += rng.uniform(size=y.shape) + return X, y @@ -164,6 +173,7 @@ def generate_2d_test_data( n_samples: int = 10, series_length: int = 8, n_labels: int = 2, + regression_target: bool = False, random_state: Union[int, None] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Randomly generate 2D data for testing. @@ -178,6 +188,8 @@ def generate_2d_test_data( The number of features/series length to generate. n_labels : int The number of unique labels to generate. + regression_target : bool + If True, the target will be a float, otherwise an int. random_state : int or None Seed for random number generation. @@ -200,11 +212,17 @@ def generate_2d_test_data( rng = np.random.RandomState(random_state) X = n_labels * rng.uniform(size=(n_samples, series_length)) y = X[:, 0].astype(int) + for i in range(n_labels): if len(y) > i: X[i, 0] = i y[i] = i X = X * (y[:, None] + 1) + + if regression_target: + y = y.astype(np.float32) + y += rng.uniform(size=y.shape) + return X, y @@ -214,6 +232,7 @@ def generate_unequal_test_data( min_series_length: int = 6, max_series_length: int = 8, n_labels: int = 2, + regression_target: bool = False, random_state: Union[int, None] = None, ) -> Tuple[List[np.ndarray], np.ndarray]: """Randomly generate unequal length 3D data for testing. @@ -232,6 +251,8 @@ def generate_unequal_test_data( The maximum number of features/series length to generate for invidiaul series. n_labels : int The number of unique labels to generate. + regression_target : bool + If True, the target will be a float, otherwise an int. random_state : int or None Seed for random number generation. @@ -255,7 +276,7 @@ def generate_unequal_test_data( """ rng = np.random.RandomState(random_state) X = [] - y = np.zeros(n_samples) + y = np.zeros(n_samples, dtype=np.int32) for i in range(n_samples): series_length = rng.randint(min_series_length, max_series_length + 1) @@ -269,4 +290,8 @@ def generate_unequal_test_data( X.append(x) y[i] = label + if regression_target: + y = y.astype(np.float32) + y += rng.uniform(size=y.shape) + return X, y