diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c08ed041..dab4fdae3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add regressors example notebook ([#577](https://github.com/tinkoff-ai/etna/pull/577)) - Add option `season_number` to DateFlagsTransform ([#567](https://github.com/tinkoff-ai/etna/pull/567)) - - +- +- +- Create `AbstractPipeline` ([#573](https://github.com/tinkoff-ai/etna/pull/573)) +- ### Changed - Change the way `ProphetModel` works with regressors ([#383](https://github.com/tinkoff-ai/etna/pull/383)) - Change the way `SARIMAXModel` works with regressors ([#380](https://github.com/tinkoff-ai/etna/pull/380)) @@ -39,10 +42,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update CONTRIBUTING.md ([#536](https://github.com/tinkoff-ai/etna/pull/536)) - - Rename `_CatBoostModel`, `_HoltWintersModel`, `_SklearnModel` ([#543](https://github.com/tinkoff-ai/etna/pull/543)) -- +- Add logging to TSDataset.make_future, log repr of transform instead of class name ([#555](https://github.com/tinkoff-ai/etna/pull/555)) - Rename `_SARIMAXModel` and `_ProphetModel`, make `SARIMAXModel` and `ProphetModel` inherit from `PerSegmentPredictionIntervalModel` ([#549](https://github.com/tinkoff-ai/etna/pull/549)) - -- +- Make detrending polynomial ([#566](https://github.com/tinkoff-ai/etna/pull/566)) - - Make `LabelEncoderTransform` and `OneHotEncoderTransform` multi-segment ([#554](https://github.com/tinkoff-ai/etna/pull/554)) ### Fixed diff --git a/etna/core/mixins.py b/etna/core/mixins.py index f8ac4e8e9..8f7c4a061 100644 --- a/etna/core/mixins.py +++ b/etna/core/mixins.py @@ -16,14 +16,14 @@ def __repr__(self): continue elif param.kind == param.VAR_KEYWORD: for arg_, value in self.__dict__[arg].items(): - args_str_representation += f"{arg_} = {value.__repr__()}, " + args_str_representation += f"{arg_} = {repr(value)}, " else: try: 
value = self.__dict__[arg] except KeyError as e: value = None warnings.warn(f"You haven't set all parameters inside class __init__ method: {e}") - args_str_representation += f"{arg} = {value.__repr__()}, " + args_str_representation += f"{arg} = {repr(value)}, " return f"{self.__class__.__name__}({args_str_representation})" diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 4f87deac1..1abff2230 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -134,7 +134,7 @@ def transform(self, transforms: Sequence["Transform"]): self._check_endings(warning=True) self.transforms = transforms for transform in self.transforms: - tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset") + tslogger.log(f"Transform {repr(transform)} is applied to dataset") columns_before = set(self.columns.get_level_values("feature")) self.df = transform.transform(self.df) columns_after = set(self.columns.get_level_values("feature")) @@ -145,7 +145,7 @@ def fit_transform(self, transforms: Sequence["Transform"]): self._check_endings(warning=True) self.transforms = transforms for transform in self.transforms: - tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset") + tslogger.log(f"Transform {repr(transform)} is applied to dataset") columns_before = set(self.columns.get_level_values("feature")) self.df = transform.fit_transform(self.df) columns_after = set(self.columns.get_level_values("feature")) @@ -288,6 +288,7 @@ def make_future(self, future_steps: int) -> "TSDataset": if self.transforms is not None: for transform in self.transforms: + tslogger.log(f"Transform {repr(transform)} is applied to dataset") df = transform.transform(df) future_dataset = df.tail(future_steps).copy(deep=True) diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py index 4166e3842..1e6379332 100644 --- a/etna/pipeline/base.py +++ b/etna/pipeline/base.py @@ -1,9 +1,91 @@ import warnings from abc import ABC from abc import 
abstractmethod +from typing import Any +from typing import Dict +from typing import List from typing import Sequence +from typing import Tuple + +import pandas as pd from etna.core import BaseMixin +from etna.datasets import TSDataset +from etna.metrics import Metric + + +class AbstractPipeline(ABC): + """Interface for pipeline.""" + + @abstractmethod + def fit(self, ts: TSDataset) -> "AbstractPipeline": + """Fit the Pipeline. + + Parameters + ---------- + ts: + Dataset with timeseries data + + Returns + ------- + self: + Fitted Pipeline instance + """ + pass + + @abstractmethod + def forecast(self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975)) -> TSDataset: + """Make predictions. + + Parameters + ---------- + prediction_interval: + If True returns prediction interval for forecast + quantiles: + Levels of prediction distribution. By default 2.5% and 97.5% taken to form a 95% prediction interval + + Returns + ------- + forecast: + Dataset with predictions + """ + pass + + @abstractmethod + def backtest( + self, + ts: TSDataset, + metrics: List[Metric], + n_folds: int = 5, + mode: str = "expand", + aggregate_metrics: bool = False, + n_jobs: int = 1, + joblib_params: Dict[str, Any] = dict(verbose=11, backend="multiprocessing", mmap_mode="c"), + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Run backtest with the pipeline. 
+ + Parameters + ---------- + ts: + Dataset to fit models in backtest + metrics: + List of metrics to compute for each fold + n_folds: + Number of folds + mode: + One of 'expand', 'constant' -- train generation policy + aggregate_metrics: + If True aggregate metrics above folds, return raw metrics otherwise + n_jobs: + Number of jobs to run in parallel + joblib_params: + Additional parameters for joblib.Parallel + + Returns + ------- + metrics_df, forecast_df, fold_info_df: + Metrics dataframe, forecast dataframe and dataframe with information about folds + """ class BasePipeline(ABC, BaseMixin): diff --git a/etna/transforms/decomposition/detrend.py b/etna/transforms/decomposition/detrend.py index 60a2301ae..acbbd0c83 100644 --- a/etna/transforms/decomposition/detrend.py +++ b/etna/transforms/decomposition/detrend.py @@ -1,7 +1,10 @@ +import numpy as np import pandas as pd from sklearn.base import RegressorMixin from sklearn.linear_model import LinearRegression from sklearn.linear_model import TheilSenRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import PolynomialFeatures from etna.transforms.base import PerSegmentWrapper from etna.transforms.base import Transform @@ -11,7 +14,7 @@ class _OneSegmentLinearTrendBaseTransform(Transform): """LinearTrendBaseTransform is a base class that implements trend subtraction and reconstruction feature.""" - def __init__(self, in_column: str, regressor: RegressorMixin): + def __init__(self, in_column: str, regressor: RegressorMixin, poly_degree: int = 1): """ Create instance of _OneSegmentLinearTrendBaseTransform. 
@@ -21,9 +24,26 @@ def __init__(self, in_column: str, regressor: RegressorMixin): name of processed column regressor: instance of sklearn RegressorMixin to predict trend + poly_degree: + degree of polynomial to fit trend on """ - self._linear_model = regressor self.in_column = in_column + self.poly_degree = poly_degree + self._pipeline = Pipeline( + [("polynomial", PolynomialFeatures(degree=self.poly_degree, include_bias=False)), ("regressor", regressor)] + ) + # verification that this variable is fitted isn't needed because this class isn't used by the user + self._x_median = None + + @staticmethod + def _get_x(df) -> np.ndarray: + series_len = len(df) + x = df.index.to_series() + if isinstance(type(x.dtype), pd.Timestamp): + raise ValueError("Your timestamp column has wrong format. Need np.datetime64 or datetime.datetime") + x = x.apply(lambda ts: ts.timestamp()) + x = x.to_numpy().reshape(series_len, 1) + return x def fit(self, df: pd.DataFrame) -> "_OneSegmentLinearTrendBaseTransform": """ @@ -40,14 +60,11 @@ def fit(self, df: pd.DataFrame) -> "_OneSegmentLinearTrendBaseTransform": instance with trained regressor """ df = df.dropna(subset=[self.in_column]) - series_len = len(df) - x = df.index.to_series() - if isinstance(type(x.dtype), pd.Timestamp): - raise ValueError("Your timestamp column has wrong format. 
Need np.datetime64 or datetime.datetime") - x = x.apply(lambda ts: ts.timestamp()) - x = x.to_numpy().reshape(series_len, 1) + x = self._get_x(df) + self._x_median = np.median(x) + x -= self._x_median y = df[self.in_column].tolist() - self._linear_model.fit(x, y) + self._pipeline.fit(x, y) return self def transform(self, df: pd.DataFrame) -> pd.DataFrame: @@ -65,12 +82,10 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: residue after trend subtraction """ result = df.copy() - series_len = len(df) - x = pd.to_datetime(df.index.to_series()) - x = x.apply(lambda ts: ts.timestamp()) - x = x.to_numpy().reshape(series_len, 1) + x = self._get_x(df) + x -= self._x_median y = df[self.in_column].values - trend = self._linear_model.predict(x) + trend = self._pipeline.predict(x) no_trend_timeseries = y - trend result[self.in_column] = no_trend_timeseries return result @@ -106,12 +121,10 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: data with reconstructed trend """ result = df.copy() - series_len = len(df) - x = pd.to_datetime(df.index.to_series()) - x = x.apply(lambda ts: ts.timestamp()) - x = x.to_numpy().reshape(series_len, 1) + x = self._get_x(df) + x -= self._x_median y = df[self.in_column].values - trend = self._linear_model.predict(x) + trend = self._pipeline.predict(x) add_trend_timeseries = y + trend result[self.in_column] = add_trend_timeseries if self.in_column == "target": @@ -122,7 +135,7 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: class LinearTrendTransform(PerSegmentWrapper): - """Transform that uses sklearn.linear_model.LinearRegression to find linear trend in data. + """Transform that uses sklearn.linear_model.LinearRegression to find linear or polynomial trend in data. Warning ------- @@ -130,27 +143,32 @@ class LinearTrendTransform(PerSegmentWrapper): it uses information from the whole train part. 
""" - def __init__(self, in_column: str, **regression_params): + def __init__(self, in_column: str, poly_degree: int = 1, **regression_params): """Create instance of LinearTrendTransform. Parameters ---------- in_column: name of processed column + poly_degree: + degree of polynomial to fit trend on regression_params: params that should be used to init LinearRegression """ self.in_column = in_column + self.poly_degree = poly_degree self.regression_params = regression_params super().__init__( transform=_OneSegmentLinearTrendBaseTransform( - in_column=self.in_column, regressor=LinearRegression(**self.regression_params) + in_column=self.in_column, + regressor=LinearRegression(**self.regression_params), + poly_degree=self.poly_degree, ) ) class TheilSenTrendTransform(PerSegmentWrapper): - """Transform that uses sklearn.linear_model.TheilSenRegressor to find linear trend in data. + """Transform that uses sklearn.linear_model.TheilSenRegressor to find linear or polynomial trend in data. Warning ------- @@ -158,13 +176,15 @@ class TheilSenTrendTransform(PerSegmentWrapper): it uses information from the whole train part. """ - def __init__(self, in_column: str, **regression_params): + def __init__(self, in_column: str, poly_degree: int = 1, **regression_params): """Create instance of TheilSenTrendTransform. Parameters ---------- in_column: name of processed column + poly_degree: + degree of polynomial to fit trend on regression_params: params that should be used to init TheilSenRegressor @@ -174,9 +194,12 @@ def __init__(self, in_column: str, **regression_params): of features (plus 1 if fit_intercept=True) and the number of samples in the shortest segment as a maximum. 
""" self.in_column = in_column + self.poly_degree = poly_degree self.regression_params = regression_params super().__init__( transform=_OneSegmentLinearTrendBaseTransform( - in_column=self.in_column, regressor=TheilSenRegressor(**self.regression_params) + in_column=self.in_column, + regressor=TheilSenRegressor(**self.regression_params), + poly_degree=self.poly_degree, ) ) diff --git a/tests/test_loggers/test_console_logger.py b/tests/test_loggers/test_console_logger.py index 6b22127bd..cd739b69e 100644 --- a/tests/test_loggers/test_console_logger.py +++ b/tests/test_loggers/test_console_logger.py @@ -1,4 +1,5 @@ from tempfile import NamedTemporaryFile +from typing import Sequence import pytest from loguru import logger as _logger @@ -16,20 +17,50 @@ from etna.transforms import AddConstTransform from etna.transforms import DateFlagsTransform from etna.transforms import LagTransform +from etna.transforms import Transform + + +def check_logged_transforms(log_file: str, transforms: Sequence[Transform]): + """Check that transforms are logged into the file.""" + with open(log_file, "r") as in_file: + lines = in_file.readlines() + assert len(lines) == len(transforms) + for line, transform in zip(lines, transforms): + assert transform.__class__.__name__ in line + + +def test_tsdataset_transform_logging(example_tsds: TSDataset): + """Check working of logging inside `TSDataset.transform`.""" + transforms = [LagTransform(lags=5, in_column="target"), AddConstTransform(value=5, in_column="target")] + file = NamedTemporaryFile() + _logger.add(file.name) + example_tsds.fit_transform(transforms=transforms) + idx = tslogger.add(ConsoleLogger()) + example_tsds.transform(transforms=example_tsds.transforms) + check_logged_transforms(log_file=file.name, transforms=transforms) + tslogger.remove(idx) def test_tsdataset_fit_transform_logging(example_tsds: TSDataset): - """Check working of logging inside fit_transform of TSDataset.""" + """Check working of logging inside 
`TSDataset.fit_transform`.""" transforms = [LagTransform(lags=5, in_column="target"), AddConstTransform(value=5, in_column="target")] file = NamedTemporaryFile() _logger.add(file.name) idx = tslogger.add(ConsoleLogger()) example_tsds.fit_transform(transforms=transforms) - with open(file.name, "r") as in_file: - lines = in_file.readlines() - assert len(lines) == len(transforms) - for line, transform in zip(lines, transforms): - assert transform.__class__.__name__ in line + check_logged_transforms(log_file=file.name, transforms=transforms) + tslogger.remove(idx) + + +def test_tsdataset_make_future_logging(example_tsds: TSDataset): + """Check working of logging inside `TSDataset.make_future`.""" + transforms = [LagTransform(lags=5, in_column="target"), AddConstTransform(value=5, in_column="target")] + file = NamedTemporaryFile() + _logger.add(file.name) + example_tsds.fit_transform(transforms=transforms) + idx = tslogger.add(ConsoleLogger()) + _ = example_tsds.make_future(5) + check_logged_transforms(log_file=file.name, transforms=transforms) tslogger.remove(idx) @@ -88,6 +119,8 @@ def test_model_logging(example_tsds, model): with open(file.name, "r") as in_file: lines = in_file.readlines() + # filter out logs related to transforms + lines = [line for line in lines if lags.__class__.__name__ not in line] assert len(lines) == 2 assert "fit" in lines[0] assert "forecast" in lines[1] diff --git a/tests/test_transforms/test_decomposition/test_detrend_transform.py b/tests/test_transforms/test_decomposition/test_detrend_transform.py index efbdfc791..db228d758 100644 --- a/tests/test_transforms/test_decomposition/test_detrend_transform.py +++ b/tests/test_transforms/test_decomposition/test_detrend_transform.py @@ -1,3 +1,4 @@ +import numpy as np import numpy.testing as npt import pandas as pd import pytest @@ -30,11 +31,62 @@ def df_two_segments_diff_size(example_df) -> pd.DataFrame: return df -def _test_fit_transform_one_segment( +@pytest.fixture +def df_quadratic() -> 
pd.DataFrame: + """Make dataframe with quadratic trends. Segments 1, 2 has linear trend, segments -- 3, 4 quadratic.""" + timestamp = pd.date_range(start="2020-01-01", end="2020-02-01", freq="H") + rng = np.random.default_rng(42) + df_template = pd.DataFrame({"timestamp": timestamp, "segment": "segment", "target": np.arange(len(timestamp))}) + + # create segments + sigma = 0.05 + df_1 = df_template.copy() + df_1["target"] = 0.1 * df_1["target"] + rng.normal(scale=sigma) + df_1["segment"] = "segment_1" + + df_2 = df_template.copy() + df_2["target"] = (-2) * df_2["target"] + rng.normal(scale=sigma) + df_2["segment"] = "segment_2" + + df_3 = df_template.copy() + df_3["target"] = 0.01 * df_3["target"] ** 2 + rng.normal(scale=sigma) + df_3["segment"] = "segment_3" + + df_4 = df_template.copy() + df_4["target"] = 0.01 * df_4["target"] ** 2 + 0.1 * df_4["target"] + rng.normal(scale=sigma) + df_4["segment"] = "segment_4" + + # build final dataframe + df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True) + return df + + +@pytest.fixture +def df_one_segment_linear(df_quadratic) -> pd.DataFrame: + return df_quadratic[df_quadratic["segment"] == "segment_1"].set_index("timestamp") + + +@pytest.fixture +def df_two_segments_linear(df_quadratic) -> pd.DataFrame: + df_linear = df_quadratic[df_quadratic["segment"].isin(["segment_1", "segment_2"])] + return TSDataset.to_dataset(df_linear) + + +@pytest.fixture +def df_one_segment_quadratic(df_quadratic) -> pd.DataFrame: + return df_quadratic[df_quadratic["segment"] == "segment_3"].set_index("timestamp") + + +@pytest.fixture +def df_two_segments_quadratic(df_quadratic) -> pd.DataFrame: + return TSDataset.to_dataset(df_quadratic) + + +def _test_unbiased_fit_transform_one_segment( trend_transform: _OneSegmentLinearTrendBaseTransform, df: pd.DataFrame, **comparison_kwargs ) -> None: """ - Test if residue after trend subtraction is close to zero in one segment. 
+ Test if mean of residue after trend subtraction is close to zero in one segment. Parameters ---------- @@ -49,9 +101,9 @@ def _test_fit_transform_one_segment( npt.assert_almost_equal(residue, 0, **comparison_kwargs) -def _test_fit_transform_many_segments(trend_transform, df: pd.DataFrame, **comparison_kwargs) -> None: +def _test_unbiased_fit_transform_many_segments(trend_transform, df: pd.DataFrame, **comparison_kwargs) -> None: """ - Test if residue after trend subtraction is close to zero in all segments. + Test if mean of residue after trend subtraction is close to zero in all segments. Parameters ---------- @@ -67,63 +119,183 @@ def _test_fit_transform_many_segments(trend_transform, df: pd.DataFrame, **compa npt.assert_almost_equal(residue[segment, "target"].mean(), 0, **comparison_kwargs) -def test_fit_transform_linear_trend_one_segment(df_one_segment: pd.DataFrame) -> None: +def _test_fit_transform_one_segment( + trend_transform: _OneSegmentLinearTrendBaseTransform, df: pd.DataFrame, **comparison_kwargs +) -> None: + """ + Test if residue after trend subtraction is close to zero in one segment. + + Parameters + ---------- + trend_transform: + instance of OneSegmentLinearTrendBaseTransform to predict trend with + df: + dataframe to predict + comparison_kwargs: + arguments for numpy.testing.assert_allclose function in key-value format + """ + residue = trend_transform.fit_transform(df)["target"] + residue = residue[~np.isnan(residue)] + npt.assert_allclose(residue, 0, **comparison_kwargs) + + +def _test_fit_transform_many_segments(trend_transform, df: pd.DataFrame, **comparison_kwargs) -> None: + """ + Test if residue after trend subtraction is close to zero in all segments. 
+ + Parameters + ---------- + trend_transform: + instance of LinearTrendTransform or TheilSenTrendTransform to predict trend with + df: + dataframe to predict + comparison_kwargs: + arguments for numpy.testing.assert_allclose function in key-value format + """ + residue = trend_transform.fit_transform(df) + for segment in df.columns.get_level_values("segment").unique(): + segment_residue = residue[segment, "target"] + segment_residue = segment_residue[~np.isnan(segment_residue)] + npt.assert_allclose(segment_residue, 0, **comparison_kwargs) + + +def test_unbiased_fit_transform_linear_trend_one_segment(df_one_segment: pd.DataFrame) -> None: """ - This test checks that LinearRegression predicts correct trend on one segment of slightly noised data. + This test checks that LinearRegression predicts unbiased trend on one segment of slightly noised data. """ trend_transform = _OneSegmentLinearTrendBaseTransform(in_column="target", regressor=LinearRegression()) - _test_fit_transform_one_segment(trend_transform=trend_transform, df=df_one_segment) + _test_unbiased_fit_transform_one_segment(trend_transform=trend_transform, df=df_one_segment) -def test_fit_transform_theil_sen_trend_one_segment(df_one_segment: pd.DataFrame) -> None: +def test_unbiased_fit_transform_theil_sen_trend_one_segment(df_one_segment: pd.DataFrame) -> None: """ - This test checks that TheilSenRegressor predicts correct trend on one segment of slightly noised data. + This test checks that TheilSenRegressor predicts unbiased trend on one segment of slightly noised data. 
""" trend_transform = _OneSegmentLinearTrendBaseTransform( in_column="target", regressor=TheilSenRegressor(n_subsamples=int(len(df_one_segment) / 2), max_iter=3000, tol=1e-4), ) - _test_fit_transform_one_segment(trend_transform=trend_transform, df=df_one_segment, decimal=0) + _test_unbiased_fit_transform_one_segment(trend_transform=trend_transform, df=df_one_segment, decimal=0) -def test_fit_transform_theil_sen_trend_all_data_one_segment(df_one_segment: pd.DataFrame) -> None: +def test_unbiased_fit_transform_theil_sen_trend_all_data_one_segment(df_one_segment: pd.DataFrame) -> None: """ - This test checks that TheilSenRegressor predicts correct trend on one segment of slightly noised data + This test checks that TheilSenRegressor predicts unbiased trend on one segment of slightly noised data using all the data to train model. """ # Note that it is a corner case: we use all the data to predict trend trend_transform = _OneSegmentLinearTrendBaseTransform( in_column="target", regressor=TheilSenRegressor(n_subsamples=len(df_one_segment)) ) - _test_fit_transform_one_segment(trend_transform=trend_transform, df=df_one_segment) + _test_unbiased_fit_transform_one_segment(trend_transform=trend_transform, df=df_one_segment) -def test_fit_transform_linear_trend_two_segments(df_two_segments: pd.DataFrame) -> None: +def test_unbiased_fit_transform_linear_trend_two_segments(df_two_segments: pd.DataFrame) -> None: """ - This test checks that LinearRegression predicts correct trend on two segments of slightly noised data. + This test checks that LinearRegression predicts unbiased trend on two segments of slightly noised data. 
""" trend_transform = LinearTrendTransform(in_column="target") - _test_fit_transform_many_segments(trend_transform=trend_transform, df=df_two_segments) + _test_unbiased_fit_transform_many_segments(trend_transform=trend_transform, df=df_two_segments) -def test_fit_transform_theil_sen_trend_two_segments(df_two_segments: pd.DataFrame) -> None: +def test_unbiased_fit_transform_theil_sen_trend_two_segments(df_two_segments: pd.DataFrame) -> None: """ - This test checks that TheilSenRegressor predicts correct trend on two segments of slightly noised data. + This test checks that TheilSenRegressor predicts unbiased trend on two segments of slightly noised data. """ trend_transform = TheilSenTrendTransform( in_column="target", n_subsamples=int(len(df_two_segments) / 2), max_iter=3000, tol=1e-4 ) - _test_fit_transform_many_segments(trend_transform=trend_transform, df=df_two_segments, decimal=0) + _test_unbiased_fit_transform_many_segments(trend_transform=trend_transform, df=df_two_segments, decimal=0) -def test_fit_transform_theil_sen_trend_all_data_two_segments(df_two_segments: pd.DataFrame) -> None: +def test_unbiased_fit_transform_theil_sen_trend_all_data_two_segments(df_two_segments: pd.DataFrame) -> None: """ - This test checks that TheilSenRegressor predicts correct trend on two segments of slightly noised data + This test checks that TheilSenRegressor predicts unbiased trend on two segments of slightly noised data using all the data to train model. 
""" # Note that it is a corner case: we use all the data to predict trend trend_transform = TheilSenTrendTransform(in_column="target", n_subsamples=len(df_two_segments)) - _test_fit_transform_many_segments(trend_transform=trend_transform, df=df_two_segments) + _test_unbiased_fit_transform_many_segments(trend_transform=trend_transform, df=df_two_segments) + + +@pytest.mark.parametrize("df_fixture, poly_degree", [("df_one_segment_linear", 1), ("df_one_segment_quadratic", 2)]) +def test_fit_transform_linear_trend_one_segment(df_fixture, poly_degree, request) -> None: + """ + Test that LinearRegression predicts correct trend on one segment of slightly noised data. + """ + df = request.getfixturevalue(df_fixture) + trend_transform = _OneSegmentLinearTrendBaseTransform( + in_column="target", regressor=LinearRegression(), poly_degree=poly_degree + ) + _test_fit_transform_one_segment(trend_transform=trend_transform, df=df, atol=1e-5) + + +@pytest.mark.parametrize("df_fixture, poly_degree", [("df_one_segment_linear", 1), ("df_one_segment_quadratic", 2)]) +def test_fit_transform_theil_sen_trend_one_segment(df_fixture, poly_degree, request) -> None: + """ + Test that TheilSenRegressor predicts correct trend on one segment of slightly noised data. + + Not all data is used to train the model. + """ + df = request.getfixturevalue(df_fixture) + trend_transform = _OneSegmentLinearTrendBaseTransform( + in_column="target", + regressor=TheilSenRegressor(n_subsamples=int(len(df) / 2), max_iter=3000, tol=1e-4), + poly_degree=poly_degree, + ) + _test_fit_transform_one_segment(trend_transform=trend_transform, df=df, atol=1e-5) + + +@pytest.mark.parametrize("df_fixture, poly_degree", [("df_one_segment_linear", 1), ("df_one_segment_quadratic", 2)]) +def test_fit_transform_theil_sen_trend_all_data_one_segment(df_fixture, poly_degree, request) -> None: + """ + Test that TheilSenRegressor predicts correct trend on one segment of slightly noised data. + + All data is used to train the model. 
+ """ + df = request.getfixturevalue(df_fixture) + # Note that it is a corner case: we use all the data to predict trend + trend_transform = _OneSegmentLinearTrendBaseTransform( + in_column="target", regressor=TheilSenRegressor(n_subsamples=len(df)), poly_degree=poly_degree + ) + _test_fit_transform_one_segment(trend_transform=trend_transform, df=df, atol=1e-5) + + +@pytest.mark.parametrize("df_fixture, poly_degree", [("df_two_segments_linear", 1), ("df_two_segments_quadratic", 2)]) +def test_fit_transform_linear_trend_two_segments(df_fixture, poly_degree, request) -> None: + """ + Test that LinearRegression predicts correct trend on two segments of slightly noised data. + """ + df = request.getfixturevalue(df_fixture) + trend_transform = LinearTrendTransform(in_column="target", poly_degree=poly_degree) + _test_fit_transform_many_segments(trend_transform=trend_transform, df=df, atol=1e-5) + + +@pytest.mark.parametrize("df_fixture, poly_degree", [("df_two_segments_linear", 1), ("df_two_segments_quadratic", 2)]) +def test_fit_transform_theil_sen_trend_two_segments(df_fixture, poly_degree, request) -> None: + """ + Test that TheilSenRegressor predicts correct trend on two segments of slightly noised data. + + Not all data is used to train the model. + """ + df = request.getfixturevalue(df_fixture) + trend_transform = TheilSenTrendTransform( + in_column="target", poly_degree=poly_degree, n_subsamples=int(len(df) / 2), max_iter=3000, tol=1e-4 + ) + _test_fit_transform_many_segments(trend_transform=trend_transform, df=df, atol=1e-5) + + +@pytest.mark.parametrize("df_fixture, poly_degree", [("df_two_segments_linear", 1), ("df_two_segments_quadratic", 2)]) +def test_fit_transform_theil_sen_trend_all_data_two_segments(df_fixture, poly_degree, request) -> None: + """ + Test that TheilSenRegressor predicts correct trend on two segments of slightly noised data. + + All data is used to train the model. 
+ """ + df = request.getfixturevalue(df_fixture) + # Note that it is a corner case: we use all the data to predict trend + trend_transform = TheilSenTrendTransform(in_column="target", poly_degree=poly_degree, n_subsamples=len(df)) + _test_fit_transform_many_segments(trend_transform=trend_transform, df=df, atol=1e-5) def _test_inverse_transform_one_segment( @@ -143,7 +315,7 @@ def _test_inverse_transform_one_segment( """ df_transformed = trend_transform.fit_transform(df) df_inverse_transformed = trend_transform.inverse_transform(df_transformed) - npt.assert_allclose(df["target"], df_inverse_transformed["target"]) + npt.assert_allclose(df["target"], df_inverse_transformed["target"], **comparison_kwargs) def _test_inverse_transform_many_segments(trend_transform, df: pd.DataFrame, **comparison_kwargs) -> None: @@ -165,37 +337,45 @@ def _test_inverse_transform_many_segments(trend_transform, df: pd.DataFrame, **c npt.assert_allclose(df_inverse_transformed[segment, "target"], df[segment, "target"], **comparison_kwargs) -def test_inverse_transform_linear_trend_one_segment(df_one_segment: pd.DataFrame): +@pytest.mark.parametrize("poly_degree", [1, 2]) +def test_inverse_transform_linear_trend_one_segment(df_one_segment: pd.DataFrame, poly_degree: int): """ Test that LinearTrend can correctly make inverse_transform for one segment. 
""" - trend_transform = _OneSegmentLinearTrendBaseTransform(in_column="target", regressor=LinearRegression()) + trend_transform = _OneSegmentLinearTrendBaseTransform( + in_column="target", regressor=LinearRegression(), poly_degree=poly_degree + ) _test_inverse_transform_one_segment(trend_transform=trend_transform, df=df_one_segment) -def test_inverse_transform_theil_sen_trend_one_segment(df_one_segment: pd.DataFrame): +@pytest.mark.parametrize("poly_degree", [1, 2]) +def test_inverse_transform_theil_sen_trend_one_segment(df_one_segment: pd.DataFrame, poly_degree: int): """ Test that TheilSenRegressor can correctly make inverse_transform for one segment. """ trend_transform = _OneSegmentLinearTrendBaseTransform( - in_column="target", regressor=TheilSenRegressor(n_subsamples=len(df_one_segment)) + in_column="target", regressor=TheilSenRegressor(n_subsamples=len(df_one_segment)), poly_degree=poly_degree ) _test_inverse_transform_one_segment(trend_transform=trend_transform, df=df_one_segment) -def test_inverse_transform_linear_trend_two_segments(df_two_segments: pd.DataFrame): +@pytest.mark.parametrize("poly_degree", [1, 2]) +def test_inverse_transform_linear_trend_two_segments(df_two_segments: pd.DataFrame, poly_degree: int): """ Test that LinearTrend can correctly make inverse_transform for two segments. """ - trend_transform = LinearTrendTransform(in_column="target") + trend_transform = LinearTrendTransform(in_column="target", poly_degree=poly_degree) _test_inverse_transform_many_segments(trend_transform=trend_transform, df=df_two_segments) -def test_inverse_transform_theil_sen_trend_two_segments(df_two_segments: pd.DataFrame): +@pytest.mark.parametrize("poly_degree", [1, 2]) +def test_inverse_transform_theil_sen_trend_two_segments(df_two_segments: pd.DataFrame, poly_degree: int): """ Test that TheilSenRegressor can correctly make inverse_transform for two segments. 
""" - trend_transform = TheilSenTrendTransform(in_column="target", n_subsamples=len(df_two_segments)) + trend_transform = TheilSenTrendTransform( + in_column="target", poly_degree=poly_degree, n_subsamples=len(df_two_segments) + ) _test_inverse_transform_many_segments(trend_transform=trend_transform, df=df_two_segments) @@ -209,7 +389,9 @@ def test_fit_transform_two_segments_diff_size( """ Test that TrendTransform can correctly make fit_transform for two segments of different size. """ - _test_fit_transform_many_segments(trend_transform=transformer, df=df_two_segments_diff_size, decimal=decimal) + _test_unbiased_fit_transform_many_segments( + trend_transform=transformer, df=df_two_segments_diff_size, decimal=decimal + ) @pytest.mark.parametrize( @@ -227,4 +409,4 @@ def test_inverse_transform_segments_diff_size(df_two_segments_diff_size: pd.Data [(LinearTrendTransform(in_column="target"), 7), (TheilSenTrendTransform(in_column="target"), 0)], ) def test_fit_transform_with_nans(transformer, df_with_nans, decimal): - _test_fit_transform_many_segments(trend_transform=transformer, df=df_with_nans, decimal=decimal) + _test_unbiased_fit_transform_many_segments(trend_transform=transformer, df=df_with_nans, decimal=decimal)