Skip to content

Commit

Permalink
Add regressors updating in transform loops (#374)
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-hse-repository committed Feb 8, 2022
1 parent c0a1805 commit 76d5e0e
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 5 deletions.
50 changes: 50 additions & 0 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Tuple
from typing import Union

Expand Down Expand Up @@ -134,15 +135,64 @@ def transform(self, transforms: Sequence["Transform"]):
self.transforms = transforms
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
columns_before = set(self.columns.get_level_values("feature"))
self.df = transform.transform(self.df)
columns_after = set(self.columns.get_level_values("feature"))
self._update_regressors(transform=transform, columns_before=columns_before, columns_after=columns_after)

def fit_transform(self, transforms: Sequence["Transform"]):
    """Fit every transform on the dataset, apply it, and refresh the regressors list.

    The given transforms are stored on the dataset. For each one, in order, the
    feature-level column names are captured before and after
    ``transform.fit_transform`` so that ``_update_regressors`` can see which
    columns the transform has added.

    Parameters
    ----------
    transforms:
        transforms to fit and apply, in order
    """
    self._check_endings(warning=True)
    self.transforms = transforms
    for current in self.transforms:
        tslogger.log(f"Transform {current.__class__.__name__} is applied to dataset")
        # The snapshot must be taken before self.df is replaced by the transformed frame.
        features_before = set(self.columns.get_level_values("feature"))
        self.df = current.fit_transform(self.df)
        features_after = set(self.columns.get_level_values("feature"))
        self._update_regressors(
            transform=current,
            columns_before=features_before,
            columns_after=features_after,
        )

def _update_regressors(self, transform: "Transform", columns_before: Set[str], columns_after: Set[str]):
    """Register as regressors the columns that ``transform`` has just added.

    Parameters
    ----------
    transform:
        transform that was applied between the two column snapshots
    columns_before:
        feature-level column names before the transform was applied
    columns_after:
        feature-level column names after the transform was applied

    Raises
    ------
    ValueError:
        if new columns appeared, but the transform is neither a ``FutureMixin``
        nor has an ``in_column`` attribute
    """
    # Sorted so that the order in which regressors are appended does not depend
    # on set iteration order (which varies between runs for strings).
    unseen_columns = sorted(columns_after - columns_before)
    if not unseen_columns:
        return

    # Imported inside the function as in the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from etna.transforms.base import FutureMixin

    new_regressors: List[str] = []

    if isinstance(transform, FutureMixin):
        # Every column created by a FutureMixin transform is a regressor.
        new_regressors = unseen_columns

    elif hasattr(transform, "in_column"):
        # Only the columns derived from existing regressors become regressors.
        in_column = transform.in_column  # type: ignore
        in_columns = in_column if isinstance(in_column, list) else [in_column]
        regressor_ids = [i for i, column in enumerate(in_columns) if column in self.regressors]

        if getattr(transform, "out_columns", None) is not None:
            # User defined out_columns in sklearn
            # TODO: remove this case after fixing the out_column attribute in SklearnTransform
            out_columns = transform.out_columns  # type: ignore
            new_regressors = [out_columns[i] for i in regressor_ids]
        elif getattr(transform, "out_column", None) is not None:
            # User defined out_column; positions are matched with in_columns.
            out_column = transform.out_column  # type: ignore
            out_columns = out_column if isinstance(out_column, list) else [out_column]
            new_regressors = [out_columns[i] for i in regressor_ids]
        else:
            # Default out_columns: a new column counts as a regressor when its
            # name contains the name of a regressor used as input.
            regressors_in_column = [in_columns[i] for i in regressor_ids]
            new_regressors = [
                column
                for column in unseen_columns
                if any(regressor in column for regressor in regressors_in_column)
            ]

    else:
        raise ValueError("Transform is not FutureMixin and does not have in_column attribute!")

    # Skip names that are already registered so the list never holds duplicates.
    for column in new_regressors:
        if column not in self._regressors:
            self._regressors.append(column)

def __repr__(self):
    """Return the representation of the underlying ``df``."""
    return repr(self.df)
Expand Down
88 changes: 88 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import List
from typing import Tuple

Expand All @@ -9,6 +10,10 @@
from etna.datasets import generate_ar_df
from etna.datasets.tsdataset import TSDataset
from etna.transforms import TimeSeriesImputerTransform
from etna.transforms import AddConstTransform
from etna.transforms import LagTransform
from etna.transforms import MaxAbsScalerTransform
from etna.transforms import SegmentEncoderTransform


@pytest.fixture()
Expand Down Expand Up @@ -598,3 +603,86 @@ def test_describe(df_and_regressors):
assert np.all(description["num_exogs"] == 2)
assert np.all(description["num_regressors"] == 2)
assert np.all(description["freq"] == "D")

@pytest.fixture()
def ts_with_regressors(df_and_regressors):
    """Build a daily ``TSDataset`` from ``df_and_regressors`` with ``known_future="all"``."""
    # The fixture yields three items; the explicit regressors list is not needed here.
    df, df_exog, _ = df_and_regressors
    return TSDataset(df=df, freq="D", df_exog=df_exog, known_future="all")


def _test_update_regressors_transform(ts, transforms, expected_regressors):
    """Fit each transform on ``ts.df``, apply them with ``ts.transform`` and compare regressors.

    The comparison ignores order: both lists are sorted before the assertion.
    """
    fitted = []
    for transform in transforms:
        fitted.append(transform.fit(ts.df))
    ts.transform(fitted)
    actual = sorted(ts.regressors)
    expected = sorted(expected_regressors)
    assert actual == expected


def _test_update_regressors_fit_transform(ts, transforms, expected_regressors):
    """Apply the transforms with ``ts.fit_transform`` and compare regressors.

    The comparison ignores order: both lists are sorted before the assertion.
    """
    ts.fit_transform(transforms)
    actual = sorted(ts.regressors)
    expected = sorted(expected_regressors)
    assert actual == expected


@pytest.mark.parametrize(
    "transforms, expected_regressors",
    [
        ([SegmentEncoderTransform()], ["regressor_1", "regressor_2", "regressor_segment_code"]),
        (
            [LagTransform(in_column="target", lags=[1, 2], out_column="regressor_lag")],
            ["regressor_1", "regressor_2", "regressor_lag_1", "regressor_lag_2"],
        ),
    ],
)
def test_update_regressors_with_futuremixin_transform(ts_with_regressors, transforms, expected_regressors):
    """Check that the columns created by these transforms are registered as regressors.

    Both the ``transform`` and the ``fit_transform`` paths are exercised, each on
    its own deep copy of the dataset and of the transforms.
    """
    for check in (_test_update_regressors_transform, _test_update_regressors_fit_transform):
        check(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)


@pytest.mark.parametrize(
    "transforms, expected_regressors",
    [
        (
            [MaxAbsScalerTransform(in_column="regressor_1", inplace=False, out_column="scaled")],
            ["regressor_1", "regressor_2", "scaled_regressor_1"],
        ),
        (
            [MaxAbsScalerTransform(in_column=["regressor_1", "regressor_2"], inplace=False, out_column=None)],
            [
                "regressor_1",
                "regressor_2",
                # With out_column=None the expected names are the repr of a single-column transform.
                repr(MaxAbsScalerTransform(in_column=["regressor_1"], inplace=False, out_column=None)),
                repr(MaxAbsScalerTransform(in_column=["regressor_2"], inplace=False, out_column=None)),
            ],
        ),
        (
            [
                AddConstTransform(
                    in_column="regressor_1", value=2, inplace=False, out_column="regressor_add_constant_regressor_1"
                )
            ],
            ["regressor_1", "regressor_2", "regressor_add_constant_regressor_1"],
        ),
    ],
)
def test_update_regressors_with_regressor_in_column(ts_with_regressors, transforms, expected_regressors):
    """Check that columns derived from regressor inputs are registered as regressors.

    Both the ``transform`` and the ``fit_transform`` paths are exercised, each on
    its own deep copy of the dataset and of the transforms.
    """
    for check in (_test_update_regressors_transform, _test_update_regressors_fit_transform):
        check(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)


@pytest.mark.parametrize(
    "transforms, expected_regressors",
    [
        (
            [MaxAbsScalerTransform(in_column="target", inplace=False, out_column="scaled_target")],
            ["regressor_1", "regressor_2"],
        ),
        (
            [AddConstTransform(in_column="target", value=2, inplace=False, out_column="add_constant_target")],
            ["regressor_1", "regressor_2"],
        ),
    ],
)
def test_update_regressors_not_add_not_regressors(ts_with_regressors, transforms, expected_regressors):
    """Check that columns derived from the ``target`` column are not registered as regressors.

    Both the ``transform`` and the ``fit_transform`` paths are exercised, each on
    its own deep copy of the dataset and of the transforms.
    """
    for check in (_test_update_regressors_transform, _test_update_regressors_fit_transform):
        check(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from etna.datasets.tsdataset import TSDataset
from etna.transforms.decomposition import TrendTransform
from etna.transforms.decomposition.trend import _OneSegmentTrendTransform
from etna.transforms.decomposition.trend import _TrendTransform

DEFAULT_SEGMENT = "segment_1"

Expand Down Expand Up @@ -61,9 +60,8 @@ def test_fit_transform_many_segments(example_tsds: TSDataset) -> None:
"""
out_column = "regressor_result"
example_tsds_original = deepcopy(example_tsds)
trend_transform = _TrendTransform(
trend_transform = TrendTransform(
in_column="target",
change_point_model=Binseg(),
detrend_model=LinearRegression(),
n_bkps=5,
out_column=out_column,
Expand All @@ -82,9 +80,8 @@ def test_inverse_transform_many_segments(example_tsds: TSDataset) -> None:
"""
Test that inverse_transform interface works correctly for many segment.
"""
trend_transform = _TrendTransform(
trend_transform = TrendTransform(
in_column="target",
change_point_model=Binseg(),
detrend_model=LinearRegression(),
n_bkps=5,
out_column="test",
Expand Down

0 comments on commit 76d5e0e

Please sign in to comment.