Skip to content

Make DifferencingTransform to fail on new segments with understandable error #1141

Merged
merged 16 commits into from
Mar 6, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix `SegmentEncoderTransform` to work with subset of segments and raise error on new segments ([#1103](https://github.com/tinkoff-ai/etna/pull/1103))
- Fix `SklearnTransform` in per-segment mode to work on subset of segments and raise error on new segments ([#1107](https://github.com/tinkoff-ai/etna/pull/1107))
- Fix `OutliersTransform` and its children to raise error on new segments ([#1139](https://github.com/tinkoff-ai/etna/pull/1139))
- Fix `DifferencingTransform` to raise error on new segments during `transform` and `inverse_transform` in inplace mode ([#1141](https://github.com/tinkoff-ai/etna/pull/1141))
## [1.14.0] - 2022-12-16
### Added
- Add python 3.10 support ([#1005](https://github.com/tinkoff-ai/etna/pull/1005))
Expand Down
54 changes: 42 additions & 12 deletions etna/transforms/math/differencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from typing import Optional
from typing import Set
from typing import Union
from typing import cast

import numpy as np
import pandas as pd

from etna.transforms.base import Transform
from etna.transforms.utils import check_new_segments
from etna.transforms.utils import match_target_quantiles


Expand Down Expand Up @@ -88,16 +90,18 @@ def fit(self, df: pd.DataFrame) -> "_SingleDifferencingTransform":
segments = sorted(set(df.columns.get_level_values("segment")))
fit_df = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy()

self._train_timestamp = fit_df.index
self._train_init_dict = {}
train_init_dict = {}
for current_segment in segments:
cur_series = fit_df.loc[:, pd.IndexSlice[current_segment, self.in_column]]
cur_series = cur_series.loc[cur_series.first_valid_index() :]

if cur_series.isna().sum() > 0:
raise ValueError(f"There should be no NaNs inside the segments")

self._train_init_dict[current_segment] = cur_series[: self.period]
train_init_dict[current_segment] = cur_series[: self.period]

self._train_init_dict = train_init_dict
self._train_timestamp = fit_df.index
self._test_init_df = fit_df.iloc[-self.period :, :]
# make multiindex levels consistent
self._test_init_df.columns = self._test_init_df.columns.remove_unused_levels()
Expand All @@ -113,12 +117,9 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:

Returns
-------
result: pd.Dataframe
result:
transformed dataframe
"""
if self._train_init_dict is None or self._test_init_df is None or self._train_timestamp is None:
raise AttributeError("Transform is not fitted")

segments = sorted(set(df.columns.get_level_values("segment")))
transformed = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy()
for current_segment in segments:
Expand Down Expand Up @@ -207,11 +208,11 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:

Returns
-------
result: pd.DataFrame
result:
transformed DataFrame.
"""
if self._train_init_dict is None or self._test_init_df is None or self._train_timestamp is None:
raise AttributeError("Transform is not fitted")
# we assume this to be fitted
self._train_timestamp = cast(pd.DatetimeIndex, self._train_timestamp)

if not self.inplace:
return df
Expand Down Expand Up @@ -312,6 +313,7 @@ def __init__(
self._differencing_transforms.append(
_SingleDifferencingTransform(in_column=result_out_column, period=self.period, inplace=True)
)
self._fit_segments: Optional[List[str]] = None
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved

def _get_column_name(self) -> str:
if self.inplace:
Expand All @@ -337,8 +339,13 @@ def fit(self, df: pd.DataFrame) -> "DifferencingTransform":
result_df = df.copy()
for transform in self._differencing_transforms:
result_df = transform.fit_transform(result_df)
self._fit_segments = df.columns.get_level_values("segment").unique().tolist()
return self

def _check_is_fitted(self):
if self._fit_segments is None:
raise ValueError("Transform is not fitted!")

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""Make a differencing transformation.

Expand All @@ -349,9 +356,21 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:

Returns
-------
result: pd.Dataframe
result:
transformed dataframe

Raises
------
ValueError:
if transform isn't fitted
NotImplementedError:
if there are segments that weren't present during training
"""
self._check_is_fitted()
segments = df.columns.get_level_values("segment").unique().tolist()
if self.inplace:
check_new_segments(transform_segments=segments, fit_segments=self._fit_segments)

result_df = df.copy()
for transform in self._differencing_transforms:
result_df = transform.transform(result_df)
Expand All @@ -367,12 +386,23 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:

Returns
-------
result: pd.DataFrame
result:
transformed DataFrame.

Raises
------
ValueError:
if transform isn't fitted
NotImplementedError:
if there are segments that weren't present during training
"""
self._check_is_fitted()
if not self.inplace:
return df

segments = df.columns.get_level_values("segment").unique().tolist()
check_new_segments(transform_segments=segments, fit_segments=self._fit_segments)

result_df = df.copy()
for transform in self._differencing_transforms[::-1]:
result_df = transform.inverse_transform(result_df)
Expand Down
10 changes: 3 additions & 7 deletions etna/transforms/math/sklearn.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import reprlib
import warnings
from copy import deepcopy
from typing import Dict
Expand All @@ -14,6 +13,7 @@
from etna.core import StringEnumWithRepr
from etna.datasets import set_columns_wide
from etna.transforms.base import Transform
from etna.transforms.utils import check_new_segments
from etna.transforms.utils import match_target_quantiles


Expand Down Expand Up @@ -231,12 +231,8 @@ def _postprocess_macro(self, df: pd.DataFrame, transformed: np.ndarray) -> np.nd

def _preprocess_per_segment(self, df: pd.DataFrame) -> np.ndarray:
self._fit_segments = cast(List[str], self._fit_segments)
transform_segments = df.columns.get_level_values("segment").unique()
new_segments = set(transform_segments) - set(self._fit_segments)
if len(new_segments) > 0:
raise NotImplementedError(
f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
)
transform_segments = df.columns.get_level_values("segment").unique().tolist()
check_new_segments(transform_segments=transform_segments, fit_segments=self._fit_segments)

df = df.loc[:, pd.IndexSlice[:, self.in_column]]
to_add_segments = set(self._fit_segments) - set(transform_segments)
Expand Down
15 changes: 3 additions & 12 deletions etna/transforms/outliers/base.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import reprlib
from abc import ABC
from abc import abstractmethod
from typing import Dict
from typing import List
from typing import Optional
from typing import cast

import numpy as np
import pandas as pd

from etna.datasets import TSDataset
from etna.transforms.base import Transform
from etna.transforms.utils import check_new_segments


class OutliersTransform(Transform, ABC):
Expand Down Expand Up @@ -68,14 +67,6 @@ def fit(self, df: pd.DataFrame) -> "OutliersTransform":

return self

def _validate_segments(self, segments: List[str]):
self._fit_segments = cast(List[str], self._fit_segments)
new_segments = set(segments) - set(self._fit_segments)
if len(new_segments) > 0:
raise NotImplementedError(
f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
)

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Replace found outliers with NaNs.
Expand All @@ -101,7 +92,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
raise ValueError("Transform is not fitted! Fit the Transform before calling transform method.")
result_df = df.copy()
segments = df.columns.get_level_values("segment").unique().tolist()
self._validate_segments(segments)
check_new_segments(transform_segments=segments, fit_segments=self._fit_segments)
for segment in segments:
result_df.loc[self.outliers_timestamps[segment], pd.IndexSlice[segment, self.in_column]] = np.NaN
return result_df
Expand Down Expand Up @@ -131,7 +122,7 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
raise ValueError("Transform is not fitted! Fit the Transform before calling inverse_transform method.")
result = df.copy()
segments = df.columns.get_level_values("segment").unique().tolist()
self._validate_segments(segments)
check_new_segments(transform_segments=segments, fit_segments=self._fit_segments)
for segment in segments:
segment_ts = result[segment, self.in_column]
segment_ts[segment_ts.index.isin(self.outliers_timestamps[segment])] = self.original_values[segment]
Expand Down
15 changes: 15 additions & 0 deletions etna/transforms/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
import re
import reprlib
from typing import List
from typing import Optional
from typing import Set


def match_target_quantiles(features: Set[str]) -> Set[str]:
    """Find quantiles in dataframe columns.

    Parameters
    ----------
    features:
        column names to search through

    Returns
    -------
    result:
        subset of ``features`` whose names have the form ``target_<digits>.<digits>``
    """
    # Raw string is required: ``\d`` and ``\.`` are regex escapes, and in a plain
    # string literal they are invalid escape sequences that trigger a warning.
    pattern = re.compile(r"target_\d+\.\d+$")
    return {feature for feature in features if pattern.match(feature) is not None}


def check_new_segments(transform_segments: List[str], fit_segments: Optional[List[str]]):
    """Check if there are any new segments that weren't present during training.

    Parameters
    ----------
    transform_segments:
        segments of the data being processed
    fit_segments:
        segments seen during fitting, ``None`` if the transform wasn't fitted

    Raises
    ------
    ValueError:
        if ``fit_segments`` is ``None``
    NotImplementedError:
        if ``transform_segments`` contains segments missing from ``fit_segments``
    """
    if fit_segments is None:
        raise ValueError("Transform is not fitted!")

    requested = set(transform_segments)
    known = set(fit_segments)
    unseen = requested - known
    if not unseen:
        return

    raise NotImplementedError(
        f"This transform can't process segments that weren't present on train data: {reprlib.repr(unseen)}"
    )
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ line_length = 120
minversion = "6.0"
doctest_optionflags = "NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL NUMBER"
filterwarnings = [
"error",
"ignore: Torchmetrics v0.9 introduced a new argument class property called `full_state_update` that",
"ignore: TSDataset freq can't be inferred",
"ignore: test_size, test_start and test_end cannot be",
Expand Down
24 changes: 3 additions & 21 deletions tests/test_transforms/test_inference/test_inverse_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ def test_inverse_transform_train_new_segments(self, transform, dataset_name, exp
(MeanSegmentEncoderTransform(), "regular_ts"),
(SegmentEncoderTransform(), "regular_ts"),
# math
(DifferencingTransform(in_column="target", inplace=True), "regular_ts"),
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"),
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"),
(MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"),
Expand Down Expand Up @@ -701,23 +702,6 @@ def test_inverse_transform_train_new_segments_failed_not_implemented(self, trans
ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={}
)

@to_be_fixed(raises=Exception)
@pytest.mark.parametrize(
"transform, dataset_name, expected_changes",
[
# math
# TODO: error should be understandable, not like now
(DifferencingTransform(in_column="target", inplace=True), "regular_ts", {"change": {"target"}}),
],
)
def test_inverse_transform_train_new_segments_failed_error(
self, transform, dataset_name, expected_changes, request
):
ts = request.getfixturevalue(dataset_name)
self._test_inverse_transform_train_new_segments(
ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes
)


class TestInverseTransformFutureNewSegments:
"""Test inverse transform on future part of new segments.
Expand Down Expand Up @@ -990,6 +974,8 @@ def test_inverse_transform_future_new_segments(self, transform, dataset_name, ex
(MeanSegmentEncoderTransform(), "regular_ts"),
(SegmentEncoderTransform(), "regular_ts"),
# math
(DifferencingTransform(in_column="target", inplace=True), "regular_ts"),
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
(DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"),
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"),
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"),
(BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"),
Expand Down Expand Up @@ -1080,10 +1066,6 @@ def test_inverse_transform_future_new_segments_failed_not_implemented(self, tran
"ts_with_exog",
{"create": {"year", "month", "weekday"}},
),
# math
# TODO: error should be understandable, not like now
(DifferencingTransform(in_column="target", inplace=True), "regular_ts", {}),
(DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}),
],
)
def test_inverse_transform_future_new_segments_failed_error(
Expand Down
6 changes: 3 additions & 3 deletions tests/test_transforms/test_inference/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,6 @@ def _test_transform_train_new_segments(self, ts, transform, train_segments, expe
"regular_ts",
{"create": {"res"}},
),
(DifferencingTransform(in_column="target", inplace=True), "regular_ts", {"change": {"target"}}),
(MADTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}),
(MaxTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}),
(MeanTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}),
Expand Down Expand Up @@ -601,6 +600,7 @@ def test_transform_train_new_segments(self, transform, dataset_name, expected_ch
(MeanSegmentEncoderTransform(), "regular_ts"),
(SegmentEncoderTransform(), "regular_ts"),
# math
(DifferencingTransform(in_column="target", inplace=True), "regular_ts"),
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"),
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"),
(MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"),
Expand Down Expand Up @@ -772,8 +772,6 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp
"regular_ts",
{"create": {"res"}},
),
(DifferencingTransform(in_column="target", inplace=True), "regular_ts", {}),
(DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}),
(MADTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}),
(MaxTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}),
(MeanTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}),
Expand Down Expand Up @@ -924,6 +922,8 @@ def test_transform_future_new_segments(self, transform, dataset_name, expected_c
(MeanSegmentEncoderTransform(), "regular_ts"),
(SegmentEncoderTransform(), "regular_ts"),
# math
(DifferencingTransform(in_column="target", inplace=True), "regular_ts"),
(DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"),
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"),
(BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"),
(BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"),
Expand Down