Skip to content

Teach DifferencingTransform to inverse_transform with NaNs #1155

Merged
merged 3 commits into from
Mar 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix `SklearnTransform` in per-segment mode to work on subset of segments and raise error on new segments ([#1107](https://github.com/tinkoff-ai/etna/pull/1107))
- Fix `OutliersTransform` and its children to raise error on new segments ([#1139](https://github.com/tinkoff-ai/etna/pull/1139))
- Fix `DifferencingTransform` to raise error on new segments during `transform` and `inverse_transform` in inplace mode ([#1141](https://github.com/tinkoff-ai/etna/pull/1141))
- Teach `DifferencingTransform` to `inverse_transform` with NaNs ([#1155](https://github.com/tinkoff-ai/etna/pull/1155))
## [1.14.0] - 2022-12-16
### Added
- Add python 3.10 support ([#1005](https://github.com/tinkoff-ai/etna/pull/1005))
Expand Down
36 changes: 26 additions & 10 deletions etna/transforms/math/differencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
class _SingleDifferencingTransform(Transform):
"""Calculate a time series differences of order 1.

This transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment.
During ``fit`` this transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment.
During ``transform`` and ``inverse_transform`` there is no special treatment of NaNs.

Notes
-----
Expand Down Expand Up @@ -86,6 +87,11 @@ def fit(self, df: pd.DataFrame) -> "_SingleDifferencingTransform":
Returns
-------
result: _SingleDifferencingTransform

Raises
------
ValueError:
if NaNs are present inside the segment
"""
segments = sorted(set(df.columns.get_level_values("segment")))
fit_df = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy()
Expand Down Expand Up @@ -124,11 +130,8 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
transformed = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy()
for current_segment in segments:
to_transform = transformed.loc[:, pd.IndexSlice[current_segment, self.in_column]]
start_idx = to_transform.first_valid_index()
# make a differentiation
transformed.loc[start_idx:, pd.IndexSlice[current_segment, self.in_column]] = to_transform.loc[
start_idx:
].diff(periods=self.period)
transformed.loc[:, pd.IndexSlice[current_segment, self.in_column]] = to_transform.diff(periods=self.period)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should rework the current tests to check the new behaviour, i.e. run the tests on fixtures with NaNs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What tests do you mean?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current tests for DifferencingTransform (test_differencing_transform) have a fixture with NaNs at the beginning. And tests check that everything is working.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about nans in the middle?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to add more tests in test_differencing_transform or tests in test_inference are enough?
Ok, I'll look at it.


if self.inplace:
result_df = df.copy()
Expand Down Expand Up @@ -188,10 +191,6 @@ def _reconstruct_test(self, df: pd.DataFrame, columns_to_inverse: Set[str]) -> p
init_df = init_df[segments]
to_transform = pd.concat([init_df, to_transform])

# validate values inside the series to transform
if to_transform.isna().sum().sum() > 0:
raise ValueError(f"There should be no NaNs inside the segments")

# run reconstruction and save the result
to_transform = self._make_inv_diff(to_transform)
result_df.loc[:, pd.IndexSlice[segments, column]] = to_transform
Expand All @@ -210,6 +209,13 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
-------
result:
transformed DataFrame.

Raises
------
ValueError:
if inverse transform is applied not to full train nor to test that goes after train
ValueError:
if inverse transform is applied to test that goes after train with gap
"""
# we assume this to be fitted
self._train_timestamp = cast(pd.DatetimeIndex, self._train_timestamp)
Expand Down Expand Up @@ -241,7 +247,8 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
class DifferencingTransform(Transform):
"""Calculate a time series differences.

This transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment.
During ``fit`` this transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment.
During ``transform`` and ``inverse_transform`` there is no special treatment of NaNs.

Notes
-----
Expand Down Expand Up @@ -334,6 +341,11 @@ def fit(self, df: pd.DataFrame) -> "DifferencingTransform":
Returns
-------
result: DifferencingTransform

Raises
------
ValueError:
if NaNs are present inside the segment
"""
# this is made because transforms of high order may need some columns created by transforms of lower order
result_df = df.copy()
Expand Down Expand Up @@ -395,6 +407,10 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
if transform isn't fitted
NotImplementedError:
if there are segments that weren't present during training
ValueError:
if inverse transform is applied not to full train nor to test that goes after train
ValueError:
if inverse transform is applied to test that goes after train with gap
"""
self._check_is_fitted()
if not self.inplace:
Expand Down
12 changes: 1 addition & 11 deletions tests/test_transforms/test_inference/test_inverse_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments
(LogTransform(in_column="target", inplace=True), "positive_ts"),
(LogTransform(in_column="positive", inplace=True), "ts_with_exog"),
(DifferencingTransform(in_column="target", inplace=False), "regular_ts"),
(DifferencingTransform(in_column="target", inplace=True), "regular_ts"),
(DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"),
(MADTransform(in_column="target", window=14), "regular_ts"),
(MaxTransform(in_column="target", window=14), "regular_ts"),
Expand Down Expand Up @@ -389,17 +390,6 @@ def test_inverse_transform_future_subset_segments(self, transform, dataset_name,
ts = request.getfixturevalue(dataset_name)
self._test_inverse_transform_future_subset_segments(ts, transform, segments=["segment_2"])

@to_be_fixed(ValueError, match="There should be no NaNs inside the segments")
@pytest.mark.parametrize(
"transform, dataset_name",
[
(DifferencingTransform(in_column="target", inplace=True), "regular_ts"),
],
)
def test_inverse_transform_future_subset_difference_fail(self, transform, dataset_name, request):
ts = request.getfixturevalue(dataset_name)
self._test_inverse_transform_future_subset_segments(ts, transform, segments=["segment_2"])


class TestInverseTransformTrainNewSegments:
"""Test inverse transform on train part of new segments.
Expand Down
96 changes: 63 additions & 33 deletions tests/test_transforms/test_math/test_differencing_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ def df_nans() -> pd.DataFrame:
return df


@pytest.fixture
def df_nans_middle() -> pd.DataFrame:
    """Create DataFrame with NaNs in the middle of the segment.

    Returns
    -------
    :
        result of ``TSDataset.to_dataset`` for two segments ("1" and "2") over the
        timestamps from 2021-01-01 to 2021-04-01, where rows 5..9 of the first column
        are replaced with NaN
    """
    timestamp = pd.date_range("2021-01-01", "2021-04-01")
    df_1 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(timestamp.shape[0]), "segment": "1"})
    df_2 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(timestamp.shape[0]) * 2, "segment": "2"})
    df = pd.concat([df_1, df_2], ignore_index=True)
    df = TSDataset.to_dataset(df)
    # use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0
    df.iloc[5:10, 0] = np.nan
    return df


@pytest.fixture
def df_segments_split(df_nans) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Create a pair of DataFrames with different segments."""
Expand Down Expand Up @@ -106,15 +118,10 @@ def check_transform(
series_init = df.loc[:, pd.IndexSlice[segment, "target"]]
series_transformed = transformed_df.loc[:, pd.IndexSlice[segment, out_column]]

series_init = series_init.loc[series_init.first_valid_index() :]
series_transformed = series_transformed.loc[series_transformed.first_valid_index() :]

assert series_init.shape[0] == series_transformed.shape[0] + order * period

for _ in range(order):
series_init = series_init.diff(periods=period).iloc[period:]
series_init = series_init.diff(periods=period)

assert np.all(series_init == series_transformed)
assert series_init.equals(series_transformed)


def check_inverse_transform_not_inplace(
Expand All @@ -134,10 +141,10 @@ def check_inverse_transform_inplace_train(transform: GeneralDifferencingTransfor
assert inverse_transformed_df.equals(df)


def check_inverse_transform_inplace_test(
def check_inverse_transform_inplace_filled_test(
transform: GeneralDifferencingTransform, period: int, order: int, df: pd.DataFrame
):
"""Check that differencing transform correctly makes inverse_transform on test data in inplace mode."""
"""Check that differencing transform correctly makes inverse_transform on filled test data in inplace mode."""
ts = TSDataset(df, freq="D")
ts_train, ts_test = ts.train_test_split(test_size=20)
ts_train.fit_transform(transforms=[transform])
Expand All @@ -158,6 +165,19 @@ def check_inverse_transform_inplace_test(
assert np.all(future_ts.to_pandas() == ts_test.to_pandas())


def check_inverse_transform_inplace_unfilled_test(transform: GeneralDifferencingTransform, df: pd.DataFrame):
    """Check inplace inverse_transform of the differencing transform on unfilled test data.

    The transform is fitted on the train part, a future of 20 steps is created
    and inverse-transformed as is; every value of the result is expected to be NaN.
    """
    dataset = TSDataset(df, freq="D")
    # only the train part is needed here
    train_part, _ = dataset.train_test_split(test_size=20)
    train_part.fit_transform(transforms=[transform])

    future_part = train_part.make_future(20)
    future_part.inverse_transform()

    # the whole frame should still consist of NaNs after inverse_transform
    future_frame = future_part.to_pandas()
    assert future_frame.isna().all().all()


def check_inverse_transform_inplace_test_quantiles(transform: GeneralDifferencingTransform, df: pd.DataFrame):
"""Check that differencing transform correctly makes inverse_transform on test data with quantiles."""
ts = TSDataset(df, freq="D")
Expand Down Expand Up @@ -345,6 +365,25 @@ def test_full_transform(period, order, inplace, out_column, df_nans):
check_transform(transform, period, order, out_column, df_nans, df_nans)


@pytest.mark.parametrize("period", [1, 7])
@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")])
def test_single_transform_nans_middle(period, inplace, out_column, df_nans, df_nans_middle):
    """Check values produced by _SingleDifferencingTransform when NaNs are in the middle of a segment."""
    init_params = dict(in_column="target", period=period, inplace=inplace, out_column=out_column)
    single_diff = _SingleDifferencingTransform(**init_params)
    # a single differencing transform always has order 1
    check_transform(single_diff, period, 1, out_column, df_nans, df_nans_middle)


@pytest.mark.parametrize("period", [1, 7])
@pytest.mark.parametrize("order", [1, 2])
@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")])
def test_full_transform_nans_middle(period, order, inplace, out_column, df_nans, df_nans_middle):
    """Check values produced by DifferencingTransform when NaNs are in the middle of a segment."""
    init_params = dict(in_column="target", period=period, order=order, inplace=inplace, out_column=out_column)
    differencing = DifferencingTransform(**init_params)
    check_transform(differencing, period, order, out_column, df_nans, df_nans_middle)


@pytest.mark.parametrize("period", [1, 7])
def test_single_transform_not_inplace_new_segments(period, df_segments_split):
"""Test that _SingleDifferencingTransform generates correct values in transform on new segments in non-inplace mode."""
Expand Down Expand Up @@ -466,43 +505,34 @@ def test_full_inverse_transform_inplace_train(period, order, df_nans):
check_inverse_transform_inplace_train(transform, df_nans)


@pytest.mark.parametrize(
"transform",
[
_SingleDifferencingTransform(in_column="target", period=1, inplace=True),
DifferencingTransform(in_column="target", period=1, order=1, inplace=True),
],
)
def test_general_inverse_transform_inplace_test_fail_nans(transform, df_nans):
"""Test that differencing transform fails to make inverse_transform on test data if there are NaNs."""
ts = TSDataset(df_nans, freq="D")
ts_train, ts_test = ts.train_test_split(test_size=20)

ts_train.fit_transform(transforms=[transform])
@pytest.mark.parametrize("period", [1, 7])
def test_single_inverse_transform_inplace_filled_test(period, df_nans):
    """Check inplace inverse_transform of _SingleDifferencingTransform on filled test data."""
    single_diff = _SingleDifferencingTransform(in_column="target", period=period, inplace=True)
    # a single differencing transform always has order 1
    check_inverse_transform_inplace_filled_test(transform=single_diff, period=period, order=1, df=df_nans)

# make predictions by hand only on one segment
future_ts = ts_train.make_future(20)
future_ts.df.loc[:, pd.IndexSlice["1", "target"]] = np.NaN
future_ts.df.loc[:, pd.IndexSlice["2", "target"]] = 2

# check fail on inverse_transform
with pytest.raises(ValueError, match="There should be no NaNs inside the segments"):
future_ts.inverse_transform()
@pytest.mark.parametrize("period", [1, 7])
@pytest.mark.parametrize("order", [1, 2])
def test_full_inverse_transform_inplace_filled_test(period, order, df_nans):
    """Test that DifferencingTransform correctly makes inverse_transform on filled test data in inplace mode."""
    # The name carries the ``_filled_`` marker, like the single-transform counterpart, so that
    # it does not collide with the unfilled-data test ``test_full_inverse_transform_inplace_test``:
    # two module-level functions with the same name shadow each other and only the last one is collected.
    transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=True)
    check_inverse_transform_inplace_filled_test(transform, period, order, df_nans)


@pytest.mark.parametrize("period", [1, 7])
def test_single_inverse_transform_inplace_test(period, df_nans):
"""Test that _SingleDifferencingTransform correctly makes inverse_transform on test data in inplace mode."""
"""Test that _SingleDifferencingTransform correctly makes inverse_transform on unfilled test data in inplace mode."""
transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=True)
check_inverse_transform_inplace_test(transform, period, 1, df_nans)
check_inverse_transform_inplace_unfilled_test(transform, df_nans)


@pytest.mark.parametrize("period", [1, 7])
@pytest.mark.parametrize("order", [1, 2])
def test_full_inverse_transform_inplace_test(period, order, df_nans):
"""Test that DifferencingTransform correctly makes inverse_transform on test data in inplace mode."""
"""Test that DifferencingTransform correctly makes inverse_transform on unfilled test data in inplace mode."""
transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=True)
check_inverse_transform_inplace_test(transform, period, order, df_nans)
check_inverse_transform_inplace_unfilled_test(transform, df_nans)


@pytest.mark.parametrize("period", [1, 7])
Expand Down