From 313cd34f4fb4f0ae31e76a240b09871b7436ee6b Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 7 Feb 2023 17:46:14 +0300 Subject: [PATCH 1/5] Fix SegmentEncoderTransform to work on subset of segments --- etna/transforms/encoders/segment_encoder.py | 13 +++++++--- .../test_segment_encoder_transform.py | 24 +++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py index e899b8eac..7de110489 100644 --- a/etna/transforms/encoders/segment_encoder.py +++ b/etna/transforms/encoders/segment_encoder.py @@ -45,11 +45,18 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: : result dataframe """ - encoded_matrix = self._le.transform(self._le.classes_) - encoded_matrix = encoded_matrix.reshape(len(self._le.classes_), -1).repeat(len(df), axis=1).T + segments = df.columns.get_level_values("segment").unique().tolist() + new_segments = set(segments) - set(self._le.classes_) + if len(new_segments) > 0: + raise ValueError( + f"This transform can't process segments that weren't present on train data: {new_segments}" + ) + + encoded_matrix = self._le.transform(segments) + encoded_matrix = encoded_matrix.reshape(len(segments), -1).repeat(len(df), axis=1).T encoded_df = pd.DataFrame( encoded_matrix, - columns=pd.MultiIndex.from_product([self._le.classes_, ["segment_code"]], names=("segment", "feature")), + columns=pd.MultiIndex.from_product([segments, ["segment_code"]], names=("segment", "feature")), index=df.index, ) encoded_df = encoded_df.astype("category") diff --git a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py index 8a8891bb3..7a8996063 100644 --- a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest from etna.transforms import SegmentEncoderTransform from tests.test_transforms.utils import assert_transformation_equals_loaded_original @@ -21,6 +22,29 @@ def test_segment_encoder_transform(dummy_df): assert codes == {0, 1}, "Codes are not 0 and 1" +def test_subset_segments(dummy_df): + train_df = dummy_df + test_df = dummy_df.loc[:, pd.IndexSlice["Omsk", :]] + transform = SegmentEncoderTransform() + + transform.fit(train_df) + transformed_test_df = transform.transform(test_df) + + assert transformed_test_df.columns.get_level_values("segment").unique().tolist() == ["Omsk"] + values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]] + assert np.all(values == values.iloc[0]) + + +def test_new_segments_error(dummy_df): + train_df = dummy_df.loc[:, pd.IndexSlice["Moscow", :]] + test_df = dummy_df.loc[:, pd.IndexSlice["Omsk", :]] + transform = SegmentEncoderTransform() + + transform.fit(train_df) + with pytest.raises(ValueError, match="This transform can't process segments that weren't present on train data"): + _ = transform.transform(test_df) + + def test_save_load(example_tsds): transform = SegmentEncoderTransform() assert_transformation_equals_loaded_original(transform=transform, ts=example_tsds) From ca79f3c1617627db1c1691e7fc31e5fa9dd18085 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 7 Feb 2023 17:48:34 +0300 Subject: [PATCH 2/5] Add in documentation about raising exception on new segments --- etna/transforms/encoders/segment_encoder.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py index 7de110489..5abf16f39 100644 --- a/etna/transforms/encoders/segment_encoder.py +++ b/etna/transforms/encoders/segment_encoder.py @@ -44,6 +44,11 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- : result dataframe + + Raises + ------ + ValueError: + If there are segments that weren't present during training """ segments = df.columns.get_level_values("segment").unique().tolist() new_segments = set(segments) - set(self._le.classes_) From cdb6f5f7bd8ec863eab9d59a6ee0b5d15b880be4 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 7 Feb 2023 17:51:05 +0300 Subject: [PATCH 3/5] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80a5759ef..db776cfd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - - - -- +- Fix `SegmentEncoderTransform` to work with subset of segments and raise error on new segments ([#1103](https://github.com/tinkoff-ai/etna/pull/1103)) - - ## [1.14.0] - 2022-12-16 From 79a4e966a12a52a05b53072cd98725310f3ef9bf Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 7 Feb 2023 19:10:24 +0300 Subject: [PATCH 4/5] Add test on not fitting in SegmentEncoderTransform --- etna/transforms/encoders/segment_encoder.py | 11 +++++++++-- .../test_encoders/test_segment_encoder_transform.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py index 5abf16f39..02fc2c3d0 100644 --- a/etna/transforms/encoders/segment_encoder.py +++ b/etna/transforms/encoders/segment_encoder.py @@ -48,10 +48,17 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: Raises ------ ValueError: - If there are segments that weren't present during training + If transform isn't fitted. + ValueError: + If there are segments that weren't present during training. """ segments = df.columns.get_level_values("segment").unique().tolist() - new_segments = set(segments) - set(self._le.classes_) + + try: + new_segments = set(segments) - set(self._le.classes_) + except AttributeError: + raise ValueError("The transform isn't fitted!") + if len(new_segments) > 0: raise ValueError( f"This transform can't process segments that weren't present on train data: {new_segments}" diff --git a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py index 7a8996063..5a599d70f 100644 --- a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py @@ -30,11 +30,20 @@ def test_subset_segments(dummy_df): transform.fit(train_df) transformed_test_df = transform.transform(test_df) - assert transformed_test_df.columns.get_level_values("segment").unique().tolist() == ["Omsk"] + segments = sorted(transformed_test_df.columns.get_level_values("segment").unique()) + features = sorted(transformed_test_df.columns.get_level_values("feature").unique()) + assert segments == ["Omsk"] + assert features == ["segment_code", "target"] values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]] assert np.all(values == values.iloc[0]) +def test_not_fitted_error(dummy_df): + encoder = SegmentEncoderTransform() + with pytest.raises(ValueError, match="The transform isn't fitted"): + encoder.transform(dummy_df) + + def test_new_segments_error(dummy_df): train_df = dummy_df.loc[:, pd.IndexSlice["Moscow", :]] test_df = dummy_df.loc[:, pd.IndexSlice["Omsk", :]] From 7b427044773909801fead07505916ad9ab8e13ed Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 8 Feb 2023 12:48:24 +0300 Subject: [PATCH 5/5] Fix PR comments --- etna/transforms/encoders/segment_encoder.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py index 02fc2c3d0..09b9fbe70 100644 --- a/etna/transforms/encoders/segment_encoder.py +++ b/etna/transforms/encoders/segment_encoder.py @@ -1,3 +1,6 @@ +import reprlib + +import numpy as np import pandas as pd from sklearn import preprocessing @@ -61,11 +64,11 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: if len(new_segments) > 0: raise ValueError( - f"This transform can't process segments that weren't present on train data: {new_segments}" + f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}" ) encoded_matrix = self._le.transform(segments) - encoded_matrix = encoded_matrix.reshape(len(segments), -1).repeat(len(df), axis=1).T + encoded_matrix = np.tile(encoded_matrix, (len(df), 1)) encoded_df = pd.DataFrame( encoded_matrix, columns=pd.MultiIndex.from_product([segments, ["segment_code"]], names=("segment", "feature")),