From f885cc3d8b7de9acf60ef4593f90061be4e468e4 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Fri, 12 Nov 2021 18:26:55 +0300 Subject: [PATCH 1/9] Add FilterFeaturesTransform --- etna/transforms/__init__.py | 1 + etna/transforms/filter.py | 73 ++++++++++++++++ .../test_transforms/test_filter_transform.py | 84 +++++++++++++++++++ 3 files changed, 158 insertions(+) create mode 100644 etna/transforms/filter.py create mode 100644 tests/test_transforms/test_filter_transform.py diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py index b5fb24a79..bc8fd262c 100644 --- a/etna/transforms/__init__.py +++ b/etna/transforms/__init__.py @@ -8,6 +8,7 @@ from etna.transforms.detrend import TheilSenTrendTransform from etna.transforms.feature_importance import MRMRFeatureSelectionTransform from etna.transforms.feature_importance import TreeFeatureSelectionTransform +from etna.transforms.filter import FilterFeaturesTransform from etna.transforms.imputation import TimeSeriesImputerTransform from etna.transforms.lags import LagTransform from etna.transforms.log import LogTransform diff --git a/etna/transforms/filter.py b/etna/transforms/filter.py new file mode 100644 index 000000000..375e88160 --- /dev/null +++ b/etna/transforms/filter.py @@ -0,0 +1,73 @@ +from typing import Optional +from typing import Sequence + +import pandas as pd + +from etna.transforms.base import Transform + + +class FilterFeaturesTransform(Transform): + """Filters features in each segment of the dataframe.""" + + def __init__(self, include: Optional[Sequence[str]] = None, exclude: Optional[Sequence[str]] = None): + """Create instance of LagTransform. 
+ + Parameters + ---------- + include: + list of columns to pass through filter + exclude: + list of columns to not pass through + + Raises + ------ + ValueError: + if both options are set or none of them + """ + if include is not None and exclude is None: + self.include = list(set(include)) + self.exclude = None + elif exclude is not None and include is None: + self.include = None + self.exclude = list(set(exclude)) + else: + raise ValueError("There should be exactly one option set: include or exclude") + + def fit(self, df: pd.DataFrame) -> "FilterFeaturesTransform": + """Fit method does nothing and is kept for compatibility. + + Parameters + ---------- + df: + dataframe with data to fit label encoder. + + Returns + ------- + self + """ + return self + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Filter features according to include/exclude parameters. + + Parameters + ---------- + df: + dataframe with data to transform. + + Returns + ------- + result: pd.DataFrame + transformed dataframe + """ + result = df.copy() + segments = sorted(set(df.columns.get_level_values("segment"))) + if self.include is not None: + if not set(self.include).issubset(df.columns.get_level_values("feature")): + raise ValueError("Some features in include are not present in the dataset") + result = result.loc[:, (segments, self.include)] + if self.exclude is not None and self.exclude: + if not set(self.exclude).issubset(df.columns.get_level_values("feature")): + raise ValueError("Some features in exclude are not present in the dataset") + result = result.drop(columns=self.exclude, level="feature") + return result diff --git a/tests/test_transforms/test_filter_transform.py b/tests/test_transforms/test_filter_transform.py new file mode 100644 index 000000000..1128a0885 --- /dev/null +++ b/tests/test_transforms/test_filter_transform.py @@ -0,0 +1,84 @@ +import pandas as pd +import pytest + +from etna.datasets import TSDataset +from etna.transforms import FilterFeaturesTransform +
+ +@pytest.fixture +def ts_with_features() -> TSDataset: + timestamp = pd.date_range("2020-01-01", periods=100, freq="D") + df_1 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_1", "target": 1}) + df_2 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_2", "target": 2}) + df = TSDataset.to_dataset(pd.concat([df_1, df_2], ignore_index=False)) + + df_exog_1 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_1", "exog_1": 1, "exog_2": 2}) + df_exog_2 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_2", "exog_1": 3, "exog_2": 4}) + df_exog = TSDataset.to_dataset(pd.concat([df_exog_1, df_exog_2], ignore_index=False)) + + return TSDataset(df=df, df_exog=df_exog, freq="D") + + +def test_set_only_include(): + """Test that transform is created with include.""" + _ = FilterFeaturesTransform(include=["exog_1", "exog_2"]) + + +def test_set_only_exclude(): + """Test that transform is created with exclude.""" + _ = FilterFeaturesTransform(exclude=["exog_1", "exog_2"]) + + +def test_set_include_and_exclude(): + """Test that transform is not created with include and exclude.""" + with pytest.raises(ValueError, match="There should be exactly one option set: include or exclude"): + _ = FilterFeaturesTransform() + + +def test_set_none(): + """Test that transform is not created without include and exclude.""" + with pytest.raises(ValueError, match="There should be exactly one option set: include or exclude"): + _ = FilterFeaturesTransform() + + +@pytest.mark.parametrize("include", [[], ["target"], ["exog_1"], ["exog_1", "exog_2", "target"]]) +def test_include_filter(ts_with_features, include): + """Test that transform remains only features in include.""" + transform = FilterFeaturesTransform(include=include) + ts_with_features.fit_transform([transform]) + df_transformed = ts_with_features.df + expected_columns = set(df_transformed.columns.get_level_values("feature")) + got_columns = set(df_transformed.columns.get_level_values("feature")) + 
assert got_columns == expected_columns + + +@pytest.mark.parametrize( + "exclude, expected_columns", + [ + ([], ["target", "exog_1", "exog_2"]), + (["target"], ["exog_1", "exog_2"]), + (["exog_1", "exog_2"], ["target"]), + (["target", "exog_1", "exog_2"], []), + ], +) +def test_exclude_filter(ts_with_features, exclude, expected_columns): + """Test that transform removes only features in exclude.""" + transform = FilterFeaturesTransform(exclude=exclude) + ts_with_features.fit_transform([transform]) + df_transformed = ts_with_features.df + got_columns = set(df_transformed.columns.get_level_values("feature")) + assert got_columns == set(expected_columns) + + +def test_include_filter_wrong_column(ts_with_features): + """Test that transform raises error with non-existent column in include.""" + transform = FilterFeaturesTransform(include=["non-existent-column"]) + with pytest.raises(ValueError, match="Some features in include are not present in the dataset"): + ts_with_features.fit_transform([transform]) + + +def test_exclude_filter_wrong_column(ts_with_features): + """Test that transform raises error with non-existent column in exclude.""" + transform = FilterFeaturesTransform(exclude=["non-existent-column"]) + with pytest.raises(ValueError, match="Some features in exclude are not present in the dataset"): + ts_with_features.fit_transform([transform]) From 83be6f0ee3d3c81a0605f3420e9e92bf8d71cbd2 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Fri, 12 Nov 2021 18:29:45 +0300 Subject: [PATCH 2/9] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cfb12791..cbe40823b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - omegaconf config parser in cli ([#258](https://github.com/tinkoff-ai/etna-ts/pull/258)) - Feature relevance table calculation using feature importance 
([#261](https://github.com/tinkoff-ai/etna-ts/pull/261)) - MeanSegmentEncoderTransform ([#265](https://github.com/tinkoff-ai/etna-ts/pull/265)) +- FilterFeaturesTransform ([#277](https://github.com/tinkoff-ai/etna-ts/pull/277)) ### Changed - Add possibility to set custom in_column for ConfidenceIntervalOutliersTransform ([#240](https://github.com/tinkoff-ai/etna-ts/pull/240)) From 4846f4c6cc48a9dcffe30945807537a0610d8185 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Fri, 12 Nov 2021 18:33:50 +0300 Subject: [PATCH 3/9] Refactor transform method --- etna/transforms/filter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/etna/transforms/filter.py b/etna/transforms/filter.py index 375e88160..070058f82 100644 --- a/etna/transforms/filter.py +++ b/etna/transforms/filter.py @@ -61,13 +61,14 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: transformed dataframe """ result = df.copy() - segments = sorted(set(df.columns.get_level_values("segment"))) + features = df.columns.get_level_values("feature") if self.include is not None: - if not set(self.include).issubset(df.columns.get_level_values("feature")): + if not set(self.include).issubset(features): raise ValueError("Some features in include are not present in the dataset") + segments = sorted(set(df.columns.get_level_values("segment"))) result = result.loc[:, (segments, self.include)] if self.exclude is not None and self.exclude: - if not set(self.exclude).issubset(df.columns.get_level_values("feature")): + if not set(self.exclude).issubset(features): raise ValueError("Some features in exclude are not present in the dataset") result = result.drop(columns=self.exclude, level="feature") return result From 41a9be86c45b6237e5e771c76cc44f8b0d83373d Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 15 Nov 2021 10:21:28 +0300 Subject: [PATCH 4/9] Fix tests, add checking on values in columns --- etna/transforms/filter.py | 2 +- tests/test_transforms/test_filter_transform.py | 15 
++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/etna/transforms/filter.py b/etna/transforms/filter.py index 070058f82..bcf6c55cc 100644 --- a/etna/transforms/filter.py +++ b/etna/transforms/filter.py @@ -66,7 +66,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: if not set(self.include).issubset(features): raise ValueError("Some features in include are not present in the dataset") segments = sorted(set(df.columns.get_level_values("segment"))) - result = result.loc[:, (segments, self.include)] + result = result.loc[:, pd.IndexSlice[segments, self.include]] if self.exclude is not None and self.exclude: if not set(self.exclude).issubset(features): raise ValueError("Some features in exclude are not present in the dataset") diff --git a/tests/test_transforms/test_filter_transform.py b/tests/test_transforms/test_filter_transform.py index 1128a0885..f624c0888 100644 --- a/tests/test_transforms/test_filter_transform.py +++ b/tests/test_transforms/test_filter_transform.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest @@ -47,9 +48,15 @@ def test_include_filter(ts_with_features, include): transform = FilterFeaturesTransform(include=include) ts_with_features.fit_transform([transform]) df_transformed = ts_with_features.df - expected_columns = set(df_transformed.columns.get_level_values("feature")) + expected_columns = set(ts_with_features.columns.get_level_values("feature")) got_columns = set(df_transformed.columns.get_level_values("feature")) assert got_columns == expected_columns + segments = ts_with_features.segments + for column in got_columns: + assert np.all( + df_transformed.loc[:, pd.IndexSlice[segments, column]] + == ts_with_features.df.loc[:, pd.IndexSlice[segments, column]] + ) @pytest.mark.parametrize( @@ -68,6 +75,12 @@ def test_exclude_filter(ts_with_features, exclude, expected_columns): df_transformed = ts_with_features.df got_columns = set(df_transformed.columns.get_level_values("feature")) assert 
got_columns == set(expected_columns) + segments = ts_with_features.segments + for column in got_columns: + assert np.all( + df_transformed.loc[:, pd.IndexSlice[segments, column]] + == ts_with_features.df.loc[:, pd.IndexSlice[segments, column]] + ) def test_include_filter_wrong_column(ts_with_features): From 896be953198e9e7b7dfa285b84b7746e449227e3 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 15 Nov 2021 17:43:19 +0300 Subject: [PATCH 5/9] Fix bugs, change error message --- etna/transforms/filter.py | 4 ++-- tests/test_transforms/test_filter_transform.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/etna/transforms/filter.py b/etna/transforms/filter.py index bcf6c55cc..e06cc813a 100644 --- a/etna/transforms/filter.py +++ b/etna/transforms/filter.py @@ -64,11 +64,11 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: features = df.columns.get_level_values("feature") if self.include is not None: if not set(self.include).issubset(features): - raise ValueError("Some features in include are not present in the dataset") + raise ValueError(f"Features {set(self.include) - set(features)} are not present in the dataset.") segments = sorted(set(df.columns.get_level_values("segment"))) result = result.loc[:, pd.IndexSlice[segments, self.include]] if self.exclude is not None and self.exclude: if not set(self.exclude).issubset(features): - raise ValueError("Some features in exclude are not present in the dataset") + raise ValueError(f"Features {set(self.exclude) - set(features)} are not present in the dataset.") result = result.drop(columns=self.exclude, level="feature") return result diff --git a/tests/test_transforms/test_filter_transform.py b/tests/test_transforms/test_filter_transform.py index f624c0888..786c24fd7 100644 --- a/tests/test_transforms/test_filter_transform.py +++ b/tests/test_transforms/test_filter_transform.py @@ -33,7 +33,7 @@ def test_set_only_exclude(): def test_set_include_and_exclude(): """Test that 
transform is not created with include and exclude.""" with pytest.raises(ValueError, match="There should be exactly one option set: include or exclude"): - _ = FilterFeaturesTransform() + _ = FilterFeaturesTransform(include=["exog_1"], exclude=["exog_2"]) def test_set_none(): @@ -45,17 +45,18 @@ def test_set_none(): @pytest.mark.parametrize("include", [[], ["target"], ["exog_1"], ["exog_1", "exog_2", "target"]]) def test_include_filter(ts_with_features, include): """Test that transform remains only features in include.""" + original_df = ts_with_features.to_pandas() transform = FilterFeaturesTransform(include=include) ts_with_features.fit_transform([transform]) - df_transformed = ts_with_features.df - expected_columns = set(ts_with_features.columns.get_level_values("feature")) + df_transformed = ts_with_features.to_pandas() + expected_columns = set(include) got_columns = set(df_transformed.columns.get_level_values("feature")) assert got_columns == expected_columns segments = ts_with_features.segments for column in got_columns: assert np.all( df_transformed.loc[:, pd.IndexSlice[segments, column]] - == ts_with_features.df.loc[:, pd.IndexSlice[segments, column]] + == original_df.loc[:, pd.IndexSlice[segments, column]] ) @@ -70,28 +71,29 @@ def test_include_filter(ts_with_features, include): ) def test_exclude_filter(ts_with_features, exclude, expected_columns): """Test that transform removes only features in exclude.""" + original_df = ts_with_features.to_pandas() transform = FilterFeaturesTransform(exclude=exclude) ts_with_features.fit_transform([transform]) - df_transformed = ts_with_features.df + df_transformed = ts_with_features.to_pandas() got_columns = set(df_transformed.columns.get_level_values("feature")) assert got_columns == set(expected_columns) segments = ts_with_features.segments for column in got_columns: assert np.all( df_transformed.loc[:, pd.IndexSlice[segments, column]] - == ts_with_features.df.loc[:, pd.IndexSlice[segments, column]] + == 
original_df.loc[:, pd.IndexSlice[segments, column]] ) def test_include_filter_wrong_column(ts_with_features): """Test that transform raises error with non-existent column in include.""" transform = FilterFeaturesTransform(include=["non-existent-column"]) - with pytest.raises(ValueError, match="Some features in include are not present in the dataset"): + with pytest.raises(ValueError, match="Features {.*} are not present in the dataset"): ts_with_features.fit_transform([transform]) def test_exclude_filter_wrong_column(ts_with_features): """Test that transform raises error with non-existent column in exclude.""" transform = FilterFeaturesTransform(exclude=["non-existent-column"]) - with pytest.raises(ValueError, match="Some features in exclude are not present in the dataset"): + with pytest.raises(ValueError, match="Features {.*} are not present in the dataset"): ts_with_features.fit_transform([transform]) From eb021bccce6ae7037fb80896b01befe84ac5ccd9 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 16 Nov 2021 11:17:36 +0300 Subject: [PATCH 6/9] Fix docstring mistakes --- etna/transforms/filter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/etna/transforms/filter.py b/etna/transforms/filter.py index e06cc813a..c72f5cb7a 100644 --- a/etna/transforms/filter.py +++ b/etna/transforms/filter.py @@ -10,7 +10,7 @@ class FilterFeaturesTransform(Transform): """Filters features in each segment of the dataframe.""" def __init__(self, include: Optional[Sequence[str]] = None, exclude: Optional[Sequence[str]] = None): - """Create instance of LagTransform. + """Create instance of FilterFeaturesTransform. Parameters ---------- @@ -39,11 +39,11 @@ def fit(self, df: pd.DataFrame) -> "FilterFeaturesTransform": Parameters ---------- df: - dataframe with data to fit label encoder. + dataframe with data. 
Returns ------- - self + result: FilterFeaturesTransform """ return self From 020db7635dd3ef9a642a36d3e6d7a87c6c3c8c41 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 16 Nov 2021 16:36:31 +0300 Subject: [PATCH 7/9] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 485cc4c96..d78e00b7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - RelevanceTable returns rank ([#268](https://github.com/tinkoff-ai/etna-ts/pull/268/)) +- FilterFeaturesTransform ([#277](https://github.com/tinkoff-ai/etna-ts/pull/277)) ### Changed @@ -27,7 +28,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - omegaconf config parser in cli ([#258](https://github.com/tinkoff-ai/etna-ts/pull/258)) - Feature relevance table calculation using feature importance ([#261](https://github.com/tinkoff-ai/etna-ts/pull/261)) - MeanSegmentEncoderTransform ([#265](https://github.com/tinkoff-ai/etna-ts/pull/265)) -- FilterFeaturesTransform ([#277](https://github.com/tinkoff-ai/etna-ts/pull/277)) ### Changed - Add possibility to set custom in_column for ConfidenceIntervalOutliersTransform ([#240](https://github.com/tinkoff-ai/etna-ts/pull/240)) From c75c23b88f68f7148c73c8c8986332861ca94962 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 17 Nov 2021 12:56:11 +0300 Subject: [PATCH 8/9] Update etna-ts -> etna in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e169661c..82a3b3eac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - FilterFeaturesTransform ([#277](https://github.com/tinkoff-ai/etna/pull/277)) ### Changed -- Rename confidence interval to prediction interval, start working with quantiles 
instead of interval_width ([#285](https://github.com/tinkoff-ai/etna-ts/pull/285)) +- Rename confidence interval to prediction interval, start working with quantiles instead of interval_width ([#285](https://github.com/tinkoff-ai/etna/pull/285)) ### Fixed From b03b456b7d37083911a386d346b4bb6c064d8c01 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 17 Nov 2021 16:22:26 +0300 Subject: [PATCH 9/9] Reformat code --- etna/transforms/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py index 9d898cc13..a480c26e9 100644 --- a/etna/transforms/__init__.py +++ b/etna/transforms/__init__.py @@ -7,8 +7,8 @@ from etna.transforms.detrend import LinearTrendTransform from etna.transforms.detrend import TheilSenTrendTransform from etna.transforms.feature_importance import TreeFeatureSelectionTransform -from etna.transforms.gale_shapley import GaleShapleyFeatureSelectionTransform from etna.transforms.filter import FilterFeaturesTransform +from etna.transforms.gale_shapley import GaleShapleyFeatureSelectionTransform from etna.transforms.imputation import TimeSeriesImputerTransform from etna.transforms.lags import LagTransform from etna.transforms.log import LogTransform