Skip to content

Add FilterFeaturesTransform #277

Merged
merged 12 commits into from
Nov 17, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- omegaconf config parser in cli ([#258](https://github.com/tinkoff-ai/etna-ts/pull/258))
- Feature relevance table calculation using feature importance ([#261](https://github.com/tinkoff-ai/etna-ts/pull/261))
- MeanSegmentEncoderTransform ([#265](https://github.com/tinkoff-ai/etna-ts/pull/265))
- FilterFeaturesTransform ([#277](https://github.com/tinkoff-ai/etna-ts/pull/277))

### Changed
- Add possibility to set custom in_column for ConfidenceIntervalOutliersTransform ([#240](https://github.com/tinkoff-ai/etna-ts/pull/240))
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from etna.transforms.detrend import TheilSenTrendTransform
from etna.transforms.feature_importance import MRMRFeatureSelectionTransform
from etna.transforms.feature_importance import TreeFeatureSelectionTransform
from etna.transforms.filter import FilterFeaturesTransform
from etna.transforms.imputation import TimeSeriesImputerTransform
from etna.transforms.lags import LagTransform
from etna.transforms.log import LogTransform
Expand Down
74 changes: 74 additions & 0 deletions etna/transforms/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Optional
from typing import Sequence

import pandas as pd

from etna.transforms.base import Transform


class FilterFeaturesTransform(Transform):
"""Filters features in each segment of the dataframe."""

def __init__(self, include: Optional[Sequence[str]] = None, exclude: Optional[Sequence[str]] = None):
"""Create instance of FilterFeaturesTransform.

Parameters
----------
include:
list of columns to pass through filter
exclude:
list of columns to not pass through

Raises
------
ValueError:
if both options set or non of them
"""
if include is not None and exclude is None:
self.include = list(set(include))
self.exclude = None
elif exclude is not None and include is None:
self.include = None
self.exclude = list(set(exclude))
else:
raise ValueError("There should be exactly one option set: include or exclude")

def fit(self, df: pd.DataFrame) -> "FilterFeaturesTransform":
"""Fit method does nothing and is kept for compatibility.

Parameters
----------
df:
dataframe with data.

Returns
-------
result: FilterFeaturesTransform
"""
return self

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""Filter features according to include/exclude parameters.

Parameters
----------
df:
dataframe with data to transform.

Returns
-------
result: pd.Dataframe
transformed dataframe
"""
result = df.copy()
features = df.columns.get_level_values("feature")
if self.include is not None:
if not set(self.include).issubset(features):
raise ValueError(f"Features {set(self.include) - set(features)} are not present in the dataset.")
segments = sorted(set(df.columns.get_level_values("segment")))
result = result.loc[:, pd.IndexSlice[segments, self.include]]
if self.exclude is not None and self.exclude:
julia-shenshina marked this conversation as resolved.
Show resolved Hide resolved
if not set(self.exclude).issubset(features):
raise ValueError(f"Features {set(self.exclude) - set(features)} are not present in the dataset.")
result = result.drop(columns=self.exclude, level="feature")
return result
99 changes: 99 additions & 0 deletions tests/test_transforms/test_filter_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.transforms import FilterFeaturesTransform


@pytest.fixture
def ts_with_features() -> TSDataset:
timestamp = pd.date_range("2020-01-01", periods=100, freq="D")
df_1 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_1", "target": 1})
df_2 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_2", "target": 2})
df = TSDataset.to_dataset(pd.concat([df_1, df_2], ignore_index=False))

df_exog_1 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_1", "exog_1": 1, "exog_2": 2})
df_exog_2 = pd.DataFrame({"timestamp": timestamp, "segment": "segment_2", "exog_1": 3, "exog_2": 4})
df_exog = TSDataset.to_dataset(pd.concat([df_exog_1, df_exog_2], ignore_index=False))

return TSDataset(df=df, df_exog=df_exog, freq="D")


def test_set_only_include():
"""Test that transform is created with include."""
_ = FilterFeaturesTransform(include=["exog_1", "exog_2"])


def test_set_only_exclude():
"""Test that transform is created with exclude."""
_ = FilterFeaturesTransform(exclude=["exog_1", "exog_2"])


def test_set_include_and_exclude():
"""Test that transform is not created with include and exclude."""
with pytest.raises(ValueError, match="There should be exactly one option set: include or exclude"):
_ = FilterFeaturesTransform(include=["exog_1"], exclude=["exog_2"])


def test_set_none():
"""Test that transform is not created without include and exclude."""
with pytest.raises(ValueError, match="There should be exactly one option set: include or exclude"):
_ = FilterFeaturesTransform()


@pytest.mark.parametrize("include", [[], ["target"], ["exog_1"], ["exog_1", "exog_2", "target"]])
def test_include_filter(ts_with_features, include):
"""Test that transform remains only features in include."""
original_df = ts_with_features.to_pandas()
transform = FilterFeaturesTransform(include=include)
ts_with_features.fit_transform([transform])
df_transformed = ts_with_features.to_pandas()
expected_columns = set(include)
got_columns = set(df_transformed.columns.get_level_values("feature"))
assert got_columns == expected_columns
segments = ts_with_features.segments
for column in got_columns:
assert np.all(
df_transformed.loc[:, pd.IndexSlice[segments, column]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

df_transformed = ts_with_features.df , why you compare it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was a bug earlier when values between columns mixed with each other, I wanted to unsure, it is not present here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was a mistake, it will be fixed

== original_df.loc[:, pd.IndexSlice[segments, column]]
)


@pytest.mark.parametrize(
"exclude, expected_columns",
[
([], ["target", "exog_1", "exog_2"]),
(["target"], ["exog_1", "exog_2"]),
(["exog_1", "exog_2"], ["target"]),
(["target", "exog_1", "exog_2"], []),
],
)
def test_exclude_filter(ts_with_features, exclude, expected_columns):
"""Test that transform removes only features in exclude."""
original_df = ts_with_features.to_pandas()
transform = FilterFeaturesTransform(exclude=exclude)
ts_with_features.fit_transform([transform])
df_transformed = ts_with_features.to_pandas()
got_columns = set(df_transformed.columns.get_level_values("feature"))
assert got_columns == set(expected_columns)
segments = ts_with_features.segments
for column in got_columns:
assert np.all(
df_transformed.loc[:, pd.IndexSlice[segments, column]]
== original_df.loc[:, pd.IndexSlice[segments, column]]
)


def test_include_filter_wrong_column(ts_with_features):
"""Test that transform raises error with non-existent column in include."""
transform = FilterFeaturesTransform(include=["non-existent-column"])
with pytest.raises(ValueError, match="Features {.*} are not present in the dataset"):
ts_with_features.fit_transform([transform])


def test_exclude_filter_wrong_column(ts_with_features):
"""Test that transform raises error with non-existent column in exclude."""
transform = FilterFeaturesTransform(exclude=["non-existent-column"])
with pytest.raises(ValueError, match="Features {.*} are not present in the dataset"):
ts_with_features.fit_transform([transform])