Skip to content

Speed up feature selection #1294

Merged
merged 13 commits into from
Jul 3, 2023
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add tuning stage into `Auto.fit` ([#1272](https://github.com/tinkoff-ai/etna/pull/1272))
- Add `params_to_tune` into `Tune` init ([#1282](https://github.com/tinkoff-ai/etna/pull/1282))
- Skip duplicates during `Tune.fit`, skip duplicates in `top_k`, add AutoML notebook ([#1285](https://github.com/tinkoff-ai/etna/pull/1285))
- Add parameter `fast_redundancy` in `mrmr`, fix relevance calculation in `get_model_relevance_table` ([#1294](https://github.com/tinkoff-ai/etna/pull/1294))

### Fixed
- Fix `plot_backtest` and `plot_backtest_interactive` on one-step forecast ([#1260](https://github.com/tinkoff-ai/etna/pull/1260))
- Fix `BaseReconciliator` to work on `pandas==1.1.5` ([#1229](https://github.com/tinkoff-ai/etna/pull/1229))
Expand Down
9 changes: 7 additions & 2 deletions etna/analysis/feature_relevance/relevance_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def _prepare_df(df: pd.DataFrame, df_exog: pd.DataFrame, segment: str, regressor

def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> pd.DataFrame:
"""Calculate relevance table with p-values from tsfresh.

Parameters
----------
df:
Expand All @@ -48,6 +47,10 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p
-------
pd.DataFrame
dataframe with p-values.

Notes
-----
Time complexity of this method is :math:`O(n\_segments * n\_features * history\_len)`
"""
regressors = sorted(df_exog.columns.get_level_values("feature").unique())
segments = sorted(df.columns.get_level_values("segment").unique())
Expand All @@ -64,7 +67,9 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p
"Exogenous data contains columns with category type! It will be converted to float. If this is not desired behavior, use encoders."
)

relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg)[["feature", "p_value"]].values
relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg, ml_task="regression")[
["feature", "p_value"]
].values
result[k] = np.array(sorted(relevance, key=lambda x: x[0]))[:, 1]
relevance_table = pd.DataFrame(result)
relevance_table.index = segments
Expand Down
38 changes: 31 additions & 7 deletions etna/analysis/feature_selection/mrmr_selection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from enum import Enum
from typing import List

Expand Down Expand Up @@ -26,6 +27,7 @@ def mrmr(
relevance_table: pd.DataFrame,
regressors: pd.DataFrame,
top_k: int,
fast_redundancy: bool = False,
relevance_aggregation_mode: str = AggregationMode.mean,
redundancy_aggregation_mode: str = AggregationMode.mean,
atol: float = 1e-10,
Expand All @@ -47,6 +49,9 @@ def mrmr(
dataframe with regressors in etna format
top_k:
num of regressors to select; if there are not enough regressors, then all will be selected
fast_redundancy:
* True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)`
* False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)`
relevance_aggregation_mode:
the method for relevance values per-segment aggregation
redundancy_aggregation_mode:
Expand All @@ -59,12 +64,18 @@ def mrmr(
selected_features: List[str]
list of ``top_k`` selected regressors, sorted by their importance
"""
if not fast_redundancy:
warnings.warn(
"Option `fast_redundancy=False` was added for backward compatibility and will be removed in etna 3.0.0.",
DeprecationWarning,
)
relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]

relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)

all_features = relevance.index.to_list()
segments = set(regressors.columns.get_level_values("segment"))
selected_features: List[str] = []
not_selected_features = all_features.copy()

Expand All @@ -76,16 +87,29 @@ def mrmr(
score_denominator = pd.Series(1, index=not_selected_features)
if i > 0:
last_selected_feature = selected_features[-1]
not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]]
last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]]
not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]]

if fast_redundancy:
segment_redundancy = pd.concat(
[
not_selected_regressors[segment].apply(
lambda col: last_selected_regressor[segment].corrwith(col) # noqa: B023
)
for segment in segments
]
).abs()
else:
segment_redundancy = (
not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col)) # noqa: B023
.abs()
.groupby("feature")
.apply(redundancy_aggregation_fn)
.T.groupby("feature")
)

redundancy_table.loc[not_selected_features, last_selected_feature] = (
not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col)) # noqa: B023
.abs()
.groupby("feature")
.apply(redundancy_aggregation_fn)
.T.groupby("feature")
.apply(redundancy_aggregation_fn)
segment_redundancy.apply(redundancy_aggregation_fn)
.clip(atol)
.fillna(np.inf)
.loc[not_selected_features]
Expand Down
6 changes: 6 additions & 0 deletions etna/transforms/feature_selection/feature_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def __init__(
relevance_table: RelevanceTable,
top_k: int,
features_to_use: Union[List[str], Literal["all"]] = "all",
fast_redundancy: bool = False,
relevance_aggregation_mode: str = AggregationMode.mean,
redundancy_aggregation_mode: str = AggregationMode.mean,
atol: float = 1e-10,
Expand All @@ -189,6 +190,9 @@ def __init__(
features_to_use:
columns of the dataset to select from
if "all" value is given, all columns are used
fast_redundancy:
* True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)`
* False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)`
relevance_aggregation_mode:
the method for relevance values per-segment aggregation
redundancy_aggregation_mode:
Expand All @@ -204,6 +208,7 @@ def __init__(
super().__init__(features_to_use=features_to_use, return_features=return_features)
self.relevance_table = relevance_table
self.top_k = top_k
self.fast_redundancy = fast_redundancy
self.relevance_aggregation_mode = relevance_aggregation_mode
self.redundancy_aggregation_mode = redundancy_aggregation_mode
self.atol = atol
Expand Down Expand Up @@ -232,6 +237,7 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
relevance_table=relevance_table,
regressors=ts[:, :, features],
top_k=self.top_k,
fast_redundancy=self.fast_redundancy,
relevance_aggregation_mode=self.relevance_aggregation_mode,
redundancy_aggregation_mode=self.redundancy_aggregation_mode,
atol=self.atol,
Expand Down
41 changes: 33 additions & 8 deletions tests/test_analysis/test_feature_selection/test_mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,29 +58,39 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
}


@pytest.mark.parametrize("fast_redundancy", [True, False])
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize(
"relevance_method, expected_regressors",
[(ModelRelevanceTable(), ["regressor_useful_0", "regressor_useful_1", "regressor_useful_2"])],
)
def test_mrmr_right_regressors(df_with_regressors, relevance_method, expected_regressors):
def test_mrmr_right_regressors(df_with_regressors, relevance_method, expected_regressors, fast_redundancy):
relevance_table = relevance_method(
df=df_with_regressors["target"], df_exog=df_with_regressors["regressors"], model=RandomForestRegressor()
)
selected_regressors = mrmr(relevance_table=relevance_table, regressors=df_with_regressors["regressors"], top_k=3)
selected_regressors = mrmr(
relevance_table=relevance_table,
regressors=df_with_regressors["regressors"],
top_k=3,
fast_redundancy=fast_redundancy,
)
assert set(selected_regressors) == set(expected_regressors)


def test_mrmr_not_depend_on_columns_order(df_with_regressors):
@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_not_depend_on_columns_order(df_with_regressors, fast_redundancy):
df, regressors = df_with_regressors["df"], df_with_regressors["regressors"]
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
expected_answer = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=5)
expected_answer = mrmr(
relevance_table=relevance_table, regressors=regressors, top_k=5, fast_redundancy=fast_redundancy
)
columns = list(regressors.columns.get_level_values("feature").unique())
for i in range(10):
np.random.shuffle(columns)
answer = mrmr(
relevance_table=relevance_table[columns],
regressors=regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, columns]],
top_k=5,
fast_redundancy=fast_redundancy,
)
assert answer == expected_answer

Expand Down Expand Up @@ -131,21 +141,36 @@ def high_relevance_high_redundancy_problem_diff_starts(periods=10):
}


def test_mrmr_select_less_redundant_regressor(high_relevance_high_redundancy_problem):
@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_select_less_redundant_regressor(high_relevance_high_redundancy_problem, fast_redundancy):
"""Check that transform selects the less redundant regressor out of regressors with same relevance."""
relevance_table, regressors = (
high_relevance_high_redundancy_problem["relevance_table"],
high_relevance_high_redundancy_problem["regressors"],
)
selected_regressors = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2)
selected_regressors = mrmr(
relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=fast_redundancy
)
assert set(selected_regressors) == set(high_relevance_high_redundancy_problem["expected_answer"])


def test_mrmr_select_less_redundant_regressor_diff_start(high_relevance_high_redundancy_problem_diff_starts):
@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_select_less_redundant_regressor_diff_start(
high_relevance_high_redundancy_problem_diff_starts, fast_redundancy
):
"""Check that transform selects the less redundant regressor out of regressors with same relevance."""
relevance_table, regressors = (
high_relevance_high_redundancy_problem_diff_starts["relevance_table"],
high_relevance_high_redundancy_problem_diff_starts["regressors"],
)
selected_regressors = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2)
selected_regressors = mrmr(
relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=fast_redundancy
)
assert set(selected_regressors) == set(high_relevance_high_redundancy_problem_diff_starts["expected_answer"])


def test_fast_redundancy_deprecation_warning(df_with_regressors):
    """Check that ``mrmr`` emits a ``DeprecationWarning`` when called with ``fast_redundancy=False``."""
    target_df = df_with_regressors["df"]
    exog_df = df_with_regressors["regressors"]
    table = ModelRelevanceTable()(df=target_df, df_exog=exog_df, model=RandomForestRegressor())
    # ``match`` is applied as a regex search, so a prefix of the warning text is sufficient.
    expected_message = "Option `fast_redundancy=False` was added for backward compatibility"
    with pytest.warns(DeprecationWarning, match=expected_message):
        mrmr(relevance_table=table, regressors=exog_df, top_k=2, fast_redundancy=False)
Original file line number Diff line number Diff line change
Expand Up @@ -279,13 +279,14 @@ def test_fit_transform_with_nans(model, ts_diff_endings):
selector.fit_transform(ts_diff_endings)


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([StatisticsRelevanceTable()]))
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors):
def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors, fast_redundancy):
"""Check that transform selects exactly top_k regressors."""
all_regressors = ts_with_regressors.regressors
ts = ts_with_regressors
mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=top_k)
mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=top_k, fast_redundancy=fast_redundancy)
df_selected = mrmr.fit_transform(ts).to_pandas()

selected_regressors = set()
Expand All @@ -296,11 +297,14 @@ def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors):
assert len(selected_regressors) == min(len(all_regressors), top_k)


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()]))
def test_mrmr_right_regressors(relevance_table, ts_with_regressors):
def test_mrmr_right_regressors(relevance_table, ts_with_regressors, fast_redundancy):
"""Check that transform selects right top_k regressors."""
ts = ts_with_regressors
mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=3, model=RandomForestRegressor())
mrmr = MRMRFeatureSelectionTransform(
relevance_table=relevance_table, top_k=3, model=RandomForestRegressor(), fast_redundancy=fast_redundancy
)
df_selected = mrmr.fit_transform(ts).to_pandas()
selected_regressors = set()
for column in df_selected.columns.get_level_values("feature"):
Expand All @@ -316,7 +320,8 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors):
MRMRFeatureSelectionTransform(
relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42)
),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=True),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=False),
],
)
def test_save_load(transform, ts_with_regressors):
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -330,7 +335,8 @@ def test_save_load(transform, ts_with_regressors):
MRMRFeatureSelectionTransform(
relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42)
),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=True),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=False),
],
)
def test_params_to_tune(transform, ts_with_regressors):
Expand Down
Loading