Skip to content

Add get_model_relevance_table #261

Merged
merged 6 commits into from
Nov 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Backtest cli ([#223](https://github.com/tinkoff-ai/etna-ts/pull/223), [#259](https://github.com/tinkoff-ai/etna-ts/pull/259))
- TreeFeatureSelectionTransform ([#229](https://github.com/tinkoff-ai/etna-ts/pull/229))
- Feature relevance table calculation ([#227](https://github.com/tinkoff-ai/etna-ts/pull/227), [#249](https://github.com/tinkoff-ai/etna-ts/pull/249))
- Feature relevance table calculation using tsfresh ([#227](https://github.com/tinkoff-ai/etna-ts/pull/227), [#249](https://github.com/tinkoff-ai/etna-ts/pull/249))
- Method flatten to TSDataset ([#241](https://github.com/tinkoff-ai/etna-ts/pull/241))
- Out_column parameter to not inplace transforms ([#211](https://github.com/tinkoff-ai/etna-ts/pull/211))
- omegaconf config parser in cli ([#258](https://github.com/tinkoff-ai/etna-ts/pull/258))
- Feature relevance table calculation using feature importance ([#261](https://github.com/tinkoff-ai/etna-ts/pull/261))

### Changed
- Add possibility to set custom in_column for ConfidenceIntervalOutliersTransform ([#240](https://github.com/tinkoff-ai/etna-ts/pull/240))
Expand Down
2 changes: 2 additions & 0 deletions etna/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from etna.analysis.eda_utils import cross_corr_plot
from etna.analysis.eda_utils import distribution_plot
from etna.analysis.eda_utils import sample_pacf_plot
from etna.analysis.feature_relevance.relevance import ModelRelevanceTable
from etna.analysis.feature_relevance.relevance import RelevanceTable
from etna.analysis.feature_relevance.relevance import StatisticsRelevanceTable
from etna.analysis.feature_relevance.relevance_table import get_model_relevance_table
from etna.analysis.feature_relevance.relevance_table import get_statistics_relevance_table
from etna.analysis.outliers.confidence_interval_outliers import get_anomalies_confidence_interval
from etna.analysis.outliers.density_outliers import get_anomalies_density
Expand Down
2 changes: 2 additions & 0 deletions etna/analysis/feature_relevance/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from etna.analysis.feature_relevance.relevance import ModelRelevanceTable
from etna.analysis.feature_relevance.relevance import RelevanceTable
from etna.analysis.feature_relevance.relevance import StatisticsRelevanceTable
from etna.analysis.feature_relevance.relevance_table import get_model_relevance_table
from etna.analysis.feature_relevance.relevance_table import get_statistics_relevance_table
13 changes: 13 additions & 0 deletions etna/analysis/feature_relevance/relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pandas as pd

from etna.analysis.feature_relevance.relevance_table import get_model_relevance_table
from etna.analysis.feature_relevance.relevance_table import get_statistics_relevance_table
from etna.core.mixins import BaseMixin

Expand Down Expand Up @@ -50,3 +51,15 @@ def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.Data
"""Compute feature relevance table with etna.analysis.get_statistics_relevance_table method."""
table = get_statistics_relevance_table(df=df, df_exog=df_exog)
return table


class ModelRelevanceTable(RelevanceTable):
    """Relevance table whose values are feature importances taken from a fitted model."""

    def __init__(self):
        # The base class is told that larger table values are the better ones.
        super().__init__(greater_is_better=True)

    def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Delegate to get_model_relevance_table, forwarding any extra keyword arguments.

        Parameters
        ----------
        df:
            dataframe with timeseries
        df_exog:
            dataframe with exogenous data
        **kwargs:
            passed through unchanged to get_model_relevance_table

        Returns
        -------
        pd.DataFrame
            table returned by get_model_relevance_table.
        """
        return get_model_relevance_table(df=df, df_exog=df_exog, **kwargs)
52 changes: 51 additions & 1 deletion etna/analysis/feature_relevance/relevance_table.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,25 @@
from typing import Union

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

from etna.libs.tsfresh import calculate_relevance_table

# Type alias for the regressors accepted as ``model`` by get_model_relevance_table:
# the scikit-learn tree and tree-ensemble regressors plus CatBoostRegressor.
# get_model_relevance_table fits the model and reads its ``feature_importances_``.
TreeBasedRegressor = Union[
    DecisionTreeRegressor,
    ExtraTreeRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    CatBoostRegressor,
]


def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> pd.DataFrame:
"""Calculate relevance table with p-values from tsfresh.
Expand All @@ -16,7 +33,8 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p

Returns
-------
dataframe with p-values.
pd.DataFrame
dataframe with p-values.
"""
regressors = sorted(df_exog.columns.get_level_values("feature").unique())
segments = sorted(df.columns.get_level_values("segment").unique())
Expand All @@ -31,3 +49,35 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p
relevance_table.index = segments
relevance_table.columns = regressors
return relevance_table


def get_model_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame, model: TreeBasedRegressor) -> pd.DataFrame:
    """Build a segment-by-feature table of feature importances obtained from model.

    For each segment the model is refitted on that segment's exogenous features
    against its ``target`` column, restricted to timestamps present in both after
    dropping rows with missing values; the fitted ``feature_importances_`` become
    the segment's row.

    Parameters
    ----------
    df:
        dataframe with timeseries
    df_exog:
        dataframe with exogenous data
    model:
        model to obtain feature importance, should have feature_importances_ property

    Returns
    -------
    pd.DataFrame
        dataframe with feature importance values, indexed by segment with one column per feature.
    """
    feature_names = sorted(df_exog.columns.get_level_values("feature").unique())
    segment_names = sorted(df.columns.get_level_values("segment").unique())
    importances = np.empty((len(segment_names), len(feature_names)))
    for row, segment in enumerate(segment_names):
        features = df_exog.loc[:, segment].dropna()[feature_names]
        target = df.loc[:, segment].dropna()["target"]
        # Fit only on timestamps that survived dropna in both frames.
        shared_index = target.index.intersection(features.index)
        model.fit(features.loc[shared_index], target.loc[shared_index])
        importances[row] = model.feature_importances_
    return pd.DataFrame(importances, index=segment_names, columns=feature_names)
1 change: 0 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,6 @@ def big_example_tsdf(random_seed) -> TSDataset:
@pytest.fixture
def simple_df_relevance() -> Tuple[pd.DataFrame, pd.DataFrame]:
timestamp = pd.date_range("2021-01-01", "2021-02-01")
tmp = np.random.random(len(timestamp))

df_1 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(32), "segment": "1"})
df_2 = pd.DataFrame({"timestamp": timestamp[5:], "target": np.arange(5, 32), "segment": "2"})
Expand Down
10 changes: 10 additions & 0 deletions tests/test_analysis/test_feature_relevance/test_relevance.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from sklearn.tree import DecisionTreeRegressor

from etna.analysis.feature_relevance import ModelRelevanceTable
from etna.analysis.feature_relevance import StatisticsRelevanceTable


Expand All @@ -6,3 +9,10 @@ def test_statistics_relevance_table(simple_df_relevance):
assert not rt.greater_is_better
df, df_exog = simple_df_relevance
assert rt(df=df, df_exog=df_exog).shape == (2, 2)


def test_model_relevance_table(simple_df_relevance):
    """ModelRelevanceTable reports greater_is_better and yields a 2x2 table on the fixture."""
    relevance = ModelRelevanceTable()
    assert relevance.greater_is_better
    df, df_exog = simple_df_relevance
    table = relevance(df=df, df_exog=df_exog, model=DecisionTreeRegressor())
    assert table.shape == (2, 2)
36 changes: 36 additions & 0 deletions tests/test_analysis/test_feature_relevance/test_relevance_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import numpy as np
import pandas as pd
import pytest
from sklearn.tree import DecisionTreeRegressor

from etna.analysis.feature_relevance import get_model_relevance_table
from etna.analysis.feature_relevance import get_statistics_relevance_table


@pytest.mark.parametrize(
    "method,method_kwargs",
    (
        (get_statistics_relevance_table, {}),
        (get_model_relevance_table, {"model": DecisionTreeRegressor()}),
    ),
)
def test_interface(method, method_kwargs, simple_df_relevance):
    """Each relevance function returns a DataFrame indexed by segment with one column per feature."""
    df, df_exog = simple_df_relevance
    relevance_table = method(df=df, df_exog=df_exog, **method_kwargs)
    assert isinstance(relevance_table, pd.DataFrame)
    expected_segments = sorted(df.columns.get_level_values("segment").unique())
    assert sorted(relevance_table.index) == expected_segments
    expected_features = sorted(df_exog.columns.get_level_values("feature").unique())
    assert sorted(relevance_table.columns) == expected_features


def test_statistics_relevance_table(simple_df_relevance):
    """Check the p-values produced for each regressor/segment pair of the fixture."""
    df, df_exog = simple_df_relevance
    relevance_table = get_statistics_relevance_table(df=df, df_exog=df_exog)
    first_regressor = relevance_table["regressor_1"]
    second_regressor = relevance_table["regressor_2"]
    assert first_regressor["1"] < 1e-14
    assert first_regressor["2"] > 1e-1
    assert np.isnan(second_regressor["1"])
    assert second_regressor["2"] < 1e-10


def test_model_relevance_table(simple_df_relevance):
    """Check the importances a decision tree assigns to each regressor/segment pair of the fixture."""
    df, df_exog = simple_df_relevance
    relevance_table = get_model_relevance_table(df=df, df_exog=df_exog, model=DecisionTreeRegressor())
    first_regressor = relevance_table["regressor_1"]
    second_regressor = relevance_table["regressor_2"]
    # Segment "1": all importance goes to regressor_1, none to regressor_2.
    assert np.allclose(first_regressor["1"], 1)
    assert np.allclose(second_regressor["1"], 0)
    # Segment "2": regressor_2 outranks regressor_1.
    assert first_regressor["2"] < second_regressor["2"]

This file was deleted.