Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into issue-271
Browse files Browse the repository at this point in the history
  • Loading branch information
d.a.bunin committed Nov 16, 2021
2 parents eb021bc + 4b1a41c commit 7f97ae9
Show file tree
Hide file tree
Showing 8 changed files with 557 additions and 575 deletions.
14 changes: 12 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
- RelevanceTable returns rank ([#268](https://github.com/tinkoff-ai/etna-ts/pull/268/))

### Changed

### Fixed

## [1.3.1] - 2021-11-12
### Changed
- Delete restriction on version of pandas ([#274](https://github.com/tinkoff-ai/etna-ts/pull/274))

## [1.3.0] - 2021-11-12
### Added
- Backtest cli ([#223](https://github.com/tinkoff-ai/etna-ts/pull/223), [#259](https://github.com/tinkoff-ai/etna-ts/pull/259))
- TreeFeatureSelectionTransform ([#229](https://github.com/tinkoff-ai/etna-ts/pull/229))
- MRMRFeatureSelectionTransform ([#251](https://github.com/tinkoff-ai/etna-ts/pull/251))
- Feature relevance table calculation ([#227](https://github.com/tinkoff-ai/etna-ts/pull/227), [#249](https://github.com/tinkoff-ai/etna-ts/pull/249))
- Feature relevance table calculation using tsfresh ([#227](https://github.com/tinkoff-ai/etna-ts/pull/227), [#249](https://github.com/tinkoff-ai/etna-ts/pull/249))
- Method to_flatten to TSDataset ([#241](https://github.com/tinkoff-ai/etna-ts/pull/241))
- Out_column parameter to not inplace transforms ([#211](https://github.com/tinkoff-ai/etna-ts/pull/211))
Expand Down
20 changes: 17 additions & 3 deletions etna/analysis/feature_relevance/relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from abc import abstractmethod

import pandas as pd
import scipy.stats

from etna.analysis.feature_relevance.relevance_table import get_model_relevance_table
from etna.analysis.feature_relevance.relevance_table import get_statistics_relevance_table
Expand All @@ -21,8 +22,15 @@ def __init__(self, greater_is_better: bool):
"""
self.greater_is_better = greater_is_better

def _get_ranks(self, table: pd.DataFrame) -> pd.DataFrame:
    """Compute rank relevance table from relevance table.

    Ranks are computed independently for every row of ``table``; rank 1 goes to the
    most relevant column of the row according to ``self.greater_is_better``.

    Parameters
    ----------
    table:
        relevance table; it is not modified by this method

    Returns
    -------
    pd.DataFrame
        integer rank table with the same index and columns as ``table``
    """
    # Negate into a new object when bigger is better, so the largest relevance gets rank 1.
    # An in-place ``table *= -1`` would silently flip the sign of the caller's dataframe.
    scores = table * -1 if self.greater_is_better else table
    # NOTE: rankdata uses its default tie method ("average"), so tied values receive a
    # fractional rank that the integer cast below truncates.
    rank_table = pd.DataFrame(
        scipy.stats.rankdata(scores, axis=1),
        columns=table.columns,
        index=table.index,
    )
    return rank_table.astype(int)

@abstractmethod
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame:
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame:
"""Compute relevance table.
For each series in df compute relevance of corresponding series in df_exog.
Expand All @@ -32,6 +40,8 @@ def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.Data
dataframe with series that will be used as target
df_exog:
dataframe with series to compute relevance for df
return_ranks:
if False return relevance values else return ranks of relevance values
Returns
-------
Expand All @@ -47,9 +57,11 @@ class StatisticsRelevanceTable(RelevanceTable):
def __init__(self):
    """Init StatisticsRelevanceTable.

    Passes ``greater_is_better=False`` to the base class, so when ranks are requested
    the smallest relevance value in a row receives the smallest rank.
    """
    super().__init__(greater_is_better=False)

def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame:
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame:
    """Build the relevance table via etna.analysis.get_statistics_relevance_table.

    Parameters
    ----------
    df:
        dataframe with series that will be used as target
    df_exog:
        dataframe with series to compute relevance for df
    return_ranks:
        if False return relevance values else return ranks of relevance values
    **kwargs:
        accepted for signature compatibility; not forwarded to the underlying function

    Returns
    -------
    pd.DataFrame
        relevance values, or their ranks when ``return_ranks`` is True
    """
    relevance_table = get_statistics_relevance_table(df=df, df_exog=df_exog)
    return self._get_ranks(relevance_table) if return_ranks else relevance_table


Expand All @@ -59,7 +71,9 @@ class ModelRelevanceTable(RelevanceTable):
def __init__(self):
    """Init ModelRelevanceTable.

    Passes ``greater_is_better=True`` to the base class, so when ranks are requested
    the largest relevance value in a row receives the smallest rank.
    """
    super().__init__(greater_is_better=True)

def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame:
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame:
    """Build the relevance table via etna.analysis.get_model_relevance_table.

    Parameters
    ----------
    df:
        dataframe with series that will be used as target
    df_exog:
        dataframe with series to compute relevance for df
    return_ranks:
        if False return relevance values else return ranks of relevance values
    **kwargs:
        extra keyword arguments forwarded to get_model_relevance_table

    Returns
    -------
    pd.DataFrame
        relevance values, or their ranks when ``return_ranks`` is True
    """
    relevance_table = get_model_relevance_table(df=df, df_exog=df_exog, **kwargs)
    return self._get_ranks(relevance_table) if return_ranks else relevance_table
1 change: 0 additions & 1 deletion etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from etna.transforms.datetime_flags import TimeFlagsTransform
from etna.transforms.detrend import LinearTrendTransform
from etna.transforms.detrend import TheilSenTrendTransform
from etna.transforms.feature_importance import MRMRFeatureSelectionTransform
from etna.transforms.feature_importance import TreeFeatureSelectionTransform
from etna.transforms.filter import FilterFeaturesTransform
from etna.transforms.imputation import TimeSeriesImputerTransform
Expand Down
109 changes: 0 additions & 109 deletions etna/transforms/feature_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,12 @@
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from mrmr import mrmr_classif
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

from etna.analysis import RelevanceTable
from etna.clustering import EuclideanClustering
from etna.clustering import HierarchicalClustering
from etna.datasets import TSDataset
from etna.transforms.base import Transform

Expand Down Expand Up @@ -134,108 +130,3 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
)
result = result.loc[:, pd.IndexSlice[:, selected_columns]]
return result


class MRMRFeatureSelectionTransform(Transform):
    """Transform that selects regressors according to mRMR variable selection method."""

    def __init__(
        self,
        relevance_method: RelevanceTable,
        top_k: int,
        clustering_method: Optional[HierarchicalClustering] = None,
        n_clusters: int = 10,
        linkage: str = "average",
        **relevance_params,
    ):
        """
        Init MRMRFeatureSelectionTransform.

        Parameters
        ----------
        relevance_method:
            method to calculate relevance table
        top_k:
            num of regressors to select; if there are not enough regressors, then all will be selected
        clustering_method:
            method of time series clustering; if None, a new EuclideanClustering is created
            for this transform
        n_clusters:
            number of clusters
        linkage:
            rule for distance computation for new clusters, allowed "ward", "single", "average", "maximum", "complete"
        **relevance_params:
            extra keyword arguments passed to ``relevance_method`` during fit

        Raises
        ------
        ValueError:
            if top_k is not an integer or is negative
        ValueError:
            if n_clusters is not an integer or is less than 2
        """
        # NOTE(review): the check rejects only negative values, so top_k == 0 passes
        # even though the message says "positive" -- confirm which one is intended.
        if not isinstance(top_k, int) or top_k < 0:
            raise ValueError("Parameter top_k should be positive integer")

        if not isinstance(n_clusters, int) or n_clusters < 2:
            raise ValueError("Parameter n_clusters should be integer and greater than 1")

        # A default evaluated in the signature would be created once and shared by every
        # transform; fit() mutates the clustering object, so each transform needs its own.
        if clustering_method is None:
            clustering_method = EuclideanClustering()

        self.relevance_method = relevance_method
        self.clustering = clustering_method
        self.n_clusters = n_clusters
        self.linkage = linkage
        self.top_k = top_k
        self.relevance_params = relevance_params
        self.selected_regressors: Optional[List[str]] = None

    @staticmethod
    def _get_regressors(df: pd.DataFrame) -> List[str]:
        """Get sorted list of unique "regressor_"-prefixed names from the "feature" column level."""
        features = df.columns.get_level_values("feature")
        return sorted({column for column in features if column.startswith("regressor_")})

    def fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
        """
        Fit the method and remember features to select.

        Parameters
        ----------
        df:
            dataframe with all segments data

        Returns
        -------
        result: MRMRFeatureSelectionTransform
            instance after fitting

        Raises
        ------
        ValueError:
            if the number of regressors in df is not greater than n_clusters
        """
        if len(self._get_regressors(df)) <= self.n_clusters:
            raise ValueError("The number of clusters must be strictly less than the number of regressors")

        ts = TSDataset(df=df, freq=pd.infer_freq(df.index))
        self.clustering.build_distance_matrix(ts=ts)
        self.clustering.build_clustering_algo(n_clusters=self.n_clusters, linkage=self.linkage)
        segment_to_cluster = self.clustering.fit_predict()
        relevance_table = self.relevance_method(ts[:, :, "target"], ts[:, :, ts.regressors], **self.relevance_params)
        # Cluster label of every relevance-table row, in row order; used as the target for mRMR.
        y = np.array([segment_to_cluster[segment] for segment in relevance_table.index], dtype=float)
        self.selected_regressors = mrmr_classif(relevance_table, y, K=self.top_k)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Select top_k regressors.

        Parameters
        ----------
        df:
            dataframe with all segments data

        Returns
        -------
        result: pd.DataFrame
            Dataframe with only selected regressors

        Raises
        ------
        ValueError:
            if the transform has not been fitted yet
        """
        if self.selected_regressors is None:
            # Fail with a clear message instead of a TypeError from ``in None`` below.
            raise ValueError("Transform is not fitted! Call fit before transform.")

        selected_regressors = set(self.selected_regressors)
        result = df.copy()
        # Keep every non-regressor feature plus the regressors chosen during fit.
        selected_columns = sorted(
            column
            for column in df.columns.get_level_values("feature").unique()
            if not column.startswith("regressor_") or column in selected_regressors
        )
        result = result.loc[:, pd.IndexSlice[:, selected_columns]]
        return result
Loading

0 comments on commit 7f97ae9

Please sign in to comment.