Merge branch 'master' into example/regressors-notebook
iKintosh committed Feb 28, 2022
2 parents 800432f + 09a7938 commit a67945f
Showing 7 changed files with 395 additions and 71 deletions.
9 changes: 6 additions & 3 deletions CHANGELOG.md
@@ -25,7 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add regressors example notebook ([#577](https://github.com/tinkoff-ai/etna/pull/577))
- Add option `season_number` to DateFlagsTransform ([#567](https://github.com/tinkoff-ai/etna/pull/567))
-

-
-
- Create `AbstractPipeline` ([#573](https://github.com/tinkoff-ai/etna/pull/573))
-
### Changed
- Change the way `ProphetModel` works with regressors ([#383](https://github.com/tinkoff-ai/etna/pull/383))
- Change the way `SARIMAXModel` works with regressors ([#380](https://github.com/tinkoff-ai/etna/pull/380))
@@ -39,10 +42,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Update CONTRIBUTING.md ([#536](https://github.com/tinkoff-ai/etna/pull/536))
-
- Rename `_CatBoostModel`, `_HoltWintersModel`, `_SklearnModel` ([#543](https://github.com/tinkoff-ai/etna/pull/543))
-
- Add logging to TSDataset.make_future, log repr of transform instead of class name ([#555](https://github.com/tinkoff-ai/etna/pull/555))
- Rename `_SARIMAXModel` and `_ProphetModel`, make `SARIMAXModel` and `ProphetModel` inherit from `PerSegmentPredictionIntervalModel` ([#549](https://github.com/tinkoff-ai/etna/pull/549))
-
-
- Make detrending polynomial ([#566](https://github.com/tinkoff-ai/etna/pull/566))
-
- Make `LabelEncoderTransform` and `OneHotEncoderTransform` multi-segment ([#554](https://github.com/tinkoff-ai/etna/pull/554))
### Fixed
4 changes: 2 additions & 2 deletions etna/core/mixins.py
@@ -16,14 +16,14 @@ def __repr__(self):
continue
elif param.kind == param.VAR_KEYWORD:
for arg_, value in self.__dict__[arg].items():
args_str_representation += f"{arg_} = {value.__repr__()}, "
args_str_representation += f"{arg_} = {repr(value)}, "
else:
try:
value = self.__dict__[arg]
except KeyError as e:
value = None
warnings.warn(f"You haven't set all parameters inside class __init__ method: {e}")
args_str_representation += f"{arg} = {value.__repr__()}, "
args_str_representation += f"{arg} = {repr(value)}, "
return f"{self.__class__.__name__}({args_str_representation})"


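For context, a minimal sketch (not part of this diff) of the behaviour the mixins.py hunk touches: every captured constructor argument is now formatted with `repr(value)` rather than `value.__repr__()`, which is equivalent for ordinary objects but is the more idiomatic spelling. `DummyTransform` below is a hypothetical class, used only to show the kind of string `BaseMixin.__repr__` is assumed to produce.

```python
from etna.core import BaseMixin


class DummyTransform(BaseMixin):
    """Hypothetical class used only to illustrate the repr that BaseMixin builds."""

    def __init__(self, in_column: str, window: int = 7):
        self.in_column = in_column
        self.window = window


# Every captured argument goes through repr(value), so nested objects render
# via their own __repr__ implementations.
print(repr(DummyTransform("target", window=14)))
# expected output (assumption about exact spacing): DummyTransform(in_column = 'target', window = 14, )
```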
5 changes: 3 additions & 2 deletions etna/datasets/tsdataset.py
@@ -134,7 +134,7 @@ def transform(self, transforms: Sequence["Transform"]):
self._check_endings(warning=True)
self.transforms = transforms
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
tslogger.log(f"Transform {repr(transform)} is applied to dataset")
columns_before = set(self.columns.get_level_values("feature"))
self.df = transform.transform(self.df)
columns_after = set(self.columns.get_level_values("feature"))
@@ -145,7 +145,7 @@ def fit_transform(self, transforms: Sequence["Transform"]):
self._check_endings(warning=True)
self.transforms = transforms
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
tslogger.log(f"Transform {repr(transform)} is applied to dataset")
columns_before = set(self.columns.get_level_values("feature"))
self.df = transform.fit_transform(self.df)
columns_after = set(self.columns.get_level_values("feature"))
@@ -288,6 +288,7 @@ def make_future(self, future_steps: int) -> "TSDataset":

if self.transforms is not None:
for transform in self.transforms:
tslogger.log(f"Transform {repr(transform)} is applied to dataset")
df = transform.transform(df)

future_dataset = df.tail(future_steps).copy(deep=True)
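A short usage sketch of the logging change in tsdataset.py: `fit_transform` already logged each transform, and `make_future` now logs the same repr-based message before re-applying transforms to the future frame. The dataset below is synthetic and `LagTransform` is used only as an illustrative transform; the exact log wording follows the f-string in the hunk above.

```python
import numpy as np
import pandas as pd

from etna.datasets import TSDataset
from etna.transforms import LagTransform

# synthetic single-segment dataset
df = pd.DataFrame(
    {
        "timestamp": pd.date_range("2021-01-01", periods=30, freq="D"),
        "segment": "segment_0",
        "target": np.arange(30, dtype=float),
    }
)
ts = TSDataset(TSDataset.to_dataset(df), freq="D")

transforms = [LagTransform(in_column="target", lags=[7], out_column="lag")]
ts.fit_transform(transforms)  # logs: Transform LagTransform(in_column = 'target', ...) is applied to dataset
future = ts.make_future(7)    # the same repr-based message is now logged here as well
```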
82 changes: 82 additions & 0 deletions etna/pipeline/base.py
@@ -1,9 +1,91 @@
import warnings
from abc import ABC
from abc import abstractmethod
from typing import Any
from typing import Dict
from typing import List
from typing import Sequence
from typing import Tuple

import pandas as pd

from etna.core import BaseMixin
from etna.datasets import TSDataset
from etna.metrics import Metric


class AbstractPipeline(ABC):
"""Interface for pipeline."""

@abstractmethod
def fit(self, ts: TSDataset) -> "AbstractPipeline":
"""Fit the Pipeline.
Parameters
----------
ts:
Dataset with timeseries data
Returns
-------
self:
Fitted Pipeline instance
"""
pass

@abstractmethod
def forecast(self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975)) -> TSDataset:
"""Make predictions.
Parameters
----------
prediction_interval:
If True returns prediction interval for forecast
quantiles:
Levels of prediction distribution. By default 2.5% and 97.5% taken to form a 95% prediction interval
Returns
-------
forecast:
Dataset with predictions
"""
pass

@abstractmethod
def backtest(
self,
ts: TSDataset,
metrics: List[Metric],
n_folds: int = 5,
mode: str = "expand",
aggregate_metrics: bool = False,
n_jobs: int = 1,
joblib_params: Dict[str, Any] = dict(verbose=11, backend="multiprocessing", mmap_mode="c"),
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Run backtest with the pipeline.
Parameters
----------
ts:
Dataset to fit models in backtest
metrics:
List of metrics to compute for each fold
n_folds:
Number of folds
mode:
One of 'expand', 'constant' -- train generation policy
aggregate_metrics:
If True aggregate metrics above folds, return raw metrics otherwise
n_jobs:
Number of jobs to run in parallel
joblib_params:
Additional parameters for joblib.Parallel
Returns
-------
metrics_df, forecast_df, fold_info_df:
Metrics dataframe, forecast dataframe and dataframe with information about folds
"""


class BasePipeline(ABC, BaseMixin):
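To illustrate what the new `AbstractPipeline` interface demands of a concrete class, here is a hypothetical minimal sketch. `NaiveMeanPipeline`, its `horizon` argument, and the per-segment mean logic are illustrative assumptions, not part of etna, and `backtest` is deliberately left unimplemented; the real `Pipeline` builds on `BasePipeline` instead.

```python
from typing import Sequence

import pandas as pd

from etna.datasets import TSDataset
from etna.pipeline.base import AbstractPipeline


class NaiveMeanPipeline(AbstractPipeline):
    """Hypothetical pipeline that forecasts each segment's historical mean."""

    def __init__(self, horizon: int):
        self.horizon = horizon
        self.ts = None

    def fit(self, ts: TSDataset) -> "NaiveMeanPipeline":
        # remember the training dataset; nothing to estimate beyond it
        self.ts = ts
        return self

    def forecast(
        self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975)
    ) -> TSDataset:
        future = self.ts.make_future(self.horizon)
        # per-segment mean of the training "target" column (column levels are assumed
        # to be named "segment" and "feature", as in the tsdataset.py hunk above)
        means = self.ts.df.xs("target", level="feature", axis=1).mean()
        for segment in self.ts.segments:
            future.df.loc[:, pd.IndexSlice[segment, "target"]] = means[segment]
        return future

    def backtest(self, ts, metrics, n_folds=5, mode="expand", aggregate_metrics=False, n_jobs=1,
                 joblib_params=dict(verbose=11, backend="multiprocessing", mmap_mode="c")):
        # omitted in this sketch; a real pipeline implements fold generation and metric aggregation
        raise NotImplementedError
```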
73 changes: 48 additions & 25 deletions etna/transforms/decomposition/detrend.py
@@ -1,7 +1,10 @@
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import TheilSenRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform
@@ -11,7 +14,7 @@
class _OneSegmentLinearTrendBaseTransform(Transform):
"""LinearTrendBaseTransform is a base class that implements trend subtraction and reconstruction feature."""

def __init__(self, in_column: str, regressor: RegressorMixin):
def __init__(self, in_column: str, regressor: RegressorMixin, poly_degree: int = 1):
"""
Create instance of _OneSegmentLinearTrendBaseTransform.
@@ -21,9 +24,26 @@ def __init__(self, in_column: str, regressor: RegressorMixin):
name of processed column
regressor:
instance of sklearn RegressorMixin to predict trend
poly_degree:
degree of polynomial to fit trend on
"""
self._linear_model = regressor
self.in_column = in_column
self.poly_degree = poly_degree
self._pipeline = Pipeline(
[("polynomial", PolynomialFeatures(degree=self.poly_degree, include_bias=False)), ("regressor", regressor)]
)
# verification that this variable is fitted isn't needed because this class isn't used by the user
self._x_median = None

@staticmethod
def _get_x(df) -> np.ndarray:
series_len = len(df)
x = df.index.to_series()
if isinstance(type(x.dtype), pd.Timestamp):
raise ValueError("Your timestamp column has wrong format. Need np.datetime64 or datetime.datetime")
x = x.apply(lambda ts: ts.timestamp())
x = x.to_numpy().reshape(series_len, 1)
return x

def fit(self, df: pd.DataFrame) -> "_OneSegmentLinearTrendBaseTransform":
"""
Expand All @@ -40,14 +60,11 @@ def fit(self, df: pd.DataFrame) -> "_OneSegmentLinearTrendBaseTransform":
instance with trained regressor
"""
df = df.dropna(subset=[self.in_column])
series_len = len(df)
x = df.index.to_series()
if isinstance(type(x.dtype), pd.Timestamp):
raise ValueError("Your timestamp column has wrong format. Need np.datetime64 or datetime.datetime")
x = x.apply(lambda ts: ts.timestamp())
x = x.to_numpy().reshape(series_len, 1)
x = self._get_x(df)
self._x_median = np.median(x)
x -= self._x_median
y = df[self.in_column].tolist()
self._linear_model.fit(x, y)
self._pipeline.fit(x, y)
return self

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -65,12 +82,10 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
residue after trend subtraction
"""
result = df.copy()
series_len = len(df)
x = pd.to_datetime(df.index.to_series())
x = x.apply(lambda ts: ts.timestamp())
x = x.to_numpy().reshape(series_len, 1)
x = self._get_x(df)
x -= self._x_median
y = df[self.in_column].values
trend = self._linear_model.predict(x)
trend = self._pipeline.predict(x)
no_trend_timeseries = y - trend
result[self.in_column] = no_trend_timeseries
return result
@@ -106,12 +121,10 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
data with reconstructed trend
"""
result = df.copy()
series_len = len(df)
x = pd.to_datetime(df.index.to_series())
x = x.apply(lambda ts: ts.timestamp())
x = x.to_numpy().reshape(series_len, 1)
x = self._get_x(df)
x -= self._x_median
y = df[self.in_column].values
trend = self._linear_model.predict(x)
trend = self._pipeline.predict(x)
add_trend_timeseries = y + trend
result[self.in_column] = add_trend_timeseries
if self.in_column == "target":
@@ -122,49 +135,56 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:


class LinearTrendTransform(PerSegmentWrapper):
"""Transform that uses sklearn.linear_model.LinearRegression to find linear trend in data.
"""Transform that uses sklearn.linear_model.LinearRegression to find linear or polynomial trend in data.
Warning
-------
This transform can suffer from look-ahead bias. For transforming data at some timestamp
it uses information from the whole train part.
"""

def __init__(self, in_column: str, **regression_params):
def __init__(self, in_column: str, poly_degree: int = 1, **regression_params):
"""Create instance of LinearTrendTransform.
Parameters
----------
in_column:
name of processed column
poly_degree:
degree of polynomial to fit trend on
regression_params:
params that should be used to init LinearRegression
"""
self.in_column = in_column
self.poly_degree = poly_degree
self.regression_params = regression_params
super().__init__(
transform=_OneSegmentLinearTrendBaseTransform(
in_column=self.in_column, regressor=LinearRegression(**self.regression_params)
in_column=self.in_column,
regressor=LinearRegression(**self.regression_params),
poly_degree=self.poly_degree,
)
)


class TheilSenTrendTransform(PerSegmentWrapper):
"""Transform that uses sklearn.linear_model.TheilSenRegressor to find linear trend in data.
"""Transform that uses sklearn.linear_model.TheilSenRegressor to find linear or polynomial trend in data.
Warning
-------
This transform can suffer from look-ahead bias. For transforming data at some timestamp
it uses information from the whole train part.
"""

def __init__(self, in_column: str, **regression_params):
def __init__(self, in_column: str, poly_degree: int = 1, **regression_params):
"""Create instance of TheilSenTrendTransform.
Parameters
----------
in_column:
name of processed column
poly_degree:
degree of polynomial to fit trend on
regression_params:
params that should be used to init TheilSenRegressor
@@ -174,9 +194,12 @@ def __init__(self, in_column: str, **regression_params):
of features (plus 1 if fit_intercept=True) and the number of samples in the shortest segment as a maximum.
"""
self.in_column = in_column
self.poly_degree = poly_degree
self.regression_params = regression_params
super().__init__(
transform=_OneSegmentLinearTrendBaseTransform(
in_column=self.in_column, regressor=TheilSenRegressor(**self.regression_params)
in_column=self.in_column,
regressor=TheilSenRegressor(**self.regression_params),
poly_degree=self.poly_degree,
)
)
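A brief usage sketch of the new `poly_degree` option in detrend.py: fit and remove a quadratic trend per segment, then restore it with `inverse_transform`. The synthetic series and column names are illustrative assumptions; `LinearTrendTransform`, `poly_degree`, and the forwarding of extra `regression_params` to the underlying regressor come from the diff above.

```python
import numpy as np
import pandas as pd

from etna.datasets import TSDataset
from etna.transforms import LinearTrendTransform

# synthetic series with a quadratic trend plus noise
timestamps = pd.date_range("2021-01-01", periods=100, freq="D")
df = pd.DataFrame(
    {
        "timestamp": timestamps,
        "segment": "segment_0",
        "target": 0.01 * np.arange(100) ** 2 + np.random.normal(0, 1, size=100),
    }
)
ts = TSDataset(TSDataset.to_dataset(df), freq="D")

# degree-2 polynomial detrending; any extra kwargs still go to LinearRegression
detrend = LinearTrendTransform(in_column="target", poly_degree=2)
ts.fit_transform([detrend])  # "target" now holds residuals around the fitted trend
ts.inverse_transform()       # puts the trend back
```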
