Skip to content

Commit

Permalink
Add TSDatasets.__init__ regressors logic (#357)
Browse files Browse the repository at this point in the history
  • Loading branch information
Mr-Geekman committed Dec 9, 2021
1 parent f0c2bfe commit 692ff58
Show file tree
Hide file tree
Showing 11 changed files with 175 additions and 105 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Add regressors logic to TSDataset init ([#357](https://github.com/tinkoff-ai/etna/pull/357))

## [1.4.0] - 2021-12-03
### Added
- ACF plot ([#318](https://github.com/tinkoff-ai/etna/pull/318))
Expand Down
77 changes: 56 additions & 21 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import math
import warnings
from copy import copy
from typing import TYPE_CHECKING
from typing import List
from typing import Optional
Expand All @@ -10,6 +11,7 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from typing_extensions import Literal

from etna.loggers import tslogger

Expand Down Expand Up @@ -56,7 +58,7 @@ class TSDataset:
>>> df_regressors["segment"] = "segment_0"
>>> df_to_forecast = TSDataset.to_dataset(df_to_forecast)
>>> df_regressors = TSDataset.to_dataset(df_regressors)
>>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors)
>>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors, known_future="all")
>>> tsdataset.df.head(5)
segment segment_0
feature regressor_0 regressor_1 regressor_2 regressor_3 regressor_4 target
Expand All @@ -70,7 +72,13 @@ class TSDataset:

idx = pd.IndexSlice

def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None):
def __init__(
self,
df: pd.DataFrame,
freq: str,
df_exog: Optional[pd.DataFrame] = None,
known_future: Union[Literal["all"], Sequence] = (),
):
"""Init TSDataset.
Parameters
Expand All @@ -82,6 +90,9 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame]
df_exog:
dataframe with exogenous data;
if the series is known in the future features' names should start with prefix 'regressor_`.
known_future:
columns in df_exog[known_future] that are regressors,
if "all" value is given, all columns are meant to be regressors
"""
self.raw_df = df.copy(deep=True)
self.raw_df.index = pd.to_datetime(self.raw_df.index)
Expand All @@ -105,13 +116,15 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame]

self.df = self.raw_df.copy(deep=True)

self.known_future = self._check_known_future(known_future, df_exog)
self._regressors = copy(self.known_future)

if df_exog is not None:
self.df_exog = df_exog.copy(deep=True)
self.df_exog.index = pd.to_datetime(self.df_exog.index)
self.df = self._merge_exog(self.df)

self.transforms: Optional[Sequence["Transform"]] = None
self._update_regressors()

def transform(self, transforms: Sequence["Transform"]):
"""Apply given transform to the data."""
Expand All @@ -120,7 +133,6 @@ def transform(self, transforms: Sequence["Transform"]):
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
self.df = transform.transform(self.df)
self._update_regressors()

def fit_transform(self, transforms: Sequence["Transform"]):
"""Fit and apply given transforms to the data."""
Expand All @@ -129,7 +141,6 @@ def fit_transform(self, transforms: Sequence["Transform"]):
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
self.df = transform.fit_transform(self.df)
self._update_regressors()

def __repr__(self):
    """Return the string representation of the underlying dataframe ``self.df``."""
    frame = self.df
    return frame.__repr__()
Expand Down Expand Up @@ -177,7 +188,9 @@ def make_future(self, future_steps: int) -> "TSDataset":
... })
>>> df_ts_format = TSDataset.to_dataset(df)
>>> df_regressors_ts_format = TSDataset.to_dataset(df_regressors)
>>> ts = TSDataset(df_ts_format, "D", df_exog=df_regressors_ts_format)
>>> ts = TSDataset(
... df_ts_format, "D", df_exog=df_regressors_ts_format, known_future="all"
... )
>>> ts.make_future(4)
segment segment_0 segment_1
feature regressor_1 regressor_2 target regressor_1 regressor_2 target
Expand Down Expand Up @@ -221,14 +234,39 @@ def make_future(self, future_steps: int) -> "TSDataset":
return future_ts

@staticmethod
def _check_regressors(df: pd.DataFrame, df_exog: pd.DataFrame):
"""Check that regressors in df_exog begin not later than in df and end later than in df."""
def _check_known_future(
    known_future: Union[Literal["all"], Sequence], df_exog: Optional[pd.DataFrame]
) -> List[str]:
    """Validate ``known_future`` against ``df_exog`` and return the initial list of regressors.

    Parameters
    ----------
    known_future:
        either the literal "all" or a sequence of feature names
    df_exog:
        dataframe with exogenous data whose columns have a level named "feature";
        None is treated as having no features

    Returns
    -------
    :
        sorted list of unique regressor names; every feature of ``df_exog`` for "all"

    Raises
    ------
    ValueError:
        if ``known_future`` is a string other than "all"
    ValueError:
        if ``known_future`` contains names that are not features of ``df_exog``
    """
    # Without exogenous data there are no features that could be regressors.
    exog_columns = set() if df_exog is None else set(df_exog.columns.get_level_values("feature"))

    # A string is only accepted as the "all" literal, never as a single feature name.
    if isinstance(known_future, str):
        if known_future != "all":
            raise ValueError("The only possible literal is 'all'")
        return sorted(exog_columns)

    known_future_unique = set(known_future)
    if not known_future_unique.issubset(exog_columns):
        raise ValueError(
            "Some features in known_future are not present in df_exog: "
            f"{known_future_unique.difference(exog_columns)}"
        )
    return sorted(known_future_unique)

@staticmethod
def _check_regressors(df: pd.DataFrame, df_regressors: pd.DataFrame):
"""Check that regressors begin not later than in df and end later than in df."""
# TODO: check performance
df_segments = df.columns.get_level_values("segment")
for segment in df_segments:
target = df[segment]["target"].dropna()
exog_regressor_columns = [x for x in set(df_exog[segment].columns) if x.startswith("regressor")]
for series in exog_regressor_columns:
exog_series = df_exog[segment][series].dropna()
target = df.loc[:, pd.IndexSlice[segment, "target"]].dropna()
for series in df_regressors.columns.get_level_values("feature"):
exog_series = df_regressors.loc[:, pd.IndexSlice[segment, series]].dropna()
if target.index.min() < exog_series.index.min():
raise ValueError(
f"All the regressor series should start not later than corresponding 'target'."
Expand All @@ -243,7 +281,9 @@ def _check_regressors(df: pd.DataFrame, df_exog: pd.DataFrame):
)

# Validate the exogenous data in self.df_exog against df, then left-join it onto df
# by index and return the result with columns sorted on both column levels.
def _merge_exog(self, df: pd.DataFrame) -> pd.DataFrame:
# NOTE(review): this call and the df_regressors-based call below look like the removed
# and the added side of the same diff hunk — confirm which one the real file keeps.
self._check_regressors(df=df, df_exog=self.df_exog)
# Unique segment names present in df, in sorted order.
segments = sorted(set(df.columns.get_level_values("segment")))
# Restrict the check to the features listed in self.known_future for those segments.
df_regressors = self.df_exog.loc[:, pd.IndexSlice[segments, self.known_future]]
self._check_regressors(df=df, df_regressors=df_regressors)
# how="left" keeps the index of df; columns are then ordered by levels 0 and 1.
df = pd.merge(df, self.df_exog, left_index=True, right_index=True, how="left").sort_index(axis=1, level=(0, 1))
return df

Expand Down Expand Up @@ -279,13 +319,6 @@ def segments(self) -> List[str]:
"""
return self.df.columns.get_level_values("segment").unique().tolist()

def _update_regressors(self):
    """Recompute ``self._regressors`` from the current feature names.

    Every distinct name on the "feature" level of ``self.columns`` that starts with
    "regressor" is collected. The result is stored as a sorted list so that its order
    does not depend on set iteration order, which varies between interpreter runs.
    """
    feature_names = self.columns.get_level_values("feature")
    self._regressors = sorted({name for name in feature_names if name.startswith("regressor")})

@property
def regressors(self) -> List[str]:
"""Get list of all regressors across all segments in dataset.
Expand All @@ -307,7 +340,9 @@ def regressors(self) -> List[str]:
... )
>>> df_exog = pd.concat([df_regressors_1, df_regressors_2], ignore_index=True)
>>> df_exog_ts_format = TSDataset.to_dataset(df_exog)
>>> ts = TSDataset(df_ts_format, df_exog=df_exog_ts_format, freq="D")
>>> ts = TSDataset(
... df_ts_format, df_exog=df_exog_ts_format, freq="D", known_future="all"
... )
>>> ts.regressors
['regressor_1']
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def example_reg_tsds(random_seed) -> TSDataset:
df = TSDataset.to_dataset(df)
exog = TSDataset.to_dataset(exog)

tsds = TSDataset(df, freq="D", df_exog=exog)
tsds = TSDataset(df, freq="D", df_exog=exog, known_future="all")

return tsds

Expand Down Expand Up @@ -235,7 +235,7 @@ def outliers_tsds():
df.columns.names = ["segment", "feature"]

exog = df.copy()
exog.columns = pd.MultiIndex.from_arrays([["1", "2"], ["exog", "exog"]])
exog.columns.set_levels(["exog"], level="feature", inplace=True)

tsds = TSDataset(df, "1d", exog)

Expand Down
Loading

0 comments on commit 692ff58

Please sign in to comment.