diff --git a/CHANGELOG.md b/CHANGELOG.md index be484d550..1746581ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,7 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add tests on `inverse_transform` method of transforms on subset of segments, on new segments, on future with gap ([#1127](https://github.com/tinkoff-ai/etna/pull/1127)) - In-sample prediction for `BATSModel` and `TBATSModel` ([#1181](https://github.com/tinkoff-ai/etna/pull/1181)) - Method `predict_components` for forecast decomposition in `_TBATSAdapter` ([#1181](https://github.com/tinkoff-ai/etna/pull/1181)) -- +- Forecast decomposition for `DeadlineMovingAverageModel` ([#1186](https://github.com/tinkoff-ai/etna/pull/1186)) +- ### Changed - Add optional `features` parameter in the signature of `TSDataset.to_pandas`, `TSDataset.to_flatten` ([#809](https://github.com/tinkoff-ai/etna/pull/809)) - Signature of the constructor of `TFTModel`, `DeepARModel` ([#1110](https://github.com/tinkoff-ai/etna/pull/1110)) diff --git a/etna/models/deadline_ma.py b/etna/models/deadline_ma.py index 58c2cc468..a02c91bdd 100644 --- a/etna/models/deadline_ma.py +++ b/etna/models/deadline_ma.py @@ -1,6 +1,7 @@ import warnings from enum import Enum from typing import Optional +from typing import Tuple import numpy as np import pandas as pd @@ -25,7 +26,14 @@ def _missing_(cls, value): class DeadlineMovingAverageModel( NonPredictionIntervalContextRequiredAbstractModel, ): - """Moving average model that uses exact previous dates to predict.""" + """Moving average model that uses exact previous dates to predict. + + Notes + ----- + This model supports in-sample and out-of-sample prediction decomposition. + Prediction components are corresponding target seasonal lags (monthly or annual) + with weights of :math:`1/window`. + """ def __init__(self, window: int = 3, seasonality: str = "month"): """Initialize deadline moving average model. 
@@ -156,6 +164,53 @@ def _get_context_beginning( return first_index + def _get_previous_date(self, date, offset): + """Get previous date using seasonality offset.""" + if self.seasonality == SeasonalityMode.month: + prev_date = date - pd.DateOffset(months=offset) + elif self.seasonality == SeasonalityMode.year: + prev_date = date - pd.DateOffset(years=offset) + + return prev_date + + def _make_prediction_components( + self, result_template: pd.DataFrame, context: pd.DataFrame, prediction_size: int + ) -> pd.DataFrame: + """Estimate prediction components using ``result_template`` as a base and ``context`` as a context.""" + index = result_template.index + end_idx = len(result_template) + start_idx = end_idx - prediction_size + + components_data = [] + for i in range(start_idx, end_idx): + + obs_components = [] + for w in range(1, self.window + 1): + prev_date = self._get_previous_date(date=result_template.index[i], offset=w) + obs_components.append(context.loc[prev_date].values) + + components_data.append(obs_components) + + # shape: (prediction_size, window, num_segments) + raw_components = np.asarray(components_data, dtype=float) + + # shape: (prediction_size, num_segments, window) + # this is needed to place elements in the right order + raw_components = np.swapaxes(raw_components, -1, -2) + + # shape: (prediction_size, num_segments * window) + raw_components = raw_components.reshape(raw_components.shape[0], -1) + raw_components /= self.window + + components_names = [f"target_component_{self.seasonality.name}_lag_{w}" for w in range(1, self.window + 1)] + + segment_names = context.columns.get_level_values("segment") + column_names = pd.MultiIndex.from_product([segment_names, components_names], names=("segment", "feature")) + + target_components_df = pd.DataFrame(data=raw_components, columns=column_names, index=index[start_idx:end_idx]) + + return target_components_df + def _make_predictions( self, result_template: pd.DataFrame, context: pd.DataFrame, 
prediction_size: int ) -> np.ndarray: @@ -165,10 +220,7 @@ def _make_predictions( end_idx = len(result_template) for i in range(start_idx, end_idx): for w in range(1, self.window + 1): - if self.seasonality == SeasonalityMode.month: - prev_date = result_template.index[i] - pd.DateOffset(months=w) - elif self.seasonality == SeasonalityMode.year: - prev_date = result_template.index[i] - pd.DateOffset(years=w) + prev_date = self._get_previous_date(date=result_template.index[i], offset=w) result_template.loc[index[i]] += context.loc[prev_date] @@ -177,7 +229,9 @@ def _make_predictions( result_values = result_template.values[-prediction_size:] return result_values - def _forecast(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: + def _forecast( + self, df: pd.DataFrame, prediction_size: int, return_components: bool = False + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """Make autoregressive forecasts on a wide dataframe.""" context_beginning = self._get_context_beginning( df=df, prediction_size=prediction_size, seasonality=self.seasonality, window=self.window @@ -200,7 +254,14 @@ def _forecast(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: df = df.iloc[-prediction_size:] y_pred = result_values[-prediction_size:] df.loc[:, pd.IndexSlice[:, "target"]] = y_pred - return df + + target_components_df = None + if return_components: + target_components_df = self._make_prediction_components( + result_template=result_template, context=result_template, prediction_size=prediction_size + ) + + return df, target_components_df def forecast(self, ts: TSDataset, prediction_size: int, return_components: bool = False) -> TSDataset: """Make autoregressive forecasts. 
@@ -231,16 +292,22 @@ def forecast(self, ts: TSDataset, prediction_size: int, return_components: bool ValueError: if forecast context contains NaNs """ - if return_components: - raise NotImplementedError("This mode isn't currently implemented!") self._validate_fitted() df = ts.to_pandas() - new_df = self._forecast(df=df, prediction_size=prediction_size) + new_df, target_components_df = self._forecast( + df=df, prediction_size=prediction_size, return_components=return_components + ) ts.df = new_df + + if return_components: + ts.add_target_components(target_components_df=target_components_df) + return ts - def _predict(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: + def _predict( + self, df: pd.DataFrame, prediction_size: int, return_components: bool = False + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """Make predictions on a wide dataframe using true values as autoregression context.""" context_beginning = self._get_context_beginning( df=df, prediction_size=prediction_size, seasonality=self.seasonality, window=self.window @@ -261,7 +328,14 @@ def _predict(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: df = df.iloc[-prediction_size:] y_pred = result_values[-prediction_size:] df.loc[:, pd.IndexSlice[:, "target"]] = y_pred - return df + + target_components_df = None + if return_components: + target_components_df = self._make_prediction_components( + result_template=result_template, context=context, prediction_size=prediction_size + ) + + return df, target_components_df def predict(self, ts: TSDataset, prediction_size: int, return_components: bool = False) -> TSDataset: """Make predictions using true values as autoregression context (teacher forcing). 
@@ -292,13 +366,17 @@ def predict(self, ts: TSDataset, prediction_size: int, return_components: bool = ValueError: if forecast context contains NaNs """ - if return_components: - raise NotImplementedError("This mode isn't currently implemented!") self._validate_fitted() df = ts.to_pandas() - new_df = self._predict(df=df, prediction_size=prediction_size) + new_df, target_components_df = self._predict( + df=df, prediction_size=prediction_size, return_components=return_components + ) ts.df = new_df + + if return_components: + ts.add_target_components(target_components_df=target_components_df) + return ts diff --git a/tests/test_models/test_simple_models.py b/tests/test_models/test_simple_models.py index 3a31c4855..549f5acac 100644 --- a/tests/test_models/test_simple_models.py +++ b/tests/test_models/test_simple_models.py @@ -55,6 +55,26 @@ def df(): return tsds +@pytest.fixture() +def long_periodic_ts(): + history = 400 + + df1 = pd.DataFrame() + df1["target"] = np.sin(np.arange(history)) + df1["segment"] = "A" + df1["timestamp"] = pd.date_range(start="2020-01-01", periods=history) + + df2 = df1.copy() + df2["segment"] = "B" + df2["target"] *= 4 + + df = pd.concat([df1, df2]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + ts = TSDataset(df, freq="D") + + return ts + + @pytest.mark.parametrize("model", [SeasonalMovingAverageModel, NaiveModel, MovingAverageModel]) def test_sma_model_forecast(simple_df, model): _check_forecast(ts=simple_df, model=model(), horizon=7) @@ -767,6 +787,7 @@ def test_sma_model_predict_components_sum_up_to_target(example_tsds, method_name def test_sma_model_predict_components_correct( simple_df, method_name, expected_values, window=1, seasonality=2, horizon=3 ): + """Testing that correct lag used as a component.""" model = SeasonalMovingAverageModel(window=window, seasonality=seasonality) model.fit(simple_df) to_call = getattr(model, method_name) @@ -774,3 +795,71 @@ def test_sma_model_predict_components_correct( target_components_df 
= forecast.get_target_components() np.testing.assert_allclose(target_components_df.values, expected_values) + + +@pytest.mark.parametrize("method", ("predict", "forecast")) +@pytest.mark.parametrize( + "window,seasonality,expected_components_names", + ( + (1, "month", ["target_component_month_lag_1"]), + (3, "month", ["target_component_month_lag_1", "target_component_month_lag_2", "target_component_month_lag_3"]), + (1, "year", ["target_component_year_lag_1"]), + ), +) +def test_deadline_ma_predict_components_correct_names( + long_periodic_ts, method, window, seasonality, expected_components_names, horizon=10 +): + model = DeadlineMovingAverageModel(window=window, seasonality=seasonality) + model.fit(ts=long_periodic_ts) + + method_to_call = getattr(model, method) + forecast = method_to_call(ts=long_periodic_ts, prediction_size=horizon, return_components=True) + + assert sorted(forecast.target_components_names) == sorted(expected_components_names) + + +@pytest.mark.parametrize("method", ("predict", "forecast")) +@pytest.mark.parametrize( + "window,seasonality", + ( + (1, "month"), + (3, "month"), + (1, "year"), + ), +) +def test_deadline_ma_predict_components_sum_up_to_target(long_periodic_ts, method, window, seasonality, horizon=10): + model = DeadlineMovingAverageModel(window=window, seasonality=seasonality) + model.fit(ts=long_periodic_ts) + + method_to_call = getattr(model, method) + forecast = method_to_call(ts=long_periodic_ts, prediction_size=horizon, return_components=True) + + target = forecast.to_pandas(features=["target"]) + components = forecast.get_target_components() + + np.testing.assert_allclose(target.values, components.sum(axis=1, level="segment").values) + + +@pytest.mark.parametrize( + "method_name, out_of_sample_pred", + (("forecast", [-0.75100715, -3.00402861]), ("predict", [-0.42019439, -1.68077756])), +) +def test_deadline_ma_predict_components_correct( + long_periodic_ts, method_name, out_of_sample_pred, window=1, seasonality="month", 
horizon=32 +): + """Testing that correct lag used as a component.""" + predict_lags = long_periodic_ts.df.values[-63:-32] + + model = DeadlineMovingAverageModel(window=window, seasonality=seasonality) + model.fit(long_periodic_ts) + + to_call = getattr(model, method_name) + forecast = to_call(ts=long_periodic_ts, prediction_size=horizon, return_components=True) + + target_components_df = forecast.get_target_components() + + # testing in-sample prediction + np.testing.assert_allclose(target_components_df.values[:-1], predict_lags) + + # testing out-of-sample prediction + np.testing.assert_allclose(target_components_df.values[-1], out_of_sample_pred)