Skip to content

Folds number estimation #1279

Merged
merged 11 commits into from
Jun 6, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Notebook `forecast_interpretation.ipynb` with forecast decomposition ([#1220](https://github.com/tinkoff-ai/etna/pull/1220))
- Exogenous variables shift transform `ExogShiftTransform` ([#1254](https://github.com/tinkoff-ai/etna/pull/1254))
- Parameter `start_timestamp` to forecast CLI command ([#1265](https://github.com/tinkoff-ai/etna/pull/1265))
-
- Function `estimate_max_n_folds` for folds number estimation ([#1279](https://github.com/tinkoff-ai/etna/pull/1279))
-
### Changed
- Set the default value of `final_model` to `LinearRegression(positive=True)` in the constructor of `StackingEnsemble` ([#1238](https://github.com/tinkoff-ai/etna/pull/1238))
- Add microseconds to `FileLogger`'s directory name ([#1264](https://github.com/tinkoff-ai/etna/pull/1264))
Expand Down
111 changes: 111 additions & 0 deletions etna/commands/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from enum import Enum
from math import floor
from typing import Literal
from typing import Optional
from typing import Union

from etna.datasets import TSDataset
from etna.pipeline import Pipeline


class MethodsWithFolds(str, Enum):
    """Names of the methods that take an ``n_folds`` argument."""

    forecast = "forecast"
    backtest = "backtest"

    @classmethod
    def _missing_(cls, value):
        """Reject an unknown method name with a message listing the allowed names."""
        allowed = ", ".join(repr(member.value) for member in cls)
        raise ValueError(f"{value} is not a valid method name. Only {allowed} are allowed")


def _estimate_n_folds(num_points: int, horizon: int, stride: int, context_size: int) -> int:
"""Estimate number of folds."""
if num_points < horizon + context_size:
raise ValueError("Not enough data points!")

res = (num_points - horizon + stride - context_size) / stride
return floor(res)


def _max_n_folds_forecast(pipeline: Pipeline, context_size: int, ts: Optional[TSDataset] = None) -> int:
    """Estimate max n_folds for forecast method.

    When ``ts`` is not given, the dataset stored on the pipeline (``pipeline.ts``) is used.
    The pipeline horizon serves as both the fold size and the stride.

    Raises
    ------
    ValueError:
        If ``ts`` is not given and ``pipeline.ts`` is ``None``.
    """
    dataset = pipeline.ts if ts is None else ts
    if dataset is None:
        raise ValueError(
            "There is no ts for forecast method! Pass ts into function or make sure that pipeline is fitted."
        )

    n_points = len(dataset.index)
    step = pipeline.horizon
    return _estimate_n_folds(num_points=n_points, horizon=step, stride=step, context_size=context_size)


def _max_n_folds_backtest(pipeline: Pipeline, context_size: int, ts: TSDataset, **method_kwargs) -> int:
    """Estimate max n_folds for backtest method.

    Only two entries of ``method_kwargs`` are inspected:

    * ``forecast_params`` -- a mapping; a truthy ``"prediction_interval"`` entry is not supported.
    * ``stride`` -- distance between folds; defaults to the pipeline horizon.

    An explicit ``None`` for either entry is treated the same as leaving it out.
    NOTE(review): this is meant to let callers forward the exact kwargs of a
    ``backtest`` call -- confirm against the ``backtest`` defaults.

    Raises
    ------
    NotImplementedError:
        If prediction intervals are requested via ``forecast_params``.
    """
    # `or {}` covers both a missing key and an explicit None.
    forecast_params = method_kwargs.get("forecast_params") or {}

    # process backtest with intervals case
    if forecast_params.get("prediction_interval", False):
        raise NotImplementedError("Number of folds estimation for backtest with intervals is not implemented!")

    num_points = len(ts.index)

    horizon = pipeline.horizon
    stride = method_kwargs.get("stride")
    if stride is None:
        stride = horizon

    return _estimate_n_folds(num_points=num_points, horizon=horizon, stride=stride, context_size=context_size)


def estimate_max_n_folds(
    pipeline: Pipeline,
    method_name: Union[Literal["forecast"], Literal["backtest"]],
    context_size: int,
    ts: Optional[TSDataset] = None,
    **method_kwargs,
) -> int:
    """Estimate number of folds using provided data and pipeline configuration.

    This function helps to estimate maximum number of folds that can be used when performing
    forecast with intervals or pipeline backtest.

    Parameters
    ----------
    pipeline:
        Pipeline for which to estimate number of folds.
    method_name:
        Method name for which to estimate number of folds.
    context_size:
        Minimum number of points for pipeline to be estimated.
    ts:
        Dataset which will be used for estimation.
    method_kwargs:
        Additional arguments for methods that impact number of folds.

    Returns
    -------
    :
        Number of folds.

    Raises
    ------
    ValueError:
        If ``context_size`` is less than 1.
    ValueError:
        If ``method_name`` is not one of the supported method names.
    ValueError:
        If ``ts`` is not passed when estimating for the backtest method.
    """
    if context_size < 1:
        raise ValueError("Pipeline `context_size` parameter must be positive integer!")

    # Validate the name first, so an unknown method is reported as such
    # and not as a missing `ts` for backtest.
    method = MethodsWithFolds(method_name)

    if method == MethodsWithFolds.forecast:
        return _max_n_folds_forecast(pipeline=pipeline, context_size=context_size, ts=ts)

    if ts is None:
        raise ValueError("Parameter `ts` is required when estimating for backtest method")

    return _max_n_folds_backtest(pipeline=pipeline, context_size=context_size, ts=ts, **method_kwargs)
8 changes: 8 additions & 0 deletions tests/test_commands/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.datasets import generate_ar_df


Expand Down Expand Up @@ -171,3 +172,10 @@ def start_timestamp_forecast_omegaconf_path():
tmp.flush()
yield Path(tmp.name)
tmp.close()


@pytest.fixture
def empty_ts():
    """Return a daily-frequency ``TSDataset`` built from a frame with no rows."""
    long_df = pd.DataFrame({"segment": [], "timestamp": [], "target": []})
    wide_df = TSDataset.to_dataset(df=long_df)
    return TSDataset(df=wide_df, freq="D")
256 changes: 256 additions & 0 deletions tests/test_commands/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
from copy import deepcopy

import pytest

from etna.commands.utils import _estimate_n_folds
from etna.commands.utils import _max_n_folds_backtest
from etna.commands.utils import _max_n_folds_forecast
from etna.commands.utils import estimate_max_n_folds
from etna.metrics import MAE
from etna.models import HoltWintersModel
from etna.models import LinearPerSegmentModel
from etna.models import SeasonalMovingAverageModel
from etna.pipeline import Pipeline
from etna.transforms import DensityOutliersTransform
from etna.transforms import DifferencingTransform
from etna.transforms import LagTransform
from etna.transforms import MeanTransform


def run_estimate_max_n_folds_forecast_test(pipeline, context_size, ts, expected):
    """Fit ``pipeline`` on ``ts``, check the forecast estimate, then forecast with it.

    The closing ``forecast`` call has no assertion of its own: it fails the test
    only if the pipeline raises for the estimated number of folds.
    """
    pipeline.fit(ts=ts)

    estimated = estimate_max_n_folds(pipeline=pipeline, method_name="forecast", context_size=context_size)
    assert estimated == expected

    pipeline.forecast(prediction_interval=True, n_folds=estimated)


def run_estimate_max_n_folds_backtest_test(pipeline, context_size, ts, stride, expected):
    """Check the backtest estimate for ``ts`` and ``stride``, then backtest with it.

    The closing ``backtest`` call has no assertion of its own: it fails the test
    only if the pipeline raises for the estimated number of folds.
    """
    estimated = estimate_max_n_folds(
        pipeline=pipeline,
        ts=ts,
        method_name="backtest",
        stride=stride,
        context_size=context_size,
    )
    assert estimated == expected

    pipeline.backtest(ts=ts, metrics=[MAE()], n_folds=estimated, stride=stride)


@pytest.fixture
def pipeline_with_context(request):
    """Pipeline with a ``SeasonalMovingAverageModel`` and no transforms.

    With indirect parametrization ``request.param`` is a dict with ``"horizon"`` and
    ``"window"`` keys; without it both values default to 1.
    """
    settings = getattr(request, "param", {"horizon": 1, "window": 1})
    horizon, window = settings["horizon"], settings["window"]

    model = SeasonalMovingAverageModel(seasonality=1, window=window)
    return Pipeline(transforms=[], model=model, horizon=horizon)


@pytest.fixture
def pipeline_without_context(request):
    """Pipeline with a ``HoltWintersModel`` and no transforms.

    With indirect parametrization ``request.param`` is the horizon; otherwise the horizon is 1.
    """
    horizon = getattr(request, "param", 1)
    return Pipeline(transforms=[], model=HoltWintersModel(), horizon=horizon)


@pytest.fixture
def pipeline_with_transforms():
    """Pipeline with lag, differencing, mean and outlier transforms, a linear model and horizon 14."""
    target_transforms = [
        LagTransform(in_column="target", lags=[14, 17]),
        DifferencingTransform(in_column="target"),
        MeanTransform(in_column="target", window=7),
        DensityOutliersTransform(in_column="target"),
    ]
    return Pipeline(transforms=target_transforms, model=LinearPerSegmentModel(), horizon=14)


@pytest.mark.parametrize(
    "num_points, horizon, stride, context_size, expected",
    [
        (13, 2, 2, 2, 5),
        (13, 2, 1, 2, 10),
        (13, 2, 2, 1, 6),
        (13, 2, 1, 1, 11),
        (13, 1, 1, 1, 12),
        (13, 4, 4, 6, 1),
        (13, 4, 1, 6, 4),
        (10, 5, 1, 5, 1),
        (10, 5, 5, 5, 1),
    ],
)
def test_estimate_n_folds(num_points, horizon, stride, context_size, expected):
    """The private estimator returns the expected fold count for plain integer inputs."""
    estimated = _estimate_n_folds(
        num_points=num_points,
        horizon=horizon,
        stride=stride,
        context_size=context_size,
    )
    assert estimated == expected


def test_estimate_n_folds_not_enough_points(num_points=10, horizon=7, stride=1, context_size=5):
    """Fewer points than ``horizon + context_size`` must be rejected."""
    estimator_kwargs = dict(num_points=num_points, horizon=horizon, stride=stride, context_size=context_size)

    with pytest.raises(ValueError, match="Not enough data points!"):
        _estimate_n_folds(**estimator_kwargs)


def test_estimate_n_folds_forecast_no_ts(pipeline_without_context):
    """An unfitted pipeline and no explicit ``ts`` leave the forecast estimate without data."""
    expected_message = "There is no ts for forecast method!"

    with pytest.raises(ValueError, match=expected_message):
        _max_n_folds_forecast(pipeline=pipeline_without_context, ts=None, context_size=1)


def test_estimate_n_folds_backtest_no_ts(pipeline_without_context):
    """Estimating for backtest without passing ``ts`` must be rejected."""
    expected_message = "Parameter `ts` is required when estimating for backtest method"

    with pytest.raises(ValueError, match=expected_message):
        estimate_max_n_folds(pipeline=pipeline_without_context, method_name="backtest", context_size=1)


def test_estimate_n_folds_backtest_intervals_error(pipeline_without_context, example_tsds):
    """Requesting prediction intervals through ``forecast_params`` is not supported for backtest."""
    expected_message = "Number of folds estimation for backtest with intervals is not implemented!"
    interval_params = {"prediction_interval": True}

    with pytest.raises(NotImplementedError, match=expected_message):
        _max_n_folds_backtest(
            pipeline=pipeline_without_context,
            ts=example_tsds,
            forecast_params=interval_params,
            context_size=1,
        )


def test_estimate_max_n_folds_invalid_method_name(pipeline_without_context, example_tsds, method_name="fit"):
    """A method name other than the supported ones must be rejected."""
    expected_message = "fit is not a valid method name."

    with pytest.raises(ValueError, match=expected_message):
        estimate_max_n_folds(
            pipeline=pipeline_without_context,
            ts=example_tsds,
            method_name=method_name,
            context_size=1,
        )


def test_estimate_max_n_folds_empty_ts(pipeline_without_context, empty_ts):
    """A dataset without any points cannot hold a single fold."""
    expected_message = "Not enough data points!"

    with pytest.raises(ValueError, match=expected_message):
        estimate_max_n_folds(
            pipeline=pipeline_without_context,
            ts=empty_ts,
            method_name="forecast",
            context_size=1,
        )


def test_estimate_max_n_folds_negative_context(pipeline_without_context, example_tsds):
    """A negative ``context_size`` must be rejected."""
    expected_message = "Pipeline `context_size` parameter must be positive integer!"

    with pytest.raises(ValueError, match=expected_message):
        estimate_max_n_folds(
            pipeline=pipeline_without_context,
            ts=example_tsds,
            method_name="forecast",
            context_size=-1,
        )


Mr-Geekman marked this conversation as resolved.
Show resolved Hide resolved
def test_estimate_max_n_folds_forecast_with_ts(pipeline_without_context, example_tsds, context_size=3, expected=7):
    """An explicitly passed ``ts`` is used for the forecast estimate.

    The pipeline is fitted on the full dataset, while the estimate and the forecast use a
    copy cut down to its last ``context_size + expected`` rows.
    """
    pipeline_without_context.fit(ts=example_tsds)

    tail_length = context_size + expected
    ts_to_forecast = deepcopy(example_tsds)
    ts_to_forecast.df = ts_to_forecast.df.iloc[-tail_length:]

    estimated = estimate_max_n_folds(
        pipeline=pipeline_without_context,
        method_name="forecast",
        ts=ts_to_forecast,
        context_size=context_size,
    )
    assert estimated == expected

    pipeline_without_context.forecast(ts=ts_to_forecast, prediction_interval=True, n_folds=estimated)


@pytest.mark.parametrize(
    "pipeline_without_context,context_size,ts_name,expected",
    [
        (1, 3, "example_tsds", 97),
        (4, 3, "example_tsds", 24),
        (13, 3, "example_tsds", 7),
        (97, 3, "example_tsds", 1),
        (40, 3, "ts_with_different_series_length", 18),
    ],
    indirect=["pipeline_without_context"],
)
def test_estimate_max_n_folds_forecast_no_context(pipeline_without_context, context_size, ts_name, expected, request):
    """Forecast estimate for a ``HoltWintersModel`` pipeline; the first parameter is the horizon."""
    dataset = request.getfixturevalue(ts_name)

    run_estimate_max_n_folds_forecast_test(
        pipeline=pipeline_without_context,
        ts=dataset,
        expected=expected,
        context_size=context_size,
    )


@pytest.mark.parametrize(
    "pipeline_with_context,context_size,ts_name,expected",
    [
        ({"horizon": 1, "window": 1}, 1, "example_tsds", 99),
        ({"horizon": 1, "window": 2}, 2, "example_tsds", 98),
        ({"horizon": 13, "window": 10}, 10, "example_tsds", 6),
        ({"horizon": 10, "window": 1}, 1, "ts_with_different_series_length", 74),
    ],
    indirect=["pipeline_with_context"],
)
def test_estimate_max_n_folds_forecast_with_context(pipeline_with_context, context_size, ts_name, expected, request):
    """Forecast estimate for a ``SeasonalMovingAverageModel`` pipeline built from the parametrized dict."""
    dataset = request.getfixturevalue(ts_name)

    run_estimate_max_n_folds_forecast_test(
        pipeline=pipeline_with_context,
        context_size=context_size,
        ts=dataset,
        expected=expected,
    )


@pytest.mark.parametrize(
    "context_size,ts_name,expected",
    [
        (18, "example_tsds", 5),
        (18, "ts_with_different_series_length", 51),
    ],
)
def test_estimate_max_n_folds_forecast_with_transforms(
    pipeline_with_transforms, context_size, ts_name, expected, request
):
    """Forecast estimate for the pipeline that carries transforms."""
    dataset = request.getfixturevalue(ts_name)

    run_estimate_max_n_folds_forecast_test(
        pipeline=pipeline_with_transforms,
        ts=dataset,
        expected=expected,
        context_size=context_size,
    )


@pytest.mark.parametrize(
    "pipeline_without_context,context_size,stride,ts_name,expected",
    [
        (4, 3, 8, "example_tsds", 12),
        (13, 3, 13, "example_tsds", 7),
        (13, 3, 3, "example_tsds", 29),
        (97, 3, 3, "example_tsds", 1),
        (40, 3, 60, "ts_with_different_series_length", 12),
    ],
    indirect=["pipeline_without_context"],
)
def test_estimate_max_n_folds_backtest_no_context(
    pipeline_without_context, context_size, stride, ts_name, expected, request
):
    """Backtest estimate for a ``HoltWintersModel`` pipeline; the first parameter is the horizon."""
    dataset = request.getfixturevalue(ts_name)

    run_estimate_max_n_folds_backtest_test(
        pipeline=pipeline_without_context,
        context_size=context_size,
        ts=dataset,
        stride=stride,
        expected=expected,
    )


@pytest.mark.parametrize(
    "pipeline_with_context,context_size,stride,ts_name,expected",
    [
        ({"horizon": 1, "window": 1}, 1, 8, "example_tsds", 13),
        ({"horizon": 5, "window": 8}, 8, 13, "example_tsds", 7),
        ({"horizon": 13, "window": 7}, 7, 3, "example_tsds", 27),
        ({"horizon": 13, "window": 60}, 60, 40, "ts_with_different_series_length", 17),
    ],
    indirect=["pipeline_with_context"],
)
def test_estimate_max_n_folds_backtest_with_context(
    pipeline_with_context, context_size, stride, ts_name, expected, request
):
    """Backtest estimate for a ``SeasonalMovingAverageModel`` pipeline built from the parametrized dict."""
    dataset = request.getfixturevalue(ts_name)

    run_estimate_max_n_folds_backtest_test(
        pipeline=pipeline_with_context,
        context_size=context_size,
        ts=dataset,
        stride=stride,
        expected=expected,
    )


@pytest.mark.parametrize(
    "context_size,stride,ts_name,expected",
    [
        (18, 1, "example_tsds", 69),
        (18, 14, "example_tsds", 5),
        (18, 60, "ts_with_different_series_length", 12),
    ],
)
def test_estimate_max_n_folds_backtest_with_transforms(
    pipeline_with_transforms, context_size, stride, ts_name, expected, request
):
    """Backtest estimate for the pipeline that carries transforms."""
    dataset = request.getfixturevalue(ts_name)

    run_estimate_max_n_folds_backtest_test(
        pipeline=pipeline_with_transforms,
        context_size=context_size,
        ts=dataset,
        stride=stride,
        expected=expected,
    )