tinkoff-ai · brsnw250 · Jun 6, 2023 · Jun 1, 2023 · Jun 1, 2023 · Jun 2, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,7 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Notebook `forecast_interpretation.ipynb` with forecast decomposition ([#1220](https://github.com/tinkoff-ai/etna/pull/1220))
 - Exogenous variables shift transform `ExogShiftTransform`([#1254](https://github.com/tinkoff-ai/etna/pull/1254))
 - Parameter `start_timestamp` to forecast CLI command ([#1265](https://github.com/tinkoff-ai/etna/pull/1265))
-- 
+- Function `estimate_max_n_folds` for folds number estimation ([#1279](https://github.com/tinkoff-ai/etna/pull/1279))
+-
 ### Changed
 - Set the default value of `final_model` to `LinearRegression(positive=True)` in the constructor of `StackingEnsemble` ([#1238](https://github.com/tinkoff-ai/etna/pull/1238))
 - Add microseconds to `FileLogger`'s directory name ([#1264](https://github.com/tinkoff-ai/etna/pull/1264))

diff --git a/etna/commands/utils.py b/etna/commands/utils.py
@@ -0,0 +1,111 @@
+from enum import Enum
+from math import floor
+from typing import Literal
+from typing import Optional
+from typing import Union
+
+from etna.datasets import TSDataset
+from etna.pipeline import Pipeline
+
+
+class MethodsWithFolds(str, Enum):
+    """Enum for methods that use `n_folds` argument."""
+
+    forecast = "forecast"
+    backtest = "backtest"
+
+    @classmethod
+    def _missing_(cls, value):
+        raise ValueError(
+            f"{value} is not a valid method name. Only {', '.join([repr(m.value) for m in cls])} are allowed"
+        )
+
+
+def _estimate_n_folds(num_points: int, horizon: int, stride: int, context_size: int) -> int:
+    """Estimate number of folds."""
+    if num_points < horizon + context_size:
+        raise ValueError("Not enough data points!")
+
+    res = (num_points - horizon + stride - context_size) / stride
+    return floor(res)
+
+
+def _max_n_folds_forecast(pipeline: Pipeline, context_size: int, ts: Optional[TSDataset] = None) -> int:
+    """Estimate max n_folds for forecast method."""
+    if ts is None:
+        if pipeline.ts is None:
+            raise ValueError(
+                "There is no ts for forecast method! Pass ts into function or make sure that pipeline is fitted."
+            )
+
+        else:
+            ts = pipeline.ts
+
+    num_points = len(ts.index)
+    horizon = pipeline.horizon
+
+    return _estimate_n_folds(num_points=num_points, horizon=horizon, stride=horizon, context_size=context_size)
+
+
+def _max_n_folds_backtest(pipeline: Pipeline, context_size: int, ts: TSDataset, **method_kwargs) -> int:
+    """Estimate max n_folds for backtest method."""
+    # process backtest with intervals case
+    backtest_with_intervals = "forecast_params" in method_kwargs and method_kwargs["forecast_params"].get(
+        "prediction_interval", False
+    )
+
+    if backtest_with_intervals:
+        raise NotImplementedError("Number of folds estimation for backtest with intervals is not implemented!")
+
+    num_points = len(ts.index)
+
+    horizon = pipeline.horizon
+    stride = method_kwargs.get("stride", horizon)
+
+    return _estimate_n_folds(num_points=num_points, horizon=horizon, stride=stride, context_size=context_size)
+
+
+def estimate_max_n_folds(
+    pipeline: Pipeline,
+    method_name: Union[Literal["forecast"], Literal["backtest"]],
+    context_size: int,
+    ts: Optional[TSDataset] = None,
+    **method_kwargs,
+) -> int:
+    """Estimate number of folds using provided data and pipeline configuration.
+    This function helps to estimate maximum number of folds that can be used when performing
+    forecast with intervals or pipeline backtest.
+
+    Parameters
+    ----------
+    pipeline:
+        Pipeline for which to estimate number of folds.
+    method_name:
+       Method name for which to estimate number of folds.
+    context_size:
+       Minimum number of points for pipeline to be estimated.
+    ts:
+       Dataset which will be used for estimation.
+    method_kwargs:
+       Additional arguments for methods that impact number of folds.
+
+    Returns
+    -------
+    :
+        Number of folds.
+    """
+    if context_size < 1:
+        raise ValueError("Pipeline `context_size` parameter must be positive integer!")
+
+    if ts is None and method_name != MethodsWithFolds.forecast:
+        raise ValueError("Parameter `ts` is required when estimating for backtest method")
+
+    method = MethodsWithFolds(method_name)
+
+    if method == MethodsWithFolds.forecast:
+        n_folds = _max_n_folds_forecast(pipeline=pipeline, context_size=context_size, ts=ts)
+
+    else:
+        n_folds = _max_n_folds_backtest(pipeline=pipeline, context_size=context_size, ts=ts, **method_kwargs)  # type: ignore
+
+    return n_folds
diff --git a/tests/test_commands/conftest.py b/tests/test_commands/conftest.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import pytest
 
+from etna.datasets import TSDataset
 from etna.datasets import generate_ar_df
 
 
@@ -171,3 +172,10 @@ def start_timestamp_forecast_omegaconf_path():
     tmp.flush()
     yield Path(tmp.name)
     tmp.close()
+
+
+@pytest.fixture
+def empty_ts():
+    df = pd.DataFrame({"segment": [], "timestamp": [], "target": []})
+    df = TSDataset.to_dataset(df=df)
+    return TSDataset(df=df, freq="D")
diff --git a/tests/test_commands/test_utils.py b/tests/test_commands/test_utils.py
@@ -0,0 +1,256 @@
+from copy import deepcopy
+
+import pytest
+
+from etna.commands.utils import _estimate_n_folds
+from etna.commands.utils import _max_n_folds_backtest
+from etna.commands.utils import _max_n_folds_forecast
+from etna.commands.utils import estimate_max_n_folds
+from etna.metrics import MAE
+from etna.models import HoltWintersModel
+from etna.models import LinearPerSegmentModel
+from etna.models import SeasonalMovingAverageModel
+from etna.pipeline import Pipeline
+from etna.transforms import DensityOutliersTransform
+from etna.transforms import DifferencingTransform
+from etna.transforms import LagTransform
+from etna.transforms import MeanTransform
+
+
+def run_estimate_max_n_folds_forecast_test(pipeline, context_size, ts, expected):
+    pipeline.fit(ts=ts)
+
+    n_folds = estimate_max_n_folds(pipeline=pipeline, method_name="forecast", context_size=context_size)
+
+    assert n_folds == expected
+    pipeline.forecast(prediction_interval=True, n_folds=n_folds)
+
+
+def run_estimate_max_n_folds_backtest_test(pipeline, context_size, ts, stride, expected):
+    n_folds = estimate_max_n_folds(
+        pipeline=pipeline, ts=ts, method_name="backtest", stride=stride, context_size=context_size
+    )
+
+    assert n_folds == expected
+    pipeline.backtest(ts=ts, metrics=[MAE()], n_folds=n_folds, stride=stride)
+
+
+@pytest.fixture
+def pipeline_with_context(request):
+    if hasattr(request, "param"):
+        horizon = request.param["horizon"]
+        window = request.param["window"]
+    else:
+        horizon = 1
+        window = 1
+
+    pipeline = Pipeline(transforms=[], model=SeasonalMovingAverageModel(seasonality=1, window=window), horizon=horizon)
+    return pipeline
+
+
+@pytest.fixture
+def pipeline_without_context(request):
+    horizon = request.param if hasattr(request, "param") else 1
+    pipeline = Pipeline(transforms=[], model=HoltWintersModel(), horizon=horizon)
+    return pipeline
+
+
+@pytest.fixture
+def pipeline_with_transforms():
+    transforms = [
+        LagTransform(in_column="target", lags=[14, 17]),
+        DifferencingTransform(in_column="target"),
+        MeanTransform(in_column="target", window=7),
+        DensityOutliersTransform(in_column="target"),
+    ]
+
+    pipeline = Pipeline(transforms=transforms, model=LinearPerSegmentModel(), horizon=14)
+    return pipeline
+
+
+@pytest.mark.parametrize(
+    "num_points, horizon, stride, context_size, expected",
+    (
+        (13, 2, 2, 2, 5),
+        (13, 2, 1, 2, 10),
+        (13, 2, 2, 1, 6),
+        (13, 2, 1, 1, 11),
+        (13, 1, 1, 1, 12),
+        (13, 4, 4, 6, 1),
+        (13, 4, 1, 6, 4),
+        (10, 5, 1, 5, 1),
+        (10, 5, 5, 5, 1),
+    ),
+)
+def test_estimate_n_folds(num_points, horizon, stride, context_size, expected):
+    res = _estimate_n_folds(num_points=num_points, horizon=horizon, stride=stride, context_size=context_size)
+    assert res == expected
+
+
+def test_estimate_n_folds_not_enough_points(num_points=10, horizon=7, stride=1, context_size=5):
+    with pytest.raises(ValueError, match="Not enough data points!"):
+        _ = _estimate_n_folds(num_points=num_points, horizon=horizon, stride=stride, context_size=context_size)
+
+
+def test_estimate_n_folds_forecast_no_ts(pipeline_without_context):
+    with pytest.raises(ValueError, match="There is no ts for forecast method!"):
+        _ = _max_n_folds_forecast(pipeline=pipeline_without_context, ts=None, context_size=1)
+
+
+def test_estimate_n_folds_backtest_no_ts(pipeline_without_context):
+    with pytest.raises(ValueError, match="Parameter `ts` is required when estimating for backtest method"):
+        _ = estimate_max_n_folds(pipeline=pipeline_without_context, method_name="backtest", context_size=1)
+
+
+def test_estimate_n_folds_backtest_intervals_error(pipeline_without_context, example_tsds):
+    with pytest.raises(
+        NotImplementedError, match="Number of folds estimation for backtest with intervals is not implemented!"
+    ):
+        _ = _max_n_folds_backtest(
+            pipeline=pipeline_without_context,
+            ts=example_tsds,
+            forecast_params={"prediction_interval": True},
+            context_size=1,
+        )
+
+
+def test_estimate_max_n_folds_invalid_method_name(pipeline_without_context, example_tsds, method_name="fit"):
+    with pytest.raises(ValueError, match="fit is not a valid method name."):
+        _ = estimate_max_n_folds(
+            pipeline=pipeline_without_context, ts=example_tsds, method_name=method_name, context_size=1
+        )
+
+
+def test_estimate_max_n_folds_empty_ts(pipeline_without_context, empty_ts):
+    with pytest.raises(ValueError, match="Not enough data points!"):
+        _ = estimate_max_n_folds(pipeline=pipeline_without_context, ts=empty_ts, method_name="forecast", context_size=1)
+
+
+def test_estimate_max_n_folds_negative_context(pipeline_without_context, example_tsds):
+    with pytest.raises(ValueError, match="Pipeline `context_size` parameter must be positive integer!"):
+        _ = estimate_max_n_folds(
+            pipeline=pipeline_without_context, ts=example_tsds, method_name="forecast", context_size=-1
+        )
+
+
+def test_estimate_max_n_folds_forecast_with_ts(pipeline_without_context, example_tsds, context_size=3, expected=7):
+    pipeline = pipeline_without_context
+
+    pipeline.fit(ts=example_tsds)
+
+    ts_to_forecast = deepcopy(example_tsds)
+    ts_to_forecast.df = ts_to_forecast.df.iloc[-(context_size + expected) :]
+
+    n_folds = estimate_max_n_folds(
+        pipeline=pipeline, method_name="forecast", ts=ts_to_forecast, context_size=context_size
+    )
+
+    assert n_folds == expected
+    pipeline.forecast(ts=ts_to_forecast, prediction_interval=True, n_folds=n_folds)
+
+
+@pytest.mark.parametrize(
+    "pipeline_without_context,context_size,ts_name,expected",
+    (
+        (1, 3, "example_tsds", 97),
+        (4, 3, "example_tsds", 24),
+        (13, 3, "example_tsds", 7),
+        (97, 3, "example_tsds", 1),
+        (40, 3, "ts_with_different_series_length", 18),
+    ),
+    indirect=["pipeline_without_context"],
+)
+def test_estimate_max_n_folds_forecast_no_context(pipeline_without_context, context_size, ts_name, expected, request):
+    ts = request.getfixturevalue(ts_name)
+    run_estimate_max_n_folds_forecast_test(
+        pipeline=pipeline_without_context, ts=ts, expected=expected, context_size=context_size
+    )
+
+
+@pytest.mark.parametrize(
+    "pipeline_with_context,context_size,ts_name,expected",
+    (
+        ({"horizon": 1, "window": 1}, 1, "example_tsds", 99),
+        ({"horizon": 1, "window": 2}, 2, "example_tsds", 98),
+        ({"horizon": 13, "window": 10}, 10, "example_tsds", 6),
+        ({"horizon": 10, "window": 1}, 1, "ts_with_different_series_length", 74),
+    ),
+    indirect=["pipeline_with_context"],
+)
+def test_estimate_max_n_folds_forecast_with_context(pipeline_with_context, context_size, ts_name, expected, request):
+    ts = request.getfixturevalue(ts_name)
+    run_estimate_max_n_folds_forecast_test(
+        pipeline=pipeline_with_context, context_size=context_size, ts=ts, expected=expected
+    )
+
+
+@pytest.mark.parametrize(
+    "context_size,ts_name,expected",
+    (
+        (18, "example_tsds", 5),
+        (18, "ts_with_different_series_length", 51),
+    ),
+)
+def test_estimate_max_n_folds_forecast_with_transforms(
+    pipeline_with_transforms, context_size, ts_name, expected, request
+):
+    ts = request.getfixturevalue(ts_name)
+    run_estimate_max_n_folds_forecast_test(
+        pipeline=pipeline_with_transforms, ts=ts, expected=expected, context_size=context_size
+    )
+
+
+@pytest.mark.parametrize(
+    "pipeline_without_context,context_size,stride,ts_name,expected",
+    (
+        (4, 3, 8, "example_tsds", 12),
+        (13, 3, 13, "example_tsds", 7),
+        (13, 3, 3, "example_tsds", 29),
+        (97, 3, 3, "example_tsds", 1),
+        (40, 3, 60, "ts_with_different_series_length", 12),
+    ),
+    indirect=["pipeline_without_context"],
+)
+def test_estimate_max_n_folds_backtest_no_context(
+    pipeline_without_context, context_size, stride, ts_name, expected, request
+):
+    ts = request.getfixturevalue(ts_name)
+    run_estimate_max_n_folds_backtest_test(
+        pipeline=pipeline_without_context, context_size=context_size, ts=ts, stride=stride, expected=expected
+    )
+
+
+@pytest.mark.parametrize(
+    "pipeline_with_context,context_size,stride,ts_name,expected",
+    (
+        ({"horizon": 1, "window": 1}, 1, 8, "example_tsds", 13),
+        ({"horizon": 5, "window": 8}, 8, 13, "example_tsds", 7),
+        ({"horizon": 13, "window": 7}, 7, 3, "example_tsds", 27),
+        ({"horizon": 13, "window": 60}, 60, 40, "ts_with_different_series_length", 17),
+    ),
+    indirect=["pipeline_with_context"],
+)
+def test_estimate_max_n_folds_backtest_with_context(
+    pipeline_with_context, context_size, stride, ts_name, expected, request
+):
+    ts = request.getfixturevalue(ts_name)
+    run_estimate_max_n_folds_backtest_test(
+        pipeline=pipeline_with_context, context_size=context_size, ts=ts, stride=stride, expected=expected
+    )
+
+
+@pytest.mark.parametrize(
+    "context_size,stride,ts_name,expected",
+    (
+        (18, 1, "example_tsds", 69),
+        (18, 14, "example_tsds", 5),
+        (18, 60, "ts_with_different_series_length", 12),
+    ),
+)
+def test_estimate_max_n_folds_backtest_with_transforms(
+    pipeline_with_transforms, context_size, stride, ts_name, expected, request
+):
+    ts = request.getfixturevalue(ts_name)
+    run_estimate_max_n_folds_backtest_test(
+        pipeline=pipeline_with_transforms, context_size=context_size, ts=ts, stride=stride, expected=expected
+    )