unit8co · dennisbader · Aug 15, 2023 · Jun 29, 2023 · Jun 29, 2023 · Jun 29, 2023
@@ -10,6 +10,10 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 
 ### For users of the library:
 
+**Improvement**
+- `TimeSeries` with a `RangeIndex` starting at a negative value are now supported by `historical_forecasts`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou).
+- Added a new argument `start_format` to `historical_forecasts()`, `backtest()` and `gridsearch()`; an integer `start` can now be used as a `RangeIndex` position (previously, it had to be a value of the index). [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou).
+
 **Fixed**
 - Fixed a bug in `TimeSeries.from_dataframe()` when using a pandas.DataFrame with `df.columns.name != None`. [#1938](https://github.com/unit8co/darts/pull/1938) by [Antoine Madrona](https://github.com/madtoinou).
 - Fixed a bug in `RegressionEnsembleModel.extreme_lags` when the forecasting models have only covariates lags. [#1942](https://github.com/unit8co/darts/pull/1942) by [Antoine Madrona](https://github.com/madtoinou).

@@ -24,6 +24,11 @@
 from random import sample
 from typing import Any, BinaryIO, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal
+
 import numpy as np
 import pandas as pd
 
@@ -560,6 +565,7 @@ def historical_forecasts(
         num_samples: int = 1,
         train_length: Optional[int] = None,
         start: Optional[Union[pd.Timestamp, float, int]] = None,
+        start_format: Literal["position", "value"] = "value",
         forecast_horizon: int = 1,
         stride: int = 1,
         retrain: Union[bool, int, Callable[..., bool]] = True,
@@ -609,15 +615,14 @@ def historical_forecasts(
             steps available, all steps up until prediction time are used, as in default case. Needs to be at least
             `min_train_series_length`.
         start
-            Optionally, the first point in time at which a prediction is computed for a future time.
-            This parameter supports: ``float``, ``int`` and ``pandas.Timestamp``, and ``None``.
-            If a ``float``, the parameter will be treated as the proportion of the time series
-            that should lie before the first prediction point.
-            If an ``int``, the parameter will be treated as an integer index to the time index of
-            `series` that will be used as first prediction time.
-            If a ``pandas.Timestamp``, the time stamp will be used to determine the first prediction time
-            directly.
-            If ``None``, the first prediction time will automatically be set to:
+            Optionally, the first point in time at which a prediction is computed. This parameter supports:
+            ``float``, ``int``, ``pandas.Timestamp``, and ``None``.
+            If a ``float``, it is the proportion of the time series that should lie before the first prediction point.
+            If an ``int``, it is either the index position of the first prediction point for `series` with a
+            `pd.DatetimeIndex`, or the index value for `series` with a `pd.RangeIndex`. The latter can be changed to
+            the index position with `start_format="position"`.
+            If a ``pandas.Timestamp``, it is the time stamp of the first prediction point.
+            If ``None``, the first prediction point will automatically be set to:
 
             - the first predictable point if `retrain` is ``False``, or `retrain` is a Callable and the first
               predictable point is earlier than the first trainable point.
@@ -628,6 +633,13 @@ def historical_forecasts(
             Note: Raises a ValueError if `start` yields a time outside the time index of `series`.
             Note: If `start` is outside the possible historical forecasting times, will ignore the parameter
             (default behavior with ``None``) and start at the first trainable/predictable point.
+        start_format
+            Defines the `start` format. Only effective when `start` is an integer and `series` is indexed with a
+            `pd.RangeIndex`.
+            If set to 'position', `start` corresponds to the index position of the first predicted point and can range
+            from `-len(series)` to `len(series) - 1` (both inclusive).
+            If set to 'value', `start` corresponds to the index value/label of the first predicted point. Will raise
+            an error if the value is not in `series`' index. Default: ``'value'``
         forecast_horizon
             The forecast horizon for the predictions.
         stride
@@ -798,6 +810,7 @@ def retrain_func(
                 future_covariates=future_covariates,
                 num_samples=num_samples,
                 start=start,
+                start_format=start_format,
                 forecast_horizon=forecast_horizon,
                 stride=stride,
                 overlap_end=overlap_end,
@@ -876,6 +889,7 @@ def retrain_func(
                 forecast_horizon=forecast_horizon,
                 overlap_end=overlap_end,
                 start=start,
+                start_format=start_format,
                 show_warnings=show_warnings,
             )
 
@@ -1030,6 +1044,7 @@ def backtest(
         num_samples: int = 1,
         train_length: Optional[int] = None,
         start: Optional[Union[pd.Timestamp, float, int]] = None,
+        start_format: Literal["position", "value"] = "value",
         forecast_horizon: int = 1,
         stride: int = 1,
         retrain: Union[bool, int, Callable[..., bool]] = True,
@@ -1085,25 +1100,14 @@ def backtest(
             steps available, all steps up until prediction time are used, as in default case. Needs to be at least
             `min_train_series_length`.
         start
-            Optionally, the first point in time at which a prediction is computed for a future time.
-            This parameter supports: ``float``, ``int`` and ``pandas.Timestamp``, and ``None``.
-            If a ``float``, the parameter will be treated as the proportion of the time series
-            that should lie before the first prediction point.
-            If an ``int``, the parameter will be treated as an integer index to the time index of
-            `series` that will be used as first prediction time.
-            If a ``pandas.Timestamp``, the time stamp will be used to determine the first prediction time
-            directly.
-            If ``None``, the first prediction time will automatically be set to:
-                 - the first predictable point if `retrain` is ``False``, or `retrain` is a Callable and the first
-                 predictable point is earlier than the first trainable point.
-
-                 - the first trainable point if `retrain` is ``True`` or ``int`` (given `train_length`),
-                 or `retrain` is a Callable and the first trainable point is earlier than the first predictable point.
-
-                 - the first trainable point (given `train_length`) otherwise
-            Note: Raises a ValueError if `start` yields a time outside the time index of `series`.
-            Note: If `start` is outside the possible historical forecasting times, will ignore the parameter
-            (default behavior with ``None``) and start at the first trainable/predictable point.
+            Optionally, the first point in time at which a prediction is computed. This parameter supports:
+            ``float``, ``int``, ``pandas.Timestamp``, and ``None``.
+            For a detailed description of how the different data types are interpreted, please see the documentation
+            for `ForecastingModel.historical_forecasts`. Only used in expanding window mode.
+        start_format
+            Defines the `start` format. Only effective when `start` is an integer and `series` is indexed with a
+            `pd.RangeIndex`. For a detailed description of this argument, please see the documentation for
+            `ForecastingModel.historical_forecasts`.
         forecast_horizon
             The forecast horizon for the point predictions.
         stride
@@ -1160,6 +1164,7 @@ def backtest(
                 num_samples=num_samples,
                 train_length=train_length,
                 start=start,
+                start_format=start_format,
                 forecast_horizon=forecast_horizon,
                 stride=stride,
                 retrain=retrain,
@@ -1210,6 +1215,7 @@ def gridsearch(
         forecast_horizon: Optional[int] = None,
         stride: int = 1,
         start: Union[pd.Timestamp, float, int] = 0.5,
+        start_format: Literal["position", "value"] = "value",
         last_points_only: bool = False,
         show_warnings: bool = True,
         val_series: Optional[TimeSeries] = None,
@@ -1277,10 +1283,14 @@ def gridsearch(
         stride
             The number of time steps between two consecutive predictions. Only used in expanding window mode.
         start
-            The ``int``, ``float`` or ``pandas.Timestamp`` that represents the starting point in the time index
-            of `series` from which predictions will be made to evaluate the model.
+            Optionally, the first point in time at which a prediction is computed. This parameter supports:
+            ``float``, ``int``, ``pandas.Timestamp``, and ``None``.
             For a detailed description of how the different data types are interpreted, please see the documentation
-            for `ForecastingModel.backtest`. Only used in expanding window mode.
+            for `ForecastingModel.historical_forecasts`. Only used in expanding window mode.
+        start_format
+            Defines the `start` format. Only effective when `start` is an integer and `series` is indexed with a
+            `pd.RangeIndex`. For a detailed description of this argument, please see the documentation for
+            `ForecastingModel.historical_forecasts`.
         last_points_only
             Whether to use the whole forecasts or only the last point of each forecast to compute the error. Only used
             in expanding window mode.
@@ -1386,6 +1396,7 @@ def _evaluate_combination(param_combination) -> float:
                     future_covariates=future_covariates,
                     num_samples=1,
                     start=start,
+                    start_format=start_format,
                     forecast_horizon=forecast_horizon,
                     stride=stride,
                     metric=metric,
@@ -1893,6 +1904,7 @@ def _optimized_historical_forecasts(
         future_covariates: Optional[Sequence[TimeSeries]] = None,
         num_samples: int = 1,
         start: Optional[Union[pd.Timestamp, float, int]] = None,
+        start_format: Literal["position", "value"] = "value",
         forecast_horizon: int = 1,
         stride: int = 1,
         overlap_end: bool = False,

@@ -29,6 +29,11 @@
 from collections import OrderedDict
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal
+
 import numpy as np
 import pandas as pd
 from sklearn.linear_model import LinearRegression
@@ -897,6 +902,7 @@ def _optimized_historical_forecasts(
         future_covariates: Optional[Sequence[TimeSeries]] = None,
         num_samples: int = 1,
         start: Optional[Union[pd.Timestamp, float, int]] = None,
+        start_format: Literal["position", "value"] = "value",
         forecast_horizon: int = 1,
         stride: int = 1,
         overlap_end: bool = False,
@@ -949,6 +955,7 @@ def _optimized_historical_forecasts(
                 future_covariates=future_covariates,
                 num_samples=num_samples,
                 start=start,
+                start_format=start_format,
                 forecast_horizon=forecast_horizon,
                 stride=stride,
                 overlap_end=overlap_end,
@@ -963,6 +970,7 @@ def _optimized_historical_forecasts(
                 future_covariates=future_covariates,
                 num_samples=num_samples,
                 start=start,
+                start_format=start_format,
                 forecast_horizon=forecast_horizon,
                 stride=stride,
                 overlap_end=overlap_end,

@@ -374,6 +374,47 @@ def test_historical_forecasts_local_models(self):
             "LocalForecastingModel does not support historical forecasting with `retrain` set to `False`"
         )
 
+    def test_historical_forecasts_position_start(self):
+        series = tg.sine_timeseries(length=10)
+
+        model = LinearRegressionModel(lags=2)
+        model.fit(series[:8])
+
+        # negative position: `start=-2` counts from the end, so only the last two points are forecast
+        forecasts_neg = model.historical_forecasts(
+            series=series, start=-2, start_format="position", retrain=False
+        )
+        assert len(forecasts_neg) == 2
+        assert (series.time_index[-2:] == forecasts_neg.time_index).all()
+
+        # positive position: `start=8` must yield the same forecasts as `start=-2` on this length-10 series
+        forecasts_pos = model.historical_forecasts(
+            series=series, start=8, start_format="position", retrain=False
+        )
+        assert forecasts_pos == forecasts_neg
+
+    def test_historical_forecasts_negative_rangeindex(self):
+        series = TimeSeries.from_times_and_values(
+            times=pd.RangeIndex(start=-5, stop=5, step=1), values=np.arange(10)
+        )
+
+        model = LinearRegressionModel(lags=2)
+        model.fit(series[:8])
+
+        # `start` as an index value (label): -2 is a label of the index -5..4, leaving 7 points to forecast
+        forecasts = model.historical_forecasts(
+            series=series, start=-2, start_format="value", retrain=False
+        )
+        assert len(forecasts) == 7
+        assert (series.time_index[-7:] == forecasts.time_index).all()
+
+        # `start` as an index position: -2 is the second-to-last entry, leaving 2 points to forecast
+        forecasts = model.historical_forecasts(
+            series=series, start=-2, start_format="position", retrain=False
+        )
+        assert len(forecasts) == 2
+        assert (series.time_index[-2:] == forecasts.time_index).all()
+
     def test_historical_forecasts(self):
         train_length = 10
         forecast_horizon = 8
@@ -551,18 +592,40 @@ def test_sanity_check_invalid_start(self):
         rangeidx_step1 = tg.linear_timeseries(start=0, length=10, freq=1)
         rangeidx_step2 = tg.linear_timeseries(start=0, length=10, freq=2)
 
-        # index too large
+        # label_index (int), too large
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(timeidx_, start=11)
         assert str(msg.value).startswith("`start` index `11` is out of bounds")
         with pytest.raises(ValueError) as msg:
-            LinearRegressionModel(lags=1).historical_forecasts(rangeidx_step1, start=11)
-        assert str(msg.value).startswith("`start` index `11` is out of bounds")
+            LinearRegressionModel(lags=1).historical_forecasts(
+                rangeidx_step1, start=rangeidx_step1.end_time() + rangeidx_step1.freq
+            )
+        assert str(msg.value).startswith(
+            "`start` index `10` is larger than the last index"
+        )
+        with pytest.raises(ValueError) as msg:
+            LinearRegressionModel(lags=1).historical_forecasts(
+                rangeidx_step2, start=rangeidx_step2.end_time() + rangeidx_step2.freq
+            )
+        assert str(msg.value).startswith(
+            "`start` index `20` is larger than the last index"
+        )
+
+        # label_index (timestamp) too high
+        with pytest.raises(ValueError) as msg:
+            LinearRegressionModel(lags=1).historical_forecasts(
+                timeidx_, start=timeidx_.end_time() + timeidx_.freq
+            )
+        assert str(msg.value).startswith(
+            "`start` time `2000-01-11 00:00:00` is after the last timestamp `2000-01-10 00:00:00`"
+        )
+
+        # label_index, invalid
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(rangeidx_step2, start=11)
         assert str(msg.value).startswith("The provided point is not a valid index")
 
-        # value too low
+        # label_index, too low
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(
                 timeidx_, start=timeidx_.start_time() - timeidx_.freq
@@ -574,32 +637,43 @@ def test_sanity_check_invalid_start(self):
             LinearRegressionModel(lags=1).historical_forecasts(
                 rangeidx_step1, start=rangeidx_step1.start_time() - rangeidx_step1.freq
             )
-        assert str(msg.value).startswith("if `start` is an integer, must be `>= 0`")
+        assert str(msg.value).startswith(
+            "`start` index `-1` is smaller than the first index `0`"
+        )
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(
                 rangeidx_step2, start=rangeidx_step2.start_time() - rangeidx_step2.freq
             )
-        assert str(msg.value).startswith("if `start` is an integer, must be `>= 0`")
+        assert str(msg.value).startswith(
+            "`start` index `-2` is smaller than the first index `0`"
+        )
+
+        # positional_index, predicting only the last position
+        LinearRegressionModel(lags=1).historical_forecasts(
+            timeidx_, start=9, start_format="position"
+        )
 
-        # value too high
+        # positional_index, predicting from the first position with retrain=True
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(
-                timeidx_, start=timeidx_.end_time() + timeidx_.freq
+                timeidx_, start=-10, start_format="position"
             )
-        assert str(msg.value).startswith(
-            "`start` time `2000-01-11 00:00:00` is after the last timestamp `2000-01-10 00:00:00`"
-        )
+        assert str(msg.value).endswith(", resulting in an empty training set.")
+
+        # positional_index, beyond boundaries
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(
-                rangeidx_step1, start=rangeidx_step1.end_time() + rangeidx_step1.freq
+                timeidx_, start=10, start_format="position"
             )
-        assert str(msg.value).startswith("`start` index `10` is out of bounds")
+        assert str(msg.value).startswith(
+            "`start` index `10` is out of bounds for series of length 10"
+        )
         with pytest.raises(ValueError) as msg:
             LinearRegressionModel(lags=1).historical_forecasts(
-                rangeidx_step2, start=rangeidx_step2.end_time() + rangeidx_step2.freq
+                timeidx_, start=-11, start_format="position"
             )
         assert str(msg.value).startswith(
-            "`start` index `20` is larger than the last index `18`"
+            "`start` index `-11` is out of bounds for series of length 10"
         )
 
     def test_regression_auto_start_multiple_no_cov(self):