# Tabularization Refactoring Experiments

## Set-Up

In [1]:
import time
import numpy as np
import pandas as pd
from itertools import product
from math import inf
from darts import TimeSeries
from darts.utils.data.tabularization import (
    _create_lagged_data,
    create_lagged_features_and_labels,
    create_lagged_features,
)

Utility function to create 'dummy' index time series whose values are consecutive integers (for a single component with zero offset, $y_t = t$):

In [2]:
def create_index_series(num_timesteps, num_components, offset=0, freq=1, start=None):
    """Build a deterministic 'index' `TimeSeries` filled with consecutive integers.

    The values are `offset, offset + 1, ...`, laid out row-major over an array
    of shape `(num_timesteps, num_components, 1)`.

    Parameters
    ----------
    num_timesteps
        Number of time steps in the series.
    num_components
        Number of components (columns) in the series.
    offset
        Constant added to every value.
    freq
        If a string, it is passed as `freq` to `pd.date_range`; otherwise it is
        used as the step of a `pd.RangeIndex`.
    start
        First entry of the time index. Defaults to `0` for a range index and
        to `"1/1/1900"` for a datetime index.

    Returns
    -------
    The result of `TimeSeries.from_times_and_values` for the generated values
    and time index.
    """
    vals = np.arange(num_timesteps * num_components) + offset
    vals = vals.reshape(num_timesteps, num_components, 1)
    if isinstance(freq, str):
        if start is None:
            # Default datetime index start:
            start = "1/1/1900"
        dates = pd.date_range(start, periods=num_timesteps, freq=freq)
    else:
        if start is None:
            # Default range index start:
            start = 0
        # The stop value must be measured from `start`; otherwise a non-zero
        # `start` yields an index with fewer than `num_timesteps` entries.
        dates = pd.RangeIndex(start, start + freq * num_timesteps, freq)
    return TimeSeries.from_times_and_values(values=vals, times=dates)

## Training Data Generation

### Correctness Checks

Check that the refactored implementation matches the current implementation over many different combinations of input values.

In [3]:
def test_training_correctness():
    """Compare `create_lagged_features_and_labels` against `_create_lagged_data`.

    Runs both implementations over the Cartesian product of lag, horizon,
    `multi_models` and `max_samples` settings defined below, and checks that
    the returned features, labels and time indices agree.

    Raises
    ------
    ValueError
        With message "X incorrect", "y incorrect" or "Ts incorrect" when the
        corresponding outputs differ (or cannot be compared). The underlying
        error is attached as the exception's cause.
    """
    # Timeseries with different start times, lengths, frequencies, and number of components:
    num_timesteps = 1000
    target_series = create_index_series(
        num_timesteps, num_components=2, offset=10, start="5/1/1900", freq="d"
    )
    past_series = create_index_series(
        num_timesteps - 13, num_components=3, offset=30, freq="2d"
    )
    future_series = create_index_series(
        num_timesteps + 5, num_components=5, offset=60, freq="3d"
    )

    # With and without specifying past series:
    past_combo = [past_series, None]
    # With and without specifying future series:
    future_combo = [future_series, None]
    # Single lags, multiple lags, multiple + noncontiguous lags:
    lag_combos = ([-1], [-2, -1], [-4, -3, -2, -1], [-6, -4, -2])
    # Small + large horizons:
    horizon_combos = [1, 5, 10, 20]
    # With and without multiple model predictions:
    multiple_output_combos = [False, True]
    # With and without maximum number of samples:
    max_sample_combos = [1, 5, 10, 20, None]
    param_combos = product(
        past_combo,
        future_combo,
        lag_combos,
        lag_combos,
        lag_combos,
        horizon_combos,
        multiple_output_combos,
        max_sample_combos,
    )
    len_combos = (
        len(past_combo)
        * len(future_combo)
        * (len(lag_combos) ** 3)
        * len(horizon_combos)
        * len(multiple_output_combos)
        * len(max_sample_combos)
    )
    # NOTE(review): `past` and `future` are unpacked from each combination but
    # both calls below always receive `past_series` / `future_series`, so the
    # "without covariates" cases are not actually exercised — confirm intent.
    for i, (
        past,
        future,
        target_lag,
        past_lag,
        future_lag,
        horizon,
        multiple_outputs,
        max_samples,
    ) in enumerate(param_combos):

        # Current implementation:
        (X, y, Ts) = _create_lagged_data(
            target_series,
            lags=target_lag,
            past_covariates=past_series,
            future_covariates=future_series,
            lags_past_covariates=past_lag,
            lags_future_covariates=future_lag,
            output_chunk_length=horizon,
            multi_models=multiple_outputs,
            is_training=True,
            max_samples_per_ts=max_samples,
        )
        # Refactored implementation:
        my_X, my_y, my_Ts = create_lagged_features_and_labels(
            horizon=horizon,
            target_series=target_series,
            past_series=past_series,
            future_series=future_series,
            target_lags=target_lag,
            past_lags=past_lag,
            future_lags=future_lag,
            max_samples=max_samples if max_samples is not None else inf,
            multiple_outputs=multiple_outputs,
        )

        # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate; `from err` keeps the original failure
        # (e.g. a shape mismatch inside `np.allclose`) visible in the traceback.
        try:
            assert np.allclose(my_X.squeeze(), X.squeeze())
        except Exception as err:
            raise ValueError("X incorrect") from err

        try:
            assert np.allclose(my_y.squeeze(), y.squeeze())
        except Exception as err:
            raise ValueError("y incorrect") from err

        try:
            assert list(my_Ts) == list(Ts[0])
        except Exception as err:
            raise ValueError("Ts incorrect") from err

        if (i + 1) % 500 == 0:
            print(f"Passed {i+1}/{len_combos}")


# Run the full parameter sweep; progress is printed every 500 combinations.
test_training_correctness()

Passed 500/10240
Passed 1000/10240
Passed 1500/10240
Passed 2000/10240
Passed 2500/10240
Passed 3000/10240
Passed 3500/10240
Passed 4000/10240
Passed 4500/10240
Passed 5000/10240
Passed 5500/10240
Passed 6000/10240
Passed 6500/10240
Passed 7000/10240
Passed 7500/10240
Passed 8000/10240
Passed 8500/10240
Passed 9000/10240
Passed 9500/10240
Passed 10000/10240


### Speed Benchmarks

Informal speed benchmarks of refactored implementation vs current implementation:

In [15]:
def benchmark_training_data_generation(
    num_repeats,
    target_lags,
    past_lags,
    future_lags,
    horizon,
    max_samples,
    multiple_outputs,
):
    """Time `_create_lagged_data` against `create_lagged_features_and_labels`.

    Each implementation is called `num_repeats` times on the same three
    generated series, and the total elapsed time of each loop is measured.
    The outputs of the final call of each implementation are then compared.

    Parameters
    ----------
    num_repeats
        Number of times each implementation is called; must be at least 1.
    target_lags, past_lags, future_lags
        Lags forwarded to both implementations for the target, past and
        future series respectively.
    horizon
        Forwarded as `output_chunk_length` / `horizon`.
    max_samples
        Forwarded as `max_samples_per_ts`; `None` is mapped to `inf` for the
        refactored implementation.
    multiple_outputs
        Forwarded as `multi_models` / `multiple_outputs`.

    Returns
    -------
    tuple
        `(current_implem_time, refact_implem_time)`: total seconds spent in
        each timing loop.

    Raises
    ------
    ValueError
        If `num_repeats` is smaller than 1.
    AssertionError
        If the two implementations disagree on their final outputs.
    """
    if num_repeats < 1:
        # With zero repeats neither loop runs, so there would be nothing to
        # time and nothing to compare in the correctness checks below.
        raise ValueError("num_repeats must be at least 1")

    # Timeseries with different start times, lengths, frequencies, and number of components:
    num_timesteps = 10000
    target_series = create_index_series(
        num_timesteps, num_components=2, offset=10, start="5/1/1900", freq="d"
    )
    past_series = create_index_series(
        num_timesteps - 13, num_components=3, offset=30, freq="2d"
    )
    future_series = create_index_series(
        num_timesteps + 5, num_components=5, offset=60, freq="3d"
    )

    # `perf_counter` is a monotonic, high-resolution clock intended for
    # measuring short durations; `time.time` can jump if the system clock is
    # adjusted mid-benchmark.
    start_time = time.perf_counter()
    for _ in range(num_repeats):
        (X, y, Ts) = _create_lagged_data(
            target_series,
            lags=target_lags,
            past_covariates=past_series,
            future_covariates=future_series,
            lags_past_covariates=past_lags,
            lags_future_covariates=future_lags,
            output_chunk_length=horizon,
            multi_models=multiple_outputs,
            is_training=True,
            max_samples_per_ts=max_samples,
        )
    current_implem_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    for _ in range(num_repeats):
        my_X, my_y, my_Ts = create_lagged_features_and_labels(
            horizon=horizon,
            target_series=target_series,
            past_series=past_series,
            future_series=future_series,
            target_lags=target_lags,
            past_lags=past_lags,
            future_lags=future_lags,
            max_samples=max_samples if max_samples is not None else inf,
            multiple_outputs=multiple_outputs,
        )
    refact_implem_time = time.perf_counter() - start_time

    # Ensure reimplemented function is correct:
    assert np.allclose(my_X.squeeze(), X.squeeze())
    assert np.allclose(my_y.squeeze(), y.squeeze())
    assert list(my_Ts) == list(Ts[0])

    return current_implem_time, refact_implem_time

Benchmarks with small number of lags:

In [16]:
# Benchmark configuration: one lag per series, no cap on the number of samples.
target_lags, past_lags, future_lags = [-1], [-2], [-3]
horizon = 10
max_samples = None
multiple_outputs = True
# How many times each implementation is invoked inside the timing loops:
num_repeats = 1000

old_time, new_time = benchmark_training_data_generation(
    num_repeats=num_repeats,
    target_lags=target_lags,
    past_lags=past_lags,
    future_lags=future_lags,
    horizon=horizon,
    max_samples=max_samples,
    multiple_outputs=multiple_outputs,
)

# Report both totals and their ratio:
print(f"Current implementation: {old_time} secs")
print(f"New implementation: {new_time} secs")
print(f"Speed up = {old_time/new_time} fold")

Current implementation: 26.26463222503662 secs
New implementation: 2.086974859237671 secs
Speed up = 12.585025693424287 fold


Benchmarks with large number of lags:

In [17]:
# Benchmark configuration: many lags per series, no cap on the number of samples.
target_lags = range(-52, 0, 3)
past_lags = range(-15, 0, 2)
future_lags = range(-20, 0, 1)
horizon = 10
max_samples = None
multiple_outputs = True
# These runs are slower, so each implementation is invoked fewer times:
num_repeats = 200

old_time, new_time = benchmark_training_data_generation(
    num_repeats=num_repeats,
    target_lags=target_lags,
    past_lags=past_lags,
    future_lags=future_lags,
    horizon=horizon,
    max_samples=max_samples,
    multiple_outputs=multiple_outputs,
)

# Report both totals and their ratio:
print(f"Current implementation: {old_time} secs")
print(f"New implementation: {new_time} secs")
print(f"Speed up = {old_time/new_time} fold")

Current implementation: 49.359559774398804 secs
New implementation: 1.2991654872894287 secs
Speed up = 37.993281269642026 fold


## TODO: Prediction Data Generation

### TODO: Test Correctness

In [7]:
# def test_predicting_correctness():
#     num_timesteps = 90
#     target_series = create_index_series(num_timesteps, num_components=1, offset=10, start='5/1/1900',  freq='d')
#     past_series = create_index_series(num_timesteps - 13, num_components=1, offset=30, freq='2d')
#     future_series = create_index_series(num_timesteps + 5, num_components=1, offset=60, freq='3d')
#     target_lag_combos = ([-1], [-2, -1], [-4, -3, -2, -1], [-6, -4, -2]) # ( [-1], [-2, -1], [-3, -2, -1], [-3, -1], [-5, -3, -1]) # [-1], [-1, -2, -3, -4], [-2, -4, -6]
#     past_lag_combos = ([-1], [-2, -1], [-4, -3, -2, -1], [-6, -4, -2]) # ( [-1], [-2, -1], [-3, -2, -1], [-3, -1], [-5, -3, -1] )
#     future_lag_combos = ([-1], [-2, -1], [-4, -3, -2, -1], [-6, -4, -2]) # ( [-1], [-2, -1], [-3, -2, -1], [-3, -1], [-5, -3, -1])
#     max_samp_combos = [1000] # [1, 2, 3, 1000]
#     param_combos = product(
#         target_lag_combos, past_lag_combos, future_lag_combos, max_samp_combos
#     )
#     len_combos = (
#         len(target_lag_combos)
#         * len(past_lag_combos)
#         * len(future_lag_combos)
#         * len(max_samp_combos)
#     )
#     for i, (tl, pl, fl, max_samp) in enumerate(param_combos):
#         (X, y, Ts) = _create_lagged_data(
#             target_series,
#             lags=tl,
#             past_covariates=past_series,
#             future_covariates=future_series,
#             lags_past_covariates=pl,
#             lags_future_covariates=fl,
#             output_chunk_length=1,
#             is_training=False,
#             max_samples_per_ts=max_samp,
#             multi_models=True
#         )
#         my_X, my_Ts = create_lagged_features(
#             target_series=target_series,
#             past_series=past_series,
#             future_series=future_series,
#             target_lags=tl,
#             past_lags=pl,
#             future_lags=fl,
#             max_samples=max_samp,
#         )
#         print(tl, pl, fl, max_samp)
#         # try:
#         #     assert list(my_Ts)==list(Ts[0])
#         # except:
#         #     raise ValueError("Ts incorrect")
#         try:
#             assert np.allclose(my_X.squeeze(), X.squeeze())
#         except:
#             print(Ts, my_Ts)
#             print(X.shape, my_X.shape)
#             print(X.squeeze())
#             print(my_X.squeeze())
#             raise ValueError("X incorrect")

#         if (i+1) % 1 == 0:
#             print(f"Passed {i+1}/{len_combos}")


# test_predicting_correctness()

### Understanding Behaviour of `_create_lagged_data` when `is_training=False`

In [38]:
# Three short, single-component, integer-indexed series with steps of 1, 2 and 3:
num_timesteps = 12
target_series = create_index_series(
    num_timesteps=num_timesteps, num_components=1, offset=10, freq=1
)
past_series = create_index_series(
    num_timesteps=num_timesteps - 1, num_components=1, offset=30, freq=2
)
future_series = create_index_series(
    num_timesteps=num_timesteps + 1, num_components=1, offset=60, freq=3
)

# Call the current implementation in prediction mode, requesting a single sample:
X, y, Ts = _create_lagged_data(
    target_series,
    is_training=False,
    max_samples_per_ts=1,
    output_chunk_length=10,
    multi_models=False,
    lags=[-1],
    lags_past_covariates=[-2],
    lags_future_covariates=[-3],
    past_covariates=past_series,
    future_covariates=future_series,
)

In [39]:
# Row 0: target series values; row 1: the corresponding time index entries.
print(np.stack([target_series.all_values().squeeze(), list(target_series.time_index)]))

[[10. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. 21.]
 [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11.]]


In [40]:
# Row 0: past series values; row 1: the corresponding time index entries.
print(np.stack([past_series.all_values().squeeze(), list(past_series.time_index)]))

[[30. 31. 32. 33. 34. 35. 36. 37. 38. 39. 40.]
 [ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18. 20.]]


In [41]:
# Row 0: future series values; row 1: the corresponding time index entries.
print(np.stack([future_series.all_values().squeeze(), list(future_series.time_index)]))

[[60. 61. 62. 63. 64. 65. 66. 67. 68. 69. 70. 71. 72.]
 [ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27. 30. 33. 36.]]


In [42]:
Ts

[Int64Index([6], dtype='int64', name='time')]

In [43]:
X

array([[15., 32., 61.]])

In [44]:
y

array([[nan]])