In [477]:
import numpy as np
import pandas as pd
from autogluon.timeseries.metrics import TimeSeriesScorer
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from plotnine import *
from statsmodels.tsa.arima_process import ArmaProcess
from typing import Any, Dict, Type

Fill in the function below. Assume that
- The `target` column of `test_data` contains the data.
- `predictions` contains predictions for the last `prediction_length` observations in `test_data`.
- The next-to-last column of `predictions` contains lower bounds for prediction intervals and the last column contains upper bounds.

The function should return the coverage, i.e., the proportion of observations that fall into the corresponding prediction interval.

In [478]:
def help_calc_coverages_for_1_window(
        target: str,
        test_data: TimeSeriesDataFrame,
        prediction_length: int,
        predictions: TimeSeriesDataFrame
    ) -> float:
    # print(predictions)
    upper_bound = predictions.iloc[:, -1].values
    lower_bound  = predictions.iloc[:, -2].values
    
    actuals = test_data[target].iloc[-prediction_length:].values
    # print("#"*80)
    # print(lower_bound)
    coverage = ((actuals >= lower_bound) & (actuals <= upper_bound))
    
    # print(coverage.values)
    return np.mean(coverage)

Fill in the function below. Assume that

- The `timestamp_col` column of `window_data` contains observation times and the `target` column contains observation values.
- The last `prediction_length` observations in `window_data` are test observations and the prior observations are training observations.
- Predictions are to be evaluated using `eval_metric`.
- Level-`ci_level` prediction intervals are needed. `ci_level` could be 0.95, for example.
- The training of any one model cannot take longer than `time_limit` seconds.
- The models to be used are specified in `hyperparameters`. For example, `hyperparameters` could equal `{"AutoARIMA": {}}`.

The function should compute the test set coverage for each model in `hyperparameters`. It should return a `DataFrame` with one row for each model. The `DataFrame`'s columns should be `test_start_time`, `test_end_time`, `model`, and `coverage`, in that order. Use `help_calc_coverages_for_1_window` to compute the coverage for each model.

In [None]:
def calc_coverages_for_1_window(
        window_data: pd.DataFrame,
        timestamp_col: str,
        target: str | None,
        prediction_length: int,
        eval_metric: str | TimeSeriesScorer | None,
        ci_level: float,
        time_limit: int | None,
        hyperparameters: Dict[str | Type, Any]
    ) -> float:
    # print("Columns in window_data:", window_data.columns)
    train_data = window_data.iloc[:-prediction_length]
    test_data = window_data.iloc[:prediction_length]
    # print(test_data)

    train_data = TimeSeriesDataFrame.from_data_frame(train_data)
    test_data = TimeSeriesDataFrame.from_data_frame(test_data)

    results = []
    lower_quantile = (1 - ci_level) / 2
    upper_quantile = 1 - (1 - ci_level) / 2
    
    predictor = TimeSeriesPredictor(target=target, prediction_length=prediction_length, eval_metric=eval_metric, quantile_levels=[lower_quantile, upper_quantile])
    predictor.fit(train_data, hyperparameters=hyperparameters, time_limit=time_limit, random_seed=123)
    
    # predictions = predictor.predict(train_data, quantiles=[(1 - ci_level) / 2, 1 - (1 - ci_level) / 2])
    models = []
    coverages = []
    start_t = []
    end_t = []
    for model in predictor.model_names():
        
        predictions = predictor.predict(train_data, model=model)
        coverage = help_calc_coverages_for_1_window(target=target, test_data=test_data, prediction_length=prediction_length, predictions=predictions)
        
        models.append(model)
        coverages.append(coverage)
        start_t.append(test_data.reset_index()[timestamp_col].iloc[0])
        end_t.append(test_data.reset_index()[timestamp_col].iloc[-1])
    # coverages = {
    #     model: 
    #     for model in predictor.model_names()
    # }
    # print("#"*80)
    # print(test_data.reset_index())
    results = pd.DataFrame({
            "test_start_time": start_t,
            "test_end_time": end_t,
            "model": models, 
            "coverages": coverages

        # for model, coverage in coverages.items()
    })
    
    # results["model"] = models
    # results["coverage"] = coverages
    # results["test_start_time"] = test_data.reset_index()[timestamp_col].iloc[0]
    # results["test_end_time"] = test_data.reset_index()[timestamp_col].iloc[-1]
    return results

Fill in the function below. Assume that

- The `timestamp_col` column of `data` contains observation times and the `target` column contains observation values. Windows are to be carved out of `data`, with one or more time series models being trained and tested on each window.
- The first `train_size` observations in a window are training observations and the last `prediction_length` observations are test observations.
- The starting indices of consecutive windows differ by `stride`.
- Predictions are to be evaluated using `eval_metric`.
- Level-`ci_level` prediction intervals are needed. `ci_level` could be 0.95, for example.
- The training of any one model cannot take longer than `time_limit` seconds.
- The models to be used are specified in `hyperparameters`. For example, `hyperparameters` could equal `{"AutoARIMA": {}}`.

The function should compute the test set coverages for the models in `hyperparameters` on all of the windows. It should return a `DataFrame` with one row for each pair of window and model. The `DataFrame`'s columns should be `test_start_time`, `test_end_time`, `model`, and `coverage`, in that order. Use `calc_coverages_for_1_window` to compute the coverages for each window.

In [480]:
def calc_coverages(
        data: pd.DataFrame,
        train_size: int,
        prediction_length: int,
        stride: int,
        timestamp_col: str,
        target: str | None,
        eval_metric: str | TimeSeriesScorer | None,
        ci_level: float,
        time_limit: int | None,
        hyperparameters: Dict[str | Type, Any]
    ) -> pd.DataFrame:
    
    results = []
    data["item_id"] = "0"
    for start_idx in range(0, len(data) - train_size - prediction_length + 1, stride):
        window_data = data.iloc[start_idx:start_idx + train_size + prediction_length]
        # print(window_data)
        coverage_df = calc_coverages_for_1_window(
            window_data, timestamp_col, target, prediction_length, eval_metric, ci_level, time_limit, hyperparameters
        )
        
        results.append(coverage_df)
    return pd.concat(results, ignore_index=True)

When we write a function, we ought to test it to ensure that it works. When a function operates on data, one way to test it is to simulate data from a model and verify that the function returns what it should for the simulated data.

One of the simplest time series models is the AR(1) model, the autoregressive model of order 1. It is defined by the equation
$$
Y_t = \phi Y_{t - 1} + \epsilon_t, \ t \in \mathbb{Z},
$$
where the $\epsilon_t$'s are uncorrelated random variables with common mean zero and common variance $\sigma^2$, and $\epsilon_t$ is independent of $Y_{t - 1}, Y_{t - 2}, Y_{t - 3}, \ldots$. The $\epsilon_t$'s are called *innovations*. For a Gaussian AR(1) model, the innovations are $N(0, \sigma^2)$ random variables.

One desirable property of a time series model is *stationarity*. If the model is stationary, then the mean and variance of $Y_t$ don't depend on $t$. Also, the covariance between $Y_t$ and $Y_u$ depends on $t$ and $u$ only through $|t - u|$, so we can talk about *the* covariance at lag $\ell$, $\text{Cov}(Y_t, Y_{t + \ell})$, which doesn't depend on $t$. The covariances at the various lags are called *autocovariances*. It can be shown that the AR(1) model is stationary if and only if $|\phi| < 1$.

Define a function `simulate_ar1` that draws a sample of size $n$ from a Gaussian AR(1) model with coefficient $\phi$ and innovation standard deviation $\sigma$.
- Check whether $|\phi| < 1$ - raise a `ValueError` if it isn't.
- Use `statsmodels.tsa.arima_process.ArmaProcess` to create an object representing the time series.
- Use the object's `generate_sample` method to generate a sample of size $n$.
- Return the sample in a `DataFrame` with two columns:
    - `timestamp`, which contains a sequence of times that starts at `Timestamp("2020-01-01 00:00:00")` and has a one-minute step size.
    - `target`, which contains the sample.

In [481]:
def simulate_ar1(phi: float, sigma: float, n: int, rng: np.random.Generator) -> pd.DataFrame:
    if not (-1 < phi < 1):
        raise ValueError("Phi must be in the range (-1, 1) for stationarity.")
    
    ar = [1, -phi]
    ma = [1]
    
    ar_process = ArmaProcess(ar, ma)
    sample = ar_process.generate_sample(n, scale=sigma, distrvs=lambda size: rng.standard_normal(size))
    
    timestamps = pd.date_range(start="2020-01-01", periods=n, freq="T")
    
    return pd.DataFrame({"timestamp": timestamps, "target": sample, "item_id": 0})

For a model of the form $Y = f(X) + \epsilon$, where $X$ and $\epsilon$ are independent, the *signal-to-noise ratio (SNR)* is defined as
$$
\frac{\text{Var}(f(X))}{\text{Var}(\epsilon)}.
$$
The fraction of the variance of $Y$ explained by the signal $f(X)$, which we'll call the FVE, is
$$
\frac{\text{Var}(f(X))}{\text{Var}(Y)} = \frac{\text{Var}(f(X))}{\text{Var}(f(X)) + \text{Var}(\epsilon)} = \frac{\text{SNR}}{\text{SNR} + 1}.
$$

For a stationary AR(1) model, derive expressions for the SNR and FVE. Then define a function `calc_phi_from_fve` that takes an FVE and returns the nonnegative $\phi$ that yields that FVE. The function should raise a `ValueError` if the FVE isn't in $[0, 1)$.

In [482]:
def calc_phi_from_fve(fve: float) -> float:
    if not (0 <= fve < 1):
        raise ValueError("FVE must be in the range [0,1).")
    
    return np.sqrt(fve)

Once the functions have been filled in, run the code below to verify that the coverage functions work. Since `ci_level` equals 0.95, the coverage you get should be 0.95.

In [483]:
fve = 0.9
phi = calc_phi_from_fve(fve)
sigma = 1
n = 1000
rng = np.random.default_rng(12345)
window_data = simulate_ar1(phi, sigma, n, rng)
timestamp_col = "timestamp"
target = "target"
prediction_length = 100
eval_metric = "RMSE"
ci_level = 0.95
time_limit = 60
hyperparameters = {"AutoARIMA": {}, "PatchTST":{}}

calc_coverages_for_1_window(window_data, timestamp_col, target, prediction_length, eval_metric, ci_level, time_limit, hyperparameters)

No path specified. Models will be saved in: "AutogluonModels/ag-20250204_154640"
Beginning AutoGluon training... Time limit = 60s
AutoGluon will save models to '/Users/jakegwinn/Documents/umich/Y5S2/urps/urps_2025/notebooks/jake_notebooks/AutogluonModels/ag-20250204_154640'
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:04 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6020
CPU Count:          12
GPU Count:          0
Memory Avail:       25.16 GB / 64.00 GB (39.3%)
Disk Space Avail:   1041.44 GB / 1858.19 GB (56.0%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': RMSE,
 'hyperparameters': {'AutoARIMA': {}, 'PatchTST': {}},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 100,
 'quantile_levels': [0.025000000000000022, 0.975],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selecti

ValueError: Length of values (3) does not match length of index (1)