In [1]:
import sys

import numpy as np
import plotly
import plotly.graph_objects as go
import os
import plotly.io as pio
from plotly.subplots import make_subplots

# Plotly notebooks: force the Jupyter renderer.
pio.renderers.default = os.environ.get("PLOTLY_RENDERER", "notebook")
pio.templates.default = "plotly_white"

print("Python:", sys.version.split()[0])
print("NumPy:", np.__version__)
print("Plotly:", plotly.__version__)


Python: 3.12.9
NumPy: 1.26.2
Plotly: 6.5.2


In [2]:
def simulate_ar(
    phi: np.ndarray,
    *,
    c: float = 0.0,
    sigma: float = 1.0,
    n: int = 600,
    burn_in: int = 200,
    seed: int = 42,
) -> np.ndarray:
    """Simulate a univariate AR(p): y_t = c + sum_i phi_i y_{t-i} + eps_t."""
    phi = np.asarray(phi, dtype=float)
    p = int(phi.size)
    if p < 1:
        raise ValueError("phi must have length >= 1")
    if n <= 0:
        raise ValueError("n must be positive")
    if burn_in < 0:
        raise ValueError("burn_in must be >= 0")

    rng = np.random.default_rng(seed)
    eps = rng.normal(loc=0.0, scale=sigma, size=n + burn_in)
    y = np.zeros(n + burn_in, dtype=float)

    # Start at t=p so y_{t-i} exists.
    for t in range(p, n + burn_in):
        lags = y[t - 1 : t - p - 1 : -1]  # [y_{t-1}, ..., y_{t-p}]
        y[t] = c + float(phi @ lags) + eps[t]

    return y[burn_in:]


def make_lagged_matrix(y: np.ndarray, p: int, *, include_intercept: bool = True):
    """Build (X, y_target) for AR(p) regression.

    For t = p..T-1:
      y_target[t-p] = y[t]
      X[t-p] = [1, y[t-1], ..., y[t-p]]
    """
    y = np.asarray(y, dtype=float)
    p = int(p)
    if p < 1:
        raise ValueError("p must be >= 1")
    if y.ndim != 1:
        raise ValueError("y must be 1D")
    n = y.size
    if n <= p:
        raise ValueError(f"Need at least p+1 points; got n={n}, p={p}")

    # Columns are lag-1, lag-2, ..., lag-p.
    lag_cols = [y[p - i : n - i] for i in range(1, p + 1)]
    X = np.column_stack(lag_cols)
    if include_intercept:
        X = np.column_stack([np.ones(n - p, dtype=float), X])
    y_target = y[p:]
    return X, y_target


def fit_ar_ols(y: np.ndarray, p: int, *, include_intercept: bool = True):
    """Fit AR(p) by OLS and return a small result dict."""
    X, y_target = make_lagged_matrix(y, p, include_intercept=include_intercept)
    beta, *_ = np.linalg.lstsq(X, y_target, rcond=None)
    y_hat_target = X @ beta
    resid = y_target - y_hat_target
    rss = float(resid @ resid)
    n_eff = int(y_target.size)
    k = int(X.shape[1])
    sigma2 = rss / n_eff

    y_hat = np.full_like(np.asarray(y, dtype=float), np.nan)
    y_hat[p:] = y_hat_target

    return {
        "p": int(p),
        "include_intercept": bool(include_intercept),
        "beta": beta,
        "y_hat": y_hat,
        "resid": resid,
        "rss": rss,
        "sigma2": sigma2,
        "n_eff": n_eff,
        "k": k,
    }


def forecast_ar(beta: np.ndarray, y_history: np.ndarray, p: int, *, steps: int, include_intercept: bool = True):
    """Iterative multi-step forecast using the model's own predictions."""
    beta = np.asarray(beta, dtype=float)
    y_hist = list(np.asarray(y_history, dtype=float).tolist())
    p = int(p)
    if steps < 1:
        return np.array([], dtype=float)
    if len(y_hist) < p:
        raise ValueError(f"Need at least p history points; got {len(y_hist)}")

    out = []
    for _ in range(int(steps)):
        lags = np.array(y_hist[-1 : -p - 1 : -1], dtype=float)
        x = np.concatenate(([1.0], lags)) if include_intercept else lags
        y_next = float(x @ beta)
        y_hist.append(y_next)
        out.append(y_next)
    return np.array(out, dtype=float)


def aic_bic_from_rss(rss: float, n: int, k: int):
    """AIC/BIC up to additive constants (sufficient for comparing p)."""
    rss = float(rss)
    n = int(n)
    k = int(k)
    if n <= 0:
        raise ValueError("n must be positive")
    if rss <= 0:
        rss = 1e-12
    aic = 2 * k + n * np.log(rss / n)
    bic = k * np.log(n) + n * np.log(rss / n)
    return float(aic), float(bic)


def ar_stationary(phi: np.ndarray):
    """Check covariance-stationarity for AR(p) via characteristic roots."""
    phi = np.asarray(phi, dtype=float)
    coeffs = np.concatenate(([1.0], -phi))  # 1 - phi1 z - ... - phip z^p
    roots = np.roots(coeffs)
    return bool(np.all(np.abs(roots) > 1.0)), roots


def acf(x: np.ndarray, nlags: int = 30):
    """Autocorrelation function for lags 0..nlags (simple, biased estimator)."""
    x = np.asarray(x, dtype=float)
    x = x - np.mean(x)
    denom = float(x @ x)
    out = np.empty(int(nlags) + 1, dtype=float)
    out[0] = 1.0
    for k in range(1, int(nlags) + 1):
        out[k] = float(x[:-k] @ x[k:]) / denom
    return out


In [3]:
# Ground-truth AR(3)
phi_true = np.array([0.65, -0.25, 0.15])
c_true = 0.2
sigma_true = 0.7

is_stat, roots = ar_stationary(phi_true)
print("Stationary (true process)?", is_stat)
print("Characteristic roots:", np.round(roots, 3))

y = simulate_ar(phi_true, c=c_true, sigma=sigma_true, n=700, burn_in=300, seed=7)
t = np.arange(y.size)

train_n = 520
y_train = y[:train_n]
y_test = y[train_n:]
t_test = t[train_n:]

fig = go.Figure()
fig.add_trace(go.Scatter(x=t, y=y, mode="lines", name="series"))
fig.add_vline(x=train_n, line_dash="dash", line_color="black")
fig.update_layout(title="Synthetic AR(3) series (train/test split)", xaxis_title="t", yaxis_title="y")
fig.show()


Stationary (true process)? False
Characteristic roots: [0.631+0.j    0.01 +0.488j 0.01 -0.488j]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 3)

In [4]:
P_MAX = 15

ps = np.arange(1, P_MAX + 1)
aics = []
bics = []
rss_list = []
mse_dyn = []

for p in ps:
    fit = fit_ar_ols(y_train, p, include_intercept=True)
    aic, bic = aic_bic_from_rss(fit["rss"], fit["n_eff"], fit["k"])
    aics.append(aic)
    bics.append(bic)
    rss_list.append(fit["rss"])

    # (Optional sanity metric) dynamic multi-step MSE on the test horizon
    y_fc = forecast_ar(fit["beta"], y_train, p, steps=y_test.size, include_intercept=True)
    mse_dyn.append(float(np.mean((y_test - y_fc) ** 2)))

aics = np.array(aics)
bics = np.array(bics)
mse_dyn = np.array(mse_dyn)

p_best_aic = int(ps[np.argmin(aics)])
p_best_bic = int(ps[np.argmin(bics)])
print("Best p by AIC:", p_best_aic)
print("Best p by BIC:", p_best_bic)

fig = make_subplots(rows=1, cols=2, subplot_titles=["AIC (lower is better)", "BIC (lower is better)"])
fig.add_trace(go.Scatter(x=ps, y=aics, mode="lines+markers", name="AIC"), row=1, col=1)
fig.add_vline(x=p_best_aic, line_dash="dash", line_color="#1f77b4", row=1, col=1)

fig.add_trace(go.Scatter(x=ps, y=bics, mode="lines+markers", name="BIC"), row=1, col=2)
fig.add_vline(x=p_best_bic, line_dash="dash", line_color="#1f77b4", row=1, col=2)

fig.update_xaxes(title_text="p", row=1, col=1)
fig.update_xaxes(title_text="p", row=1, col=2)
fig.update_layout(title="Lag-order selection on training data")
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=ps, y=mse_dyn, mode="lines+markers", name="Dynamic forecast MSE"))
fig.update_layout(title="Forecast error vs lag order (dynamic multi-step)", xaxis_title="p", yaxis_title="MSE")
fig.show()


NameError: name 'y_train' is not defined

In [5]:
P_SHOW = [1, 2, 3, 6, 12]

fig = go.Figure()
fig.add_trace(go.Scatter(x=t_test, y=y_test, mode="lines", name="actual (test)", line=dict(color="black")))

for p in P_SHOW:
    fit = fit_ar_ols(y_train, p, include_intercept=True)
    y_fc = forecast_ar(fit["beta"], y_train, p, steps=y_test.size, include_intercept=True)
    mse = float(np.mean((y_test - y_fc) ** 2))
    fig.add_trace(
        go.Scatter(
            x=t_test,
            y=y_fc,
            mode="lines",
            name=f"AR({p}) forecast (MSE={mse:.3f})",
        )
    )

fig.update_layout(
    title="Effect of lag order: multi-step forecasts on the same test window",
    xaxis_title="t",
    yaxis_title="y",
)
fig.show()


NameError: name 't_test' is not defined

In [6]:
p_best = p_best_bic
fit_best = fit_ar_ols(y_train, p_best, include_intercept=True)

# One-step predictions on the full series using true lags.
X_full, y_target_full = make_lagged_matrix(y, p_best, include_intercept=True)
y_hat_target_full = X_full @ fit_best["beta"]
y_hat_full = np.full_like(y, np.nan)
y_hat_full[p_best:] = y_hat_target_full

fig = go.Figure()
fig.add_trace(go.Scatter(x=t, y=y, mode="lines", name="actual", line=dict(color="black")))
fig.add_trace(go.Scatter(x=t, y=y_hat_full, mode="lines", name=f"AR({p_best}) one-step prediction"))
fig.add_vline(x=train_n, line_dash="dash", line_color="black")
fig.update_layout(title="Prediction vs actual (one-step ahead)", xaxis_title="t", yaxis_title="y")
fig.show()


NameError: name 'p_best_bic' is not defined

In [7]:
# Residuals from the training fit (one-step ahead on training window)
resid = fit_best["resid"]
fitted = y_train[p_best:] - resid

res_acf = acf(resid, nlags=30)
lags = np.arange(res_acf.size)

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[
        "Residuals over time (train)",
        "Residual histogram (train)",
        "Residual ACF (train)",
        "Residuals vs fitted (train)",
    ],
)

fig.add_trace(
    go.Scatter(x=np.arange(resid.size), y=resid, mode="lines", name="residual"),
    row=1,
    col=1,
)
fig.add_hline(y=0, line_color="gray", line_width=1, row=1, col=1)

fig.add_trace(go.Histogram(x=resid, nbinsx=40, name="residuals"), row=1, col=2)

fig.add_trace(go.Bar(x=lags, y=res_acf, name="ACF"), row=2, col=1)
fig.add_hline(y=0, line_color="gray", line_width=1, row=2, col=1)

fig.add_trace(go.Scatter(x=fitted, y=resid, mode="markers", name="resid vs fitted", opacity=0.6), row=2, col=2)
fig.add_hline(y=0, line_color="gray", line_width=1, row=2, col=2)

fig.update_layout(title=f"Residual diagnostics for AR({p_best}) (fit on training)")
fig.update_xaxes(title_text="index", row=1, col=1)
fig.update_xaxes(title_text="residual", row=1, col=2)
fig.update_xaxes(title_text="lag", row=2, col=1)
fig.update_xaxes(title_text="fitted", row=2, col=2)
fig.update_yaxes(title_text="residual", row=1, col=1)
fig.update_yaxes(title_text="count", row=1, col=2)
fig.update_yaxes(title_text="ACF", row=2, col=1)
fig.update_yaxes(title_text="residual", row=2, col=2)
fig.show()


NameError: name 'fit_best' is not defined

In [8]:
y_stat = simulate_ar(np.array([0.7]), c=0.0, sigma=1.0, n=250, burn_in=200, seed=0)
y_nonstat = simulate_ar(np.array([1.02]), c=0.0, sigma=1.0, n=250, burn_in=200, seed=0)

fig = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=["Stationary AR(1): φ=0.7", "Non-stationary AR(1): φ=1.02"])
fig.add_trace(go.Scatter(y=y_stat, mode="lines", name="φ=0.7"), row=1, col=1)
fig.add_trace(go.Scatter(y=y_nonstat, mode="lines", name="φ=1.02"), row=2, col=1)
fig.update_layout(title="Stationarity matters: |φ|<1 vs |φ|>1", xaxis_title="t", yaxis_title="y")
fig.show()


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 0 is different from 1)