In [1]:
from gdc.data_access import *
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import r2_score, mean_squared_error
from arch import arch_model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Preview of the load table: columns labelled 0, 1 and 2, first rows only
df_load_simulated_normalized.loc[:, [0, 1, 2]].head()

Unnamed: 0,0,1,2
2023-01-01 00:00:00,0.308875,0.241084,0.435789
2023-01-01 01:00:00,0.325552,0.246717,0.424862
2023-01-01 02:00:00,3.468337,0.245007,2.300304
2023-01-01 03:00:00,2.21584,1.272533,2.411101
2023-01-01 04:00:00,1.259481,1.027133,0.483457


In [3]:
# Preview of the temperature table: columns labelled 0, 1 and 2, first rows only
df_temp_simulated_normalized.loc[:, [0, 1, 2]].head()

Unnamed: 0,0,1,2
2023-01-01 00:00:00,13.021741,7.970941,7.970941
2023-01-01 01:00:00,12.621741,7.970941,7.970941
2023-01-01 02:00:00,12.621741,8.070941,8.070941
2023-01-01 03:00:00,12.421742,8.070941,8.070941
2023-01-01 04:00:00,12.321742,7.870941,7.870941


In [7]:

def print_coeffs_r2(Yv, HDDv, CDDv, m_idx, beta, alpha, label="Model"):
    """
    Print the two slope coefficients and the pooled R² of a fitted model.

    Fitted values are ``alpha[month] + beta[0]*HDDv + beta[1]*CDDv``; R² is
    computed over every entry of ``Yv`` against the grand mean of ``Yv``.

    Parameters
    ----------
    Yv : np.ndarray, shape (T, N)
        Actual consumption
    HDDv, CDDv : np.ndarray, shape (T, N)
        Heating and cooling degree arrays
    m_idx : np.ndarray, shape (T,)
        Month index (0..11) of each row
    beta : array-like, length 2
        Coefficients [β_HDD, β_CDD]
    alpha : np.ndarray, shape (12, N) or (12,)
        Intercepts, either by (month, consumer) or by month only
    label : str
        Heading printed above the numbers

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # A 1-D alpha is one intercept per month: add a trailing axis so it
    # broadcasts across the N columns.
    intercepts = alpha[m_idx, None] if alpha.ndim == 1 else alpha[m_idx, :]
    fitted = intercepts + beta[0]*HDDv + beta[1]*CDDv

    resid_ss = np.sum((Yv - fitted)**2)
    total_ss = np.sum((Yv - Yv.mean())**2)
    r2 = 1 - resid_ss/total_ss

    underline = "-" * len(label)
    print(f"\n{label}")
    print(underline)
    print(f"β_HDD = {beta[0]:.4f}")
    print(f"β_CDD = {beta[1]:.4f}")
    print(f"R²    = {r2:.4f}")


In [8]:
# Short aliases used by all model cells below: Y is the load frame, T the temperature frame
Y, T = df_load_simulated_normalized, df_temp_simulated_normalized

In [9]:
# Degree-day thresholds, in the same units as T:
# heating degrees accrue below tau_h, cooling degrees above tau_c
tau_h, tau_c = 15.0, 20.0
HDD, CDD = (tau_h - T).clip(lower=0), (T - tau_c).clip(lower=0)

# Dimensions and calendar bookkeeping taken from the load frame
nT, nI = Y.shape
months = np.arange(12)
month = Y.index.month.values              # (nT,), calendar month 1..12
m_idx = month - 1                         # 0..11, usable as a row index into 12-row tables

# float32 views/copies of the three frames (requested without forcing a copy)
Yv, HDDv, CDDv = (
    frame.to_numpy(dtype=np.float32, copy=False) for frame in (Y, HDD, CDD)
)

### Model A — Month FE (pooled) + iid

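cons ~ temp_low + temp_high, where temp_low is HDD = max(τ_h − T, 0) and temp_high is CDD = max(T − τ_c, 0); one intercept per calendar month, shared by all consumers.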

In [10]:
# Pooled month means: one scalar per calendar month, averaged over every
# consumer and every timestamp that falls in that month
mY  = np.array([Yv[m_idx==k].mean()   for k in months], dtype=np.float32)
mH  = np.array([HDDv[m_idx==k].mean() for k in months], dtype=np.float32)
mC  = np.array([CDDv[m_idx==k].mean() for k in months], dtype=np.float32)

# Within-month transform (the month mean is broadcast across all columns)
Yw   = Yv   - mY[m_idx, None]
HDDw = HDDv - mH[m_idx, None]
CDDw = CDDv - mC[m_idx, None]

# Sufficient stats for beta = argmin ||Yw - HDDw*b1 - CDDw*b2||
Shh = np.einsum('ij,ij->', HDDw, HDDw, optimize=True)
Scc = np.einsum('ij,ij->', CDDw, CDDw, optimize=True)
Shc = np.einsum('ij,ij->', HDDw, CDDw, optimize=True)
Shy = np.einsum('ij,ij->', HDDw, Yw,   optimize=True)
Scy = np.einsum('ij,ij->', CDDw, Yw,   optimize=True)

# Determinant of the 2x2 normal-equation matrix, kept for inspection only:
# a value of (near) zero means one coefficient is not identified.
det = Shh*Scc - Shc*Shc

# Solve the normal equations with a least-squares solver instead of dividing
# by det: if a regressor has no within-month variation (e.g. CDD is zero
# everywhere in the sample) det is 0 and the explicit formula yields NaN/inf.
# lstsq returns the minimum-norm solution, i.e. 0 for the unidentified slope.
XtX_A = np.array([[Shh, Shc], [Shc, Scc]], dtype=np.float64)
Xty_A = np.array([Shy, Scy], dtype=np.float64)
beta_A = np.linalg.lstsq(XtX_A, Xty_A, rcond=None)[0]

# Month intercepts: alpha_m = mean_y_m - beta' mean_x_m
alpha_m = mY - (beta_A[0]*mH + beta_A[1]*mC)

def predict_A(T_next: pd.DataFrame) -> np.ndarray:
    """
    Model A prediction for a frame of new temperatures.

    Uses the notebook-level fit results ``alpha_m`` (one intercept per month)
    and ``beta_A``, plus the thresholds ``tau_h`` / ``tau_c``. The month of
    each row is read from ``T_next.index.month``.

    Returns an array with one row per row of ``T_next`` and one column per
    column of ``T_next``.
    """
    heating = (tau_h - T_next).clip(lower=0)
    cooling = (T_next - tau_c).clip(lower=0)
    hdd_next = heating.to_numpy(np.float32, copy=False)
    cdd_next = cooling.to_numpy(np.float32, copy=False)

    month_pos = T_next.index.month.values - 1
    # Column vector so the monthly intercept broadcasts over all columns
    intercept = alpha_m[month_pos][:, None]
    return intercept + beta_A[0]*hdd_next + beta_A[1]*cdd_next


In [11]:
# In-sample coefficients and pooled R² for Model A (month-only intercepts)
print_coeffs_r2(Yv, HDDv, CDDv, m_idx, beta=beta_A, alpha=alpha_m, label="Model A")


Model A
-------
β_HDD = 0.0281
β_CDD = -0.0032
R²    = 0.0799


### Model B — Month x Individual FE + uncorrelated errors

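cons ~ temp_low + temp_high, where temp_low is HDD = max(τ_h − T, 0) and temp_high is CDD = max(T − τ_c, 0); one intercept per (calendar month, consumer) pair, with a single pair of slopes shared by all consumers.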

In [None]:
# NOTE: this cell repeats, with float32 means, the computation wrapped by
# fit_model_B further down; in a top-to-bottom run the call to fit_model_B
# rebinds alpha_im_B and beta_B.

# 12×nI monthly means per consumer
My = np.vstack([Yv[m_idx==k, :].mean(axis=0)   for k in months]).astype(np.float32)   # (12,nI)
Mh = np.vstack([HDDv[m_idx==k, :].mean(axis=0) for k in months]).astype(np.float32)
Mc = np.vstack([CDDv[m_idx==k, :].mean(axis=0) for k in months]).astype(np.float32)

# Within-(month, consumer) transform
Yw   = Yv   - My[m_idx, :]
HDDw = HDDv - Mh[m_idx, :]
CDDw = CDDv - Mc[m_idx, :]

# Entries of the 2×2 normal equations
Shh = np.einsum('ij,ij->', HDDw, HDDw, optimize=True)
Scc = np.einsum('ij,ij->', CDDw, CDDw, optimize=True)
Shc = np.einsum('ij,ij->', HDDw, CDDw, optimize=True)
Shy = np.einsum('ij,ij->', HDDw, Yw,   optimize=True)
Scy = np.einsum('ij,ij->', CDDw, Yw,   optimize=True)

# Kept for inspection only: (near) zero means a slope is not identified
det = Shh*Scc - Shc*Shc

# Least-squares solve instead of dividing by det, so a regressor without
# within-group variation gives a finite minimum-norm solution (slope 0)
# rather than NaN/inf.
XtX_B = np.array([[Shh, Shc], [Shc, Scc]], dtype=np.float64)
Xty_B = np.array([Shy, Scy], dtype=np.float64)
beta_B = np.linalg.lstsq(XtX_B, Xty_B, rcond=None)[0]

# alpha_{i,m} = mean_y_{i,m} - beta * mean_x_{i,m}
alpha_im_B = (My - (beta_B[0]*Mh + beta_B[1]*Mc)).astype(np.float32)   # (12,nI)

In [31]:
def fit_model_B(Yv, HDDv, CDDv, m_idx):
    """
    Fit Model B: month × consumer intercepts with two pooled slopes.

    The slopes are obtained by within-(month, consumer) OLS: each column is
    demeaned by its own monthly mean and a single [β_HDD, β_CDD] is fitted
    over all columns at once.

    Parameters
    ----------
    Yv, HDDv, CDDv : np.ndarray, shape (T, N)
        Consumption and heating / cooling degree arrays.
    m_idx : np.ndarray, shape (T,)
        Month index (0..11) of each row.

    Returns
    -------
    alpha_im_B : np.ndarray, shape (12, N), float64
        Intercepts α_{i,m}. Rows for months absent from ``m_idx`` are NaN
        (mean of an empty slice).
    beta_B : np.ndarray, shape (2,), float64
        [β_HDD, β_CDD]. If a regressor has no within-group variation its
        slope is not identified and is returned as 0 (minimum-norm solution).
    (My, Mh, Mc) : tuple of np.ndarray, each (12, N), float64
        Untransformed monthly means, returned for later reuse.
    """
    months = np.arange(12)

    def _monthly_means(arr):
        # One row per month, one column per consumer
        return np.vstack([arr[m_idx == k, :].mean(axis=0) for k in months]).astype(np.float64)

    My, Mh, Mc = _monthly_means(Yv), _monthly_means(HDDv), _monthly_means(CDDv)

    # within (i,m) transform
    Yw   = Yv   - My[m_idx, :]
    HDDw = HDDv - Mh[m_idx, :]
    CDDw = CDDv - Mc[m_idx, :]

    # 2×2 normal equations
    Shh = np.einsum('ij,ij->', HDDw, HDDw, optimize=True)
    Scc = np.einsum('ij,ij->', CDDw, CDDw, optimize=True)
    Shc = np.einsum('ij,ij->', HDDw, CDDw, optimize=True)
    Shy = np.einsum('ij,ij->', HDDw, Yw,   optimize=True)
    Scy = np.einsum('ij,ij->', CDDw, Yw,   optimize=True)

    # Solve with lstsq rather than an explicit division by the determinant:
    # a singular system (e.g. CDD identically zero) then yields a finite
    # minimum-norm solution instead of NaN/inf.
    XtX = np.array([[Shh, Shc], [Shc, Scc]], dtype=np.float64)
    Xty = np.array([Shy, Scy], dtype=np.float64)
    beta_B = np.linalg.lstsq(XtX, Xty, rcond=None)[0]

    # intercepts α_{i,m}
    alpha_im_B = (My - (beta_B[0]*Mh + beta_B[1]*Mc)).astype(np.float64)
    return alpha_im_B, beta_B, (My, Mh, Mc)  # return means for later reuse

# Fit Model B on the full sample; MyMhMc keeps the raw monthly means for the GLS refits below
alpha_im_B, beta_B, MyMhMc = fit_model_B(Yv=Yv, HDDv=HDDv, CDDv=CDDv, m_idx=m_idx)

In [32]:
def predict_B(T_next: pd.DataFrame) -> np.ndarray:
    """
    Model B prediction for a frame of new temperatures.

    Reads the notebook-level fit results ``alpha_im_B`` (12 × N intercepts)
    and ``beta_B``, plus the thresholds ``tau_h`` / ``tau_c``.

    Parameters
    ----------
    T_next : pd.DataFrame
        Temperatures; ``T_next.index.month`` selects the intercept row, and
        the columns must line up with the columns of ``alpha_im_B``.

    Returns
    -------
    np.ndarray, shape (len(T_next), N)
    """
    HDDn = (tau_h - T_next).clip(lower=0).to_numpy(np.float32, copy=False)
    CDDn = (T_next - tau_c).clip(lower=0).to_numpy(np.float32, copy=False)
    mn = (T_next.index.month.values - 1)
    # Fixed: this previously read `alpha_im`, a name not defined anywhere in
    # the visible notebook, so the first call raised NameError.
    base = alpha_im_B[mn, :]
    return base + beta_B[0]*HDDn + beta_B[1]*CDDn

In [33]:
# In-sample coefficients and pooled R² for Model B (month × consumer intercepts)
print_coeffs_r2(Yv, HDDv, CDDv, m_idx, beta=beta_B, alpha=alpha_im_B, label="Model B")


Model B
-------
β_HDD = 0.0322
β_CDD = -0.0041
R²    = 0.3346


### Model C — Month x Individual FE + correlated errors

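cons ~ temp_low + temp_high, with the same regressors (HDD, CDD) and month × consumer intercepts as Model B; the residuals are modelled as serially correlated, first with a pooled AR(1)+SAR(24) process and then with a pooled AR(1) process, and the slopes are re-estimated by GLS.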

In [45]:
def fit_ar1_sar24_from_B(Yv, HDDv, CDDv, m_idx, alpha_im_B, beta_B):
    """
    Estimate a pooled two-lag autoregression on the residuals of Model B.

    Residual model: e_t = phi1 * e_{t-1} + phi24 * e_{t-24} + eps_t,
    fitted by least squares (no intercept) over all columns at once.
    Both coefficients are clipped to [-0.98, 0.98].

    Returns
    -------
    (phi1, phi24, sigma_e) : tuple of float
        Lag-1 coefficient, lag-24 coefficient, and the root mean square of
        the fitted innovations eps_t.
    """
    def _inner(a, b):
        # Sum of element-wise products over both axes
        return np.einsum('ij,ij->', a, b)

    fitted_B = alpha_im_B[m_idx, :] + beta_B[0]*HDDv + beta_B[1]*CDDv
    resid = Yv - fitted_B

    # Rows t = 24..T-1 paired with their lag-1 and lag-24 predecessors
    target = resid[24:, :]
    lag1 = resid[23:-1, :]
    lag24 = resid[:-24, :]

    g11 = _inner(lag1, lag1)
    g22 = _inner(lag24, lag24)
    g12 = _inner(lag1, lag24)
    c1 = _inner(lag1, target)
    c24 = _inner(lag24, target)

    # Determinant of the 2×2 Gram matrix; the small offset keeps the
    # division defined when the determinant is exactly zero
    det = g11*g22 - g12*g12 + 1e-12

    bound = 0.98
    phi1 = float(np.clip((c1*g22 - c24*g12)/det, -bound, bound))
    phi24 = float(np.clip((-c1*g12 + c24*g11)/det, -bound, bound))

    one_step = phi1*lag1 + phi24*lag24
    sigma_e = float(np.sqrt(np.mean((target - one_step)**2)))
    return phi1, phi24, sigma_e


# Pooled AR(1)+SAR(24) coefficients and innovation scale from the Model B residuals
phi1, phi24, sigma_e = fit_ar1_sar24_from_B(
    Yv, HDDv, CDDv, m_idx, alpha_im_B=alpha_im_B, beta_B=beta_B
)

In [46]:
def gls_refit_C_ar1_sar24(Yv, HDDv, CDDv, m_idx, phi1, phi24, MyMhMc):
    """
    One-shot GLS refit of beta under AR(1)+SAR(24) residuals.

    Transform:
      y*_t   = y_t   - phi1*y_{t-1}   - phi24*y_{t-24}
      X*_t   = X_t   - phi1*X_{t-1}   - phi24*X_{t-24}
    Drop first 24 rows, then within-(i,m) OLS on transformed arrays, where
    each transformed row keeps the month index of its own time t.

    Parameters
    ----------
    Yv, HDDv, CDDv : np.ndarray, shape (T, N)
    m_idx : np.ndarray, shape (T,)
        Month index (0..11) of each row.
    phi1, phi24 : float
        Lag-1 and lag-24 residual coefficients.
    MyMhMc : tuple of three (12, N) arrays
        Untransformed monthly means (My, Mh, Mc), used only for the intercepts.

    Returns:
      alpha_im_C : (12, N) month×individual intercepts (on original scale)
      beta_C     : (2,)    [β_HDD, β_CDD]; a slope whose transformed regressor
                   has no within-group variation is returned as 0
                   (minimum-norm solution) instead of NaN/inf.
    """
    My, Mh, Mc = MyMhMc
    months = np.arange(12)
    m_idx_co = m_idx[24:]

    def _quasi_diff(arr):
        # a_t - phi1*a_{t-1} - phi24*a_{t-24}, defined for t >= 24
        return arr[24:, :] - phi1*arr[23:-1, :] - phi24*arr[:-24, :]

    def _within(arr):
        # Subtract each column's monthly mean of the transformed series
        means = np.vstack([arr[m_idx_co == k, :].mean(axis=0) for k in months])
        return arr - means[m_idx_co, :]

    ycw   = _within(_quasi_diff(Yv))
    hdcow = _within(_quasi_diff(HDDv))
    cdcow = _within(_quasi_diff(CDDv))

    Shh = np.einsum('ij,ij->', hdcow, hdcow, optimize=True)
    Scc = np.einsum('ij,ij->', cdcow, cdcow, optimize=True)
    Shc = np.einsum('ij,ij->', hdcow, cdcow, optimize=True)
    Shy = np.einsum('ij,ij->', hdcow, ycw,   optimize=True)
    Scy = np.einsum('ij,ij->', cdcow, ycw,   optimize=True)

    # lstsq instead of an explicit division by the determinant, so a singular
    # 2×2 system (e.g. CDD identically zero) does not produce NaN/inf.
    XtX = np.array([[Shh, Shc], [Shc, Scc]], dtype=np.float64)
    Xty = np.array([Shy, Scy], dtype=np.float64)
    beta_C = np.linalg.lstsq(XtX, Xty, rcond=None)[0]

    # Recompute α_{i,m} on *untransformed* monthly means
    alpha_im_C = (My - (beta_C[0]*Mh + beta_C[1]*Mc)).astype(np.float64)
    return alpha_im_C, beta_C

# GLS refit of the slopes under the AR(1)+SAR(24) residual model fitted above
alpha_im_C, beta_C = gls_refit_C_ar1_sar24(
    Yv, HDDv, CDDv, m_idx, phi1=phi1, phi24=phi24, MyMhMc=MyMhMc
)

In [47]:
def predict_dynamic_one_step_ar1_sar24(Yv, mu, phi1, phi24):
    """
    One-step-ahead forecast that corrects the mean with realized residuals:

      ŷ_t = μ_t + φ1 * e_{t-1} + φ24 * e_{t-24},  with e_t = y_t - μ_t

    A lag term is only added on rows where that lag exists (row 0 gets no
    correction, rows 1..23 only the lag-1 term). Because the observed
    residuals are used, this is meant for evaluation only. ``mu`` is not
    modified; a new array is returned.
    """
    resid = Yv - mu
    forecast = mu.copy()
    # Add the lag-1 term first, then the lag-24 term
    for lag, coef in ((1, phi1), (24, phi24)):
        forecast[lag:, :] += coef * resid[:-lag, :]
    return forecast

In [48]:
def print_coeffs_and_forecast_metrics_ar1_sar24(
    Yv, HDDv, CDDv, m_idx, beta, alpha_im, phi1=None, phi24=None,
    label="Model C (AR(1)+SAR(24))"
):
    """
    Print β, the static (mean-only) R²/RMSE, and, when both φ1 and φ24 are
    given, the R²/RMSE of the one-step forecast that adds the
    AR(1)+SAR(24) correction built from realized residuals.

    Both R² values share the same denominator: the total sum of squares of
    ``Yv`` around its grand mean. RMSE divides by ``Yv.size``.
    Nothing is returned; the report goes to stdout.
    """
    n_obs = Yv.size

    def _sse(pred):
        # Sum of squared errors of a prediction array against Yv
        return float(np.sum((Yv - pred)**2))

    mu = alpha_im[m_idx, :] + beta[0]*HDDv + beta[1]*CDDv

    # Static (mean) fit
    sse_static = _sse(mu)
    total_ss = float(np.sum((Yv - Yv.mean())**2))
    r2_static = 1.0 - sse_static/total_ss
    rmse_static = float(np.sqrt(sse_static / n_obs))

    print(f"\n{label}")
    print("-"*len(label))
    print(f"β_HDD = {beta[0]:.4f}")
    print(f"β_CDD = {beta[1]:.4f}")
    print("Static (mean) fit:")
    print(f"  R²   = {r2_static:.4f}")
    print(f"  RMSE = {rmse_static:.4f}")

    # The dynamic section needs both coefficients
    if phi1 is None or phi24 is None:
        return

    # Dynamic one-step (evaluation)
    sse_dynamic = _sse(predict_dynamic_one_step_ar1_sar24(Yv, mu, phi1, phi24))
    r2_dynamic = 1.0 - sse_dynamic/total_ss
    rmse_dynamic = float(np.sqrt(sse_dynamic / n_obs))
    print("\nDynamic one-step forecast (AR(1)+SAR(24)):")
    print(f"  φ1   = {phi1:.4f}")
    print(f"  φ24  = {phi24:.4f}")
    print(f"  R²   = {r2_dynamic:.4f}")
    print(f"  RMSE = {rmse_dynamic:.4f}")

In [50]:
# Report for Model C with AR(1)+SAR(24) residual dynamics.
# beta_C / alpha_im_C are whatever is currently bound to those names; run this
# right after the AR(1)+SAR(24) GLS refit cell for the matching numbers.
print_coeffs_and_forecast_metrics_ar1_sar24(
    Yv, HDDv, CDDv, m_idx,
    beta=beta_C,
    alpha_im=alpha_im_C,
    phi1=phi1,
    phi24=phi24,
    label="Model C (AR(1)+SAR(24))",
)


Model C (AR(1)+SAR(24))
-----------------------
β_HDD = 0.0117
β_CDD = -0.0005
Static (mean) fit:
  R²   = 0.3294
  RMSE = 0.7377

Dynamic one-step forecast (AR(1)+SAR(24)):
  φ1   = 0.2494
  φ24  = 0.5644
  R²   = 0.6421
  RMSE = 0.5390


In [51]:
def fit_pooled_ar1_from_B(Yv, HDDv, CDDv, m_idx, alpha_im_B, beta_B):
    """
    Estimate one AR(1) coefficient, pooled over all columns, on the
    residuals of Model B.

    ρ is the no-intercept least-squares slope of e_t on e_{t-1}, clipped to
    [-0.98, 0.98]; a small constant in the denominator keeps the ratio
    defined when the lagged residuals are all zero.

    Returns
    -------
    (rho, sigma_e) : tuple of float
        The AR(1) coefficient and the root mean square of the innovations
        e_t - rho * e_{t-1}.
    """
    fitted_B = alpha_im_B[m_idx, :] + beta_B[0]*HDDv + beta_B[1]*CDDv
    resid = Yv - fitted_B
    current, previous = resid[1:, :], resid[:-1, :]

    # pooled ρ
    cross = np.einsum('ij,ij->', current, previous)
    lag_energy = np.einsum('ij,ij->', previous, previous) + 1e-12
    rho = float(np.clip(cross/lag_energy, -0.98, 0.98))

    # innovation std
    innovations = current - rho*previous
    sigma_e = float(np.sqrt(np.mean(innovations**2)))
    return rho, sigma_e
# Pooled AR(1) fit on the Model B residuals.
# NOTE: this rebinds `sigma_e`, which the AR(1)+SAR(24) fit cell also assigns.
rho, sigma_e = fit_pooled_ar1_from_B(
    Yv, HDDv, CDDv, m_idx, alpha_im_B=alpha_im_B, beta_B=beta_B
)

In [52]:
def gls_refit_C(Yv, HDDv, CDDv, m_idx, rho, MyMhMc):
    """
    One-shot GLS refit of beta under pooled AR(1) residuals.

    Applies the Cochrane–Orcutt transform a_t - rho*a_{t-1} to the response
    and both regressors (dropping the first row), then runs
    within-(month, consumer) OLS on the transformed arrays. Each transformed
    row keeps the month index of its own time t.

    Parameters
    ----------
    Yv, HDDv, CDDv : np.ndarray, shape (T, N)
    m_idx : np.ndarray, shape (T,)
        Month index (0..11) of each row.
    rho : float
        AR(1) coefficient of the residuals.
    MyMhMc : tuple of three (12, N) arrays
        Untransformed monthly means (My, Mh, Mc), used only for the intercepts.

    Returns
    -------
    alpha_im_C : np.ndarray, shape (12, N), float64
        Intercepts on the original (untransformed) scale.
    beta_C : np.ndarray, shape (2,), float64
        [β_HDD, β_CDD]; a slope whose transformed regressor has no
        within-group variation is returned as 0 (minimum-norm solution)
        instead of NaN/inf.
    """
    My, Mh, Mc = MyMhMc
    months = np.arange(12)
    m_idx_co = m_idx[1:]

    def _cochrane_orcutt(arr):
        # a_t - rho*a_{t-1}, defined for t >= 1
        return arr[1:, :] - rho*arr[:-1, :]

    def _within(arr):
        # Subtract each column's monthly mean of the transformed series
        means = np.vstack([arr[m_idx_co == k, :].mean(axis=0) for k in months])
        return arr - means[m_idx_co, :]

    Yco_w   = _within(_cochrane_orcutt(Yv))
    HDDco_w = _within(_cochrane_orcutt(HDDv))
    CDDco_w = _within(_cochrane_orcutt(CDDv))

    # 2×2 GLS normal equations
    Shh = np.einsum('ij,ij->', HDDco_w, HDDco_w, optimize=True)
    Scc = np.einsum('ij,ij->', CDDco_w, CDDco_w, optimize=True)
    Shc = np.einsum('ij,ij->', HDDco_w, CDDco_w, optimize=True)
    Shy = np.einsum('ij,ij->', HDDco_w, Yco_w,   optimize=True)
    Scy = np.einsum('ij,ij->', CDDco_w, Yco_w,   optimize=True)

    # lstsq instead of an explicit division by the determinant, so a singular
    # 2×2 system (e.g. CDD identically zero) does not produce NaN/inf.
    XtX = np.array([[Shh, Shc], [Shc, Scc]], dtype=np.float64)
    Xty = np.array([Shy, Scy], dtype=np.float64)
    beta_C = np.linalg.lstsq(XtX, Xty, rcond=None)[0]

    # α_{i,m} recomputed on *untransformed* monthly means
    alpha_im_C = (My - (beta_C[0]*Mh + beta_C[1]*Mc)).astype(np.float64)
    return alpha_im_C, beta_C
# GLS refit under the pooled AR(1) residual model.
# NOTE: this rebinds `alpha_im_C` and `beta_C`, which the AR(1)+SAR(24) refit
# cell also assigns; cells run after this one see the AR(1) version.
alpha_im_C, beta_C = gls_refit_C(Yv, HDDv, CDDv, m_idx, rho=rho, MyMhMc=MyMhMc)

In [53]:
def predict_mean(HDDv, CDDv, m_idx, beta, alpha_im):
    """Static mean: the row's monthly intercepts plus beta[0]*HDDv + beta[1]*CDDv."""
    intercepts = alpha_im[m_idx, :]
    return intercepts + beta[0]*HDDv + beta[1]*CDDv

def predict_dynamic_one_step(Yv, mean_static, rho):
    """
    One-step AR(1) forecast: ŷ_t = μ_t + ρ * (y_{t-1} - μ_{t-1}).

    Row 0 has no predecessor and keeps the static mean. ``mean_static`` is
    not modified; a new array is returned.
    """
    lagged_resid = Yv[:-1, :] - mean_static[:-1, :]
    forecast = mean_static.copy()
    forecast[1:, :] += rho * lagged_resid
    return forecast


In [55]:
def print_coeffs_and_forecast_metrics_pooledAR1(
    Yv, HDDv, CDDv, m_idx, beta, alpha_im, rho=None, label="Model"
):
    """
    Print the slope coefficients, the static-fit R²/RMSE and, when ``rho``
    is given, the R²/RMSE of the one-step forecast with a pooled AR(1)
    residual correction.

    Both R² values use the same denominator (total sum of squares of ``Yv``
    around its grand mean); RMSE divides by ``Yv.size``. Nothing is
    returned; the report goes to stdout.
    """
    n_obs = Yv.size

    def _sse(pred):
        # Sum of squared errors of a prediction array against Yv
        return float(np.sum((Yv - pred)**2))

    # --- Static mean prediction ---
    mean_static = predict_mean(HDDv, CDDv, m_idx, beta, alpha_im)
    sse_static = _sse(mean_static)
    total_ss = float(np.sum((Yv - Yv.mean())**2))
    r2_static = 1.0 - sse_static/total_ss
    rmse_static = float(np.sqrt(sse_static / n_obs))

    # --- Dynamic one-step-ahead prediction (falls back to the static mean) ---
    has_rho = rho is not None
    forecast = predict_dynamic_one_step(Yv, mean_static, rho) if has_rho else mean_static
    sse_dynamic = _sse(forecast)
    r2_dynamic = 1.0 - sse_dynamic/total_ss
    rmse_dynamic = float(np.sqrt(sse_dynamic / n_obs))

    # --- Print summary ---
    print(f"\n{label}")
    print("-"*len(label))
    print(f"β_HDD = {beta[0]:.4f}")
    print(f"β_CDD = {beta[1]:.4f}")

    print("\nStatic (mean) fit:")
    print(f"  R²   = {r2_static:.4f}")
    print(f"  RMSE = {rmse_static:.4f}")

    if has_rho:
        print("\nDynamic one-step forecast (pooled AR1):")
        print(f"  ρ    = {rho:.4f}")
        print(f"  R²   = {r2_dynamic:.4f}")
        print(f"  RMSE = {rmse_dynamic:.4f}")


# Report for Model C with the pooled AR(1) residual correction
print_coeffs_and_forecast_metrics_pooledAR1(
    Yv, HDDv, CDDv, m_idx,
    beta=beta_C,
    alpha_im=alpha_im_C,
    rho=rho,
    label="Model C",
)


Model C
-------
β_HDD = 0.0312
β_CDD = -0.0042

Static (mean) fit:
  R²   = 0.3346
  RMSE = 0.7349

Dynamic one-step forecast (pooled AR1):
  ρ    = 0.4117
  R²   = 0.4474
  RMSE = 0.6697


### Diagnostics 

In [56]:
# Per-column R² of the static mean built from the current alpha_im_C / beta_C
# (in a top-to-bottom run these come from the pooled-AR(1) GLS refit).
mu = alpha_im_C[m_idx, :] + beta_C[0]*HDDv + beta_C[1]*CDDv

# Residual and total sums of squares, one value per column;
# the total uses each column's own mean
num = ((Yv - mu)**2).sum(axis=0)
den = ((Yv - Yv.mean(axis=0, keepdims=True))**2).sum(axis=0)
r2_i = 1 - num/den

# 10th / 50th / 90th percentile across columns, ignoring NaNs
np.nanquantile(r2_i, q=[0.1, 0.5, 0.9])


array([-0.01868965,  0.11543623,  0.49912783])