In [1]:
from gdc.data_access import *
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import r2_score, mean_squared_error
from arch import arch_model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
# Preview: first rows of the load frame, restricted to the columns labelled 0, 1 and 2.
df_load_simulated_normalized.head()[[0, 1, 2]]

Unnamed: 0,0,1,2
2023-01-01 00:00:00,0.308875,0.241084,0.435789
2023-01-01 01:00:00,0.325552,0.246717,0.424862
2023-01-01 02:00:00,3.468337,0.245007,2.300304
2023-01-01 03:00:00,2.21584,1.272533,2.411101
2023-01-01 04:00:00,1.259481,1.027133,0.483457


In [5]:
# Preview: first rows of the temperature frame, restricted to the columns labelled 0, 1 and 2.
df_temp_simulated_normalized.head()[[0, 1, 2]]

Unnamed: 0,0,1,2
2023-01-01 00:00:00,13.021741,7.970941,7.970941
2023-01-01 01:00:00,12.621741,7.970941,7.970941
2023-01-01 02:00:00,12.621741,8.070941,8.070941
2023-01-01 03:00:00,12.421742,8.070941,8.070941
2023-01-01 04:00:00,12.321742,7.870941,7.870941


In [6]:
# Short aliases for the two input frames: Y is the load frame, T the temperature frame.
Y, T = df_load_simulated_normalized, df_temp_simulated_normalized

In [7]:
# Degree-day thresholds, expressed in the same units as T:
# HDD measures how far T falls below tau_h, CDD how far it rises above tau_c.
tau_h = 15.0
tau_c = 20.0

# One-sided temperature gaps, floored at zero, computed element-wise over T.
HDD = (tau_h - T).clip(lower=0)
CDD = (T - tau_c).clip(lower=0)

# Panel dimensions and calendar-month bookkeeping derived from Y's index.
nT, nI = Y.shape
months = np.arange(12)
month = Y.index.month.values              # calendar month of each row (1..12), shape (nT,)
m_idx = month - 1                         # zero-based month position, 0..11

# float32 ndarray versions of the three frames (smaller footprint than float64);
# copy=False asks pandas to avoid a copy when none is needed.
Yv, HDDv, CDDv = (
    frame.to_numpy(dtype=np.float32, copy=False) for frame in (Y, HDD, CDD)
)

In [10]:

def print_coeffs_r2(Yv, HDDv, CDDv, m_idx, beta, alpha, label="Model"):
    """
    Print the two slope coefficients and the pooled R² of a fitted model.

    The fitted values are ``alpha[month] + beta[0]*HDDv + beta[1]*CDDv``.
    R² is computed over every entry of ``Yv`` at once, with the grand mean
    of ``Yv`` as the baseline: ``1 - SSR/SST``.

    Parameters
    ----------
    Yv : np.ndarray, shape (T, N)
        Actual consumption
    HDDv, CDDv : np.ndarray, shape (T, N)
        Heating and cooling degree arrays
    m_idx : np.ndarray, shape (T,)
        Month index (0..11) of each row, used to look up the intercept
    beta : array-like, length 2
        Coefficients [β_HDD, β_CDD]
    alpha : np.ndarray, shape (12, N) or (12,)
        Intercepts, either by (month, consumer) or month only
    label : str
        Model label to print

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    # A 1-D alpha holds one intercept per month and is broadcast across columns;
    # a 2-D alpha already carries one column per consumer.
    intercepts = alpha[m_idx, None] if alpha.ndim == 1 else alpha[m_idx, :]
    fitted = intercepts + beta[0]*HDDv + beta[1]*CDDv

    residual_ss = np.sum((Yv - fitted)**2)
    total_ss = np.sum((Yv - Yv.mean())**2)
    r2 = 1 - residual_ss/total_ss

    report = (
        f"\n{label}",
        "-" * len(label),
        f"β_HDD = {beta[0]:.4f}",
        f"β_CDD = {beta[1]:.4f}",
        f"R²    = {r2:.4f}",
    )
    print("\n".join(report))


### Model A — Month fixed effects (pooled across consumers) + i.i.d. errors

In [9]:
# Model A: pooled least squares of load on HDD/CDD with one intercept per month,
#   y[t, i] = alpha[month(t)] + b1 * HDD[t, i] + b2 * CDD[t, i] + e[t, i].
# Fitted via the within transformation: subtract the month means from every
# array, regress demeaned y on demeaned HDD/CDD, then back out the intercepts.

# Row masks per month, built once and shared by the three mean computations.
_month_masks = [m_idx == k for k in months]


def _month_means(values):
    """Per-month mean of `values`, pooled over all rows of the month and all columns."""
    # Accumulate in float64 for accuracy, but store as float32 so that subtracting
    # the means from the float32 data arrays does not upcast them.
    # A month with no rows yields NaN (mean of an empty selection).
    return np.array([values[mask].mean(dtype=np.float64) for mask in _month_masks],
                    dtype=np.float32)


def _sum_of_products(a, b):
    """Return sum(a * b) over all elements of two 2-D arrays, accumulated in float64."""
    # The inputs are float32; without dtype= the sum would be accumulated in
    # float32 and the 2x2 solve below would inherit that rounding error.
    # optimize= is deliberately not passed: it brings nothing for a two-operand
    # contraction and may route through a path that does not apply dtype=.
    return np.float64(np.einsum('ij,ij->', a, b, dtype=np.float64))


# Month means over ALL consumers & times in that month
mY = _month_means(Yv)
mH = _month_means(HDDv)
mC = _month_means(CDDv)

# Demeaned arrays (month mean broadcast across consumers)
Yw   = Yv   - mY[m_idx, None]
HDDw = HDDv - mH[m_idx, None]
CDDw = CDDv - mC[m_idx, None]

# Sufficient stats for beta = argmin ||Yw - HDDw*b1 - CDDw*b2||, all in float64
Shh = _sum_of_products(HDDw, HDDw)
Scc = _sum_of_products(CDDw, CDDw)
Shc = _sum_of_products(HDDw, CDDw)
Shy = _sum_of_products(HDDw, Yw)
Scy = _sum_of_products(CDDw, Yw)

# Closed-form solution of the 2x2 normal equations (Cramer's rule).
# NOTE: det is 0 when a demeaned regressor is identically zero or the two are
# collinear; the division then produces non-finite coefficients.
det = Shh*Scc - Shc*Shc
beta_A = np.array([( Shy*Scc - Scy*Shc)/det,
                   (-Shy*Shc + Scy*Shh)/det], dtype=np.float64)

# Month intercepts: alpha_m = mean_y_m - beta' mean_x_m
alpha_m = mY - (beta_A[0]*mH + beta_A[1]*mC)

def predict_A(T_next: pd.DataFrame) -> np.ndarray:
    """
    Model A prediction for a frame of temperatures.

    Parameters
    ----------
    T_next : pd.DataFrame
        Temperatures; its index must expose ``.month``. Rows are time steps
        and every column is predicted with the same coefficients.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``T_next``: the month intercept from
        ``alpha_m`` plus ``beta_A[0]*HDD + beta_A[1]*CDD``, where HDD/CDD are
        built from ``T_next`` with the notebook-level ``tau_h``/``tau_c``.
    """
    heating = (tau_h - T_next).clip(lower=0).to_numpy(np.float32, copy=False)
    cooling = (T_next - tau_c).clip(lower=0).to_numpy(np.float32, copy=False)
    month_pos = T_next.index.month.values - 1          # zero-based month of each row
    # One intercept per row, as a column so it broadcasts across the columns of T_next.
    intercept = np.expand_dims(alpha_m[month_pos], axis=1)
    return intercept + beta_A[0]*heating + beta_A[1]*cooling


In [11]:
# In-sample summary of Model A: slopes and pooled R² on the data it was fitted to.
print_coeffs_r2(Yv, HDDv, CDDv, m_idx, beta_A, alpha_m, "Model A")


Model A
-------
β_HDD = 0.0281
β_CDD = -0.0032
R²    = 0.0799
