In [1]:
# Voeg src/ toe aan importpad
import sys
from pathlib import Path
project_root = Path.cwd().parent if Path.cwd().name == "tests" else Path.cwd()
src_path = project_root / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
import numpy as np
import sqlite3

from models.naive_model import run_naive_model, evaluate_rmse
from models.sarimax_model import run_sarimax

In [2]:
# DB connectie en master_warp inladen
db_path = project_root / "src" / "data" / "WARP.db"
conn = sqlite3.connect(db_path)
df = pd.read_sql("SELECT * FROM master_warp", conn)
conn.close()

# Zet datetime index
df["target_datetime"] = pd.to_datetime(df["target_datetime"], utc=True)
df = df.sort_values("target_datetime").set_index("target_datetime")

# Targets en features
target_col = "Price"
feature_cols = ["Total_Flow", "temperature_2m", "Solar_Vol"]
horizon = 168

# Opschonen en splitsen
y = df[target_col].dropna()
y = y[~y.index.duplicated()]
X = df[feature_cols].loc[y.index].dropna()

In [3]:
# 📅 Handmatige datums instellen
train_start = pd.Timestamp("2025-01-01 00:00:00", tz="UTC")
train_end = pd.Timestamp("2025-03-14 23:00:00", tz="UTC")
test_start = pd.Timestamp("2025-03-15 00:00:00", tz="UTC")
test_end = pd.Timestamp("2025-04-14 23:00:00", tz="UTC")
fh_start = pd.Timestamp("2025-03-15 00:00:00", tz="UTC")  # VASTZETTEN

# Horizon (7 dagen vooruit)
horizon = 168
fh = pd.date_range(start=fh_start, periods=horizon, freq="h")

# SARIMAX splits
y_train_sarimax = y.loc[train_start:train_end]
X_train_sarimax = X.loc[train_start:train_end]
X_test_sarimax = X.loc[fh]

# Naive splits
y_train_naive = y.loc[train_end - pd.Timedelta(hours=167):train_end]
y_test = y.loc[fh]

In [4]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sarimax), index=X_train_sarimax.index, columns=X_train_sarimax.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_sarimax), index=X_test_sarimax.index, columns=X_test_sarimax.columns)

In [11]:
# ------------------------------
# 🟡 Naive model
# ------------------------------
naive_preds = run_naive_model(y, lag=horizon).loc[fh]
rmse_naive = root_mean_squared_error(y.loc[fh], naive_preds)

# ------------------------------
# 🔵 SARIMAX zonder features
# ------------------------------
sarimax_preds_nofeatures, rmse_sarimax_nf = run_sarimax(
    y_train_sarimax,
    X_train=None,
    X_test=pd.DataFrame(index=fh),
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 24)
)

# ------------------------------
# 🔵 SARIMAX met geschaalde exogene features
# ------------------------------
sarimax_preds_with_features, rmse_sarimax_feat = run_sarimax(
    y_train_sarimax,
    X_train_scaled,
    X_test_scaled,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 24)
)

# ------------------------------
# 📊 Toon resultaten
# ------------------------------
print(f"🟡 Naive RMSE:          {rmse_naive:.3f}")
print(f"🔵 SARIMAX (no exog):   {rmse_sarimax_nf:.3f}")
print(f"🟢 SARIMAX (with exog): {rmse_sarimax_feat:.3f}")

2025-05-16 10:44:38,134 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)

No frequency information was provided, so inferred frequency h will be used.


No frequency information was provided, so inferred frequency h will be used.

2025-05-16 10:44:44,328 - sarimax - INFO - 📊 RMSE: 0.03
2025-05-16 10:44:44,341 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)

No frequency information was provided, so inferred frequency h will be used.


No frequency information was provided, so inferred frequency h will be used.


Maximum Likelihood optimization failed to converge. Check mle_retvals

2025-05-16 10:45:06,463 - sarimax - INFO - 📊 RMSE: 0.03


🟡 Naive RMSE:          0.048
🔵 SARIMAX (no exog):   0.028
🟢 SARIMAX (with exog): 0.028


In [6]:
# 📈 Interactieve plot zonder Test
fig = go.Figure()

# Trainingset
fig.add_trace(go.Scatter(
    x=y_train_sarimax.index,
    y=y_train_sarimax.values,
    mode="lines",
    name="Train",
    line=dict(color="lightgray", width=2)
))

# Werkelijke waarden tijdens forecast
fig.add_trace(go.Scatter(
    x=y.loc[fh].index,
    y=y.loc[fh].values,
    mode="lines",
    name="Actual",
    line=dict(color="black", width=2)
))

# Naive forecast
fig.add_trace(go.Scatter(
    x=naive_preds.index,
    y=naive_preds.values,
    mode="lines",
    name="Naive",
    line=dict(color="orange", dash="dash")
))

# SARIMAX zonder features
fig.add_trace(go.Scatter(
    x=fh,
    y=sarimax_preds_nofeatures,
    mode="lines",
    name="SARIMAX (no exog)",
    line=dict(color="steelblue", dash="dot")
))

# SARIMAX met features
fig.add_trace(go.Scatter(
    x=fh,
    y=sarimax_preds_with_features,
    mode="lines",
    name="SARIMAX + features",
    line=dict(color="forestgreen", dash="dashdot")
))

# Layout
fig.update_layout(
    template="plotly_white",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=40, b=40),
    xaxis_title="Tijd (UTC)",
    yaxis_title="Prijs",
    hovermode="x unified"
)

fig.show()

In [7]:
day_rmse_data = []
for model_name, preds in {
    "Naive": naive_preds,
    "SARIMAX (no exog)": sarimax_preds_nofeatures,
    "SARIMAX + features": sarimax_preds_with_features
}.items():
    df_day = y.loc[fh].to_frame("actual").join(preds.rename("pred"))
    df_day["date"] = df_day.index.date
    daily_rmse = df_day.groupby("date").apply(lambda x: root_mean_squared_error(x["actual"], x["pred"]))
    day_rmse_data.append(daily_rmse.round(3).rename(model_name))

rmse_day_df = pd.concat(day_rmse_data, axis=1)
display(HTML("<h3>📅 RMSE per dag</h3>"))
display(rmse_day_df)









Unnamed: 0_level_0,Naive,SARIMAX (no exog),SARIMAX + features
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-03-15,0.029,0.025,0.024
2025-03-16,0.014,0.048,0.048
2025-03-17,0.028,0.029,0.028
2025-03-18,0.049,0.04,0.038
2025-03-19,0.046,0.041,0.04
2025-03-20,0.051,0.041,0.041
2025-03-21,0.069,0.064,0.056


In [8]:
rmse_full_hourly_df = pd.DataFrame(index=fh)
for model_name, preds in {
    "Naive": naive_preds,
    "SARIMAX (no exog)": sarimax_preds_nofeatures,
    "SARIMAX + features": sarimax_preds_with_features
}.items():
    actual = y.loc[fh]
    rmse = ((actual - preds) ** 2) ** 0.5
    rmse_full_hourly_df[model_name] = rmse.round(3)

display(HTML("<h3>🕒 RMSE per tijdstip (alle 168 uur)</h3>"))
display(rmse_full_hourly_df)

Unnamed: 0,Naive,SARIMAX (no exog),SARIMAX + features
2025-03-15 00:00:00+00:00,0.001,0.004,0.004
2025-03-15 01:00:00+00:00,0.009,0.009,0.009
2025-03-15 02:00:00+00:00,0.014,0.013,0.012
2025-03-15 03:00:00+00:00,0.013,0.015,0.013
2025-03-15 04:00:00+00:00,0.018,0.027,0.025
...,...,...,...
2025-03-21 19:00:00+00:00,0.066,0.077,0.069
2025-03-21 20:00:00+00:00,0.085,0.089,0.080
2025-03-21 21:00:00+00:00,0.084,0.089,0.079
2025-03-21 22:00:00+00:00,0.085,0.080,0.070


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from models.naive_model import run_naive_model
from models.sarimax_model import run_sarimax
from IPython.display import display
import numpy as np

def evaluate_rmse(y_true: pd.Series, y_pred: pd.Series) -> float:
    mask = y_true.index.intersection(y_pred.dropna().index)
    return np.sqrt(mean_squared_error(y_true.loc[mask], y_pred.loc[mask]))

# Sliding window validatie
loop_days = 5
horizon = 168
results = []

for i in range(loop_days):
    delta = pd.Timedelta(days=i)
    t_start = train_start + delta
    t_end = train_end + delta
    fh_start = t_end + pd.Timedelta(hours=1)
    fh = pd.date_range(start=fh_start, periods=horizon, freq="h")

    try:
        # Naive model
        y_train_naive = y.loc[t_end - pd.Timedelta(hours=167):t_end]
        naive_preds = run_naive_model(y_train_naive)
        rmse_naive = evaluate_rmse(y.loc[fh], naive_preds.loc[fh])

        # SARIMAX met features
        y_train_sarimax = y.loc[t_start:t_end]
        X_train_sarimax = X.loc[t_start:t_end]
        X_test_sarimax = X.loc[fh]

        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sarimax), index=X_train_sarimax.index, columns=X_train_sarimax.columns)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test_sarimax), index=X_test_sarimax.index, columns=X_test_sarimax.columns)

        sarimax_preds, _ = run_sarimax(y_train_sarimax, X_train_scaled, X_test_scaled, order=(1, 1, 1), seasonal_order=(1, 1, 1, 24))
        rmse_sarimax = evaluate_rmse(y.loc[fh], sarimax_preds)

        results.append({
            "start": str(t_start.date()),
            "end": str(t_end.date()),
            "rmse_naive": round(rmse_naive, 3),
            "rmse_sarimax": round(rmse_sarimax, 3)
        })

    except Exception as e:
        print(f"⚠️ Iteratie {i} overgeslagen: {e}")

df_results = pd.DataFrame(results)
display(df_results)

⚠️ Iteratie 0 overgeslagen: Test set is too small for lag 168. Minimum required rows: 169, found: 168
⚠️ Iteratie 1 overgeslagen: Test set is too small for lag 168. Minimum required rows: 169, found: 168
⚠️ Iteratie 2 overgeslagen: Test set is too small for lag 168. Minimum required rows: 169, found: 168
⚠️ Iteratie 3 overgeslagen: Test set is too small for lag 168. Minimum required rows: 169, found: 168
⚠️ Iteratie 4 overgeslagen: Test set is too small for lag 168. Minimum required rows: 169, found: 168
