In [1]:
# Voeg src/ toe aan importpad
import sys
from pathlib import Path
project_root = Path.cwd().parent if Path.cwd().name == "tests" else Path.cwd()
src_path = project_root / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
import numpy as np
import sqlite3

from models.naive_model import run_naive_model, evaluate_rmse
from models.sarimax_model import run_sarimax

In [2]:
# DB connectie en master_warp inladen
db_path = project_root / "src" / "data" / "WARP.db"
conn = sqlite3.connect(db_path)
df = pd.read_sql("SELECT * FROM master_warp", conn)
conn.close()

# Print kolomnamen als lijst
print("Kolomnamen in df:", df.columns.tolist())

# Zet datetime index
df["target_datetime"] = pd.to_datetime(df["target_datetime"], utc=True)
df = df.sort_values("target_datetime").set_index("target_datetime")

# Targets en features
target_col = "Price"
feature_cols = ["Total_Flow", "temperature_2m", "Solar_Vol"]
horizon = 168

# Opschonen en splitsen
y = df[target_col].dropna()
y = y[~y.index.duplicated()]
X = df[feature_cols].loc[y.index].dropna()


Kolomnamen in df: ['hour', 'day_of_week', 'month', 'day_of_year', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'yearday_sin', 'yearday_cos', 'is_holiday', 'is_weekend', 'is_non_working_day', 'target_datetime', 'Load', 'Price', 'Flow_BE_to_NL', 'Flow_NL_to_BE', 'Flow_DE_to_NL', 'Flow_NL_to_DE', 'Flow_GB_to_NL', 'Flow_NL_to_GB', 'Flow_DK_to_NL', 'Flow_NL_to_DK', 'Flow_NO_to_NL', 'Flow_NL_to_NO', 'Flow_BE', 'Flow_DE', 'Flow_GB', 'Flow_DK', 'Flow_NO', 'Total_Flow', 'temperature_2m', 'wind_speed_10m', 'apparent_temperature', 'cloud_cover', 'snowfall', 'diffuse_radiation', 'direct_normal_irradiance', 'shortwave_radiation', 'Wind_Vol', 'WindOffshore_Vol', 'Solar_Vol', 'Nuclear_Vol']


In [3]:
# 📅 Handmatige datums instellen
train_start = pd.Timestamp("2025-01-01 00:00:00", tz="UTC")
train_end = pd.Timestamp("2025-03-14 23:00:00", tz="UTC")
test_start = pd.Timestamp("2025-03-15 00:00:00", tz="UTC")
test_end = pd.Timestamp("2025-04-14 23:00:00", tz="UTC")
fh_start = pd.Timestamp("2025-03-15 00:00:00", tz="UTC")  # VASTZETTEN

# Horizon (7 dagen vooruit)
horizon = 168
fh = pd.date_range(start=fh_start, periods=horizon, freq="h")

# SARIMAX splits
y_train_sarimax = y.loc[train_start:train_end]
X_train_sarimax = X.loc[train_start:train_end]
X_test_sarimax = X.loc[fh]

# Naive splits
y_train_naive = y.loc[train_end - pd.Timedelta(hours=167):train_end]
y_test = y.loc[fh]

In [4]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sarimax), index=X_train_sarimax.index, columns=X_train_sarimax.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_sarimax), index=X_test_sarimax.index, columns=X_test_sarimax.columns)

In [5]:
# ------------------------------
# 🟡 Naive model
# ------------------------------
naive_preds = run_naive_model(y, lag=horizon).loc[fh]
rmse_naive = root_mean_squared_error(y.loc[fh], naive_preds)

# ------------------------------
# 🔵 SARIMAX zonder features
# ------------------------------
sarimax_preds_nofeatures, rmse_sarimax_nf = run_sarimax(
    y_train_sarimax,
    X_train=None,
    X_test=pd.DataFrame(index=fh),
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 24)
)

# ------------------------------
# 🔵 SARIMAX met geschaalde exogene features
# ------------------------------
sarimax_preds_with_features, rmse_sarimax_feat = run_sarimax(
    y_train_sarimax,
    X_train_scaled,
    X_test_scaled,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 24)
)

# ------------------------------
# 📊 Toon resultaten
# ------------------------------
print(f"🟡 Naive RMSE:          {rmse_naive:.3f}")
print(f"🔵 SARIMAX (no exog):   {rmse_sarimax_nf:.3f}")
print(f"🟢 SARIMAX (with exog): {rmse_sarimax_feat:.3f}")

2025-05-20 10:23:14,773 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-20 10:23:23,854 - sarimax - INFO - 📊 RMSE: 0.03
2025-05-20 10:23:23,912 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-20 10:23:50,366 - sarimax - INFO - 📊 RMSE: 0.03


🟡 Naive RMSE:          0.044


TypeError: unsupported format string passed to NoneType.__format__

In [None]:
# 📈 Interactieve plot zonder Test
fig = go.Figure()

# Trainingset
fig.add_trace(go.Scatter(
    x=y_train_sarimax.index,
    y=y_train_sarimax.values,
    mode="lines",
    name="Train",
    line=dict(color="lightgray", width=2)
))

# Werkelijke waarden tijdens forecast
fig.add_trace(go.Scatter(
    x=y.loc[fh].index,
    y=y.loc[fh].values,
    mode="lines",
    name="Actual",
    line=dict(color="black", width=2)
))

# Naive forecast
fig.add_trace(go.Scatter(
    x=naive_preds.index,
    y=naive_preds.values,
    mode="lines",
    name="Naive",
    line=dict(color="orange", dash="dash")
))

# SARIMAX zonder features
fig.add_trace(go.Scatter(
    x=fh,
    y=sarimax_preds_nofeatures,
    mode="lines",
    name="SARIMAX (no exog)",
    line=dict(color="steelblue", dash="dot")
))

# SARIMAX met features
fig.add_trace(go.Scatter(
    x=fh,
    y=sarimax_preds_with_features,
    mode="lines",
    name="SARIMAX + features",
    line=dict(color="forestgreen", dash="dashdot")
))

# Layout
fig.update_layout(
    template="plotly_white",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=40, b=40),
    xaxis_title="Tijd (UTC)",
    yaxis_title="Prijs",
    hovermode="x unified"
)

fig.show()

In [None]:
day_rmse_data = []
for model_name, preds in {
    "Naive": naive_preds,
    "SARIMAX (no exog)": sarimax_preds_nofeatures,
    "SARIMAX + features": sarimax_preds_with_features
}.items():
    df_day = y.loc[fh].to_frame("actual").join(preds.rename("pred"))
    df_day["date"] = df_day.index.date
    daily_rmse = df_day.groupby("date").apply(lambda x: root_mean_squared_error(x["actual"], x["pred"]))
    day_rmse_data.append(daily_rmse.round(3).rename(model_name))

rmse_day_df = pd.concat(day_rmse_data, axis=1)
display(HTML("<h3>📅 RMSE per dag</h3>"))
display(rmse_day_df)

In [None]:
rmse_full_hourly_df = pd.DataFrame(index=fh)
for model_name, preds in {
    "Naive": naive_preds,
    "SARIMAX (no exog)": sarimax_preds_nofeatures,
    "SARIMAX + features": sarimax_preds_with_features
}.items():
    actual = y.loc[fh]
    rmse = ((actual - preds) ** 2) ** 0.5
    rmse_full_hourly_df[model_name] = rmse.round(3)

display(HTML("<h3>🕒 RMSE per tijdstip (alle 168 uur)</h3>"))
display(rmse_full_hourly_df)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from models.naive_model import run_naive_model
from models.sarimax_model import run_sarimax
from IPython.display import display
import numpy as np

def evaluate_rmse(y_true: pd.Series, y_pred: pd.Series) -> float:
    mask = y_true.index.intersection(y_pred.dropna().index)
    return np.sqrt(mean_squared_error(y_true.loc[mask], y_pred.loc[mask]))

# Sliding window validatie
loop_days = 5
horizon = 168
results = []

for i in range(loop_days):
    delta = pd.Timedelta(days=i)
    t_start = train_start + delta
    t_end = train_end + delta
    fh_start = t_end + pd.Timedelta(hours=1)
    fh = pd.date_range(start=fh_start, periods=horizon, freq="h")

    try:
        # Naive model
        y_train_naive = y.loc[t_end - pd.Timedelta(hours=167):t_end]
        naive_preds = run_naive_model(y_train_naive)
        rmse_naive = evaluate_rmse(y.loc[fh], naive_preds.loc[fh])

        # SARIMAX met features
        y_train_sarimax = y.loc[t_start:t_end]
        X_train_sarimax = X.loc[t_start:t_end]
        X_test_sarimax = X.loc[fh]

        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sarimax), index=X_train_sarimax.index, columns=X_train_sarimax.columns)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test_sarimax), index=X_test_sarimax.index, columns=X_test_sarimax.columns)

        sarimax_preds, _ = run_sarimax(y_train_sarimax, X_train_scaled, X_test_scaled, order=(1, 1, 1), seasonal_order=(1, 1, 1, 24))
        rmse_sarimax = evaluate_rmse(y.loc[fh], sarimax_preds)

        results.append({
            "start": str(t_start.date()),
            "end": str(t_end.date()),
            "rmse_naive": round(rmse_naive, 3),
            "rmse_sarimax": round(rmse_sarimax, 3)
        })

    except Exception as e:
        print(f"⚠️ Iteratie {i} overgeslagen: {e}")

df_results = pd.DataFrame(results)
display(df_results)

In [None]:
from models.sarimax_model import run_sarimax
import pandas as pd
import numpy as np

# Genereer dummy time series data
np.random.seed(42)
date_rng = pd.date_range(start="2024-01-01", periods=200, freq='H')
data = pd.Series(10 + np.random.randn(200).cumsum(), index=date_rng)

# Simuleer train/test split
train = data.iloc[:168]
test = data.iloc[168:]

# Geen exogene variabelen voor deze test
X_train = None
X_test = None

# Modelconfiguratie
order = (1, 1, 1)
seasonal_order = (1, 1, 1, 24)

# Run het model + log automatisch naar logs.db
y_pred, rmse = run_sarimax(
    train_df=train,
    X_train=X_train,
    X_test=X_test,
    order=order,
    seasonal_order=seasonal_order
)

print(f"✅ Test gelukt — RMSE: {rmse:.4f}")


## Test 

In [6]:
import inspect
from utils import build_training_set
print("🧠 build_training_set.py pad:", inspect.getfile(build_training_set))


🧠 build_training_set.py pad: c:\Users\dai\ENEXIS\src\utils\build_training_set.py


In [1]:
import sys
from pathlib import Path
import sqlite3
import pandas as pd

# 📦 Setup paths
project_root = Path.cwd().parent if Path.cwd().name == "tests" else Path.cwd()
src_path = project_root / "src"
db_path = src_path / "data" / "WARP.db"

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# 📅 Zelfde datums als in build_training_set.py
train_start = pd.Timestamp("2025-01-01 00:00:00", tz="UTC")
train_end = pd.Timestamp("2025-03-14 23:00:00", tz="UTC")
forecast_start = train_end + pd.Timedelta(hours=1)
forecast_end = forecast_start + pd.Timedelta(hours=167)

# 🧠 Import en build runnen
from utils.build_training_set import build_training_set
build_training_set()

# 📥 Dataset ophalen
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM training_set", conn)
conn.close()

# 📊 Basisinspectie
df["target_datetime"] = pd.to_datetime(df["target_datetime"], utc=True)
print(f"✅ Gecombineerde trainingset: {df.shape[0]} rijen, {df.shape[1]} kolommen")
print("📅 target_datetime range:", df["target_datetime"].min(), "→", df["target_datetime"].max())
display(df.head(3))

# 📦 Kolomnamen inspecteren
forecast_suffixes = [col for col in df.columns if "_forecast" in col]
if forecast_suffixes:
    print("⚠️ Deze forecast-suffix kolommen zijn nog aanwezig:")
    print(forecast_suffixes)
else:
    print("✅ Geen forecast-suffix kolommen aanwezig.")

# 🔎 Splits actuals vs forecast op basis van tijd
df_actuals = df[df["target_datetime"] <= train_end]
df_forecast = df[df["target_datetime"] >= forecast_start]

print(f"\n🟢 Aantal actuals: {df_actuals.shape[0]} (verwacht: {(train_end - train_start).days * 24 + 24})")
print(f"🔵 Aantal forecast uren: {df_forecast.shape[0]} (verwacht: 168)")
print(f"📍 Forecast start bij: {df_forecast['target_datetime'].min()}")
print(f"📍 Forecast eindigt bij: {df_forecast['target_datetime'].max()}")

# 🔁 Unieke target_datetime check
dupes = df["target_datetime"].duplicated().sum()
print(f"\n♻️ Dubbele target_datetime entries: {dupes}")

# 🕳️ Missende waarden
missing = df.isna().sum()
print("\n🕳️ Top 10 kolommen met missende waarden:")
print(missing[missing > 0].sort_values(ascending=False).head(10))

# 🧪 Unieke run_date check
if "run_date" in df_forecast.columns:
    unique_runs = df_forecast["run_date"].nunique()
    print(f"\n🔁 Aantal unieke run_date's in forecast: {unique_runs}")
    if unique_runs != 1:
        print(df_forecast["run_date"].value_counts())


2025-05-20 15:11:31,636 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-20 15:11:31,639 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-14 23:00:00+00:00
2025-05-20 15:11:31,642 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 00:00:00+00:00, target range: 2025-03-15 00:00:00+00:00 → 2025-03-21 23:00:00+00:00
2025-05-20 15:11:31,776 - build_training_set - INFO - ✅ master_warp geladen: 1752 rijen
2025-05-20 15:11:32,043 - build_training_set - INFO - ✅ Forecast geladen: 0 rijen voor run_date 2025-03-15
2025-05-20 15:11:32,057 - build_training_set - INFO - 📦 Eindtabel bevat: 1752 rijen, 41 kolommen
2025-05-20 15:11:32,060 - build_training_set - INFO - 🧾 Kolommen: ['hour', 'day_of_week', 'month', 'day_of_year', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'yearday_sin', 'yearday_cos', 'local_datetime', 'is_dst', 'is_holiday', 'is_weekend', 'is_non_working_day', 'target_datetime', 'Load', 'Price', 'Flow_BE', 'Flo

✅ Gecombineerde trainingset: 1752 rijen, 41 kolommen
📅 target_datetime range: 2025-01-01 00:00:00+00:00 → 2025-03-14 23:00:00+00:00


Unnamed: 0,hour,day_of_week,month,day_of_year,hour_sin,hour_cos,weekday_sin,weekday_cos,yearday_sin,yearday_cos,...,shortwave_radiation,Wind_Vol,WindOffshore_Vol,Solar_Vol,Nuclear_Vol,index,fetch_moment,run_date,wind_direction_10m,direct_radiation
0,0,2,1,1,0.0,1.0,0.974928,-0.222521,0.017202,0.999852,...,0.0,6519520.0,4158000.0,0.0,486250.0,,,,,
1,1,2,1,1,0.258819,0.965926,0.974928,-0.222521,0.017202,0.999852,...,0.0,5917659.0,4158000.0,0.0,487000.0,,,,,
2,2,2,1,1,0.5,0.866025,0.974928,-0.222521,0.017202,0.999852,...,0.0,4994553.0,4158000.0,0.0,487000.0,,,,,


✅ Geen forecast-suffix kolommen aanwezig.

🟢 Aantal actuals: 1752 (verwacht: 1752)
🔵 Aantal forecast uren: 0 (verwacht: 168)
📍 Forecast start bij: NaT
📍 Forecast eindigt bij: NaT

♻️ Dubbele target_datetime entries: 0

🕳️ Top 10 kolommen met missende waarden:
index                 1752
fetch_moment          1752
run_date              1752
wind_direction_10m    1752
direct_radiation      1752
dtype: int64

🔁 Aantal unieke run_date's in forecast: 0
Series([], Name: count, dtype: int64)


## Test log met dummy data

In [8]:
# Voeg src/ toe aan het importpad
import sys
from pathlib import Path

project_root = Path.cwd().parent if Path.cwd().name == "tests" else Path.cwd()
src_path = project_root / "src"
utils_path = src_path / "utils"

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Importeer de logfunctie
from utils.log_rmse_to_sqlite import log_rmse_to_sqlite

# Dummy testdata opbouwen
rmse_per_day = {str(k): round(0.1 + 0.01 * k, 3) for k in range(1, 8)}  # 1 t/m 7
rmse_per_hour = {str(k): round(0.2 + 0.001 * k, 3) for k in range(168)}  # 0 t/m 167

# Testaanroep
log_rmse_to_sqlite(
    model_name="SARIMAX",
    variant="with_exog_test",
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    forecast_start="2025-03-15 00:00:00",
    forecast_end="2025-03-21 23:00:00",
    rmse_overall=0.235,
    rmse_per_day=rmse_per_day,
    rmse_per_hour=rmse_per_hour,
    parameters={"order": [1, 1, 1], "seasonal_order": [1, 1, 1, 24]},
    features_used=["Total_Flow", "temperature_2m", "Solar_Vol"]
)

# ✅ Controleer of het is gelogd
import sqlite3
import pandas as pd

log_path = src_path / "data" / "logs.db"
conn = sqlite3.connect(log_path)
df_logs = pd.read_sql_query("SELECT * FROM model_rmse_logs ORDER BY id DESC LIMIT 1", conn)
conn.close()

print("✅ Laatste logregel:")
display(df_logs)


2025-05-20 10:25:24,556 - sarimax - INFO - ✅ RMSE-log succesvol opgeslagen voor model: SARIMAX (with_exog_test)


✅ Laatste logregel:


Unnamed: 0,id,model_name,variant,train_start,train_end,forecast_start,forecast_end,forecast_horizon,rmse_json,parameters_json,features_used_json,created_at
0,2,SARIMAX,with_exog_test,2025-01-01 00:00:00,2025-03-14 23:00:00,2025-03-15 00:00:00,2025-03-21 23:00:00,168,"{""overall"": 0.235, ""per_day"": {""1"": 0.11, ""2"":...","{""order"": [1, 1, 1], ""seasonal_order"": [1, 1, ...","[""Total_Flow"", ""temperature_2m"", ""Solar_Vol""]",2025-05-20T08:25:24.547066
