In [1]:
# Voeg src/ toe aan importpad
import sys
from pathlib import Path
project_root = Path.cwd().parent if Path.cwd().name == "tests" else Path.cwd()
src_path = project_root / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML
import numpy as np
import sqlite3

from models.naive_model import run_naive_model, evaluate_rmse
from models.sarimax_model import run_sarimax

ImportError: cannot import name 'ModelFactory' from partially initialized module 'models.factory' (most likely due to a circular import) (/Users/redouan/ENEXIS/src/models/factory.py)

In [2]:
# DB connectie en master_warp inladen
db_path = project_root / "src" / "data" / "WARP.db"
conn = sqlite3.connect(db_path)
df = pd.read_sql("SELECT * FROM master_warp", conn)
conn.close()

# Print kolomnamen als lijst
print("Kolomnamen in df:", df.columns.tolist())

# Zet datetime index
df["target_datetime"] = pd.to_datetime(df["target_datetime"], utc=True)
df = df.sort_values("target_datetime").set_index("target_datetime")

# Targets en features
# Targets and features
target_col = "Price"
# Original feature_cols with 'wind_direction_10m' and 'direct_radiation' removed
feature_cols = ["Load", "shortwave_radiation", "temperature_2m", "direct_normal_irradiance", "diffuse_radiation", 
               "Flow_NO", "yearday_cos", "Flow_GB", "month", "is_dst", "yearday_sin", 
               "is_non_working_day", "hour_cos", "is_weekend", "cloud_cover", "weekday_sin", 
               "hour_sin", "weekday_cos"]

# Opschonen en splitsen
y = df[target_col].dropna()
y = y[~y.index.duplicated()]
X = df[feature_cols].loc[y.index].dropna()


Kolomnamen in df: ['hour', 'day_of_week', 'month', 'day_of_year', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'yearday_sin', 'yearday_cos', 'local_datetime', 'is_dst', 'is_holiday', 'is_weekend', 'is_non_working_day', 'target_datetime', 'Load', 'Price', 'Flow_BE', 'Flow_DE', 'Flow_GB', 'Flow_DK', 'Flow_NO', 'Total_Flow', 'temperature_2m', 'wind_speed_10m', 'apparent_temperature', 'cloud_cover', 'snowfall', 'diffuse_radiation', 'direct_normal_irradiance', 'shortwave_radiation']


In [5]:
# 📅 Handmatige datums instellen
train_start = pd.Timestamp("2025-01-01 00:00:00", tz="UTC")
train_end = pd.Timestamp("2025-03-14 23:00:00", tz="UTC")
test_start = pd.Timestamp("2025-03-15 00:00:00", tz="UTC")
test_end = pd.Timestamp("2025-04-14 23:00:00", tz="UTC")
fh_start = pd.Timestamp("2025-03-15 00:00:00", tz="UTC")  # VASTZETTEN

# Horizon (7 dagen vooruit)
horizon = 168
fh = pd.date_range(start=fh_start, periods=horizon, freq="h")

# SARIMAX splits
y_train_sarimax = y.loc[train_start:train_end]
X_train_sarimax = X.loc[train_start:train_end]
X_test_sarimax = X.loc[fh]

# Naive splits
y_train_naive = y.loc[train_end - pd.Timedelta(hours=167):train_end]
y_test = y.loc[fh]

In [6]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_sarimax), index=X_train_sarimax.index, columns=X_train_sarimax.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_sarimax), index=X_test_sarimax.index, columns=X_test_sarimax.columns)

In [None]:
# ------------------------------
# 🟡 Naive model
# ------------------------------
naive_preds = run_naive_model(y, lag=horizon).loc[fh]
rmse_naive = root_mean_squared_error(y.loc[fh], naive_preds)

# ------------------------------
# 🔵 SARIMAX zonder features
# ------------------------------
# Add frequency information to avoid warnings - use freq directly instead of to_period
y_train_sarimax_freq = y_train_sarimax.copy()
y_train_sarimax_freq.index = pd.DatetimeIndex(y_train_sarimax_freq.index, freq='H')

sarimax_preds_nofeatures, rmse_sarimax_nf = run_sarimax(
    y_train_sarimax_freq,
    X_train=None,
    X_test=pd.DataFrame(index=pd.DatetimeIndex(fh, freq='H')),
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 24)
)

# Handle case where run_sarimax returns predictions but not RMSE
if rmse_sarimax_nf is None and isinstance(sarimax_preds_nofeatures, pd.Series):
    rmse_sarimax_nf = root_mean_squared_error(y.loc[fh], sarimax_preds_nofeatures)

# ------------------------------
# 🟢 SARIMAX met geschaalde exogene features
# ------------------------------
# First ensure indices have frequency information - use freq directly
X_train_scaled_freq = X_train_scaled.copy()
X_train_scaled_freq.index = pd.DatetimeIndex(X_train_scaled_freq.index, freq='H')
X_test_scaled_freq = X_test_scaled.copy()
X_test_scaled_freq.index = pd.DatetimeIndex(X_test_scaled_freq.index, freq='H')

# Increase max iterations to help with convergence
sarimax_preds_with_features, rmse_sarimax_feat = run_sarimax(
    y_train_sarimax_freq,
    X_train_scaled_freq,
    X_test_scaled_freq,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 24)
)

# Handle case where run_sarimax returns predictions but not RMSE
if rmse_sarimax_feat is None and isinstance(sarimax_preds_with_features, pd.Series):
    rmse_sarimax_feat = root_mean_squared_error(y.loc[fh], sarimax_preds_with_features)

# ------------------------------
# 📊 Toon resultaten
# ------------------------------
print(f"🟡 Naive RMSE:          {rmse_naive:.3f}")
print(f"🔵 SARIMAX (no exog):   {rmse_sarimax_nf:.3f}")
print(f"🟢 SARIMAX (with exog): {rmse_sarimax_feat:.3f}")

# Check which model performs best
best_rmse = min(rmse_naive, rmse_sarimax_nf, rmse_sarimax_feat)
if best_rmse == rmse_naive:
    best_model = "Naive"
elif best_rmse == rmse_sarimax_nf:
    best_model = "SARIMAX (no exog)"
else:
    best_model = "SARIMAX (with exog)"

print(f"\n🏆 Best model: {best_model} with RMSE: {best_rmse:.3f}")

  y_train_sarimax_freq.index = pd.DatetimeIndex(y_train_sarimax_freq.index).to_period('H').to_timestamp()
  y_train_sarimax_freq.index = pd.DatetimeIndex(y_train_sarimax_freq.index).to_period('H').to_timestamp()
2025-05-23 08:12:57,253 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)
2025-05-23 08:13:12,458 - sarimax - INFO - 📊 RMSE: 0.03
  X_train_scaled_freq.index = pd.DatetimeIndex(X_train_scaled_freq.index).to_period('H').to_timestamp()
  X_train_scaled_freq.index = pd.DatetimeIndex(X_train_scaled_freq.index).to_period('H').to_timestamp()
  X_test_scaled_freq.index = pd.DatetimeIndex(X_test_scaled_freq.index).to_period('H').to_timestamp()
  X_test_scaled_freq.index = pd.DatetimeIndex(X_test_scaled_freq.index).to_period('H').to_timestamp()
2025-05-23 08:13:12,474 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)
2025-05-23 08:14:21,122 - sarimax - INFO - 📊 RMSE: 0.04


🟡 Naive RMSE:          0.043
🔵 SARIMAX (no exog):   0.038237021335844154
🟢 SARIMAX (with exog): 0.024475335747416987


In [8]:
# 📈 Interactieve plot zonder Test
fig = go.Figure()

# Trainingset
fig.add_trace(go.Scatter(
    x=y_train_sarimax.index,
    y=y_train_sarimax.values,
    mode="lines",
    name="Train",
    line=dict(color="lightgray", width=2)
))

# Werkelijke waarden tijdens forecast
fig.add_trace(go.Scatter(
    x=y.loc[fh].index,
    y=y.loc[fh].values,
    mode="lines",
    name="Actual",
    line=dict(color="black", width=2)
))

# Naive forecast
fig.add_trace(go.Scatter(
    x=naive_preds.index,
    y=naive_preds.values,
    mode="lines",
    name="Naive",
    line=dict(color="orange", dash="dash")
))

# SARIMAX zonder features
fig.add_trace(go.Scatter(
    x=fh,
    y=sarimax_preds_nofeatures,
    mode="lines",
    name="SARIMAX (no exog)",
    line=dict(color="steelblue", dash="dot")
))

# SARIMAX met features
fig.add_trace(go.Scatter(
    x=fh,
    y=sarimax_preds_with_features,
    mode="lines",
    name="SARIMAX + features",
    line=dict(color="forestgreen", dash="dashdot")
))

# Layout
fig.update_layout(
    template="plotly_white",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=40, b=40),
    xaxis_title="Tijd (UTC)",
    yaxis_title="Prijs",
    hovermode="x unified"
)

fig.show()

In [None]:
# Create dictionary of model predictions
model_predictions = {
    "Naive": naive_preds,
    "SARIMAX (no exog)": sarimax_preds_nofeatures,
    "SARIMAX + features": sarimax_preds_with_features
}

# Calculate RMSE per day
day_rmse_data = []
for model_name, preds in model_predictions.items():
    # Ensure preds is a Series and has the right index
    if not isinstance(preds, pd.Series):
        print(f"⚠️ {model_name} predictions not available as a Series, skipping...")
        continue
        
    # Create dataframe with actual and predicted values
    df_day = y.loc[fh].to_frame("actual").join(preds.rename("pred"))
    
    # Add date column
    df_day["date"] = df_day.index.date
    
    # Function to calculate RMSE for a group with error handling
    def calc_group_rmse(group):
        try:
            return root_mean_squared_error(group["actual"], group["pred"])
        except Exception as e:
            print(f"⚠️ Error calculating RMSE for {model_name} on {group.name}: {e}")
            return np.nan
            
    # Group by date and calculate RMSE for each day
    daily_rmse = df_day.groupby("date").apply(calc_group_rmse)
    day_rmse_data.append(daily_rmse.round(3).rename(model_name))

# Combine all daily RMSE data
rmse_day_df = pd.concat(day_rmse_data, axis=1)
display(HTML("<h3>📅 RMSE per dag</h3>"))
display(rmse_day_df)

# Create empty DataFrame for hourly errors
rmse_full_hourly_df = pd.DataFrame(index=fh)

# Calculate absolute error at each timestamp
for model_name, preds in model_predictions.items():
    if not isinstance(preds, pd.Series):
        print(f"⚠️ {model_name} predictions not available as a Series, skipping...")
        continue
        
    # Get actuals for the forecast horizon
    actual = y.loc[fh]
    
    # Calculate absolute error at each timestamp
    abs_error = np.abs(actual - preds)
    rmse_full_hourly_df[model_name] = abs_error.round(3)

display(HTML("<h3>🕒 Absolute error per tijdstip (alle 168 uur)</h3>"))
display(rmse_full_hourly_df)

# Add summary table showing average daily RMSE
display(HTML("<h3>📊 Gemiddelde RMSE per dag</h3>"))
avg_day_rmse = rmse_day_df.mean().to_frame("Avg Daily RMSE").round(3)
best_day = avg_day_rmse["Avg Daily RMSE"].idxmin()
avg_day_rmse["Rank"] = avg_day_rmse["Avg Daily RMSE"].rank()
avg_day_rmse.loc[best_day, "Note"] = "Best model"
display(avg_day_rmse)

# Add summary table showing percentage of days each model is best
display(HTML("<h3>🥇 Aantal dagen dat model het best presteert</h3>"))
best_days = rmse_day_df.idxmin(axis=1).value_counts().to_frame("Number of days best")
best_days["Percentage"] = (best_days["Number of days best"] / len(rmse_day_df) * 100).round(1)
best_days["Percentage"] = best_days["Percentage"].astype(str) + '%'
display(best_days)

⚠️ Error calculating daily RMSE for SARIMAX + features: Cannot join tz-naive with tz-aware DatetimeIndex








Unnamed: 0,Naive,SARIMAX (no exog),SARIMAX + features
2025-03-15,0.029,0.021,
2025-03-16,0.014,0.044,
2025-03-17,0.028,0.026,
2025-03-18,0.049,0.036,
2025-03-19,0.046,0.039,
2025-03-20,0.051,0.039,
2025-03-21,0.064,0.053,


⚠️ Error calculating hourly errors for SARIMAX + features: Cannot join tz-naive with tz-aware DatetimeIndex


Unnamed: 0,Naive,SARIMAX (no exog),SARIMAX + features
2025-03-15 00:00:00+00:00,0.005,0.011,
2025-03-15 01:00:00+00:00,0.001,0.006,
2025-03-15 02:00:00+00:00,0.001,0.001,
2025-03-15 03:00:00+00:00,0.009,0.004,
2025-03-15 04:00:00+00:00,0.014,0.008,
...,...,...,...
2025-03-21 19:00:00+00:00,0.069,0.069,
2025-03-21 20:00:00+00:00,0.062,0.062,
2025-03-21 21:00:00+00:00,0.066,0.070,
2025-03-21 22:00:00+00:00,0.085,0.082,



📊 Overall RMSE per model:
Naive: 0.043
SARIMAX (no exog): 0.038
SARIMAX + features: 0.024


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from models.naive_model import run_naive_model
from models.sarimax_model import run_sarimax
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt

def evaluate_rmse(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Calculate RMSE between two series"""
    # Find common indices
    common_idx = y_true.index.intersection(y_pred.index)
    if len(common_idx) == 0:
        return None
    
    return np.sqrt(mean_squared_error(y_true.loc[common_idx], y_pred.loc[common_idx]))

# Custom naive model function to avoid the lag validation issue
def custom_naive_model(y_true: pd.Series, lag: int = 168) -> pd.Series:
    """Run naive model without strict validation on lag length"""
    # Simply shift the series by lag periods
    return y_true.shift(lag)

# Sliding window validatie
loop_days = 5
horizon = 168
results = []

print(f"🔄 Starting sliding window validation with {loop_days} windows, horizon={horizon} hours")

for i in range(loop_days):
    delta = pd.Timedelta(days=i)
    t_start = train_start + delta
    t_end = train_end + delta
    fh_start = t_end + pd.Timedelta(hours=1)
    fh = pd.date_range(start=fh_start, periods=horizon, freq="h")
    
    print(f"\n📊 Window {i+1}/{loop_days}: {t_start.date()} → {t_end.date()}, forecast: {fh_start.date()} → {fh[-1].date()}")

    # --- Naive model ---
    try:
        # Get a longer training window for naive model to satisfy the lag requirement
        extended_start = t_end - pd.Timedelta(hours=horizon)
        y_train_naive = y.loc[extended_start:t_end]
        
        if len(y_train_naive) > horizon:
            # Use the original run_naive_model function if we have enough data
            naive_preds = run_naive_model(y_train_naive, lag=horizon)
        else:
            # Use our custom function that doesn't enforce the length check
            naive_preds = custom_naive_model(y_train_naive, lag=min(horizon, len(y_train_naive)-1))
            
        # Make sure we have predictions for the forecast horizon
        if fh[0] not in naive_preds.index:
            # If not, generate predictions directly for the forecast horizon
            # by using the last value in training repeated for each forecast step
            last_value = y_train_naive.iloc[-1]
            naive_preds = pd.Series(index=fh, data=[last_value] * len(fh))
            
        rmse_naive = evaluate_rmse(y.loc[fh], naive_preds.loc[fh])
        print(f"  ✓ Naive RMSE: {rmse_naive:.3f}")
    except Exception as e:
        print(f"  ⚠️ Naive model error: {e}")
        # Create dummy predictions using the last known value
        last_value = y.loc[:t_end].iloc[-1] if not y.loc[:t_end].empty else 0
        naive_preds = pd.Series(index=fh, data=[last_value] * len(fh))
        rmse_naive = evaluate_rmse(y.loc[fh], naive_preds)
        print(f"  ✓ Naive RMSE (fallback): {rmse_naive:.3f}")
    
    # --- SARIMAX model ---
    # Prepare data with frequency
    y_train_sarimax = y.loc[t_start:t_end]
    y_train_sarimax.index = pd.DatetimeIndex(y_train_sarimax.index, freq='H')
    
    # SARIMAX without features
    try:
        sarimax_preds_nofeatures, rmse_sarimax_nf_model = run_sarimax(
            y_train_sarimax,
            X_train=None,
            X_test=pd.DataFrame(index=pd.DatetimeIndex(fh, freq='H')),
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 24)
        )
        
        # Calculate RMSE manually if model didn't return it
        if rmse_sarimax_nf_model is None:
            rmse_sarimax_nf = evaluate_rmse(y.loc[fh], sarimax_preds_nofeatures)
        else:
            rmse_sarimax_nf = rmse_sarimax_nf_model
            
        print(f"  ✓ SARIMAX (no exog) RMSE: {rmse_sarimax_nf:.3f}")
    except Exception as e:
        print(f"  ⚠️ SARIMAX without features error: {e}")
        rmse_sarimax_nf = np.nan
    
    # SARIMAX with features
    try:
        # Prepare exogenous variables
        X_train_sarimax = X.loc[t_start:t_end]
        X_test_sarimax = X.loc[fh]
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train_sarimax), 
            index=X_train_sarimax.index, 
            columns=X_train_sarimax.columns
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test_sarimax), 
            index=X_test_sarimax.index, 
            columns=X_test_sarimax.columns
        )
        
        # Set frequency
        X_train_scaled.index = pd.DatetimeIndex(X_train_scaled.index, freq='H')
        X_test_scaled.index = pd.DatetimeIndex(X_test_scaled.index, freq='H')
        
        # Run model
        sarimax_preds_with_features, rmse_sarimax_feat_model = run_sarimax(
            y_train_sarimax,
            X_train_scaled,
            X_test_scaled,
            order=(1, 1, 1),
            seasonal_order=(1, 1, 1, 24)
        )
        
        # Calculate RMSE manually if model didn't return it
        if rmse_sarimax_feat_model is None:
            rmse_sarimax_feat = evaluate_rmse(y.loc[fh], sarimax_preds_with_features)
        else:
            rmse_sarimax_feat = rmse_sarimax_feat_model
            
        print(f"  ✓ SARIMAX (with features) RMSE: {rmse_sarimax_feat:.3f}")
    except Exception as e:
        print(f"  ⚠️ SARIMAX with features error: {e}")
        rmse_sarimax_feat = np.nan
    
    # Add results to list
    results.append({
        "start": str(t_start.date()),
        "end": str(t_end.date()),
        "forecast_start": str(fh_start.date()),
        "forecast_end": str(fh[-1].date()),
        "rmse_naive": round(rmse_naive, 3) if rmse_naive is not None else np.nan,
        "rmse_sarimax_no_exog": round(rmse_sarimax_nf, 3) if not np.isnan(rmse_sarimax_nf) else np.nan,
        "rmse_sarimax_with_exog": round(rmse_sarimax_feat, 3) if not np.isnan(rmse_sarimax_feat) else np.nan
    })

# Create results dataframe
df_results = pd.DataFrame(results)

# Add best model column
def get_best_model(row):
    models = {
        "Naive": row["rmse_naive"],
        "SARIMAX (no exog)": row["rmse_sarimax_no_exog"],
        "SARIMAX (with exog)": row["rmse_sarimax_with_exog"]
    }
    # Filter out NaN values
    valid_models = {k: v for k, v in models.items() if not np.isnan(v)}
    if not valid_models:
        return "No valid model"
    return min(valid_models.items(), key=lambda x: x[1])[0]

df_results["best_model"] = df_results.apply(get_best_model, axis=1)

# Display results
display(df_results)

# Create summary statistics
print("\n📊 Summary Statistics:")
valid_naive = df_results["rmse_naive"].dropna()
valid_sarimax_no_exog = df_results["rmse_sarimax_no_exog"].dropna()
valid_sarimax_with_exog = df_results["rmse_sarimax_with_exog"].dropna()

print(f"Average Naive RMSE: {valid_naive.mean():.3f}" if not valid_naive.empty else "No valid Naive results")
print(f"Average SARIMAX (no exog) RMSE: {valid_sarimax_no_exog.mean():.3f}" if not valid_sarimax_no_exog.empty else "No valid SARIMAX (no exog) results")
print(f"Average SARIMAX (with exog) RMSE: {valid_sarimax_with_exog.mean():.3f}" if not valid_sarimax_with_exog.empty else "No valid SARIMAX (with exog) results")

# Count best models
best_model_counts = df_results["best_model"].value_counts()
print("\n🏆 Best Model Counts:")
for model, count in best_model_counts.items():
    print(f"{model}: {count} windows ({count/len(df_results)*100:.1f}%)")

# Visualize RMSE comparison
plt.figure(figsize=(10, 6))
for col, color, marker in zip(
    ["rmse_naive", "rmse_sarimax_no_exog", "rmse_sarimax_with_exog"],
    ["orange", "blue", "green"],
    ["o", "s", "^"]
):
    # Skip columns with all NaN values
    if df_results[col].isna().all():
        continue
        
    plt.plot(
        range(len(df_results)), 
        df_results[col], 
        marker=marker, 
        color=color, 
        label=col.replace("rmse_", "").replace("_", " ").title(),
        linestyle='-' if not df_results[col].isna().any() else '--'  # Use dashed line if there are any NaNs
    )

plt.xlabel("Window")
plt.ylabel("RMSE")
plt.title("RMSE Comparison Across Time Windows")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.

2025-05-23 08:14:21,655 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)


🔄 Starting sliding window validation with 5 windows, horizon=168 hours

📊 Window 1/5: 2025-01-01 → 2025-03-14, forecast: 2025-03-15 → 2025-03-21
  🟡 Running Naive model...
  ❌ Naive model error: Test set is too small for lag 168. Minimum required rows: 169, found: 168
  🟢 Running SARIMAX with features...



Maximum Likelihood optimization failed to converge. Check mle_retvals

2025-05-23 08:15:29,405 - sarimax - INFO - 📊 RMSE: 0.04

'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.

2025-05-23 08:15:29,451 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)


     ✓ SARIMAX RMSE: 0.024

📊 Window 2/5: 2025-01-02 → 2025-03-15, forecast: 2025-03-16 → 2025-03-22
  🟡 Running Naive model...
  ❌ Naive model error: Test set is too small for lag 168. Minimum required rows: 169, found: 168
  🟢 Running SARIMAX with features...



Maximum Likelihood optimization failed to converge. Check mle_retvals

2025-05-23 08:16:38,590 - sarimax - INFO - 📊 RMSE: 0.02

'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.

2025-05-23 08:16:38,615 - sarimax - INFO - 📈 Fitting SARIMAX with order=(1, 1, 1), seasonal_order=(1, 1, 1, 24)


     ✓ SARIMAX RMSE: 0.043

📊 Window 3/5: 2025-01-03 → 2025-03-16, forecast: 2025-03-17 → 2025-03-23
  🟡 Running Naive model...
  ❌ Naive model error: Test set is too small for lag 168. Minimum required rows: 169, found: 168
  🟢 Running SARIMAX with features...


KeyboardInterrupt: 

## Test log met dummy data

In [None]:
# Voeg src/ toe aan het importpad
import sys
from pathlib import Path

project_root = Path.cwd().parent if Path.cwd().name == "tests" else Path.cwd()
src_path = project_root / "src"
utils_path = src_path / "utils"

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Importeer de logfunctie
from utils.log_rmse_to_sqlite import log_rmse_to_sqlite

# Dummy testdata opbouwen
rmse_per_day = {str(k): round(0.1 + 0.01 * k, 3) for k in range(1, 8)}  # 1 t/m 7
rmse_per_hour = {str(k): round(0.2 + 0.001 * k, 3) for k in range(168)}  # 0 t/m 167

# Testaanroep
log_rmse_to_sqlite(
    model_name="SARIMAX",
    variant="with_exog_test",
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    forecast_start="2025-03-15 00:00:00",
    forecast_end="2025-03-21 23:00:00",
    rmse_overall=0.235,
    rmse_per_day=rmse_per_day,
    rmse_per_hour=rmse_per_hour,
    parameters={"order": [1, 1, 1], "seasonal_order": [1, 1, 1, 24]},
    features_used=["Total_Flow", "temperature_2m", "Solar_Vol"]
)

# ✅ Controleer of het is gelogd
import sqlite3
import pandas as pd

log_path = src_path / "data" / "logs.db"
conn = sqlite3.connect(log_path)
df_logs = pd.read_sql_query("SELECT * FROM model_rmse_logs ORDER BY id DESC LIMIT 1", conn)
conn.close()

print("✅ Laatste logregel:")
display(df_logs)


In [5]:
# Cell 2 - Enhanced Model Configuration

import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler

# Original exogenous variables
EXOG_VARS = [
    'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 
    'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 
    'yearday_sin', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 
    'weekday_sin', 'hour_sin', 'weekday_cos'
]

def create_lag_features(data, key_vars=['Load', 'temperature_2m', 'Flow_NO'], max_lags=3):
    """Create lag features for most important variables"""
    enhanced_data = data.copy()
    
    for var in key_vars:
        if var in data.columns:
            for lag in range(1, max_lags + 1):
                lag_col = f'{var}_lag{lag}'
                enhanced_data[lag_col] = data[var].shift(lag)
    
    return enhanced_data

def create_interaction_features(data, interactions=[('temperature_2m', 'Load'), ('hour_cos', 'is_weekend')]):
    """Create interaction features for key variable pairs"""
    enhanced_data = data.copy()
    
    for var1, var2 in interactions:
        if var1 in data.columns and var2 in data.columns:
            interaction_col = f'{var1}_x_{var2}'
            enhanced_data[interaction_col] = data[var1] * data[var2]
    
    return enhanced_data

def select_optimal_features(data, target_col='Price', max_features=15):
    """Select top features based on mutual information"""
    feature_cols = [col for col in data.columns if col != target_col]
    
    if len(feature_cols) <= max_features:
        return feature_cols
    
    X = data[feature_cols].fillna(0)
    y = data[target_col].fillna(method='ffill')
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    mi_scores = mutual_info_regression(X_scaled, y, random_state=42)
    feature_importance = list(zip(feature_cols, mi_scores))
    feature_importance.sort(key=lambda x: x[1], reverse=True)
    
    selected_features = [feat for feat, _ in feature_importance[:max_features]]
    return selected_features

def get_smart_auto_arima_bounds(data, target_col='Price'):
    """Determine Auto-ARIMA bounds based on data characteristics"""
    prices = data[target_col].dropna()
    
    volatility = prices.std()
    trend_strength = abs(prices.diff().mean()) / prices.std()
    seasonal_strength = abs(prices.groupby(prices.index.hour).mean().std()) / prices.std()
    
    if volatility > 0.08 or trend_strength > 0.02:
        bounds = {
            'max_p': 4, 'max_q': 3, 'max_d': 2,
            'max_P': 2, 'max_Q': 2, 'max_D': 1,
            'stepwise': False
        }
        complexity = "HIGH"
    elif seasonal_strength > 0.15:
        bounds = {
            'max_p': 3, 'max_q': 2, 'max_d': 2,
            'max_P': 2, 'max_Q': 2, 'max_D': 1,
            'stepwise': True
        }
        complexity = "MEDIUM"
    else:
        bounds = {
            'max_p': 2, 'max_q': 2, 'max_d': 1,
            'max_P': 1, 'max_Q': 1, 'max_D': 1,
            'stepwise': True
        }
        complexity = "LOW"
    
    return bounds, complexity

def get_optimized_parameters():
    """Load optimized parameters if available"""
    try:
        config_file = project_root / "src" / "config" / "best_sarimax_params.json"
        if config_file.exists():
            with open(config_file, 'r') as f:
                best_params = json.load(f)
            return {
                'order': tuple(best_params['order']),
                'seasonal_order': tuple(best_params['seasonal_order']),
                'improvement': best_params.get('improvement_vs_baseline', 0),
                'updated_at': best_params.get('updated_at', 'Unknown')[:19]
            }
    except Exception:
        pass
    return None

print("Enhancing feature set...")

enhanced_data = create_lag_features(training_data)
enhanced_data = create_interaction_features(enhanced_data)

feature_candidates = [col for col in enhanced_data.columns if col != 'Price']
print(f"Feature candidates: {len(feature_candidates)} (original: {len(EXOG_VARS)})")

enhanced_data_clean = enhanced_data.dropna()
if len(enhanced_data_clean) < len(training_data) * 0.8:
    print("Too much data lost to NaN, using original features")
    FINAL_EXOG_VARS = [col for col in EXOG_VARS if col in training_data.columns]
    enhanced_data_clean = training_data
else:
    FINAL_EXOG_VARS = select_optimal_features(enhanced_data_clean, max_features=15)
    print(f"Selected features: {len(FINAL_EXOG_VARS)}")

auto_arima_bounds, data_complexity = get_smart_auto_arima_bounds(enhanced_data_clean)
print(f"Data complexity: {data_complexity}")
print(f"Auto-ARIMA bounds: max_p={auto_arima_bounds['max_p']}, max_q={auto_arima_bounds['max_q']}, max_P={auto_arima_bounds['max_P']}, max_Q={auto_arima_bounds['max_Q']}")

optimized_params = get_optimized_parameters()
if optimized_params:
    current_order = optimized_params['order']
    current_seasonal = optimized_params['seasonal_order']
    print(f"Using optimized parameters: order={current_order}, seasonal={current_seasonal}")
    print(f"Expected improvement: {optimized_params['improvement']:.1f}%")
    print(f"Last updated: {optimized_params['updated_at']}")
else:
    current_order = (1, 0, 1)
    current_seasonal = (1, 1, 1, 24)
    print(f"Using default parameters: order={current_order}, seasonal={current_seasonal}")

print(f"Final exogenous variables: {len(FINAL_EXOG_VARS)}")
print(f"Enhanced dataset: {enhanced_data_clean.shape[0]} rows (original: {training_data.shape[0]})")

training_data_enhanced = enhanced_data_clean
EXOG_VARS_ENHANCED = FINAL_EXOG_VARS

Enhancing feature set...
Feature candidates: 29 (original: 18)
Selected features: 15
Data complexity: MEDIUM
Auto-ARIMA bounds: max_p=3, max_q=2, max_P=2, max_Q=2
Using optimized parameters: order=(2, 0, 0), seasonal=(1, 1, 0, 24)
Expected improvement: 30.1%
Last updated: 2025-05-29T12:56:58
Final exogenous variables: 15
Enhanced dataset: 1749 rows (original: 1752)


In [None]:
# Cell 5 - Smart Optimization Engine
import sys
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import warnings
from datetime import datetime, timedelta
import time
import sqlite3
import hashlib
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from utils.validation_utils import run_validation_experiment

warnings.filterwarnings('ignore')
logging.getLogger('build_training_set').setLevel(logging.ERROR)

current_dir = Path.cwd()
if "ENEXIS" in str(current_dir):
    while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
        current_dir = current_dir.parent
    project_root = current_dir
else:
    project_root = current_dir

src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from utils.build_training_set import build_training_set

training_data = build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 12:00:00"
)

training_data = training_data.set_index('target_datetime')
training_data.index = pd.to_datetime(training_data.index, utc=True)


def get_data_signature(training_data):
    """Create unique signature for current dataset"""
    data_str = f"{training_data.shape}_{training_data.iloc[0, 0]:.6f}_{training_data.iloc[-1, 0]:.6f}"
    return hashlib.md5(data_str.encode()).hexdigest()[:8]

def setup_optimization_database():
    """Setup optimization results database"""
    log_db = project_root / "src" / "data" / "optimization_logs.db"
    log_db.parent.mkdir(parents=True, exist_ok=True)
    
    conn = sqlite3.connect(log_db)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS optimization_results (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            data_signature TEXT, order_params TEXT, seasonal_order_params TEXT,
            mean_rmse REAL, std_rmse REAL, improvement_vs_baseline REAL,
            overall_score REAL, test_days INTEGER, created_at TEXT
        )
    """)
    conn.commit()
    conn.close()
    return log_db

def check_cached_results(order, seasonal_order, data_signature, log_db):
    """Check for existing optimization results"""
    conn = sqlite3.connect(log_db)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT mean_rmse, improvement_vs_baseline, overall_score, created_at
        FROM optimization_results 
        WHERE order_params = ? AND seasonal_order_params = ? AND data_signature = ?
        AND created_at > datetime('now', '-7 days')
        ORDER BY created_at DESC LIMIT 1
    """, (str(order), str(seasonal_order), data_signature))
    result = cursor.fetchone()
    conn.close()
    return result

def test_parameter_configuration(order, seasonal_order, training_data, exog_vars, baseline_rmse, data_signature):
    """Test a single parameter configuration efficiently"""
    try:
        # Use validation_utils but with modified parameters
        import utils.validation_utils as val_utils
        original_validation = val_utils.run_single_day_validation
        
        def modified_validation(day, training_data, exog_vars):
            result = original_validation(day, training_data, exog_vars)
            
            # Override SARIMAX with our test parameters
            if 'SARIMAX' in result or result.get('Status') not in ['SPLIT_FAIL', 'LOAD_FAIL']:
                try:
                    from datetime import datetime, timedelta
                    import warnings
                    from sklearn.metrics import mean_squared_error
                    from statsmodels.tsa.statespace.sarimax import SARIMAX
                    
                    # Recreate the split
                    daily_data = training_data.copy()
                    if day > 0:
                        np.random.seed(day)
                        noise_factor = 0.001 * day
                        daily_data['Price'] = daily_data['Price'] + np.random.normal(0, noise_factor, len(daily_data))
                    
                    split_point = daily_data.index[-24]
                    train_data = daily_data[daily_data.index < split_point]['Price'].copy()
                    test_data = daily_data[daily_data.index >= split_point]['Price'].copy()
                    
                    if exog_vars:
                        train_exog = daily_data[daily_data.index < split_point][exog_vars].copy()
                        test_exog = daily_data[daily_data.index >= split_point][exog_vars].copy()
                        
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            model = SARIMAX(
                                train_data, exog=train_exog, 
                                order=order, seasonal_order=seasonal_order,
                                enforce_stationarity=False, enforce_invertibility=False
                            )
                            fitted_model = model.fit(method='lbfgs', maxiter=25, disp=False)
                            forecast = fitted_model.forecast(steps=len(test_data), exog=test_exog)
                            rmse = np.sqrt(mean_squared_error(test_data, forecast))
                            result['SARIMAX'] = rmse
                except:
                    pass
            
            return result
        
        val_utils.run_single_day_validation = modified_validation
        
        # Test on representative days
        test_results = val_utils.run_validation_experiment(training_data, exog_vars, n_days=7)
        val_utils.run_single_day_validation = original_validation
        
        valid_results = test_results['SARIMAX'].dropna()
        
        if len(valid_results) >= 3:
            mean_rmse = valid_results.mean()
            std_rmse = valid_results.std()
            improvement = ((baseline_rmse - mean_rmse) / baseline_rmse) * 100
            stability_bonus = max(0, (0.02 - std_rmse) * 10)  # Reward stability
            overall_score = improvement + stability_bonus
            
            return {
                'order': order, 'seasonal_order': seasonal_order,
                'mean_rmse': mean_rmse, 'std_rmse': std_rmse,
                'improvement_vs_baseline': improvement, 'overall_score': overall_score,
                'test_days': len(valid_results), 'success': True
            }
    except:
        pass
    
    return {'success': False}

def run_smart_optimization(training_data, exog_vars, mode='balanced'):
    """
    Smart optimization with different modes:
    - quick: Test 4 key combinations (30 seconds)
    - balanced: Test 8 combinations with cache (1-2 minutes)  
    - thorough: Test 12 combinations fresh (3-5 minutes)
    """
    
    print(f"🚀 Smart Optimization Engine - {mode.upper()} mode")
    print("=" * 55)
    
    log_db = setup_optimization_database()
    data_signature = get_data_signature(training_data)
    baseline_rmse = 0.025  # Approximate baseline
    
    # Parameter combinations by mode
    if mode == 'quick':
        combinations = [
            ((1, 0, 1), (1, 1, 1, 24)),  # Current standard
            ((2, 0, 1), (1, 1, 1, 24)),  # More AR
            ((1, 0, 2), (1, 1, 1, 24)),  # More MA
            ((2, 0, 0), (1, 1, 0, 24)),  # AR only
        ]
        use_parallel = False
    elif mode == 'balanced':
        combinations = [
            ((1, 0, 1), (1, 1, 1, 24)), ((2, 0, 1), (1, 1, 1, 24)),
            ((1, 0, 2), (1, 1, 1, 24)), ((2, 0, 0), (1, 1, 0, 24)),
            ((1, 1, 1), (1, 1, 1, 24)), ((2, 0, 2), (0, 1, 0, 24)),
            ((1, 0, 1), (2, 1, 1, 24)), ((1, 0, 1), (1, 1, 2, 24))
        ]
        use_parallel = True
    else:  # thorough
        combinations = [
            ((1, 0, 1), (1, 1, 1, 24)), ((2, 0, 1), (1, 1, 1, 24)), ((1, 0, 2), (1, 1, 1, 24)),
            ((2, 0, 0), (1, 1, 0, 24)), ((1, 1, 1), (1, 1, 1, 24)), ((2, 0, 2), (0, 1, 0, 24)),
            ((1, 0, 1), (2, 1, 1, 24)), ((1, 0, 1), (1, 1, 2, 24)), ((3, 0, 1), (1, 1, 1, 24)),
            ((1, 0, 3), (1, 1, 1, 24)), ((2, 1, 0), (1, 1, 0, 24)), ((0, 1, 2), (1, 1, 1, 24))
        ]
        use_parallel = True
    
    print(f"Testing {len(combinations)} parameter combinations...")
    start_time = time.time()
    
    results = []
    cached_count = 0
    tested_count = 0
    
    if use_parallel and len(combinations) > 4:
        # Parallel execution
        with ThreadPoolExecutor(max_workers=3) as executor:
            future_to_params = {}
            
            for order, seasonal in combinations:
                cached = check_cached_results(order, seasonal, data_signature, log_db)
                if cached and mode != 'thorough':
                    cached_count += 1
                    mean_rmse, improvement, score, created_at = cached
                    results.append({
                        'order': order, 'seasonal_order': seasonal,
                        'mean_rmse': mean_rmse, 'improvement_vs_baseline': improvement,
                        'overall_score': score, 'cached': True
                    })
                    print(f"📦 CACHED {order}, {seasonal}: RMSE={mean_rmse:.6f}, Score={score:.2f}")
                else:
                    future = executor.submit(test_parameter_configuration, order, seasonal, 
                                           training_data, exog_vars, baseline_rmse, data_signature)
                    future_to_params[future] = (order, seasonal)
            
            for future in as_completed(future_to_params):
                order, seasonal = future_to_params[future]
                tested_count += 1
                print(f"🧪 TESTING {order}, {seasonal}...", end=" ")
                
                result = future.result()
                if result['success']:
                    results.append(result)
                    print(f"RMSE={result['mean_rmse']:.6f}, Score={result['overall_score']:.2f}")
                    
                    # Save to database
                    conn = sqlite3.connect(log_db)
                    conn.execute("""
                        INSERT INTO optimization_results 
                        (data_signature, order_params, seasonal_order_params, mean_rmse, std_rmse,
                         improvement_vs_baseline, overall_score, test_days, created_at)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """, (data_signature, str(order), str(seasonal), result['mean_rmse'], result['std_rmse'],
                          result['improvement_vs_baseline'], result['overall_score'], result['test_days'],
                          datetime.utcnow().isoformat()))
                    conn.commit()
                    conn.close()
                else:
                    print("FAILED")
    else:
        # Sequential execution
        for order, seasonal in combinations:
            cached = check_cached_results(order, seasonal, data_signature, log_db)
            if cached and mode != 'thorough':
                cached_count += 1
                mean_rmse, improvement, score, created_at = cached
                results.append({
                    'order': order, 'seasonal_order': seasonal,
                    'mean_rmse': mean_rmse, 'improvement_vs_baseline': improvement,
                    'overall_score': score, 'cached': True
                })
                print(f"📦 CACHED {order}, {seasonal}: RMSE={mean_rmse:.6f}")
            else:
                tested_count += 1
                print(f"🧪 TESTING {order}, {seasonal}...", end=" ")
                result = test_parameter_configuration(order, seasonal, training_data, exog_vars, baseline_rmse, data_signature)
                if result['success']:
                    results.append(result)
                    print(f"RMSE={result['mean_rmse']:.6f}")
                else:
                    print("FAILED")
    
    elapsed_time = time.time() - start_time
    
    # Results summary
    print(f"\n🏆 Optimization Results ({elapsed_time:.1f}s)")
    print(f"Cached: {cached_count}, Tested: {tested_count}, Total: {len(results)}")
    
    if results:
        # Sort by overall score
        results.sort(key=lambda x: x['overall_score'], reverse=True)
        
        print(f"\nTop 5 Configurations:")
        for i, result in enumerate(results[:5], 1):
            cached_marker = " 📦" if result.get('cached') else ""
            print(f"  {i}. {result['order']}, {result['seasonal_order']}: "
                  f"RMSE={result['mean_rmse']:.6f}, "
                  f"Improvement={result['improvement_vs_baseline']:+.1f}%, "
                  f"Score={result['overall_score']:.2f}{cached_marker}")
        
        # Update best configuration
        best_result = results[0]
        if best_result['improvement_vs_baseline'] > 0:
            config_file = project_root / "src" / "config" / "best_sarimax_params.json"
            config_file.parent.mkdir(parents=True, exist_ok=True)
            
            config_data = {
                'order': best_result['order'],
                'seasonal_order': best_result['seasonal_order'],
                'mean_rmse': best_result['mean_rmse'],
                'improvement_vs_baseline': best_result['improvement_vs_baseline'],
                'overall_score': best_result['overall_score'],
                'updated_at': datetime.utcnow().isoformat(),
                'optimization_mode': mode
            }
            
            with open(config_file, 'w') as f:
                json.dump(config_data, f, indent=2)
            
            print(f"\n✅ Best configuration saved to config file")
            print(f"Expected improvement: {best_result['improvement_vs_baseline']:.1f}%")
        
        return results
    else:
        print("❌ No successful configurations found")
        return []

# Interactive optimization
print("Choose optimization mode:")
print("1. ⚡ QUICK (4 configs, ~30s)")
print("2. 🎯 BALANCED (8 configs with cache, ~1-2min)")  
print("3. 🔬 THOROUGH (12 configs fresh, ~3-5min)")

try:
    choice = input("Enter 1, 2, or 3: ").strip()
    mode_map = {'1': 'quick', '2': 'balanced', '3': 'thorough'}
    selected_mode = mode_map.get(choice, 'balanced')
except:
    selected_mode = 'balanced'
    print("Using BALANCED mode")

if 'EXOG_VARS_ENHANCED' in locals():
    print("Using enhanced feature set")
    optimization_results = run_smart_optimization(training_data_enhanced, EXOG_VARS_ENHANCED, mode=selected_mode)
else:
    print("Using original feature set")
    optimization_results = run_smart_optimization(training_data, EXOG_VARS, mode=selected_mode)

Choose optimization mode:
1. ⚡ QUICK (4 configs, ~30s)
2. 🎯 BALANCED (8 configs with cache, ~1-2min)
3. 🔬 THOROUGH (12 configs fresh, ~3-5min)
Using enhanced feature set
🚀 Smart Optimization Engine - THOROUGH mode
Testing 12 parameter combinations...
🧪 TESTING (1, 0, 1), (1, 1, 1, 24)... RMSE=0.015633, Score=37.66
