In [1]:
# Cell 1 - Imports & Dynamic Validation Setup
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
from datetime import datetime, timedelta
import sqlite3
import json
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

# Silence all warnings for the whole notebook session (statsmodels is noisy).
warnings.filterwarnings('ignore')

# Setup paths
# Use the nearest directory (cwd itself or one of its ancestors) that is named
# exactly "ENEXIS" as the project root. If there is none, fall back to the
# current working directory. Matching on the directory *name* instead of on a
# substring of the path prevents a path such as ".../ENEXIS_v2/notebooks" from
# walking all the way up to the filesystem root.
current_dir = Path.cwd()
project_root = next(
    (candidate for candidate in (current_dir, *current_dir.parents) if candidate.name == "ENEXIS"),
    current_dir,
)
# Kept for backward compatibility: this name has always ended up equal to project_root.
current_dir = project_root

# Make the project's "src" directory importable before loading project helpers.
sys.path.insert(0, str(project_root / "src"))
from utils.build_training_set import build_training_set

# Load training data and index it by target timestamp
training_data = build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 12:00:00"
).set_index('target_datetime')
# Normalise the index to timezone-aware UTC timestamps.
training_data.index = pd.to_datetime(training_data.index, utc=True)

def load_validation_data(start_date: str, end_date: str) -> "pd.DataFrame | None":
    """Load actual price data from the ``master_warp`` table.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive bounds passed as bound parameters to the
        ``target_datetime >= ? AND target_datetime <= ?`` filter.

    Returns
    -------
    pandas.DataFrame or None
        A frame with a ``Price`` column, indexed by ``target_datetime``
        converted to timezone-aware UTC and ordered by that column.
        ``None`` when the query returns no rows.
    """
    db_path = project_root / "src" / "data" / "WARP.db"
    query = "SELECT target_datetime, Price FROM master_warp WHERE target_datetime >= ? AND target_datetime <= ? ORDER BY target_datetime"

    conn = sqlite3.connect(db_path)
    try:
        validation_data = pd.read_sql_query(query, conn, params=[start_date, end_date], parse_dates=['target_datetime'])
    finally:
        # Always release the connection, also when the query itself fails
        # (e.g. missing table); otherwise the handle leaks on every error.
        conn.close()

    if len(validation_data) == 0:
        return None

    validation_data = validation_data.set_index('target_datetime')
    validation_data.index = pd.to_datetime(validation_data.index, utc=True)
    return validation_data

# Derive the validation window from the last timestamp in the training data:
# it opens one hour after training ends and closes 30 days after it opens.
train_end = training_data.index.max()
validation_start = train_end + timedelta(hours=1)
validation_end = validation_start + timedelta(days=30)

# Report both periods, one per line.
print(
    f"Training: {training_data.shape[0]} rows, ends {train_end.date()}",
    f"Validation: {validation_start.date()} to {validation_end.date()} (30 days)",
    sep="\n",
)

2025-05-30 12:04:59,979 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-30 12:04:59,986 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-14 23:00:00+00:00
2025-05-30 12:05:00,011 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 12:00:00+00:00, target range: 2025-03-15 12:00:00+00:00 → 2025-03-22 11:00:00+00:00
2025-05-30 12:05:00,012 - build_training_set - INFO - 📥 Loading actuals with selected columns only...
2025-05-30 12:05:00,014 - build_training_set - INFO - 📋 Requested columns found: 20/20
2025-05-30 12:05:00,015 - build_training_set - INFO - 📋 Using columns: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos']
2025-05-30 12:05:00,031 - build_training_set - INFO - ✅ 

Training: 1752 rows, ends 2025-03-14
Validation: 2025-03-15 to 2025-04-14 (30 days)


In [2]:
# Cell 2 - Model Configuration

# Exogenous regressors requested for the SARIMAX model.
EXOG_VARS = [
    'Load',
    'shortwave_radiation',
    'temperature_2m',
    'direct_normal_irradiance',
    'diffuse_radiation',
    'Flow_NO',
    'yearday_cos',
    'Flow_GB',
    'month',
    'is_dst',
    'yearday_sin',
    'is_non_working_day',
    'hour_cos',
    'is_weekend',
    'cloud_cover',
    'weekday_sin',
    'hour_sin',
    'weekday_cos',
]

# Keep only the regressors that are present as columns in the training data.
available_exog = [name for name in EXOG_VARS if name in training_data.columns]

# Model orders: read the tuned values from the JSON config when it exists,
# otherwise fall back to hard-coded defaults.
config_file = project_root / "src" / "config" / "best_sarimax_params.json"
if not config_file.exists():
    # Fallback defaults
    current_order = (1, 0, 1)
    current_seasonal = (1, 1, 1, 24)
    print(f"Using defaults: order={current_order}, seasonal={current_seasonal}")
else:
    with open(config_file, 'r') as f:
        best_params = json.load(f)
    # JSON arrays arrive as lists; convert them to tuples.
    current_order = tuple(best_params['order'])
    current_seasonal = tuple(best_params['seasonal_order'])
    print(f"Using optimized: order={current_order}, seasonal={current_seasonal}")

print(f"Exog variables: {len(available_exog)}/{len(EXOG_VARS)}")

Using optimized: order=(2, 0, 0), seasonal=(1, 1, 0, 24)
Exog variables: 18/18


In [None]:
# Cell 3 - 30 Day Rolling Window Validation (Notebook, Threaded)

from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def validate_single_day(day, training_data, exog_vars, forecast_days, current_order, current_seasonal):
    """
    Score the Naive, SARIMA and SARIMAX forecasts for one validation window.

    The window starts ``day`` days plus one hour after the last timestamp of
    ``training_data`` and spans ``forecast_days`` days. Actual prices for the
    window are read through ``load_validation_data`` and every model is scored
    with RMSE against them.

    Models are fitted on the rows of ``training_data`` up to the hour before
    the window. If ``training_data`` ends earlier than that (which is the case
    for every ``day > 0`` unless the frame extends past its own maximum), each
    model forecasts from the last available training hour across the gap and
    only the steps whose timestamps match the actuals are scored. Forecasts
    and actuals are therefore always compared hour for hour.

    Parameters
    ----------
    day : int
        Zero-based offset, in days, of the window behind the end of training.
    training_data : pandas.DataFrame
        Frame with a datetime index, a ``'Price'`` column and the ``exog_vars``
        columns.
    exog_vars : list of str
        Column names used as exogenous regressors for SARIMAX.
    forecast_days : int
        Length of the validation window in days.
    current_order, current_seasonal : tuple
        ``order`` and ``seasonal_order`` passed to ``SARIMAX``.

    Returns
    -------
    dict
        Keys ``'Day'`` (1-based), ``'Date'``, ``'Status'``, ``'Naive'``,
        ``'SARIMA'`` and ``'SARIMAX'`` (RMSE values, ``NaN`` when a model could
        not be scored). ``'Status'`` is ``'SUCCESS'``, ``'NO_DATA'``,
        ``'INSUFFICIENT_TRAIN'`` or ``'ERROR: <message>'``.
    """
    import logging  # function-scope import: the notebook's import cell stays untouched

    log = logging.getLogger(__name__)
    season_hours = 168  # one week of hourly observations

    train_end = training_data.index.max()
    forecast_start = train_end + timedelta(days=day, hours=1)
    forecast_end = forecast_start + timedelta(days=forecast_days-1, hours=23)

    def _unscored(status):
        """Build the result row for a window that could not be scored at all."""
        return {'Day': day+1, 'Date': forecast_start.date(), 'Naive': np.nan, 'SARIMA': np.nan, 'SARIMAX': np.nan, 'Status': status}

    try:
        # Fetch the actual prices for the validation window
        validation_data = load_validation_data(
            forecast_start.strftime("%Y-%m-%d %H:%M:%S"),
            forecast_end.strftime("%Y-%m-%d %H:%M:%S")
        )
        if validation_data is None or len(validation_data) == 0:
            return _unscored('NO_DATA')

        # Training window: everything up to the hour before the forecast starts
        train_cutoff = forecast_start - timedelta(hours=1)
        train_data = training_data[training_data.index <= train_cutoff]
        if len(train_data) < season_hours:
            return _unscored('INSUFFICIENT_TRAIN')

        y_actual = validation_data['Price']
        y_train = train_data['Price']

        # Map every actual timestamp to its 0-based step in a forecast path
        # that starts one hour after the last training observation. This keeps
        # forecasts and actuals aligned when there is a gap between the end of
        # the training data and the window, or when hours are missing.
        last_train_ts = train_data.index.max()
        hours_ahead = np.asarray((y_actual.index - last_train_ts) / pd.Timedelta(hours=1), dtype=float)
        step_idx = np.rint(hours_ahead).astype(int) - 1
        if step_idx.min() < 0:
            # Negative steps would silently wrap around when indexing the path.
            raise ValueError("validation data overlaps the training window")
        horizon = int(step_idx.max()) + 1

        def _rmse(forecast_path):
            """RMSE between the actuals and the matching steps of a forecast path."""
            return np.sqrt(mean_squared_error(y_actual, np.asarray(forecast_path)[step_idx]))

        result = {'Day': day+1, 'Date': forecast_start.date(), 'Status': 'SUCCESS'}

        # NAIVE: repeat the last training week cyclically (seasonal naive, lag 168h)
        try:
            last_week = y_train.to_numpy()[-season_hours:]
            result['Naive'] = _rmse(last_week[np.arange(horizon) % season_hours])
        except Exception as exc:
            log.warning("Day %d: Naive forecast failed: %s", day+1, exc)
            result['Naive'] = np.nan

        # SARIMA: price history only
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                sarima_model = SARIMAX(
                    y_train,
                    order=current_order,
                    seasonal_order=current_seasonal,
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                fitted_sarima = sarima_model.fit(method='lbfgs', maxiter=15, disp=False)
                result['SARIMA'] = _rmse(fitted_sarima.forecast(steps=horizon))
        except Exception as exc:
            log.warning("Day %d: SARIMA failed: %s", day+1, exc)
            result['SARIMA'] = np.nan

        # SARIMAX: price history plus exogenous regressors
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Future regressors are unknown here, so the last training week
                # of features is repeated cyclically over the forecast path.
                # Built in one vectorised step as a numeric array.
                recent_exog = train_data[exog_vars].iloc[-season_hours:]
                future_exog = recent_exog.iloc[np.arange(horizon) % len(recent_exog)].to_numpy(dtype=float)
                sarimax_model = SARIMAX(
                    y_train,
                    exog=train_data[exog_vars],
                    order=current_order,
                    seasonal_order=current_seasonal,
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                fitted_sarimax = sarimax_model.fit(method='lbfgs', maxiter=15, disp=False)
                result['SARIMAX'] = _rmse(fitted_sarimax.forecast(steps=horizon, exog=future_exog))
        except Exception as exc:
            log.warning("Day %d: SARIMAX failed: %s", day+1, exc)
            result['SARIMAX'] = np.nan

        return result
    except Exception as e:
        return _unscored(f'ERROR: {str(e)[:50]}')

# -------- Rolling Validation --------

n_days = 30
forecast_days = 7
max_workers = 4  # Tune for your laptop/VM (4-6 works best)

print("Running 30-day rolling window validation...")
start_time = time.time()
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Queue one validation job per day offset and remember which future belongs to which day.
    future_to_day = {}
    for day in range(n_days):
        future = executor.submit(
            validate_single_day, day, training_data, available_exog, forecast_days, current_order, current_seasonal
        )
        future_to_day[future] = day

    # Report each day as soon as its job finishes (completion order, not day order).
    for future in as_completed(future_to_day):
        day = future_to_day[future]
        try:
            result = future.result()
            results.append(result)

            if pd.isna(result['Naive']):
                naive_str = "N:FAIL"
            else:
                naive_str = f"N:{result['Naive']:.4f}"

            if pd.isna(result['SARIMA']):
                sarima_str = "S:FAIL"
            else:
                sarima_str = f"S:{result['SARIMA']:.4f}"

            if pd.isna(result['SARIMAX']):
                sarimax_str = "X:FAIL"
            else:
                sarimax_str = f"X:{result['SARIMAX']:.4f}"

            print(f"Day {result['Day']:2d}: {result['Date']} | {naive_str} | {sarima_str} | {sarimax_str}")
        except Exception as e:
            print(f"Day {day+1:2d}: FAILED ({str(e)[:30]})")
            results.append({'Day': day+1, 'Status': 'FAILED'})
elapsed_time = time.time() - start_time

# Put the rows back in day order and show the summary table
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Day').reset_index(drop=True)
print(f"\nValidation completed in {elapsed_time:.1f} seconds")
display(results_df)

Running 30-day rolling window validation...
