# Time Series Forecasting

This notebook demonstrates the new modular architecture for time series forecasting experiments.

## Features:
- 🔧 Configuration-driven experiments
- 📊 Unified logging and metrics
- 🎨 Interactive visualizations
- 🔄 Rolling window validation
- ⚡ Parallel model execution

Cel 1 - import

In [1]:
# Cell 1 - Imports & Dynamic Validation Setup
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
from datetime import datetime, timedelta
import sqlite3
import json
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')

# Setup paths
current_dir = Path.cwd()
if "ENEXIS" in str(current_dir):
    while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
        current_dir = current_dir.parent
    project_root = current_dir
else:
    project_root = current_dir

sys.path.insert(0, str(project_root / "src"))
from utils.build_training_set import build_training_set

# Load training data
training_data = build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 12:00:00"
).set_index('target_datetime')
training_data.index = pd.to_datetime(training_data.index, utc=True)

def load_validation_data(start_date: str, end_date: str) -> pd.DataFrame:
    """Load actual price data from master_warp table"""
    db_path = project_root / "src" / "data" / "WARP.db"
    conn = sqlite3.connect(db_path)
    
    query = "SELECT target_datetime, Price FROM master_warp WHERE target_datetime >= ? AND target_datetime <= ? ORDER BY target_datetime"
    validation_data = pd.read_sql_query(query, conn, params=[start_date, end_date], parse_dates=['target_datetime'])
    conn.close()
    
    if len(validation_data) > 0:
        validation_data = validation_data.set_index('target_datetime')
        validation_data.index = pd.to_datetime(validation_data.index, utc=True)
        return validation_data
    return None

# Calculate validation periods
train_end = training_data.index.max()
validation_start = train_end + timedelta(hours=1)
validation_end = validation_start + timedelta(days=30)

print(f"Training: {training_data.shape[0]} rows, ends {train_end.date()}")
print(f"Validation: {validation_start.date()} to {validation_end.date()} (30 days)")

2025-05-30 12:13:45,690 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-30 12:13:45,691 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-14 23:00:00+00:00
2025-05-30 12:13:45,691 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 12:00:00+00:00, target range: 2025-03-15 12:00:00+00:00 → 2025-03-22 11:00:00+00:00
2025-05-30 12:13:45,692 - build_training_set - INFO - 📥 Loading actuals with selected columns only...
2025-05-30 12:13:45,694 - build_training_set - INFO - 📋 Requested columns found: 20/20
2025-05-30 12:13:45,694 - build_training_set - INFO - 📋 Using columns: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos']
2025-05-30 12:13:45,735 - build_training_set - INFO - ✅ 

Training: 1752 rows, ends 2025-03-14
Validation: 2025-03-15 to 2025-04-14 (30 days)


cell 2 - model config

In [2]:
# Cell 2 - Model Configuration

EXOG_VARS = [
    'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 
    'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 
    'yearday_sin', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 
    'weekday_sin', 'hour_sin', 'weekday_cos'
]

available_exog = [col for col in EXOG_VARS if col in training_data.columns]

# Load your optimized SARIMAX parameters
config_file = project_root / "src" / "config" / "best_sarimax_params.json"
if config_file.exists():
    with open(config_file, 'r') as f:
        best_params = json.load(f)
    current_order = tuple(best_params['order'])
    current_seasonal = tuple(best_params['seasonal_order'])
    print(f"Using optimized: order={current_order}, seasonal={current_seasonal}")
else:
    # Fallback defaults
    current_order = (1, 0, 1)
    current_seasonal = (1, 1, 1, 24)
    print(f"Using defaults: order={current_order}, seasonal={current_seasonal}")

print(f"Exog variables: {len(available_exog)}/{len(EXOG_VARS)}")

Using optimized: order=(2, 0, 0), seasonal=(1, 1, 0, 24)
Exog variables: 18/18


Cell 3 - 30 day rolling window

In [None]:
# Cell 3 - Minimal 30-Day Validation (Maximum Speed)

import time

def run_minimal_30day_validation(training_data, exog_vars, forecast_days=7):
    """Minimal validation - prioritizes speed over model sophistication"""
    
    print(f"Running MINIMAL 30-day validation (forecast horizon: {forecast_days} days)...")
    print("Focus: Maximum speed with basic model validation")
    print("=" * 60)
    
    results = []
    train_end = training_data.index.max()
    
    # Pre-cache some common calculations
    full_train_data = training_data['Price'].values
    full_train_index = training_data.index
    
    for day in range(1, 31):
        forecast_start = train_end + timedelta(days=day)
        forecast_end = forecast_start + timedelta(days=forecast_days-1, hours=23)
        
        print(f"Day {day:2d}: {forecast_start.date()}", end=" | ")
        
        # Load validation data for this day only
        validation_data = load_validation_data(
            forecast_start.strftime("%Y-%m-%d %H:%M:%S"),
            forecast_end.strftime("%Y-%m-%d %H:%M:%S")
        )
        
        if validation_data is None or len(validation_data) == 0:
            print("NO_DATA")
            results.append({
                'Day': day, 'Date': forecast_start.date(),
                'Naive': np.nan, 'SARIMA': np.nan, 'SARIMAX': np.nan,
                'Status': 'NO_DATA'
            })
            continue
        
        # Get training data up to forecast start - FIXED LOGIC
        train_cutoff = forecast_start - timedelta(hours=1)
        
        # Use ALL available training data up to cutoff (not just recent)
        available_train = training_data[training_data.index <= train_cutoff]
        
        if len(available_train) < 168:  # Need at least 1 week
            print("INSUFFICIENT")
            results.append({
                'Day': day, 'Date': forecast_start.date(),
                'Naive': np.nan, 'SARIMA': np.nan, 'SARIMAX': np.nan,
                'Status': 'INSUFFICIENT'
            })
            continue
        
        # For speed: use only last 336 hours (2 weeks) of available training data
        if len(available_train) > 336:
            recent_train_data = available_train.iloc[-336:]
        else:
            recent_train_data = available_train
        y_train = recent_train_data['Price'].values
        y_actual = validation_data['Price'].values
        forecast_hours = len(y_actual)
        
        result = {'Day': day, 'Date': forecast_start.date(), 'Status': 'SUCCESS'}
        
        # 1. NAIVE MODEL (very fast)
        try:
            if len(y_train) >= 24:
                # Simple daily seasonality
                daily_pattern = y_train[-24:]
                naive_forecast = np.tile(daily_pattern, (forecast_hours // 24) + 1)[:forecast_hours]
            else:
                naive_forecast = np.full(forecast_hours, y_train[-1])
            
            result['Naive'] = np.sqrt(mean_squared_error(y_actual, naive_forecast))
        except:
            result['Naive'] = np.nan
        
        # 2. SARIMA MODEL (using YOUR optimized parameters)
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                
                # Use YOUR optimized parameters: order=(2,0,0), seasonal=(1,1,0,24)
                sarima_model = SARIMAX(
                    y_train,
                    order=current_order,  # Your optimized (2,0,0)
                    seasonal_order=current_seasonal,  # Your optimized (1,1,0,24)
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                
                # Fast fitting with reduced iterations
                fitted = sarima_model.fit(method='lbfgs', maxiter=20, disp=False)
                forecast = fitted.forecast(steps=forecast_hours)
                result['SARIMA'] = np.sqrt(mean_squared_error(y_actual, forecast))
        except:
            # Fallback to simple ARIMA if your parameters fail
            try:
                from statsmodels.tsa.arima.model import ARIMA
                simple_model = ARIMA(y_train, order=(1, 0, 1))
                fitted = simple_model.fit(method='css', maxiter=10)
                forecast = fitted.forecast(steps=forecast_hours)
                result['SARIMA'] = np.sqrt(mean_squared_error(y_actual, forecast))
            except:
                result['SARIMA'] = np.nan

        # 3. SARIMAX MODEL (using YOUR optimized parameters + ALL features)
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                
                # Get exogenous data
                X_train = recent_train_data[exog_vars].values
                
                # Use YOUR optimized SARIMAX parameters
                sarimax_model = SARIMAX(
                    y_train,
                    exog=X_train,
                    order=current_order,  # Your optimized (2,0,0)
                    seasonal_order=current_seasonal,  # Your optimized (1,1,0,24)
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                
                # Fast fitting
                fitted = sarimax_model.fit(method='lbfgs', maxiter=20, disp=False)
                
                # Create future exogenous variables (weekly pattern)
                X_future = np.zeros((forecast_hours, len(exog_vars)))
                recent_X = X_train[-168:] if len(X_train) >= 168 else X_train[-24:]
                
                for i in range(forecast_hours):
                    X_future[i] = recent_X[i % len(recent_X)]
                
                forecast = fitted.forecast(steps=forecast_hours, exog=X_future)
                result['SARIMAX'] = np.sqrt(mean_squared_error(y_actual, forecast))
                
        except:
            # Fallback to LinearRegression if SARIMAX fails
            try:
                from sklearn.linear_model import LinearRegression
                X_train = recent_train_data[exog_vars].values
                lr_model = LinearRegression().fit(X_train, y_train)
                X_future = np.tile(X_train[-24:], ((forecast_hours // 24) + 1, 1))[:forecast_hours]
                lr_forecast = lr_model.predict(X_future)
                result['SARIMAX'] = np.sqrt(mean_squared_error(y_actual, lr_forecast))
            except:
                result['SARIMAX'] = np.nan
        
        # Print results
        naive_str = f"N:{result['Naive']:.4f}" if not pd.isna(result['Naive']) else "N:FAIL"
        sarima_str = f"S:{result['SARIMA']:.4f}" if not pd.isna(result['SARIMA']) else "S:FAIL"
        sarimax_str = f"X:{result['SARIMAX']:.4f}" if not pd.isna(result['SARIMAX']) else "X:FAIL"
        
        print(f"{naive_str} | {sarima_str} | {sarimax_str}")
        
        results.append(result)
    
    return pd.DataFrame(results)

# Alternative: Even simpler version if above is still slow
def run_super_simple_validation(training_data, exog_vars, forecast_days=7):
    """Super simple validation - only naive and linear models"""
    
    print("Running SUPER SIMPLE validation (Naive + Linear only)...")
    print("=" * 50)
    
    results = []
    train_end = training_data.index.max()
    
    for day in range(1, 31):
        forecast_start = train_end + timedelta(days=day)
        forecast_end = forecast_start + timedelta(days=forecast_days-1, hours=23)
        
        print(f"Day {day:2d}: {forecast_start.date()}", end=" | ")
        
        validation_data = load_validation_data(
            forecast_start.strftime("%Y-%m-%d %H:%M:%S"),
            forecast_end.strftime("%Y-%m-%d %H:%M:%S")
        )
        
        if validation_data is None or len(validation_data) == 0:
            print("NO_DATA")
            continue
        
        # Get last week of training data
        train_cutoff = forecast_start - timedelta(hours=1)
        recent_train = training_data[
            training_data.index > (train_cutoff - timedelta(hours=168))
        ][training_data.index <= train_cutoff]
        
        if len(recent_train) < 24:
            print("INSUFFICIENT")
            continue
        
        y_actual = validation_data['Price'].values
        forecast_hours = len(y_actual)
        
        # Naive model
        try:
            daily_pattern = recent_train['Price'].values[-24:]
            naive_forecast = np.tile(daily_pattern, (forecast_hours // 24) + 1)[:forecast_hours]
            naive_rmse = np.sqrt(mean_squared_error(y_actual, naive_forecast))
        except:
            naive_rmse = np.nan
        
        # Simple linear model
        try:
            from sklearn.linear_model import LinearRegression
            X = recent_train[exog_vars].values
            y = recent_train['Price'].values
            
            lr = LinearRegression().fit(X, y)
            X_future = np.tile(X[-24:], ((forecast_hours // 24) + 1, 1))[:forecast_hours]
            lr_forecast = lr.predict(X_future)
            lr_rmse = np.sqrt(mean_squared_error(y_actual, lr_forecast))
        except:
            lr_rmse = np.nan
        
        print(f"N:{naive_rmse:.4f} | L:{lr_rmse:.4f}")
        
        results.append({
            'Day': day,
            'Date': forecast_start.date(),
            'Naive': naive_rmse,
            'Linear': lr_rmse,
            'Status': 'SUCCESS'
        })
    
    return pd.DataFrame(results)

# Try the minimal version first
print("🚀 Starting MINIMAL 30-day validation...")
start_time = time.time()

try:
    results_df = run_minimal_30day_validation(training_data, available_exog, forecast_days=7)
    elapsed_time = time.time() - start_time
    
    if elapsed_time > 300:  # If still taking > 5 minutes
        print(f"\n⚠ Still slow ({elapsed_time:.1f}s), trying SUPER SIMPLE version...")
        results_df = run_super_simple_validation(training_data, available_exog, forecast_days=7)
        elapsed_time = time.time() - start_time
    
    print(f"\n✅ Validation completed in {elapsed_time:.1f} seconds")
    
    # Quick summary
    if len(results_df) > 0:
        successful = results_df[results_df['Status'] == 'SUCCESS']
        print(f"\nQuick Results ({len(successful)} successful validations):")
        
        for col in ['Naive', 'SARIMA', 'SARIMAX', 'Linear']:
            if col in successful.columns:
                values = successful[col].dropna()
                if len(values) > 0:
                    print(f"{col:8s}: {values.mean():.4f} ± {values.std():.4f}")
    
    print(f"\nTotal time: {elapsed_time:.1f} seconds")

except Exception as e:
    print(f"❌ Validation failed: {e}")
    import traceback
    traceback.print_exc()

🚀 Starting MINIMAL 30-day validation...
Running MINIMAL 30-day validation (forecast horizon: 7 days)...
Focus: Maximum speed with basic model validation
Day  1: 2025-03-15 | N:0.0617 | S:0.0611 | X:0.0533
Day  2: 2025-03-16 | N:0.0610 | S:0.0604 | X:0.0524
Day  3: 2025-03-17 | N:0.0596 | S:0.0589 | X:0.0501
Day  4: 2025-03-18 | N:0.0568 | S:0.0561 | X:0.0479
Day  5: 2025-03-19 | N:0.0536 | S:0.0529 | X:0.0424
Day  6: 2025-03-20 | N:0.0517 | S:0.0513 | X:0.0344
Day  7: 2025-03-21 | 

cell 4 - performance

In [4]:
# Cell 4 - Performance Analysis

if 'results_df' in locals() and len(results_df) > 0:
    
    # Model comparison
    models = ['naive', 'sarima', 'sarimax']
    comparison = {}
    
    for model in models:
        if model in results_df.columns:
            valid_results = results_df[model].dropna()
            if len(valid_results) > 0:
                comparison[model] = {
                    'mean': valid_results.mean(),
                    'std': valid_results.std(),
                    'count': len(valid_results)
                }
    
    # Calculate improvements
    if 'naive' in comparison and 'sarimax' in comparison:
        improvement = (comparison['naive']['mean'] - comparison['sarimax']['mean']) / comparison['naive']['mean'] * 100
        print(f"SARIMAX vs Naive: {improvement:+.1f}% improvement")
    
    if 'sarima' in comparison and 'sarimax' in comparison:
        improvement = (comparison['sarima']['mean'] - comparison['sarimax']['mean']) / comparison['sarima']['mean'] * 100
        print(f"SARIMAX vs SARIMA: {improvement:+.1f}% improvement")
    
    # Best model recommendation
    if comparison:
        best_model = min(comparison.keys(), key=lambda x: comparison[x]['mean'])
        print(f"Best model: {best_model.upper()} (RMSE: {comparison[best_model]['mean']:.4f})")
        
        # Show trend over time
        if len(results_df) > 5:
            sarimax_results = results_df['sarimax'].dropna()
            if len(sarimax_results) > 3:
                recent_performance = sarimax_results.tail(3).mean()
                early_performance = sarimax_results.head(3).mean()
                trend = "improving" if recent_performance < early_performance else "stable/degrading"
                print(f"Performance trend: {trend}")

else:
    print("No validation results to analyze")