# Time Series Forecasting

This notebook demonstrates the new modular architecture for time series forecasting experiments.

## Features:
- 🔧 Configuration-driven experiments
- 📊 Unified logging and metrics
- 🎨 Interactive visualizations
- 🔄 Rolling window validation
- ⚡ Parallel model execution


# Cell 1: Import Libraries and Setup Training Function

In [1]:
# Cell 1: Import Libraries and Setup Training Function
import pandas as pd
import numpy as np
import warnings
from pathlib import Path
import sys
import gc
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import multiprocessing
import random

# Setup paths and import training function
current_dir = Path.cwd()
if "ENEXIS" in str(current_dir):
    while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
        current_dir = current_dir.parent
    project_root = current_dir
else:
    project_root = current_dir

sys.path.insert(0, str(project_root / "src"))
from utils.build_training_set import build_training_set

# Performance optimizations
warnings.filterwarnings("ignore")
import logging
logging.getLogger().setLevel(logging.ERROR)  # Suppress INFO/DEBUG logs
pd.set_option('mode.chained_assignment', None)
plt.ioff()

print(f"✅ Libraries and training function imported")
print(f"🚀 Available CPU cores: {multiprocessing.cpu_count()}")

✅ Libraries and training function imported
🚀 Available CPU cores: 14


# Cell 2: Random Seeds and Configuration

In [2]:
# Reproducibility
np.random.seed(42)
random.seed(42)

# Key configuration
TARGET = 'Price'
FEATURES = [
    'Load', 'shortwave_radiation', 'temperature_2m', 
    'direct_normal_irradiance', 'diffuse_radiation', 
    'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 
    'yearday_sin', 'wind_speed_10m', 'is_non_working_day', 
    'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 
    'hour_sin', 'weekday_cos'
]

# SARIMAX parameters (optimized for speed)
ORDER = (1, 1, 1)
SEASONAL_ORDER = (1, 1, 1, 24)

# Rolling window setup
BASE_START = "2025-01-01 00:00:00"
BASE_END = "2025-03-14 23:00:00"
BASE_RUN = "2025-03-15 00:00:00"

print(f"🎯 Target: {TARGET}")
print(f"📊 Features: {len(FEATURES)}")
print(f"🎲 Random seed: 42")


🎯 Target: Price
📊 Features: 19
🎲 Random seed: 42


# Cell 3: Core Helper Functions

In [3]:

def prep_data(df, target_col, feature_cols=None):
    """Fast data preparation for SARIMAX models."""
    df = df.copy()
    df['target_datetime'] = pd.to_datetime(df['target_datetime']).dt.tz_localize(None)
    df = df.set_index('target_datetime').sort_index()
    
    y = df[target_col].astype(float)
    exog = df[feature_cols].astype(float) if feature_cols else None
    
    return y, exog

def fit_fast_sarimax(y_train, exog_train=None):
    """Fit SARIMAX with speed optimizations."""
    model = SARIMAX(
        y_train, 
        exog=exog_train,
        order=ORDER, 
        seasonal_order=SEASONAL_ORDER,
        enforce_stationarity=False,
        enforce_invertibility=False,
        concentrate_scale=True  # Speed boost
    )
    return model.fit(disp=False, maxiter=50, method='lbfgs')

def calc_rmse_by_day(y_true, y_pred, max_days=7):
    """Calculate RMSE for each forecast day (1-7)."""
    rmse_dict = {'overall': np.sqrt(mean_squared_error(y_true, y_pred))}
    
    for day in range(1, min(len(y_true)//24, max_days) + 1):
        start, end = (day-1)*24, day*24
        if end <= len(y_true):
            rmse_dict[f'day_{day}'] = np.sqrt(mean_squared_error(
                y_true.iloc[start:end], y_pred[start:end]
            ))
    
    return rmse_dict

print("✅ Helper functions ready")

✅ Helper functions ready



# Cell 4: SARIMA Model (Univariate - Price Only)

In [4]:
# Cell 4: SARIMA Model (Univariate - Price Only)
def test_sarima():
    """Test SARIMA model on single time window."""
    df = build_training_set(BASE_START, BASE_END, BASE_RUN)
    df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
    
    run_date_utc = pd.Timestamp(BASE_RUN).tz_localize("UTC")
    train_data = df[df['target_datetime'] <= run_date_utc]
    test_data = df[df['target_datetime'] > run_date_utc]
    
    # Skip first 24h as per requirement
    test_data = test_data.iloc[24:] if len(test_data) > 24 else test_data
    
    # Prepare data (univariate - no features)
    y_train, _ = prep_data(train_data, TARGET)
    y_test, _ = prep_data(test_data, TARGET)
    
    # Fit and forecast
    model_fit = fit_fast_sarimax(y_train)
    y_pred = model_fit.forecast(len(y_test)).values
    
    # Calculate RMSE
    rmse_results = calc_rmse_by_day(y_test, y_pred)
    
    return rmse_results, len(y_test)

# Run test
sarima_rmse, n_pred = test_sarima()
print(f"🔍 SARIMA Results ({n_pred} predictions):")
for k, v in sarima_rmse.items():
    print(f"  {k}: {v:.4f}")

gc.collect()  # Clean memory

🔍 SARIMA Results (144 predictions):
  overall: 0.0527
  day_1: 0.0571
  day_2: 0.0364
  day_3: 0.0488
  day_4: 0.0463
  day_5: 0.0470
  day_6: 0.0730


36

# Cell 5: SARIMAX Model (19 Observation Features)

In [6]:
# Cell 5: SARIMAX with Perfect Exogenous Features (Oracle Upper Bound)
def test_sarimax_perfect():
    """Oracle scenario: SARIMAX with perfect exogenous features in forecast window."""
    df = build_training_set(BASE_START, BASE_END, BASE_RUN)
    df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
    
    run_date_utc = pd.Timestamp(BASE_RUN).tz_localize("UTC")
    train_data = df[df['target_datetime'] <= run_date_utc]
    test_data = df[df['target_datetime'] > run_date_utc]
    
    print(f"📅 Training: {train_data['target_datetime'].min()} → {train_data['target_datetime'].max()}")
    print(f"🔮 Oracle Forecast: {test_data['target_datetime'].min()} → {test_data['target_datetime'].max()}")
    print(f"🎯 Oracle mode: Using OBSERVED exogenous features (perfect forecasts)")
    
    # Skip first 24h as per requirement  
    test_data = test_data.iloc[24:] if len(test_data) > 24 else test_data
    
    # For oracle scenario, we need to get the ACTUAL observed values for exogenous features
    # in the forecast window. This simulates having perfect weather forecasts, etc.
    
    # Get additional observed data for the forecast window from master_warp
    try:
        df_oracle = build_training_set(
            BASE_START, 
            (pd.Timestamp(BASE_RUN) + pd.Timedelta(hours=168)).strftime("%Y-%m-%d %H:%M:%S"),
            (pd.Timestamp(BASE_RUN) + pd.Timedelta(hours=168)).strftime("%Y-%m-%d %H:%M:%S")
        )
        df_oracle['target_datetime'] = pd.to_datetime(df_oracle['target_datetime'], utc=True)
        
        # Use observed values for ALL features in forecast window
        oracle_train = df_oracle[df_oracle['target_datetime'] <= run_date_utc]
        oracle_test = df_oracle[df_oracle['target_datetime'] > run_date_utc]
        oracle_test = oracle_test.iloc[24:] if len(oracle_test) > 24 else oracle_test
        
        print(f"🔍 Debug - Oracle using OBSERVED values for all {len(FEATURES)} exogenous features")
        
    except:
        print("⚠️ Cannot access future observed values - using available data")
        oracle_train = train_data
        oracle_test = test_data
    
    # Drop NaN and prepare data
    oracle_train = oracle_train.dropna(subset=[TARGET] + FEATURES)
    oracle_test = oracle_test.dropna(subset=[TARGET] + FEATURES)
    
    if oracle_test.empty:
        print("❌ No oracle test data available")
        return {}, 0
    
    # Prepare data with ALL features (using observed values)
    y_train, exog_train = prep_data(oracle_train, TARGET, FEATURES)
    y_test, exog_test = prep_data(oracle_test, TARGET, FEATURES)
    
    # Fit SARIMAX with exogenous features
    model_fit = fit_fast_sarimax(y_train, exog_train)
    
    # Forecast WITH perfect exogenous features
    y_pred = model_fit.forecast(len(y_test), exog=exog_test).values
    
    # Calculate RMSE
    rmse_results = calc_rmse_by_day(y_test, y_pred)
    
    return rmse_results, len(y_test)

# Run oracle test
sarimax_perfect_rmse, n_pred = test_sarimax_perfect()
print(f"🔍 SARIMAX Perfect Results ({n_pred} predictions):")
for k, v in sarimax_perfect_rmse.items():
    print(f"  {k}: {v:.4f}")

gc.collect()

📅 Training: 2025-01-01 00:00:00+00:00 → 2025-03-15 00:00:00+00:00
🔮 Oracle Forecast: 2025-03-15 01:00:00+00:00 → 2025-03-22 00:00:00+00:00
🎯 Oracle mode: Using OBSERVED exogenous features (perfect forecasts)
🔍 Debug - Oracle using OBSERVED values for all 19 exogenous features
🔍 SARIMAX Perfect Results (312 predictions):
  overall: 0.0517
  day_1: 0.0219
  day_2: 0.0281
  day_3: 0.0295
  day_4: 0.0414
  day_5: 0.0399
  day_6: 0.0486
  day_7: 0.0756


965

# Cell 6: SARIMAX Model (19 Obs + Predictive Features)

In [None]:
# Cell 6: SARIMAX Realistic (Predicted/Lagged Exogenous Features)
def analyze_forecast_features(df, run_date_utc, features):
    """Analyze which features are predicted vs lagged in forecast window."""
    forecast_data = df[df['target_datetime'] > run_date_utc]
    train_data = df[df['target_datetime'] <= run_date_utc]
    
    if forecast_data.empty:
        return {}
    
    feature_sources = {}
    first_forecast_time = forecast_data['target_datetime'].iloc[0]
    lag_time = first_forecast_time - pd.Timedelta(hours=168)
    
    print(f"🔍 Debug - Feature sources for forecast (first hour: {first_forecast_time}):")
    
    for feature in features[:6]:  # Show first 6 features to avoid clutter
        if feature in forecast_data.columns:
            forecast_val = forecast_data[feature].iloc[0]
            
            # Find what the 168h lagged value would be
            lag_match = train_data[train_data['target_datetime'] == lag_time]
            lag_val = lag_match[feature].iloc[0] if not lag_match.empty else None
            
            # Determine if this looks like a prediction or lag
            if lag_val is not None and abs(forecast_val - lag_val) < 0.001:
                source = "168h LAG"
                feature_sources[feature] = "lagged"
            else:
                source = "PREDICTION"
                feature_sources[feature] = "predicted"
            
            print(f"  {feature}: {source} (val={forecast_val:.3f})")
    
    pred_count = sum(1 for v in feature_sources.values() if v == "predicted")
    lag_count = sum(1 for v in feature_sources.values() if v == "lagged")
    print(f"  Summary: {pred_count} predicted, {lag_count} lagged features")
    
    return feature_sources

def test_sarimax_realistic():
    """Realistic scenario: SARIMAX with predicted/lagged exogenous features."""
    df = build_training_set(BASE_START, BASE_END, BASE_RUN)
    df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
    
    run_date_utc = pd.Timestamp(BASE_RUN).tz_localize("UTC")
    train_data = df[df['target_datetime'] <= run_date_utc]
    test_data = df[df['target_datetime'] > run_date_utc]
    
    print(f"📅 Training: {train_data['target_datetime'].min()} → {train_data['target_datetime'].max()}")
    print(f"🔮 Realistic Forecast: {test_data['target_datetime'].min()} → {test_data['target_datetime'].max()}")
    print(f"🎯 Realistic mode: Using predicted/lagged exogenous features")
    
    # Analyze feature sources in forecast window
    feature_sources = analyze_forecast_features(df, run_date_utc, FEATURES)
    
    # Skip first 24h as per requirement
    test_data = test_data.iloc[24:] if len(test_data) > 24 else test_data
    
    # Drop NaN for features in both periods
    train_data = train_data.dropna(subset=[TARGET] + FEATURES)
    test_data = test_data.dropna(subset=[TARGET] + FEATURES)
    
    # Prepare data WITH features (realistic scenario uses available predictions/lags)
    y_train, exog_train = prep_data(train_data, TARGET, FEATURES)
    y_test, exog_test = prep_data(test_data, TARGET, FEATURES)
    
    # Fit SARIMAX with exogenous features
    model_fit = fit_fast_sarimax(y_train, exog_train)
    
    # Forecast WITH exogenous features (realistic scenario)
    y_pred = model_fit.forecast(len(y_test), exog=exog_test).values
    
    # Calculate RMSE
    rmse_results = calc_rmse_by_day(y_test, y_pred)
    
    return rmse_results, len(y_test)

# Run realistic test
sarimax_realistic_rmse, n_pred = test_sarimax_realistic()
print(f"🔍 SARIMAX Realistic Results ({n_pred} predictions):")
for k, v in sarimax_realistic_rmse.items():
    print(f"  {k}: {v:.4f}")

gc.collect()