In [1]:
# Dynamic path setup
import sys
from pathlib import Path

# Locate the project root: the nearest directory named "ENEXIS" at or above
# the current working directory. If no such directory exists, the search ends
# at the filesystem root and that path is used as-is.
current_dir = Path.cwd()
for current_dir in (current_dir, *current_dir.parents):
    if current_dir.name == "ENEXIS":
        break
project_root = current_dir

# Make the helper modules under <project_root>/src/utils importable
utils_path = project_root.joinpath("src", "utils")
sys.path.append(str(utils_path))

# Project-local import: resolved through the path appended above, so the
# function must exist in a build_training_set module there.
from build_training_set import build_training_set

# Smoke test: build a single training set and inspect its basic properties
smoke_window = {
    "train_start": "2025-01-01 00:00:00",
    "train_end": "2025-03-15 11:00:00",
    "run_date": "2025-03-15 12:00:00",
}
df = build_training_set(**smoke_window)

print(f"Shape: {df.shape}")
price_nulls = df['Price'].isna().sum()
print(f"Price nulls: {price_nulls}/{len(df)}")
targets = df['target_datetime']
print(f"Date range: {targets.min()} to {targets.max()}")
print(df.head(10))

2025-05-27 08:16:21,582 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:16:21,583 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-15 11:00:00+00:00
2025-05-27 08:16:21,583 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 12:00:00+00:00, target range: 2025-03-15 12:00:00+00:00 → 2025-03-22 11:00:00+00:00
2025-05-27 08:16:21,610 - build_training_set - INFO - ✅ Actuals geladen: 1764 rijen
2025-05-27 08:16:21,829 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:16:21,838 - build_training_set - INFO - 📦 Eindtabel bevat: 1764 rijen, 31 kolommen
2025-05-27 08:16:21,838 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Shape: (1764, 31)
Price nulls: 0/1764
Date range: 2025-01-01 00:00:00+00:00 to 2025-03-15 11:00:00+00:00
     Price           target_datetime      Load  shortwave_radiation  \
0  0.01362 2025-01-01 00:00:00+00:00  12049.25                  0.0   
1  0.00624 2025-01-01 01:00:00+00:00  11957.50                  0.0   
2  0.00416 2025-01-01 02:00:00+00:00  11636.25                  0.0   
3  0.00328 2025-01-01 03:00:00+00:00  11310.50                  0.0   
4  0.00068 2025-01-01 04:00:00+00:00  11135.25                  0.0   
5  0.00000 2025-01-01 05:00:00+00:00  11185.75                  0.0   
6  0.00076 2025-01-01 06:00:00+00:00  11385.00                  0.0   
7  0.00079 2025-01-01 07:00:00+00:00  11695.25                  0.0   
8  0.00189 2025-01-01 08:00:00+00:00  12041.50                  0.0   
9  0.00750 2025-01-01 09:00:00+00:00  12485.75                  0.0   

   temperature_2m  direct_normal_irradiance  diffuse_radiation  Flow_NO  \
0             0.0                     

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Rolling-window backtest of an XGBoost price model.
#
# For each of `n_windows` windows (each shifted one day later than the previous
# one) this cell builds a training set, fits a fresh XGBRegressor on the rows up
# to the training cutoff, predicts the next `forecast_horizon` hours and records
# the overall RMSE of that forecast against the actual prices.
#
# Results: `rmse_results` (list of dicts) and, if any window succeeded,
# `rmse_df` (one row per evaluated window).

# Test parameters: boundaries of the first window (shifted by i days per window)
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

# Forecast config
forecast_horizon = 168  # 168 hours = 7 days
n_windows = 30  # number of rolling windows to evaluate

# Model inputs and target. Loop-invariant, so defined once up front.
all_features = [
    'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance',
    'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB',
    'month', 'is_dst', 'yearday_sin', 'wind_speed_10m', 'is_non_working_day',
    'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos'
]
target = 'Price'

# Storage for results: one dict per successfully evaluated window
rmse_results = []

print("🔍 Testing XGBoost - RMSE per forecast day")
print("=" * 60)

for i in range(n_windows):
    start = pd.Timestamp(base_start) + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        # Get training set (actuals plus the forecast rows for this run date)
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is not None and len(df) > 0:
            # Index chronologically by the (UTC) target timestamp
            df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
            df = df.sort_values('target_datetime')
            df = df.set_index('target_datetime')

            # Rows up to and including the cutoff are training data; the first
            # `forecast_horizon` rows after it form the evaluation window.
            train_cutoff = pd.Timestamp(end, tz="UTC")
            train_data = df[df.index <= train_cutoff].copy()
            forecast_data = df[df.index > train_cutoff].iloc[:forecast_horizon].copy()

            # Only rows with an actual price can be scored. The check is done on
            # the truncated window so prices beyond the horizon do not count.
            scored = forecast_data[target].notna()

            if len(forecast_data) >= forecast_horizon and scored.any():
                X_train = train_data[all_features]
                y_train = train_data[target]
                X_test = forecast_data[all_features].copy()
                y_test = forecast_data[target]

                # Train model (fresh model per window, fixed seed)
                model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
                model.fit(X_train, y_train)

                # Predict the full horizon
                y_pred = model.predict(X_test)

                # Overall RMSE for the forecast window, restricted to rows with
                # an actual price (NaN targets would make the metric raise).
                # The per-day RMSE breakdown is currently disabled.
                scored_mask = scored.to_numpy()
                rmse_total = np.sqrt(mean_squared_error(y_test[scored_mask], y_pred[scored_mask]))

                result = {
                    'iteration': i + 1,
                    'run_date': run.strftime('%Y-%m-%d'),
                    'valid_predictions': int(scored_mask.sum()),
                    'rmse': rmse_total
                }

                rmse_results.append(result)
                # Report the count from this window's own result; the previous
                # code read `valid_data`, which is never assigned in this cell.
                print(f"Day {i+1}: ✅ {result['valid_predictions']} predictions, Run: {run.strftime('%m-%d')}")
            else:
                print(f"Day {i+1}: ❌ No forecast data with actual prices")
        else:
            print(f"Day {i+1}: ❌ No training data")

    except Exception as e:
        # Best-effort backtest: report the failing window and continue
        print(f"Day {i+1}: ❌ Error: {e}")

# Create RMSE matrix
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 RMSE SUMMARY - {forecast_horizon}h Forecast")
    print("=" * 60)
    print(f"Successful runs: {len(rmse_df)}/{n_windows}")
    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']])

    # Per-day RMSE columns are only present when the per-day breakdown is
    # enabled; with the overall-RMSE results above this list is empty.
    day_columns = [f'Day_{i}' for i in range(1, 8)]
    available_day_cols = [col for col in day_columns if col in rmse_df.columns]

2025-05-27 08:32:47,490 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:47,491 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-14 23:00:00+00:00
2025-05-27 08:32:47,491 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 00:00:00+00:00, target range: 2025-03-15 00:00:00+00:00 → 2025-03-21 23:00:00+00:00
2025-05-27 08:32:47,545 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen


🔍 Testing XGBoost - RMSE per forecast day


2025-05-27 08:32:47,825 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:47,835 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:47,835 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos', 'apparent_temperature', 'day_of_week', 'day_of_year', 'direct_radiation', 'hour', 'is_holiday', 'local_datetime', 'run_date', 'snowfall', 'wind_direction_10m', 'wind_speed_10m']
2025-05-27 08:32:47,835 - build_training_set - INFO - ❓ Price NaN count: 0/1920 (0.0%)
2025-05-27 08:32:47,852 - build_training_set - INFO - ✅ Opgeslagen als training_set in WARP.db
2025-05-

Day 1: ✅ 168 predictions, Run: 03-15


2025-05-27 08:32:48,297 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:48,298 - build_training_set - INFO - 🧠 Actuals van 2025-01-03 00:00:00+00:00 t/m 2025-03-16 23:00:00+00:00
2025-05-27 08:32:48,298 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-17 00:00:00+00:00, target range: 2025-03-17 00:00:00+00:00 → 2025-03-23 23:00:00+00:00
2025-05-27 08:32:48,315 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:48,453 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:48,462 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:48,462 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 2: ✅ 168 predictions, Run: 03-16


2025-05-27 08:32:48,608 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:48,608 - build_training_set - INFO - 🧠 Actuals van 2025-01-04 00:00:00+00:00 t/m 2025-03-17 23:00:00+00:00
2025-05-27 08:32:48,609 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-18 00:00:00+00:00, target range: 2025-03-18 00:00:00+00:00 → 2025-03-24 23:00:00+00:00
2025-05-27 08:32:48,625 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:48,762 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:48,771 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:48,771 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 3: ✅ 168 predictions, Run: 03-17


2025-05-27 08:32:48,912 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:48,913 - build_training_set - INFO - 🧠 Actuals van 2025-01-05 00:00:00+00:00 t/m 2025-03-18 23:00:00+00:00
2025-05-27 08:32:48,913 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-19 00:00:00+00:00, target range: 2025-03-19 00:00:00+00:00 → 2025-03-25 23:00:00+00:00
2025-05-27 08:32:48,943 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:49,095 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:49,104 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:49,104 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 4: ✅ 168 predictions, Run: 03-18


2025-05-27 08:32:49,116 - build_training_set - INFO - ✅ Opgeslagen als training_set in WARP.db
2025-05-27 08:32:49,117 - build_training_set - INFO - 🔒 Verbinding gesloten
2025-05-27 08:32:49,249 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:49,250 - build_training_set - INFO - 🧠 Actuals van 2025-01-06 00:00:00+00:00 t/m 2025-03-19 23:00:00+00:00
2025-05-27 08:32:49,250 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-20 00:00:00+00:00, target range: 2025-03-20 00:00:00+00:00 → 2025-03-26 23:00:00+00:00
2025-05-27 08:32:49,265 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:49,402 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:49,411 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:49,411 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load'

Day 5: ✅ 168 predictions, Run: 03-19


2025-05-27 08:32:49,551 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:49,552 - build_training_set - INFO - 🧠 Actuals van 2025-01-07 00:00:00+00:00 t/m 2025-03-20 23:00:00+00:00
2025-05-27 08:32:49,552 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-21 00:00:00+00:00, target range: 2025-03-21 00:00:00+00:00 → 2025-03-27 23:00:00+00:00
2025-05-27 08:32:49,568 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:49,706 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:49,714 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:49,715 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 6: ✅ 168 predictions, Run: 03-20


2025-05-27 08:32:49,855 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:49,855 - build_training_set - INFO - 🧠 Actuals van 2025-01-08 00:00:00+00:00 t/m 2025-03-21 23:00:00+00:00
2025-05-27 08:32:49,855 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-22 00:00:00+00:00, target range: 2025-03-22 00:00:00+00:00 → 2025-03-28 23:00:00+00:00
2025-05-27 08:32:49,872 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:50,010 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:50,019 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:50,019 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 7: ✅ 168 predictions, Run: 03-21


2025-05-27 08:32:50,186 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:50,186 - build_training_set - INFO - 🧠 Actuals van 2025-01-09 00:00:00+00:00 t/m 2025-03-22 23:00:00+00:00
2025-05-27 08:32:50,186 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-23 00:00:00+00:00, target range: 2025-03-23 00:00:00+00:00 → 2025-03-29 23:00:00+00:00
2025-05-27 08:32:50,203 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:50,339 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:50,348 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:50,348 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 8: ✅ 168 predictions, Run: 03-22


2025-05-27 08:32:50,491 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:50,491 - build_training_set - INFO - 🧠 Actuals van 2025-01-10 00:00:00+00:00 t/m 2025-03-23 23:00:00+00:00
2025-05-27 08:32:50,492 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-24 00:00:00+00:00, target range: 2025-03-24 00:00:00+00:00 → 2025-03-30 23:00:00+00:00
2025-05-27 08:32:50,509 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:50,649 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:50,658 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:50,659 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 9: ✅ 168 predictions, Run: 03-23


2025-05-27 08:32:50,802 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:50,802 - build_training_set - INFO - 🧠 Actuals van 2025-01-11 00:00:00+00:00 t/m 2025-03-24 23:00:00+00:00
2025-05-27 08:32:50,802 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-25 00:00:00+00:00, target range: 2025-03-25 00:00:00+00:00 → 2025-03-31 23:00:00+00:00
2025-05-27 08:32:50,819 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:50,960 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:50,969 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:50,969 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 10: ✅ 168 predictions, Run: 03-24


2025-05-27 08:32:51,117 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:51,117 - build_training_set - INFO - 🧠 Actuals van 2025-01-12 00:00:00+00:00 t/m 2025-03-25 23:00:00+00:00
2025-05-27 08:32:51,117 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-26 00:00:00+00:00, target range: 2025-03-26 00:00:00+00:00 → 2025-04-01 23:00:00+00:00
2025-05-27 08:32:51,135 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:51,299 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:51,308 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:51,309 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 11: ✅ 168 predictions, Run: 03-25


2025-05-27 08:32:51,320 - build_training_set - INFO - ✅ Opgeslagen als training_set in WARP.db
2025-05-27 08:32:51,321 - build_training_set - INFO - 🔒 Verbinding gesloten
2025-05-27 08:32:51,448 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:51,449 - build_training_set - INFO - 🧠 Actuals van 2025-01-13 00:00:00+00:00 t/m 2025-03-26 23:00:00+00:00
2025-05-27 08:32:51,449 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-27 00:00:00+00:00, target range: 2025-03-27 00:00:00+00:00 → 2025-04-02 23:00:00+00:00
2025-05-27 08:32:51,464 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:51,604 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:51,613 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:51,613 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load'

Day 12: ✅ 168 predictions, Run: 03-26


2025-05-27 08:32:51,757 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:51,757 - build_training_set - INFO - 🧠 Actuals van 2025-01-14 00:00:00+00:00 t/m 2025-03-27 23:00:00+00:00
2025-05-27 08:32:51,757 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-28 00:00:00+00:00, target range: 2025-03-28 00:00:00+00:00 → 2025-04-03 23:00:00+00:00
2025-05-27 08:32:51,774 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:51,915 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:51,923 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:51,924 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 13: ✅ 168 predictions, Run: 03-27


2025-05-27 08:32:52,066 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:52,066 - build_training_set - INFO - 🧠 Actuals van 2025-01-15 00:00:00+00:00 t/m 2025-03-28 23:00:00+00:00
2025-05-27 08:32:52,066 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-29 00:00:00+00:00, target range: 2025-03-29 00:00:00+00:00 → 2025-04-04 23:00:00+00:00
2025-05-27 08:32:52,083 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:52,224 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:52,233 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:52,233 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 14: ✅ 168 predictions, Run: 03-28


2025-05-27 08:32:52,274 - build_training_set - INFO - ✅ Opgeslagen als training_set in WARP.db
2025-05-27 08:32:52,275 - build_training_set - INFO - 🔒 Verbinding gesloten
2025-05-27 08:32:52,405 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:52,406 - build_training_set - INFO - 🧠 Actuals van 2025-01-16 00:00:00+00:00 t/m 2025-03-29 23:00:00+00:00
2025-05-27 08:32:52,406 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-30 00:00:00+00:00, target range: 2025-03-30 00:00:00+00:00 → 2025-04-05 23:00:00+00:00
2025-05-27 08:32:52,421 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:52,561 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:52,570 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:52,571 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load'

Day 15: ✅ 168 predictions, Run: 03-29


2025-05-27 08:32:52,715 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:52,715 - build_training_set - INFO - 🧠 Actuals van 2025-01-17 00:00:00+00:00 t/m 2025-03-30 23:00:00+00:00
2025-05-27 08:32:52,715 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-31 00:00:00+00:00, target range: 2025-03-31 00:00:00+00:00 → 2025-04-06 23:00:00+00:00
2025-05-27 08:32:52,733 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:52,870 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:52,879 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:52,880 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 16: ✅ 168 predictions, Run: 03-30


2025-05-27 08:32:53,024 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:53,024 - build_training_set - INFO - 🧠 Actuals van 2025-01-18 00:00:00+00:00 t/m 2025-03-31 23:00:00+00:00
2025-05-27 08:32:53,024 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-01 00:00:00+00:00, target range: 2025-04-01 00:00:00+00:00 → 2025-04-07 23:00:00+00:00
2025-05-27 08:32:53,040 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:53,180 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:53,189 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:53,189 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 17: ✅ 168 predictions, Run: 03-31


2025-05-27 08:32:53,370 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:53,370 - build_training_set - INFO - 🧠 Actuals van 2025-01-19 00:00:00+00:00 t/m 2025-04-01 23:00:00+00:00
2025-05-27 08:32:53,370 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-02 00:00:00+00:00, target range: 2025-04-02 00:00:00+00:00 → 2025-04-08 23:00:00+00:00
2025-05-27 08:32:53,387 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:53,527 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:53,536 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:53,536 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 18: ✅ 168 predictions, Run: 04-01


2025-05-27 08:32:53,677 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:53,677 - build_training_set - INFO - 🧠 Actuals van 2025-01-20 00:00:00+00:00 t/m 2025-04-02 23:00:00+00:00
2025-05-27 08:32:53,677 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-03 00:00:00+00:00, target range: 2025-04-03 00:00:00+00:00 → 2025-04-09 23:00:00+00:00
2025-05-27 08:32:53,694 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:53,833 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:53,842 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:53,842 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 19: ✅ 168 predictions, Run: 04-02


2025-05-27 08:32:53,985 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:53,986 - build_training_set - INFO - 🧠 Actuals van 2025-01-21 00:00:00+00:00 t/m 2025-04-03 23:00:00+00:00
2025-05-27 08:32:53,986 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-04 00:00:00+00:00, target range: 2025-04-04 00:00:00+00:00 → 2025-04-10 23:00:00+00:00
2025-05-27 08:32:54,002 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:54,143 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:54,151 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:54,151 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 20: ✅ 168 predictions, Run: 04-03


2025-05-27 08:32:54,296 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:54,297 - build_training_set - INFO - 🧠 Actuals van 2025-01-22 00:00:00+00:00 t/m 2025-04-04 23:00:00+00:00
2025-05-27 08:32:54,297 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-05 00:00:00+00:00, target range: 2025-04-05 00:00:00+00:00 → 2025-04-11 23:00:00+00:00
2025-05-27 08:32:54,314 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:54,477 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:54,486 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:54,486 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 21: ✅ 168 predictions, Run: 04-04


2025-05-27 08:32:54,498 - build_training_set - INFO - ✅ Opgeslagen als training_set in WARP.db
2025-05-27 08:32:54,499 - build_training_set - INFO - 🔒 Verbinding gesloten
2025-05-27 08:32:54,631 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:54,631 - build_training_set - INFO - 🧠 Actuals van 2025-01-23 00:00:00+00:00 t/m 2025-04-05 23:00:00+00:00
2025-05-27 08:32:54,631 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-06 00:00:00+00:00, target range: 2025-04-06 00:00:00+00:00 → 2025-04-12 23:00:00+00:00
2025-05-27 08:32:54,648 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:54,786 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:54,795 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:54,795 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load'

Day 22: ✅ 168 predictions, Run: 04-05


2025-05-27 08:32:54,939 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:54,940 - build_training_set - INFO - 🧠 Actuals van 2025-01-24 00:00:00+00:00 t/m 2025-04-06 23:00:00+00:00
2025-05-27 08:32:54,940 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-07 00:00:00+00:00, target range: 2025-04-07 00:00:00+00:00 → 2025-04-13 23:00:00+00:00
2025-05-27 08:32:54,957 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:55,096 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:55,105 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:55,105 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 23: ✅ 168 predictions, Run: 04-06


2025-05-27 08:32:55,248 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:55,249 - build_training_set - INFO - 🧠 Actuals van 2025-01-25 00:00:00+00:00 t/m 2025-04-07 23:00:00+00:00
2025-05-27 08:32:55,249 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-08 00:00:00+00:00, target range: 2025-04-08 00:00:00+00:00 → 2025-04-14 23:00:00+00:00
2025-05-27 08:32:55,265 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:55,405 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:55,414 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:55,414 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 24: ✅ 168 predictions, Run: 04-07


2025-05-27 08:32:55,584 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:55,585 - build_training_set - INFO - 🧠 Actuals van 2025-01-26 00:00:00+00:00 t/m 2025-04-08 23:00:00+00:00
2025-05-27 08:32:55,585 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-09 00:00:00+00:00, target range: 2025-04-09 00:00:00+00:00 → 2025-04-15 23:00:00+00:00
2025-05-27 08:32:55,601 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:55,740 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:55,749 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:55,750 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 25: ✅ 168 predictions, Run: 04-08


2025-05-27 08:32:55,894 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:55,894 - build_training_set - INFO - 🧠 Actuals van 2025-01-27 00:00:00+00:00 t/m 2025-04-09 23:00:00+00:00
2025-05-27 08:32:55,894 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-10 00:00:00+00:00, target range: 2025-04-10 00:00:00+00:00 → 2025-04-16 23:00:00+00:00
2025-05-27 08:32:55,911 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:56,052 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:56,060 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:56,060 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 26: ✅ 168 predictions, Run: 04-09


2025-05-27 08:32:56,201 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:56,201 - build_training_set - INFO - 🧠 Actuals van 2025-01-28 00:00:00+00:00 t/m 2025-04-10 23:00:00+00:00
2025-05-27 08:32:56,201 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-11 00:00:00+00:00, target range: 2025-04-11 00:00:00+00:00 → 2025-04-17 23:00:00+00:00
2025-05-27 08:32:56,218 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:56,358 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:56,367 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:56,367 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 27: ✅ 168 predictions, Run: 04-10


2025-05-27 08:32:56,512 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:56,512 - build_training_set - INFO - 🧠 Actuals van 2025-01-29 00:00:00+00:00 t/m 2025-04-11 23:00:00+00:00
2025-05-27 08:32:56,512 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-12 00:00:00+00:00, target range: 2025-04-12 00:00:00+00:00 → 2025-04-18 23:00:00+00:00
2025-05-27 08:32:56,534 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:56,696 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:56,705 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:56,705 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 'is_dst', 'yearday_sin', 'is_non_

Day 28: ✅ 168 predictions, Run: 04-11


2025-05-27 08:32:56,720 - build_training_set - INFO - ✅ Opgeslagen als training_set in WARP.db
2025-05-27 08:32:56,721 - build_training_set - INFO - 🔒 Verbinding gesloten
2025-05-27 08:32:56,852 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-27 08:32:56,853 - build_training_set - INFO - 🧠 Actuals van 2025-01-30 00:00:00+00:00 t/m 2025-04-12 23:00:00+00:00
2025-05-27 08:32:56,853 - build_training_set - INFO - 📅 Forecast van run_date 2025-04-13 00:00:00+00:00, target range: 2025-04-13 00:00:00+00:00 → 2025-04-19 23:00:00+00:00
2025-05-27 08:32:56,869 - build_training_set - INFO - ✅ Actuals geladen: 1752 rijen
2025-05-27 08:32:57,006 - build_training_set - INFO - ✅ Added actual prices to 168 forecast rows
  df_combined = pd.concat([df_actuals, df_preds], ignore_index=True)
2025-05-27 08:32:57,016 - build_training_set - INFO - 📦 Eindtabel bevat: 1920 rijen, 31 kolommen
2025-05-27 08:32:57,016 - build_training_set - INFO - 🧾 Kolommen: ['Price', 'target_datetime', 'Load'

Day 29: ✅ 168 predictions, Run: 04-12
Day 30: ✅ 168 predictions, Run: 04-13

📊 RMSE SUMMARY - 168h Forecast
Successful runs: 30/30
    iteration    run_date  valid_predictions      rmse
0           1  2025-03-15                168  0.113546
1           2  2025-03-16                168  0.116602
2           3  2025-03-17                168  0.110639
3           4  2025-03-18                168  0.114046
4           5  2025-03-19                168  0.107027
5           6  2025-03-20                168  0.098867
6           7  2025-03-21                168  0.103344
7           8  2025-03-22                168  0.103362
8           9  2025-03-23                168  0.102114
9          10  2025-03-24                168  0.123128
10         11  2025-03-25                168  0.133307
11         12  2025-03-26                168  0.112341
12         13  2025-03-27                168  0.132834
13         14  2025-03-28                168  0.153039
14         15  2025-03-29                168

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
color_pal = sns.color_palette()
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
import sqlite3
from contextlib import closing

# Load the complete training_set table from the SQLite database.
# closing() releases the connection even when the query raises; sqlite3's own
# context manager only commits/rolls back and would leave the handle open.
with closing(sqlite3.connect('../data/WARP.db')) as conn:
    df = pd.read_sql_query("SELECT * FROM training_set", conn)

# Use target_datetime as the index (new frame instead of inplace mutation so the
# cell can be re-run safely).
df = df.set_index('target_datetime')
# Parse to a tz-aware UTC DatetimeIndex. The modelling cells compare the index
# against tz-aware UTC timestamps, which raises if the index is tz-naive.
df.index = pd.to_datetime(df.index, utc=True)
print(df.dtypes)


In [None]:
# Histogram of the Price column with 500 bins; the y-axis is capped at 120.
price_ax = df['Price'].plot.hist(bins=500)
price_ax.set_ylim(top=120)
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
price_ax.set_title('Price Distribution')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import timedelta

# Walk-forward evaluation: for every daily forecast origin a fresh XGBoost model
# is trained on all rows strictly before the origin and scored (RMSE) on the
# window [origin + lag, origin + lag + forecast_horizon).

# Feature and target setup
features = [
    'apparent_temperature',
    'temperature_2m',
    'direct_normal_irradiance',
    'diffuse_radiation',
    'yearday_sin',
    'Flow_BE',
    'hour_sin',
    'is_non_working_day',
    'is_dst',
    'is_weekend',
    'is_holiday',
    'weekday_cos',
    'wind_speed_10m',
    'hour_cos',
    'weekday_sin',
    'cloud_cover',
    'Flow_GB',
    'yearday_cos',
    'Flow_NO',
    'Load'
]
target = 'Price'

# Safe datetime handling: use a 'datetime' column as index when present,
# otherwise assume the existing index already holds the timestamps.
if 'datetime' in df.columns:
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime')
    df = df.set_index('datetime')
else:
    print("'datetime' column not found in columns. Sorting by index instead.")
    df = df.sort_index()

# Forecast settings
start_date = pd.Timestamp("2025-03-13 12:00", tz='UTC')
end_date = pd.Timestamp("2025-05-14 12:00", tz='UTC')
lag = timedelta(hours=36)                # gap between origin and first scored hour
forecast_horizon = timedelta(hours=144)  # length of the scored window

# One entry per evaluated run. forecast_origins is kept in lockstep with rmses
# so later cells can tell which origin each RMSE belongs to even when some
# origins were skipped.
rmses = []
forecast_origins = []

# Daily origins, both endpoints inclusive. Iterating a date range (instead of
# manually incrementing inside a while-loop) means a `continue` can never
# forget to advance the origin.
for current_time in pd.date_range(start_date, end_date, freq='D'):
    test_start = current_time + lag
    test_end = test_start + forecast_horizon
    train_data = df[df.index < current_time]
    test_data = df[(df.index >= test_start) & (df.index < test_end)]

    if test_data.empty:
        print(f"No test data for forecast starting at {current_time}")
        continue
    if train_data.empty:
        # Fitting on zero rows would raise inside XGBoost; skip with a clear message.
        print(f"No training data before forecast origin {current_time}")
        continue

    X_train = train_data[features]
    y_train = train_data[target]
    X_test = test_data[features]
    y_test = test_data[target]

    # Train and predict
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmses.append(rmse)
    forecast_origins.append(current_time)

    print(f"Forecast origin: {current_time}, Predicting {test_start} to {test_end}, RMSE: {rmse:.3f}")

# Summary (np.mean of an empty list would emit a RuntimeWarning and return NaN)
if rmses:
    avg_rmse = np.mean(rmses)
    print(f"\nAverage RMSE over {len(rmses)} runs: {avg_rmse:.3f}")
else:
    avg_rmse = float('nan')
    print("\nNo forecast runs were evaluated; average RMSE is undefined.")

In [None]:
import pandas as pd
import numpy as np
import json
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import datetime
from pathlib import Path

# Single chronological split: train an XGBoost regressor on all rows before
# timestamp_predict, score it on the rows from timestamp_predict onwards, and
# append one result row per predicted timestamp to a CSV log.

# Configuration
MODEL_CODE = "XGB_1fold_split"
PARAMETERS_USED = {
    "temp_2m": True,
    "shortwave_radiation": True,
    "windspeed_10m": True
}
timestamp_predict = pd.to_datetime("2025-03-15 00:00:00", utc=True)
TARGET = 'Price'
RESULTS_CSV = Path("model_results_log.csv")

# Define features
# (the original list contained a stray ", ," after 'cloud_cover', which is a SyntaxError)
# NOTE(review): this list uses 'windspeed_10m' while the walk-forward cells in this
# notebook select 'wind_speed_10m' — confirm which column name the table really has.
COMMON_FEATURES = [
    'yearday_cos', 'yearday_sin', 'month',
    'shortwave_radiation', 'windspeed_10m', 'apparent_temperature', 'temperature_2m',
    'direct_normal_irradiance', 'diffuse_radiation',
    'cloud_cover', 'hour_cos', 'hour_sin', 'is_non_working_day',
    'weekday_sin', 'weekday_cos', 'is_holiday',
]

# Features the model may see during training but not at prediction time
TRAIN_ONLY_FEATURES = ['Load', 'Flow_NO', 'Flow_GB', 'Flow_BE', 'Wind_Vol', 'Solar_Vol']
TRAIN_FEATURES = COMMON_FEATURES + TRAIN_ONLY_FEATURES
TEST_FEATURES = COMMON_FEATURES

# Sort by index
df = df.sort_index()

# Train-test split based on timestamp_predict
train = df[df.index < timestamp_predict]
test = df[df.index >= timestamp_predict]
if train.empty or test.empty:
    raise ValueError(
        f"Split at {timestamp_predict} leaves {len(train)} training and "
        f"{len(test)} test rows; both sides must be non-empty."
    )

X_train = train[TRAIN_FEATURES]
y_train = train[TARGET]
# XGBoost requires the same feature columns at fit, eval and predict time, so the
# test matrix is expanded to TRAIN_FEATURES. The train-only columns are filled
# with NaN (handled by XGBoost as missing) so no test-period value of them is used.
# NOTE(review): if those columns hold legitimate forecasts for the test period,
# use test[TRAIN_FEATURES] instead.
X_test = test[TEST_FEATURES].reindex(columns=TRAIN_FEATURES)
y_test = test[TARGET]

# Train model
reg = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    n_estimators=1200,
    early_stopping_rounds=50,
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.02
)

# NOTE(review): early stopping monitors the last eval_set entry, which is the test
# split here, so the RMSE reported below is optimistically biased. Consider a
# separate validation slice taken from the end of the training period.
reg.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=100
)

y_pred = reg.predict(X_test)

# Store predictions
pred_df = pd.DataFrame({
    'datetime': pd.to_datetime(y_test.index),
    'y_true': y_test.values,
    'y_pred': y_pred
})

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"1-Fold RMSE: {rmse:.4f}")

# Collect results: one row per predicted timestamp, built column-wise instead of
# looping over iterrows(). Scalars are broadcast to every row.
# The 'RSME' column name (sic) is kept so new rows match rows already in the log.
results_df = pd.DataFrame({
    'timestamp_predict': timestamp_predict.strftime('%d-%m-%Y %H:%M'),
    'datetime': pred_df['datetime'].dt.strftime('%d-%m-%Y %H:%M'),
    'model_code': MODEL_CODE,
    'price': pred_df['y_pred'].astype(float).round(4),
    'true_price': pred_df['y_true'].round(4),
    'RSME': round(rmse, 4),
    'fold': 1,
    'parameters(JSON)': json.dumps(PARAMETERS_USED)
})
results = results_df.to_dict(orient='records')

# Save to CSV: append, but write the header only when the file is new or empty;
# otherwise every run would insert another header line in the middle of the data.
write_header = (not RESULTS_CSV.exists()) or RESULTS_CSV.stat().st_size == 0
results_df.to_csv(RESULTS_CSV, mode='a', index=False, header=write_header)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import timedelta

# Walk-forward evaluation with a reduced feature set: for every daily forecast
# origin a fresh XGBoost model is trained on all rows strictly before the origin
# and scored (RMSE) on the window [origin + lag, origin + lag + forecast_horizon).

# Feature and target setup
features = ['hour_cos', 'Load', 'hour_sin', 'weekday_sin', 'weekday_cos', 'Solar_Vol', 'Wind_Vol',
            'WindOffshore_Vol', 'is_holiday','Total_Flow']

target = 'Price'

# Safe datetime handling: use a 'datetime' column as index when present,
# otherwise assume the existing index already holds the timestamps.
if 'datetime' in df.columns:
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime')
    df = df.set_index('datetime')
else:
    print("'datetime' column not found in columns. Sorting by index instead.")
    df = df.sort_index()

# Forecast settings
start_date = pd.Timestamp("2025-03-13 12:00", tz='UTC')
end_date = pd.Timestamp("2025-05-05 12:00", tz='UTC')
lag = timedelta(hours=36)                # gap between origin and first scored hour
forecast_horizon = timedelta(hours=144)  # length of the scored window

# One entry per evaluated run. forecast_origins is kept in lockstep with rmses
# so later cells can tell which origin each RMSE belongs to even when some
# origins were skipped.
rmses = []
forecast_origins = []

# Daily origins, both endpoints inclusive. Iterating a date range (instead of
# manually incrementing inside a while-loop) means a `continue` can never
# forget to advance the origin.
for current_time in pd.date_range(start_date, end_date, freq='D'):
    test_start = current_time + lag
    test_end = test_start + forecast_horizon
    train_data = df[df.index < current_time]
    test_data = df[(df.index >= test_start) & (df.index < test_end)]

    if test_data.empty:
        print(f"No test data for forecast starting at {current_time}")
        continue
    if train_data.empty:
        # Fitting on zero rows would raise inside XGBoost; skip with a clear message.
        print(f"No training data before forecast origin {current_time}")
        continue

    X_train = train_data[features]
    y_train = train_data[target]
    X_test = test_data[features]
    y_test = test_data[target]

    # Train and predict
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmses.append(rmse)
    forecast_origins.append(current_time)

    print(f"Forecast origin: {current_time}, Predicting {test_start} to {test_end}, RMSE: {rmse:.3f}")

# Summary (np.mean of an empty list would emit a RuntimeWarning and return NaN)
if rmses:
    avg_rmse = np.mean(rmses)
    print(f"\nAverage RMSE over {len(rmses)} runs: {avg_rmse:.3f}")
else:
    avg_rmse = float('nan')
    print("\nNo forecast runs were evaluated; average RMSE is undefined.")

In [None]:
# Rebuild the forecast origin of every evaluated run so it lines up 1:1 with `rmses`.
# The walk-forward loop skips origins that have no test rows, so simply counting
# days from start_date would attach later RMSEs to the wrong dates. The same
# selection rule is therefore replayed here against df, using the `lag` and
# `forecast_horizon` values left in the kernel by the walk-forward cell.
forecast_origins = []
rmses_per_run = list(rmses)

start_date = pd.Timestamp("2025-03-13 12:00", tz='UTC')
end_date = pd.Timestamp("2025-05-05 12:00", tz='UTC')
num_runs = len(rmses)
current_time = start_date

while current_time <= end_date:
    window_start = current_time + lag
    window_end = window_start + forecast_horizon
    has_test_rows = ((df.index >= window_start) & (df.index < window_end)).any()
    has_train_rows = (df.index < current_time).any()
    if has_test_rows and has_train_rows:
        forecast_origins.append(current_time)
    current_time += timedelta(days=1)

# Fail loudly instead of plotting misaligned points (e.g. when `rmses` comes from
# a loop that was run with a different start/end date than the ones above).
if len(forecast_origins) != num_runs:
    raise ValueError(
        f"Found {len(forecast_origins)} forecast origins between {start_date} and "
        f"{end_date} but {num_runs} RMSE values; re-run the walk-forward cell with "
        f"the same date range."
    )

# Plot RMSE vs. forecast origin date.
# The x-axis holds forecast origins; the first predicted hour lies `lag` later,
# so the labels no longer call the origin the "first predicted date".
plt.figure(figsize=(15, 5))
plt.plot(forecast_origins, rmses_per_run, marker='o', linestyle='-', color=color_pal[0])
plt.title('RMSE vs. Forecast Origin (per run)')
plt.xlabel('Forecast Origin')
plt.ylabel('RMSE')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Average RMSE per predicted calendar day.
# Inputs: forecast_origins and rmses from the earlier cells (one entry per run).
# Each run's single RMSE is credited to every one of the 6 days (144 hours) its
# forecast window covers, starting 36 hours after the run's origin; days covered
# by several runs get the mean of those runs' RMSEs.

forecast_horizon_days = 6
forecast_origin_dates = pd.to_datetime(forecast_origins)
rmse_per_day = {}

for run_idx, origin in enumerate(forecast_origin_dates):
    first_predicted = origin + pd.Timedelta(hours=36)
    for day_offset in range(forecast_horizon_days):
        # normalize() drops the time of day so all hours of a day share one key
        day = (first_predicted + pd.Timedelta(days=day_offset)).normalize()
        rmse_per_day.setdefault(day, []).append(rmses[run_idx])

# Mean per day, ordered chronologically
avg_rmse_per_day = pd.Series(
    {day: np.mean(vals) for day, vals in rmse_per_day.items()}
).sort_index()

# Plot
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(avg_rmse_per_day.index, avg_rmse_per_day.values, marker='o', linestyle='-', color=color_pal[1])
ax.set_title('Average RMSE per Predicted Day (Averaged over all runs)')
ax.set_xlabel('Predicted Day')
ax.set_ylabel('Average RMSE')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Feature importances of `model`, i.e. whichever model the kernel trained last.
feat_imp_df = pd.DataFrame({
    'Feature': model.feature_names_in_,
    'Importance': model.feature_importances_
})
# Most important feature first
feat_imp_df = feat_imp_df.sort_values(by='Importance', ascending=False)

# Text table of the ranking
print(feat_imp_df)

# Horizontal bar chart of the same ranking
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feat_imp_df, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Feature Importance (XGBoost)')
fig.tight_layout()
plt.show()