In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
# Dynamic path setup
import sys
from pathlib import Path

# Find project root dynamically
current_dir = Path.cwd()
while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
    current_dir = current_dir.parent
project_root = current_dir

# Add utils to path
utils_path = project_root / "src" / "utils"
sys.path.append(str(utils_path))
from build_training_set import build_training_set
 
df = build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 00:00:00"
)

print(df.tail())
df.to_csv("training_set1.csv", index=False)


2025-05-30 15:16:32,227 - build_training_set - INFO - 📅 Loading additional historical data until 2025-03-15 00:00:00+00:00 for lagging support
2025-05-30 15:16:32,231 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-30 15:16:32,232 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-14 23:00:00+00:00 (extended to 2025-03-15 00:00:00+00:00 for lagging)
2025-05-30 15:16:32,233 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 00:00:00+00:00, normalized to 2025-03-15 00:00:00+00:00 for DB lookup, target range: 2025-03-15 00:00:00+00:00 → 2025-03-22 00:00:00+00:00
2025-05-30 15:16:32,233 - build_training_set - INFO - 📥 Loading actuals with selected columns only...
2025-05-30 15:16:32,233 - build_training_set - INFO - 📋 Requested columns found: 21/21
2025-05-30 15:16:32,241 - build_training_set - INFO - 📋 Using columns: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'di

2025-05-30 15:16:32,322 - build_training_set - INFO - ✅ Actuals loaded: 1752 rows with 21 selected columns
2025-05-30 15:16:32,322 - build_training_set - INFO - 🔍 Loading forecast/prediction data...
2025-05-30 15:16:32,347 - build_training_set - INFO - 📊 Forecast rows available: 169
2025-05-30 15:16:32,356 - build_training_set - INFO - 📋 Common columns for predictions: 17 - ['target_datetime', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'yearday_cos', 'month', 'is_dst', 'yearday_sin', 'wind_speed_10m', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos']
2025-05-30 15:16:32,389 - build_training_set - INFO - ✅ Predictions loaded: 169 rows with 17 columns
2025-05-30 15:16:32,389 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:32,397 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding tar

        Price           target_datetime      Load  shortwave_radiation  \
1916  0.06842 2025-03-21 20:00:00+00:00  15700.25                  0.0   
1917  0.04569 2025-03-21 21:00:00+00:00  14870.50                  0.0   
1918  0.02923 2025-03-21 22:00:00+00:00  14056.25                  0.0   
1919  0.02108 2025-03-21 23:00:00+00:00  13248.25                  0.0   
1920  0.02693 2025-03-22 00:00:00+00:00  12561.25                  0.0   

      temperature_2m  direct_normal_irradiance  diffuse_radiation  Flow_NO  \
1916           13.05                       0.0                0.0    621.0   
1917           13.00                       0.0                0.0    620.0   
1918           12.60                       0.0                0.0    621.0   
1919           12.00                       0.0                0.0    621.0   
1920           10.05                       0.0                0.0    621.0   

      yearday_cos  Flow_GB  ...  is_dst  yearday_sin  wind_speed_10m  \
1916     0.193

In [6]:
from sklearn.ensemble import RandomForestRegressor

df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
df['Timestamp'] = df['target_datetime']
# Feature engineering
df = df.dropna(subset=['target_datetime', 'Price'])

# Stel de periodes in
train_start = pd.Timestamp('2025-01-01', tz = 'UTC')
train_end = pd.Timestamp('2025-03-14 23:59:59', tz = 'UTC')
test_start = pd.Timestamp('2025-03-15', tz = 'UTC')
test_end = pd.Timestamp('2025-03-21 23:59:59', tz = 'UTC')

# Filter op basis van Timestamp
train = df[(df['Timestamp'] >= train_start) & (df['Timestamp'] <= train_end)]
test = df[(df['Timestamp'] >= test_start) & (df['Timestamp'] <= test_end)]

X_train = train[['weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos']]	
y_train = train['Price']
X_test = test[['weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos']]	
y_test = test['Price']

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluatie
from sklearn.metrics import mean_absolute_error
print("MAE:", mean_absolute_error(y_test, y_pred))

from sklearn.metrics import mean_squared_error
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("MAE:", mae)
print("RMSE:", rmse)

MAE: 0.03885165439220991
MAE: 0.03885165439220991
RMSE: 0.05183017036495341


In [None]:
from sklearn.ensemble import RandomForestRegressor

df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
df['Timestamp'] = df['target_datetime']
# Feature engineering
df = df.dropna(subset=['target_datetime', 'Price'])

# Stel de periodes in
train_start = pd.Timestamp('2025-01-01', tz = 'UTC')
train_end = pd.Timestamp('2025-03-14 23:59:59', tz = 'UTC')
test_start = pd.Timestamp('2025-03-15', tz = 'UTC')
test_end = pd.Timestamp('2025-03-21 23:59:59', tz = 'UTC')

# Filter op basis van Timestamp
train = df[(df['Timestamp'] >= train_start) & (df['Timestamp'] <= train_end)]
test = df[(df['Timestamp'] >= test_start) & (df['Timestamp'] <= test_end)]

# Features voor het model
features = [
    'Flow_NO',  'hour_sin', 
    'Load','shortwave_radiation', 'temperature_2m', 
    'Flow_GB', 'weekday_sin'
] # excluded: 'month', 'yearday_sin', 'weekday_cos', diffuse_radiation', 'yearday_cos', 'hour_cos','is_dst','is_non_working_day', 'is_weekend','cloud_cover','direct_normal_irradiance',


X_train = train[features]
y_train = train['Price']
X_test = test[features]
y_test = test['Price']

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluatie
from sklearn.metrics import mean_absolute_error
print("MAE:", mean_absolute_error(y_test, y_pred))

from sklearn.metrics import mean_squared_error
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("MAE:", mae)
print("RMSE:", rmse)

In [None]:
import matplotlib.pyplot as plt

importances = model.feature_importances_
feature_names = X_train.columns

plt.figure(figsize=(8, 4))
plt.barh(feature_names, importances)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importances")
plt.show()

In [None]:
plt.figure(figsize=(10, 4))
plt.plot(y_test.values, label='Actual', marker='o')
plt.plot(y_pred, label='Predicted', marker='x')
plt.legend()
plt.title("Actual vs Predicted Prices")
plt.xlabel("Sample")
plt.ylabel("Price")
plt.show()

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define feature columns and target

FEATURES = ['weekday_cos', 'weekday_sin', 'hour_cos', 'hour_sin', 'yearday_cos', 'yearday_sin']
# excluded: : 'Flow_NO',  'hour_sin', 'Load',
#    'shortwave_radiation', 'temperature_2m', 'Flow_GB', 
 #   , 'Load', 'Flow_NO', 'Flow_GB', 'yearday_sin', 'diffuse_radiation', 'yearday_cos' 'hour_cos', 'month','is_dst',
# 'is_non_working_day', 'is_weekend','cloud_cover','direct_normal_irradiance',

TRAIN_FEATURES = FEATURES
TEST_FEATURES = FEATURES
target = 'Price'

# Initial training window
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

rmse_results = []

print("🔍 Testing Random Forest Model - RMSE per forecast day")
print("=" * 60)

for i in range(30):
    start = pd.Timestamp(base_start)  + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is None or df.empty:
            print(f"Day {i+1}: ❌ No training data returned")
            continue

        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')

        # Zorg dat run_date ook in UTC is
        run_date_utc = run_date.tz_localize("UTC")

        # Split into training and testing sets
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]

        # Drop any missing data in training
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])

        if test_data.empty or train_data.empty:
            print(f"Day {i+1}: ❌ Not enough data for training or testing")
            continue

        # Train model
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]

        # Aanvullen met NaN-kolommen waar nodig
        for col in TRAIN_FEATURES:
            if col not in test_data.columns:
                test_data[col] = np.nan



        X_test = test_data[TRAIN_FEATURES]
        y_test = test_data[target]

        # Sla de eerste 24 uur over
        X_test = X_test.iloc[24:]
        y_test = y_test.iloc[24:]

        # Alleen voorspellen als er nog testdata over is
        if len(X_test) > 0:
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            # ...rest van je evaluatie...
        else:
            print("Niet genoeg testdata na lag van 24 uur.")




        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_results.append({
            'iteration': i + 1,
            'run_date': run_date.strftime('%Y-%m-%d'),
            'valid_predictions': len(test_data),
            'rmse': rmse
        })

        print(f"Day {i+1}: ✅ {len(test_data)} test rows, Run: {run_date.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create results dataframe
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 OVERALL RMSE - Random Forest Model")
    print("=" * 80)
    print(f"Successful runs: {len(rmse_df)}/30")

    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']].round(2).to_string(index=False))

    print(f"\n📈 SUMMARY STATISTICS")
    print("-" * 40)
    print(rmse_df['rmse'].describe().round(2))

    print(f"\n📊 AVERAGE OVERALL RMSE")
    print("-" * 40)
    print(f"Mean RMSE: {rmse_df['rmse'].mean():.4f}")
else:
    print("❌ No runs completed successfully")

2025-05-30 15:16:45,048 - build_training_set - INFO - 📅 Loading additional historical data until 2025-03-15 00:00:00+00:00 for lagging support
2025-05-30 15:16:45,049 - build_training_set - INFO - 🚀 Start build van trainingset
2025-05-30 15:16:45,052 - build_training_set - INFO - 🧠 Actuals van 2025-01-01 00:00:00+00:00 t/m 2025-03-14 23:00:00+00:00 (extended to 2025-03-15 00:00:00+00:00 for lagging)
2025-05-30 15:16:45,052 - build_training_set - INFO - 📅 Forecast van run_date 2025-03-15 00:00:00+00:00, normalized to 2025-03-15 00:00:00+00:00 for DB lookup, target range: 2025-03-15 00:00:00+00:00 → 2025-03-22 00:00:00+00:00
2025-05-30 15:16:45,052 - build_training_set - INFO - 📥 Loading actuals with selected columns only...
2025-05-30 15:16:45,063 - build_training_set - INFO - 📋 Requested columns found: 21/21
2025-05-30 15:16:45,064 - build_training_set - INFO - 📋 Using columns: ['Price', 'target_datetime', 'Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'di

🔍 Testing Random Forest Model - RMSE per forecast day


2025-05-30 15:16:45,276 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:45,276 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:45,348 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:45,348 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:45,436 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:45,445 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:45,445 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:45,454 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:45,527 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:45,527 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:45,535 - buil

Day 1: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:45,887 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:45,887 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:45,887 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:45,887 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:46,020 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:46,020 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:46,183 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:46,183 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:46,263 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:46,270 - build_training_set - I

Day 2: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:46,655 - build_training_set - INFO - ✅ Predictions loaded: 169 rows with 17 columns
2025-05-30 15:16:46,655 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:46,655 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:46,655 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:46,664 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:46,805 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:46,805 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:46,920 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:46,928 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:47,054 - build_training_s

Day 3: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:47,420 - build_training_set - INFO - ✅ Predictions loaded: 169 rows with 17 columns
2025-05-30 15:16:47,420 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:47,420 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:47,428 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:47,428 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:47,520 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:47,521 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:47,638 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:47,639 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:47,787 - build_training_s

Day 4: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:48,171 - build_training_set - INFO - 📊 Forecast rows available: 169
2025-05-30 15:16:48,182 - build_training_set - INFO - 📋 Common columns for predictions: 17 - ['target_datetime', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 'diffuse_radiation', 'yearday_cos', 'month', 'is_dst', 'yearday_sin', 'wind_speed_10m', 'is_non_working_day', 'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos']
2025-05-30 15:16:48,206 - build_training_set - INFO - ✅ Predictions loaded: 169 rows with 17 columns
2025-05-30 15:16:48,206 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:48,206 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:48,206 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:48,206 - build_training_set - INFO -    

Day 5: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:48,831 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:48,831 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:48,831 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:48,831 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:48,980 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:48,989 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:49,106 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:49,106 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:49,207 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:49,210 - build_training_set - I

Day 6: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:49,589 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:49,589 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:49,589 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:49,705 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:49,705 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:49,772 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:49,780 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:49,855 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:49,863 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:49,863 - build_training_set - INFO - 💰 Retrieving actual prices fo

Day 7: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:50,247 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:50,247 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:50,372 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:50,372 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:50,488 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:50,488 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:50,600 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:50,608 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:50,608 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:50,617 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-3

Day 8: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:51,015 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:51,177 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:51,185 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:51,299 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:51,299 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:51,415 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:51,423 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:51,424 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:51,424 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:51,481 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:51,489 -

Day 9: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:51,958 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:51,959 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:52,082 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:52,083 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:52,158 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:52,158 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:52,158 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:52,165 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:52,212 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:52,212 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:52,221 - buil

Day 10: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:52,589 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:52,593 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:52,673 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:52,674 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:52,752 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:52,754 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:52,757 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:52,762 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:52,795 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:52,795 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:52,803 - buil

Day 11: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:53,196 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:53,196 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:53,293 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:53,293 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:53,461 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:53,461 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:53,475 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:53,486 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:53,527 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:53,527 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:53,527 - buil

Day 12: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:53,887 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:53,889 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:53,960 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:53,960 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:54,045 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:54,045 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:54,054 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:54,066 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:54,145 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:54,145 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:54,154 - buil

Day 13: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:54,508 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:54,511 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:54,658 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:54,661 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:54,768 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:54,768 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:54,777 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:54,785 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:54,853 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:54,853 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:54,859 - buil

Day 14: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:55,194 - build_training_set - INFO - 🔧 Missing columns in predictions: ['Price', 'Load', 'Flow_NO', 'Flow_GB']
2025-05-30 15:16:55,194 - build_training_set - INFO - 📊 Applying 168-hour lag for missing columns (excluding target variables)...
2025-05-30 15:16:55,194 - build_training_set - INFO -    🎯 Column 'Price' is target variable - filled with NaN (not lagged)
2025-05-30 15:16:55,203 - build_training_set - INFO -    🕐 Lagging column 'Load' by 168 hours
2025-05-30 15:16:55,309 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:55,309 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:55,436 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:55,446 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:55,597 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:55,597 - build_training_set - I

Day 15: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:56,088 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:56,094 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:56,171 - build_training_set - INFO -    ✅ Added Flow_NO: 169/169 values found
2025-05-30 15:16:56,179 - build_training_set - INFO -    🕐 Lagging column 'Flow_GB' by 168 hours
2025-05-30 15:16:56,278 - build_training_set - INFO -    ✅ Added Flow_GB: 169/169 values found
2025-05-30 15:16:56,278 - build_training_set - INFO - 🔄 Combining actuals and predictions...
2025-05-30 15:16:56,278 - build_training_set - INFO - 💰 Retrieving actual prices for forecast period...
2025-05-30 15:16:56,286 - build_training_set - INFO - 📊 Found 169 actual prices for forecast period
2025-05-30 15:16:56,324 - build_training_set - INFO - ✅ Filled 169/169 prediction prices with actual values
2025-05-30 15:16:56,331 - build_training_set - INFO - 💰 Price coverage: 169/169 (100.0%)
2025-05-30 15:16:56,331 - buil

Day 16: ❌ Error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- yearday_cos
- yearday_sin



2025-05-30 15:16:56,721 - build_training_set - INFO -    ✅ Added Load: 169/169 values found
2025-05-30 15:16:56,729 - build_training_set - INFO -    🕐 Lagging column 'Flow_NO' by 168 hours
2025-05-30 15:16:56,846 - build_training_set - INFO - 🔒 Connection closed


KeyboardInterrupt: 

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define feature columns and target

FEATURES = ['Load', 'shortwave_radiation', 'temperature_2m', 'direct_normal_irradiance', 
            'diffuse_radiation', 'Flow_NO', 'yearday_cos', 'Flow_GB', 'month', 
            'is_dst', 'yearday_sin', 'wind_speed_10m', 'is_non_working_day', 
            'hour_cos', 'is_weekend', 'cloud_cover', 'weekday_sin', 'hour_sin', 'weekday_cos']

TRAIN_FEATURES = FEATURES
TEST_FEATURES = FEATURES
target = 'Price'

# Initial training window
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

rmse_results = []

print("🔍 Testing Random Forest Model - RMSE per forecast day")
print("=" * 60)

for i in range(30):
    start = pd.Timestamp(base_start)  + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is None or df.empty:
            print(f"Day {i+1}: ❌ No training data returned")
            continue

        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')

        # Zorg dat run_date ook in UTC is
        run_date_utc = run_date.tz_localize("UTC")

        # Split into training and testing sets
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]

        # Drop any missing data in training
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])

        if test_data.empty or train_data.empty:
            print(f"Day {i+1}: ❌ Not enough data for training or testing")
            continue

        # Train model
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]

        # Aanvullen met NaN-kolommen waar nodig
        for col in TRAIN_FEATURES:
            if col not in test_data.columns:
                test_data[col] = np.nan




        X_test = test_data[TRAIN_FEATURES]
        y_test = test_data[target]

        # Sla de eerste 24 uur over
        X_test = X_test.iloc[24:]
        y_test = y_test.iloc[24:]

        # Alleen voorspellen als er nog testdata over is
        if len(X_test) > 0:
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            # ...rest van je evaluatie...
        else:
            print("Niet genoeg testdata na lag van 24 uur.")




        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_results.append({
            'iteration': i + 1,
            'run_date': run_date.strftime('%Y-%m-%d'),
            'valid_predictions': len(test_data),
            'rmse': rmse
        })

        print(f"Day {i+1}: ✅ {len(test_data)} test rows, Run: {run_date.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create results dataframe
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 OVERALL RMSE - Random Forest Model")
    print("=" * 80)
    print(f"Successful runs: {len(rmse_df)}/30")

    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']].round(2).to_string(index=False))

    print(f"\n📈 SUMMARY STATISTICS")
    print("-" * 40)
    print(rmse_df['rmse'].describe().round(2))

    print(f"\n📊 AVERAGE OVERALL RMSE")
    print("-" * 40)
    print(f"Mean RMSE: {rmse_df['rmse'].mean():.4f}")
else:
    print("❌ No runs completed successfully")

🔍 Testing Random Forest Model - RMSE per forecast day
Day 1: ❌ Error: name 'build_training_set' is not defined
Day 2: ❌ Error: name 'build_training_set' is not defined
Day 3: ❌ Error: name 'build_training_set' is not defined
Day 4: ❌ Error: name 'build_training_set' is not defined
Day 5: ❌ Error: name 'build_training_set' is not defined
Day 6: ❌ Error: name 'build_training_set' is not defined
Day 7: ❌ Error: name 'build_training_set' is not defined
Day 8: ❌ Error: name 'build_training_set' is not defined
Day 9: ❌ Error: name 'build_training_set' is not defined
Day 10: ❌ Error: name 'build_training_set' is not defined
Day 11: ❌ Error: name 'build_training_set' is not defined
Day 12: ❌ Error: name 'build_training_set' is not defined
Day 13: ❌ Error: name 'build_training_set' is not defined
Day 14: ❌ Error: name 'build_training_set' is not defined
Day 15: ❌ Error: name 'build_training_set' is not defined
Day 16: ❌ Error: name 'build_training_set' is not defined
Day 17: ❌ Error: name 'buil

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Example: Use your initial training set to select features
df = build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 00:00:00"
)
df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
df = df.sort_values('target_datetime').set_index('target_datetime')

# Define all candidate features
ALL_FEATURES = [
    'is_dst', 'hour_cos', 'hour_sin', 'month', 'is_non_working_day',
    'shortwave_radiation', 'temperature_2m', 'cloud_cover', 'direct_normal_irradiance',
    'diffuse_radiation', 'is_weekend', 'yearday_cos', 'yearday_sin',
    'weekday_sin', 'weekday_cos', 'Load', 'Flow_NO', 'Flow_GB'
]
target = 'Price'

# Drop missing values
df = df.dropna(subset=ALL_FEATURES + [target])

# Train a Random Forest to get feature importances
X = df[ALL_FEATURES]
y = df[target]
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

importances = model.feature_importances_
feat_imp_df = pd.DataFrame({
    'Feature': ALL_FEATURES,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Print top N features
N = 10
print(f"Top {N} features by importance:")
print(feat_imp_df.head(N))

# If you want to use the top N features for your model:
SELECTED_FEATURES = feat_imp_df['Feature'].head(N).tolist()
print("Selected features for the model:", SELECTED_FEATURES)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define feature columns and target

COMMON_FEATURES = [
    'Flow_NO',  'hour_sin', 'Load',
    'Flow_GB', 'weekday_sin', 
    'yearday_cos', 'yearday_sin', 'hour_cos', 'weekday_sin', 
    'weekday_cos', 'hour_sin', 'direct_normal_irradiance'
]
TRAIN_ONLY_FEATURES = ['Load', 'Flow_NO', 'Flow_GB', ]  # Only used if known ex post
# excluded: : 'diffuse_radiation', 'month','is_dst',
# 'is_non_working_day', 'is_weekend','cloud_cover', 'shortwave_radiation', 'temperature_2m',

TRAIN_FEATURES = COMMON_FEATURES + TRAIN_ONLY_FEATURES
TEST_FEATURES = COMMON_FEATURES
target = 'Price'

# Initial training window
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

rmse_results = []

print("🔍 Testing Random Forest Model - RMSE per forecast day")
print("=" * 60)

for i in range(30):
    start = pd.Timestamp(base_start)  + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is None or df.empty:
            print(f"Day {i+1}: ❌ No training data returned")
            continue

        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')

        # Zorg dat run_date ook in UTC is
        run_date_utc = run_date.tz_localize("UTC")

        # Split into training and testing sets
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]

        # Drop any missing data in training
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])

        if test_data.empty or train_data.empty:
            print(f"Day {i+1}: ❌ Not enough data for training or testing")
            continue

        # Train model
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]

        # Aanvullen met NaN-kolommen waar nodig
        for col in TRAIN_FEATURES:
            if col not in test_data.columns:
                test_data[col] = np.nan

        X_test = test_data[TRAIN_FEATURES]
        y_test = test_data[target]

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_results.append({
            'iteration': i + 1,
            'run_date': run_date.strftime('%Y-%m-%d'),
            'valid_predictions': len(test_data),
            'rmse': rmse
        })

        print(f"Day {i+1}: ✅ {len(test_data)} test rows, Run: {run_date.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create results dataframe
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 OVERALL RMSE - Random Forest Model")
    print("=" * 80)
    print(f"Successful runs: {len(rmse_df)}/30")

    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']].round(2).to_string(index=False))

    print(f"\n📈 SUMMARY STATISTICS")
    print("-" * 40)
    print(rmse_df['rmse'].describe().round(2))

    print(f"\n📊 AVERAGE OVERALL RMSE")
    print("-" * 40)
    print(f"Mean RMSE: {rmse_df['rmse'].mean():.4f}")
else:
    print("❌ No runs completed successfully")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import sys
from pathlib import Path

# Find project root dynamically
current_dir = Path.cwd()
while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
    current_dir = current_dir.parent
project_root = current_dir

# Add utils to path
utils_path = project_root / "src" / "utils"
sys.path.append(str(utils_path))
from build_training_set import build_training_set

# Define feature columns and target
COMMON_FEATURES = [
    'is_dst', 'hour_cos', 'hour_sin', 'month', 'is_non_working_day',
    'shortwave_radiation', 'temperature_2m', 'cloud_cover', 'direct_normal_irradiance',
    'diffuse_radiation', 'is_weekend', 'yearday_cos', 'yearday_sin', 'weekday_sin', 'weekday_cos'
]
TRAIN_ONLY_FEATURES = ['Load', 'Flow_NO', 'Flow_GB']  # Only used if known ex post

TRAIN_FEATURES = COMMON_FEATURES + TRAIN_ONLY_FEATURES
TEST_FEATURES = COMMON_FEATURES
target = 'Price'

# Initial training window
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

rmse_results = []
best_params_list = []

print("🔍 Testing Random Forest Model with GridSearchCV - RMSE per forecast day")
print("=" * 60)

# GridSearch parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, None],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

for i in range(30):
    start = pd.Timestamp(base_start) # + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is None or df.empty:
            print(f"Day {i+1}: ❌ No training data returned")
            continue

        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')

        # Zorg dat run_date ook in UTC is
        run_date_utc = run_date.tz_localize("UTC")

        # Split into training and testing sets
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]

        # Drop any missing data in training
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])

        if test_data.empty or train_data.empty:
            print(f"Day {i+1}: ❌ Not enough data for training or testing")
            continue

        # Train model with GridSearchCV
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]

        # Aanvullen met NaN-kolommen waar nodig
        for col in TRAIN_FEATURES:
            if col not in test_data.columns:
                test_data[col] = np.nan

        X_test = test_data[TRAIN_FEATURES]
        y_test = test_data[target]

        base_model = RandomForestRegressor(random_state=42)
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            scoring='neg_root_mean_squared_error',
            cv=3,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_params_list.append(best_params)
        print(f"Day {i+1}: 🧪 Best params: {best_params}")
        model = grid_search.best_estimator_

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_results.append({
            'iteration': i + 1,
            'run_date': run_date.strftime('%Y-%m-%d'),
            'valid_predictions': len(test_data),
            'rmse': rmse
        })

        print(f"Day {i+1}: ✅ {len(test_data)} test rows, Run: {run_date.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create results dataframe
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 OVERALL RMSE - Random Forest Model")
    print("=" * 80)
    print(f"Successful runs: {len(rmse_df)}/30")

    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']].round(2).to_string(index=False))

    print(f"\n📈 SUMMARY STATISTICS")
    print("-" * 40)
    print(rmse_df['rmse'].describe().round(2))

    print(f"\n📊 AVERAGE OVERALL RMSE")
    print("-" * 40)
    print(f"Mean RMSE: {rmse_df['rmse'].mean():.4f}")

    # Show most common best hyperparameters
    best_params_df = pd.DataFrame(best_params_list)
    print("\nMost common best hyperparameters across runs:")
    print(best_params_df.mode().iloc[0].to_dict())
else:
    print("❌ No runs completed successfully")

In [None]:
# fixed start day

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Dynamic path setup
import sys
from pathlib import Path

# Find project root dynamically
current_dir = Path.cwd()
while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
    current_dir = current_dir.parent
project_root = current_dir

# Add utils to path
utils_path = project_root / "src" / "utils"
sys.path.append(str(utils_path))
from build_training_set import build_training_set

# Define feature columns and target
COMMON_FEATURES = [
    'is_dst', 'hour_cos', 'hour_sin', 'month', 'is_non_working_day',
    'shortwave_radiation', 'temperature_2m', 'cloud_cover', 'direct_normal_irradiance',
    'diffuse_radiation', 'is_weekend', 'yearday_cos', 'yearday_sin', 'weekday_sin', 'weekday_cos'
]
TRAIN_ONLY_FEATURES = ['Load', 'Flow_NO', 'Flow_GB']  # Only used if known ex post

TRAIN_FEATURES = COMMON_FEATURES + TRAIN_ONLY_FEATURES
TEST_FEATURES = COMMON_FEATURES
target = 'Price'

# Initial training window
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

rmse_results = []

print("🔍 Testing Random Forest Model - RMSE per forecast day")
print("=" * 60)

for i in range(30):
    start = pd.Timestamp(base_start) # + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is None or df.empty:
            print(f"Day {i+1}: ❌ No training data returned")
            continue

        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')

        # Zorg dat run_date ook in UTC is
        run_date_utc = run_date.tz_localize("UTC")

        # Split into training and testing sets
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]

        # Drop any missing data in training
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])

        if test_data.empty or train_data.empty:
            print(f"Day {i+1}: ❌ Not enough data for training or testing")
            continue

        # Train model
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]

        # Aanvullen met NaN-kolommen waar nodig
        for col in TRAIN_FEATURES:
            if col not in test_data.columns:
                test_data[col] = np.nan

        X_test = test_data[TRAIN_FEATURES]
        y_test = test_data[target]

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_results.append({
            'iteration': i + 1,
            'run_date': run_date.strftime('%Y-%m-%d'),
            'valid_predictions': len(test_data),
            'rmse': rmse
        })

        print(f"Day {i+1}: ✅ {len(test_data)} test rows, Run: {run_date.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create results dataframe
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 OVERALL RMSE - Random Forest Model")
    print("=" * 80)
    print(f"Successful runs: {len(rmse_df)}/30")

    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']].round(2).to_string(index=False))

    print(f"\n📈 SUMMARY STATISTICS")
    print("-" * 40)
    print(rmse_df['rmse'].describe().round(2))

    print(f"\n📊 AVERAGE OVERALL RMSE")
    print("-" * 40)
    print(f"Mean RMSE: {rmse_df['rmse'].mean():.4f}")
else:
    print("❌ No runs completed successfully")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Collect feature importances from each run
feature_importances = []
feature_names = TRAIN_FEATURES

for i in range(30):
    start = pd.Timestamp(base_start) + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )
        if df is None or df.empty:
            continue
        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')
        run_date_utc = run_date.tz_localize("UTC")
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])
        if test_data.empty or train_data.empty:
            continue
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        feature_importances.append(model.feature_importances_)
    except Exception:
        continue

# Calculate mean importances
if feature_importances:
    importances_mean = np.mean(feature_importances, axis=0)
    feat_imp_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances_mean
    }).sort_values(by='Importance', ascending=False)

    # Plot
    plt.figure(figsize=(10, 5))
    plt.barh(feat_imp_df['Feature'], feat_imp_df['Importance'])
    plt.xlabel("Average Feature Importance")
    plt.title("Random Forest Feature Importances (Rolling Window)")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    print("No feature importances collected.")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Store predictions, actuals, and dates for each run
horizon_preds = {}  # key: horizon (days ahead), value: DataFrame with index=date, columns=['prediction']
actuals = {}

n_horizons = 7  # 7-day ahead forecasts

for horizon in range(1, n_horizons + 1):
    preds = []
    dates = []
    acts = []
    for i in range(30 - horizon + 1):  # ensure enough data for each horizon
        start = pd.Timestamp(base_start) + pd.Timedelta(days=i)
        end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
        run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

        try:
            df = build_training_set(
                train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
                train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
                run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
            )
            if df is None or df.empty:
                continue
            df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
            df = df.sort_values('target_datetime').set_index('target_datetime')
            run_date_utc = run_date.tz_localize("UTC")
            train_data = df[df.index <= run_date_utc]
            test_data = df[df.index > run_date_utc]
            train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])
            if test_data.empty or train_data.empty:
                continue
            X_train = train_data[TRAIN_FEATURES]
            y_train = train_data[target]
            X_test = test_data[TRAIN_FEATURES]
            y_test = test_data[target]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # For this horizon, pick the prediction for the (horizon-1)th row (if exists)
            if len(y_pred) >= horizon:
                pred_date = X_test.index[horizon-1]
                preds.append(y_pred[horizon-1])
                acts.append(y_test.iloc[horizon-1])
                dates.append(pred_date)
        except Exception:
            continue
    horizon_preds[horizon] = pd.Series(preds, index=dates)
    if horizon == 1:
        actuals = pd.Series(acts, index=dates)

# Plot: for each horizon, plot the predictions for each date
plt.figure(figsize=(14, 7))
for horizon in range(1, n_horizons + 1):
    s = horizon_preds[horizon].sort_index()
    plt.plot(s.index, s.values, marker='o', label=f'{horizon} day(s) ahead')
# Plot actuals
actuals = actuals.sort_index()
plt.plot(actuals.index, actuals.values, color='black', marker='x', linestyle='--', label='Actual')
plt.xlabel("Date")
plt.ylabel("Price")
plt.title("Random Forest: 1-7 Day Ahead Predictions vs Actuals")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Optimizing the code for feature selection with Random Forest
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import sys
from pathlib import Path
from sklearn.model_selection import GridSearchCV

# Find project root dynamically
current_dir = Path.cwd()
while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
    current_dir = current_dir.parent
project_root = current_dir

# Add utils to path
utils_path = project_root / "src" / "utils"
sys.path.append(str(utils_path))
from build_training_set import build_training_set

# Define feature columns and target
COMMON_FEATURES = ['is_dst', 'hour_cos', 'hour_sin', 'month', 'is_non_working_day', 'shortwave_radiation', 'temperature_2m']
TRAIN_ONLY_FEATURES = ['Load', 'Flow_NO']  # <- Only used if known ex post

TRAIN_FEATURES = COMMON_FEATURES + TRAIN_ONLY_FEATURES
TEST_FEATURES = COMMON_FEATURES

target = 'Price'

# Initial training window
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

rmse_results = []

print("🔍 Testing Random Forest Model - RMSE per forecast day")
print("=" * 60)

# GridSearch parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, None],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Loop over forecast days
for i in range(30):
    start = pd.Timestamp(base_start) + pd.Timedelta(days=i)
    end = pd.Timestamp(base_end) + pd.Timedelta(days=i)
    run_date = pd.Timestamp(base_run) + pd.Timedelta(days=i)

    try:
        df = build_training_set(
            train_start=start.strftime("%Y-%m-%d %H:%M:%S"),
            train_end=end.strftime("%Y-%m-%d %H:%M:%S"),
            run_date=run_date.strftime("%Y-%m-%d %H:%M:%S")
        )

        if df is None or df.empty:
            print(f"Day {i+1}: ❌ No training data returned")
            continue

        df['target_datetime'] = pd.to_datetime(df['target_datetime'], utc=True)
        df = df.sort_values('target_datetime').set_index('target_datetime')

        # Ensure run_date is in UTC
        run_date_utc = run_date.tz_localize("UTC")

        # Split into training and testing sets
        train_data = df[df.index <= run_date_utc]
        test_data = df[df.index > run_date_utc]

        # Drop any missing data in training
        train_data = train_data.dropna(subset=TRAIN_FEATURES + [target])

        if test_data.empty or train_data.empty:
            print(f"Day {i+1}: ❌ Not enough data for training or testing")
            continue

        # Train model
        X_train = train_data[TRAIN_FEATURES]
        y_train = train_data[target]

        # Fill missing columns in test_data with NaN if needed
        for col in TRAIN_FEATURES:
            if col not in test_data.columns:
                test_data[col] = np.nan

        X_test = test_data[TRAIN_FEATURES]
        y_test = test_data[target]

        base_model = RandomForestRegressor(random_state=42)
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            scoring='neg_root_mean_squared_error',
            cv=3,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X_train, y_train)
        print(f"Day {i+1}: 🧪 Best params: {grid_search.best_params_}")
        model = grid_search.best_estimator_

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_results.append({
            'iteration': i + 1,
            'run_date': run_date.strftime('%Y-%m-%d'),
            'valid_predictions': len(test_data),
            'rmse': rmse
        })

        print(f"Day {i+1}: ✅ {len(test_data)} test rows, Run: {run_date.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create results dataframe
if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print(f"\n📊 OVERALL RMSE - Random Forest Model")
    print("=" * 80)
    print(f"Successful runs: {len(rmse_df)}/30")

    print(rmse_df[['iteration', 'run_date', 'valid_predictions', 'rmse']].round(2).to_string(index=False))

    print(f"\n📈 SUMMARY STATISTICS")
    print("-" * 40)
    print(rmse_df['rmse'].describe().round(2))

    print(f"\n📊 AVERAGE OVERALL RMSE")
    print("-" * 40)
    print(f"Mean RMSE: {rmse_df['rmse'].mean():.2f}")
else:
    print("❌ No runs completed successfully")