# Test Training Set Build and Review

In [1]:
# Dynamic path setup
import sys
import warnings
from pathlib import Path

# Name of the repository's top-level directory. The notebook may be started
# from any folder underneath it.
PROJECT_DIR_NAME = "ENEXIS"

# Find project root dynamically: the nearest directory (the working directory
# itself or one of its ancestors) whose name is PROJECT_DIR_NAME.
start_dir = Path.cwd()
search_path = (start_dir, *start_dir.parents)
project_root = next((p for p in search_path if p.name == PROJECT_DIR_NAME), None)
if project_root is None:
    # No match: keep the filesystem root as fallback value, but say so
    # explicitly -- otherwise the import below fails with a bare
    # ModuleNotFoundError and no hint about the cause.
    project_root = search_path[-1]
    warnings.warn(
        f"No directory named {PROJECT_DIR_NAME!r} found at or above {start_dir}; "
        f"falling back to {project_root}. "
        "The 'build_training_set' import will probably fail."
    )

# Add utils to path. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
utils_path = project_root / "src" / "utils"
if str(utils_path) not in sys.path:
    sys.path.append(str(utils_path))

# Simple import as requested
from build_training_set import build_training_set

# Test: build one training set and print a quick sanity summary
# (shape, missing prices, covered time span, first rows).
df = build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 00:00:00"
)

print(f"Shape: {df.shape}")
print(f"Price nulls: {df['Price'].isnull().sum()}/{len(df)}")
print(f"Date range: {df['target_datetime'].min()} to {df['target_datetime'].max()}")
print(df.head(10))

Shape: (1921, 21)
Price nulls: 0/1921
Date range: 2025-01-01 00:00:00+00:00 to 2025-03-22 00:00:00+00:00
     Price           target_datetime      Load  shortwave_radiation  \
0  0.01362 2025-01-01 00:00:00+00:00  12049.25                  0.0   
1  0.00624 2025-01-01 01:00:00+00:00  11957.50                  0.0   
2  0.00416 2025-01-01 02:00:00+00:00  11636.25                  0.0   
3  0.00328 2025-01-01 03:00:00+00:00  11310.50                  0.0   
4  0.00068 2025-01-01 04:00:00+00:00  11135.25                  0.0   
5      0.0 2025-01-01 05:00:00+00:00  11185.75                  0.0   
6  0.00076 2025-01-01 06:00:00+00:00  11385.00                  0.0   
7  0.00079 2025-01-01 07:00:00+00:00  11695.25                  0.0   
8  0.00189 2025-01-01 08:00:00+00:00  12041.50                  0.0   
9   0.0075 2025-01-01 09:00:00+00:00  12485.75                  3.0   

   temperature_2m  direct_normal_irradiance  diffuse_radiation  Flow_NO  \
0          7.4325                     

In [None]:
# RMSE Matrix for Naive Model (168h lag) - 7 day forecast horizon
#
# For each of N_WINDOWS rolling windows (each shifted one day further than the
# previous one) this cell:
#   1. builds the data set via build_training_set,
#   2. takes the rows after the training end as the forecast period,
#   3. predicts every forecast hour with the price observed NAIVE_LAG earlier,
#   4. stores the RMSE per forecast day (Day_1 .. Day_7) in `rmse_results`.
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Test parameters
base_start = "2025-01-01 00:00:00"
base_end = "2025-03-14 23:00:00"
base_run = "2025-03-15 00:00:00"

N_WINDOWS = 30                        # number of rolling windows
NAIVE_LAG = pd.Timedelta(hours=168)   # naive forecast = price one week earlier
HORIZON_DAYS = 7                      # forecast days scored per window
# Spacing between consecutive rows. Assumes hourly data, as in the preview
# printed by the first cell -- TODO confirm against build_training_set.
DATA_STEP = pd.Timedelta(hours=1)
TS_FORMAT = "%Y-%m-%d %H:%M:%S"


def _to_float(series):
    """Return `series` as plain float64 with missing values as NaN.

    NOTE(review): the Price preview prints with mixed precision, which
    suggests a non-float dtype; normalising here makes the NaN handling and
    the RMSE arithmetic independent of the incoming dtype.
    """
    return pd.to_numeric(series).astype("float64")


def _daily_rmse(scored, horizon_days):
    """Return {'Day_1': rmse, ..., 'Day_<horizon_days>': rmse} for `scored`.

    `scored` needs the columns 'Price', 'naive_prediction' and 'forecast_day'.
    A day without any rows gets NaN.
    """
    rmse_by_day = {}
    for day in range(1, horizon_days + 1):
        rows = scored[scored['forecast_day'] == day]
        if len(rows) > 0:
            rmse_by_day[f'Day_{day}'] = np.sqrt(
                mean_squared_error(rows['Price'], rows['naive_prediction'])
            )
        else:
            rmse_by_day[f'Day_{day}'] = np.nan
    return rmse_by_day


# Storage for results
rmse_results = []

print("🔍 Testing Naive Model (168h lag) - RMSE per forecast day")
print("=" * 60)

for i in range(N_WINDOWS):
    # Shift dates
    offset = pd.Timedelta(days=i)
    start = pd.Timestamp(base_start) + offset
    end = pd.Timestamp(base_end) + offset
    run = pd.Timestamp(base_run) + offset

    # Deliberately broad: one failing window is reported and skipped so the
    # remaining windows are still evaluated.
    try:
        # Get training set
        df = build_training_set(
            train_start=start.strftime(TS_FORMAT),
            train_end=end.strftime(TS_FORMAT),
            run_date=run.strftime(TS_FORMAT)
        )

        if df is None or len(df) == 0:
            print(f"Day {i+1}: ❌ No training data")
            continue

        # Everything after the training end is the forecast period.
        # 'target_datetime' is compared against a UTC timestamp.
        train_cutoff = end.tz_localize("UTC")
        first_forecast_time = train_cutoff + DATA_STEP

        # Price lookup by timestamp over the full data set. If a timestamp
        # occurs more than once, the first occurrence is used.
        price_by_time = _to_float(
            df.drop_duplicates(subset='target_datetime', keep='first')
            .set_index('target_datetime')['Price']
        )

        forecast_data = (
            df.loc[df['target_datetime'] > train_cutoff]
            .sort_values('target_datetime')
            .reset_index(drop=True)
            .assign(
                Price=lambda f: _to_float(f['Price']),
                # Naive prediction: price at (target time - NAIVE_LAG);
                # NaN when that timestamp is not in the data set.
                naive_prediction=lambda f: price_by_time.reindex(
                    pd.DatetimeIndex(f['target_datetime'] - NAIVE_LAG)
                ).to_numpy(),
                # Horizon from elapsed time, not from row position, so a
                # missing hour cannot push later rows into the wrong day.
                forecast_hour=lambda f: (
                    (f['target_datetime'] - first_forecast_time) // DATA_STEP
                ),
                forecast_day=lambda f: f['forecast_hour'] // 24 + 1,
            )
        )

        # Check if we have forecast data with actual prices
        if forecast_data['Price'].notna().sum() == 0:
            print(f"Day {i+1}: ❌ No forecast data with actual prices")
            continue

        # Keep rows that have both an actual and a prediction, limited to
        # the scored horizon (7 days = 168 hours).
        valid_data = forecast_data.dropna(subset=['Price', 'naive_prediction'])
        valid_data = valid_data[valid_data['forecast_day'].between(1, HORIZON_DAYS)]

        if len(valid_data) == 0:
            print(f"Day {i+1}: ❌ No valid predictions (missing lag data)")
            continue

        day_rmses = _daily_rmse(valid_data, HORIZON_DAYS)

        # Store results
        result = {
            'iteration': i+1,
            'run_date': run.strftime('%Y-%m-%d'),
            'valid_predictions': len(valid_data),
            **day_rmses
        }
        rmse_results.append(result)

        print(f"Day {i+1}: ✅ {len(valid_data)} predictions, Run: {run.strftime('%m-%d')}")

    except Exception as e:
        print(f"Day {i+1}: ❌ Error: {e}")

# Create RMSE matrix
#
# Turns the per-window records in `rmse_results` into a table with one row
# per rolling window and one RMSE column per forecast day, then prints the
# table, its descriptive statistics and the mean RMSE per forecast day.

# Decimals shown for RMSE values. The Price preview printed by the first cell
# shows values around 0.00-0.01, so two decimals would round most of the
# information away.
RMSE_DECIMALS = 4

if rmse_results:
    rmse_df = pd.DataFrame(rmse_results)

    print("\n📊 RMSE MATRIX - Naive Model (168h lag)")
    print("=" * 80)
    # 30 = number of rolling windows evaluated by the loop above.
    print(f"Successful runs: {len(rmse_df)}/30")

    # Show the matrix (only the Day_1..Day_7 columns that actually exist)
    day_columns = [f'Day_{i}' for i in range(1, 8)]
    available_day_cols = [col for col in day_columns if col in rmse_df.columns]

    if available_day_cols:
        display_cols = ['iteration', 'run_date'] + available_day_cols
        print(rmse_df[display_cols].round(RMSE_DECIMALS).to_string(index=False))

        # Summary statistics
        print("\n📈 SUMMARY STATISTICS")
        print("-" * 40)
        summary = rmse_df[available_day_cols].describe().round(RMSE_DECIMALS)
        print(summary)

        # Average RMSE per day (NaN entries are skipped by mean())
        print("\n📊 AVERAGE RMSE PER FORECAST DAY")
        print("-" * 40)
        avg_rmse = rmse_df[available_day_cols].mean().round(RMSE_DECIMALS)
        for day, rmse in avg_rmse.items():
            print(f"{day}: {rmse:.{RMSE_DECIMALS}f}")
    else:
        print("❌ No valid RMSE calculations found")
else:
    print("❌ No results generated")