# PV Forecasting with Error Propagation Analysis

1. **Parameter Range Search**: Find optimal hyperparameters
2. **Random Date Evaluation**: Analyze model performance on random dates
3. **Error Propagation Analysis**: Track how errors evolve throughout 2024
4. **Daily Error Metrics**: Comprehensive daily error analysis

In [1]:
import os, time, warnings, random
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# ML / DL
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks
import keras_tuner as kt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)


In [4]:
# Configuration
LAGS          = list(range(1, 25))         # past 24 hours
STEP          = "H"
MAX_EPOCHS    = 50
EARLY_STOP    = 5
TRIALS        = 30                         # Increased for better search
EXEC_PER_TR   = 2
BATCH_SIZE    = 64

print(f"Run started: {datetime.now()}")


Run started: 2025-06-18 20:07:15.965530


## Data Loading & Feature Engineering


In [16]:
# Load data
df = (
    pd.read_csv("../../data/renewables/pv_with_weather_data.csv",
                comment="#", parse_dates=["time"])
    .rename(columns={"time": "ts",
                     "electricity": "pv",
                     "irradiance_direct": "dir_irr",
                     "irradiance_diffuse": "dif_irr",
                     "temperature": "temp"})
    .set_index("ts")
    .asfreq(STEP)
)
print(f"Data span: {df.index.min()} – {df.index.max()} ({len(df):,} rows)")
print(f"Training period: {df.index.min()} to 2023-12-31")
print(f"Holdout period: 2024-01-01 to {df.index.max()}")

# Verify training vs holdout split
train_years = (pd.to_datetime("2023-12-31") - df.index.min()).days / 365.25
holdout_days = (df.index.max() - pd.to_datetime("2024-01-01")).days
print(f"Training duration: ~{train_years:.1f} years")
print(f"Holdout duration: {holdout_days} days")


Data span: 2014-01-01 00:00:00 – 2024-12-31 23:00:00 (96,432 rows)
Training period: 2014-01-01 00:00:00 to 2023-12-31
Holdout period: 2024-01-01 to 2024-12-31 23:00:00
Training duration: ~10.0 years
Holdout duration: 365 days


In [9]:
def make_features(data, lags, include_time=True, include_weather=True):
    """Create feature matrix with lags, time, and weather features"""
    X = pd.DataFrame(index=data.index)
    
    # Lag features
    for lag in lags:
        X[f"lag_{lag}"] = data["pv"].shift(lag)

    # Time features
    if include_time:
        hr = data.index.hour
        dy = data.index.dayofyear
        X["sin_hour"] = np.sin(2 * np.pi * hr / 24)
        X["cos_hour"] = np.cos(2 * np.pi * hr / 24)
        X["sin_doy"]  = np.sin(2 * np.pi * dy / 365)
        X["cos_doy"]  = np.cos(2 * np.pi * dy / 365)
    
    # Weather features
    if include_weather:
        X["dir_irr"] = data["dir_irr"]
        X["dif_irr"] = data["dif_irr"]
        X["temp"]    = data["temp"]

    y = data["pv"]
    return X.join(y.rename("target")).dropna()

feats_all = make_features(df, LAGS)
print("Feature matrix shape:", feats_all.shape)


Feature matrix shape: (96408, 32)


## Data Splitting & Scaling


In [10]:
# Split data
train_val_end = "2023-12-31 23:00"
holdout_start = "2024-01-01 00:00"

train_val = feats_all.loc[:train_val_end]
hold_out  = feats_all.loc[holdout_start:]

# Internal chronological 80/20 split for training/validation
split_idx = int(len(train_val) * 0.8)
train = train_val.iloc[:split_idx]
ival  = train_val.iloc[split_idx:]

def split_xy(frame):
    """Split features and target"""
    X = frame.drop(columns="target").values
    y = frame["target"].values
    return X, y

Xt_tr, yt_tr = split_xy(train)
Xt_val, yt_val = split_xy(ival)
Xt_hold, yt_hold = split_xy(hold_out)

# Scalers fitted on training data only
x_scaler = StandardScaler().fit(Xt_tr)
y_scaler = StandardScaler().fit(yt_tr.reshape(-1, 1))

def scale_x(X): return x_scaler.transform(X)
def unscale_y(y): return y_scaler.inverse_transform(y.reshape(-1, 1)).ravel()

# Scale data
Xt_tr_s, Xt_val_s = scale_x(Xt_tr), scale_x(Xt_val)
Xt_hold_s = scale_x(Xt_hold)
yt_tr_s = y_scaler.transform(yt_tr.reshape(-1, 1)).ravel()

print(f"Train: {len(train):,} | Val: {len(ival):,} | Hold-out: {len(hold_out):,}")
print(f"Training years: ~{len(train) / (365 * 24):.1f} years")
print(f"Holdout days: {len(hold_out) / 24:.0f} days")


Train: 70,099 | Val: 17,525 | Hold-out: 8,784
Training years: ~8.0 years
Holdout days: 366 days


## Function 1: Enhanced Hyperparameter Search


In [17]:
def run_parameter_search(param_ranges=None, max_trials=30, verbose=True):
    """
    Run hyperparameter search with configurable parameter ranges.
    
    Args:
        param_ranges (dict): Custom parameter ranges. If None, uses default ranges.
        max_trials (int): Maximum number of trials for hyperparameter search
        verbose (bool): Whether to print detailed results
    
    Returns:
        dict: Best hyperparameters found
    """
    
    # Default parameter ranges
    default_ranges = {
        'n_layers': {'min_value': 1, 'max_value': 4},
        'units': {'values': [32, 64, 128, 256]},
        'dropout': {'min_value': 0.0, 'max_value': 0.5, 'step': 0.1},
        'l2': {'min_value': 1e-6, 'max_value': 1e-2, 'sampling': 'log'},
        'lr': {'values': [1e-4, 3e-4, 1e-3, 3e-3]}
    }
    
    if param_ranges is None:
        param_ranges = default_ranges
    
    def hp_model_enhanced(hp):
        n_feat = Xt_tr_s.shape[1]
        
        # Define hyperparameters based on provided ranges
        n_layers = hp.Int("n_layers", **param_ranges['n_layers'])
        units = hp.Choice("units", param_ranges['units']['values'])
        drop = hp.Float("dropout", **{k:v for k,v in param_ranges['dropout'].items() if k != 'step'}, step=param_ranges['dropout'].get('step', 0.1))
        l2_reg = hp.Float("l2", **param_ranges['l2'])
        lr = hp.Choice("lr", param_ranges['lr']['values'])
        
        # Build model
        m = models.Sequential()
        m.add(layers.Input(shape=(n_feat,)))
        
        for _ in range(n_layers):
            m.add(layers.Dense(units, activation="relu",
                               kernel_regularizer=regularizers.l2(l2_reg)))
            m.add(layers.Dropout(drop))
        
        m.add(layers.Dense(1))
        m.compile(optimizer=tf.keras.optimizers.Adam(lr),
                  loss="mse", metrics=["mae"])
        return m
    
    # Create tuner
    tuner = kt.RandomSearch(
        hp_model_enhanced,
        objective="val_loss",
        max_trials=max_trials,
        executions_per_trial=EXEC_PER_TR,
        directory="tuner_enhanced",
        project_name="pv_enhanced"
    )
    
    if verbose:
        print("Search space:")
        tuner.search_space_summary()
    
    # Run search
    es_cb = callbacks.EarlyStopping(patience=EARLY_STOP, restore_best_weights=True)
    
    print(f"\\nStarting hyperparameter search with {max_trials} trials...")
    start_time = time.time()
    
    tuner.search(
        Xt_tr_s, yt_tr_s,
        validation_data=(Xt_val_s, y_scaler.transform(yt_val.reshape(-1,1)).ravel()),
        epochs=MAX_EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[es_cb],
        verbose=0 if not verbose else 1
    )
    
    search_time = time.time() - start_time
    print(f"Search completed in {search_time/60:.2f} minutes")
    
    # Get best hyperparameters
    best_hp = tuner.get_best_hyperparameters(1)[0]
    best_params = best_hp.values
    
    if verbose:
        print("\\nBest hyperparameters found:")
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        
        # Show top 5 trials
        print("\\nTop 5 trials:")
        tuner.results_summary(5)
    
    return best_params


## Function 2: Random Date Evaluation & Error Propagation Analysis


In [18]:
def evaluate_random_dates_and_error_propagation(model_params, n_random_dates=10, 
                                                 analyze_error_propagation=True,
                                                 verbose=True):
    """
    Train model with best parameters and evaluate on random dates + error propagation.
    
    Args:
        model_params (dict): Best hyperparameters to use
        n_random_dates (int): Number of random dates to evaluate
        analyze_error_propagation (bool): Whether to analyze error propagation
        verbose (bool): Whether to print detailed results
    
    Returns:
        tuple: (model, daily_errors_df, random_dates_results, error_propagation_results)
    """
    
    # Build model with best parameters
    def create_best_model(params):
        n_feat = Xt_tr_s.shape[1]
        
        m = models.Sequential()
        m.add(layers.Input(shape=(n_feat,)))
        
        for _ in range(params['n_layers']):
            m.add(layers.Dense(params['units'], activation="relu",
                               kernel_regularizer=regularizers.l2(params['l2'])))
            m.add(layers.Dropout(params['dropout']))
        
        m.add(layers.Dense(1))
        m.compile(optimizer=tf.keras.optimizers.Adam(params['lr']),
                  loss="mse", metrics=["mae"])
        return m
    
    # Train model on full training data (train + validation)
    print("Training model with best parameters on full training set...")
    model = create_best_model(model_params)
    
    full_X = np.concatenate([Xt_tr_s, Xt_val_s])
    full_y = np.concatenate([yt_tr_s, y_scaler.transform(yt_val.reshape(-1,1)).ravel()])
    
    es_cb = callbacks.EarlyStopping(patience=EARLY_STOP, restore_best_weights=True)
    history = model.fit(
        full_X, full_y,
        epochs=MAX_EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[es_cb],
        verbose=0
    )
    
    # Make predictions on holdout set
    y_pred_s = model.predict(Xt_hold_s, verbose=0).flatten()
    y_pred = unscale_y(y_pred_s)
    
    # Overall holdout performance
    mae_overall = mean_absolute_error(yt_hold, y_pred)
    rmse_overall = np.sqrt(mean_squared_error(yt_hold, y_pred))
    
    if verbose:
        print(f"\\nOverall 2024 Performance:")
        print(f"  MAE: {mae_overall:.4f}")
        print(f"  RMSE: {rmse_overall:.4f}")
    
    # Create results dataframe
    results_df = pd.DataFrame({
        "actual": yt_hold,
        "predicted": y_pred,
        "error": yt_hold - y_pred,
        "abs_error": np.abs(yt_hold - y_pred)
    }, index=hold_out.index)
    
    return model, results_df, history


In [13]:
def analyze_daily_errors(results_df, verbose=True):
    """
    Analyze daily error patterns from results DataFrame.
    
    Args:
        results_df: DataFrame with actual, predicted, error, abs_error columns
        verbose: Whether to print detailed results
    
    Returns:
        DataFrame: Daily error statistics
    """
    daily_errors = []
    unique_dates = results_df.index.normalize().unique()
    
    for date in unique_dates:
        day_data = results_df[results_df.index.normalize() == date]
        if len(day_data) > 0:
            daily_errors.append({
                'date': date.date(),
                'mae': day_data['abs_error'].mean(),
                'rmse': np.sqrt(day_data['error'].pow(2).mean()),
                'max_error': day_data['abs_error'].max(),
                'std_error': day_data['abs_error'].std(),
                'mean_actual': day_data['actual'].mean(),
                'mean_predicted': day_data['predicted'].mean(),
                'day_of_year': date.dayofyear,
                'month': date.month,
                'weekday': date.weekday()
            })
    
    daily_errors_df = pd.DataFrame(daily_errors)
    daily_errors_df['date'] = pd.to_datetime(daily_errors_df['date'])
    
    if verbose:
        print(f"\\n=== DAILY ERROR ANALYSIS ===")
        print(f"Total days analyzed: {len(daily_errors_df)}")
        print(f"Average daily MAE: {daily_errors_df['mae'].mean():.4f}")
        print(f"MAE std deviation: {daily_errors_df['mae'].std():.4f}")
        print(f"Best day (lowest MAE): {daily_errors_df.loc[daily_errors_df['mae'].idxmin(), 'date'].strftime('%Y-%m-%d')} ({daily_errors_df['mae'].min():.4f})")
        print(f"Worst day (highest MAE): {daily_errors_df.loc[daily_errors_df['mae'].idxmax(), 'date'].strftime('%Y-%m-%d')} ({daily_errors_df['mae'].max():.4f})")
    
    return daily_errors_df


In [14]:
def analyze_error_propagation(daily_errors_df, verbose=True):
    """
    Analyze error propagation patterns throughout the year.
    
    Args:
        daily_errors_df: DataFrame with daily error statistics
        verbose: Whether to print detailed results
    
    Returns:
        dict: Error propagation analysis results
    """
    # Weekly error analysis
    daily_errors_df['week'] = daily_errors_df['date'].dt.isocalendar().week
    weekly_errors = daily_errors_df.groupby('week').agg({
        'mae': ['mean', 'std', 'min', 'max'],
        'rmse': ['mean', 'std', 'min', 'max'],
        'date': 'count'
    }).round(4)
    
    # Monthly error analysis
    monthly_errors = daily_errors_df.groupby('month').agg({
        'mae': ['mean', 'std', 'min', 'max'],
        'rmse': ['mean', 'std', 'min', 'max'],
        'date': 'count'
    }).round(4)
    
    # Seasonal trends
    daily_errors_df['season'] = daily_errors_df['month'].map({
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall'
    })
    
    seasonal_errors = daily_errors_df.groupby('season').agg({
        'mae': ['mean', 'std', 'min', 'max'],
        'rmse': ['mean', 'std', 'min', 'max'],
        'date': 'count'
    }).round(4)
    
    # Trend analysis
    mae_trend = np.corrcoef(daily_errors_df['day_of_year'], daily_errors_df['mae'])[0,1]
    rmse_trend = np.corrcoef(daily_errors_df['day_of_year'], daily_errors_df['rmse'])[0,1]
    
    # Weekday analysis
    weekday_errors = daily_errors_df.groupby('weekday').agg({
        'mae': ['mean', 'std'],
        'rmse': ['mean', 'std']
    }).round(4)
    
    error_propagation = {
        'weekly': weekly_errors,
        'monthly': monthly_errors,
        'seasonal': seasonal_errors,
        'weekday': weekday_errors,
        'trend_analysis': {
            'mae_trend': mae_trend,
            'rmse_trend': rmse_trend
        }
    }
    
    if verbose:
        print(f"\\n=== ERROR PROPAGATION ANALYSIS ===")
        print(f"MAE trend correlation with day of year: {mae_trend:.4f}")
        print(f"RMSE trend correlation with day of year: {rmse_trend:.4f}")
        print(f"\\nSeasonal MAE averages:")
        for season in ['Winter', 'Spring', 'Summer', 'Fall']:
            if season in seasonal_errors.index:
                mae_mean = seasonal_errors.loc[season, ('mae', 'mean')]
                print(f"  {season}: {mae_mean:.4f}")
        
        print(f"\\nMonthly MAE averages:")
        for month in range(1, 13):
            if month in monthly_errors.index:
                mae_mean = monthly_errors.loc[month, ('mae', 'mean')]
                print(f"  Month {month}: {mae_mean:.4f}")
    
    return error_propagation


In [15]:
def evaluate_random_dates(results_df, daily_errors_df, n_random_dates=10, verbose=True):
    """
    Evaluate model performance on random dates.
    
    Args:
        results_df: DataFrame with hourly predictions
        daily_errors_df: DataFrame with daily error statistics
        n_random_dates: Number of random dates to evaluate
        verbose: Whether to print detailed results
    
    Returns:
        list: Random dates evaluation results
    """
    available_dates = daily_errors_df['date'].tolist()
    random_dates = random.sample(available_dates, min(n_random_dates, len(available_dates)))
    
    random_results = []
    for date in random_dates:
        day_data = results_df[results_df.index.normalize() == date.normalize()]
        day_errors = daily_errors_df[daily_errors_df['date'] == date].iloc[0]
        
        random_results.append({
            'date': date.date(),
            'mae': day_errors['mae'],
            'rmse': day_errors['rmse'],
            'max_error': day_errors['max_error'],
            'hourly_data': day_data[['actual', 'predicted', 'abs_error']].copy()
        })
    
    if verbose:
        print(f"\\n=== RANDOM {n_random_dates} DATES EVALUATION ===")
        for result in random_results:
            print(f"  {result['date']}: MAE={result['mae']:.4f}, RMSE={result['rmse']:.4f}, Max_Error={result['max_error']:.4f}")
    
    return random_results
