In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import matplotlib.pyplot as plt
import math
from joblib import dump
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [None]:
def plot_seasonal_comparison(test_index, ys_test_rescaled, predictions_rescaled):
    """
    Plot the average actual and forecasted load for each season.

    Seasons are derived from the calendar month of each timestamp
    (Dec-Feb Winter, Mar-May Spring, Jun-Aug Summer, Sep-Nov Autumn) and are
    drawn in that calendar order. Seasons with no samples in ``test_index``
    are left out of the plot.

    Parameters:
    - test_index: Datetime index for the test data.
    - ys_test_rescaled: Rescaled actual test values.
    - predictions_rescaled: Rescaled forecasted values.
    """
    season_order = ['Winter', 'Spring', 'Summer', 'Autumn']

    def season_of(month):
        """Map a calendar month number (1-12) to its season name."""
        if month in (12, 1, 2):
            return 'Winter'
        if month in (3, 4, 5):
            return 'Spring'
        if month in (6, 7, 8):
            return 'Summer'
        return 'Autumn'

    # Create a DataFrame to hold the test and prediction data
    data = pd.DataFrame({
        'Actual': ys_test_rescaled.flatten(),
        'Predicted': predictions_rescaled.flatten()
    }, index=test_index)

    # Add a 'Season' column to the DataFrame
    data['Season'] = [season_of(month) for month in data.index.month]

    # Group by season and average only the two load columns
    seasonal_data = data.groupby('Season')[['Actual', 'Predicted']].mean()

    # groupby sorts its keys alphabetically (Autumn, Spring, Summer, Winter);
    # restore calendar order so the connected line plot reads naturally.
    seasonal_data = seasonal_data.reindex(
        [season for season in season_order if season in seasonal_data.index]
    )

    # Plot the seasonal comparison
    plt.figure(figsize=(12, 6))
    plt.plot(seasonal_data.index, seasonal_data['Actual'], label='Actual Load', marker='o')
    plt.plot(seasonal_data.index, seasonal_data['Predicted'], label='Forecasted Load', marker='o')
    plt.xlabel('Season')
    plt.ylabel('Average Load')
    plt.title('Average Actual Load and Forecasted Load by Season')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def plot_monthly_comparison(test_index, ys_test_rescaled, predictions_rescaled):
    """
    Compare the mean actual and mean forecasted load for each calendar month.

    Only months that occur in ``test_index`` are drawn; the x-axis ticks carry
    the matching three-letter month abbreviations.

    Parameters:
    - test_index: Datetime index for the test data.
    - ys_test_rescaled: Rescaled actual test values.
    - predictions_rescaled: Rescaled forecasted values.
    """
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    # Line up actual and forecasted values on the shared time index
    frame = pd.DataFrame(
        {
            'Actual': ys_test_rescaled.flatten(),
            'Predicted': predictions_rescaled.flatten(),
        },
        index=test_index,
    )
    frame['Month'] = frame.index.month

    # One row per month present in the data, holding the column means
    monthly_means = frame.groupby('Month').mean()
    months_present = monthly_means.index
    tick_labels = [month_names[number - 1] for number in months_present]

    plt.figure(figsize=(12, 6))
    for column, legend_label in (('Actual', 'Actual Load'), ('Predicted', 'Forecasted Load')):
        plt.plot(months_present, monthly_means[column], label=legend_label, marker='o')
    plt.xlabel('Month')
    plt.ylabel('Average Load')
    plt.title('Average Actual Load and Forecasted Load by Month')
    plt.xticks(ticks=months_present, labels=tick_labels)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def plot_weekday_comparison(test_index, ys_test_rescaled, predictions_rescaled):
    """
    Compare the mean actual and mean forecasted load for each day of the week.

    Only weekdays that occur in ``test_index`` are drawn; tick labels are the
    weekday names looked up by ``DatetimeIndex.dayofweek`` (0 maps to Monday).

    Parameters:
    - test_index: Datetime index for the test data.
    - ys_test_rescaled: Rescaled actual test values.
    - predictions_rescaled: Rescaled forecasted values.
    """
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')

    # Line up actual and forecasted values on the shared time index
    frame = pd.DataFrame(
        {
            'Actual': ys_test_rescaled.flatten(),
            'Predicted': predictions_rescaled.flatten(),
        },
        index=test_index,
    )
    frame['DayOfWeek'] = frame.index.dayofweek

    # One row per weekday present in the data, holding the column means
    weekday_means = frame.groupby('DayOfWeek').mean()
    days_present = weekday_means.index
    tick_labels = [weekday_names[number] for number in days_present]

    plt.figure(figsize=(12, 6))
    for column, legend_label in (('Actual', 'Actual Load'), ('Predicted', 'Forecasted Load')):
        plt.plot(days_present, weekday_means[column], label=legend_label, marker='o')
    plt.xlabel('Day of the Week')
    plt.ylabel('Average Load')
    plt.title('Average Actual Load and Forecasted Load by Day of the Week')
    plt.xticks(ticks=days_present, labels=tick_labels)
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
def plot_hourly_comparison(test_index, ys_test_rescaled, predictions_rescaled):
    """
    Compare the mean actual and mean forecasted load for each hour of the day.

    Only hours that occur in ``test_index`` are drawn; tick labels have the
    form ``H:00``.

    Parameters:
    - test_index: Datetime index for the test data.
    - ys_test_rescaled: Rescaled actual test values.
    - predictions_rescaled: Rescaled forecasted values.
    """
    # Line up actual and forecasted values on the shared time index
    frame = pd.DataFrame(
        {
            'Actual': ys_test_rescaled.flatten(),
            'Predicted': predictions_rescaled.flatten(),
        },
        index=test_index,
    )
    frame['Hour'] = frame.index.hour

    # One row per hour present in the data, holding the column means
    hourly_means = frame.groupby('Hour').mean()
    hours_present = hourly_means.index
    tick_labels = [f'{hour}:00' for hour in hours_present]

    plt.figure(figsize=(12, 6))
    for column, legend_label in (('Actual', 'Actual Load'), ('Predicted', 'Forecasted Load')):
        plt.plot(hours_present, hourly_means[column], label=legend_label, marker='o')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Average Load')
    plt.title('Average Actual Load and Forecasted Load by Hour of the Day')
    plt.xticks(ticks=hours_present, labels=tick_labels)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def plot_results_from_to(test_index, ys_test_rescaled, predictions_rescaled, start_date, end_date):
    """
    Plot actual vs. forecasted load between two dates (both bounds inclusive).

    Parameters:
    - test_index: Datetime index for the test data.
    - ys_test_rescaled: Rescaled actual test values.
    - predictions_rescaled: Rescaled forecasted values.
    - start_date: Start date for the plot; strings are parsed with ``pd.to_datetime``.
    - end_date: End date for the plot; strings are parsed with ``pd.to_datetime``.
    """
    # Accept either date strings or already-parsed datetime objects
    start_date, end_date = (
        pd.to_datetime(bound) if isinstance(bound, str) else bound
        for bound in (start_date, end_date)
    )

    # Select the samples whose timestamp falls inside [start_date, end_date]
    in_window = (test_index >= start_date) & (test_index <= end_date)
    window_index = test_index[in_window]
    actual_in_window = ys_test_rescaled[in_window].flatten()
    forecast_in_window = predictions_rescaled[in_window].flatten()

    plt.figure(figsize=(24, 5))
    plt.plot(window_index, actual_in_window, label='Actual Load')
    plt.plot(window_index, forecast_in_window, label='Forecasted Load')
    plt.xlabel('Time')
    plt.ylabel('Load')
    plt.title(f'Actual Load and Forecasted Load from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}')
    plt.legend()

    # One tick every 24 samples, labelled with the date and the weekday name
    tick_positions = window_index[::24]
    tick_labels = [f"{stamp:%Y-%m-%d}\n{stamp:%A}" for stamp in tick_positions]
    plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45)
    plt.show()


In [None]:

def plot_results(test_index, ys_test_rescaled, predictions_rescaled, hours_to_plot=720):
    """
    Plot the actual and forecasted load for the first ``hours_to_plot`` samples.

    Parameters:
    - test_index: Datetime index for the test data.
    - ys_test_rescaled: Rescaled actual test values.
    - predictions_rescaled: Rescaled forecasted values.
    - hours_to_plot: Number of leading samples to plot. The x-axis gets one
      tick every 24 samples, i.e. one per day when the data is hourly.
      Defaults to 720 (30 days of hourly data).
    """
    # Keep only the leading window of each series
    time_index = test_index[:hours_to_plot]
    ys_test_window = ys_test_rescaled[:hours_to_plot]
    predictions_window = predictions_rescaled[:hours_to_plot]

    # Describe the window that is actually drawn; the default keeps the
    # historical "First Month" wording, any other length is stated explicitly.
    if hours_to_plot == 720:
        window_description = 'the First Month'
    else:
        window_description = f'the First {len(time_index)} Hours'

    # Plotting the selected window of actual and forecasted load
    plt.figure(figsize=(24, 5))
    plt.plot(time_index, ys_test_window.flatten(), label='Actual Load')
    plt.plot(time_index, predictions_window.flatten(), label='Forecasted Load')
    plt.xlabel('Time')
    plt.ylabel('Load')
    plt.title(f'Actual Load and Forecasted Load for {window_description}')
    plt.legend()

    # Customize x-axis to show date and day of the week
    daily_ticks = time_index[::24]
    plt.xticks(ticks=daily_ticks, labels=[f"{date:%Y-%m-%d}\n{date:%A}" for date in daily_ticks], rotation=45)
    plt.show()


In [None]:
def build_and_train_model(xs_train, ys_train, model_config, num_target_features,path_to_save_model):
    """
    Build a stacked LSTM model from ``model_config`` and fit it on the training data.

    Parameters:
    - xs_train: Training inputs. Axes 1 and 2 of its shape are used as the
      input shape of the first LSTM layer.
    - ys_train: Training targets, passed to ``model.fit`` unchanged.
    - model_config: Dictionary with
        - 'lstm_layers': list of dicts, one per LSTM layer, each holding
          'units', 'return_sequences' and optionally 'dropout' (rate of a
          Dropout layer added right after that LSTM layer);
        - 'epochs': optional, defaults to 50;
        - 'batch_size': optional, defaults to 32.
    - num_target_features: Number of units in the final Dense layer.
    - path_to_save_model: Path the ModelCheckpoint callback writes the
      lowest-training-loss model to.

    Returns:
    - model: Trained LSTM model.
    - history: Training history returned by ``model.fit``.
    """
    # Build the LSTM model
    model = Sequential()
    first_layer_input_shape = (xs_train.shape[1], xs_train.shape[2])

    for layer_position, layer_config in enumerate(model_config['lstm_layers']):
        lstm_kwargs = {'return_sequences': layer_config['return_sequences']}
        # Only the first layer needs to be told the input shape; later layers
        # infer it from the layer before them.
        if layer_position == 0:
            lstm_kwargs['input_shape'] = first_layer_input_shape
        model.add(LSTM(layer_config['units'], **lstm_kwargs))

        # Add Dropout layer if specified
        if 'dropout' in layer_config:
            model.add(Dropout(layer_config['dropout']))

    model.add(Dense(num_target_features))
    model.compile(optimizer='adam', loss='mse')

    # Train the model
    # Both callbacks monitor the training loss: fit() is given no validation data.
    early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1, restore_best_weights=True)
    checkpoint = ModelCheckpoint(path_to_save_model, monitor='loss', verbose=1, save_best_only=True, mode='min')

    history = model.fit(
        xs_train,
        ys_train,
        epochs=model_config.get('epochs', 50),
        batch_size=model_config.get('batch_size', 32),
        callbacks=[early_stopping, checkpoint],
    )

    return model, history


In [None]:
def create_sequences(data, seq_length, forecast_horizon, target_col):
    """
    Slice a DataFrame into sliding-window (input, target) pairs.

    Window ``i`` takes rows ``i`` to ``i + seq_length - 1`` of every column as
    the input, and the following ``forecast_horizon`` rows of the target
    column as the output.

    Parameters:
    - data: DataFrame whose rows are consecutive time steps.
    - seq_length: Number of rows in each input window.
    - forecast_horizon: Number of rows in each target window.
    - target_col: Positional (``iloc``) index of the target column.

    Returns:
    - xs: Array of input windows; shape (n_windows, seq_length, n_columns)
      when at least one window fits.
    - ys: Array of target windows; shape (n_windows, forecast_horizon) for an
      integer ``target_col``.
      Both arrays are empty when ``data`` is too short for a single window.
    """
    # Convert once up front: slicing NumPy arrays in the loop is much cheaper
    # than building a new DataFrame slice on every iteration.
    feature_values = data.to_numpy()
    target_values = data.iloc[:, target_col].to_numpy()

    n_windows = len(data) - seq_length - forecast_horizon + 1
    xs, ys = [], []
    for start in range(n_windows):
        target_start = start + seq_length
        xs.append(feature_values[start:target_start])
        ys.append(target_values[target_start:target_start + forecast_horizon])
    return np.array(xs), np.array(ys)

In [None]:
# Load and preprocess data
# Alternative input files are kept below as commented-out lines; exactly one
# `data_df = pd.read_csv(...)` line should be active at a time.
#data_df = pd.read_csv('../../data/processed/actuals_data.csv', parse_dates=['Time'], index_col='Time')
# Dataset with actual weather variables
# data_df = pd.read_csv('../data/interim/precovid-data/train/load_with_actual_weather_variables_dataset.csv', parse_dates=['Time'], index_col='Time')

# Dataset without COVID-19 with forecasted weather variables
data_df = pd.read_csv('../data/interim/precovid-data/train/load_with_forecasted_weather_variables_dataset.csv', parse_dates=['Time'], index_col='Time')

# COVID-19 dataset
#data_df = pd.read_csv('../data/interim/covid-data/load_with_forecasted_weather_variables_dataset.csv', parse_dates=['Time'], index_col='Time')

# Weather columns that are excluded from the model inputs
coloumns_to_drop  = ['Pressure_kpa','Cloud Cover (%)','Wind Direction (deg)']
data_df = data_df.drop(columns=coloumns_to_drop)

# Data preprocessing: encode the hour of day as a sine/cosine pair with a
# 24-hour period, so that 23:00 and 00:00 end up close to each other.
hour_of_day_col = data_df.index.hour
data_df['hour_of_day_sin'] = np.sin(2 * np.pi * hour_of_day_col / 24)
data_df['hour_of_day_cos'] = np.cos(2 * np.pi * hour_of_day_col / 24)

# Move the load column to the last position (pop it, then re-append it).
# The zero-padding used later for inverse scaling relies on this layout.
load_col = data_df.pop('Load (kW)')
data_df['Load (kW)'] = load_col

# Positional index of the target column (the last column after the move above)
target_col = (data_df.columns.get_loc('Load (kW)'))
num_target_features = 1
# Number of columns the scaler will be fitted on
scaler_num_features = data_df.shape[1]

In [None]:
# Preview the preprocessed, still unscaled frame; 'Load (kW)' is the last column.
data_df

In [None]:
#1. Scale data
# Min-max scale every column, then rebuild a DataFrame with the original
# column names and datetime index.
# NOTE(review): the scaler is fitted on the whole frame, including the rows
# that later form the test split, so test-set minima/maxima leak into the
# scaling. Consider fitting on the training rows only.
scaler = MinMaxScaler()
data_df_scaled = pd.DataFrame(scaler.fit_transform(data_df), columns=data_df.columns, index=data_df.index)

In [None]:
# Column names, in order, that the scaler was fitted on.
scaler.feature_names_in_

In [None]:
# Save the fitted scaler to a file using joblib
# (presumably so the same scaling can be re-applied outside this notebook).
# NOTE(review): confirm that ../models/scalers exists before running this cell.
dump(scaler, '../models/scalers/scaler.joblib')

In [None]:

#2. Create input/output sequence pairs
# target_col is the positional index of 'Load (kW)' in data_df. create_sequences
# uses it to pick the column that fills ys, while xs keeps every column,
# including the load itself.
# Each sample uses seq_length consecutive rows to predict the next forecast_horizon rows.
seq_length = 2
forecast_horizon = 1
xs, ys = create_sequences(data_df_scaled, seq_length, forecast_horizon, target_col)
print(xs.shape, ys.shape)

In [None]:
# Preview the scaled frame that the sequences were built from.
data_df_scaled

In [None]:
# First input window: seq_length consecutive rows, all columns.
xs[0]

In [None]:
# Target of the first window: the scaled load of the row(s) right after it.
ys[0]

In [None]:
#3. Split data
# Chronological split without shuffling: the first 90% of the sequences are
# used for training and the remaining ones for testing.
train_size = 0.9
num_samples = len(xs)
train_end = int(num_samples * train_size)

xs_train, xs_test = xs[:train_end], xs[train_end:]
ys_train, ys_test = ys[:train_end], ys[train_end:]

In [None]:

# Show which part of the load history is used for training and which for testing.
# train_end counts sequences, not rows, so the row boundary is taken from the
# number of test sequences: the test targets are the last len(xs_test) rows
# (this holds because forecast_horizon is 1).
first_test_row = len(data_df) - len(xs_test)

fig, ax = plt.subplots(figsize=(7, 3))
data_df['Load (kW)'].iloc[:first_test_row].plot(ax=ax, label="Train")
data_df['Load (kW)'].iloc[first_test_row:].plot(ax=ax, label="Test")
ax.legend()

In [None]:
# Scaled frame again, for comparison with the training arrays shown next.
data_df_scaled

In [None]:
# Training input windows (the first train_end sequences).
xs_train

In [None]:
# Training targets (scaled load) matching xs_train one-to-one.
ys_train

In [None]:
#4. Define the model configuration
# 'lstm_layers' is read in order by build_and_train_model: each entry becomes
# one LSTM layer, followed by a Dropout layer when the entry has a 'dropout' rate.
# Every layer but the last returns the full sequence, so the next LSTM layer
# still receives a sequence as its input.
# 'epochs' is an upper bound: build_and_train_model also attaches an
# EarlyStopping callback with a patience of 10 epochs.
model_config = {
    'lstm_layers': [
        {'units': 64, 'return_sequences': True,'dropout': 0.2},
        {'units': 32, 'return_sequences': True,'dropout': 0.2},
        {'units': 16, 'return_sequences': False}
    ],
    'epochs': 300,
    'batch_size': 32
}

In [None]:
#5. Build, train, and evaluate the model
# Train the same architecture several times and keep the run with the lowest
# loss on the test sequences.
# NOTE(review): choosing the best run by its test loss makes the reported
# "Best Loss" optimistic; a separate validation split would avoid that.
multivariate_load_foreacasting_load_temp_included_model_path = '../models/multivariate_load_foreacasting_load_temp_included_model.keras'
num_target_features = 1  # The number of output features for the model only load for now

# Define the number of experiments
num_experiments = 3

# Track every run's test loss and the best model seen so far
experiment_losses = []
best_loss = float('inf')
best_model = None

for experiment in range(num_experiments):
    # A distinct but fixed seed per run keeps the runs different from each
    # other while making the whole cell reproducible on a fresh kernel.
    tf.keras.utils.set_random_seed(experiment)

    # Build and train the model. The ModelCheckpoint callback inside also
    # writes to the same path during training; that file is overwritten
    # with the overall best model once all runs are done.
    model, history = build_and_train_model(
        xs_train, ys_train, model_config, num_target_features, path_to_save_model=multivariate_load_foreacasting_load_temp_included_model_path
    )

    # Evaluate the model
    loss = model.evaluate(xs_test, ys_test, verbose=0)
    experiment_losses.append(loss)

    # Update the best model if the current model's loss is lower than the best loss
    if loss < best_loss:
        best_loss = loss
        best_model = model

# Fail with a clear message instead of an AttributeError on `None.save(...)`
if best_model is None:
    raise RuntimeError('No experiment produced a usable test loss; there is no model to save.')

# Calculate the average loss
total_loss = sum(experiment_losses)
average_loss = total_loss / num_experiments

print(f'Average Test Loss over {num_experiments} experiments: {average_loss}')
print(f'Best Loss: {best_loss}')

# Save the best model
best_model.save(multivariate_load_foreacasting_load_temp_included_model_path)
print(f'Best model saved to {multivariate_load_foreacasting_load_temp_included_model_path}')

In [None]:
# Load the best model from the path into the `model` variable, so the cells
# below work with the saved file rather than with whichever run finished last.
model = tf.keras.models.load_model(multivariate_load_foreacasting_load_temp_included_model_path)
print('Best model loaded from path.')

In [None]:
# (number of test windows, seq_length, number of columns)
xs_test.shape

In [None]:
# Predict with the reloaded model

# Make predictions on the test set
predictions_scaled = model.predict(xs_test) # scaled load only: the model's final Dense layer has num_target_features units
predictions_scaled

In [None]:
# Scaled predictions again (same array as displayed by the prediction cell).
predictions_scaled

In [None]:
# Rescale the predictions and actual values
# predictions_scaled holds only the target column (Load), but the scaler was
# fitted on every column, so inverse_transform needs a full-width array.
# The other columns are therefore padded with zeros on the left, which puts
# the predictions in the last column. That is where 'Load (kW)' sits, because
# preprocessing popped the column and re-appended it at the end.
num_of_missing_training_features = data_df.shape[1] - num_target_features

padding_for_missing_training_features = np.zeros((predictions_scaled.shape[0], num_of_missing_training_features))
data_to_be_invert_from_scaling = np.hstack([padding_for_missing_training_features, predictions_scaled])
data_to_be_invert_from_scaling

In [None]:
# Zero-padded prediction matrix again (same array as built in the padding cell).
data_to_be_invert_from_scaling

In [None]:
# Model outputs in the original units: undo the scaling on the padded matrix
# and keep only the load column, which gives a 1-D array.
predictions= scaler.inverse_transform(data_to_be_invert_from_scaling)[:, target_col]
predictions

In [None]:
# Test targets, still in the scaler's units at this point.
ys_test

In [None]:
# Test input shape again; its first axis must match the number of test targets.
xs_test.shape

In [None]:
# Bring the actual test targets back to the original units, using the same
# zero-padding trick as for the predictions.
# The scaled targets are re-sliced from `ys` instead of being read from
# `ys_test`, because this cell overwrites `ys_test` with a 1-D rescaled array;
# reading it back on a second run would make np.hstack fail.
# NOTE(review): after this cell `ys_test` is in original units, so the
# training/evaluation cell must not be re-run without re-running the split first.
ys_test_in_scaler_units = ys[train_end:]
padding_for_missing_training_features = np.zeros((ys_test_in_scaler_units.shape[0], num_of_missing_training_features))
ys_test_scaled = np.hstack([padding_for_missing_training_features, ys_test_in_scaler_units])
ys_test = scaler.inverse_transform(ys_test_scaled)[:,target_col]

In [None]:
# Now 1-D: one load value per test sample.
ys_test.shape

In [None]:
# Actual test loads in the original units.
ys_test

In [None]:
# Total absolute error over the test set, in the original load units.
(abs(ys_test-predictions)).sum()

In [None]:
# Forecasted loads in the original units.
predictions

In [None]:
# Should equal ys_test.shape: one forecast per test sample.
predictions.shape

In [None]:
#6. Plot the results
# Timestamps of the test targets: the last len(xs_test) rows of the frame.
# This lines up with ys_test because forecast_horizon is 1.
test_index = data_df.index[-len(xs_test):]
hours_to_plot = 24*14 # 14 days' worth of rows if the data is hourly (two weeks, not a month)

plot_results(test_index, ys_test, predictions, hours_to_plot=hours_to_plot)

In [None]:
# Zoom in on 2019-06-01 .. 2019-07-01; the plot is empty if test_index has no timestamps in that range.
plot_results_from_to(test_index, ys_test, predictions,'2019-06-01', '2019-07-01')

In [None]:
# Average load per season; needs test_index, ys_test (rescaled) and predictions from the cells above
plot_seasonal_comparison(test_index, ys_test, predictions)

In [None]:
# Average load per month; needs test_index, ys_test (rescaled) and predictions from the cells above
plot_monthly_comparison(test_index, ys_test, predictions)

In [None]:
# Average load per weekday; needs test_index, ys_test (rescaled) and predictions from the cells above
plot_weekday_comparison(test_index, ys_test, predictions)

In [None]:
# Average load per hour of the day; needs test_index, ys_test (rescaled) and predictions from the cells above
plot_hourly_comparison(test_index, ys_test, predictions)

In [None]:
# Layer-by-layer overview of the reloaded model.
model.summary()

In [None]:
# Wrap the forecasts in a DataFrame indexed by the test timestamps; its single
# column carries the same name as the target column of data_df.
target_col_name = data_df.columns[target_col]
predictions_df = pd.DataFrame(predictions, columns=[target_col_name], index=test_index)
predictions_df

In [None]:
# Number of leading rows selected by train_end (train_end counts sequences, not rows).
data_df.iloc[:train_end]['Load (kW)'].shape

In [None]:
# Overlay the forecast on the train/test load history.
# train_end counts sequences, not rows, so the row boundary is taken from the
# number of test sequences; this is the same set of rows test_index covers.
first_test_row = len(data_df) - len(xs_test)

fig, ax = plt.subplots(figsize=(7, 4))
data_df['Load (kW)'].iloc[:first_test_row].plot(ax=ax, label="Train")
data_df['Load (kW)'].iloc[first_test_row:].plot(ax=ax, label="Test")
# Plot the forecast as a Series: DataFrame.plot names legend entries after the
# columns, so the label would otherwise show up as a second 'Load (kW)'.
predictions_df.iloc[:, 0].plot(ax=ax, label="Forecasted Load")
ax.legend()