## 04 - Evaluation

### Import packages and load the data

In [1]:
import datetime
import math
import os
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Root of the project's data directory. The default is the location the
# notebook was written against; set the DSLAB1_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
_DEFAULT_DATA_ROOT = '/Users/szejozsef00/Desktop/MSC/MSC 2. félév/DS Lab I/DSLAB1/data'
_data_root = (os.environ.get('DSLAB1_DATA_DIR') or _DEFAULT_DATA_ROOT).rstrip('/')

# Both paths keep their trailing slash: file names are appended to them by
# plain string concatenation when the CSV files are read.
data_input_path = _data_root + '/processed/'
prediction_output_path = _data_root + '/predictions/'

In [3]:
# Actual values, read from the processed-data folder
fact_df = pd.read_csv(
    data_input_path + 'modelling_df_0120.csv',
    parse_dates=['DATETIME'],
)

# Benchmark predictions, read from the predictions folder
bm_pred = pd.read_csv(
    prediction_output_path + 'bm_pred_2010_07_01_1_3_d.csv',
    parse_dates=['DATETIME'],
)

# Model predictions (with lower/upper bounds), read from the predictions folder
pred = pd.read_csv(
    prediction_output_path + 'pred_2010_07_01_0.7_2.csv',
    parse_dates=['DATETIME'],
)

### Merge the data

In [4]:
# Keep only the actuals of 2010-07-01 for locations whose id is below 100.
is_eval_day = fact_df['DATETIME'].dt.date == datetime.date(2010, 7, 1)
is_selected_location = fact_df['LOCATION'] < 100
fact_df = fact_df[is_eval_day & is_selected_location].copy(deep=True)

In [5]:
# Attach the actual value and the benchmark prediction to every prediction row.
# Left joins keep all rows of `pred`; unmatched rows get NaN in the new columns.
merge_keys = ['DATETIME', 'LOCATION']
actual_values = fact_df[merge_keys + ['VALUE']]
benchmark_values = bm_pred[merge_keys + ['bm_1d_prediction']]

eval_df = (
    pred
    .merge(actual_values, on=merge_keys, how='left')
    .merge(benchmark_values, on=merge_keys, how='left')
)

In [6]:
# Filter out locations where the actual value is 0 all the time.
# Each value is tested individually rather than testing the group sum, so
# values of opposite sign cannot cancel out and drop a location that does have
# signal. Missing actuals are treated as 0, which means a location without any
# actual value is dropped as well (the same outcome a zero sum gave).
is_nonzero_actual = eval_df['VALUE'].fillna(0).ne(0)
has_nonzero_actual = is_nonzero_actual.groupby(eval_df['LOCATION']).any()
locations_to_keep = has_nonzero_actual[has_nonzero_actual].index
eval_df = eval_df[eval_df['LOCATION'].isin(locations_to_keep)]

# Number of locations left for the evaluation
eval_df['LOCATION'].nunique()

83

In [7]:
def plot_with_bounds(location):
    """Plot actuals, the prediction and its lower/upper bounds for one location.

    Reads the module-level ``eval_df`` and shows an interactive Plotly figure
    with one line per series.

    Parameters
    ----------
    location
        Value matched against ``eval_df['LOCATION']``.
    """
    location_rows = eval_df[eval_df['LOCATION'] == location]

    # One entry per trace, in drawing order:
    # (eval_df column, legend name, line style, hover template)
    trace_specs = [
        ('VALUE', 'Actual Value',
         dict(color='blue', width=2),
         'Actual: %{y}<br>Date: %{x}'),
        ('prediction', 'Prediction',
         dict(color='orange', width=2),
         'Prediction: %{y}<br>Date: %{x}'),
        ('upper_bound', 'Upper Bound',
         dict(color='green', width=2, dash='dot'),
         'Upper Bound: %{y}<br>Date: %{x}'),
        ('lower_bound', 'Lower Bound',
         dict(color='red', width=2, dash='dot'),
         'Lower Bound: %{y}<br>Date: %{x}'),
    ]

    fig = go.Figure()
    for column, legend_name, line_style, hover_text in trace_specs:
        fig.add_trace(go.Scatter(
            x=location_rows['DATETIME'],
            y=location_rows[column],
            mode='lines',
            name=legend_name,
            line=line_style,
            hovertemplate=hover_text
        ))

    # Centered title near the top of the figure
    title_settings = {
        'text': f"Predictions with Bounds for Location {location}",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
    fig.update_layout(
        title=title_settings,
        xaxis_title='DateTime',
        yaxis_title='Value',
        legend_title='Legend',
        font=dict(family="Arial, sans-serif", size=12, color="black"),
        hovermode='x unified'
    )

    fig.show()

plot_with_bounds(3)  # Example usage with location 3

In [8]:
eval_df.head()

Unnamed: 0,LOCATION,DATETIME,horizont,prediction,lower_bound,upper_bound,VALUE,bm_1d_prediction
0,0,2010-07-01 00:00:00,288,314.813482,244.267873,358.978717,363.47,363.16
1,0,2010-07-01 00:05:00,288,382.325522,292.031503,419.092667,433.95,449.45
2,0,2010-07-01 00:10:00,288,426.046261,316.403927,472.927873,474.54,504.09
3,0,2010-07-01 00:15:00,288,426.046261,320.694082,472.927873,482.43,506.39
4,0,2010-07-01 00:20:00,288,385.078896,282.997054,395.804652,497.15,453.43


In [27]:
# Calculate sMAPE and RMSE by LOCATION for the model predictions and the benchmark.
# NOTE: the *_MAPE columns hold sMAPE (symmetric MAPE, in percent). The column
# names are kept as they are because later cells read them.


def _smape(actual, forecast):
    """Return the symmetric MAPE, in percent, of ``forecast`` against ``actual``.

    Points where both ``actual`` and ``forecast`` are 0 count as zero error.
    Dividing 0 by 0 there would give NaN, which a pandas mean skips, so exact
    hits at zero would be left out of the average.
    """
    actual = np.asarray(actual, dtype=float)
    forecast = np.asarray(forecast, dtype=float)
    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # Where scale is 0 the division is not evaluated and the 0 from `out` is kept.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 * ratio.mean()


def _rmse(actual, forecast):
    """Return the root mean squared error of ``forecast`` against ``actual``."""
    return math.sqrt(mean_squared_error(actual, forecast))


def _metric_by_location(frame, forecast_col, metric, name):
    """Evaluate ``metric(actual, forecast)`` per LOCATION.

    Returns a DataFrame with the columns ``LOCATION`` and ``name``. Only the
    two value columns are handed to ``apply`` so the grouping column is not
    part of the applied frame.
    """
    return (
        frame.groupby('LOCATION')[['VALUE', forecast_col]]
        .apply(lambda group: metric(group['VALUE'], group[forecast_col]))
        .reset_index(name=name)
    )


# Rows without a model prediction cannot be scored
eval_df = eval_df[eval_df['prediction'].notna()].sort_values(by=['LOCATION', 'DATETIME']).copy(deep=True)

# Model prediction metrics
pred_smape_by_location = _metric_by_location(eval_df, 'prediction', _smape, 'P_MAPE')
pred_rmse_by_location = _metric_by_location(eval_df, 'prediction', _rmse, 'P_RMSE')
pred_stat_df = pred_smape_by_location.merge(pred_rmse_by_location, on='LOCATION')

# Benchmark metrics
bm_pred_smape_by_location = _metric_by_location(eval_df, 'bm_1d_prediction', _smape, 'BM_MAPE')
bm_pred_rmse_by_location = _metric_by_location(eval_df, 'bm_1d_prediction', _rmse, 'BM_RMSE')
bm_pred_stat_df = bm_pred_smape_by_location.merge(bm_pred_rmse_by_location, on='LOCATION')

# Combine prediction and benchmark stats; joined on LOCATION so rows are
# matched by key rather than by position.
stat_df = pred_stat_df.merge(bm_pred_stat_df, on='LOCATION')

# Differences are benchmark minus prediction: a positive value means the
# model prediction has the lower error for that location.
stat_df['MAPE_DIFF'] = stat_df['BM_MAPE'] - stat_df['P_MAPE']
stat_df['RMSE_DIFF'] = stat_df['BM_RMSE'] - stat_df['P_RMSE']

# Display the final DataFrame
stat_df

Unnamed: 0,LOCATION,P_MAPE,P_RMSE,BM_MAPE,BM_RMSE,MAPE_DIFF,RMSE_DIFF
0,0,22.750642,106.229961,14.784591,71.123878,-7.966051,-35.106084
1,1,53.355644,60.382190,49.468045,46.419383,-3.887600,-13.962807
2,3,15.655496,102.836785,18.974537,128.436565,3.319041,25.599781
3,4,32.591156,87.202197,25.938407,73.240823,-6.652748,-13.961374
4,5,9.188657,90.222831,13.064560,129.815585,3.875902,39.592754
...,...,...,...,...,...,...,...
78,95,46.206238,69.231566,20.901059,49.737251,-25.305179,-19.494315
79,96,15.262271,112.731903,26.980495,176.695864,11.718224,63.963962
80,97,102.773702,218.783368,94.870245,204.077765,-7.903457,-14.705602
81,98,60.007843,83.681819,62.100154,90.304555,2.092311,6.622737


In [28]:
# Locations where the model gains the most / loses the most against the
# benchmark, ranked by MAPE_DIFF (benchmark error minus prediction error).
best_values = stat_df.sort_values(by='MAPE_DIFF', ascending=False).head(10)['LOCATION'].tolist()
worst_values = stat_df.sort_values(by='MAPE_DIFF', ascending=True).head(10)['LOCATION'].tolist()

# Locations with the highest / lowest P_MAPE of the model itself.
# Ranked by P_MAPE so the lists match their names; ranking by P_RMSE instead
# would order locations by a different metric.
worst_mape = stat_df.sort_values(by='P_MAPE', ascending=False).head(5)['LOCATION'].tolist()
best_mape = stat_df.sort_values(by='P_MAPE', ascending=True).head(5)['LOCATION'].tolist()

In [30]:
# Share of locations where the model prediction ("P") has the lower error and
# share where the benchmark ("BM") is at least as good. The *_DIFF columns are
# benchmark minus prediction, so a positive difference is a win for the model.
total_locations = len(stat_df)
win_rate_rows = (
    ("MAPE:", "P MAPE:", "BM MAPE:", 'MAPE_DIFF'),
    ("RMSE:", "P RMSE:", "BM RMSE:", 'RMSE_DIFF'),
)

for row_no, (header, model_label, benchmark_label, diff_col) in enumerate(win_rate_rows):
    if row_no > 0:
        print()  # blank line between the two metric sections
    model_wins = stat_df[stat_df[diff_col] > 0]
    benchmark_wins_or_ties = stat_df[stat_df[diff_col] <= 0]
    print(header)
    print(model_label, len(model_wins) / total_locations)
    print(benchmark_label, len(benchmark_wins_or_ties) / total_locations)

MAPE:
P MAPE: 0.5903614457831325
BM MAPE: 0.40963855421686746

RMSE:
P RMSE: 0.5542168674698795
BM RMSE: 0.4457831325301205


In [12]:
import plotly.graph_objects as go

def plot_predictions(location):
    """Plot actuals, the model prediction and the benchmark for one location.

    Reads the module-level ``eval_df`` (time series) and ``stat_df``
    (per-location metrics), writes the location's metrics into the title and
    shows an interactive Plotly figure.

    Parameters
    ----------
    location
        Value matched against the ``LOCATION`` column of both frames.

    Raises
    ------
    IndexError
        If ``stat_df`` has no row for ``location``.
    """
    location_rows = eval_df[eval_df['LOCATION'] == location]

    # Metric values of this location, shown in the title
    stat_mask = stat_df['LOCATION'] == location
    pred_mape, pred_rmse, bm_mape, bm_rmse = (
        stat_df.loc[stat_mask, column].values[0]
        for column in ('P_MAPE', 'P_RMSE', 'BM_MAPE', 'BM_RMSE')
    )

    # One entry per trace, in drawing order:
    # (eval_df column, legend name, line colour, hover template)
    trace_specs = [
        ('VALUE', 'Actual Value', 'blue', 'Actual: %{y}<br>Date: %{x}'),
        ('prediction', 'Prediction', 'orange', 'Prediction: %{y}<br>Date: %{x}'),
        ('bm_1d_prediction', 'Benchmark Prediction', 'green', 'Benchmark: %{y}<br>Date: %{x}'),
    ]

    fig = go.Figure()
    for column, legend_name, colour, hover_text in trace_specs:
        fig.add_trace(go.Scatter(
            x=location_rows['DATETIME'],
            y=location_rows[column],
            mode='lines',
            name=legend_name,
            line=dict(color=colour, width=2),
            hovertemplate=hover_text
        ))

    # Three-line, centered title: location, model metrics, benchmark metrics
    title_text = (
        f"Predictions for Location {location}<br>"
        f"P_MAPE: {pred_mape:.2f}, P_RMSE: {pred_rmse:.2f}<br>"
        f"BM_MAPE: {bm_mape:.2f}, BM_RMSE: {bm_rmse:.2f}"
    )
    title_settings = {
        'text': title_text,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
    fig.update_layout(
        title=title_settings,
        xaxis_title='DateTime',
        yaxis_title='Value',
        legend_title='Legend',
        font=dict(family="Arial, sans-serif", size=12, color="black"),
        hovermode='x unified'
    )

    fig.show()

In [13]:
# One figure per location in `best_values`
for location_id in best_values:
    plot_predictions(location_id)

In [14]:
# One figure per location in `worst_values`
for location_id in worst_values:
    plot_predictions(location_id)

In [15]:
# One figure per location in `best_mape`
for location_id in best_mape:
    plot_predictions(location_id)