In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Notes
Data Exploration:
- Electricity demand is for NSW
- Temperature is for Bankstown only
- Electricity demand and temperature are recorded at different time intervals

Forecasting analysis:
- Clear relationship between temperature and forecasting inaccuracy
    - Observed in both the actual (raw) forecast error and the relative error
    - Inaccuracy is greater at extremely high temperatures

In [3]:
# Load the three raw CSV extracts and parse their timestamp columns.
# In every read, skiprows=1 drops the first line of the file (presumably its
# own header row) and `names` supplies the snake_case column names used below.

# Temperature observations (day-first timestamps, minute resolution).
df_temperature = pd.read_csv(
    'data/temperature_nsw.csv',
    skiprows=1,
    names=['location', 'date_time', 'temperature'],
)
df_temperature['date_time'] = pd.to_datetime(
    df_temperature['date_time'], format="%d/%m/%Y %H:%M"
)

# Actual total demand (same timestamp layout as the temperature file).
df_demand = pd.read_csv(
    'data/totaldemand_nsw.csv',
    skiprows=1,
    names=['date_time', 'total_demand', 'region_id'],
)
df_demand['date_time'] = pd.to_datetime(
    df_demand['date_time'], format="%d/%m/%Y %H:%M"
)

# Forecast demand: two timestamp columns, both ISO-style with seconds.
df_forecast = pd.read_csv(
    'data/forecastdemand_nsw.csv',
    skiprows=1,
    names=[
        'id',
        'region_id',
        'period_id',
        'forecast_demand',
        'date_time_forecast',
        'date_time_prediction',
    ],
)
df_forecast['date_time_forecast'] = pd.to_datetime(
    df_forecast['date_time_forecast'], format="%Y-%m-%d %H:%M:%S"
)
df_forecast['date_time_prediction'] = pd.to_datetime(
    df_forecast['date_time_prediction'], format="%Y-%m-%d %H:%M:%S"
)

In [None]:
# Exploration - Temperature Dataset
# Prints the distinct locations, the date and temperature ranges, the first
# rows, and the row count. A single print with sep="\n" emits the same text
# as one print call per item.
print(
    "Locations = {}".format(set(df_temperature["location"])),
    "Date Min = {}  |  Date Max = {}".format(
        df_temperature["date_time"].min(),
        df_temperature["date_time"].max(),
    ),
    "Temp Min = {}  |  Temp Max = {}\n".format(
        df_temperature["temperature"].min(),
        df_temperature["temperature"].max(),
    ),
    df_temperature.head(),
    "\nRows = {}".format(len(df_temperature)),
    sep="\n",
)

In [None]:
# Exploration - Demand Dataset
# Prints the distinct region ids, the date and demand ranges, the first rows,
# and the row count. A single print with sep="\n" emits the same text as one
# print call per item.
print(
    "Regions = {}".format(set(df_demand["region_id"])),
    "Date Min = {}  |  Date Max = {}".format(
        df_demand["date_time"].min(),
        df_demand["date_time"].max(),
    ),
    "Demand Min = {}  |  Demand Max = {}\n".format(
        df_demand["total_demand"].min(),
        df_demand["total_demand"].max(),
    ),
    df_demand.head(),
    "\nRows = {}".format(len(df_demand)),
    sep="\n",
)

In [None]:
# Exploration - Forecast Dataset
# Prints the distinct region ids, the ranges of both timestamp columns and of
# the forecast demand, the first rows, and the row count. A single print with
# sep="\n" emits the same text as one print call per item.
print(
    "Regions = {}".format(set(df_forecast["region_id"])),
    # Disabled in the original cell:
    # "Periods = {}".format(set(df_forecast.period_id)),
    "Forecast Date Min = {}  |  Forecast Date Max = {}".format(
        df_forecast["date_time_forecast"].min(),
        df_forecast["date_time_forecast"].max(),
    ),
    "Predict Date Min = {}  |  Predict Date Max = {}".format(
        df_forecast["date_time_prediction"].min(),
        df_forecast["date_time_prediction"].max(),
    ),
    "Forecast Demand Min = {}  |  Forecast Demand Max = {}\n".format(
        df_forecast["forecast_demand"].min(),
        df_forecast["forecast_demand"].max(),
    ),
    df_forecast.head(),
    "\nRows = {}".format(len(df_forecast)),
    sep="\n",
)

In [None]:
# Build a table with one ~24-hour-ahead forecast per half hour and join it to
# the actual demand, then compute the forecast error.
#
# The timestamp arithmetic uses vectorised `.dt` accessors instead of row-wise
# `.apply(lambda ...)`, which is what made this cell slow on the full frame.

# forecast_interval is the number of hours between a prediction and its
# forecast, i.e. date_time_prediction - date_time_forecast.
interval = 60*60  # seconds per hour; divisor that converts seconds to hours
df_forecast["forecast_interval"] = (
    df_forecast.date_time_prediction - df_forecast.date_time_forecast
).dt.total_seconds() / interval

# Keep only forecasts whose interval lies strictly inside a window around 24h.
interval_min, interval_max = 23, 25  # window bounds in hours (both exclusive)
in_window = (
    (df_forecast.forecast_interval > interval_min)
    & (df_forecast.forecast_interval < interval_max)
)
# Explicit copy: a column is added below, and assigning into a filtered subset
# of df_forecast would otherwise trigger SettingWithCopyWarning.
df_forecast_near24hour = df_forecast.loc[in_window].copy()

# Round the forecast timestamp to the nearest 30 minutes so that it can be
# matched against df_demand.date_time.
df_forecast_near24hour["date_time_forecast_rounded"] = (
    df_forecast_near24hour.date_time_forecast.dt.round('30min')
)

# Keep a single record per rounded timestamp: the one with the largest
# forecast_interval inside the window.
# NOTE(review): idxmax picks the interval closest to interval_max, which is not
# necessarily the one closest to 24h - confirm this is the intended choice.
df_forecast_near24hour_1instance = df_forecast_near24hour.loc[
    df_forecast_near24hour.groupby("date_time_forecast_rounded")["forecast_interval"].idxmax()
]

# Merge forecast data with demand data (inner join: unmatched timestamps drop out).
# NOTE(review): forecast_interval is positive here, so date_time_prediction is
# the later of the two timestamps, yet actual demand is matched on the earlier
# one (date_time_forecast_rounded) - confirm this is the time the forecast
# value refers to.
df_forecast_near24hour_1instance_with_demand = pd.merge(
    df_forecast_near24hour_1instance,
    df_demand,
    left_on="date_time_forecast_rounded",
    right_on="date_time",
)

# Signed error: positive when actual demand exceeded the forecast.
df_forecast_near24hour_1instance_with_demand["forecast_error"] = (
    df_forecast_near24hour_1instance_with_demand.total_demand
    - df_forecast_near24hour_1instance_with_demand.forecast_demand
)

In [None]:
# Compare the ~24h-ahead forecast with actual demand at one time of day across
# a single year, then show the distribution of the forecast error.
plot_year = 2018
plot_hour = 16
# date_time is on a 30-minute grid (it equals the rounded merge key), so
# filtering on the hour alone keeps both the HH:00 and HH:30 rows. That puts
# two points on every day-of-year and contradicts the "HH:00" legend labels,
# so the selection is restricted to the top of the hour.
plot_minute = 0

at_plot_time = (
    (df_forecast_near24hour_1instance_with_demand.date_time.dt.year == plot_year)
    & (df_forecast_near24hour_1instance_with_demand.date_time.dt.hour == plot_hour)
    & (df_forecast_near24hour_1instance_with_demand.date_time.dt.minute == plot_minute)
)
df_plot = df_forecast_near24hour_1instance_with_demand.loc[at_plot_time]
day_of_year = df_plot.date_time.dt.dayofyear

# Top panel: forecast vs actual demand; bottom panel: signed forecast error.
fig, (ax_demand, ax_error) = plt.subplots(2, 1, figsize=(12, 7))

ax_demand.set_title("Forecasting 24h into the future")
ax_demand.plot(day_of_year, df_plot.forecast_demand, label='{} {}:00 - Demand Forecast'.format(plot_year, plot_hour))
ax_demand.plot(day_of_year, df_plot.total_demand, label='{} {}:00 - Demand Actual'.format(plot_year, plot_hour))
ax_demand.set_ylabel("Demand")
ax_demand.legend(loc='upper right')

ax_error.plot(day_of_year, df_plot.forecast_error, 'g.-', label='{} {}:00 - Error'.format(plot_year, plot_hour))
ax_error.set_xlabel("Day of year")
ax_error.set_ylabel("Forecast error (actual - forecast)")
ax_error.legend(loc='upper right')

# Histogram of the error with its median and mean marked.
fig_hist, ax_hist = plt.subplots(figsize=(5, 4))
sns.histplot(df_plot.forecast_error, bins=50, ax=ax_hist)
ax_hist.axvline(df_plot.forecast_error.median(), color='r', ls='--', label='median')
ax_hist.axvline(df_plot.forecast_error.mean(), color='m', label='mean')
ax_hist.legend(loc='upper right');

In [None]:
# Attach temperature readings to the forecast/demand table by exact timestamp
# match (default inner join), then express the error relative to actual demand.
df_forecast_near24hour_1instance_with_demand_temperature = (
    df_forecast_near24hour_1instance_with_demand.merge(
        df_temperature,
        left_on="date_time_forecast_rounded",
        right_on="date_time",
    )
)
df_forecast_near24hour_1instance_with_demand_temperature["forecast_error_relative"] = (
    df_forecast_near24hour_1instance_with_demand_temperature["forecast_error"]
    / df_forecast_near24hour_1instance_with_demand_temperature["total_demand"]
)

# Plotting frame: only the columns needed, with temperature rounded to whole
# numbers so that each value becomes one box on the x axis.
df_plot = df_forecast_near24hour_1instance_with_demand_temperature[
    ["temperature", "forecast_error", "forecast_error_relative"]
].assign(temperature=lambda frame: frame["temperature"].round())

# Signed forecast error per rounded temperature; the faint red line marks zero error.
plt.figure(figsize=(12, 7))
ax_error = sns.boxplot(x="temperature", y="forecast_error", data=df_plot, fliersize=1)
ax_error.axhline(0, color='r', alpha=0.2)
plt.xticks(rotation=90);
ax_error.set_title("Accuracy of forecasting 24h into the future")

# Same view for the error expressed as a fraction of actual demand.
plt.figure(figsize=(12, 7))
ax_relative = sns.boxplot(x="temperature", y="forecast_error_relative", data=df_plot, fliersize=1)
ax_relative.axhline(0, color='r', alpha=0.2)
plt.xticks(rotation=90);
ax_relative.set_ylabel("Forecast Error as Portion of Actual Demand")