In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
#from google.colab import drive
#drive.mount('/gdrive')
#%cd "/gdrive/MyDrive/Colab Notebooks/Capstone Project"

Mounted at /gdrive
/gdrive/MyDrive/Colab Notebooks/Capstone Project


Explaratory modelling using a shallow LSTM neural netowrk for *NSW* dataset

In [2]:
# Get device 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# Load the pv+demand+forecast combined data with the DATETIME column as the index
# Change path to your location
combined = pd.read_csv("pv_demand_forecast_combined.csv", index_col="DATETIME")

In [4]:
#Get a count of features in the dataset
combined.count()

INTERVAL_DATETIME    283486
POWER                283352
STATE                283486
TOTALDEMAND          217842
LOCATION             217754
TEMPERATURE          217754
FORECASTDEMAND       218010
Weekday              221887
Quarter              221887
Month                221887
Season               221887
Day                  221887
Year                 221887
SUNRISE              221887
SUNSET               221887
DAYTYPE              221887
dtype: int64

In [5]:
# We have missing data
# Drop rows with null values (for now)
combined_full = combined.dropna()
combined_full.count()
#combined.isna().sum()
# We now have 213743 rows

INTERVAL_DATETIME    213743
POWER                213743
STATE                213743
TOTALDEMAND          213743
LOCATION             213743
TEMPERATURE          213743
FORECASTDEMAND       213743
Weekday              213743
Quarter              213743
Month                213743
Season               213743
Day                  213743
Year                 213743
SUNRISE              213743
SUNSET               213743
DAYTYPE              213743
dtype: int64

In [6]:
# Since we are using DATETIME as the index (refer to https://www.crosstab.io/articles/time-series-pytorch-lstm)
# Because of different states, DATETIME is NOT unique
# We focus only on NSW to make sure the DATETIME is unique
nsw = combined_full[combined_full.STATE == "NSW"]
nsw

Unnamed: 0_level_0,INTERVAL_DATETIME,POWER,STATE,TOTALDEMAND,LOCATION,TEMPERATURE,FORECASTDEMAND,Weekday,Quarter,Month,Season,Day,Year,SUNRISE,SUNSET,DAYTYPE
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-03-06 09:30:00,2018-03-06 09:30:00,427.281,NSW,8051.25,Bankstown,16.3,8432.046190,Tuesday,1.0,3.0,Autumn,6.0,2018.0,2018-03-06 05:48:00,2018-03-06 18:27:00,Day
2018-03-06 10:00:00,2018-03-06 10:00:00,495.195,NSW,8062.84,Bankstown,16.3,8413.074419,Tuesday,1.0,3.0,Autumn,6.0,2018.0,2018-03-06 05:48:00,2018-03-06 18:27:00,Day
2018-03-06 10:30:00,2018-03-06 10:30:00,516.137,NSW,8058.62,Bankstown,16.3,8367.147273,Tuesday,1.0,3.0,Autumn,6.0,2018.0,2018-03-06 05:48:00,2018-03-06 18:27:00,Day
2018-03-06 11:00:00,2018-03-06 11:00:00,549.542,NSW,7927.81,Bankstown,16.9,8334.082667,Tuesday,1.0,3.0,Autumn,6.0,2018.0,2018-03-06 05:48:00,2018-03-06 18:27:00,Day
2018-03-06 11:30:00,2018-03-06 11:30:00,556.764,NSW,7696.73,Bankstown,17.7,8289.618478,Tuesday,1.0,3.0,Autumn,6.0,2018.0,2018-03-06 05:48:00,2018-03-06 18:27:00,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-17 22:00:00,2021-03-17 22:00:00,0.000,NSW,7419.77,Bankstown,19.7,7317.702687,Wednesday,1.0,3.0,Autumn,17.0,2021.0,2021-03-17 05:57:00,2021-03-17 18:12:00,Night
2021-03-17 22:30:00,2021-03-17 22:30:00,0.000,NSW,7417.91,Bankstown,19.5,7266.887647,Wednesday,1.0,3.0,Autumn,17.0,2021.0,2021-03-17 05:57:00,2021-03-17 18:12:00,Night
2021-03-17 23:00:00,2021-03-17 23:00:00,0.000,NSW,7287.32,Bankstown,19.1,7173.866522,Wednesday,1.0,3.0,Autumn,17.0,2021.0,2021-03-17 05:57:00,2021-03-17 18:12:00,Night
2021-03-17 23:30:00,2021-03-17 23:30:00,0.000,NSW,7172.39,Bankstown,18.8,7043.236857,Wednesday,1.0,3.0,Autumn,17.0,2021.0,2021-03-17 05:57:00,2021-03-17 18:12:00,Night


# Experiment 1
Use **temperature only** to predict demand



In [7]:
# Create target and features
target = "TOTALDEMAND"
features = ["TEMPERATURE"]

In [15]:
nsw.index

Index(['2018-03-06 09:30:00', '2018-03-06 10:00:00', '2018-03-06 10:30:00',
       '2018-03-06 11:00:00', '2018-03-06 11:30:00', '2018-03-06 12:00:00',
       '2018-03-06 12:30:00', '2018-03-06 13:00:00', '2018-03-06 13:30:00',
       '2018-03-06 14:00:00',
       ...
       '2021-03-17 19:30:00', '2021-03-17 20:00:00', '2021-03-17 20:30:00',
       '2021-03-17 21:00:00', '2021-03-17 21:30:00', '2021-03-17 22:00:00',
       '2021-03-17 22:30:00', '2021-03-17 23:00:00', '2021-03-17 23:30:00',
       '2021-03-18 00:00:00'],
      dtype='object', name='DATETIME', length=53105)

In [8]:
# Split dataset to train and test
# Want to use the first 80% approximately for training and then use the last 20% for prediction
# Date below gives us around 80% train and 20% test (after trial and error)
test_start = "2020-08-01 00:00:00"
#Drop the columns NOT required for this experiment
nsw.drop(['INTERVAL_DATETIME','POWER','STATE', 'LOCATION', 'Weekday', 'Quarter',
      'Month', 'Season', 'Day', 'Year', 'SUNRISE', 'SUNSET', 'DAYTYPE'], axis=1, inplace=True)

# Create the train set
df_train = nsw.loc[:test_start].copy()
# Create the test set
df_test = nsw.loc[test_start:].copy()

#Get percentage of test set
len(df_test)/len(nsw)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


0.19887016288485077

In [11]:
# Sneak preview into the train data
df_train.head()

Unnamed: 0_level_0,TOTALDEMAND,TEMPERATURE,FORECASTDEMAND
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-06 09:30:00,0.083055,-0.183369,0.404018
2018-03-06 10:00:00,0.092402,-0.183369,0.388392
2018-03-06 10:30:00,0.088999,-0.183369,0.350564
2018-03-06 11:00:00,-0.016492,-0.085633,0.323331
2018-03-06 11:30:00,-0.202844,0.044681,0.286708


In [14]:
#Standardise the data
target_mean = df_train[target].mean()
target_stdev = df_train[target].std()

for c in df_train.columns:
    mean = df_train[c].mean()
    stdev = df_train[c].std()

    df_train[c] = (df_train[c] - mean) / stdev
    df_test[c] = (df_test[c] - mean) / stdev



In [15]:
# Sneak preview into training set after standardising
df_train.head()

Unnamed: 0_level_0,TOTALDEMAND,TEMPERATURE,FORECASTDEMAND
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-06 09:30:00,0.083055,-0.183369,0.404018
2018-03-06 10:00:00,0.092402,-0.183369,0.388392
2018-03-06 10:30:00,0.088999,-0.183369,0.350564
2018-03-06 11:00:00,-0.016492,-0.085633,0.323331
2018-03-06 11:30:00,-0.202844,0.044681,0.286708


In [16]:
# Current MSE value for NSW train values from forecast data provided
nsw_mse_train = mean_squared_error(df_train['TOTALDEMAND'].to_numpy(), df_train['FORECASTDEMAND'].to_numpy())
nsw_mse_train

0.4192650714419789

In [17]:
# Current MSE value for NSW test values from forecast data provided
nsw_mse_test = mean_squared_error(df_test['TOTALDEMAND'].to_numpy(), df_test['FORECASTDEMAND'].to_numpy())
nsw_mse_test

0.48196713877808667

In [18]:
# Create the dataset (needed by pytorch) to load the dataset into a dataloader (to create batches for NN)
class SequenceDataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length=5):
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i): 
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start:(i + 1), :]
        else:
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0:(i + 1), :]
            x = torch.cat((padding, x), 0)

        return x, self.y[i]

In [19]:
# Create DataLoader
# Parameters below can be optimised
torch.manual_seed(48)
batch_size = 4
sequence_length = 30

train_dataset = SequenceDataset(df_train, target, features, sequence_length)
test_dataset = SequenceDataset(df_test, target, features, sequence_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


In [20]:
# Confim train loader has been created correctly

x,y = next(iter(train_loader))

In [21]:
# Look at the shape of the features
x.shape

torch.Size([4, 30, 1])

In [32]:
# Look at the shape of the target
y.shape

torch.Size([4])

In [24]:
# Create LSTM Model
# Very shallow model
class TimeSeriesLSTM(nn.Module):
    def __init__(self, num_sensors, hidden_units):
        super().__init__()
        self.num_sensors = num_sensors  # this is the number of features
        self.hidden_units = hidden_units
        self.num_layers = 1

        self.lstm = nn.LSTM(
            input_size=num_sensors,
            hidden_size=hidden_units,
            batch_first=True,
            num_layers=self.num_layers
        )

        self.linear = nn.Linear(in_features=self.hidden_units, out_features=1)

    def forward(self, x):
        batch_size = x.shape[0]
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_units).to(device).requires_grad_()
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_units).to(device).requires_grad_()

        _, (hn, _) = self.lstm(x, (h0, c0))
        out = self.linear(hn[0]).flatten()  # First dim of Hn is num_layers, which is set to 1 above.

        return out

In [25]:
# Set the parameters (can be optimised)
learning_rate = 0.001
num_hidden_units = 16

model = TimeSeriesLSTM(num_sensors=len(features), hidden_units=num_hidden_units).to(device)
loss_function = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
# Create training function
def train_model(data_loader, model, loss_function, optimizer, device):
    num_batches = len(data_loader)
    total_loss = 0
    model.train()

    for X, y in data_loader:
        output = model(X.to(device))
        loss = loss_function(output, y.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Train loss: {avg_loss}")

In [27]:
# Create test function
def test_model(data_loader, model, loss_function, device):

    num_batches = len(data_loader)
    total_loss = 0

    model.eval()
    with torch.no_grad():
        for X, y in data_loader:
            output = model(X.to(device))
            total_loss += loss_function(output, y.to(device)).item()

    avg_loss = total_loss / num_batches
    print(f"Test loss: {avg_loss}")

In [28]:
# Do a quick test of the loss function before training 
# Error metric is worse than the baseline provided by the forecasting demand
print("Untrained test\n--------")
test_model(test_loader, model, loss_function, device)
print()

Untrained test
--------
Test loss: 1.0050211743091433



In [29]:
# Train the model for 5 epochs
epochs = 5
for ix_epoch in range(epochs):
    print(f"Epoch {ix_epoch}\n---------")
    train_model(train_loader, model, loss_function, optimizer, device)
    test_model(test_loader, model, loss_function, device)
    print()

Epoch 0
---------
Train loss: 0.5428208862437038
Test loss: 0.5142791559923241

Epoch 1
---------
Train loss: 0.41723529326558
Test loss: 0.4497078992394658

Epoch 2
---------
Train loss: 0.3800697225404983
Test loss: 0.4570235301183649

Epoch 3
---------
Train loss: 0.3665003904664714
Test loss: 0.45406635491856623

Epoch 4
---------
Train loss: 0.3572986234561748
Test loss: 0.4574393224470467



In [30]:
# Create predict function
def predict(data_loader, model, device):

    output = torch.tensor([]).to(device)
    model.eval()
    with torch.no_grad():
        for X, _ in data_loader:
            y_star = model(X.to(device))
            output = torch.cat((output, y_star.to(device)), 0)

    return output

In [31]:
# Perform evaluation
# Get the prediction values of all NSW data using this model

ystar_col = "Model forecast"
df_train[ystar_col] = predict(train_loader, model, device).cpu().numpy()
df_test[ystar_col] = predict(test_loader, model, device).cpu().numpy()

df_out = pd.concat((df_train, df_test))[[target, ystar_col]]

for c in df_out.columns:
    df_out[c] = df_out[c] * target_stdev + target_mean

print(df_out)
df_out.to_csv("nsw_temp_demand.csv",index=True)

                     TOTALDEMAND  Model forecast
DATETIME                                        
2018-03-06 09:30:00     0.083055       -1.120308
2018-03-06 10:00:00     0.092402       -1.123658
2018-03-06 10:30:00     0.088999       -0.096148
2018-03-06 11:00:00    -0.016492        0.360042
2018-03-06 11:30:00    -0.202844       -0.890043
...                          ...             ...
2021-03-17 22:00:00    -0.090045       -0.256976
2021-03-17 22:30:00    -0.091626       -0.969447
2021-03-17 23:00:00    -0.202682        0.296543
2021-03-17 23:30:00    -0.300420        1.952817
2021-03-18 00:00:00    -0.366650       -0.867560

[53106 rows x 2 columns]


# Add More features apart from PV data
Features added: **"TEMPERATURE", "Weekday_ENC", "Quarter","Month","Season_ENC","Day","Year","DAYTYPE_ENC"**

ENC stands for encoded

In [32]:
# Load the nsw dataset again to start second experiment
nsw = combined_full[combined_full.STATE == "NSW"]

In [33]:
# Create target and features
target = "TOTALDEMAND"
features = ["TEMPERATURE", "Weekday_ENC", "Quarter","Month","Season_ENC","Day","Year","DAYTYPE_ENC"]

In [34]:
# Label encode weekday
le = LabelEncoder()
a = list(nsw['Weekday'])
enc = le.fit_transform(a)
nsw['Weekday_ENC'] = enc

# Label endocde Season
a = list(nsw['Season'])
enc = le.fit_transform(a)
nsw['Season_ENC'] = enc

# Label endocde DayType
a = list(nsw['DAYTYPE'])
enc = le.fit_transform(a)
nsw['DAYTYPE_ENC'] = enc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [35]:
#Split to train and test dataset (80%, 20%)
test_start = "2020-08-01 00:00:00" 
# Drop columns not needed
nsw.drop(['INTERVAL_DATETIME','POWER','STATE', 'LOCATION', 'Weekday', 'Season', 'DAYTYPE',"SUNRISE","SUNSET",], axis=1, inplace=True)
# Create train dataset
df_train = nsw.loc[:test_start].copy()
# Create test dataset
df_test = nsw.loc[test_start:].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [37]:
# Sneak preview into the train dataset
df_train.head()

Unnamed: 0_level_0,TOTALDEMAND,TEMPERATURE,FORECASTDEMAND,Quarter,Month,Day,Year,Weekday_ENC,Season_ENC,DAYTYPE_ENC
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-03-06 09:30:00,8051.25,16.3,8432.04619,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 10:00:00,8062.84,16.3,8413.074419,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 10:30:00,8058.62,16.3,8367.147273,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 11:00:00,7927.81,16.9,8334.082667,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 11:30:00,7696.73,17.7,8289.618478,1.0,3.0,6.0,2018.0,5,0,0


In [38]:
#Standardise the data
target_mean = df_train[target].mean()
target_stdev = df_train[target].std()

for c in df_train.columns:
    mean = df_train[c].mean()
    stdev = df_train[c].std()

    df_train[c] = (df_train[c] - mean) / stdev
    df_test[c] = (df_test[c] - mean) / stdev

In [39]:
# Sneak preview into the data after standardising the data
df_test

Unnamed: 0_level_0,TOTALDEMAND,TEMPERATURE,FORECASTDEMAND,Quarter,Month,Day,Year,Weekday_ENC,Season_ENC,DAYTYPE_ENC
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-08-01 00:00:00,-0.421897,0.924306,0.749745,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 00:30:00,-0.561306,0.908016,0.584319,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 00:30:00,-0.561306,0.908016,0.584319,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 01:00:00,-0.715796,0.859148,0.413267,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 01:00:00,-0.715796,0.859148,0.413267,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
...,...,...,...,...,...,...,...,...,...,...
2021-03-17 22:00:00,-0.426195,0.370468,-0.513804,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862
2021-03-17 22:30:00,-0.427695,0.337889,-0.555658,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862
2021-03-17 23:00:00,-0.533008,0.272732,-0.632274,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862
2021-03-17 23:30:00,-0.625692,0.223864,-0.739867,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862


In [40]:
# Create DataLoader (using same parameter as above)
torch.manual_seed(48)
batch_size = 4
sequence_length = 30

train_dataset = SequenceDataset(df_train, target, features, sequence_length)
test_dataset = SequenceDataset(df_test, target, features, sequence_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [41]:
# Set the parameters
learning_rate = 0.001
num_hidden_units = 16

model = TimeSeriesLSTM(num_sensors=len(features), hidden_units=num_hidden_units).to(device)
loss_function = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [42]:
# Untrained test (should be the same as the untrained test above)
print("Untrained test\n--------")
test_model(test_loader, model, loss_function, device)
print()

Untrained test
--------
Test loss: 1.108964520714795



In [43]:
# Train the model for 5 epochs
epochs = 5
for ix_epoch in range(epochs):
    print(f"Epoch {ix_epoch}\n---------")
    train_model(train_loader, model, loss_function, optimizer, device)
    test_model(test_loader, model, loss_function, device)
    print()

Epoch 0
---------
Train loss: 0.3264541288358996
Test loss: 0.32544639126183617

Epoch 1
---------
Train loss: 0.20055060077107328
Test loss: 0.3343333269755784

Epoch 2
---------
Train loss: 0.17062740508553026
Test loss: 0.26097347380170954

Epoch 3
---------
Train loss: 0.14866465145480248
Test loss: 0.34208364551516646

Epoch 4
---------
Train loss: 0.1376034491438435
Test loss: 0.3188501123073312



In [44]:
# Perform evaluation
# Get the prediction values of all NSW data using this model

ystar_col = "Model forecast"
df_train[ystar_col] = predict(train_loader, model, device).cpu().numpy()
df_test[ystar_col] = predict(test_loader, model, device).cpu().numpy()

df_out = pd.concat((df_train, df_test))[[target, ystar_col]]

for c in df_out.columns:
    df_out[c] = df_out[c] * target_stdev + target_mean

print(df_out[['TOTALDEMAND','Model forecast']])
df_out.to_csv("nsw_no_pv.csv",index=True)

                     TOTALDEMAND  Model forecast
DATETIME                                        
2018-03-06 09:30:00      8051.25     6246.380371
2018-03-06 10:00:00      8062.84     9088.916016
2018-03-06 10:30:00      8058.62     7354.463379
2018-03-06 11:00:00      7927.81    10687.624023
2018-03-06 11:30:00      7696.73     8281.809570
...                          ...             ...
2021-03-17 22:00:00      7419.77     7225.657227
2021-03-17 22:30:00      7417.91     7698.907227
2021-03-17 23:00:00      7287.32     8300.642578
2021-03-17 23:30:00      7172.39     8976.543945
2021-03-18 00:00:00      7094.51     8537.544922

[53106 rows x 2 columns]


# Experiment 3

**With PV Data Included**

In [45]:
# Load the nsw dataset again to start third experiment
nsw = combined_full[combined_full.STATE == "NSW"]

In [46]:
# Create target and features
target = "TOTALDEMAND"
features = ["POWER", "TEMPERATURE", "Weekday_ENC", "Quarter","Month","Season_ENC","Day","Year","DAYTYPE_ENC"]

In [47]:
# Label encode weekday
le = LabelEncoder()
a = list(nsw['Weekday'])
enc = le.fit_transform(a)
nsw['Weekday_ENC'] = enc

# Label endocde Season
a = list(nsw['Season'])
enc = le.fit_transform(a)
nsw['Season_ENC'] = enc

# Label endocde Season
a = list(nsw['DAYTYPE'])
enc = le.fit_transform(a)
nsw['DAYTYPE_ENC'] = enc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [48]:
#Split to train and test dataset (80%, 20% split)
test_start = "2020-08-01 00:00:00"
# Drop columns not required
nsw.drop(['INTERVAL_DATETIME','STATE', 'LOCATION', 'Weekday', 'Season', 'DAYTYPE',"SUNRISE","SUNSET",], axis=1, inplace=True)
# Create the train dataset
df_train = nsw.loc[:test_start].copy()
# Create the test dataset
df_test = nsw.loc[test_start:].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [49]:
# Sneak preview into the train dataset
df_train.head()

Unnamed: 0_level_0,POWER,TOTALDEMAND,TEMPERATURE,FORECASTDEMAND,Quarter,Month,Day,Year,Weekday_ENC,Season_ENC,DAYTYPE_ENC
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-03-06 09:30:00,427.281,8051.25,16.3,8432.04619,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 10:00:00,495.195,8062.84,16.3,8413.074419,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 10:30:00,516.137,8058.62,16.3,8367.147273,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 11:00:00,549.542,7927.81,16.9,8334.082667,1.0,3.0,6.0,2018.0,5,0,0
2018-03-06 11:30:00,556.764,7696.73,17.7,8289.618478,1.0,3.0,6.0,2018.0,5,0,0


In [50]:
#Standardise the data
target_mean = df_train[target].mean()
target_stdev = df_train[target].std()

for c in df_train.columns:
    mean = df_train[c].mean()
    stdev = df_train[c].std()

    df_train[c] = (df_train[c] - mean) / stdev
    df_test[c] = (df_test[c] - mean) / stdev

In [51]:
# Sneak preview into the train dataset after standardising
df_test

Unnamed: 0_level_0,POWER,TOTALDEMAND,TEMPERATURE,FORECASTDEMAND,Quarter,Month,Day,Year,Weekday_ENC,Season_ENC,DAYTYPE_ENC
DATETIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-08-01 00:00:00,-0.706466,-0.421897,0.924306,0.749745,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 00:30:00,-0.706466,-0.561306,0.908016,0.584319,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 00:30:00,-0.706466,-0.561306,0.908016,0.584319,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 01:00:00,-0.706466,-0.715796,0.859148,0.413267,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
2020-08-01 01:00:00,-0.706466,-0.715796,0.859148,0.413267,0.537087,0.531431,-1.655114,1.433035,-0.499781,1.294412,0.989862
...,...,...,...,...,...,...,...,...,...,...,...
2021-03-17 22:00:00,-0.706466,-0.426195,0.370468,-0.513804,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862
2021-03-17 22:30:00,-0.706466,-0.427695,0.337889,-0.555658,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862
2021-03-17 23:00:00,-0.706466,-0.533008,0.272732,-0.632274,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862
2021-03-17 23:30:00,-0.706466,-0.625692,0.223864,-0.739867,-1.339586,-1.007903,0.149977,2.748372,1.494982,-1.223032,0.989862


In [52]:
# Create DataLoader (same parameters as above experiments)
torch.manual_seed(48)
batch_size = 4
sequence_length = 30

train_dataset = SequenceDataset(df_train, target, features, sequence_length)
test_dataset = SequenceDataset(df_test, target, features, sequence_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [53]:
# Set the parameters
learning_rate = 0.001
num_hidden_units = 16

model = TimeSeriesLSTM(num_sensors=len(features), hidden_units=num_hidden_units).to(device)
loss_function = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [54]:
# Train the model for 5 epochs
epochs = 5
for ix_epoch in range(epochs):
    print(f"Epoch {ix_epoch}\n---------")
    train_model(train_loader, model, loss_function, optimizer, device)
    test_model(test_loader, model, loss_function, device)
    print()

Epoch 0
---------
Train loss: 0.28574527879479134
Test loss: 0.29847829809081056

Epoch 1
---------
Train loss: 0.18251625581469058
Test loss: 0.24903777595863327

Epoch 2
---------
Train loss: 0.15417903453060802
Test loss: 0.23749912757019553

Epoch 3
---------
Train loss: 0.1330129290231406
Test loss: 0.24102790039168465

Epoch 4
---------
Train loss: 0.1210874286139276
Test loss: 0.22204020160065405



In [55]:
# Perform evaluation

ystar_col = "Model forecast"
df_train[ystar_col] = predict(train_loader, model, device).cpu().numpy()
df_test[ystar_col] = predict(test_loader, model, device).cpu().numpy()

df_out = pd.concat((df_train, df_test))[[target, ystar_col]]
#df_out = df_test

for c in df_out.columns:
    df_out[c] = df_out[c] * target_stdev + target_mean

print(df_out[['TOTALDEMAND','Model forecast']])
df_out.to_csv("nsw_with_pv.csv",index=True)

                     TOTALDEMAND  Model forecast
DATETIME                                        
2018-03-06 09:30:00      8051.25     6497.984863
2018-03-06 10:00:00      8062.84     7393.920898
2018-03-06 10:30:00      8058.62     8549.027344
2018-03-06 11:00:00      7927.81     9285.481445
2018-03-06 11:30:00      7696.73     8258.131836
...                          ...             ...
2021-03-17 22:00:00      7419.77     6859.223633
2021-03-17 22:30:00      7417.91     7299.731934
2021-03-17 23:00:00      7287.32     6709.609863
2021-03-17 23:30:00      7172.39     8005.292969
2021-03-18 00:00:00      7094.51     6516.585449

[53106 rows x 2 columns]
