In [35]:
!pip install pytorch_lightning



In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from collections import defaultdict

In [38]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

tqdm.pandas()

In [39]:
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


42

In [40]:
data_path = './drive/MyDrive/git_hub_repo/CNX_load_forecasting/data_preparation_outputs/datasets/load_weather_programs_df.csv'

In [41]:
df = pd.read_csv(data_path, parse_dates=['intervalStart'])
df.head()

Unnamed: 0.1,Unnamed: 0,intervalStart,Connexus_kWh,unixTime,latitude,longitude,station,apparentTemperature,cloudCover,dewPoint,...,windGust,windSpeed,DVR_duration_mins,CampusGen_duration_mins,CIGen_duration_mins,ACST_duration_mins,PTR_duration_mins,Interruptible Irrigation_duration_mins,Cycled Air Conditioning_duration_mins,Interruptible Water Heating_duration_mins
0,0,2017-01-01 01:00:00,194634.4219,1483254000.0,45.395556,-93.386667,0.0,18.93,0.39,19.23,...,10.51,5.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2017-01-01 02:00:00,185003.5234,1483258000.0,45.395556,-93.386667,0.0,20.16,0.0,18.98,...,8.88,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2017-01-01 03:00:00,179523.5938,1483261000.0,45.395556,-93.386667,0.0,20.4,0.04,18.58,...,6.34,3.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2017-01-01 04:00:00,178223.6797,1483265000.0,45.395556,-93.386667,0.0,24.46,0.2,18.62,...,5.83,2.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2017-01-01 05:00:00,180018.2031,1483268000.0,45.395556,-93.386667,0.0,23.61,0.11,18.69,...,3.13,1.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
df.shape

(58634, 33)

### Preprocessing

In [43]:
df['prev_Connexus_kWh'] = df.shift(1)['Connexus_kWh']

In [44]:
df['Connexus_kWh_change'] = df.progress_apply(
    lambda row : 0 if np.isnan(row.prev_Connexus_kWh) else (row.Connexus_kWh - row.prev_Connexus_kWh),
    axis=1
)

  0%|          | 0/58634 [00:00<?, ?it/s]

In [45]:
df.head()

Unnamed: 0.1,Unnamed: 0,intervalStart,Connexus_kWh,unixTime,latitude,longitude,station,apparentTemperature,cloudCover,dewPoint,...,DVR_duration_mins,CampusGen_duration_mins,CIGen_duration_mins,ACST_duration_mins,PTR_duration_mins,Interruptible Irrigation_duration_mins,Cycled Air Conditioning_duration_mins,Interruptible Water Heating_duration_mins,prev_Connexus_kWh,Connexus_kWh_change
0,0,2017-01-01 01:00:00,194634.4219,1483254000.0,45.395556,-93.386667,0.0,18.93,0.39,19.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,1,2017-01-01 02:00:00,185003.5234,1483258000.0,45.395556,-93.386667,0.0,20.16,0.0,18.98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,194634.4219,-9630.8985
2,2,2017-01-01 03:00:00,179523.5938,1483261000.0,45.395556,-93.386667,0.0,20.4,0.04,18.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,185003.5234,-5479.9296
3,3,2017-01-01 04:00:00,178223.6797,1483265000.0,45.395556,-93.386667,0.0,24.46,0.2,18.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,179523.5938,-1299.9141
4,4,2017-01-01 05:00:00,180018.2031,1483268000.0,45.395556,-93.386667,0.0,23.61,0.11,18.69,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,178223.6797,1794.5234


In [46]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
  row_data = dict()

  for column in df.columns:
    if column == 'intervalStart':
      row_data['day_of_week'] = row.intervalStart.dayofweek
      row_data['day_of_month'] = row.intervalStart.day
      row_data['week_of_year'] = row.intervalStart.week
      row_data['month'] = row.intervalStart.month

    else:
      if column!='prev_Connexus_kWh':
        row_data[str(column)] = row[str(column)]


  rows.append(row_data)

features_df = pd.DataFrame(rows)
features_df.head()

  0%|          | 0/58634 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Connexus_kWh,unixTime,latitude,longitude,station,...,windSpeed,DVR_duration_mins,CampusGen_duration_mins,CIGen_duration_mins,ACST_duration_mins,PTR_duration_mins,Interruptible Irrigation_duration_mins,Cycled Air Conditioning_duration_mins,Interruptible Water Heating_duration_mins,Connexus_kWh_change
0,0,6,1,52,1,194634.4219,1483254000.0,45.395556,-93.386667,0.0,...,5.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,6,1,52,1,185003.5234,1483258000.0,45.395556,-93.386667,0.0,...,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-9630.8985
2,2,6,1,52,1,179523.5938,1483261000.0,45.395556,-93.386667,0.0,...,3.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5479.9296
3,3,6,1,52,1,178223.6797,1483265000.0,45.395556,-93.386667,0.0,...,2.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1299.9141
4,4,6,1,52,1,180018.2031,1483268000.0,45.395556,-93.386667,0.0,...,1.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1794.5234


In [47]:
features_df = features_df.drop(['Unnamed: 0'], axis=1)
features_df.shape

(58634, 36)

In [48]:
train_size = int(len(features_df)*0.9)
train_size

52770

In [49]:
train_df, test_df = features_df[:train_size], features_df[train_size+1:]
train_df.shape, test_df.shape


((52770, 36), (5863, 36))

In [50]:
train_df.to_csv('./drive/MyDrive/git_hub_repo/CNX_load_forecasting/data_preparation_outputs/datasets/train_df.csv')
test_df.to_csv('./drive/MyDrive/git_hub_repo/CNX_load_forecasting/data_preparation_outputs/datasets/test_df.csv')

In [28]:
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [29]:
train_df = pd.DataFrame(scaler.transform(train_df), columns=train_df.columns, index=train_df.index)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns, index=test_df.index)

In [None]:
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Connexus_kWh,unixTime,latitude,longitude,station,apparentTemperature,...,windSpeed,DVR_duration_mins,CampusGen_duration_mins,CIGen_duration_mins,ACST_duration_mins,PTR_duration_mins,Interruptible Irrigation_duration_mins,Cycled Air Conditioning_duration_mins,Interruptible Water Heating_duration_mins,Connexus_kWh_change
0,1.0,-1.0,0.961538,-1.0,-0.686765,-1.0,-1.0,-1.0,-1.0,-0.110541,...,-0.62998,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.20137
1,1.0,-1.0,0.961538,-1.0,-0.733464,-0.999962,-1.0,-1.0,-1.0,-0.094758,...,-0.702903,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.06801
2,1.0,-1.0,0.961538,-1.0,-0.760036,-0.999925,-1.0,-1.0,-1.0,-0.091679,...,-0.757596,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.125489
3,1.0,-1.0,0.961538,-1.0,-0.766339,-0.999887,-1.0,-1.0,-1.0,-0.039584,...,-0.805537,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.18337
4,1.0,-1.0,0.961538,-1.0,-0.757638,-0.999849,-1.0,-1.0,-1.0,-0.050491,...,-0.887914,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.226219


In [32]:
test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Connexus_kWh,unixTime,latitude,longitude,station,apparentTemperature,...,windSpeed,DVR_duration_mins,CampusGen_duration_mins,CIGen_duration_mins,ACST_duration_mins,PTR_duration_mins,Interruptible Irrigation_duration_mins,Cycled Air Conditioning_duration_mins,Interruptible Water Heating_duration_mins,Connexus_kWh_change
52771,-1.0,0.466667,-0.884615,-1.0,-0.348346,1.000075,-1.0,-1.0,-1.0,-0.373067,...,-0.367319,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.408625
52772,-1.0,0.466667,-0.884615,-1.0,-0.352196,1.000113,-1.0,-1.0,-1.0,-0.373067,...,-0.414585,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.190375
52773,-1.0,0.466667,-0.884615,-1.0,-0.343942,1.000151,-1.0,-1.0,-1.0,-0.377302,...,-0.272789,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.224942
52774,-1.0,0.466667,-0.884615,-1.0,-0.32657,1.000188,-1.0,-1.0,-1.0,-0.344069,...,-0.216745,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.250981
52775,-1.0,0.466667,-0.884615,-1.0,-0.325773,1.000226,-1.0,-1.0,-1.0,-0.254635,...,-0.395679,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.203646


In [None]:
# Drop features about ran programs

for i in train_df.columns:
  if '_mins' in i:
    train_df = train_df.drop(columns = [i])


for i in test_df.columns:
  if '_mins' in i:
    test_df = test_df.drop(columns = [i])

In [None]:
train_df.columns

Index(['day_of_week', 'day_of_month', 'week_of_year', 'month', 'Connexus_kWh',
       'unixTime', 'latitude', 'longitude', 'station', 'apparentTemperature',
       'cloudCover', 'dewPoint', 'humidity', 'icon', 'precipAccumulation',
       'precipIntensity', 'precipProbability', 'precipType', 'pressure',
       'snowAccumulation', 'snowIntensity', 'temperature', 'uvIndex',
       'visibility', 'windBearing', 'windGust', 'windSpeed',
       'Connexus_kWh_change'],
      dtype='object')

In [None]:
def create_sequences(input_data:pd.DataFrame, target_column, sequence_length):

  sequences = []
  data_size = len(input_data)

  for i in  tqdm(range(data_size - sequence_length)):
    sequence = input_data[i:i+sequence_length]

    label_position = i + sequence_length
    label = input_data.iloc[label_position][target_column]


    sequences.append((sequence, label))

  return sequences


In [None]:
# To find the best sequence_length, experiments with various value are needed.
sequence_length = 120

train_sequences = create_sequences(train_df, 'Connexus_kWh', sequence_length)
test_sequences = create_sequences(test_df, 'Connexus_kWh', sequence_length)


  0%|          | 0/52650 [00:00<?, ?it/s]

  0%|          | 0/5743 [00:00<?, ?it/s]

In [None]:
train_sequences[0][0].head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Connexus_kWh,unixTime,latitude,longitude,station,apparentTemperature,...,pressure,snowAccumulation,snowIntensity,temperature,uvIndex,visibility,windBearing,windGust,windSpeed,Connexus_kWh_change
0,1.0,-1.0,0.961538,-1.0,-0.686765,-1.0,-1.0,-1.0,-1.0,-0.110541,...,-0.163265,-1.0,-1.0,-0.121193,-1.0,1.0,0.247911,-0.576466,-0.62998,0.20137
1,1.0,-1.0,0.961538,-1.0,-0.733464,-0.999962,-1.0,-1.0,-1.0,-0.094758,...,-0.142857,-1.0,-1.0,-0.120117,-1.0,1.0,0.286908,-0.642152,-0.702903,0.06801
2,1.0,-1.0,0.961538,-1.0,-0.760036,-0.999925,-1.0,-1.0,-1.0,-0.091679,...,-0.122449,-1.0,-1.0,-0.130268,-1.0,1.0,0.309192,-0.744509,-0.757596,0.125489
3,1.0,-1.0,0.961538,-1.0,-0.766339,-0.999887,-1.0,-1.0,-1.0,-0.039584,...,-0.102041,-1.0,-1.0,-0.136881,-1.0,1.0,0.364903,-0.765061,-0.805537,0.18337
4,1.0,-1.0,0.961538,-1.0,-0.757638,-0.999849,-1.0,-1.0,-1.0,-0.050491,...,-0.071429,-1.0,-1.0,-0.149954,-1.0,1.0,0.325905,-0.873867,-0.887914,0.226219


In [None]:
test_sequences[0][0].head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,Connexus_kWh,unixTime,latitude,longitude,station,apparentTemperature,...,pressure,snowAccumulation,snowIntensity,temperature,uvIndex,visibility,windBearing,windGust,windSpeed,Connexus_kWh_change
52771,-1.0,0.466667,-0.884615,-1.0,-0.348346,1.000075,-1.0,-1.0,-1.0,-0.373067,...,-0.020408,-1.0,-1.0,-0.335743,-1.0,0.696,0.192201,-0.13238,-0.367319,0.408625
52772,-1.0,0.466667,-0.884615,-1.0,-0.352196,1.000113,-1.0,-1.0,-1.0,-0.373067,...,-0.010204,-1.0,-1.0,-0.344509,-1.0,0.914,0.192201,-0.245215,-0.414585,0.190375
52773,-1.0,0.466667,-0.884615,-1.0,-0.343942,1.000151,-1.0,-1.0,-1.0,-0.377302,...,-0.030612,-1.0,-1.0,-0.330667,-1.0,0.942,0.203343,-0.188797,-0.272789,0.224942
52774,-1.0,0.466667,-0.884615,-1.0,-0.32657,1.000188,-1.0,-1.0,-1.0,-0.344069,...,-0.040816,-1.0,-1.0,-0.294217,-1.0,0.928,0.348189,-0.250453,-0.216745,0.250981
52775,-1.0,0.466667,-0.884615,-1.0,-0.325773,1.000226,-1.0,-1.0,-1.0,-0.254635,...,-0.020408,-1.0,-1.0,-0.232082,-1.0,0.77,0.381616,-0.295587,-0.395679,0.203646


In [None]:
import pickle

with open('./drive/MyDrive/CNX_load_forecasting/data_preparation_outputs/datasets/train_sequences.pkl', 'wb') as sp:
  pickle.dump(train_sequences, sp)

with open('./drive/MyDrive/CNX_load_forecasting/data_preparation_outputs/datasets/test_sequences.pkl', 'wb') as fp:
  pickle.dump(test_sequences, fp)