In [4]:
# Reloads all imported modules every time a cell is run.
%load_ext autoreload
%autoreload 2

from time import time

import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.5f}'.format

from IPython.display import display

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Neural Networks
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
# from pytorch_lightning.loggers.csv_logs import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import sys
sys.path.append("../scripts/")
import data_loader as dl

In [6]:
# List every file found beneath the local dataset folder.
# Running this cell (click run or press Shift+Enter) prints one full path per line.
import os
for folder, _subdirs, names in os.walk('C:/Users/yagne/Downloads/household_power_consumption.txt/'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

C:/Users/yagne/Downloads/household_power_consumption.txt/household_power_consumption.txt


In [7]:
class TimeseriesDataset(Dataset):
    '''
    Custom Dataset subclass exposing a rolling-window view over X and y.

    Sample ``i`` is the pair ``(X[i:i + seq_len], y[i + seq_len - 1])``:
    a window of `seq_len` consecutive rows and the target aligned with the
    window's last row. For a 2-D `X`, a DataLoader using this dataset will
    output batches of `(batch_size, seq_len, n_features)` shape,
    suitable as an input to RNNs.

    Args:
        X: feature rows in time order; stored as a float tensor.
        y: targets aligned row-for-row with `X`; stored as a float tensor.
        seq_len: number of consecutive rows per window (must be >= 1).

    Raises:
        ValueError: if `seq_len` is smaller than 1.
    '''
    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 1):
        # A window of zero or negative length would yield empty inputs paired
        # with an unrelated target, so reject it up front.
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self):
        # Number of complete windows. Clamp at zero so a series shorter than
        # one window is reported as empty instead of making len() raise on a
        # negative value.
        return max(0, len(self.X) - (self.seq_len - 1))

    def __getitem__(self, index):
        n_windows = len(self)
        # Support Python-style negative indices (ds[-1] is the last window).
        if index < 0:
            index += n_windows
        if not 0 <= index < n_windows:
            raise IndexError(
                f"window index out of range for dataset with {n_windows} windows"
            )
        stop = index + self.seq_len
        return (self.X[index:stop], self.y[stop - 1])

In [8]:
class PowerConsumptionDataModule(pl.LightningDataModule):
    '''
    PyTorch Lightning DataModule subclass:
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html

    Serves the purpose of aggregating all data loading
      and processing work in one place: reading the CSV, resampling,
      building the next-step target, splitting chronologically, scaling
      the features and wrapping each split in a windowed DataLoader.

    Args:
        path: location of the semicolon-separated CSV file.
        seq_len: window length handed to the timeseries dataset.
        batch_size: batch size used by all three dataloaders.
        num_workers: worker processes used by all three dataloaders.
    '''

    def __init__(self, path, seq_len = 1, batch_size = 128, num_workers=0):
        super().__init__()
        self.path = path
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
        # Split arrays are filled in lazily by `setup`.
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.y_test = None
        # Feature column names and the scaler fitted on the training split,
        # both populated by `setup`.
        self.columns = None
        self.preprocessing = None

    def prepare_data(self):
        pass

    def setup(self, stage=None):
        '''
        Load the CSV and populate the split arrays required by `stage`.

        Data is resampled to hourly intervals.
        Both 'nan' and '?' are read as missing values.
        'Date' and 'Time' columns are merged into 'dt' index.

        Args:
            stage: 'fit' or 'validate' prepares train/val arrays, 'test'
                prepares the test arrays, `None` prepares all of them.
                Splits that are already populated are not rebuilt.
        '''
        fit_like = stage in ('fit', 'validate')

        # Skip the (expensive) reload when the requested splits already exist.
        if fit_like and self.X_train is not None:
            return
        if stage == 'test' and self.X_test is not None:
            return
        if stage is None and self.X_train is not None and self.X_test is not None:
            return

        # NOTE(review): `infer_datetime_format` and dict-style `parse_dates`
        # are reported as deprecated by recent pandas releases - confirm
        # against the installed version before upgrading.
        df = pd.read_csv(
            self.path,
            sep=';',
            parse_dates={'dt' : ['Date', 'Time']},
            infer_datetime_format=True,
            low_memory=False,
            na_values=['nan','?'],
            index_col='dt'
        )

        df_resample = df.resample('h').mean()

        # Hours with any missing value are dropped *before* the target is
        # built, so across such a gap "next row" is the next remaining hour.
        X = df_resample.dropna().copy()
        # Target = next row's Global_active_power. The final row has no
        # successor, so ffill repeats the preceding target value there.
        y = X['Global_active_power'].shift(-1).ffill()
        self.columns = X.columns

        # Chronological split (no shuffling): the last 20% of rows are the
        # test set, then 25% of the remainder is validation -> 60/20/20.
        X_cv, X_test, y_cv, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False
        )

        X_train, X_val, y_train, y_val = train_test_split(
            X_cv, y_cv, test_size=0.25, shuffle=False
        )

        # Fit the scaler on training rows only and keep it on the instance so
        # the same transformation can be reused later. Targets stay unscaled.
        preprocessing = StandardScaler()
        preprocessing.fit(X_train)
        self.preprocessing = preprocessing

        if fit_like or stage is None:
            self.X_train = preprocessing.transform(X_train)
            self.y_train = y_train.values.reshape((-1, 1))
            self.X_val = preprocessing.transform(X_val)
            self.y_val = y_val.values.reshape((-1, 1))

        if stage == 'test' or stage is None:
            self.X_test = preprocessing.transform(X_test)
            self.y_test = y_test.values.reshape((-1, 1))

    def _windowed_loader(self, X, y):
        '''Wrap one split in a rolling-window dataset and an unshuffled DataLoader.'''
        # The dataset class comes from the imported `data_loader` module (`dl`).
        # NOTE(review): presumably so DataLoader worker processes can import
        # it - confirm before swapping in a notebook-defined class.
        dataset = dl.TimeseriesDataset(X, y, seq_len=self.seq_len)
        return DataLoader(dataset,
                          batch_size = self.batch_size,
                          shuffle = False,
                          num_workers = self.num_workers)

    def train_dataloader(self):
        '''Return the DataLoader over the training split.'''
        return self._windowed_loader(self.X_train, self.y_train)

    def val_dataloader(self):
        '''Return the DataLoader over the validation split.'''
        return self._windowed_loader(self.X_val, self.y_val)

    def test_dataloader(self):
        '''Return the DataLoader over the test split.'''
        return self._windowed_loader(self.X_test, self.y_test)

In [9]:
class LSTMRegressor(pl.LightningModule):
    '''
    LSTM followed by a single linear output unit, packaged as a
    PyTorch Lightning module:
    https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html

    Args:
        n_features: input size of the LSTM.
        hidden_size: hidden size of the LSTM and input size of the linear head.
        seq_len: stored on the instance; not used by the layers themselves.
        batch_size: stored on the instance; not used by the layers themselves.
        num_layers: number of stacked LSTM layers.
        dropout: dropout value passed to `nn.LSTM`.
        learning_rate: learning rate handed to the Adam optimizer.
        criterion: callable computing the loss from `(prediction, target)`.
    '''
    def __init__(self,
                 n_features,
                 hidden_size,
                 seq_len,
                 batch_size,
                 num_layers,
                 dropout,
                 learning_rate,
                 criterion):
        super().__init__()

        # Keep every constructor argument on the instance.
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.criterion = criterion
        self.learning_rate = learning_rate

        # batch_first=True: inputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        '''Predict one value per sample from the LSTM output at the last time step.'''
        # sequence_out = (batch_size, seq_len, hidden_size); hidden/cell states are unused
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1]
        return self.linear(last_step)

    def configure_optimizers(self):
        '''Return an Adam optimizer over all parameters.'''
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def _batch_loss(self, batch):
        '''Run the model on one `(x, y)` batch and return the criterion value.'''
        inputs, targets = batch
        predictions = self(inputs)
        return self.criterion(predictions, targets)

    def training_step(self, batch, batch_idx):
        '''Compute, log and return the training loss for one batch.'''
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        '''Compute, log and return the validation loss for one batch.'''
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        '''Compute, log and return the test loss for one batch.'''
        loss = self._batch_loss(batch)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

In [11]:
# All parameters are aggregated in one place.
# This is useful for reporting experiment params to experiment tracking software.
p = dict(
    seq_len = 24,        # rows per input window
    batch_size = 1024,
    criterion = nn.MSELoss(),
    num_workers = 4,     # DataLoader worker processes
    max_epochs = 50,
    n_features = 7,      # LSTM input size; must match the number of feature columns
    hidden_size = 128,
    num_layers = 2,
    dropout = 0.2,
    learning_rate = 0.001,
    # Dataset location. Set the POWER_CONSUMPTION_CSV environment variable to
    # run the notebook on another machine; the default keeps the original path.
    path = os.environ.get(
        "POWER_CONSUMPTION_CSV",
        "C:/Users/yagne/Downloads/household_power_consumption.txt/household_power_consumption.txt",
    ),
)

In [2]:
%load_ext tensorboard
%tensorboard --logdir ./lstm

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 15684), started 0:00:23 ago. (Use '!kill 15684' to kill it.)

In [12]:
# seed_everything(1)

# csv_logger = CSVLogger('./', name='lstm'),

# Keep only the single best checkpoint, ranked by the lowest logged `val_loss`.
checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename = "best-checkpoint",
    save_top_k=1,
    verbose =True,
    monitor = "val_loss",
    mode="min"
)


logger = TensorBoardLogger("lstm", name="kaggle_example")
# Stop training once `val_loss` has not improved for 10 validation checks.
early_stopping_callback = EarlyStopping(monitor="val_loss", patience = 10)

trainer = Trainer(
    max_epochs=p['max_epochs'],
    logger=logger,
    # Use one GPU when CUDA is present, otherwise fall back to CPU instead of
    # failing at start-up.
    # NOTE(review): newer Lightning releases replace `gpus` with
    # `accelerator`/`devices` - confirm against the installed version.
    gpus=1 if torch.cuda.is_available() else 0,
    # Callback instances belong in `callbacks`. Handing the ModelCheckpoint to
    # the deprecated `checkpoint_callback=` argument triggers a deprecation
    # warning and risks the configured checkpoint not being registered.
    callbacks = [checkpoint_callback, early_stopping_callback],
)

model = LSTMRegressor(
    n_features = p['n_features'],
    hidden_size = p['hidden_size'],
    seq_len = p['seq_len'],
    batch_size = p['batch_size'],
    criterion = p['criterion'],
    num_layers = p['num_layers'],
    dropout = p['dropout'],
    learning_rate = p['learning_rate']
)

dm = PowerConsumptionDataModule(
    path=p["path"],
    seq_len = p['seq_len'],
    batch_size = p['batch_size'],
    num_workers = p["num_workers"]
)


  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [17]:
# Build all split arrays (setup's default stage is None), then display the
# first batch produced by the validation dataloader.
dm.setup()
next(iter(dm.val_dataloader()))

  rank_zero_deprecation(


[tensor([[[ 0.5238,  1.3351, -0.8441,  ..., -0.3239, -0.3128,  1.7411],
          [-0.1272,  0.4233, -0.4469,  ..., -0.3239, -0.1902, -0.0191],
          [-0.4293, -0.2587, -0.0095,  ..., -0.3239, -0.3128, -0.0624],
          ...,
          [ 0.3451, -0.9257, -0.2844,  ..., -0.3239, -0.3128,  1.7843],
          [ 0.4715,  0.8245, -0.4566,  ..., -0.3239, -0.1938,  1.7661],
          [ 0.4700,  1.7367,  0.5705,  ..., -0.3239, -0.3128,  1.7047]],
 
         [[-0.1272,  0.4233, -0.4469,  ..., -0.3239, -0.1902, -0.0191],
          [-0.4293, -0.2587, -0.0095,  ..., -0.3239, -0.3128, -0.0624],
          [-0.4399,  0.1564,  0.1259,  ..., -0.3239, -0.1902, -0.0191],
          ...,
          [ 0.4715,  0.8245, -0.4566,  ..., -0.3239, -0.1938,  1.7661],
          [ 0.4700,  1.7367,  0.5705,  ..., -0.3239, -0.3128,  1.7047],
          [ 1.0761,  2.8870,  0.0285,  ...,  1.8750, -0.2010,  1.7888]],
 
         [[-0.4293, -0.2587, -0.0095,  ..., -0.3239, -0.3128, -0.0624],
          [-0.4399,  0.1564,

In [16]:
# metrics = pd.read_csv('./lstm/version_2/metrics.csv')
# train_loss = metrics[['train_loss', 'step', 'epoch']][~np.isnan(metrics['train_loss'])]
# val_loss = metrics[['val_loss', 'epoch']][~np.isnan(metrics['val_loss'])]
# test_loss = metrics['test_loss'].iloc[-1]

# fig, axes = plt.subplots(1, 2, figsize=(16, 5), dpi=100)
# axes[0].set_title('Train loss per batch')
# axes[0].plot(train_loss['step'], train_loss['train_loss'])
# axes[1].set_title('Validation loss per epoch')
# axes[1].plot(val_loss['epoch'], val_loss['val_loss'], color='orange')
# plt.show(block = True)

# print('MSE:')
# print(f"Train loss: {train_loss['train_loss'].iloc[-1]:.3f}")
# print(f"Val loss:   {val_loss['val_loss'].iloc[-1]:.3f}")
# print(f'Test loss:  {test_loss:.3f}')



In [11]:
# Pull the first batch from the training dataloader for inspection.
a = next(iter(dm.train_dataloader()))

In [12]:
# Show the tensor sizes of the two elements of the batch held in `a`.
print("X: ", a[0].size())
print("y: ", a[1].size())

X:  torch.Size([128, 24, 7])
y:  torch.Size([128, 1])


In [17]:
# Load the raw semicolon-separated readings into a frame indexed by timestamp.
df = pd.read_csv(
            p["path"], 
            sep=';', 
            # merge the 'Date' and 'Time' columns into a single parsed 'dt' column
            parse_dates={'dt' : ['Date', 'Time']}, 
            infer_datetime_format=True, 
            low_memory=False, 
            # both the literal 'nan' and '?' are read as missing values
            na_values=['nan','?'], 
            index_col='dt'
        )

In [18]:
df.shape

(2075259, 7)

In [19]:
df.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [20]:
# Row count of the raw frame with thousands separators, derived from `df`
# itself rather than from a literal copied out of an earlier cell's output.
print(f"{len(df):,}")

2,075,259


In [21]:
# Aggregate the readings into hourly bins, taking the mean of each column.
df_resample = df.resample('h').mean()

In [23]:
df_resample.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:00:00,4.22289,0.229,234.64389,18.1,0.0,0.52778,16.86111
2006-12-16 18:00:00,3.6322,0.08003,234.58017,15.6,0.0,6.71667,16.86667
2006-12-16 19:00:00,3.40023,0.08523,233.2325,14.50333,0.0,1.43333,16.68333
2006-12-16 20:00:00,3.26857,0.0751,234.0715,13.91667,0.0,0.0,16.78333
2006-12-16 21:00:00,3.05647,0.07667,237.15867,13.04667,0.0,0.41667,17.21667
