# Long Short Term Memory Model with Lightning

Build a simeple LSTM model from scratch and use pytorch lightning. Inspired by Statquest.

In [1]:
# ! pip install tensorboard

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from optim import Adam
import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger

## From scratch

In [48]:
class LSTMbyHand(L.LightningModule):
    def __init__(self):
        super().__init__()
        
        L.seed_everything(seed=42)
        
        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)

        self.wlr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wlr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.blr1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        self.wpr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wpr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bpr1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.), requires_grad=True)
        
        
    def lstm_unit(self, input_value, long_memory, short_memory):
        long_remember_pct = torch.sigmoid(short_memory * self.wlr1 + input_value * self.wlr2 + self.blr1)
        
        potenital_remember_pct = torch.sigmoid(short_memory * self.wpr1 + input_value * self.wpr2 + self.bpr1)
        potential_memory = torch.tanh(short_memory * self.wp1 + input_value * self.wp2 + self.bp1)
        updated_long_memory = long_memory * long_remember_pct + potenital_remember_pct * potential_memory

        output_pct = torch.sigmoid(short_memory * self.wo1 + input_value * self.wo2 + self.bo1)
        updated_short_memory = torch.tanh(updated_long_memory) + output_pct

        return([updated_long_memory, updated_short_memory])

    def forward(self, input):
        long_memory = 0
        short_memory = 0
        day1 = input[0]
        day2 = input[1]
        day3 = input[2]
        day4 = input[3]

        long_memory, short_memory = self.lstm_unit(day1, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day2, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day3, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day4, long_memory, short_memory)

        return short_memory

    def configure_optimizers(self):
        return Adam(self.parameters())

    def training_step(self, batch, batch_idx):
        input_i, label_i = batch
        output_i = self.forward(input_i[0])
        loss = (output_i - label_i)**2

        self.log("train_loss", loss)
        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)
            
        return loss

In [49]:
model = LSTMbyHand()

print("Random initial weight for company A", model(torch.tensor([0., 0.5, 0.25, 1.])).detach(), "; Observed = 0")
print("Random initial weight for company B", model(torch.tensor([1., 0.5, 0.25, 1.])).detach(), "; Observed = 1")

Seed set to 42


Random initial weight for company A tensor(0.0388) ; Observed = 0
Random initial weight for company B tensor(0.0758) ; Observed = 1


### Training

In [50]:
inputs = torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])
labels = torch.tensor([0., 1.])

dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [51]:
trainer = L.Trainer(max_epochs=1500, logger=TensorBoardLogger("logs"))
trainer.fit(model, train_dataloaders=dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)
0         Modules in train mode
0         Modules in eval mode
/opt/anaconda3/envs/dl/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/anaconda3/envs/dl/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower va

Epoch 386:   0%|                                                                                                                                 | 0/2 [11:05<?, ?it/s, v_num=2]
Epoch 1499: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 67.61it/s, v_num=4]

`Trainer.fit` stopped: `max_epochs=1500` reached.


Epoch 1499: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.23it/s, v_num=4]


In [52]:
print("Model output for company A", model(torch.tensor([0., 0.5, 0.25, 1.])).detach(), "; Actual = 0")
print("Model output for company B", model(torch.tensor([1., 0.5, 0.25, 1.])).detach(), "; Actual = 1")

Model output for company A tensor(0.4703) ; Actual = 0
Model output for company B tensor(0.5022) ; Actual = 1


## Pytorch LSTM function

In [27]:
class LSTMbyTorch(L.LightningModule):
    def __init__(self):
        super().__init__()

        self.lstm = nn.LSTM(input_size=1, hidden_size=1)

    def forward(self, input):
        input_trans = input.view(len(input), 1)
        lstm_out, temp = self.lstm(input_trans)
        prediction = lstm_out[-1]
        
        return prediction

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batch_idx):
        input_i, label_i = batch
        output_i = self.forward(input_i[0])
        loss = (output_i - label_i)**2

        self.log("train loss", loss)
        if label_i == 0:
            self.log("output-0", output_i)
        else:
            self.log("output-1", output_i)
        
        return loss

In [28]:
model = LSTMbyTorch()

print("Random initial weight for company A", model(torch.tensor([0., 0.5, 0.25, 1.])).detach(), "; Observed = 0")
print("Random initial weight for company B", model(torch.tensor([1., 0.5, 0.25, 1.])).detach(), "; Observed = 1")

Random initial weight for company A tensor([-0.0727]) ; Observed = 0
Random initial weight for company B tensor([-0.0726]) ; Observed = 1


In [32]:
trainer = L.Trainer(max_epochs=300, log_every_n_steps=2, logger=TensorBoardLogger("logs"))
trainer.fit(model, train_dataloaders=dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type | Params | Mode 
--------------------------------------
0 | lstm | LSTM | 16     | train
--------------------------------------
16        Trainable params
0         Non-trainable params
16        Total params
0.000     Total estimated model params size (MB)
1         Modules in train mode
0         Modules in eval mode


Epoch 299: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44.70it/s, v_num=1]

`Trainer.fit` stopped: `max_epochs=300` reached.


Epoch 299: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 36.03it/s, v_num=1]


In [34]:
print("Model output for company A", model(torch.tensor([0., 0.5, 0.25, 1.])).detach(), "; Actual = 0")
print("Model output for company B", model(torch.tensor([1., 0.5, 0.25, 1.])).detach(), "; Actual = 1")

Model output for company A tensor([-1.9422e-06]) ; Actual = 0
Model output for company B tensor([0.9965]) ; Actual = 1


## Notes

The model build by hand is not training well, ad it trys to converge predict both samples as 0.5.