In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F # activation function ReLU
from torch.optim import Adam # Faster than SGD

import lightning as L
from torch.utils.data import TensorDataset, DataLoader

# import matplotlib.pyplot as plt
# import seaborn as sns

# Parameters: training inputs and labels

In [9]:
# Two toy sequences of four values each; the rows are the sequences that the
# later cells print as "Company A" and "Company B".
inputs = torch.tensor(
    [
        [0., 0.5, 0.25, 1.],  # Company A
        [1., 0.5, 0.25, 1.],  # Company B
    ]
)
# One target value per sequence, in the same row order as `inputs`.
labels = torch.tensor([0., 1.])

# Pair every sequence with its label, then serve the pairs with the
# DataLoader defaults (no extra arguments are passed).
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset=dataset)

# Basic LSTM

In [12]:
class LSTMbyHand(L.LightningModule):
    """A single LSTM unit written out by hand with twelve scalar parameters.

    Parameter naming: ``w`` = weight, ``b`` = bias. A weight ending in ``1``
    multiplies the short-term memory and a weight ending in ``2`` multiplies
    the current input value. The middle letters name the stage the parameter
    belongs to in ``lstm_unit``:

    * ``lr`` - percentage of the long-term memory to remember
    * ``pr`` - percentage of the potential memory to remember
    * ``p``  - the potential (candidate) long-term memory
    * ``o``  - percentage of the output to pass on as short-term memory
    """

    def __init__(self):
        """Create the parameters: weights drawn from N(0, 1), biases set to 0."""
        super().__init__()
        mean = torch.tensor(0.)
        std = torch.tensor(1.)

        # NOTE: the order of these assignments fixes both the order of the
        # random draws and the parameter registration order, so keep it stable.
        self.wlr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wlr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.blr1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        self.wpr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wpr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bpr1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

    def lstm_unit(self, input_value, long_memory, short_memory):
        """Run one LSTM step.

        Args:
            input_value: the current element of the input sequence.
            long_memory: long-term memory carried in from the previous step.
            short_memory: short-term memory carried in from the previous step.

        Returns:
            A two-element list ``[updated_long_memory, updated_short_memory]``.
        """
        # Fraction of the incoming long-term memory that is kept.
        long_remember_percent = torch.sigmoid(
            (short_memory * self.wlr1) + (input_value * self.wlr2) + self.blr1
        )

        # Fraction of the new candidate memory that is added.
        potential_remember_percent = torch.sigmoid(
            (short_memory * self.wpr1) + (input_value * self.wpr2) + self.bpr1
        )

        # Candidate long-term memory, squashed into (-1, 1) by tanh.
        potential_memory = torch.tanh(
            (short_memory * self.wp1) + (input_value * self.wp2) + self.bp1
        )

        updated_long_memory = (
            (long_memory * long_remember_percent)
            + (potential_memory * potential_remember_percent)
        )

        # Fraction of the (tanh-squashed) long-term memory that becomes the
        # new short-term memory.
        output_percent = torch.sigmoid(
            (short_memory * self.wo1) + (input_value * self.wo2) + self.bo1
        )

        updated_short_memory = torch.tanh(updated_long_memory) * output_percent

        return [updated_long_memory, updated_short_memory]

    def forward(self, input):
        """Unroll the LSTM unit over ``input`` and return the last short-term memory.

        Args:
            input: a sequence of scalar values (e.g. a 1-D tensor). Every
                element is fed through ``lstm_unit`` in order, so the sequence
                may have any length; both memories start at 0.

        Returns:
            The short-term memory after the final element. For an empty
            sequence this is the initial zero tensor.
        """
        # Start both memories as zero tensors that live on the same device and
        # have the same dtype as the parameters.
        long_memory = self.wlr1.new_zeros(())
        short_memory = self.wlr1.new_zeros(())

        for input_value in input:
            long_memory, short_memory = self.lstm_unit(input_value, long_memory, short_memory)

        return short_memory

    def configure_optimizers(self):
        """Optimise every parameter with Adam using its default settings."""
        return Adam(self.parameters())

    def training_step(self, batch, batch_idx):
        """Compute and log the squared error for one batch.

        Only the first sequence in the batch (``input_i[0]``) is evaluated.
        NOTE(review): this assumes each batch holds a single (input, label)
        pair, which is what a DataLoader with default arguments yields.
        """
        input_i, label_i = batch
        # Call the module itself rather than ``.forward`` so that any
        # registered nn.Module hooks are run.
        output_i = self(input_i[0])
        loss = (output_i - label_i) ** 2
        self.log('train_loss', loss)

        # Log the prediction under a separate key per label value.
        if label_i == 0:
            self.log('out_0', output_i)
        else:
            self.log('out_1', output_i)
        return loss

In [13]:
# Baseline: predictions of the freshly initialised (untrained) hand-built LSTM.
model = LSTMbyHand()
print("\nNow let's compare the observed and predicted values...")
print(f"Company A: Observed: 0, Predicted: {model(torch.tensor([0.,0.5,0.25,1.])).detach()}")
# Company B's label in the training data is 1, so report 1 as the observed value.
print(f"Company B: Observed: 1, Predicted: {model(torch.tensor([1.,0.5,0.25,1.])).detach()}")


Now let's compare the observed and predicted values...
Company A: Observed: 0, Predicted: -0.44012928009033203
Company B: Observed: 0, Predicted: -0.6451089382171631


In [14]:
# Train the hand-built LSTM for 2000 epochs.
# log_every_n_steps=2 matches the setting used for the LightningLSTM trainer
# later in this notebook; each epoch here has only two batches, far fewer than
# Lightning's default logging interval.
trainer = L.Trainer(max_epochs=2000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataloader)
print("\nNow let's compare the observed and predicted values...")
print(f"Company A: Observed: 0, Predicted: {model(torch.tensor([0.,0.5,0.25,1.])).detach()}")
# Company B's label in the training data is 1, so report 1 as the observed value.
print(f"Company B: Observed: 1, Predicted: {model(torch.tensor([1.,0.5,0.25,1.])).detach()}")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)
0         Modules in train mode
0         Modules in eval mode


Epoch 1999: 100%|██████████| 2/2 [00:00<00:00, 147.34it/s, v_num=7]

`Trainer.fit` stopped: `max_epochs=2000` reached.


Epoch 1999: 100%|██████████| 2/2 [00:00<00:00, 102.17it/s, v_num=7]

Now let's compare the observed and predicted values...
Company A: Observed: 0, Predicted: 0.5057027339935303
Company B: Observed: 0, Predicted: 0.5075316429138184


In [16]:
# Resume from the checkpoint recorded by the previous trainer's checkpoint
# callback. `max_epochs` is the total epoch count, so training continues from
# the restored epoch until epoch 5000 is reached.
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path
# log_every_n_steps=2 matches the setting used for the LightningLSTM trainer
# later in this notebook (each epoch here has only two batches).
trainer = L.Trainer(max_epochs=5000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataloader, ckpt_path=path_to_best_checkpoint)
print("\nNow let's compare the observed and predicted values...")
print(f"Company A: Observed: 0, Predicted: {model(torch.tensor([0.,0.5,0.25,1.])).detach()}")
# Company B's label in the training data is 1, so report 1 as the observed value.
print(f"Company B: Observed: 1, Predicted: {model(torch.tensor([1.,0.5,0.25,1.])).detach()}")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/yifanli/Github/Machine_Learning/code/transformer/lightning_logs/version_7/checkpoints/epoch=1999-step=4000.ckpt
/opt/anaconda3/envs/machine_learning/lib/python3.13/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:362: The dirpath has changed from '/Users/yifanli/Github/Machine_Learning/code/transformer/lightning_logs/version_7/checkpoints' to '/Users/yifanli/Github/Machine_Learning/code/transformer/lightning_logs/version_8/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-t

Epoch 4999: 100%|██████████| 2/2 [00:00<00:00, 133.10it/s, v_num=8]

`Trainer.fit` stopped: `max_epochs=5000` reached.


Epoch 4999: 100%|██████████| 2/2 [00:00<00:00, 91.64it/s, v_num=8] 

Now let's compare the observed and predicted values...
Company A: Observed: 0, Predicted: 0.42677316069602966
Company B: Observed: 0, Predicted: 0.5205246210098267


In [17]:
# Resume once more from the checkpoint recorded by the previous trainer's
# checkpoint callback. `max_epochs` is the total epoch count, so training
# continues from the restored epoch until epoch 10000 is reached.
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path
# log_every_n_steps=2 matches the setting used for the LightningLSTM trainer
# later in this notebook (each epoch here has only two batches).
trainer = L.Trainer(max_epochs=10000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataloader, ckpt_path=path_to_best_checkpoint)
print("\nNow let's compare the observed and predicted values...")
print(f"Company A: Observed: 0, Predicted: {model(torch.tensor([0.,0.5,0.25,1.])).detach()}")
# Company B's label in the training data is 1, so report 1 as the observed value.
print(f"Company B: Observed: 1, Predicted: {model(torch.tensor([1.,0.5,0.25,1.])).detach()}")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/yifanli/Github/Machine_Learning/code/transformer/lightning_logs/version_8/checkpoints/epoch=4999-step=10000.ckpt
/opt/anaconda3/envs/machine_learning/lib/python3.13/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:362: The dirpath has changed from '/Users/yifanli/Github/Machine_Learning/code/transformer/lightning_logs/version_8/checkpoints' to '/Users/yifanli/Github/Machine_Learning/code/transformer/lightning_logs/version_9/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-

Epoch 9999: 100%|██████████| 2/2 [00:00<00:00, 149.20it/s, v_num=9]

`Trainer.fit` stopped: `max_epochs=10000` reached.


Epoch 9999: 100%|██████████| 2/2 [00:00<00:00, 101.78it/s, v_num=9]

Now let's compare the observed and predicted values...
Company A: Observed: 0, Predicted: 0.00012457992124836892
Company B: Observed: 0, Predicted: 0.9881858825683594


# Lightning LSTM

In [21]:
class LightningLSTM(L.LightningModule):
    """The same one-feature, one-hidden-unit LSTM, built on ``nn.LSTM``."""

    def __init__(self):
        """Create an ``nn.LSTM`` with one input feature and one hidden unit."""
        super().__init__()

        self.lstm = nn.LSTM(input_size=1, hidden_size=1)

    def forward(self, input):
        """Run the sequence through the LSTM and return the last step's output.

        Args:
            input: tensor holding one sequence of scalar values; it is viewed
                as ``(len(input), 1)``, i.e. one row per step and one feature
                per row, before being passed to ``nn.LSTM``.

        Returns:
            The final row of the LSTM output (the output for the last step).
        """
        input_trans = input.view(len(input), 1)
        # The second return value (final hidden and cell states) is not needed.
        lstm_out, _ = self.lstm(input_trans)
        prediction = lstm_out[-1]
        return prediction

    def configure_optimizers(self):
        """Optimise every parameter with Adam at a learning rate of 0.01."""
        return Adam(self.parameters(), lr=0.01)

    def training_step(self, batch, batch_idx):
        """Compute and log the squared error for one batch.

        Only the first sequence in the batch (``input_i[0]``) is evaluated.
        NOTE(review): this assumes each batch holds a single (input, label)
        pair, which is what a DataLoader with default arguments yields.
        """
        input_i, label_i = batch
        # Call the module itself rather than ``.forward`` so that any
        # registered nn.Module hooks are run.
        output_i = self(input_i[0])
        loss = (output_i - label_i) ** 2
        self.log('train_loss', loss)

        # Log the prediction under a separate key per label value.
        if label_i == 0:
            self.log('out_0', output_i)
        else:
            self.log('out_1', output_i)
        return loss

In [22]:
# Baseline: predictions of the freshly initialised (untrained) nn.LSTM model.
model = LightningLSTM()
print("\nNow let's compare the observed and predicted values...")
print(f"Company A: Observed: 0, Predicted: {model(torch.tensor([0.,0.5,0.25,1.])).detach()}")
# Company B's label in the training data is 1, so report 1 as the observed value.
print(f"Company B: Observed: 1, Predicted: {model(torch.tensor([1.,0.5,0.25,1.])).detach()}")


Now let's compare the observed and predicted values...
Company A: Observed: 0, Predicted: tensor([0.2412])
Company B: Observed: 0, Predicted: tensor([0.2079])


In [24]:
# Train the nn.LSTM-based model for 1000 epochs, logging every 2 steps
# (each epoch has two batches).
trainer = L.Trainer(max_epochs=1000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataloader)
print("\nNow let's compare the observed and predicted values...")
print(f"Company A: Observed: 0, Predicted: {model(torch.tensor([0.,0.5,0.25,1.])).detach()}")
# Company B's label in the training data is 1, so report 1 as the observed value.
print(f"Company B: Observed: 1, Predicted: {model(torch.tensor([1.,0.5,0.25,1.])).detach()}")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type | Params | Mode 
--------------------------------------
0 | lstm | LSTM | 16     | train
--------------------------------------
16        Trainable params
0         Non-trainable params
16        Total params
0.000     Total estimated model params size (MB)
1         Modules in train mode
0         Modules in eval mode


Epoch 999: 100%|██████████| 2/2 [00:00<00:00, 279.60it/s, v_num=11]

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 999: 100%|██████████| 2/2 [00:00<00:00, 192.53it/s, v_num=11]

Now let's compare the observed and predicted values...
Company A: Observed: 0, Predicted: tensor([0.0174])
Company B: Observed: 0, Predicted: tensor([0.9565])
