<a href="https://colab.research.google.com/github/whoami-Lory271/DL-project/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytorch-lightning --quiet
!pip install torchmetrics --quiet

In [None]:
!pip install -U --no-cache-dir gdown --pre

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from numpy import array
from sklearn.preprocessing import OneHotEncoder
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.normalization import LayerNorm
import pickle
import os
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
from pathlib import Path
import pandas as pd
import numpy as np
import csv
import torchmetrics
import pytorch_lightning as pl
from pytorch_lightning.callbacks.progress import TQDMProgressBar
import gdown

In [None]:
# Mount Google Drive: checkpoints and TensorBoard logs are written under
# /content/drive/MyDrive further down in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Helper Functions

In [None]:
def create_padding(encode, vocab, max_len):
    """Right-pad an encoded sequence with zeros up to a fixed length.

    Args:
        encode: Sequence of numeric token ids.
        vocab: Unused by this function; kept so existing callers keep working.
        max_len: Nominal maximum sequence length. The returned array has
            ``max_len + 2`` slots (presumably room for start/end markers --
            TODO confirm against the code that produced the encodings).

    Returns:
        A 1-D float64 numpy array of length ``max_len + 2`` containing the ids
        of ``encode`` followed by zeros.

    Raises:
        IndexError: If ``encode`` holds more than ``max_len + 2`` entries.
    """
    enc = np.asarray(encode)
    size = max_len + 2
    if len(enc) > size:
        # Same exception type the element-wise copy used to raise, but with a
        # message that names the actual problem.
        raise IndexError(
            f"encoded sequence of length {len(enc)} does not fit in {size} slots"
        )
    encoded = np.zeros(size)
    # Single slice assignment instead of a Python-level loop: this runs once
    # per sample, so the per-element overhead adds up quickly.
    encoded[:len(enc)] = enc
    return encoded

In [None]:
def get_padding_mask(X,inverse=False):
  """Build a square boolean mask per sequence from the zero entries of ``X``.

  ``X`` is indexed as a 2-D tensor (batch, length); the value 0 is treated as
  padding. The result has shape (batch, length, length).

  * ``inverse=False``: ``mask[b, i, j]`` is True when position ``j`` OR
    position ``i`` of sequence ``b`` is padding.
  * ``inverse=True``: ``mask[b, i, j]`` is True when position ``j`` of
    sequence ``b`` is NOT padding (rows are not modified).
  """
  mark_padding = inverse == False
  selected = (X == 0) if mark_padding else (X != 0)
  seq_len = X.shape[1]
  # Copy the per-position flags onto every row: mask[b, i, j] == selected[b, j].
  mask = selected.unsqueeze(1).repeat(1, seq_len, 1)
  if mark_padding:
    # Rows that belong to a padding position are masked out completely.
    mask[selected] = True
  return mask

## Dataset

In [None]:

url = "https://drive.google.com/drive/folders/1gt_OmBTq_T-0EZbAIKDq6pfXRvuHCmq7?usp=sharing" #algebra
gdown.download_folder(url = url, quiet = True, use_cookies=False)  

url = "https://drive.google.com/drive/folders/1h-RQsLMDcw3yVZGMtsInx-0ePPYW-C_M?usp=sharing" #calculus
gdown.download_folder(url = url, quiet = True, use_cookies=False)

url = "https://drive.google.com/drive/folders/1gu-yWS9gtcLPXjvpBkQDPmgRcCpcpr2t?usp=sharing" #probability
gdown.download_folder(url = url, quiet = True, use_cookies=False)

url = "https://drive.google.com/drive/folders/1dK0ifABFPmXYwsJQow5038yXuUA0Pqy_?usp=sharing" #interpolate
gdown.download_folder(url = url, quiet = True, use_cookies=False)

!gdown "1-7ies2HEUy56W8akeHlBN-XkPDPy9uh0" #vocab


In [None]:
class MathematicsDataset(Dataset):
  """Map-style dataset over pickled, pre-encoded question/answer pairs.

  Files read (all with ``pickle``):
    * ``<root>/vocabs``                                  -> ``self.vocab``
    * ``<root>/<module>_pickle/<train_type>/questions``  -> ``self.quest``
    * ``<root>/<module>_pickle/<train_type>/answers``    -> ``self.answ``

  Each item of ``quest``/``answ`` is indexed with the key ``"Encoding"``.

  SECURITY: ``pickle.load`` can execute arbitrary code contained in the file.
  Only point this class at files you produced or otherwise trust.

  Args:
    module: One of "algebra", "calculus", "probability", "interpolate".
    train_type: For the three subjects: "easy", "medium", "hard" or
      "complete". For "interpolate": "algebra", "calculus" or "probability".
    max_len_quest: Length budget handed to ``create_padding`` for questions.
    max_len_answ: Length budget handed to ``create_padding`` for answers.
    root: Directory that holds the vocabulary file and the ``*_pickle``
      folders. Defaults to the Colab working directory used by this notebook.

  Raises:
    AssertionError: If ``module`` or ``train_type`` is not an accepted value.
  """

  def __init__(self, module, train_type, max_len_quest=160, max_len_answ=30, root="/content"):
    self.max_len_quest = max_len_quest
    self.max_len_answ = max_len_answ
    self.root = root

    # Validate the arguments before touching the disk so that a typo fails fast.
    modules = ["algebra","calculus", "probability","interpolate"]
    assert module in modules, "the module should be algebra, calculus, probability or interpolate!"
    if module == "interpolate":
      train_types = ["algebra","calculus", "probability"]
      assert train_type in train_types, f"the train_type for {module} should be algebra, calculus or probability!"
    else:
      train_types = ["easy","medium", "hard", "complete"]
      assert train_type in train_types, f"the train_type for {module} should be easy, medium, hard or complete!"

    # Unpickle the vocabulary.
    with open(os.path.join(root, 'vocabs'),'rb') as infile:
      self.vocab = pickle.load(infile)

    self.path = os.path.join(root, module + "_pickle", train_type)

    # Unpickle the questions.
    with open(os.path.join(self.path,"questions"),'rb') as infile:
      self.quest = pickle.load(infile)

    # Unpickle the answers.
    with open(os.path.join(self.path,"answers"),'rb') as infile:
      self.answ = pickle.load(infile)

  def __len__(self):
    """Number of question/answer pairs."""
    return len(self.quest)

  def _pad_to_tensor(self, encoding, max_len):
    """Zero-pad ``encoding`` with ``create_padding`` and return an int64 tensor."""
    padded = create_padding(encoding, self.vocab, max_len)
    return torch.from_numpy(padded).type(torch.int64)

  def __getitem__(self, idx):
    """Return ``(question, answer)`` as zero-padded int64 tensors.

    Raises:
      IndexError: If ``idx`` is past the end of the dataset. ``IndexError``
        (not ``AssertionError``) is what Python's sequence protocol expects,
        so ``for item in dataset`` terminates cleanly.
    """
    if idx >= len(self.quest):
      raise IndexError(f"index {idx} is out of range for a dataset of size {len(self.quest)}")
    question = self._pad_to_tensor(self.quest[idx]["Encoding"], self.max_len_quest)
    answer = self._pad_to_tensor(self.answ[idx]["Encoding"], self.max_len_answ)
    return question, answer

In [None]:
class PlMathematicsDataset(pl.LightningDataModule):
    """Lightning data module wrapping ``MathematicsDataset``.

    Training uses the ``(module, train_type)`` split. Testing and prediction
    use the "interpolate" split of the same subject that is trained on.

    Args:
        module: Passed to ``MathematicsDataset`` for the training split.
        train_type: Passed to ``MathematicsDataset`` for the training split
            ("easy", "medium", "hard" or "complete" for a subject module).
        max_len_quest: Question length budget, forwarded to every dataset.
        max_len_answ: Answer length budget, forwarded to every dataset.
        batch_size: Batch size of all dataloaders.
    """

    def __init__(self, module, train_type, max_len_quest=160, max_len_answ=30, batch_size=256):
        super().__init__()
        self.module = module
        self.train_type = train_type
        self.test_type = "interpolate"
        self.max_len_quest = max_len_quest
        self.max_len_answ = max_len_answ
        self.batch_size = batch_size
        # Loaded once here and reused as the training set in setup(); loading
        # it a second time would hold two copies of the pickle in memory.
        self.dataset = MathematicsDataset(
            self.module,
            self.train_type,
            max_len_quest=self.max_len_quest,
            max_len_answ=self.max_len_answ,
        )
        self.vocab = self.dataset.vocab
        self._eval_ds = None  # lazily loaded, shared by test and predict

    def _eval_subject(self):
        """Subject of the interpolate split that matches the training data."""
        # When training directly on "interpolate", train_type already names
        # the subject; otherwise the module itself is the subject.
        if self.module == self.test_type:
            return self.train_type
        return self.module

    def _eval_dataset(self):
        """Load the evaluation split on first use and cache it."""
        if self._eval_ds is None:
            self._eval_ds = MathematicsDataset(
                self.test_type,
                self._eval_subject(),
                max_len_quest=self.max_len_quest,
                max_len_answ=self.max_len_answ,
            )
        return self._eval_ds

    def setup(self, stage=None):
        """Assign the datasets needed for ``stage`` (all of them when None)."""
        if stage == "fit" or stage is None:
            self.train_ds = self.dataset

        if stage == "test" or stage is None:
            self.test_ds = self._eval_dataset()

        if stage == "predict" or stage is None:
            self.predict_ds = self._eval_dataset()

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle = True, num_workers=2)

    def test_dataloader(self):
        """Sequential loader over the evaluation split."""
        return DataLoader(self.test_ds, batch_size=self.batch_size)

    def predict_dataloader(self):
        """Sequential loader over the evaluation split."""
        return DataLoader(self.predict_ds, batch_size=self.batch_size)

In [None]:
# Data module for the full ("complete") algebra training split.
dataset = PlMathematicsDataset(module="algebra", train_type="complete")

In [None]:
# Sanity check: number of training pairs, then vocabulary size.
for size in (len(dataset.dataset), len(dataset.vocab)):
    print(size)

1999998
58


In [None]:
# #Create dataloader
# train_loader = DataLoader(dataset, 32, shuffle=True)
# print(len(train_loader))

In [None]:
# print("data", next(iter(train_loader)))
# print("questions_shape ", next(iter(train_loader))[0].shape)

# print("answers_shape ", next(iter(train_loader))[1].shape)

# Model

In [None]:
class SimpleLSTM(pl.LightningModule):
    """Single-layer LSTM that reads a question and then emits an answer.

    Tokens are fed to the LSTM as one-hot vectors of size ``voc_size``. The
    same LSTM is used to read the question, to run a number of blank
    "thinking" steps, and to generate the answer one token at a time; a
    bias-free linear layer maps the hidden output to vocabulary logits.

    Args:
        voc_size: Vocabulary size (one-hot width and number of output logits).
        hidden_size: LSTM hidden size.
    """

    def __init__(self,voc_size,hidden_size):
        super().__init__()
        self.save_hyperparameters()
        self.voc_size = voc_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(voc_size, hidden_size, 1)
        self.linear = nn.Linear(hidden_size, voc_size, bias=False)
        # NOTE(review): padding positions (id 0) contribute to this loss while
        # test_step excludes them from the accuracy -- consider
        # ``ignore_index=0`` if that asymmetry is not intended.
        self.metric = nn.CrossEntropyLoss()

    def training_step(self, batch, batch_idx):
        """Teacher-forced cross-entropy over every answer position."""
        quest, answ = batch
        output = self.forward(quest,answ,forcing=True)  # (batch, time, vocab)
        pred = output.reshape(-1, output.shape[-1])
        target = answ.reshape(-1)
        loss = self.metric(pred,target)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Adam with the learning rate / betas / eps used by this notebook."""
        optimizer = torch.optim.Adam(self.parameters(),lr=6e-04, betas=(0.9, 0.995), eps=1e-09)
        return optimizer

    def accuracy(self,pred,targ):
        """Fraction of positions where ``pred`` equals ``targ``.

        Both arguments are 1-D tensors of equal length. Returns a Python
        float; 0.0 when there is nothing to compare.
        """
        total = targ.shape[0]
        if total == 0:
            return 0.0
        # One vectorised comparison instead of a Python loop that would
        # synchronise with the device once per element.
        return (pred == targ).sum().item() / total

    def test_step(self, batch, batch_idx):
        """Token accuracy of free-running decoding on non-padding positions."""
        quest,answ = batch
        not_padding = answ != 0  # id 0 is the padding value
        pred = self.forward(quest,answ)
        acc = self.accuracy(pred.argmax(-1)[not_padding],answ[not_padding])
        self.log('test_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return acc

    def forward(self,quest,answ,thinking_steps=16,forcing=False):
        """Read ``quest`` and produce logits for every position of ``answ``.

        Args:
            quest: Integer tensor (batch, question_length) of token ids.
            answ: Integer tensor (batch, answer_length) of token ids. Its
                length fixes the number of decoding steps; its values are only
                fed back into the LSTM when ``forcing`` is true.
            thinking_steps: Number of all-zero inputs fed after the question.
            forcing: If true, feed the ground-truth token after each step
                (teacher forcing); otherwise feed the arg-max prediction.

        Returns:
            Float tensor (batch, answer_length, voc_size) of logits, where
            ``result[b, t]`` is the prediction for ``answ[b, t]``.
        """
        # Follow the inputs rather than probing for CUDA, so the module also
        # works on CPU when a GPU happens to be present.
        device = quest.device
        batch_size = quest.shape[0]
        answ_len = answ.shape[1]

        q = nn.functional.one_hot(quest, num_classes = self.voc_size).type(torch.float32)

        state = (
            torch.zeros(1, batch_size, self.hidden_size, dtype=torch.float32, device=device),
            torch.zeros(1, batch_size, self.hidden_size, dtype=torch.float32, device=device),
        )

        self.lstm.flatten_parameters()
        # The LSTM is time-major, so the whole question goes through in one
        # call; this is equivalent to stepping through it token by token.
        encoded, state = self.lstm(q.transpose(0, 1), state)
        outputs = encoded[-1:]

        if thinking_steps > 0:
            thinking = torch.zeros(thinking_steps, batch_size, self.voc_size, dtype=torch.float32, device=device)
            _, state = self.lstm(thinking, state)

        # NOTE(review): the first decoder input is built from the output of
        # the last question token, not from the output after the thinking
        # steps (only the recurrent state carries the thinking) -- confirm
        # this is intended.
        start = self.linear(outputs[0])
        outputs, state = self.lstm(start.unsqueeze(0), state)

        if forcing == True:
            a = nn.functional.one_hot(answ, num_classes = self.voc_size).type(torch.float32)

        step_logits = []
        for t in range(answ_len):
            logits = self.linear(outputs[0])
            step_logits.append(logits)
            if t == answ_len - 1:
                break  # nothing left to predict, skip the unused LSTM step
            if forcing == True:
                next_input = a[:,t,:]
            else:
                char = logits.argmax(-1)
                next_input = nn.functional.one_hot(char, num_classes = self.voc_size).type(torch.float32)
            outputs, state = self.lstm(next_input.unsqueeze(0), state)

        if not step_logits:
            return torch.zeros((batch_size, 0, self.voc_size), dtype=torch.float32, device=device)
        # Stack along dim 1 to obtain (batch, time, vocab). Reshaping a
        # (time, batch, vocab) buffer would mix up time and batch entries.
        return torch.stack(step_logits, dim=1)

    def predict_step(self, batch, batch_idx):
        """Arg-max token ids, shape (batch, answer_length)."""
        quest,answ = batch
        out = self.forward(quest,answ)
        return out.argmax(dim=-1)

In [None]:
# Run configuration for the cells below.
hidden_size = 2048  # hidden size passed to SimpleLSTM
vocabulary_size = len(dataset.vocab)  # one-hot width / number of output classes
enable_checkpointing = True  # register the ModelCheckpoint callback with the Trainer
resume_training = False  # if True, trainer.fit resumes from load_model_dir
load_model = False  # if True, the model is restored from load_model_dir
load_model_dir = "/content/drive/MyDrive/Deep learning/model_checkpoints/LSTM/last.ckpt" #change path to load a different checkpoint

In [None]:
# Restore the model from the checkpoint when requested; otherwise start from
# scratch. Only one of the two is built, so no throw-away model is allocated.
if load_model == True:
  model = SimpleLSTM.load_from_checkpoint(load_model_dir)
else:
  model = SimpleLSTM(vocabulary_size,hidden_size)

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# Keeps "last.ckpt" in the Drive folder, saving at the end of each train epoch.
checkpoint_callback = ModelCheckpoint(dirpath="/content/drive/MyDrive/Deep learning/model_checkpoints/LSTM", save_last =True, save_on_train_epoch_end = True)
# TensorBoard logger writing under the Drive "tb_logs/LSTM" folder.
logger = TensorBoardLogger("/content/drive/MyDrive/Deep learning/tb_logs", name="LSTM",log_graph=True )

# Callbacks: always the progress bar, plus the checkpoint callback when enabled.
callbacks = [TQDMProgressBar(refresh_rate=20)]
if enable_checkpointing == True:
  callbacks.insert(0, checkpoint_callback)

# Only hand a checkpoint to trainer.fit when resuming was requested.
ckpt_path = None if resume_training == False else load_model_dir

# Train on one GPU when available, otherwise on the CPU. The `gpus` argument
# is deprecated (see the warning it triggers) and removed in later Lightning
# releases; `accelerator` + `devices` is the supported way to say the same.
gpu = 1 if torch.cuda.is_available() else 0
trainer = pl.Trainer(
    enable_checkpointing=enable_checkpointing,
    accelerator="gpu" if gpu else "cpu",
    devices=1,
    max_epochs=1,
    logger=logger,
    callbacks=callbacks,
)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the model; ckpt_path is None unless resume_training was enabled above.
trainer.fit(ckpt_path=ckpt_path, model=model, datamodule=dataset)
# Show the path recorded by the checkpoint callback as its best model.
print(checkpoint_callback.best_model_path)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name   | Type             | Params
--------------------------------------------
0 | lstm   | LSTM             | 17.3 M
1 | linear | Linear           | 118 K 
2 | metric | CrossEntropyLoss | 0     
--------------------------------------------
17.4 M    Trainable params
0         Non-trainable params
17.4 M    Total params
69.550    Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [None]:
# Evaluate on the data module's test dataloader; logs the "test_acc" metric.
trainer.test(model=model,datamodule=dataset)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_epoch                 0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc_epoch': 0.0}]

In [None]:
# trainer.predict returns one tensor of predicted token ids per batch. Keep the
# full result in a variable and display only a few rows of the first batch, so
# the notebook output is not flooded with every predicted sequence.
predictions = trainer.predict(model=model,datamodule=dataset)
predictions[0][:5]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 1it [00:00, ?it/s]

[tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 