<a href="https://colab.research.google.com/github/UNIST-LIM-Lab-course/seq2seq_rnn_assignment-xiyanafiguera/blob/main/lstm_seq2seq_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building seq2seq rnn using LSTM
The task objective is to code seq2seq model using LSTM.

* An assignment part is denoted by (Assingment) 

* Grading criteria: Points are given if all your code in this notebook is runnable and the final ppl is lower than 200

* Points are not given if the testing cell at the end of the notebook is modified or extra cells (including text) are added after the last cell. Do not change N_EPOCHS for testing efficiently.

* Testing your model with the testing cell is recommended. 

* Please do not re-use the code from the example code. You have to write the code yourself.

## Assignment List


(Assignment) 2.1 Make LSTM scratch 

(Assignment) 2.3 Make Encoder

(Assignment) 2.4 Make Decoder 

(Assignment) 2.5 Make Seq2seq

(Assignment) 3.1 Train your model 

#1. Preparing Data

##1.1. Upload dataset

In [1]:
pip install torchtext==0.10.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 4.9 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.6 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time


## 1.2 pre-processing 

In [10]:

from spacy.lang.en import English
from spacy.lang.de import German


spacy_en=English()
spacy_de=German()

def tokenize_de(text):

    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):

    return [tok.text for tok in spacy_en.tokenizer(text)]
  

SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)


 
* sos: start of sequence
* eos: end of sequence 
* German: SRC(source)
* English: TRG(target)
* Multi30k datase is a dataset with ~30,000 parallel English, Genrman and French sentences, each with ~12 workds per sentence
* exts specifies which languages to use as the source and target (source goes first) 
* fields specifies which field to use for the source and target.





##1.3 Check whether gpu is avaliable 

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1.4 Define train/ validation /test datset 

In [12]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

#2. Implement RNN Model


#(Assignment) 2.1 Make LSTM scratch 



In [13]:
import torch
from torch import nn


class LSTM(nn.Module):
  def __init__(self, emb_dim, hid_dim, sigma=0.01):
    super().__init__()
    
    init_weight= lambda *shape: nn.Parameter(torch.randn(*shape)*sigma)
    
    triple = lambda: (init_weight(emb_dim, hid_dim,),
                      init_weight(hid_dim,hid_dim),
                      nn.Parameter(torch.zeros(hid_dim)))
    
    self.W_xi, self.W_hi, self.b_i = triple() #input gate
    self.W_xf, self.W_hf, self.b_f = triple() #forget gate
    self.W_xo, self.W_ho, self.b_o = triple() #output gate 
    self.W_xc, self.W_hc, self.b_c = triple() #candidate memory cell
  
  def forward(self, inputs, H_C=None):

      H, C = None, None if H_C is None else H_C
      
      outputs = []
      
      for X in inputs:
          
          
          self.xi = torch.matmul(X, self.W_xi)
          if H is not None: 
             self.hi = torch.matmul(H, self.W_hi)
          else:
             self.hi = 0 
          
          in_gate = torch.sigmoid(self.xi + self.hi + self.b_i)
              


          if H is None:
              H, C = torch.zeros_like(in_gate), torch.zeros_like(in_gate)


          self.xf = torch.matmul(X, self.W_xf)
          self.hf = torch.matmul(H, self.W_hf)
          fe_gate = torch.sigmoid(self.xf + self.hf + self.b_f)
          

          self.xt = torch.matmul(X, self.W_xo)
          self.ht = torch.matmul(H, self.W_ho)
          op_gate = torch.sigmoid(self.xt + self.ht + self.b_o)

          self.xc = torch.matmul(X, self.W_xc)
          self.hc = torch.matmul(H, self.W_hc)
          i = torch.tanh(self.xc + self.hc + self.b_c)
          
          
          C = fe_gate * C + in_gate * i

          H = op_gate * torch.tanh(C)
          

          outputs.append(torch.unsqueeze(H,0))
          
      
      output = torch.cat(outputs, dim=0)
      H = torch.unsqueeze(H,0)
      C = torch.unsqueeze(C,0)
    
      return output, (H, C)

## 2.2 Check your LSTM

In [14]:
# nn.LSTM 
rnn = LSTM(10,20) # input_size= 10, hidden_size = 20

input = torch.randn(5,3,10) # sequence length=5, batch_size=3, input_size = 10 

# Initial hidden state and memery cell 
h_0 = torch.randn(1,3,20)
c_0 = torch.randn(1,3,20)

output, (hn, cn) = rnn(input, (h_0, c_0))

# tensor, tuple object

print(output.shape, '\n', hn.shape, '\n', cn.shape)


torch.Size([5, 3, 20]) 
 torch.Size([1, 3, 20]) 
 torch.Size([1, 3, 20])


## (Assignment) 2.3. Make Encoder


* src = [src len, batch size]
* embedded = [src len, batch size, emb dim]
* outputs = [src len, batch size, hid dim ]
* hidden = [1, batch size, hid dim]
* cell = [1, batch size, hid dim]
        

In [15]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
                
        #embedded = [src len, batch size, emb dim]
      
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer

        x = self.embedding(src)
        x = self.dropout(x)
        outputs, (hidden, cell) = self.rnn(x)
        
        return hidden, cell

## (Assignment) 2.4. Make Decoder 

In [16]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
  
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [batch size, hid dim]
        #cell = [batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [batch size, hid dim]
        #context = [n layers, batch size, hid dim]
      
        
        #input = [1, batch size]
        
        
        #embedded = [1, batch size, emb dim]
                
  
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [1, batch size, hid dim]
        #cell = [1, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        #cell = [1, batch size, hid dim]
        
       
        
        #prediction = [batch size, output dim]

        x = input.unsqueeze(0)
        x = self.embedding(x)
        x = self.dropout(x)
        outputs, (hidden, cell) = self.rnn(x, (hidden, cell))
        
        predictions = self.fc_out(outputs)

        prediction = predictions.squeeze(0)
        
        return prediction, hidden, cell

##(Assignment) 2.5 Make Seq2seq

In [17]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
       
        
        #tensor to store decoder outputs
        
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
   
        #first input to the decoder is the <sos> tokens

        trg_len = trg.shape[0]
        
        outputs = torch.zeros(trg_len, trg.shape[1], self.decoder.output_dim).to(self.device)
        hidden, cell = self.encoder(src)
        x = trg[0,:]

        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
     
            
            #place predictions in a tensor holding predictions for each token
     
            #decide if we are going to use teacher forcing or not

            
            #get the highest predicted token from our predictions
          
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            t_force = random.random() < teacher_forcing_ratio
            bst = output.argmax(1)

            x = trg[t] if t_force else bst
        
        return outputs

## 2.6 Define seq2seq Model

In [18]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 256
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5



input = torch.randint(0,3,(5,3)) #sequence length=5, batch_size=3 
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM,  ENC_DROPOUT)
hidden, cell = enc(input)
print('hidden and cell shape:', hidden.shape, cell.shape)


# Initial hidden state and memery cell 
h_0 = torch.randn(1,3,20)
c_0 = torch.randn(1,3,20)
input = torch.Tensor([0,1,2])
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,  DEC_DROPOUT)



model = Seq2Seq(enc, dec, device).to(device)

hidden and cell shape: torch.Size([1, 3, 256]) torch.Size([1, 3, 256])


  "num_layers={}".format(dropout, num_layers))


In [19]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 256, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 256)
    (fc_out): Linear(in_features=256, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,086,149 trainable parameters


# 3. Training step 

##(Assignment) 3.1 Train your model 
* Use gradient clipping 

In [21]:
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [22]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        loss.backward()

        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        

        
    return epoch_loss / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [24]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [25]:
N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print('')

Epoch: 01 | Time: 0m 24s
	Train Loss: 5.165 | Train PPL: 175.080
	 Val. Loss: 4.959 |  Val. PPL: 142.447



# 4. Test your model

In [26]:
model.load_state_dict(torch.load('model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f}')

| Test Loss: 4.943


In [27]:
ppl = math.exp(test_loss)
print(ppl)

140.17299694551664
