<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_Seq2Seq/test_sample_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! nvidia-smi

Mon Apr 26 12:33:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_Seq2Seq/

In [None]:
! pip install datasets
! pip install tokenizers

In [4]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from model import Seq2Seq, Encoder, Decoder

In [None]:
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')

In [6]:
# Sample from the dataset
dataset['train']['translation'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

In [7]:
# Paper proposes that reversing source sentence helps in
# LSTM long-range dependencies

train_sentences = dataset['train']
en_sentences = []
vi_sentences = []
for value in train_sentences:
  en_sentence = value['translation']['en']
  en_sentence = ' '.join(reversed(en_sentence.split()))
  en_sentences.append(en_sentence)
  vi_sentences.append(value['translation']['vi'])

In [8]:
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_vi = Tokenizer(BPE(unk_token="[UNK]"))
trainer_en = BpeTrainer(special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])
trainer_vi = BpeTrainer(special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

In [9]:
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_vi.pre_tokenizer = Whitespace()

In [10]:
tokenizer_en.train_from_iterator(en_sentences, trainer_en)
tokenizer_vi.train_from_iterator(vi_sentences, trainer_vi)

In [11]:
tokenizer_en.post_processor = TemplateProcessing(
    single = "[SOS] $A [EOS]",
    special_tokens = [
                      ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
                      ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)

tokenizer_vi.post_processor = TemplateProcessing(
    single = "[SOS] $A [EOS]",
    special_tokens = [
                      ("[SOS]", tokenizer_vi.token_to_id("[SOS]")),
                      ("[EOS]", tokenizer_vi.token_to_id("[EOS]"))
    ],
)

# Paper uses the following format:
# [SOS] en_sentence [EOS] --> [SOS] vi_sentence [EOS]

In [12]:
# Cell for hyperparameters
MAX_LENGTH = 16
INPUT_DIM_ENCODER = tokenizer_en.get_vocab_size()
INPUT_DIM_DECODER = tokenizer_vi.get_vocab_size()
EMB_DIM_ENCODER = 256
EMB_DIM_DECODER = 256
HIDDEN_DIM_ENCODER = 512
HIDDEN_DIM_DECODER = 512
DROPOUT_ENCODER = 0.5
DROPOUT_DECODER = 0.5
NUM_LAYERS = 4
BATCH_SIZE = 128
LEARNING_RATE = 3e-5
EPOCHS = 3
CLIP = 1 # clip the gradients to max_norm

In [13]:
tokenizer_en.enable_padding(pad_id=2, pad_token="[PAD]", len=MAX_LENGTH)
tokenizer_vi.enable_padding(pad_id=2, pad_token="[PAD]", len=MAX_LENGTH)
tokenizer_en.enable_truncation(max_length=MAX_LENGTH)
tokenizer_vi.enable_truncation(max_length=MAX_LENGTH)

In [14]:
# Sample tokenization of a batch of english sentences --> pad uses max_length in batch
sample_sentence = "This is amazing and great!"
print('Sentences --> ', sample_sentence)
output = tokenizer_en.encode(sample_sentence)
output.tokens

Sentences -->  This is amazing and great!


['[SOS]', 'This', 'is', 'amazing', 'and', 'great', '!', '[EOS]']

In [16]:
# Sample tokenization of a vietnamese sentence
print('Sentence --> ', vi_sentences[0])
output = tokenizer_vi.encode(vi_sentences[0])
output.tokens

Sentence -->  Khoa học đằng sau một tiêu đề về khí hậu


['[SOS]',
 'Khoa',
 'học',
 'đằng',
 'sau',
 'một',
 'tiêu',
 'đề',
 'về',
 'khí',
 'hậu',
 '[EOS]']

In [17]:
def collate_fn_en(batch):
  sentences = []
  for sent in batch:
    sentences.append(sent[0])
  outputs = tokenizer_en.encode_batch(sentences)
  input_ids = []
  for i in range(len(outputs)):
    input_ids.append(outputs[i].ids)
  return torch.tensor(input_ids, dtype=torch.long)

def collate_fn_vi(batch):
  sentences = []
  for sent in batch:
    sentences.append(sent[0])
  outputs = tokenizer_vi.encode_batch(sentences)
  input_ids = []
  for i in range(len(outputs)):
    input_ids.append(outputs[i].ids)
  return torch.tensor(input_ids, dtype=torch.long)

class CustomDataset(Dataset):
  def __init__(self, tokenizer, sentences, max_input_length=16):
    self.tokenizer = tokenizer
    self.sentences = sentences
    self.max_input_length = max_input_length
  
  def __len__(self):
    return len(self.sentences)
  
  def __getitem__(self, idx):
    sents = [self.sentences[idx]]
    sents_list = []
    for sent in sents:
      sents_list.append(sent)
    return sents_list

In [18]:
# Sanity check DataLoader
sample_sentences_en = en_sentences[:4]
sample_dataset_en = CustomDataset(tokenizer_en, sample_sentences_en)
sample_dataloader_en = DataLoader(dataset=sample_dataset_en, batch_size=2, shuffle=False, drop_last=True, collate_fn=collate_fn_en)

for sample in sample_dataloader_en:
  print(sample.shape)
  break

sample_sentences_vi = vi_sentences[:4]
sample_dataset_vi = CustomDataset(tokenizer_vi, sample_sentences_vi)
sample_dataloader_vi = DataLoader(dataset=sample_dataset_vi, batch_size=2, shuffle=False, drop_last=True, collate_fn=collate_fn_vi)

for sample in sample_dataloader_vi:
  print(sample.shape)
  break

torch.Size([2, 16])
torch.Size([2, 16])


In [19]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

encoder = Encoder(input_dim=INPUT_DIM_DECODER,
                  emb_dim=EMB_DIM_ENCODER,
                  hidden_dim=HIDDEN_DIM_ENCODER,
                  num_layers=NUM_LAYERS,
                  p_drop=DROPOUT_ENCODER)

decoder = Decoder(output_dim=INPUT_DIM_DECODER,
                  emb_dim=EMB_DIM_DECODER,
                  hidden_dim=HIDDEN_DIM_DECODER,
                  num_layers=NUM_LAYERS,
                  p_drop=DROPOUT_DECODER)

model = Seq2Seq(encoder=encoder,
                decoder=decoder)

In [20]:
def init_weights(m):
  for name, param in model.named_parameters():
    nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)
model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30000, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(30000, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=30000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [21]:
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable Parameters: ', params)

Trainable Parameters:  46511408


In [22]:
assert len(en_sentences) == len(vi_sentences)
split = 90 * len(en_sentences) // 100
en_sentences_train = en_sentences[:split]
vi_sentences_train = vi_sentences[:split]
en_sentences_valid = en_sentences[split:]
vi_sentences_valid = vi_sentences[split:]

print('Training samples: ', len(en_sentences_train))
print('Valid samples: ', len(vi_sentences_valid))

assert (len(en_sentences_train) + len(en_sentences_valid)) == \
        (len(vi_sentences_train) + len(vi_sentences_valid))

Training samples:  119986
Valid samples:  13332


In [23]:
en_sentences_train_dataset = CustomDataset(tokenizer=tokenizer_en,
                                          sentences=en_sentences_train)
vi_sentences_train_dataset = CustomDataset(tokenizer=tokenizer_vi,
                                           sentences=vi_sentences_train)

en_sentences_valid_dataset = CustomDataset(tokenizer=tokenizer_en,
                                           sentences=en_sentences_valid)
vi_sentences_valid_dataset = CustomDataset(tokenizer=tokenizer_vi,
                                           sentences=vi_sentences_valid)

In [24]:
en_sentences_train_loader = DataLoader(dataset=en_sentences_train_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_en,)
vi_sentences_train_loader = DataLoader(dataset=vi_sentences_train_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_vi)

en_sentences_valid_loader = DataLoader(dataset=en_sentences_valid_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_en)
vi_sentences_valid_loader = DataLoader(dataset=vi_sentences_valid_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_vi)

assert len(en_sentences_train_loader) == len(vi_sentences_train_loader)
assert len(en_sentences_valid_loader) == len(vi_sentences_valid_loader)

print('Length of Train DataLoader: ', len(en_sentences_train_loader))
print('Length of Valid DataLoader: ', len(en_sentences_valid_loader))

Length of Train DataLoader:  938
Length of Valid DataLoader:  105


In [25]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
loss = nn.CrossEntropyLoss(ignore_index=tokenizer_vi.token_to_id("[PAD]"))

In [26]:
# Sanity check forward pass
model.eval()
with torch.set_grad_enabled(False):
  for en_sample, vi_sample in zip(en_sentences_train_loader, vi_sentences_train_loader):
    en_sample = en_sample.transpose(0, 1).contiguous().to(device)
    vi_sample = vi_sample.transpose(0, 1).contiguous().to(device)
    output = model(src=en_sample, trg=vi_sample)

    print(output[1:].view(-1, output.size(2)).shape)
    print(vi_sample[1:].view(-1).shape)
    assert output.dim() == 3
    assert output.size(0) == en_sample.size(0)
    assert output.size(1) == en_sample.size(1)
    break

torch.Size([1920, 30000])
torch.Size([1920])


In [27]:
def compute_loss(model, en_loader, vi_loader, device):
  cost_list = []
  with torch.set_grad_enabled(False):
    for idx, (en_sample, vi_sample) in enumerate(zip(en_loader, vi_loader)):
      en_sample = en_sample.transpose(0, 1).contiguous().to(device)
      vi_sample = vi_sample.transpose(0, 1).contiguous().to(device)
      output = model(src=en_sample, trg=vi_sample)
      output = output[1:].view(-1, output.size(2))
      trg = vi_sample[1:].view(-1)
      cost = loss(output, trg)
      cost_list.append(cost.item())
  return torch.mean(torch.tensor(cost_list))

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, (sample_en, sample_vi) in enumerate(zip(en_sentences_train_loader, vi_sentences_train_loader)):
    sample_en = sample_en.transpose(0, 1).contiguous().to(device)
    sample_vi = sample_vi.transpose(0, 1).contiguous().to(device)

    output = model(src=sample_en, trg=sample_vi)
    output = output[1:].view(-1, output.size(2))
    trg = sample_vi[1:].view(-1)
    cost = loss(output, trg)

    optimizer.zero_grad()
    cost.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=CLIP)
    optimizer.step()
    
    # LOGGING
    if idx % 300 == 0:
      print('Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.2f' % (idx, len(en_sentences_train_loader),
                                                                    epoch+1, EPOCHS, cost.item()))
      
  model.eval()
  with torch.set_grad_enabled(False):
    train_loss = compute_loss(model, en_sentences_train_loader, 
                                    vi_sentences_train_loader, device)
    valid_loss = compute_loss(model, en_sentences_valid_loader,
                              vi_sentences_valid_loader, device)
    print('Train Loss: %.2f || Valid Loss: %.2f' % (train_loss.item(), valid_loss.item()))
  epoch_elapsed_time = (time.time() - start_time) / 60
  print('Epoch Elapsed Time: %.2f min' % (epoch_elapsed_time))
total_training_time = (time.time() - start_time) / 60
print('Total Training Time: %.2f min' % (total_training_time))

Batch: 0000/0938 || Epoch: 0001/0003 || Loss: 10.30
Batch: 0300/0938 || Epoch: 0001/0003 || Loss: 5.75
Batch: 0600/0938 || Epoch: 0001/0003 || Loss: 5.36
Batch: 0900/0938 || Epoch: 0001/0003 || Loss: 5.96
Train Loss: 5.47 || Valid Loss: 5.49
Epoch Elapsed Time: 7.20 min
Batch: 0000/0938 || Epoch: 0002/0003 || Loss: 5.74
Batch: 0300/0938 || Epoch: 0002/0003 || Loss: 5.47
Batch: 0600/0938 || Epoch: 0002/0003 || Loss: 5.24
Batch: 0900/0938 || Epoch: 0002/0003 || Loss: 5.88
Train Loss: 5.39 || Valid Loss: 5.41
Epoch Elapsed Time: 14.43 min
Batch: 0000/0938 || Epoch: 0003/0003 || Loss: 5.66
Batch: 0300/0938 || Epoch: 0003/0003 || Loss: 5.42
Batch: 0600/0938 || Epoch: 0003/0003 || Loss: 5.22
Batch: 0900/0938 || Epoch: 0003/0003 || Loss: 5.85
Train Loss: 5.36 || Valid Loss: 5.39
Epoch Elapsed Time: 21.67 min
Total Training Time: 21.67 min
