<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_Seq2Seq/test_sample_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_Seq2Seq/

fatal: destination path 'PyTorch-Architectures' already exists and is not an empty directory.
/content/PyTorch-Architectures/modeling_Seq2Seq


In [None]:
! pip install datasets
! pip install tokenizers

In [3]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from model import Seq2Seq, Encoder, Decoder

In [4]:
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')

Reusing dataset mt_eng_vietnamese (/root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71)


In [5]:
# Sample from the dataset
dataset['train']['translation'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

In [6]:
train_sentences = dataset['train']
en_sentences = []
vi_sentences = []
for value in train_sentences:
  en_sentences.append(value['translation']['en'])
  vi_sentences.append(value['translation']['vi'])

In [7]:
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_vi = Tokenizer(BPE(unk_token="[UNK]"))
trainer_en = BpeTrainer(special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])
trainer_vi = BpeTrainer(special_tokens=["[UNK]", "[SOS]", "[EOS]", "[PAD]"])

In [8]:
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_vi.pre_tokenizer = Whitespace()

In [9]:
tokenizer_en.train_from_iterator(en_sentences, trainer_en)
tokenizer_vi.train_from_iterator(vi_sentences, trainer_vi)

In [10]:
tokenizer_en.post_processor = TemplateProcessing(
    single = "[SOS] $A [EOS]",
    special_tokens = [
                      ("[SOS]", tokenizer_en.token_to_id("[SOS]")),
                      ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)

tokenizer_vi.post_processor = TemplateProcessing(
    single = "[SOS] $A [EOS]",
    special_tokens = [
                      ("[SOS]", tokenizer_vi.token_to_id("[SOS]")),
                      ("[EOS]", tokenizer_vi.token_to_id("[EOS]"))
    ],
)

# Paper uses the following format:
# en_sentence [EOS] --> vi_sentence [EOS]

In [11]:
# Cell for hyperparameters
MAX_LENGTH = 16
INPUT_DIM_ENCODER = tokenizer_en.get_vocab_size()
INPUT_DIM_DECODER = tokenizer_vi.get_vocab_size()
EMB_DIM_ENCODER = 256
EMB_DIM_DECODER = 256
HIDDEN_DIM_ENCODER = 512
HIDDEN_DIM_DECODER = 512
DROPOUT_ENCODER = 0.5
DROPOUT_DECODER = 0.5
NUM_LAYERS = 4
BATCH_SIZE = 32
LEARNING_RATE = 3e-5

In [12]:
tokenizer_en.enable_padding(pad_id=2, pad_token="[PAD]", len=MAX_LENGTH)
tokenizer_vi.enable_padding(pad_id=2, pad_token="[PAD]", len=MAX_LENGTH)
tokenizer_en.enable_truncation(max_length=MAX_LENGTH)
tokenizer_vi.enable_truncation(max_length=MAX_LENGTH)

In [13]:
# Sample tokenization of a batch of english sentences --> pad uses max_length in batch
sample_sentence = "This is amazing and great!"
print('Sentences --> ', sample_sentence)
output = tokenizer_en.encode(sample_sentence)
output.tokens

Sentences -->  This is amazing and great!


['[SOS]', 'This', 'is', 'amazing', 'and', 'great', '!', '[EOS]']

In [14]:
# Sample tokenization of a vietnamese sentence
print('Sentence --> ', vi_sentences[0])
output = tokenizer_vi.encode(vi_sentences[0])
output.tokens

Sentence -->  Khoa học đằng sau một tiêu đề về khí hậu


['[SOS]',
 'Khoa',
 'học',
 'đằng',
 'sau',
 'một',
 'tiêu',
 'đề',
 'về',
 'khí',
 'hậu',
 '[EOS]']

In [15]:
def collate_fn_en(batch):
  sentences = []
  for sent in batch:
    sentences.append(sent[0])
  outputs = tokenizer_en.encode_batch(sentences)
  input_ids = []
  for i in range(len(outputs)):
    input_ids.append(outputs[i].ids)
  return torch.tensor(input_ids, dtype=torch.long)

def collate_fn_vi(batch):
  sentences = []
  for sent in batch:
    sentences.append(sent[0])
  outputs = tokenizer_vi.encode_batch(sentences)
  input_ids = []
  for i in range(len(outputs)):
    input_ids.append(outputs[i].ids)
  return torch.tensor(input_ids, dtype=torch.long)

class CustomDataset(Dataset):
  def __init__(self, tokenizer, sentences, max_input_length=16):
    self.tokenizer = tokenizer
    self.sentences = sentences
    self.max_input_length = max_input_length
  
  def __len__(self):
    return len(self.sentences)
  
  def __getitem__(self, idx):
    sents = [self.sentences[idx]]
    sents_list = []
    for sent in sents:
      sents_list.append(sent)
    return sents_list

In [16]:
# Sanity check DataLoader
sample_sentences_en = en_sentences[:4]
sample_dataset_en = CustomDataset(tokenizer_en, sample_sentences_en)
sample_dataloader_en = DataLoader(dataset=sample_dataset_en, batch_size=2, shuffle=False, drop_last=True, collate_fn=collate_fn_en)

for sample in sample_dataloader_en:
  print(sample.shape)
  break

sample_sentences_vi = vi_sentences[:4]
sample_dataset_vi = CustomDataset(tokenizer_vi, sample_sentences_vi)
sample_dataloader_vi = DataLoader(dataset=sample_dataset_vi, batch_size=2, shuffle=False, drop_last=True, collate_fn=collate_fn_vi)

for sample in sample_dataloader_vi:
  print(sample.shape)
  break

torch.Size([2, 16])
torch.Size([2, 16])


In [17]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

encoder = Encoder(input_dim=INPUT_DIM_DECODER,
                  emb_dim=EMB_DIM_ENCODER,
                  hidden_dim=HIDDEN_DIM_ENCODER,
                  num_layers=NUM_LAYERS,
                  p_drop=DROPOUT_ENCODER)

decoder = Decoder(output_dim=INPUT_DIM_DECODER,
                  emb_dim=EMB_DIM_DECODER,
                  hidden_dim=HIDDEN_DIM_DECODER,
                  num_layers=NUM_LAYERS,
                  p_drop=DROPOUT_DECODER)

model = Seq2Seq(encoder=encoder,
                decoder=decoder)

In [18]:
def init_weights(m):
  for name, param in model.named_parameters():
    nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)
model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30000, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(30000, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=30000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [19]:
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable Parameters: ', params)

Trainable Parameters:  46511408


In [20]:
assert len(en_sentences) == len(vi_sentences)
split = 90 * len(en_sentences) // 100
en_sentences_train = en_sentences[:split]
vi_sentences_train = vi_sentences[:split]
en_sentences_valid = en_sentences[split:]
vi_sentences_valid = vi_sentences[split:]

print('Training samples: ', len(en_sentences_train))
print('Valid samples: ', len(vi_sentences_valid))

assert (len(en_sentences_train) + len(en_sentences_valid)) == \
        (len(vi_sentences_train) + len(vi_sentences_valid))

Training samples:  119986
Valid samples:  13332


In [21]:
en_sentences_train_dataset = CustomDataset(tokenizer=tokenizer_en,
                                          sentences=en_sentences_train)
vi_sentences_train_dataset = CustomDataset(tokenizer=tokenizer_vi,
                                           sentences=vi_sentences_train)

en_sentences_valid_dataset = CustomDataset(tokenizer=tokenizer_en,
                                           sentences=en_sentences_valid)
vi_sentences_valid_dataset = CustomDataset(tokenizer=tokenizer_vi,
                                           sentences=vi_sentences_valid)

In [22]:
en_sentences_train_loader = DataLoader(dataset=en_sentences_train_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_en,)
vi_sentences_train_loader = DataLoader(dataset=vi_sentences_train_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_vi)

en_sentences_valid_loader = DataLoader(dataset=en_sentences_valid_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_en)
vi_sentences_valid_loader = DataLoader(dataset=vi_sentences_valid_dataset,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       collate_fn=collate_fn_vi)

assert len(en_sentences_train_loader) == len(vi_sentences_train_loader)
assert len(en_sentences_valid_loader) == len(vi_sentences_valid_loader)

print('Length of Train DataLoader: ', len(en_sentences_train_loader))
print('Length of Valid DataLoader: ', len(en_sentences_valid_loader))

Length of Train DataLoader:  3750
Length of Valid DataLoader:  417


In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
loss = nn.CrossEntropyLoss(ignore_index=tokenizer_vi.token_to_id("[PAD]"))

In [26]:
# Sanity check forward pass
model.eval()
with torch.set_grad_enabled(False):
  for en_sample, vi_sample in zip(en_sentences_train_loader, vi_sentences_train_loader):
    en_sample = en_sample.transpose(0, 1).contiguous()
    vi_sample = vi_sample.transpose(0, 1).contiguous()
    output = model(src=en_sample, trg=vi_sample)

    print(output[1:].view(-1, output.size(2)).shape)
    print(vi_sample[1:].view(-1).shape)
    assert output.dim() == 3
    assert output.size(0) == en_sample.size(0)
    assert output.size(1) == en_sample.size(1)
    break

torch.Size([480, 30000])
torch.Size([480])


In [49]:
def compute_loss(model, src_loader, valid_loader, device):
  pass

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for sample_en, sample_vi in zip(en_sentences_train_loader, vi_sentences_train_loader):
    sample_en = sample_en.transpose(0, 1).contiguous().to(device)
    sample_vi = sample_vi.transpose(0, 1).contiguous().to(device)

    output = model(src=sample_en, trg=sample_vi)
    pass
  model.eval()
  with torch.set_grad_enabled(False):
    pass

480