<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_Seq2Seq/test_sample_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_Seq2Seq/

fatal: destination path 'PyTorch-Architectures' already exists and is not an empty directory.
/content/PyTorch-Architectures/modeling_Seq2Seq


In [None]:
! pip install datasets
! pip install tokenizers

In [2]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from model import Seq2Seq

In [3]:
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')

Reusing dataset mt_eng_vietnamese (/root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71)


In [4]:
# Sample from the dataset
dataset['train']['translation'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

In [5]:
train_sentences = dataset['train']
en_sentences = []
vi_sentences = []
for value in train_sentences:
  en_sentences.append(value['translation']['en'])
  vi_sentences.append(value['translation']['vi'])

In [6]:
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer_vi = Tokenizer(BPE(unk_token="[UNK]"))
trainer_en = BpeTrainer(special_tokens=["[UNK]", "[EOS]", "[PAD]"])
trainer_vi = BpeTrainer(special_tokens=["[UNK]", "[EOS]", "[PAD]"])

In [7]:
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_vi.pre_tokenizer = Whitespace()

In [30]:
tokenizer_en.train_from_iterator(en_sentences, trainer_en)
tokenizer_vi.train_from_iterator(vi_sentences, trainer_vi)

In [9]:
tokenizer_en.post_processor = TemplateProcessing(
    single = "$A [EOS]",
    special_tokens = [
                      ("[EOS]", tokenizer_en.token_to_id("[EOS]")),
    ],
)

tokenizer_vi.post_processor = TemplateProcessing(
    single = "$A [EOS]",
    special_tokens = [
                      ("[EOS]", tokenizer_vi.token_to_id("[EOS]")),
    ],
)

# Paper uses the following format:
# en_sentence [EOS] --> vi_sentence [EOS]

In [10]:
# Cell for hyperparameters
MAX_LENGTH = 16

In [25]:
tokenizer_en.enable_padding(pad_id=2, pad_token="[PAD]", len=MAX_LENGTH)
tokenizer_vi.enable_padding(pad_id=2, pad_token="[PAD]", len=MAX_LENGTH)
tokenizer_en.enable_truncation(max_length=MAX_LENGTH)
tokenizer_vi.enable_truncation(max_length=MAX_LENGTH)

In [92]:
# Sample tokenization of a batch of english sentences --> pad uses max_length in batch
sample_sentence = ["This is amazing and great!", "This is good"]
print('Sentences --> ', sample_sentence)
output = tokenizer_en.encode_batch(sample_sentence)
tensor_list = []
tensor_list.append(output[0].ids)
tensor_list.append(output[1].ids)
torch.tensor(tensor_list).shape

Sentences -->  ['This is amazing and great!', 'This is good']


torch.Size([2, 7])

In [27]:
# Sample tokenization of a vietnamese sentence
print('Sentence --> ', vi_sentences[0])
output = tokenizer_vi.encode(vi_sentences[0])
output.tokens

Sentence -->  Khoa học đằng sau một tiêu đề về khí hậu


['Khoa',
 'học',
 'đằng',
 'sau',
 'một',
 'tiêu',
 'đề',
 'về',
 'khí',
 'hậu',
 '[EOS]']

In [205]:
def collate_fn_en(batch):
  sentences = []
  for sent in batch:
    sentences.append(sent[0])
  outputs = tokenizer_en.encode_batch(sentences)
  input_ids = []
  for i in range(len(outputs)):
    input_ids.append(outputs[i].ids)
  return torch.tensor(input_ids, dtype=torch.long)

def collate_fn_vi(batch):
  sentences = []
  for sent in batch:
    sentences.append(sent[0])
  outputs = tokenizer_vi.encode_batch(sentences)
  input_ids = []
  for i in range(len(outputs)):
    input_ids.append(outputs[i].ids)
  return torch.tensor(input_ids, dtype=torch.long)

class CustomDataset(Dataset):
  def __init__(self, tokenizer, sentences, max_input_length=16):
    self.tokenizer = tokenizer
    self.sentences = sentences
    self.max_input_length = max_input_length
  
  def __len__(self):
    return len(self.sentences)
  
  def __getitem__(self, idx):
    sents = [self.sentences[idx]]
    sents_list = []
    for sent in sents:
      sents_list.append(sent)
    return sents_list

In [207]:
# Sanity check DataLoader
sample_sentences_en = en_sentences[:4]
sample_dataset_en = CustomDataset(tokenizer_en, sample_sentences_en)
sample_dataloader_en = DataLoader(dataset=sample_dataset_en, batch_size=2, shuffle=False, drop_last=True, collate_fn=collate_fn_en)

for sample in sample_dataloader_en:
  print(sample.shape)
  break

sample_sentences_vi = vi_sentences[:4]
sample_dataset_vi = CustomDataset(tokenizer_vi, sample_sentences_vi)
sample_dataloader_vi = DataLoader(dataset=sample_dataset_vi, batch_size=2, shuffle=False, drop_last=True, collate_fn=collate_fn_vi)

for sample in sample_dataloader_vi:
  print(sample.shape)
  break

torch.Size([2, 16])
torch.Size([2, 16])
