<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_Seq2Seq/test_sample_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the repository and switch into the Seq2Seq directory; presumably this is
# what lets the later `from model import Seq2Seq` resolve -- confirm that
# model.py lives in modeling_Seq2Seq/.
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_Seq2Seq/

In [None]:
# Install the third-party packages used below. `%pip` (unlike `!pip`) installs
# into the environment of the running kernel, so the imports that follow see
# the freshly installed packages.
%pip install -q datasets tokenizers

In [5]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from model import Seq2Seq

In [None]:
# Load the 'iwslt2015-en-vi' configuration of the 'mt_eng_vietnamese' dataset.
dataset = load_dataset(path='mt_eng_vietnamese', name='iwslt2015-en-vi')

In [7]:
# Peek at the first training pair. Indexing the row before the column fetches a
# single example instead of reading the whole 'translation' column first.
dataset['train'][0]['translation']

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

In [8]:
# Split the training pairs into two parallel lists, one per language.
train_sentences = dataset['train']
translation_pairs = [row['translation'] for row in train_sentences]
en_sentences = [pair['en'] for pair in translation_pairs]
vi_sentences = [pair['vi'] for pair in translation_pairs]

In [9]:
# One independent BPE tokenizer and one trainer per language, all configured
# identically: "[UNK]" as the unknown token and "[UNK]", "[EOS]", "[PAD]"
# registered as special tokens.
tokenizer_en, tokenizer_vi = (
    Tokenizer(BPE(unk_token="[UNK]")) for _ in range(2)
)
trainer_en, trainer_vi = (
    BpeTrainer(special_tokens=["[UNK]", "[EOS]", "[PAD]"]) for _ in range(2)
)

In [10]:
# Give each tokenizer its own Whitespace pre-tokenizer.
for lang_tokenizer in (tokenizer_en, tokenizer_vi):
  lang_tokenizer.pre_tokenizer = Whitespace()

In [11]:
# Train each tokenizer on the sentences of its own language.
training_jobs = (
    (tokenizer_en, en_sentences, trainer_en),
    (tokenizer_vi, vi_sentences, trainer_vi),
)
for lang_tokenizer, corpus, lang_trainer in training_jobs:
  lang_tokenizer.train_from_iterator(corpus, lang_trainer)

In [12]:
# Paper uses the following format:
# en_sentence [EOS] --> vi_sentence [EOS]
# so each tokenizer appends its own "[EOS]" token to every encoded sentence.
for lang_tokenizer in (tokenizer_en, tokenizer_vi):
  lang_tokenizer.post_processor = TemplateProcessing(
      single="$A [EOS]",
      special_tokens=[("[EOS]", lang_tokenizer.token_to_id("[EOS]"))],
  )

In [13]:
# Sample tokenization of an English sentence: encode a hand-written sentence
# with the English tokenizer and display the resulting tokens (the configured
# post-processor appends "[EOS]").
sample_sentence = "This is amazing and great!"
print('Sentence --> ', sample_sentence)
output = tokenizer_en.encode(sample_sentence)
output.tokens

Sentence -->  This is amazing and great!


['This', 'is', 'amazing', 'and', 'great', '!', '[EOS]']

In [14]:
# Sample tokenization of a Vietnamese sentence: encode the first training
# sentence with the Vietnamese tokenizer and display the resulting tokens.
first_vi_sentence = vi_sentences[0]
print('Sentence --> ', first_vi_sentence)
output = tokenizer_vi.encode(first_vi_sentence)
output.tokens

Sentence -->  Khoa học đằng sau một tiêu đề về khí hậu


['Khoa',
 'học',
 'đằng',
 'sau',
 'một',
 'tiêu',
 'đề',
 'về',
 'khí',
 'hậu',
 '[EOS]']