### Refs
* https://towardsdatascience.com/how-to-use-torchtext-for-neural-machine-translation-plus-hack-to-make-it-5x-faster-77f3884d95
* https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec
* https://github.com/SamLynnEvans/Transformer

In [None]:
pip install spacy torchtext pandas

In [None]:
import torchtext
torchtext.__version__

### Download the European Parliament Proceedings Parallel Corpus 1996–2011

In [None]:
# Fetch the French-English archive of the Europarl v7 corpus.
!wget https://www.statmt.org/europarl/v7/fr-en.tgz

In [None]:
# Unpack the downloaded archive (z: gzip, x: extract, v: verbose, f: archive file).
!tar zxvf fr-en.tgz

In [None]:
# Download the small English and French spaCy pipelines that are loaded
# later with spacy.load.
!spacy download  en_core_web_sm # https://spacy.io/models/en
!spacy download fr_core_news_sm

### Prepare datasets

In [None]:
# Read both sides of the parallel corpus into lists with one entry per line.
# The context managers close each file handle as soon as its text is read
# instead of leaving it open until garbage collection.
with open('europarl-v7.fr-en.en', encoding='utf-8') as en_file:
    europarl_en = en_file.read().split('\n')
with open('europarl-v7.fr-en.fr', encoding='utf-8') as fr_file:
    europarl_fr = fr_file.read().split('\n')

In [None]:
import pandas as pd
# Pair the two corpora by position. The lists are handed to pandas directly:
# copying them element by element first only costs time and memory.
raw_data = {'English': europarl_en, 'French': europarl_fr}
df = pd.DataFrame(raw_data, columns=["English", "French"])

# Sentence length is approximated by the number of spaces in the sentence.
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')

# Remove very long sentences (80 or more spaces in either language) ...
df = df.query('fr_len < 80 & eng_len < 80')
# ... and pairs whose translations are not of roughly equal length
# (one side at least 1.5 times as long as the other).
df = df.query('fr_len < eng_len * 1.5 & fr_len * 1.5 > eng_len')
df

In [None]:
from sklearn.model_selection import train_test_split
# Create the train and validation sets (10% of the rows go to validation).
# A fixed random_state makes the shuffle, and therefore the contents of the
# two CSV files, reproducible from one run of the notebook to the next.
train, val = train_test_split(df, test_size=0.1, random_state=42)

# index=False keeps the DataFrame index out of the CSV files.
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)

### Showcase the construction of a custom Dataset 

In [None]:
from torch.utils.data import IterableDataset, DataLoader
class MyIterableDataset(IterableDataset):
    """Toy dataset that yields (letter, position-as-string) pairs for 'a'..'j'."""

    def __iter__(self):
        letters = 'abcdefghij'
        positions = map(str, range(len(letters)))
        # Materialise the pairs first so a plain list iterator is returned.
        pairs = list(zip(letters, positions))
        return iter(pairs)

# The dataset yields 10 samples, so batch_size=4 gives batches of 4, 4 and 2.
print(list(DataLoader(MyIterableDataset(), batch_size=4)))

### Tokenization

In [None]:
import pandas as pd

# Load only the two text columns and a small number of rows from each split
# (200 for training, 100 for validation).
df_train = pd.read_csv(
    "train.csv",
    usecols=['English', 'French'],
    nrows=200,
)
df_val = pd.read_csv(
    "val.csv",
    usecols=['English', 'French'],
    nrows=100,
)
display(df_train)

In [None]:
import spacy # a language-aware tokenizer library

# Load the English and French spaCy pipelines; the tokenizer helpers defined
# after this use their `.tokenizer` attribute.
en = spacy.load('en_core_web_sm') # https://spacy.io/models/en      provides the English tokenizer
fr = spacy.load('fr_core_news_sm')

def tokenizer_en(sentence):
    """Return the text of each token the English spaCy tokenizer finds in `sentence`."""
    tokens = en.tokenizer(sentence)
    return [token.text for token in tokens]
def tokenizer_fr(sentence):
    """Return the text of each token the French spaCy tokenizer finds in `sentence`."""
    tokens = fr.tokenizer(sentence)
    return [token.text for token in tokens]

In [None]:
from collections import Counter
from torchtext.vocab import vocab, Vocab

# Collect every token of the loaded training and validation sentences.
coll_en = []
coll_fr = []
for frame in (df_train, df_val):
    # Columns are selected by name so the pairing does not rely on column order.
    for sentence_en, sentence_fr in zip(frame['English'], frame['French']):
        coll_en.extend(tokenizer_en(sentence_en))
        coll_fr.extend(tokenizer_fr(sentence_fr))

# Build one vocabulary per language from the token frequencies, with the
# special tokens registered alongside the corpus tokens.
vocab_en = vocab(Counter(coll_en), specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
vocab_fr = vocab(Counter(coll_fr), specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))

# Without a default index, looking up a token that is not in the vocabulary
# raises RuntimeError. Map such out-of-vocabulary tokens to '<unk>' instead.
vocab_en.set_default_index(vocab_en['<unk>'])
vocab_fr.set_default_index(vocab_fr['<unk>'])

In [None]:
def text_transform_en(x):
    """Encode an English sentence as vocabulary indices framed by <BOS> and <EOS>."""
    indices = [vocab_en['<BOS>']]
    indices.extend(vocab_en[token] for token in tokenizer_en(x))
    indices.append(vocab_en['<EOS>'])
    return indices


def text_transform_fr(x):
    """Encode a French sentence as vocabulary indices framed by <BOS> and <EOS>."""
    indices = [vocab_fr['<BOS>']]
    indices.extend(vocab_fr[token] for token in tokenizer_fr(x))
    indices.append(vocab_fr['<EOS>'])
    return indices

# Show the index sequence text_transform_en produces for a sample English sentence.
print("input to the text_transform_en:", "here is an example")
print("output of the text_transform_en:", text_transform_en("here is an example"))

# Show the index sequence text_transform_fr produces for a sample French sentence.
print("input to the text_transform_fr:", 'Merci beaucoup, Monsieur de Silguy.')
print("output of the text_transform_fr:", text_transform_fr('Merci beaucoup, Monsieur de Silguy.'))

In [None]:
import torch 
from torch.utils.data import IterableDataset, DataLoader

class train(IterableDataset):
    """Iterable dataset over the rows of the `df_train` DataFrame."""

    def __iter__(self):
        # The 'split' orientation exposes the row values under the 'data' key.
        rows = df_train.to_dict('split')['data']
        return iter(rows)
    
class val(IterableDataset):
    """Iterable dataset over the rows of the `df_val` DataFrame."""

    def __iter__(self):
        # The 'split' orientation exposes the row values under the 'data' key.
        rows = df_val.to_dict('split')['data']
        return iter(rows)

from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Convert a batch of (English, French) sentence pairs into padded index tensors.

    Args:
        batch: iterable of (sentence_en, sentence_fr) pairs of raw text.

    Returns:
        A tuple (coll_en, coll_fr). Each is the result of pad_sequence over
        the per-sentence index tensors, with shorter sentences filled up
        using the '<PAD>' index of the matching vocabulary.
    """
    coll_en, coll_fr = [], []
    for sentence_en, sentence_fr in batch:
        coll_en.append(torch.tensor(text_transform_en(sentence_en)))
        coll_fr.append(torch.tensor(text_transform_fr(sentence_fr)))
    # pad_sequence fills with 0 by default, which is the '<unk>' index here;
    # pad explicitly with '<PAD>' so padding and unknown words stay distinct.
    coll_en = pad_sequence(coll_en, padding_value=vocab_en['<PAD>'])
    coll_fr = pad_sequence(coll_fr, padding_value=vocab_fr['<PAD>'])
    return coll_en, coll_fr
 
# Batch the training pairs eight at a time; collate_batch converts each batch
# of raw sentence pairs into a pair of padded index tensors.
train_dl = DataLoader(train(), batch_size=8, collate_fn=collate_batch) 

# Fetch the first batch so the notebook displays the collated tensors.
next(iter(train_dl))

In [None]:
# Decode the English side of one batch back into text. The names `batch` and
# `EN_TEXT` used previously are not defined in this notebook, so the batch is
# taken from train_dl and the indices are decoded with vocab_en.
batch_en, batch_fr = next(iter(train_dl))
itos_en = vocab_en.get_itos()  # index -> token lookup table
eos_index = vocab_en['<EOS>']

# Transpose so that each row of the iteration is one sentence.
for sentence in batch_en.T:
    indices = sentence.tolist()
    # Every encoded sentence ends with <EOS>; anything after it is padding,
    # so cutting there removes the padding whatever value was used for it.
    indices = indices[:indices.index(eos_index) + 1]
    print(' '.join(itos_en[word_index] for word_index in indices))

<img title="a title" alt="Alt text" src="https://miro.medium.com/max/380/1*2vyKzFlzIHfSmOU_lnQE4A.png">

In [None]:
### Embedding

In [None]:
# `nn` is not imported by any earlier cell shown here, so bring it into scope.
from torch import nn


class Embedder(nn.Module):
    """Look up a learned vector for each token index.

    Args:
        vocab_size: number of entries in the embedding table.
        d_model: size of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embedding vectors for the token indices in `x`."""
        return self.embed(x)