In [None]:
# !python -m spacy download en
# !python -m spacy download de

In [1]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths   = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths  = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')


2023-09-29 11:08:00.555139: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-29 11:08:00.710187: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-29 11:08:01.625032: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib/python3.8/site-packages

In [2]:
# Visualize torchtex.utils.get_tokenize
token = de_tokenizer("You can now install TorchText using pip!")
print(token)
# Visualize Counter
counter = Counter(['a', 'a', 'b'])
print(counter)

['You', 'can', 'now', 'install', 'TorchText', 'using', 'pip', '!']
Counter({'a': 2, 'b': 1})


### PREPROCESSING DATA

In [5]:
def build_vocab(file_path, tokenizer):
    counter = Counter()
    with io.open(file_path, encoding="utf8") as f: 
        for string_ in f:
            counter.update(tokenizer(string_))
    return Vocab(counter)
    

In [6]:
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

In [7]:
# Visualizing tonize in biuld_vocab
data = [
    'Two young, White males are outside near many bushes.' 
    ,'Several men in hard hats are operating a giant pulley system.'
    ,'A little girl climbing into a wooden playhouse.'
    ,'A man in a blue shirt is standing on a ladder cleaning a window.'
    ,'Two men are at the stove preparing food.'
    ,'A man in green holds a guitar while the other man observes his shirt.'
    ,'A man is smiling at a stuffed lion'
    ,'A trendy girl talking on her cellphone while gliding slowly down the street.'
    ,'A woman with a large purse is walking by a gate.'
]

for string_ in data[:2]:
    print(en_tokenizer(string_))
    counter.update(en_tokenizer(string_))
    print(counter)


['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
Counter({'a': 2, 'b': 1, 'Two': 1, 'young': 1, ',': 1, 'White': 1, 'males': 1, 'are': 1, 'outside': 1, 'near': 1, 'many': 1, 'bushes': 1, '.': 1})
['Several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']
Counter({'a': 3, 'are': 2, '.': 2, 'b': 1, 'Two': 1, 'young': 1, ',': 1, 'White': 1, 'males': 1, 'outside': 1, 'near': 1, 'many': 1, 'bushes': 1, 'Several': 1, 'men': 1, 'in': 1, 'hard': 1, 'hats': 1, 'operating': 1, 'giant': 1, 'pulley': 1, 'system': 1})


In [11]:
def data_process(filepaths): 
    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
    data = [] 
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
        de_tensor = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)], dtype=torch.long)
        en_tensor = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long)
        
        data.append((de_tensor, en_tensor))
    return data

In [12]:
train_data = data_process(train_filepaths)
val_data   = data_process(val_filepaths)
test_data  = data_process(test_filepaths)

In [29]:
# Visualize in data process function 
raw_en_iter = iter(io.open(train_filepaths[1], encoding="utf8"))
print(list(raw_en_iter)[:10], "\n")
raw_en_iter = list(raw_en_iter)[:10]

['Two young, White males are outside near many bushes.\n', 'Several men in hard hats are operating a giant pulley system.\n', 'A little girl climbing into a wooden playhouse.\n', 'A man in a blue shirt is standing on a ladder cleaning a window.\n', 'Two men are at the stove preparing food.\n', 'A man in green holds a guitar while the other man observes his shirt.\n', 'A man is smiling at a stuffed lion\n', 'A trendy girl talking on her cellphone while gliding slowly down the street.\n', 'A woman with a large purse is walking by a gate.\n', 'Boys dancing on poles in the middle of the night.\n'] 



In [30]:
torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long)

tensor([17458,  7359, 14847,   302,  7378, 31707,     8,  2323,    45,   615,
         9923,    51,  6871, 31707,     9,   248,  3963,  6175,     7,  7378,
           71, 14847,  9923,   460, 27623, 29000])

### DATALOADER

In [34]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 32 

PAD = de_vocab['<pad>']
BOS = de_vocab['<bos>']
EOS = de_vocab['<eos>']

In [74]:
def generate_batch(data_batch): 
    de_batch, en_batch = [], [] 
    for (de_item, en_item) in data_batch: 
        de_batch.append(torch.cat([torch.tensor([BOS]), de_item, torch.tensor([EOS])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS]), de_item, torch.tensor([EOS])], dim=0))
    de_batch = pad_sequence(de_batch, padding_value=PAD)
    en_batch = pad_sequence(en_batch, padding_value=PAD)
    return de_batch, en_batch
        

train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
test_iter  = DataLoader(test_data,  batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
val_iter   = DataLoader(val_data,   batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [75]:
# Visualize in one batch_size

batch = train_data[:7]
de_batch, en_batch = [], [] 
for (de_item, en_item) in batch: 
    print("de",de_item)
    print("en",en_item)
    
    de_batch.append(torch.cat([torch.tensor([BOS]), de_item, torch.tensor([EOS])], dim=0))
    en_batch.append(torch.cat([torch.tensor([BOS]), en_item, torch.tensor([EOS])], dim=0))
    

de tensor([ 3177,   498,   143,  1662,   490,  3079,   466, 11832,  4501,   383,
            2,     5, 28809, 29000])
en tensor([ 3116,  2074,  3963,    23,    39,  3716,   870,   569,   120,    19,
        27623, 29000])
de tensor([  502,  1662,  8838,    33,     9,  4947,     1, 28809, 29000])
en tensor([  280,  1612, 14847,   120,   146,  3716,    22, 31707,    43,     3,
            6, 27623, 29000])
de tensor([13904,   754,  2121,   171, 11832,  4947,     2,   906,    61, 28809,
        29000])
en tensor([17458,   843,  1667,   193,   711, 31707,   174,     3, 27623, 29000])
de tensor([13904,  7805, 11832, 13697,   992,  1202,  1778,  8686,  6750,    56,
         8925,    43,  4947,   148, 28809, 29000])
en tensor([17458,  7359, 14847, 31707,  1871,  2323,  7524,  1612,  8019, 31707,
           60,    60, 31707,   183, 27623, 29000])
de tensor([ 3177,  1662,   926,   902,    17,  8925,    78,   232,  1905, 28809,
        29000])
en tensor([ 3116,  1612,  3716,  2908,  9923,    23,

In [76]:
print([torch.tensor([BOS])])
print([de_item])
print([torch.tensor([EOS])])

# All will concatenate

[tensor([0])]
[tensor([13904,  7805,   234,  3479,     3,     3,  3044, 28809, 29000])]
[tensor([0])]


### Defining Model

In [125]:
import random 
from typing import Tuple 

import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F
from torch import Tensor

class Encoder(nn.Module): 
    def __init__(
        self,
        input_dim: int, 
        emb_dim: int, 
        enc_hid_dim: int, 
        dec_hid_dim: int, 
        dropout: float
    ): 
        super().__init__()
        self.input_dim = input_dim 
        self.emb_dim = emb_dim 
        self.enc_hid_dim = enc_hid_dim 
        self.dec_hid_dim = dec_hid_dim 
        self.dropout = dropout 
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(
        self,
        src: Tensor
    ) -> Tuple[Tensor]:
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:],
                                               hidden[-1,:,:]),
                                               dim = 1)))
        return outputs, hidden
    
    
class Attention(nn.Module): 
    def __init__(
        self,
        enc_hid_dim:int,
        dec_hid_dim:int,
        attn_dim:int
    ):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim 
        self.attn = nn.Linear(self.attn_in, attn_dim)
        
    def forward(
        self,
        secoder_hidden:Tensor,
        encoder_outputs:Tensor
    ) -> Tensor: 
        sec_len = encoder_outputs.shape[0]
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat(
            repeated_decoder_hidden, 
            encoder_outputs,
            dim =2
        )))
        attention = torch.sum(encergy, dim=2)
        return f.softmax(attention, dim=1)
        

class Decoder(nn.Module):
    def __init__(
        self,
        output_dim:int,
        emb_dim: int,
        enc_hid_dim: int,
        dec_hid_dim: int,
        dropout: int, 
        attention: nn.Module
    ):
        super().__init__()
        self.emb_dim = emb_dim 
        self.enc_hid_dim = enc_hid_dim 
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim 
        self.dropout = dropout
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim) 
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim) 
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def _weighted_encoder_rep(
        self,
        decoder_hidden: Tensor,
        encoder_outputs: Tensor
    ) -> Tensor: 
        a = self.attention(decoder_hidden, encoder_outputs)
        a = a.unsqueze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted_encoder_rep = torch.bmm(a, encoder_outputs)
        weighted_encoder_erp = weighted_encoder_rep.permute(1, 0, 2)
        return weighted_encoder_rep
    
    def forward(
        self,
        input: Tensor, 
        decoder_hidden: Tensor, 
        encoder_outputs: Tensor
   ) -> Tuple[Tensor]: 
        
        input_ = input_.unsqueenze(0)
        embedded = self.dropout(self.embedding(input_))
        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                         encoder_outputs)
        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weightted_encoder_rep = weighted_encoder_rep.squeeze(0)
        output = self.out(torch.cat((
            output,
            weighted_encoder_rep,
            embedded),
            dim = 1 
        ))
        return output, decoder_hidden.squeeze(0)
        
        
class Seq2Seq(nn.Module): 
    def __init__(
        self,
        encoder: nn.Module,
        decoder: nn.Module,
        device: torch.device
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device 
        
    def forward(
        self,
        src:Tensor,
        trg:Tensor,
        teach_forcing_ratio: float = 0.5
    ) -> Tensor:
        
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden =self.encoder(src)
        output = trg[0, :]
        
        for t in range(1, max_len): 
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output 
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
            
        return outputs
    
    

In [126]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embbeding_dim = 32
decoder_embbeding_dim = 32 
encoder_hidden_dim = 64
decoder_hidden_dim = 64 
attention_dim = 8 
encoder_dropout = 0.5 
decoder_dropout = 0.5 

encoder = Encoder(
                  input_dim,
                  encoder_embbeding_dim,
                  encoder_hidden_dim,
                  decoder_hidden_dim,
                  encoder_dropout
                 )

attention = Attention(
                  encoder_hidden_dim,
                  decoder_hidden_dim,
                  attention_dim
                 )

decoder = Decoder(
                  output_dim, 
                  decoder_embbeding_dim,
                  encoder_hidden_dim,
                  decoder_hidden_dim,
                  decoder_dropout,
                  attention
                 )

model = Seq2Seq(encoder, decoder, device).to(device)

In [127]:
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


model.apply(init_weights)

optimizer = optim.Adam(model.parameters())


def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,489,914 trainable parameters


In [None]:
import math
import time

# PAD = en_vocab.get_itos['<pad>']
# criterion = nn.CrossEntropyLoss(ignore_index=PAD)
criterion = nn.CrossEntropyLoss()

def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):

    model.train()

    epoch_loss = 0

    for _, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for _, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg, 0) #turn off teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, test_iter, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')