In [1]:
!pip install torchtext==0.15.2 torch==2.0.1 numpy 'portalocker>=2.0.0'



In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

In [3]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation suc

In [4]:
if torch.cuda.is_available():
    device=torch.device(type='cuda',index=0)
else:
    device=torch.device(type='cpu',index=0)
print(device)

cuda:0


In [42]:
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

In [6]:
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [7]:
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    if language=='de':
        for data_sample in data_iter:
            yield de_tokenizer(data_sample[0])
    elif language=='en':
        for data_sample in data_iter:
            yield en_tokenizer(data_sample[1])

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# build vocab for German
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

vocab_de=build_vocab_from_iterator(yield_tokens(train_iter, 'de'),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
vocab_de.set_default_index(UNK_IDX)


# build vocab for English
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
vocab_en=build_vocab_from_iterator(yield_tokens(train_iter, 'en'),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)
vocab_en.set_default_index(UNK_IDX)

In [8]:
sentence = "Ein Hund springt in die Luft, um eine orange Frisbeescheibe zu fangen."
tokens = sentence.split()
token_ids = [vocab_de.get_stoi().get(word, UNK_IDX) for word in tokens]  # Use get_stoi() for access

In [9]:
print(token_ids)

[5, 33, 71, 7, 19, 0, 66, 18, 806, 1400, 29, 0]


In [10]:
torch.save(vocab_de, 'vocab_de.pth')
torch.save(vocab_en, 'vocab_en.pth')


In [11]:
print(type(vocab_en))
print("English Vocab Length:",vocab_en.__len__())
print("German Vocab Length:",vocab_de.__len__())

<class 'torchtext.vocab.vocab.Vocab'>
English Vocab Length: 10837
German Vocab Length: 19214


In [12]:
def collate_fn(batch):
    src_batch, tgt_batch, src_len = [], [], []
    for src_sample, tgt_sample in batch:

        src_sample = src_sample.rstrip("\n")  # string
        tgt_sample = tgt_sample.rstrip("\n")

        src_tokens = de_tokenizer(src_sample)  # sentence to list of word tokens
        tgt_tokens = en_tokenizer(tgt_sample)

        src_ids = vocab_de(src_tokens)  # list of word tokens to list of ids
        tgt_ids = vocab_en(tgt_tokens)

        src_ids.append(EOS_IDX)  # append <EOS> to list
        tgt_ids.append(EOS_IDX)

        tgt_ids.insert(0, BOS_IDX)  # insert <BOS> at the beginning of list

        src_len.append(len(src_ids))  # [batch_size,]

        src_tensor = torch.tensor(src_ids)  # [seq_len_src]
        tgt_tensor = torch.tensor(tgt_ids)  # [seq_len_tgt]

        src_batch.append(src_tensor)  # list of tensors
        tgt_batch.append(tgt_tensor)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)  # [batch_size, max_seq_len_src]
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)  # [batch_size, max_seq_len_tgt]
    return src_batch, tgt_batch, src_len

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.e = nn.Embedding(input_size, embed_size)  # input_size: vocab_size, embed_size: embedding dimension
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)  # embed_size: input size, hidden_size: output size

    def forward(self, x, lengths):
        x = self.e(x)  # x: [batch_size, seq_len] -> output: [batch_size, seq_len, embed_size]
        x = self.dropout(x)
        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)  # [batch_size, seq_len, embed_size]

        outputs, hidden = self.gru(x)  # outputs: [batch_size, seq_len, hidden_size], hidden: [1, batch_size, hidden_size]
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)  # outputs: [batch_size, seq_len, hidden_size]

        return outputs, hidden

In [14]:
class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size):
        super().__init__()
        self.e = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout()
        self.gru = nn.GRU(embed_size + hidden_size, hidden_size, batch_first=True)
        self.lin = nn.Linear(hidden_size, output_size)
        self.lsoftmax = nn.LogSoftmax(dim=-1)

    def forward(self, x, context, prev_hidden):
        x = self.e(x)  # x: [batch_size, seq_len] -> [batch_size, seq_len, embed_size]
        x = self.dropout(x)
        x = torch.cat((x, context), dim=2)  # [batch_size, seq_len, embed_size + hidden_size]

        output, hidden = self.gru(x, prev_hidden)  # output: [batch_size, seq_len, hidden_size], hidden: [1, batch_size, hidden_size]

        y = self.lin(output)  # y: [batch_size, seq_len, output_size]
        y = self.lsoftmax(y)  # y: [batch_size, seq_len, output_size]

        return y, hidden

In [15]:
class Bahdanau_Attention(nn.Module):
    def __init__(self,encoder_hidden_size,decoder_hidden_size,new_hidden_size):
        super().__init__()
        self.eh2nh=nn.Linear(in_features=encoder_hidden_size, out_features=new_hidden_size)
        self.dh2nh=nn.Linear(in_features=decoder_hidden_size, out_features=new_hidden_size)
        self.score=nn.Linear(in_features=new_hidden_size, out_features=1)

    def forward(self,query,keys):
        query=self.dh2nh(query)
        keys=self.eh2nh(keys)
        att_score=self.score(torch.tanh(query.permute(1,0,2) + keys))
        att_score=att_score.squeeze(2).unsqueeze(1)
        att_weights=F.softmax(att_score, dim=-1)
        context=torch.bmm(att_weights,keys)


        return context, att_weights

In [22]:
def train_one_epoch(train_dataloader):
    encoder.train()
    decoder.train()
    total_loss = 0


    for idx, (src_ids, tgt_ids, src_len) in enumerate(train_dataloader):
        src_ids = src_ids.to(device)  # src_ids: [batch_size, seq_len_src]
        tgt_ids = tgt_ids.to(device)  # tgt_ids: [batch_size, seq_len_tgt]

        encoder_outputs, encoder_hidden = encoder(src_ids, src_len)  # encoder_outputs: [batch_size, seq_len_src, hidden_size], encoder_hidden: [1, batch_size, hidden_size]
        decoder_hidden = encoder_hidden  # decoder_hidden: [1, batch_size, hidden_size]

        predictions = []

        for j in range(0, tgt_ids.shape[1] - 1):  # tgt_ids.shape[1] -> seq_len_tgt
            context, attention_weights = ba(decoder_hidden, encoder_outputs)  # context: [batch_size, seq_len_src, hidden_size]

            probs, decoder_hidden = decoder(tgt_ids[:, j].unsqueeze(1), context, decoder_hidden)  # probs: [batch_size, 1, output_size], decoder_hidden: [1, batch_size, hidden_size]
            predictions.append(probs)

        predictions_cat = torch.cat(predictions, dim=1)  # predictions_cat: [batch_size, seq_len_tgt - 1, output_size]
        predictions_reshaped = predictions_cat.view(-1, predictions_cat.shape[-1])  # predictions_reshaped: [batch_size * (seq_len_tgt - 1), output_size]

        target = tgt_ids[:, 1:]  # target: [batch_size, seq_len_tgt - 1]
        target = target.reshape(-1)  # target: [batch_size * (seq_len_tgt - 1)]

        loss = loss_fn(predictions_reshaped, target)
        total_loss += loss.item()

        opte.zero_grad()
        optd.zero_grad()
        optba.zero_grad()

        loss.backward()

        opte.step()
        optd.step()
        optba.step()

    return total_loss / (idx + 1)


In [23]:
def test_one_epoch(epoch, n_epochs, test_dataloader):
    encoder.eval()
    decoder.eval()
    ba.eval()
    total_loss = 0



    with torch.no_grad():
        for i, (src_ids, tgt_ids, src_len) in enumerate(test_dataloader):
   
            src_ids = src_ids.to(device)  # [1, source seq_len]
            tgt_ids = tgt_ids.to(device)
            src_len = torch.tensor(src_len, dtype=torch.int64)

            enc_outputs, enc_hidden = encoder(src_ids, src_len)  # enc_outputs: [batch_size, src_len, hidden_size]
            dec_hidden = enc_hidden  # [1, batch_size, hidden_size]
            input_id = tgt_ids[:, 0]  # first token (BOS) [batch_size]
            yhats = []

            if epoch + 1 == n_epochs:
                pred_sentence = ""

            for j in range(1, tgt_ids.shape[1]):  # iterate for len(tgt_ids)-1
 
                # dec_hidden: [1, batch_size, hidden_size] -> [batch_size, 1, hidden_size]
                query = dec_hidden.transpose(0, 1)
                context, attention_weights = ba(query, enc_outputs)

                probs, dec_hidden = decoder(
                    input_id.unsqueeze(1),  # [batch_size, 1]
                    context,  # [batch_size, 1, hidden_size]
                    dec_hidden  # [1, batch_size, hidden_size]
                )
                yhats.append(probs)

                _, input_id = torch.topk(probs, 1, dim=-1)  # [batch_size, 1, 1]
                input_id = input_id.squeeze(1).squeeze(1)  # [batch_size]

                if epoch + 1 == n_epochs:
                    word = vocab_en.lookup_token(input_id.item())  # batch_size=1
                    pred_sentence += word + " "

                # if EOS token is predicted, stop generating further words
                if input_id.item() == EOS_IDX:
                    break

            if epoch + 1 == n_epochs:
                src_tokens = vocab_de.lookup_tokens(src_ids.tolist()[0])
                src_sentence = " ".join([token for token in src_tokens if token not in ["<pad>", "<eos>"]])
                gt_tokens = vocab_en.lookup_tokens(tgt_ids[:, 1:].tolist()[0])
                gt_sentence = " ".join([token for token in gt_tokens if token not in ["<pad>", "<eos>"]])
                print("-----------------------------------")
                print("Source Sentence:", src_sentence)
                print("Ground Truth:", gt_sentence)
                print("Prediction:", pred_sentence.strip())

            # cancatenate the predictions and calculate loss
            
            yhats_cat = torch.cat(yhats, dim=1)  # [batch_size, tgt_len-1, vocab_size]
            yhats_reshaped = yhats_cat.view(-1, yhats_cat.shape[-1])  # [batch_size*(tgt_len-1), vocab_size]
            gt = tgt_ids[:, 1:j+1].view(-1)  # [batch_size*(tgt_len-1)]

            loss = loss_fn(yhats_reshaped, gt)
            total_loss += loss.item()

        avg_loss = total_loss / (i + 1)
        return avg_loss


In [24]:
embed_size=300
hidden_size=512
batch_size=32

encoder_path = "encoder.pth"
decoder_path = "decoder.pth"
ba_path = "ba.pth"

In [43]:
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
train_dataloader = DataLoader(train_iter, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)


test_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
test_dataloader = DataLoader(test_iter, batch_size=1, collate_fn=collate_fn, shuffle=True)

In [26]:
encoder=Encoder(vocab_de.__len__(),embed_size,hidden_size).to(device) #translation-direction sensitive
decoder=Decoder(vocab_en.__len__(),embed_size,hidden_size).to(device) #translation-direction sensitive
ba=Bahdanau_Attention(hidden_size,hidden_size,hidden_size).to(device)

# uncomment below code to load already trained models

# encoder.load_state_dict(torch.load(encoder_path))
# decoder.load_state_dict(torch.load(decoder_path))
# ba.load_state_dict(torch.load(ba_path))

encoder.to(device)
decoder.to(device)
ba.to(device)

loss_fn=nn.NLLLoss(ignore_index=1).to(device)

lr=0.001

opte = optim.Adam(params=encoder.parameters(), lr=lr, weight_decay=0.001)
optd = optim.Adam(params=decoder.parameters(), lr=lr, weight_decay=0.001)
optba = optim.Adam(params=ba.parameters(), lr=lr)

In [27]:
# training with lr=0.001

n_epochs=20

for e in range(n_epochs):
    print("Epoch=",e+1, sep="", end=", ")
    print("Train Loss=", round(train_one_epoch(train_dataloader),4))

Epoch=1, Train Loss= 3.9296
Epoch=2, Train Loss= 3.2449
Epoch=3, Train Loss= 3.0491
Epoch=4, Train Loss= 2.9085
Epoch=5, Train Loss= 2.8034
Epoch=6, Train Loss= 2.7458
Epoch=7, Train Loss= 2.7034
Epoch=8, Train Loss= 2.6744
Epoch=9, Train Loss= 2.6412
Epoch=10, Train Loss= 2.6224
Epoch=11, Train Loss= 2.5972
Epoch=12, Train Loss= 2.576
Epoch=13, Train Loss= 2.5608
Epoch=14, Train Loss= 2.5468
Epoch=15, Train Loss= 2.5396
Epoch=16, Train Loss= 2.5255
Epoch=17, Train Loss= 2.5225
Epoch=18, Train Loss= 2.5122
Epoch=19, Train Loss= 2.5083
Epoch=20, Train Loss= 2.5038


In [29]:
lr=0.0001

opte = optim.Adam(params=encoder.parameters(), lr=lr, weight_decay=0.001)
optd = optim.Adam(params=decoder.parameters(), lr=lr, weight_decay=0.001)
optba = optim.Adam(params=ba.parameters(), lr=lr)

In [30]:
# training with lr=0.0001

n_epochs=20

for e in range(n_epochs):
    print("Epoch=",e+1, sep="", end=", ")
    print("Train Loss=", round(train_one_epoch(train_dataloader),4))

Epoch=1, Train Loss= 2.3726
Epoch=2, Train Loss= 2.2533
Epoch=3, Train Loss= 2.1787
Epoch=4, Train Loss= 2.1235
Epoch=5, Train Loss= 2.085
Epoch=6, Train Loss= 2.0543
Epoch=7, Train Loss= 2.0294
Epoch=8, Train Loss= 2.0107
Epoch=9, Train Loss= 1.9919
Epoch=10, Train Loss= 1.9798
Epoch=11, Train Loss= 1.9657
Epoch=12, Train Loss= 1.9565
Epoch=13, Train Loss= 1.9457
Epoch=14, Train Loss= 1.937
Epoch=15, Train Loss= 1.927
Epoch=16, Train Loss= 1.9191
Epoch=17, Train Loss= 1.9144
Epoch=18, Train Loss= 1.9071
Epoch=19, Train Loss= 1.9001
Epoch=20, Train Loss= 1.8969


In [44]:
n_epochs=1

for e in range(n_epochs):
    print("Epoch=",e+1, sep="", end=", ")
    print("Eval Loss=",round(test_one_epoch(e,n_epochs, test_dataloader),4))

Epoch=1, -----------------------------------
Source Sentence: Ein sitzender Mann arbeitet mit seinen Händen .
Ground Truth: A seated man is working with his hands .
Prediction: A man is sitting on his man is using his
-----------------------------------
Source Sentence: Eine Frau in einem schwarzen Kleid schiebt einen Wagen mit <unk> einen gepflasterten Fußgängerweg entlang .
Ground Truth: A woman in a black dress is pushing a cart with <unk> down a paved walkway .
Prediction: A woman in a black dress is pushing a cart with a brick path on a stone path
-----------------------------------
Source Sentence: Ein Mann mit einer grauen Mütze , Trägershirt und schwarzen Shorts macht einen Handstand .
Ground Truth: A man in a gray hat and tank top with black shorts doing a handstand .
Prediction: A man in a gray hat , top and black shorts is doing a handstand . <eos>
-----------------------------------
Source Sentence: Zwei kurzhaarige Frauen stehen einander gegenüber und die blonde Frau spric

In [32]:
# save the state_dicts
torch.save(encoder.state_dict(), encoder_path)
torch.save(decoder.state_dict(), decoder_path)
torch.save(ba.state_dict(), ba_path)