In [None]:
!pip install datasets
!pip install -U torchdata
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install bert_score
!pip install --index-url https://test.pypi.org/simple/ pymeteor
!pip install evaluate

from nltk.translate.bleu_score import sentence_bleu
import evaluate
import torch
import pymeteor.pymeteor as pymeteor
from bert_score import BERTScorer
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
import matplotlib.pyplot as plt
from torchtext.data.metrics import bleu_score
from nltk.translate import meteor_score
from bert_score import score
from torch import Tensor
import torch.nn as nn
from torch.nn import Transformer
import math
import pandas as pd
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from torchdata.datapipes.iter import ShardingFilter
from nltk.translate.bleu_score import SmoothingFunction

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation succ

In [3]:
# Load the training dataset
train_data = load_dataset("wmt16", "de-en", split="train[:50000]")
print(type(train_data))

# Load the validation dataset
val_data = load_dataset("wmt16", "de-en", split="validation")

# Display the first few examples from each dataset
print("Training Examples:")
print(train_data[0])
print("\nValidation Examples:")
print(val_data[0])

final_train_data = []
for data in train_data['translation']:
    final_train_data.append((data['de'], data['en']))

final_val_data = []
for data in val_data['translation']:
    final_val_data.append((data['de'], data['en']))

final_train_data = ShardingFilter(final_train_data)
final_val_data = ShardingFilter(final_val_data)

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Placeholder for token and vocab transformations
token_transform = {}
vocab_transform = {}

# Tokenizers for German and English
token_transform['de'] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform['en'] = get_tokenizer('spacy', language='en_core_web_sm')

# Helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# Create torchtext's Vocab object for both languages
for ln in ['de', 'en']:
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(final_train_data, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index
for ln in ['de', 'en']:
    vocab_transform[ln].set_default_index(UNK_IDX)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/283M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/305M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4548885 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2999 [00:00<?, ? examples/s]

<class 'datasets.arrow_dataset.Dataset'>
Training Examples:
{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}

Validation Examples:
{'translation': {'de': 'Die Premierminister Indiens und Japans trafen sich in Tokio.', 'en': 'India and Japan prime ministers meet in Tokyo'}}


In [4]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)


def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform)


def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = final_train_data

    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    train_dataloader = tqdm(train_dataloader, desc="Training", leave=False)
    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        tgt_input = tgt[:-1, :]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
        optimizer.zero_grad()
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        losses += loss.item()
    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0
    val_iter = final_val_data

    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        tgt_input = tgt[:-1, :]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
    return losses / len(list(val_dataloader))



In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 10
train_losses = []
val_losses = []

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    train_losses.append(train_loss)
    val_losses.append(val_loss)
torch.save(transformer.state_dict(), '/German_to_English_model.pth')
torch.save(transformer, "/German_to_English_completemodel.pth")


def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

# Task 2: Generate Loss Plot After Training is Complete
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Time')
plt.legend()
plt.savefig('/training_validation_loss_plot.png')
plt.show()

# Task 3: Evaluation with Multiple Metrics
weights1 = (1, 0, 0, 0)
weights2 = (0, 1, 0, 0)
weights3 = (0, 0, 1, 0)
weights4 = (0, 0, 0, 1)

def compute_meteor(reference_corpus, translation_corpus):
    return meteor_score.meteor_score(reference_corpus, translation_corpus)

def compute_bertscore(reference_corpus, translation_corpus):
    _, _, bert_scores = score(translation_corpus, reference_corpus, lang='en', verbose=False)
    return bert_scores.mean().item()

# Evaluation on validation data
val_references = [[token_transform[TGT_LANGUAGE](token) for token in example['translation']['en'].split()] for example in val_data]
val_translations = [[token_transform[TGT_LANGUAGE](token) for token in translate(transformer, example['translation']['de']).split()] for example in val_data]

val_references_formatted = [[sum(sentence, [])] for sentence in val_references]
val_translations_formatted = [[token[0] for token in sentence] for sentence in val_translations]

val_references_list_meteor = [' '.join(inner[0]) for inner in val_references_formatted]
val_translations_list_meteor = [' '.join(sublist) for sublist in val_translations_formatted]

score1 = 0
score2 = 0
score3 = 0
score4 = 0
meteor_score = 0

from bert_score import BERTScorer
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

for i in range(len(val_translations_list_meteor)): # iterate over no of sent
  score1 += sentence_bleu(val_references_formatted[i][0], val_translations_formatted[i], weights=weights1)
  score2 += sentence_bleu(val_references_formatted[i][0], val_translations_formatted[i], weights=weights2)
  score3 += sentence_bleu(val_references_formatted[i][0], val_translations_formatted[i], weights=weights3)
  score4 += sentence_bleu(val_references_formatted[i][0], val_translations_formatted[i], weights=weights4)
  meteor_score += pymeteor.meteor(val_references_list_meteor[i], val_translations_list_meteor[i])

val_translations_converted = [' '.join(sentence) for sentence in val_translations_formatted]
P, R, F1 = scorer.score(val_translations_converted, val_references_list_meteor)
print(f"BERT-Score : {F1.mean()}")

value = len(val_translations_list_meteor)
print(f"BLEU-1: {score1/value}")
print(f"BLEU-2: {score2/value}")
print(f"BLEU-3: {score3/value}")
print(f"BLEU-4: {score4/value}")
print(f"Meteor: {meteor_score/value}")


Training:   1%|          | 6/782 [03:07<7:20:11, 34.04s/it]

## Inference pipeline

In [None]:
## test_data

In [1]:
!pip install datasets
!pip install -U torchdata
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install bert_score
!pip install --index-url https://test.pypi.org/simple/ pymeteor

from nltk.translate.bleu_score import sentence_bleu
import torch
from tqdm import tqdm
import pymeteor.pymeteor as pymeteor
from bert_score import BERTScorer
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
import matplotlib.pyplot as plt
from torchtext.data.metrics import bleu_score
from nltk.translate import meteor_score
from bert_score import score
from torch import Tensor
import torch.nn as nn
from torch.nn import Transformer
import pandas as pd
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import math
from torchdata.datapipes.iter import ShardingFilter
from nltk.translate.bleu_score import SmoothingFunction

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [5]:
# Load the training dataset
train_data = load_dataset("wmt16", "de-en", split="train[:50000]")
val_data = load_dataset("wmt16", "de-en", split="validation")

final_train_data = []
for data in train_data['translation']:
    final_train_data.append((data['de'], data['en']))

final_val_data = []
for data in val_data['translation']:
    final_val_data.append((data['de'], data['en']))

final_train_data = ShardingFilter(final_train_data)
final_val_data = ShardingFilter(final_val_data)

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Placeholder for token and vocab transformations
token_transform = {}
vocab_transform = {}

# Tokenizers for German and English
token_transform['de'] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform['en'] = get_tokenizer('spacy', language='en_core_web_sm')

# Helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# Create torchtext's Vocab object for both languages
for ln in ['de', 'en']:
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(final_train_data, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index
for ln in ['de', 'en']:
    vocab_transform[ln].set_default_index(UNK_IDX)


DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)


def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform)


def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = final_train_data

    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    train_dataloader = tqdm(train_dataloader, desc="Training", leave=False)
    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        tgt_input = tgt[:-1, :]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
        optimizer.zero_grad()
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        losses += loss.item()
    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0
    val_iter = final_val_data

    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        tgt_input = tgt[:-1, :]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
    return losses / len(list(val_dataloader))

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")




In [None]:
# !pip install datasets
from datasets import load_dataset
test_data = load_dataset("wmt16", "de-en", split="test[:1000]")

In [None]:
test_references = [[token_transform[TGT_LANGUAGE](token) for token in example['en'].split()] for example in test_data['translation']]


In [None]:
# Task 3: Evaluation with Multiple Metrics
model_path = '/content/drive/MyDrive/Colab Notebooks/German_to_English_model.pth'
model = transformer
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

test_references = [[token_transform[TGT_LANGUAGE](token) for token in example['translation']['en'].split()] for example in test_data]
test_translations = [[token_transform[TGT_LANGUAGE](token) for token in translate(model, example['translation']['de']).split()] for example in test_data]
# print(test_references) --> [[['Obama'], ['receives'], ['Netanyahu']],[['Netanyahu'],['is'],['not'],['exactly'],['friendly', '.']],[['The'],['two']]]
# print(test_translations) --> [['Flax', ',', 'proper', 'administration'], ['The','relationship','between','commitology','and', 'hempseed','is']]


In [None]:
# save test_references
file_path = "/content/drive/MyDrive/Colab Notebooks/test_references.txt"
with open(file_path, "w") as f:
    for translation in test_references:
        f.write(" ".join([" ".join(token) for token in translation]) + "\n")

In [None]:
# load test_references
file_path = "/content/drive/MyDrive/Colab Notebooks/test_references.txt"
test_references_loaded = []

with open(file_path, "r") as f:
    for line in f:
        line = line.strip()
        test_references_loaded.append([line])
test_references_loaded[:2]

[['Obama receives Netanyahu'],
 ['The relationship between Obama and Netanyahu is not exactly friendly .']]

In [None]:
# save the test_translations
file_path = "/content/drive/MyDrive/Colab Notebooks/test_translations.txt"

with open(file_path, "w") as f:
    for translation in test_translations:
        f.write(" ".join([" ".join(token) for token in translation]) + "\n")


In [None]:
# Load test_translations
file_path = "/content/drive/MyDrive/Colab Notebooks/test_translations.txt"
loaded_test_translations = []

with open(file_path, "r") as f:
    for line in f:
        line = line.strip()
        loaded_test_translations.append([line])
loaded_test_translations[:2]

[['Flax , proper administration'],
 ['The relationship between commitology and hempseed is not just a key factor .']]

In [None]:
test_translations = loaded_test_translations
test_references_formatted = [[sum(sentence, [])] for sentence in test_references]
#print(test_references_formatted) --> [[['Obama', 'receives', 'Netanyahu']], [['The','relationship','between','Obama','and','Netanyahu']]]

test_references_list_meteor = [' '.join(inner[0]) for inner in test_references_formatted]
test_translations_list_meteor = [' '.join(sublist) for sublist in test_translations]
# print(test_references_list_meteor) --> ['Obama receives Netanyahu', 'The relationship between Obama and Netanyahu is not exactly friendly .']
# print(test_translations_list_meteor) --> ['Flax , proper administration', 'The relationship between commitology and hempseed is not just a key factor .']



In [None]:
score1 = []
score2 = []
score3 = []
score4 = []

weights1 = (1, 0, 0, 0)
weights2 = (0, 1, 0, 0)
weights3 = (0, 0, 1, 0)
weights4 = (0, 0, 0, 1)

from bert_score import BERTScorer
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

# Initialize SmoothingFunction
smooth_fn = SmoothingFunction().method1

for i in range(1000): # iterate over no of sent
  score1.append(sentence_bleu(test_references_loaded[i][0], loaded_test_translations[i][0], weights=weights1, smoothing_function=smooth_fn))
  score2.append(sentence_bleu(test_references_loaded[i][0], loaded_test_translations[i][0], weights=weights2, smoothing_function=smooth_fn))
  score3.append(sentence_bleu(test_references_loaded[i][0], loaded_test_translations[i][0], weights=weights3, smoothing_function=smooth_fn))
  score4.append(sentence_bleu(test_references_loaded[i][0], loaded_test_translations[i][0], weights=weights4, smoothing_function=smooth_fn))


In [None]:
# Define the path to the text file
file_path = "/content/drive/MyDrive/Colab Notebooks/BLEUscore1_test.txt"

# Save test_references to a text file
with open(file_path, "w") as file:
    for num in score1:
        file.write(str(num) + "\n")

file_path = "/content/drive/MyDrive/Colab Notebooks/BLEUscore2_test.txt"
with open(file_path, "w") as file:
    for num in score2:
        file.write(str(num) + "\n")

file_path = "/content/drive/MyDrive/Colab Notebooks/BLEUscore3_test.txt"
with open(file_path, "w") as file:
    for score in score3:
        file.write(str(num) + "\n")

file_path = "/content/drive/MyDrive/Colab Notebooks/BLEUscore4_test.txt"
with open(file_path, "w") as file:
    for num in score4:
        file.write(str(num) + "\n")

print(f"BLEU-1: {sum(score1)/len(score1)}")
print(f"BLEU-2: {sum(score2)/len(score2)}")
print(f"BLEU-3: {sum(score3)/len(score3)}")
print(f"BLEU-4: {sum(score4)/len(score4)}")

BLEU-1: 0.19120750061296038
BLEU-2: 0.0011193788021188783
BLEU-3: 0.0011406653552233001
BLEU-4: 0.001163797384739101


In [None]:
import evaluate
meteor = evaluate.load('meteor')

In [None]:
meteor_scores = []

for i in tqdm(range(1000)):
    results = meteor.compute(predictions=loaded_test_translations[i], references=test_references_loaded[i])
    meteor_scores.append(results['meteor'])

print(meteor_scores)


100%|██████████| 1000/1000 [17:05<00:00,  1.03s/it]

[0.0, 0.5666909620991254, 0.6015971606033719, 0.43080602417426156, 0.45231071779744353, 0.091324200913242, 0.09259259259259259, 0.33360199728714934, 0.3422373081463991, 0.10204081632653061, 0.15384615384615383, 0.13473053892215567, 0.21871874506501024, 0.21547402495094314, 0.026881720430107527, 0.2485236220472441, 0.28055027460921, 0.2418228378535677, 0.11869436201780416, 0.10204081632653061, 0.3476119216859958, 0.13409961685823754, 0.4008714596949891, 0.623125, 0.14652014652014653, 0.305719557195572, 0.10121457489878541, 0.5769944341372912, 0.1323529411764706, 0.2333124084536514, 0.1595744680851064, 0.09216589861751152, 0.23232805676855894, 0.2690058479532164, 0.15267175572519084, 0.3362573099415205, 0.1824817518248175, 0.14705882352941177, 0.16487455197132617, 0.2078212290502793, 0.24234915666840554, 0.3896604938271605, 0.2482421875, 0.34403669724770647, 0.2189429012345679, 0.2032168799723279, 0.25974025974025977, 0.26072340007852374, 0.3475303164744158, 0.11235955056179774, 0.139999




In [None]:
print(sum(meteor_scores)/len(meteor_scores))

0.24188054580883256


In [None]:
file_path = "/content/drive/MyDrive/Colab Notebooks/meteor_score_test.txt"
with open(file_path, "w") as file:
    for score in meteor_scores:
        file.write(str(num) + "\n")

In [None]:
f1 = []
for i in range(1000):
  P, R, F1 = scorer.score(loaded_test_translations[:1000][i],test_references_loaded[:1000][i])
  f1.append(F1)

In [None]:
file_path = "/content/drive/MyDrive/Colab Notebooks/BERT_score_test.txt"

with open(file_path, "w") as file:
    for num in f1:
        file.write(str(num) + "\n")

In [None]:
length_f1 = len(f1)
print(f'BERT-score: {sum(f1)/length_f1}')

BERT-score: tensor([0.1020])


##Inference Pipeline

In [None]:
# Inference Pipeline for Testing Model
def inference_pipeline(input_csv_path, output_csv_path):
    model_path = '/content/drive/MyDrive/Colab Notebooks/German_to_English_model.pth'
    model = transformer
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    df = pd.read_csv(input_csv_path)
    translations = [translate(transformer, text) for text in df['de']]
    df['en'] = translations
    df.to_csv(output_csv_path, index=False)

inference_pipeline('/content/drive/MyDrive/Colab Notebooks/input.csv', '/content/drive/MyDrive/Colab Notebooks/output.csv')

##Val

In [8]:
# Task 3: Evaluation with Multiple Metrics
model_path = '/content/drive/MyDrive/Colab Notebooks/German_to_English_model.pth'
model = transformer
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

val_references = [[token_transform[TGT_LANGUAGE](token) for token in example['translation']['en'].split()] for example in val_data]
val_translations = [[token_transform[TGT_LANGUAGE](token) for token in translate(model, example['translation']['de']).split()] for example in val_data]


In [9]:
# save val_references
file_path = "/content/drive/MyDrive/Colab Notebooks/val_references.txt"
with open(file_path, "w") as f:
    for translation in val_references:
        f.write(" ".join([" ".join(token) for token in translation]) + "\n")

# save val_translations
file_path = "/content/drive/MyDrive/Colab Notebooks/val_translations.txt"
with open(file_path, "w") as f:
    for translation in val_translations:
        f.write(" ".join([" ".join(token) for token in translation]) + "\n")

In [14]:
# load val_references
file_path = "/content/drive/MyDrive/Colab Notebooks/val_references.txt"
val_references_loaded = []

with open(file_path, "r") as f:
    for line in f:
        line = line.strip()
        val_references_loaded.append([line])

In [15]:
# load val_translations
file_path = "/content/drive/MyDrive/Colab Notebooks/val_translations.txt"
val_translations_loaded = []

with open(file_path, "r") as f:
    for line in f:
        line = line.strip()
        val_translations_loaded.append([line])

In [17]:
score1 = []
score2 = []
score3 = []
score4 = []

weights1 = (1, 0, 0, 0)
weights2 = (0, 1, 0, 0)
weights3 = (0, 0, 1, 0)
weights4 = (0, 0, 0, 1)

from bert_score import BERTScorer
scorer = BERTScorer(lang="en", rescale_with_baseline=True)

# Initialize SmoothingFunction
smooth_fn = SmoothingFunction().method1

for i in range(1000): # iterate over no of sent
  score1.append(sentence_bleu(val_references_loaded[i][0], val_translations_loaded[i][0], weights=weights1, smoothing_function=smooth_fn))
  score2.append(sentence_bleu(val_references_loaded[i][0], val_translations_loaded[i][0], weights=weights2, smoothing_function=smooth_fn))
  score3.append(sentence_bleu(val_references_loaded[i][0], val_translations_loaded[i][0], weights=weights3, smoothing_function=smooth_fn))
  score4.append(sentence_bleu(val_references_loaded[i][0], val_translations_loaded[i][0], weights=weights4, smoothing_function=smooth_fn))


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Define the path to the text file
file_path = "/content/drive/MyDrive/Colab Notebooks/val_BLEUscore1_test.txt"

# Save test_references to a text file
with open(file_path, "w") as file:
    for num in score1:
        file.write(str(num) + "\n")

file_path = "/content/drive/MyDrive/Colab Notebooks/val_BLEUscore2_test.txt"
with open(file_path, "w") as file:
    for num in score2:
        file.write(str(num) + "\n")

file_path = "/content/drive/MyDrive/Colab Notebooks/val_BLEUscore3_test.txt"
with open(file_path, "w") as file:
    for score in score3:
        file.write(str(num) + "\n")

file_path = "/content/drive/MyDrive/Colab Notebooks/val_BLEUscore4_test.txt"
with open(file_path, "w") as file:
    for num in score4:
        file.write(str(num) + "\n")

print(f"BLEU-1: {sum(score1)/len(score1)}")
print(f"BLEU-2: {sum(score2)/len(score2)}")
print(f"BLEU-3: {sum(score3)/len(score3)}")
print(f"BLEU-4: {sum(score4)/len(score4)}")

BLEU-1: 0.20672135959524235
BLEU-2: 0.0013413949518187693
BLEU-3: 0.0013771716610135587
BLEU-4: 0.0014191016653400245


In [19]:
!pip install evaluate
import evaluate
meteor = evaluate.load('meteor')

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [21]:
len(val_references_loaded)

2169

In [22]:
meteor_scores = []

for i in tqdm(range(2169)):
    results = meteor.compute(predictions=val_translations_loaded[i], references=val_references_loaded[i])
    meteor_scores.append(results['meteor'])
print(f"Meteor Score: {sum(meteor_scores)/len(meteor_scores)}")

100%|██████████| 2169/2169 [00:44<00:00, 48.64it/s]

Meteor Score: 0.23137782798053994





In [23]:
file_path = "/content/drive/MyDrive/Colab Notebooks/val_meteor_score_test.txt"
with open(file_path, "w") as file:
    for score in meteor_scores:
        file.write(str(num) + "\n")

In [28]:
f1 = []
for i in tqdm(range(2169)):
  P, R, F1 = scorer.score(val_translations_loaded[i],val_references_loaded[i])
  f1.append(F1)
print(f"BERT-score{sum(f1)/len(f1)}")

100%|██████████| 2169/2169 [26:19<00:00,  1.37it/s]

BERT-scoretensor([0.0730])





In [29]:
file_path = "/content/drive/MyDrive/Colab Notebooks/val_BERT_score_test.txt"

with open(file_path, "w") as file:
    for num in f1:
        file.write(str(num) + "\n")