# INM706: Inference (Transformer)

This is notebook is for inference of the models found in the [City-INM706](https://github.com/yasirbarlas/City-INM706) Github repository.

The model used for inference here is our baseline model, and so the parameters reflect this.

### Import Libraries and Models

In [1]:
# Import relevant libraries and models

from models import *
from dataset import *
from utils import *

import random
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.nist_score import sentence_nist


from torch.utils.data import DataLoader

### Import Training/Validation Dataset

We need to import the dataset since it contains the dictionaries for our token-word mappings.

In [2]:
train_dataset = TranslationDataset(lang1 = "europarl-v7.fr-en.en", lang2 = "europarl-v7.fr-en.fr", max_seq_len = 50, reverse = False, directory = "../fr-en/")

Read 2007723 sentence pairs
Trimmed to 1802065 sentence pairs
Counting words...
Counted words:
europarl-v7.fr-en.en 71012
europarl-v7.fr-en.fr 96093


### Create Model From Checkpoint

In [3]:
# Set device
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

# Get model checkpoint
model = torch.load(f"checkpoints/checkpoint_transformer_latest.pth.tar", map_location = device)

# Encoder and Decoder, replace 128 with your hidden dimension size and other parameters
# The parameters given are for our baseline model
# Replace src_vocab_size and target_vocab_size with your number of words (for english and french respectively)
transformer = Transformer(embed_dim = 512, src_vocab_size = train_dataset.input_lang.n_words,
                        target_vocab_size = train_dataset.output_lang.n_words, seq_len = 50,
                        num_layers = 6, expansion_factor = 4,
                        n_heads = 8, activation = "ReLU", norm_first = False, relative_attention = False).to(device)

transformer.load_state_dict(model["model_state_dict"])

# Ensure the Transformer is in evaluation mode
transformer.eval()

Transformer(
  (encoder): TransformerEncoder(
    (embedding_layer): Embedding(
      (embed): Embedding(71012, 512)
    )
    (positional_encoder): PositionalEmbedding()
    (layers): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadAttention(
          (query_matrix): Linear(in_features=64, out_features=64, bias=False)
          (key_matrix): Linear(in_features=64, out_features=64, bias=False)
          (value_matrix): Linear(in_features=64, out_features=64, bias=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2

### Test Dataset

In [4]:
# Make test dataset (change max_seq_len to your maximum sentence word length)
# Choose any dataset by inserting their paths as below (or skip if not interested in model testing)
test_dataset = TranslationDataset(lang1 = "test2008-en.txt", lang2 = "test2008-fr.txt", max_seq_len = 50, reverse = False, directory = "../fr-en/")

# Dataloader for faster inference
test_dataloader = DataLoader(test_dataset, batch_size = 32)

Read 2000 sentence pairs
Trimmed to 1766 sentence pairs
Counting words...
Counted words:
test2008-en.txt 5003
test2008-fr.txt 5924


### Filter Dataset

Filter the dataset for sentences containing only words found in training.

In [5]:
# Create the filtered dataset
def create_filtered_dataset(test_dataset, input_lang):
    valid_pairs = []

    for pair in test_dataset.pairs:
        input_valid = all(word in input_lang.word2index for word in pair[0].split())
        if input_valid:
            valid_pairs.append(pair)

    class FilteredTranslationDataset(TranslationDataset):
        def __init__(self, input_lang, output_lang, pairs, max_seq_len = 50):
            self.input_lang = input_lang
            self.output_lang = output_lang
            self.pairs = pairs
            self.max_seq_len = max_seq_len
            self.input_lang_voc = input_lang.word2index
            self.output_lang_voc = output_lang.word2index

        def __len__(self):
            return len(self.pairs)

        def __getitem__(self, index):
            input_sentence = self.pairs[index][0]
            output_sentence = self.pairs[index][1]
            in_sentence, out_sentence = self.tokenize_pair((input_sentence, output_sentence))
            input_ids = np.zeros(self.max_seq_len, dtype = np.int32)
            target_ids = np.zeros(self.max_seq_len, dtype = np.int32)
            input_ids[:len(in_sentence)] = in_sentence
            target_ids[:len(out_sentence)] = out_sentence
            return input_sentence, torch.tensor(input_ids, dtype = torch.long, device=device), torch.tensor(target_ids, dtype = torch.long, device = device)

    return FilteredTranslationDataset(test_dataset.input_lang, test_dataset.output_lang, valid_pairs, test_dataset.max_seq_len)

### Test Set Evaluation

Calculate BLEU and NIST for the test set imported earlier.

In [6]:
filtered_dataset = create_filtered_dataset(test_dataset, train_dataset.input_lang)
test_dataloader = DataLoader(filtered_dataset, batch_size = 32)

bleu_scores = []
nist_scores = []

with torch.no_grad():
    for _, src, target_tensor in test_dataloader:
        output = transformer(src, target_tensor)
        # Calculate BLEU score
        bleu_score = calculate_bleu(output, target_tensor)
        bleu_scores.append(bleu_score)

        # Calculate NIST score
        nist_score = calculate_nist(output, target_tensor)
        nist_scores.append(nist_score)

print("Mean BLEU:", np.mean(bleu_scores))
print("Mean NIST:", np.mean(nist_scores))

  out = F.softmax(self.fc_out(x))


Mean BLEU: 0.059293001384640054
Mean NIST: 1.1848307154946724


### Single Sentence Evaluation

One may wish to enter their own sentence (one that contains the words used during training) for inference. You should use lowercase without any punctuation.

In [7]:
# Functions for inference

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(" ") if word in lang.word2index]

def tensorFromSentence(lang, sentence):
    EOS_token = 1
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype = torch.long, device = device).view(1, -1)

In [8]:
sentence = "mr president ladies and gentlemen in his policy statement yesterday mr prodi the president of the commission said that whoever weakened any institution of the european union weakened the union as a whole"

# Let 0 be SOS token and 1 be EOS token
src = tensorFromSentence(train_dataset.input_lang, sentence)
target = torch.tensor([[0]], device = device)

print(src.shape, target.shape)

# Translate the input sentence
output_indices = transformer.decode(src, target)

decoded_words = []

for i in output_indices:
    decoded_words.append(train_dataset.output_lang.index2word[i])

output_sentence = " ".join(decoded_words)

print(output_sentence)

torch.Size([1, 34]) torch.Size([1, 1])
SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS
