In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

from sklearn.model_selection import train_test_split
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences

# **I. Dataset Acquisition**

**Select a suitable dataset from http://www.manythings.org/anki/LinksLinks to an external site. for use in the machine translation task.**

In [2]:
# Load the dataset

df=pd.read_csv("hin.txt", sep='\t', header=None)
df.head()

Unnamed: 0,0,1,2
0,Wow!,वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Duck!,झुको!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Duck!,बतख़!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Help!,बचाओ!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
4,Jump.,उछलो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...


In [3]:
df=pd.read_csv("hin.txt", sep='\t', names=['English', 'Hindi', 'Attribution'])
df.head(10)

Unnamed: 0,English,Hindi,Attribution
0,Wow!,वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Duck!,झुको!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Duck!,बतख़!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Help!,बचाओ!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
4,Jump.,उछलो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
5,Jump.,कूदो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
6,Jump.,छलांग.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
7,Hello!,नमस्ते।,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
8,Hello!,नमस्कार।,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
9,Cheers!,वाह-वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


We will consider the 2 columns of 'Hindi' and 'English' for data preprocessing step

In [4]:
df=df[['English','Hindi']]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116 entries, 0 to 3115
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   English  3116 non-null   object
 1   Hindi    3116 non-null   object
dtypes: object(2)
memory usage: 48.8+ KB


In [6]:
# removing white spaces
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x).drop_duplicates().dropna()

In [7]:
df.head()

Unnamed: 0,English,Hindi
0,Wow!,वाह!
1,Duck!,झुको!
2,Duck!,बतख़!
3,Help!,बचाओ!
4,Jump.,उछलो.


The dataset has no abbreviations or Full form so, we can skip the lowercase conversion here. But, there are many repetitive words such as "Hello!" → "नमस्ते।", which may lead to overfitting of the model and evaluation might be biased. So, checking for duplicates is necessary.

In [8]:
df.duplicated(subset=["English", "Hindi"]).sum()

np.int64(0)

Since, there are no dupliactes and dataset is clean, we can proceed with the splitting the dataset

In [9]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Tokenizing the data
train_df['English_tokens'] = train_df['English'].apply(lambda x: x.split())
train_df['Hindi_tokens'] = train_df['Hindi'].apply(lambda x: x.split())

# Check few examples
train_df[['English_tokens', 'Hindi_tokens']].head(5)


Unnamed: 0,English_tokens,Hindi_tokens
521,"[I, can, teach, English.]","[मैं, अंग्रेज़ी, पढ़ा, सकता, हूँ।]"
522,"[I, continued, singing.]","[मैं, गाता, चला, गया।]"
1644,"[I, hope, that, it, rains, tomorrow.]","[काश, कल, बारिश, हो, जाए।]"
2837,"[She, reproached, me, for, not, answering, the...","[उसने, मुझे, चिट्ठी, का, जवाब, न, देने, के, लि..."
1578,"[The, girl, did, nothing, but, cry.]","[उस, लड़की, ने, रोने, के, अलावा, और, कुछ, नहीं..."


In [11]:
# creating vocabulary

# Flatten token lists into one long list
english_words = [word for tokens in train_df['English_tokens'] for word in tokens]
hindi_words = [word for tokens in train_df['Hindi_tokens'] for word in tokens]

# Count frequencies
eng_counts = Counter(english_words)
hin_counts = Counter(hindi_words)

# vocabulary lists
special_tokens = ["<pad>", "<bos>", "<eos>", "<unk>"]
eng_vocabulary = special_tokens + list(eng_counts.keys())
hin_vocabulary = special_tokens + list(hin_counts.keys())

# dictionaries
English_dict = {word: idx for idx, word in enumerate(eng_vocabulary)}
Hindi_dict = {word: idx for idx, word in enumerate(hin_vocabulary)}

In [12]:
# I'll use encoding to convert each token into a numeric ID so the transformer model can understand and process the text mathematically.


eng_encoded = []
hin_encoded = []

for i in range(len(train_df)):
    eng_tokens = train_df['English_tokens'].iloc[i]
    hin_tokens = train_df['Hindi_tokens'].iloc[i]


    # English sentence encoding
    eng_ids = []
    for w in eng_tokens:
        if w in English_dict:
            eng_ids.append(English_dict[w])
        else:
            eng_ids.append(English_dict["<unk>"])
    eng_encoded.append(eng_ids)

    # Hindi sentence encoding and adding <bos> and <eos> to know the start and beginning of a sentence
    hin_ids = [Hindi_dict["<bos>"]]
    for w in hin_tokens:
        if w in Hindi_dict:
            hin_ids.append(Hindi_dict[w])
        else:
            hin_ids.append(Hindi_dict["<unk>"])
    hin_ids.append(Hindi_dict["<eos>"])
    hin_encoded.append(hin_ids)

# column created for encoded tokens
train_df["English_encoded"] = eng_encoded
train_df["Hindi_encoded"] = hin_encoded

# Checking first few rows
train_df[["English_tokens", "English_encoded", "Hindi_tokens", "Hindi_encoded"]].head()


Unnamed: 0,English_tokens,English_encoded,Hindi_tokens,Hindi_encoded
521,"[I, can, teach, English.]","[4, 5, 6, 7]","[मैं, अंग्रेज़ी, पढ़ा, सकता, हूँ।]","[1, 4, 5, 6, 7, 8, 2]"
522,"[I, continued, singing.]","[4, 8, 9]","[मैं, गाता, चला, गया।]","[1, 4, 9, 10, 11, 2]"
1644,"[I, hope, that, it, rains, tomorrow.]","[4, 10, 11, 12, 13, 14]","[काश, कल, बारिश, हो, जाए।]","[1, 12, 13, 14, 15, 16, 2]"
2837,"[She, reproached, me, for, not, answering, the...","[15, 16, 17, 18, 19, 20, 21, 22]","[उसने, मुझे, चिट्ठी, का, जवाब, न, देने, के, लि...","[1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2]"
1578,"[The, girl, did, nothing, but, cry.]","[23, 24, 25, 26, 27, 28]","[उस, लड़की, ने, रोने, के, अलावा, और, कुछ, नहीं...","[1, 27, 28, 29, 30, 24, 31, 32, 33, 34, 35, 2]"


In [13]:
# Padding the columns 'English_encoded' and 'Hindi_encoded sequences to the same length within each column as
# Transformers need all sentences in a batch to be the same length. Therefore, shorter length sentences will be padded

max_len_eng = max(len(seq) for seq in train_df['English_encoded'])
max_len_hin = max(len(seq) for seq in train_df['Hindi_encoded'])

train_df['English_padded'] = list(pad_sequences(train_df['English_encoded'], maxlen=max_len_eng, padding='post', value=English_dict["<pad>"]))
train_df['Hindi_padded']   = list(pad_sequences(train_df['Hindi_encoded'], maxlen=max_len_hin, padding='post', value=Hindi_dict["<pad>"]))


# Checking first few rows
train_df[['English_encoded', 'English_padded', 'Hindi_encoded', 'Hindi_padded']].head()


Unnamed: 0,English_encoded,English_padded,Hindi_encoded,Hindi_padded
521,"[4, 5, 6, 7]","[4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 4, 5, 6, 7, 8, 2]","[1, 4, 5, 6, 7, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, ..."
522,"[4, 8, 9]","[4, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 4, 9, 10, 11, 2]","[1, 4, 9, 10, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1644,"[4, 10, 11, 12, 13, 14]","[4, 10, 11, 12, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 12, 13, 14, 15, 16, 2]","[1, 12, 13, 14, 15, 16, 2, 0, 0, 0, 0, 0, 0, 0..."
2837,"[15, 16, 17, 18, 19, 20, 21, 22]","[15, 16, 17, 18, 19, 20, 21, 22, 0, 0, 0, 0, 0...","[1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2]","[1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2,..."
1578,"[23, 24, 25, 26, 27, 28]","[23, 24, 25, 26, 27, 28, 0, 0, 0, 0, 0, 0, 0, ...","[1, 27, 28, 29, 30, 24, 31, 32, 33, 34, 35, 2]","[1, 27, 28, 29, 30, 24, 31, 32, 33, 34, 35, 2,..."


# **II. Custom Transformer Implementation**

  **a) Develop a custom transformer-based machine translation model tailored to the selected dataset.**

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

# Check device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

from torch.utils.data import DataLoader, TensorDataset

Using device: cpu


In [15]:
# creating custom transformer model using PyTorch's nn.Transformer which consists of Embedding layers for English and Hindi,
# a transformer block with one encoder and one decoder layer and a linear output layer that predicts the next Hindi word

class CustomTransformer(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, d_model=64, nhead=2, num_layers=1):
        super(CustomTransformer, self).__init__()
        self.encoder = nn.Embedding(input_vocab_size, d_model)
        self.decoder = nn.Embedding(target_vocab_size, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=128,
            batch_first=True
        )

        self.fc_out = nn.Linear(d_model, target_vocab_size)

    def forward(self, src, tgt):
        src_emb = self.encoder(src)
        tgt_emb = self.decoder(tgt)
        output = self.transformer(src_emb, tgt_emb)
        return self.fc_out(output)

In [16]:
# Initializing Model, Loss, Optimizer
input_vocab_size = len(English_dict)
target_vocab_size = len(Hindi_dict)

custom_model = CustomTransformer(input_vocab_size, target_vocab_size).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=Hindi_dict["<pad>"])
optimizer = optim.Adam(custom_model.parameters(), lr=0.001)

In [17]:
print(custom_model)

CustomTransformer(
  (encoder): Embedding(3241, 64)
  (decoder): Embedding(2943, 64)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=128, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0): TransformerDecode

In [18]:
# Training the dataset

# Converting the padded sequences into torch tensors
source_tensor = torch.tensor(train_df['English_padded'].tolist(), dtype=torch.long)
target_tensor = torch.tensor(train_df['Hindi_padded'].tolist(), dtype=torch.long)

# Create dataset and DataLoader for batch training
dataset = TensorDataset(source_tensor, target_tensor)
loader = DataLoader(dataset, batch_size=32, shuffle=True)


  source_tensor = torch.tensor(train_df['English_padded'].tolist(), dtype=torch.long)


In [19]:
epochs = 20

for epoch in range(epochs):
    custom_model.train()
    total_loss = 0.0

    for source, target in loader:
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()

        # decoder input: all tokens except the last one
        # output: all tokens except the first one
        output = custom_model(source, target[:, :-1])

        # Compute Cross-Entropy Loss
        loss = criterion(output.reshape(-1, output.shape[-1]), target[:, 1:].reshape(-1))

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch [{epoch+1}/{epochs}] - Average Loss: {avg_loss:.4f}")

Epoch [1/20] - Average Loss: 6.6289
Epoch [2/20] - Average Loss: 5.7739
Epoch [3/20] - Average Loss: 5.3376
Epoch [4/20] - Average Loss: 4.9248
Epoch [5/20] - Average Loss: 4.5437
Epoch [6/20] - Average Loss: 4.1810
Epoch [7/20] - Average Loss: 3.8478
Epoch [8/20] - Average Loss: 3.5418
Epoch [9/20] - Average Loss: 3.2430
Epoch [10/20] - Average Loss: 2.9661
Epoch [11/20] - Average Loss: 2.6987
Epoch [12/20] - Average Loss: 2.4482
Epoch [13/20] - Average Loss: 2.2238
Epoch [14/20] - Average Loss: 2.0030
Epoch [15/20] - Average Loss: 1.8029
Epoch [16/20] - Average Loss: 1.6226
Epoch [17/20] - Average Loss: 1.4640
Epoch [18/20] - Average Loss: 1.3214
Epoch [19/20] - Average Loss: 1.1986
Epoch [20/20] - Average Loss: 1.0664


Here, decoder input is taken as all tokens except the last one so as to predict the next word to enable sequential learning. Similarly, the first word from the expected output is not being considered as it is <bos> which is already known by the model.

Cross-Entropy measures how far predicted probabilities are from the correct class. So, less loss value means predicted Hindi words are closer to the real Hindi words.

**Insights**: Training log reflects loss decreasing from 6.62 to 1.06 which means model is learning.

# **III. Pre-trained Transformer Usage**

**b) Utilize pre-trained transformer models for the same dataset, optimizing their performance for machine translation.**

I'll use the Helsinki-NLP/opus-mt-en-hi model for this part from MarianMT family, which is a Transformer-based encoder–decoder model pre-trained on large-scale parallel corpora from the OPUS dataset.

In [20]:
!pip install -q transformers sacrebleu

from transformers import MarianMTModel, MarianTokenizer
import sacrebleu

In [21]:
# Define the model name
model_name = 'Helsinki-NLP/opus-mt-en-hi'

# Load the tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
pretrained_model = MarianMTModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model = pretrained_model.to(device)
device

device(type='cpu')

In [23]:
def translate(text):
    # Tokenize the input text
    tokenized_text = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(pretrained_model.device)
    # Translation
    translated_tokens = pretrained_model.generate(**tokenized_text, max_length=50)
    # Decoding the translated tokens
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

In [24]:
# Example

for text in train_df['English'].sample(10):
    print(f"\nEnglish: {text}")
    print(f"Predicted Hindi: {translate(text)}")


English: The king crushed his enemies.
Predicted Hindi: राजा अपने दुश्‍मनों को कुचल देता है ।

English: My father left me a lot of money in his will.
Predicted Hindi: मेरे पिता ने मुझे अपनी इच्छा में बहुत पैसा दिया है.

English: There was a strong wind that day.
Predicted Hindi: उस दिन एक ज़ोरदार आँधी थी ।

English: Are there enough chairs to go around?
Predicted Hindi: चारों ओर जाने के लिए पर्याप्त कुर्सी है?

English: She asked him to raise the funds.
Predicted Hindi: उसने उससे पैसे बढ़ाने के लिए कहा ।

English: How many books do you have?
Predicted Hindi: तुम्हारे पास कितनी किताबें हैं?

English: What do you want it for?
Predicted Hindi: तुम इसके लिए क्या चाहते हो?

English: I like my job very much.
Predicted Hindi: मैं अपने काम को बहुत पसंद है.

English: Do you feel any pain in your stomach?
Predicted Hindi: क्या आप अपने पेट में कोई दर्द महसूस करते हैं?

English: Tom's fever is getting worse.
Predicted Hindi: Tom का बुखार बदतर हो रहा है.


**Insights**: We see that the model produced semantically and grammatically correct translations for the sample 10 rows taken. Only minor errors in gender or idiomatic phrasing were observed, which are common in machine-generated translations.

# **IV. Comparative Analysis**

**c) Perform a detailed comparative study to assess the output generated by the custom transformer and pre-trained transformer models. Evaluate these outputs using BLEU metrics to quantify translation quality and overall performance.**

In [25]:
from nltk.translate.bleu_score import corpus_bleu

In [26]:
custom_model.eval()
custom_predictions = []

with torch.no_grad():
    for eng_sentence in test_df["English"]:
        # converting English words to ids
        source_ids = [English_dict.get(w, English_dict["<unk>"]) for w in eng_sentence.split()]
        source_tensor = torch.tensor([source_ids], dtype=torch.long).to(device)

        # decoding from <bos>
        target_tensor = torch.tensor([[Hindi_dict["<bos>"]]], dtype=torch.long).to(device)

        # greedy decoding
        for _ in range(25):
            output = custom_model(source_tensor, target_tensor)
            next_id = output.argmax(2)[:, -1].item()
            target_tensor = torch.cat([target_tensor, torch.tensor([[next_id]], device=device)], dim=1)
            if next_id == Hindi_dict["<eos>"]:
                break

        # converting ids to words
        pred_ids = target_tensor.squeeze().tolist()
        pred_words = [
            word for word, idx in Hindi_dict.items()
            if idx in pred_ids and word not in ["<bos>", "<eos>", "<pad>"]
        ]
        custom_predictions.append(pred_words)

In [27]:
reference_hindi = [[txt.split()] for txt in test_df["Hindi"].tolist()]

In [28]:
# Pretrained model predictions on test set

pretrained_predictions = []

for eng_text in test_df["English"]:
    Hindi_text = translate(eng_text)
    pretrained_predictions.append(Hindi_text.split())

In [29]:
# BLEU scores
bleu_custom = corpus_bleu(reference_hindi, custom_predictions)
bleu_pretrained = corpus_bleu(reference_hindi, pretrained_predictions)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [30]:
print(f"Custom Transformer BLEU:     {bleu_custom*100:.2f}")
print(f"Pre-trained MarianMT BLEU:   {bleu_pretrained*100:.2f}")

Custom Transformer BLEU:     0.00
Pre-trained MarianMT BLEU:   15.90


**Insights**: After evaluating both the models using the same test_df pairs. The pre-trained MarianMT model got a higher BLEU score and had more accurate translations, which was predictable because of the extensive pretraining done. The custom transformer model barely learned to generalize which might be due to small dataset and vocabulary size.