# Generative AI (CS F437) Assignment 1
Training and Evaluating Transformer Models for English to Hindi Translation

## Part 1: Fine-Tuning a Transformer Model

### Import Dependencies

In [None]:
!pip install --quiet --upgrade pip
!pip install numpy pandas matplotlib nltk wandb datasets transformers evaluate


[31m[m

[H[2J
[31m[m

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting wandb
  Downloading wandb-0.19.8-py3-none-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting eval-type-backport (from wandb)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting pydantic<3,>=2.6 (from wandb)
  Downloading pydantic-2.11.1-py3-none-any.whl.metadata (63 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.25.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1

In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import nltk
import wandb

from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer, TrainingArguments, Trainer
import evaluate

# Download NLTK data required for BLEU computation
nltk.download('punkt', quiet=True)

We use WandB to store our weights and training data.

In [None]:
import getpass

# Prefer the WANDB_API_KEY environment variable; otherwise prompt with getpass,
# which does not echo the key (input() would leave it in the saved cell output).
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("Enter your wandb API key: ")
wandb.login(key=wandb_api_key)

# Initialize wandb run (change project and entity as desired)
wandb.init(
    project="GEN_AI",
    entity="aashreyrachaputi-bits-pilani",
    config={
        "model_name": "Helsinki-NLP/opus-mt-en-hi",
        "dataset": "cfilt/iitb-english-hindi",
        "num_train_epochs": 3,
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        # Matches the value passed to freeze_layers(...) before training, so the
        # run's recorded configuration describes what was actually trained.
        "num_decoder_layers_to_keep_trainable": 7,
    }
)

### Data Loading and Preprocessing
Load the IITB English-Hindi parallel corpus and prepare it for training.

In [None]:
# Load the dataset
ds = load_dataset("cfilt/iitb-english-hindi")
train_data = ds["train"]
val_data = ds["validation"]
test_data = ds["test"]


def _split_texts(split):
    """Return ``{"en": [...], "hi": [...]}`` for one dataset split.

    The ``translation`` column is read once and reused for both languages,
    instead of being fetched separately for each language.
    """
    pairs = split["translation"]
    return {
        "en": [pair["en"] for pair in pairs],
        "hi": [pair["hi"] for pair in pairs],
    }


# Create text dictionaries for evaluation
train_texts = _split_texts(train_data)
val_texts = _split_texts(val_data)
test_texts = _split_texts(test_data)

### Training Configuration
Set up the training arguments and trainer.

In [None]:
import inspect

import torch

# NOTE(review): `model`, `train_dataset` and `val_dataset` are not defined in any
# cell visible above this one. Confirm that the cell which loads the model and
# tokenizes the splits runs first; otherwise this cell raises NameError.

# The per-epoch evaluation argument is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; use whichever
# name the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Only request fp16 mixed precision when a CUDA device is present, so the
    # cell still runs on CPU-only or Apple-silicon machines.
    fp16=torch.cuda.is_available(),
    report_to=["wandb"],
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

### Model Layer Freezing
Freeze specific layers in the model to focus training on the most important components.

In [None]:
def freeze_layers(model, freeze_encoder=True, num_decoder_layers_to_keep_trainable=2):
    """Freeze the whole model, then re-enable gradients on the parts to fine-tune.

    Always left trainable: the decoder token embeddings, the last
    ``num_decoder_layers_to_keep_trainable`` decoder layers and ``lm_head``.
    The encoder (embeddings and layers) is re-enabled only when
    ``freeze_encoder`` is False.

    Args:
        model: Model exposing ``model.encoder``, ``model.decoder`` (each with
            ``embed_tokens`` and ``layers``) and ``lm_head``.
        freeze_encoder: When True the encoder stays frozen.
        num_decoder_layers_to_keep_trainable: How many trailing decoder layers
            to unfreeze. Values below 0 are treated as 0 and values above the
            number of decoder layers unfreeze every decoder layer.

    Side effects:
        Mutates ``requires_grad`` on the model's parameters in place. If a
        wandb run is active, logs the parameter counts and a short
        architecture summary to it.
    """
    for param in model.parameters():
        param.requires_grad = False

    if not freeze_encoder:
        for param in model.model.encoder.embed_tokens.parameters():
            param.requires_grad = True
        for layer in model.model.encoder.layers:
            for param in layer.parameters():
                param.requires_grad = True

    # Unfreeze decoder embeddings
    # NOTE(review): if the checkpoint shares one embedding module between
    # encoder and decoder, this also makes the encoder embedding trainable even
    # when freeze_encoder=True — confirm against the loaded model.
    for param in model.model.decoder.embed_tokens.parameters():
        param.requires_grad = True

    # Unfreeze the last N decoder layers only, with N clamped to what exists so
    # the summary logged below reports the number of layers really unfrozen.
    decoder_layers = model.model.decoder.layers
    total_decoder_layers = len(decoder_layers)
    num_trainable_layers = max(0, min(num_decoder_layers_to_keep_trainable, total_decoder_layers))
    for layer in decoder_layers[total_decoder_layers - num_trainable_layers:]:
        for param in layer.parameters():
            param.requires_grad = True

    # Always unfreeze the final output projection
    for param in model.lm_head.parameters():
        param.requires_grad = True

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    # Report a real percentage (0-100) under the "trainable_percentage" key,
    # and avoid dividing by zero for a model without parameters.
    trainable_percentage = 100.0 * trainable_params / total_params if total_params else 0.0

    architecture_info = {
        "encoder_status": "Frozen" if freeze_encoder else "Trainable",
        "decoder_status": f"Partially trainable (last {num_trainable_layers} layers)",
        "output_projection": "Trainable"
    }

    # Logging is secondary to freezing: skip it when no wandb run is active
    # rather than failing after the model has already been modified.
    if wandb.run is not None:
        wandb.log({
            "trainable_parameters": trainable_params,
            "total_parameters": total_params,
            "trainable_percentage": trainable_percentage
        })
        wandb.log({"model_architecture": architecture_info})

In [None]:
# Keep the encoder frozen and request the last 7 decoder layers. freeze_layers
# skips indices below 0, so a decoder with fewer than 7 layers simply has every
# layer unfrozen.
freeze_layers(model, freeze_encoder=True, num_decoder_layers_to_keep_trainable=7)

In [None]:
# Start training (uses the Trainer built in the training-configuration cell)
trainer.train()

# Save the fine-tuned model and tokenizer side by side in one directory; the
# evaluation section reloads both from this same path.
model.save_pretrained("./pretrain_model_7l")
tokenizer.save_pretrained("./pretrain_model_7l")

In [None]:
def translate(texts, batch_size=8):
    """Translate a list of English strings with the notebook's model and tokenizer.

    Uses the module-level ``tokenizer`` and ``model``. Inputs are padded and
    truncated to 128 tokens and decoded with 5-beam search.

    Args:
        texts: List of source sentences.
        batch_size: Number of sentences sent to the model per step. Defaults to
            8, the value that was previously hard-coded.

    Returns:
        List of decoded translations, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would otherwise yield an empty range and silently
        # return no translations at all.
        raise ValueError("batch_size must be a positive integer")

    translations = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        # Ensure tensors are on the correct device
        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device)
        # Generate translations using beam search
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        translations.extend(decoded_outputs)
    return translations

### Evaluation

Once we have trained the model and saved it to our hard drive, we can load it directly from there for testing.

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Same directory the training cell passed to save_pretrained.
model_dir = "./pretrain_model_7l"

# Load tokenizer from the folder
tokenizer = MarianTokenizer.from_pretrained(model_dir)

# Load the model from the folder
model = MarianMTModel.from_pretrained(model_dir)
# Put the model in evaluation mode before it is used for translation.
model.eval()

#### Interactive Translation Demo

In [None]:
# Simple translation loop: reads sentences until the user types 'exit'.
while True:
    try:
        english_text = input("Enter an English sentence (or 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Input ended or the user interrupted the cell: leave the demo cleanly
        # instead of ending the cell with a traceback.
        break
    if english_text.strip().lower() == 'exit':
        break
    if not english_text.strip():
        # Nothing to translate; prompt again rather than decoding an empty input.
        continue
    # Truncate over-long input to the tokenizer's configured maximum and place
    # the tensors on the same device as the model.
    inputs = tokenizer(english_text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        generated_tokens = model.generate(**inputs)
    hindi_translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    print("Hindi Translation:", hindi_translation)

#### Model Evaluation

Evaluate the translation model using standard metrics including:
- BLEU score: Measures n-gram overlap between translations and references
- ROUGE score: Measures recall of n-grams between translations and references
- METEOR score: Measures word-to-word matches between translations and references

In [None]:
import torch
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import evaluate

# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# Metric objects from the `evaluate` library; the scoring code further down in
# this cell calls their compute() methods.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Test split only: English sources and their Hindi references, index-aligned.
dataset = load_dataset("cfilt/iitb-english-hindi", split="test")
source_texts = [item["translation"]["en"] for item in dataset]
reference_texts = [item["translation"]["hi"] for item in dataset]

# Sanity check: both lists should have the same length.
print(len(source_texts))
print(len(reference_texts))

# NOTE(review): this loads the base Hub checkpoint and rebinds `tokenizer` and
# `model`, replacing the fine-tuned ones loaded from "./pretrain_model_7l"
# earlier. Confirm whether the fine-tuned model was meant to be scored here.
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Use the GPU when one is available, otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def translate(texts, batch_size=8):
    """Translate ``texts`` in chunks of ``batch_size`` using beam search.

    Relies on the module-level ``tokenizer`` and ``model``. Each chunk is
    padded and truncated to 128 tokens, moved to the model's device and
    decoded with 5 beams; special tokens are dropped from the output.

    Returns:
        List of translated strings, in input order.
    """
    generation_options = {"num_beams": 5, "max_length": 128, "early_stopping": True}
    results = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start : start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        )
        # Move every tensor of the encoding onto the model's device.
        on_device = {}
        for key, tensor in encoded.items():
            on_device[key] = tensor.to(model.device)
        with torch.no_grad():
            generated = model.generate(**on_device, **generation_options)
        results += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return results

predictions = translate(source_texts)

# Whitespace tokenization for BLEU; corpus_bleu expects a list of reference
# token lists per hypothesis, hence the extra nesting.
references_tokenized = [[ref.split()] for ref in reference_texts]
predictions_tokenized = [pred.split() for pred in predictions]

# Compute BLEU score
smoothing_function = SmoothingFunction().method1
bleu_score = corpus_bleu(references_tokenized, predictions_tokenized, smoothing_function=smoothing_function)
print(f"Corpus BLEU Score: {bleu_score:.4f}")

# Compute ROUGE scores
# The metric's default tokenizer keeps only lowercase ASCII letters and digits,
# which discards Devanagari text and makes the Hindi scores meaningless. Use
# the same whitespace tokenization as the BLEU computation above.
rouge_results = rouge.compute(
    predictions=predictions,
    references=reference_texts,
    tokenizer=lambda text: text.split(),
)
print(f"ROUGE-1 F1 Score: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2 F1 Score: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L F1 Score: {rouge_results['rougeL']:.4f}")

# Compute METEOR score
meteor_score = meteor.compute(predictions=predictions, references=reference_texts)
print(f"METEOR Score: {meteor_score['meteor']:.4f}")

#### Visualize Results

The cell below can be used to visualize translation examples and quality metrics.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import random

# Fixed seed so the same rows are displayed on every run of the notebook.
SAMPLE_SEED = 42

# Never request more rows than exist; random.sample raises ValueError otherwise.
num_examples = min(20, len(source_texts))
random_indices = random.Random(SAMPLE_SEED).sample(range(len(source_texts)), num_examples)

# One record per sampled row: source, reference and model output side by side.
examples = [
    {
        "English": source_texts[idx],
        "Hindi (Reference)": reference_texts[idx],
        "Hindi (Predicted)": predictions[idx],
    }
    for idx in random_indices
]

pd.DataFrame(examples)

## Part 2: Trained Transformer Model

### Environment Setup

In [None]:
!pip install datasets evaluate transformers torch

In [None]:
!pip install sentencepiece sacrebleu sacremoses nltk rouge-score

In [None]:
!pip install optuna

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

_EN_HI_MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
_en_hi_components = {}


def _get_en_hi_components():
    """Return the (tokenizer, model) pair, loading it only on first use."""
    if not _en_hi_components:
        # Load both before caching so a failed load never leaves a partial cache.
        tokenizer = MarianTokenizer.from_pretrained(_EN_HI_MODEL_NAME)
        model = MarianMTModel.from_pretrained(_EN_HI_MODEL_NAME)
        _en_hi_components["tokenizer"] = tokenizer
        _en_hi_components["model"] = model
    return _en_hi_components["tokenizer"], _en_hi_components["model"]


def translate_english_to_hindi(english_text):
    """Translate one English string to Hindi with the MarianMT en-hi checkpoint.

    The tokenizer and model are loaded on the first call and reused afterwards,
    rather than being reloaded for every sentence.

    Args:
        english_text: Source text; truncated to 512 tokens.

    Returns:
        The decoded Hindi translation with special tokens removed.
    """
    tokenizer, model = _get_en_hi_components()

    # Tokenize input with padding and truncation
    inputs = tokenizer(
        english_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    # Generate translation with beam search and a repetition constraint
    with torch.no_grad():
        translated_tokens = model.generate(
            **inputs,
            max_length=512,
            num_beams=4,               # Better than greedy search
            no_repeat_ngram_size=2,    # Avoid word repetition
            early_stopping=True        # Stop when appropriate
        )

    # Decode the output
    hindi_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    return hindi_text

# Demo inputs, grouped from simplest to hardest.
sentences = {
    "Easy": [
        "Good morning",
        "What is your name?",
        "I love my country",
    ],
    "Medium": [
        "The Prime Minister announced new economic policies yesterday",
        "Climate change is affecting agricultural productivity",
        "The conference discussed artificial intelligence applications",
    ],
    "Tough": [
        "Quantum computing leverages superposition and entanglement to perform calculations",
        "The geopolitical implications of the recent trade agreements are multifaceted",
        "Neuroplasticity refers to the brain's ability to reorganize synaptic connections",
    ],
}

# Print a banner per difficulty level, then each sentence with its translation.
for difficulty_level, sentence_list in sentences.items():
    print(
        f"\n{'=' * 40}",
        f"{difficulty_level.upper()} SENTENCES",
        f"{'=' * 40}",
        sep="\n",
    )

    for sentence in sentence_list:
        translation = translate_english_to_hindi(sentence)
        print(
            f"\nEnglish: {sentence}",
            f"Hindi: {translation}",
            "-" * 60,
            sep="\n",
        )


EASY SENTENCES


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


English: Good morning
Hindi: सुप्रभात
------------------------------------------------------------

English: What is your name?
Hindi: आपका Windows Live कूटशब्द क्या है?
------------------------------------------------------------

English: I love my country
Hindi: मैं अपने देश से प्यार
------------------------------------------------------------

MEDIUM SENTENCES

English: The Prime Minister announced new economic policies yesterday
Hindi: प्रधानमंत्री मंत्री ने कल नए आर्थिक नियमों की घोषणा की
------------------------------------------------------------

English: Climate change is affecting agricultural productivity
Hindi: जलवायु परिवर्तनों को कृषि उत्पादन को प्रभावित कर रहा है
------------------------------------------------------------

English: The conference discussed artificial intelligence applications
Hindi: सम्मेलन ने कृत्रिम बुद्धि अनुप्रयोग की चर्चा की
------------------------------------------------------------

TOUGH SENTENCES

English: Quantum computing leverages superpo

### Greedy Search

In [None]:
# Greedy Search
from datasets import load_dataset
from evaluate import load
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load the corpus and keep the test split that evaluate_model will score.
ds = load_dataset("cfilt/iitb-english-hindi")
test_data = ds["test"]

# English-to-Hindi Marian checkpoint, loaded as published (no weights from
# ./pretrain_model_7l are used in this cell).
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def evaluate_model(model, tokenizer, dataset, generation_kwargs=None, progress_every=100):
    """Translate every example in ``dataset`` and score it with BLEU, ROUGE and METEOR.

    Args:
        model: Translation model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Sized iterable of examples shaped like
            ``{"translation": {"en": ..., "hi": ...}}``.
        generation_kwargs: Extra keyword arguments forwarded to
            ``model.generate``. None (the default) uses the model's default
            generation settings, as this function always did.
        progress_every: Print a progress line every this many examples and for
            the final one; 0 or None disables progress output.

    Returns:
        Tuple ``(bleu_score, rouge_score, meteor_score)`` holding the result
        dictionaries of the three metrics.
    """
    model.eval()
    metric_bleu = load("sacrebleu")
    metric_rouge = load("rouge")
    metric_meteor = load("meteor")

    generation_kwargs = dict(generation_kwargs or {})
    total = len(dataset)
    references = []
    predictions = []

    for idx, example in enumerate(dataset, start=1):
        # Throttled progress: one line per example floods the notebook output.
        if progress_every and (idx % progress_every == 0 or idx == total):
            print(f"Processing {idx}/{total}...")

        input_text = example["translation"]["en"]
        ref_text = example["translation"]["hi"]

        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
        # Keep the inputs on the same device as the model.
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        references.append(ref_text)
        predictions.append(pred_text)

    bleu_score = metric_bleu.compute(predictions=predictions, references=references)
    # ROUGE's default tokenizer keeps only lowercase ASCII letters and digits,
    # which discards Devanagari text; tokenize on whitespace instead.
    rouge_score = metric_rouge.compute(
        predictions=predictions,
        references=references,
        tokenizer=lambda text: text.split(),
    )
    meteor_score = metric_meteor.compute(predictions=predictions, references=references)

    return bleu_score, rouge_score, meteor_score

# Score the test split and unpack the three metric result dictionaries.
bleu, rouge, meteor = evaluate_model(model, tokenizer, test_data)

# Print the report in one call, one line per metric; BLEU is rescaled from
# sacrebleu's 0-100 range to 0-1.
print(
    "\nEvaluation Results:",
    f"BLEU Score: {bleu['score']/100}",
    f"ROUGE-1 Score: {rouge['rouge1']}",
    f"ROUGE-2 Score: {rouge['rouge2']}",
    f"ROUGE-L Score: {rouge['rougeL']}",
    f"METEOR Score: {meteor['meteor']}",
    sep="\n",
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Processing 1/2507...
Processing 2/2507...
Processing 3/2507...
Processing 4/2507...
Processing 5/2507...
Processing 6/2507...
Processing 7/2507...
Processing 8/2507...
Processing 9/2507...
Processing 10/2507...
Processing 11/2507...
Processing 12/2507...
Processing 13/2507...
Processing 14/2507...
Processing 15/2507...
Processing 16/2507...
Processing 17/2507...
Processing 18/2507...
Processing 19/2507...
Processing 20/2507...
Processing 21/2507...
Processing 22/2507...
Processing 23/2507...
Processing 24/2507...
Processing 25/2507...
Processing 26/2507...
Processing 27/2507...
Processing 28/2507...
Processing 29/2507...
Processing 30/2507...
Processing 31/2507...
Processing 32/2507...
Processing 33/2507...
Processing 34/2507...
Processing 35/2507...
Processing 36/2507...
Processing 37/2507...
Processing 38/2507...
Processing 39/2507...
Processing 40/2507...
Processing 41/2507...
Processing 42/2507...
Processing 43/2507...
Processing 44/2507...
Processing 45/2507...
Processing 46/2507.

In [None]:
# Beam Search
from datasets import load_dataset
from evaluate import load
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load the corpus and keep the test split that evaluate_model will score.
ds = load_dataset("cfilt/iitb-english-hindi")
test_data = ds["test"]

# Same English-to-Hindi Marian checkpoint as the greedy-search cell, so the two
# runs differ only in how evaluate_model calls generate().
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def evaluate_model(model, tokenizer, dataset, generation_kwargs=None, progress_every=100):
    """Translate every example with beam search and score BLEU, ROUGE and METEOR.

    Args:
        model: Translation model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Sized iterable of examples shaped like
            ``{"translation": {"en": ..., "hi": ...}}``.
        generation_kwargs: Keyword arguments forwarded to ``model.generate``.
            None (the default) uses this cell's beam-search settings:
            5 beams, no repeated bigrams, early stopping.
        progress_every: Print a progress line every this many examples and for
            the final one; 0 or None disables progress output.

    Returns:
        Tuple ``(bleu_score, rouge_score, meteor_score)`` holding the result
        dictionaries of the three metrics.
    """
    model.eval()
    metric_bleu = load("sacrebleu")
    metric_rouge = load("rouge")
    metric_meteor = load("meteor")

    if generation_kwargs is None:
        generation_kwargs = {
            "num_beams": 5,
            "no_repeat_ngram_size": 2,
            "early_stopping": True,
        }
    total = len(dataset)
    references = []
    predictions = []

    for idx, example in enumerate(dataset, start=1):
        # Throttled progress: one line per example floods the notebook output.
        if progress_every and (idx % progress_every == 0 or idx == total):
            print(f"Processing {idx}/{total}...")

        input_text = example["translation"]["en"]
        ref_text = example["translation"]["hi"]

        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
        # Keep the inputs on the same device as the model.
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        references.append(ref_text)
        predictions.append(pred_text)

    bleu_score = metric_bleu.compute(predictions=predictions, references=references)
    # ROUGE's default tokenizer keeps only lowercase ASCII letters and digits,
    # which discards Devanagari text; tokenize on whitespace instead.
    rouge_score = metric_rouge.compute(
        predictions=predictions,
        references=references,
        tokenizer=lambda text: text.split(),
    )
    meteor_score = metric_meteor.compute(predictions=predictions, references=references)

    return bleu_score, rouge_score, meteor_score

# Score the test split and unpack the three metric result dictionaries.
bleu, rouge, meteor = evaluate_model(model, tokenizer, test_data)

# Print the report in one call, one line per metric; BLEU is rescaled from
# sacrebleu's 0-100 range to 0-1.
print(
    "\nEvaluation Results:",
    f"BLEU Score: {bleu['score']/100}",
    f"ROUGE-1 Score: {rouge['rouge1']}",
    f"ROUGE-2 Score: {rouge['rouge2']}",
    f"ROUGE-L Score: {rouge['rougeL']}",
    f"METEOR Score: {meteor['meteor']}",
    sep="\n",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Processing 1/2507...
Processing 2/2507...
Processing 3/2507...
Processing 4/2507...
Processing 5/2507...
Processing 6/2507...
Processing 7/2507...
Processing 8/2507...
Processing 9/2507...
Processing 10/2507...
Processing 11/2507...
Processing 12/2507...
Processing 13/2507...
Processing 14/2507...
Processing 15/2507...
Processing 16/2507...
Processing 17/2507...
Processing 18/2507...
Processing 19/2507...
Processing 20/2507...
Processing 21/2507...
Processing 22/2507...
Processing 23/2507...
Processing 24/2507...
Processing 25/2507...
Processing 26/2507...
Processing 27/2507...
Processing 28/2507...
Processing 29/2507...
Processing 30/2507...
Processing 31/2507...
Processing 32/2507...
Processing 33/2507...
Processing 34/2507...
Processing 35/2507...
Processing 36/2507...
Processing 37/2507...
Processing 38/2507...
Processing 39/2507...
Processing 40/2507...
Processing 41/2507...
Processing 42/2507...
Processing 43/2507...
Processing 44/2507...
Processing 45/2507...
Processing 46/2507.