# Generative AI (CS F437) Assignment 1
Training and Evaluating Transformer Models for English to Hindi Translation

## Part 1: Fine-Tuning a Transformer Model

### Import Dependencies

In [None]:
@pip install --quiet --upgrade pip
@pip install numpy pandas matplotlib nltk wandb datasets transformers evaluate

In [None]:
import getpass
import os
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
import wandb

import evaluate
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from transformers import MarianMTModel, MarianTokenizer, TrainingArguments, Trainer

# Fetch the NLTK 'punkt' tokenizer data up front; quiet=True suppresses the download log.
# NOTE(review): the BLEU cell later in this notebook tokenizes with str.split(), so punkt
# does not appear to be used for BLEU itself — confirm which metric actually needs it.
nltk.download('punkt', quiet=True)

We use WandB to store our weights and training data.

In [None]:
# Prefer the WANDB_API_KEY environment variable so the notebook can run unattended.
# Otherwise prompt with getpass, which does not echo the key: input() would leave the
# secret visible in the saved cell output.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("Enter your wandb API key: ")
wandb.login(key=wandb_api_key)

# Initialize wandb run (change project and entity as desired)
wandb.init(
    project="GEN_AI",
    entity="aashreyrachaputi-bits-pilani",
    config={
        "model_name": "Helsinki-NLP/opus-mt-en-hi",
        "dataset": "cfilt/iitb-english-hindi",
        "num_train_epochs": 3,
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        # Matches the freeze_layers(...) call made before training, which passes 7.
        "num_decoder_layers_to_keep_trainable": 7,
    }
)

### Data Loading and Preprocessing
Load the IITB English-Hindi parallel corpus and prepare it for training.

In [None]:
# Fetch all splits of the IITB English-Hindi parallel corpus.
ds = load_dataset("cfilt/iitb-english-hindi")
train_data, val_data, test_data = ds["train"], ds["validation"], ds["test"]

# For each split, build {"en": [...], "hi": [...]} with the raw sentences, for evaluation.
train_texts = {
    lang: [pair[lang] for pair in train_data["translation"]]
    for lang in ("en", "hi")
}
val_texts = {
    lang: [pair[lang] for pair in val_data["translation"]]
    for lang in ("en", "hi")
}
test_texts = {
    lang: [pair[lang] for pair in test_data["translation"]]
    for lang in ("en", "hi")
}

### Training Configuration
Set up the training arguments and trainer.

In [None]:
# The Trainer below needs a model, a tokenizer and tokenized datasets. None of them were
# defined anywhere earlier in the notebook, so a fresh kernel failed with NameError here.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
MAX_SEQ_LENGTH = 128  # same cap the translate() helpers apply at inference time

tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)


def tokenize_pairs(batch):
    """Tokenize a batch of {"en": ..., "hi": ...} pairs into encoder inputs and labels."""
    english = [pair["en"] for pair in batch["translation"]]
    hindi = [pair["hi"] for pair in batch["translation"]]
    return tokenizer(english, text_target=hindi, max_length=MAX_SEQ_LENGTH, truncation=True)


train_dataset = train_data.map(
    tokenize_pairs, batched=True, remove_columns=train_data.column_names
)
val_dataset = val_data.map(
    tokenize_pairs, batched=True, remove_columns=val_data.column_names
)

# Pads inputs per batch and pads labels with -100 so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): recent transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only machines.
    fp16=torch.cuda.is_available(),
    report_to=["wandb"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

### Model Layer Freezing
Freeze specific layers in the model to focus training on the most important components.

In [None]:
def freeze_layers(model, freeze_encoder=True, num_decoder_layers_to_keep_trainable=2):
    """Freeze the whole model, then re-enable gradients on selected components.

    Every parameter is frozen first. The decoder token embeddings, the last
    ``num_decoder_layers_to_keep_trainable`` decoder layers and ``lm_head`` are
    then unfrozen. The encoder embeddings and layers are unfrozen only when
    ``freeze_encoder`` is False. Parameter counts and a short architecture
    summary are sent to wandb through two ``wandb.log`` calls.

    Args:
        model: Object exposing ``parameters()``, ``lm_head`` and
            ``model.encoder`` / ``model.decoder``, each with ``embed_tokens``
            and ``layers``.
        freeze_encoder: When True the encoder is left frozen.
        num_decoder_layers_to_keep_trainable: How many decoder layers, counted
            from the end, stay trainable. Values below 0 or above the decoder
            depth are clamped, and the clamped value is what gets logged.

    Returns:
        None. The model is modified in place.
    """
    def _unfreeze(module):
        # Re-enable gradients for every parameter owned by ``module``.
        for param in module.parameters():
            param.requires_grad = True

    for param in model.parameters():
        param.requires_grad = False

    if not freeze_encoder:
        _unfreeze(model.model.encoder.embed_tokens)
        for layer in model.model.encoder.layers:
            _unfreeze(layer)

    # NOTE(review): if the checkpoint ties encoder and decoder embeddings to one shared
    # tensor, unfreezing the decoder embeddings also unfreezes the encoder's — confirm.
    _unfreeze(model.model.decoder.embed_tokens)

    # Clamp so a request larger than the decoder depth is reported as what actually
    # happened (every decoder layer trainable) instead of the raw requested number.
    decoder_layers = model.model.decoder.layers
    total_decoder_layers = len(decoder_layers)
    trainable_layers = max(0, min(num_decoder_layers_to_keep_trainable, total_decoder_layers))
    for i in range(total_decoder_layers - trainable_layers, total_decoder_layers):
        _unfreeze(decoder_layers[i])

    # Always unfreeze the final output projection
    _unfreeze(model.lm_head)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    # Logged under a "percentage" key, so scale to 0-100; guard against an empty model.
    trainable_percentage = 100.0 * trainable_params / total_params if total_params else 0.0

    wandb.log({
        "trainable_parameters": trainable_params,
        "total_parameters": total_params,
        "trainable_percentage": trainable_percentage
    })

    architecture_info = {
        "encoder_status": "Frozen" if freeze_encoder else "Trainable",
        "decoder_status": f"Partially trainable (last {trainable_layers} layers)",
        "output_projection": "Trainable"
    }
    wandb.log({"model_architecture": architecture_info})

In [None]:
# Keep the encoder frozen and request the last 7 decoder layers as trainable.
# freeze_layers ignores indices below zero, so a request larger than the decoder
# depth simply unfreezes every decoder layer.
freeze_layers(
    model,
    freeze_encoder=True,
    num_decoder_layers_to_keep_trainable=7,
)

In [None]:
# Run the fine-tuning loop configured on the Trainer.
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./pretrain_model_7l")

In [None]:
def translate(texts, batch_size=8):
    """Translate a list of sentences with the global ``tokenizer`` and ``model``.

    The texts are processed in consecutive slices of ``batch_size``. Each slice
    is tokenized with padding and truncation to 128 tokens, moved to
    ``model.device`` and decoded with 5-beam search.

    Args:
        texts: Sequence of source sentences; an empty sequence yields ``[]``.
        batch_size: Number of sentences per ``generate`` call. Defaults to 8,
            the value that was previously hard-coded.

    Returns:
        List of decoded translations, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return no translations.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    translations = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        # Ensure tensors are on the correct device
        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device)
        # Generate translations using beam search
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )
        translations.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return translations

### Evaluation

Once we have trained the model and saved it to our hard drive, we can load it directly from there for testing.

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Directory the training cell wrote the model and tokenizer into.
model_dir = "./pretrain_model_7l"

# Restore the saved weights, then the tokenizer stored alongside them.
model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)

# Switch the model to evaluation mode for inference.
model.eval()

#### Interactive Translation Demo

In [None]:
# Simple translation loop: type 'exit' (or send EOF / interrupt the kernel) to leave it.
while True:
    try:
        english_text = input("Enter an English sentence (or 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or a user interrupt ends the demo instead of raising out of the cell.
        break
    english_text = english_text.strip()
    if english_text.lower() == 'exit':
        break
    if not english_text:
        # Nothing to translate; ask again.
        continue
    # Truncate over-long input and place the tensors on the same device as the model.
    inputs = tokenizer(english_text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        generated_tokens = model.generate(**inputs)
    hindi_translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    print("Hindi Translation:", hindi_translation)

#### Model Evaluation

Evaluate the translation model using standard metrics including:
- BLEU score: Measures n-gram overlap between translations and references
- ROUGE score: Measures recall of n-grams between translations and references
- METEOR score: Measures word-to-word matches between translations and references

In [None]:
import torch
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import evaluate

nltk.download('punkt')

# Metric implementations from the `evaluate` library.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Test split only; split the sentence pairs into parallel English / Hindi lists.
dataset = load_dataset("cfilt/iitb-english-hindi", split="test")
source_texts = [pair["en"] for pair in dataset["translation"]]
reference_texts = [pair["hi"] for pair in dataset["translation"]]

# Sanity check: both lists should report the same length.
print(len(source_texts))
print(len(reference_texts))

# NOTE(review): this loads the base hub checkpoint, not the fine-tuned weights saved in
# "./pretrain_model_7l" — confirm whether a baseline evaluation is intended here.
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

def translate(texts, batch_size=8):
    """Translate ``texts`` in slices of ``batch_size`` using the global tokenizer and model.

    Each slice is tokenized (padded, truncated to 128 tokens), moved to
    ``model.device`` and decoded with 5-beam search under ``torch.no_grad()``.
    Returns the decoded strings in input order.
    """
    translated = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start : start + batch_size]
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=128)
        # Move every tensor of the encoding onto the model's device.
        on_device = {}
        for name, tensor in encoded.items():
            on_device[name] = tensor.to(model.device)
        with torch.no_grad():
            generated = model.generate(**on_device, num_beams=5, max_length=128, early_stopping=True)
        translated += tokenizer.batch_decode(generated, skip_special_tokens=True)
    return translated

# Translate the whole test set with the currently loaded model.
predictions = translate(source_texts)

# corpus_bleu expects, per sentence, a list of reference token lists and one hypothesis token list.
references_tokenized = [[ref.split()] for ref in reference_texts]
predictions_tokenized = [pred.split() for pred in predictions]

# Compute BLEU score (method1 smoothing avoids zero scores when an n-gram order has no match)
smoothing_function = SmoothingFunction().method1
bleu_score = corpus_bleu(references_tokenized, predictions_tokenized, smoothing_function=smoothing_function)
print(f"Corpus BLEU Score: {bleu_score:.4f}")

# Compute ROUGE scores
# The metric's default tokenizer keeps only lowercase ASCII letters and digits, which would
# discard Devanagari text; use whitespace tokens instead, consistent with BLEU above.
rouge_results = rouge.compute(
    predictions=predictions,
    references=reference_texts,
    tokenizer=lambda text: text.split(),
)
print(f"ROUGE-1 F1 Score: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2 F1 Score: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L F1 Score: {rouge_results['rougeL']:.4f}")

# Compute METEOR score
meteor_score = meteor.compute(predictions=predictions, references=reference_texts)
print(f"METEOR Score: {meteor_score['meteor']:.4f}")

#### Visualize Results

The cell below displays a random sample of test sentences with their reference and predicted Hindi translations, so translation quality can be inspected by eye.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import random

# Number of test sentences to display.
num_examples = 20
random_indices = random.sample(range(len(source_texts)), num_examples)

# One row per sampled sentence: source, reference translation, model output.
examples = [
    {
        "English": source_texts[i],
        "Hindi (Reference)": reference_texts[i],
        "Hindi (Predicted)": predictions[i],
    }
    for i in random_indices
]

pd.DataFrame(examples)