In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import torch
import re
from collections import defaultdict

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load m2m100 multilingual model
model_name = "facebook/m2m100_418M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Load standard BLEU (not SacreBLEU)
bleu = evaluate.load("bleu")

# Helper: normalize text for fair comparison
def normalize_text(text):
    text = text.strip()
    text = re.sub(r"[।\.\,\!\?]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)         # normalize spaces
    text = text.lower()
    return text

# Define dataset with all pairs
data = [
    # English → Hindi
    {"src_lang": "en", "tgt_lang": "hi", "input": "The weather is good today.", "expected": "आज मौसम अच्छा है"},
    {"src_lang": "en", "tgt_lang": "hi", "input": "I am reading a book.", "expected": "मैं किताब पढ़ रहा हूँ"},

    # English → Marathi
    {"src_lang": "en", "tgt_lang": "mr", "input": "The weather is good today.", "expected": "आज हवामान छान आहे"},
    {"src_lang": "en", "tgt_lang": "mr", "input": "I am reading a book.", "expected": "मी पुस्तक वाचत आहे"},

    # Hindi → English
    {"src_lang": "hi", "tgt_lang": "en", "input": "मैं स्कूल जा रहा हूँ", "expected": "I am going to school"},
    {"src_lang": "hi", "tgt_lang": "en", "input": "आज बहुत गर्मी है", "expected": "It is very hot today"},

    # Marathi → English
    {"src_lang": "mr", "tgt_lang": "en", "input": "मी पाणी पित आहे", "expected": "I am drinking water"},
    {"src_lang": "mr", "tgt_lang": "en", "input": "आज हवामान छान आहे", "expected": "The weather is nice today"},

    # Hindi → Marathi
    {"src_lang": "hi", "tgt_lang": "mr", "input": "मैं बाजार जा रहा हूँ", "expected": "मी बाजारात जात आहे"},
    {"src_lang": "hi", "tgt_lang": "mr", "input": "मैं पानी पी रहा हूँ", "expected": "मी पाणी पित आहे"},

    # Marathi → Hindi
    {"src_lang": "mr", "tgt_lang": "hi", "input": "मी घरी जात आहे", "expected": "मैं घर जा रहा हूँ"},
    {"src_lang": "mr", "tgt_lang": "hi", "input": "मी पुस्तक वाचत आहे", "expected": "मैं किताब पढ़ रहा हूँ"},
]

# Group BLEU scores by language pair
lang_pair_scores = defaultdict(list)

def translate(sample):
    src, src_lang, tgt_lang = sample["input"], sample["src_lang"], sample["tgt_lang"]
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(src, return_tensors="pt", padding=True).to(device)
    forced_token_id = tokenizer.get_lang_id(tgt_lang)
    with torch.no_grad():
        outputs = model.generate(**inputs, forced_bos_token_id=forced_token_id, max_length=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Evaluate per language pair
for sample in data:
    pred = translate(sample)
    ref = sample["expected"]
    pair = f"{sample['src_lang']}→{sample['tgt_lang']}"

    pred_norm = normalize_text(pred)
    ref_norm = normalize_text(ref)

    # Compute BLEU for each example individually
    # predictions should be list of strings, references should be list of list of strings
    bleu_score = bleu.compute(predictions=[pred_norm], references=[[ref_norm]])["bleu"]
    lang_pair_scores[pair].append(bleu_score)

    print(f"\n{pair}")
    print(f"Input: {sample['input']}")
    print(f"Predicted: {pred}")
    print(f"Expected:  {ref}")
    print(f"BLEU: {bleu_score:.4f}")

# Compute average BLEU per pair
print("\n=== AVERAGE BLEU SCORES PER LANGUAGE PAIR ===")
for pair, scores in lang_pair_scores.items():
    avg = sum(scores) / len(scores)
    print(f"{pair:10s} : {avg:.4f}")

# Overall average
overall = sum(sum(v) / len(v) for v in lang_pair_scores.values()) / len(lang_pair_scores)
print(f"\nOverall Average BLEU: {overall:.4f}")


en→hi
Input: The weather is good today.
Predicted: आज मौसम अच्छा है।
Expected:  आज मौसम अच्छा है
BLEU: 1.0000

en→hi
Input: I am reading a book.
Predicted: मैं एक किताब पढ़ रहा हूं।
Expected:  मैं किताब पढ़ रहा हूँ
BLEU: 0.0000

en→mr
Input: The weather is good today.
Predicted: आज मौसम चांगला आहे.
Expected:  आज हवामान छान आहे
BLEU: 0.0000

en→mr
Input: I am reading a book.
Predicted: मी एक पुस्तक वाचतो.
Expected:  मी पुस्तक वाचत आहे
BLEU: 0.0000

hi→en
Input: मैं स्कूल जा रहा हूँ
Predicted: I am going to school.
Expected:  I am going to school
BLEU: 1.0000

hi→en
Input: आज बहुत गर्मी है
Predicted: It is very hot today.
Expected:  It is very hot today
BLEU: 1.0000

mr→en
Input: मी पाणी पित आहे
Predicted: I am drinking water.
Expected:  I am drinking water
BLEU: 1.0000

mr→en
Input: आज हवामान छान आहे
Predicted: The weather is today.
Expected:  The weather is nice today
BLEU: 0.0000

hi→mr
Input: मैं बाजार जा रहा हूँ
Predicted: मी बाजार जातो.
Expected:  मी बाजारात जात आहे
BLEU: 0.0000

