In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import math

# Checkpoint identifier of Google's multilingual T5 (small variant)
model_name = "google/mt5-small"

# Run on the GPU when CUDA is usable, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the matching tokenizer and seq2seq model, placing the model on `device`
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# One sample sentence per language, keyed by language name
# (swap these out for real corpus data when available)
texts = dict(
    English="Artificial intelligence and machine learning are transforming industries.",
    Hindi="कृत्रिम बुद्धिमत्ता और मशीन लर्निंग उद्योगों को बदल रही हैं।",
    Marathi="कृत्रिम बुद्धिमत्ता आणि मशीन लर्निंग उद्योगांमध्ये परिवर्तन घडवत आहेत.",
)

def compute_perplexity(text, max_length=None):
    """Return ``exp(loss)`` for ``text`` under the module-level ``model``.

    The tokenized text is fed to the model as the input and is also used as
    the labels; the model's reported loss is then exponentiated.

    NOTE(review): the sample run of this notebook prints values in the
    range 1e5 to 1e13, which suggests the input-as-target setup does not
    match what this checkpoint was trained to do. Treat the numbers as a
    rough relative indicator at best -- confirm the methodology before
    drawing conclusions.

    Args:
        text: The string to score. A list of strings is also accepted by the
            tokenizer call, in which case one loss over the whole batch is
            returned.
        max_length: Optional cap on the number of tokens. ``None`` (the
            default) means no truncation.

    Returns:
        The perplexity as a Python float.
    """
    # Truncation is only requested when a bound is given: asking for
    # truncation without max_length makes the tokenizer emit a warning and
    # fall back to no truncation anyway.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=max_length is not None,
        max_length=max_length,
        padding=True,
    ).to(device)

    # Padding positions (attention_mask == 0) must not count toward the loss;
    # -100 is the label value the loss computation skips.
    labels = inputs["input_ids"].masked_fill(inputs["attention_mask"] == 0, -100)

    # Inference only: no gradients are needed to read the loss.
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
    return math.exp(loss.item())

# Score each sample sentence and report the value as soon as it is computed
results = {}
for language in texts:
    results[language] = compute_perplexity(texts[language])
    print(f"{language} Perplexity: {results[language]:.2f}")

# Optional: recap every score together for a side-by-side look
print("\n--- Multilinguality Balance Summary ---")
for language in results:
    print(f"{language}: {results[language]:.2f}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


English Perplexity: 479115.35
Hindi Perplexity: 10206386883393.80
Marathi Perplexity: 2064627.93

--- Multilinguality Balance Summary ---
English: 479115.35
Hindi: 10206386883393.80
Marathi: 2064627.93
