
Perplexity Metric on mT5

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import math

# Load Google's mT5 model
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Sample texts in different languages (can be replaced with corpus data)
texts = {
    "English": "Artificial intelligence and machine learning are transforming industries.",
    "Hindi": "कृत्रिम बुद्धिमत्ता और मशीन लर्निंग उद्योगों को बदल रही हैं।",
    "Marathi": "कृत्रिम बुद्धिमत्ता आणि मशीन लर्निंग उद्योगांमध्ये परिवर्तन घडवत आहेत."
}

def compute_perplexity(text):
    # Tokenize and encode
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return math.exp(loss.item())

# Evaluate perplexity per language
results = {}
for lang, text in texts.items():
    ppl = compute_perplexity(text)
    results[lang] = ppl
    print(f"{lang} Perplexity: {ppl:.2f}")

# Optional: compare balance
print("\n--- Multilinguality Balance Summary ---")
for lang, ppl in results.items():
    print(f"{lang}: {ppl:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


English Perplexity: 479115.35
Hindi Perplexity: 10206386883393.80
Marathi Perplexity: 2064627.93

--- Multilinguality Balance Summary ---
English: 479115.35
Hindi: 10206386883393.80
Marathi: 2064627.93


Gave 3 lines of text for all languages


In [None]:
texts = {
    "English": [
        "Artificial intelligence and machine learning are transforming industries.",
        "Data science is changing how decisions are made in every field.",
        "Natural language processing helps computers understand human language."
    ],
    "Hindi": [
        "कृत्रिम बुद्धिमत्ता और मशीन लर्निंग उद्योगों को बदल रही हैं।",
        "डेटा साइंस हर क्षेत्र में निर्णय लेने का तरीका बदल रही है।",
        "प्राकृतिक भाषा संसाधन कंप्यूटरों को मानव भाषा समझने में मदद करता है।"
    ],
    "Marathi": [
        "कृत्रिम बुद्धिमत्ता आणि मशीन लर्निंग उद्योगांमध्ये परिवर्तन घडवत आहेत.",
        "डेटा सायन्स निर्णय घेण्याची प्रक्रिया बदलत आहे.",
        "नैसर्गिक भाषा प्रक्रिया संगणकांना मानवी भाषा समजून घेण्यास मदत करते."
    ]
}

def compute_avg_loss(texts):
    losses = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        losses.append(outputs.loss.item())
    return sum(losses) / len(losses)

results = {}
for lang, samples in texts.items():
    avg_loss = compute_avg_loss(samples)
    results[lang] = math.exp(avg_loss)  # or just use avg_loss directly
    print(f"{lang}: Avg Loss = {avg_loss:.4f}, Perplexity = {math.exp(avg_loss):.2f}")


English: Avg Loss = 12.8774, Perplexity = 391373.23
Hindi: Avg Loss = 19.5817, Perplexity = 319314266.68
Marathi: Avg Loss = 16.1711, Perplexity = 10544415.85


Gave 2000 Lines of text

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import math
import pandas as pd
from tqdm import tqdm

# ===== 1️⃣ Load Model and Tokenizer =====
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# ===== 2️⃣ Load Your Dataset =====
file_path = "Multilingual_text_dataset.csv"  # Update path if needed
df = pd.read_csv(file_path)

languages = ["English", "Hindi", "Marathi"]

# ===== 3️⃣ Function to compute average loss & perplexity =====
def compute_avg_loss_and_ppl(sentences):
    total_loss = 0.0
    count = 0
    for text in tqdm(sentences, desc="Evaluating", leave=False):
        if isinstance(text, str) and len(text.strip()) > 0:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
            with torch.no_grad():
                outputs = model(**inputs, labels=inputs["input_ids"])
                loss = outputs.loss.item()
                total_loss += loss
                count += 1
    avg_loss = total_loss / max(count, 1)
    perplexity = math.exp(avg_loss)
    return avg_loss, perplexity

# ===== 4️⃣ Evaluate each language =====
results = {}
for lang in languages:
    print(f"\nEvaluating {lang} text...")
    avg_loss, ppl = compute_avg_loss_and_ppl(df[lang].dropna().tolist())
    results[lang] = {"Average Loss": avg_loss, "Perplexity": ppl}
    print(f"{lang}: Average Loss = {avg_loss:.4f}, Perplexity = {ppl:.2f}")

# ===== 5️⃣ Summary =====
print("\n=== Multilingual Perplexity Summary (mT5-small) ===")
for lang, metrics in results.items():
    print(f"{lang:8s} | Loss: {metrics['Average Loss']:.4f} | PPL: {metrics['Perplexity']:.2f}")



Evaluating English text...


Evaluating:   0%|          | 0/2000 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


English: Average Loss = 13.9612, Perplexity = 1156867.92

Evaluating Hindi text...




Hindi: Average Loss = 17.2048, Perplexity = 29643948.43

Evaluating Marathi text...


                                                               

Marathi: Average Loss = 15.5103, Perplexity = 5445353.15

=== Multilingual Perplexity Summary (mT5-small) ===
English  | Loss: 13.9612 | PPL: 1156867.92
Hindi    | Loss: 17.2048 | PPL: 29643948.43
Marathi  | Loss: 15.5103 | PPL: 5445353.15




Gave 5000 lines of text

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import math
import pandas as pd
from tqdm import tqdm

# ===== 1️⃣ Load Model and Tokenizer =====
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# ===== 2️⃣ Load Your Dataset =====
file_path = "Multilingual_text_dataset - 2.csv"  # Update path if needed
df = pd.read_csv(file_path)

languages = ["English", "Hindi", "Marathi"]

# ===== 3️⃣ Function to compute average loss & perplexity =====
def compute_avg_loss_and_ppl(sentences):
    total_loss = 0.0
    count = 0
    for text in tqdm(sentences, desc="Evaluating", leave=False):
        if isinstance(text, str) and len(text.strip()) > 0:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
            with torch.no_grad():
                outputs = model(**inputs, labels=inputs["input_ids"])
                loss = outputs.loss.item()
                total_loss += loss
                count += 1
    avg_loss = total_loss / max(count, 1)
    perplexity = math.exp(avg_loss)
    return avg_loss, perplexity

# ===== 4️⃣ Evaluate each language =====
results = {}
for lang in languages:
    print(f"\nEvaluating {lang} text...")
    avg_loss, ppl = compute_avg_loss_and_ppl(df[lang].dropna().tolist())
    results[lang] = {"Average Loss": avg_loss, "Perplexity": ppl}
    print(f"{lang}: Average Loss = {avg_loss:.4f}, Perplexity = {ppl:.2f}")

# ===== 5️⃣ Summary =====
print("\n=== Multilingual Perplexity Summary (mT5-small) ===")
for lang, metrics in results.items():
    print(f"{lang:8s} | Loss: {metrics['Average Loss']:.4f} | PPL: {metrics['Perplexity']:.2f}")





Evaluating English text...


Evaluating:   0%|          | 0/5000 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


English: Average Loss = 13.9612, Perplexity = 1156867.92

Evaluating Hindi text...




Hindi: Average Loss = 17.2048, Perplexity = 29643948.43

Evaluating Marathi text...


                                                               

Marathi: Average Loss = 15.3470, Perplexity = 4625023.26

=== Multilingual Perplexity Summary (mT5-small) ===
English  | Loss: 13.9612 | PPL: 1156867.92
Hindi    | Loss: 17.2048 | PPL: 29643948.43
Marathi  | Loss: 15.3470 | PPL: 4625023.26


