In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the release this notebook was last executed with so re-runs are reproducible.
%pip install -q evaluate==0.4.6


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# rouge_score backs evaluate.load("rouge"); pinned to the version this notebook
# was last executed with, and installed via %pip so it targets the kernel env.
%pip install -q rouge_score==0.1.2


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9666dcfef091d6dffbe6dcd306bc74011ded50ede25af689eee5965201b35fe0
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import torch

# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hub identifiers of the two checkpoints being compared.
base_model_name = "google/mt5-base"
ft_model_name = "suryakantmani/mt5-vanilla-finetune-summarization"

# One tokenizer, loaded from the fine-tuned checkpoint, is shared by both models.
tokenizer = AutoTokenizer.from_pretrained(ft_model_name)

# Load each model, then move it onto the selected device.
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
base_model = base_model.to(device)
ft_model = AutoModelForSeq2SeqLM.from_pretrained(ft_model_name)
ft_model = ft_model.to(device)

# Metric object used by the evaluation helper further down.
rouge = evaluate.load("rouge")

# Small hand-written evaluation set: one source text ("src") and one reference
# summary ("ref") per entry, in Hindi, French and English.
eval_data = [
    dict(
        src="भारत एक बड़ा देश है जिसमें कई राज्य हैं। यह एशिया में स्थित है और इसकी राजधानी नई दिल्ली है।",
        ref="भारत एशिया में स्थित एक बड़ा देश है।",
    ),
    dict(
        src="La Terre tourne autour du Soleil et c’est ce mouvement qui crée les saisons sur notre planète.",
        ref="La rotation de la Terre autour du Soleil cause les saisons.",
    ),
    dict(
        src="The ozone layer protects Earth from harmful ultraviolet rays from the Sun.",
        ref="The ozone layer shields Earth from UV rays.",
    ),
]

def generate_summary(model, text, max_input_length=512, max_new_tokens=80, num_beams=4):
    """Generate a summary of ``text`` with ``model`` using beam search.

    Uses the module-level ``tokenizer`` to encode/decode and the module-level
    ``device`` as the target for the encoded inputs.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=..., num_beams=...)``.
        text: Source text to summarize.
        max_input_length: Maximum number of input tokens kept by the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens skipped.
    """
    # truncation=True only takes effect when a max_length is supplied; without it
    # the tokenizer warns and falls back to no truncation at all.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
        padding=True,
    ).to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens, num_beams=num_beams)
    return tokenizer.decode(out[0], skip_special_tokens=True)

def _unicode_tokenize(text):
    """Lower-case ``text``, turn Unicode punctuation into spaces, split on whitespace."""
    import unicodedata

    cleaned = "".join(
        " " if unicodedata.category(ch).startswith("P") else ch
        for ch in text.lower()
    )
    return cleaned.split()


def evaluate_model(model, name):
    """Summarize every sample in ``eval_data`` with ``model`` and report ROUGE-L.

    Prints each input/prediction/reference triple and the aggregate score,
    labelled with ``name``.

    Args:
        model: Model handed to ``generate_summary``.
        name: Label used in the printed output.

    Returns:
        The ``"rougeL"`` value returned by ``rouge.compute`` over all samples.
    """
    preds, refs = [], []
    for sample in eval_data:
        pred = generate_summary(model, sample["src"])
        print(f"\n[{name}] Input: {sample['src']}\n→ Pred: {pred}\nRef: {sample['ref']}")
        preds.append(pred)
        refs.append(sample["ref"])
    # The metric's default tokenizer is designed for English text and is not
    # suitable for the Hindi sample, so a script-agnostic tokenizer is supplied.
    scores = rouge.compute(predictions=preds, references=refs, tokenizer=_unicode_tokenize)
    rouge_l = scores["rougeL"]  # read directly as a number (per the original "new API" note)
    print(f"\n{name} ROUGE-L: {rouge_l:.4f}")
    return rouge_l


# Score both checkpoints in a fixed order: the base model first, then the
# fine-tuned one. Each call also prints its own per-sample report.
base_score, ft_score = (
    evaluate_model(candidate, label)
    for candidate, label in ((base_model, "BASE mT5"), (ft_model, "FINE-TUNED mT5"))
)

# Reported as base minus fine-tuned: a positive value means the fine-tuned
# model scored lower than the base model on this set.
print(f"\n Catastrophic Forgetting (ΔROUGE-L) = {base_score - ft_score:.4f}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



[BASE mT5] Input: भारत एक बड़ा देश है जिसमें कई राज्य हैं। यह एशिया में स्थित है और इसकी राजधानी नई दिल्ली है।
→ Pred:  <extra_id_0> नई दिल्ली
Ref: भारत एशिया में स्थित एक बड़ा देश है।

[BASE mT5] Input: La Terre tourne autour du Soleil et c’est ce mouvement qui crée les saisons sur notre planète.
→ Pred:  <extra_id_0>. La Terre tourne...
Ref: La rotation de la Terre autour du Soleil cause les saisons.

[BASE mT5] Input: The ozone layer protects Earth from harmful ultraviolet rays from the Sun.
→ Pred:  <extra_id_0> ozone layer
Ref: The ozone layer shields Earth from UV rays.

BASE mT5 ROUGE-L: 0.1810

[FINE-TUNED mT5] Input: भारत एक बड़ा देश है जिसमें कई राज्य हैं। यह एशिया में स्थित है और इसकी राजधानी नई दिल्ली है।
→ Pred:  'भारत' - 'भारत - 'भारत-एक बड़ा देश' - 'भारत - 'भारत - 'भारत - 'भारत - '
Ref: भारत एशिया में स्थित एक बड़ा देश है।

[FINE-TUNED mT5] Input: La Terre tourne autour du Soleil et c’est ce mouvement qui crée les saisons sur notre planète.
→ Pred:  La Terre tourne auto

In [None]:
# bert_score backs evaluate.load("bertscore"); pinned to the version this notebook
# was last executed with, and installed via %pip so it targets the kernel env.
%pip install -q bert_score==0.3.13

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate, torch
from torch.nn.functional import cosine_similarity

# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hub identifiers of the two checkpoints being compared.
base_model_name = "google/mt5-base"
ft_model_name = "suryakantmani/mt5-vanilla-finetune-summarization"

# One tokenizer, loaded from the fine-tuned checkpoint, is shared by both models.
tokenizer = AutoTokenizer.from_pretrained(ft_model_name)

# Load each model, then move it onto the selected device.
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
base_model = base_model.to(device)
ft_model = AutoModelForSeq2SeqLM.from_pretrained(ft_model_name)
ft_model = ft_model.to(device)

# Scoring metrics used by the per-language comparison.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# Hand-written evaluation set: one entry per language, each carrying a language
# code ("lang"), a source passage ("src") and a reference summary ("ref").
eval_data = [
    dict(
        lang="hi",
        src=(
            "भारत एक विशाल और विविधता से भरा देश है जहाँ अनेक भाषाएँ, संस्कृतियाँ और परंपराएँ पाई जाती हैं। "
            "यहाँ हिमालय की बर्फ़ीली चोटियों से लेकर दक्षिण के समुद्र तटों तक प्राकृतिक विविधता देखने को मिलती है। "
            "भारत की राजधानी नई दिल्ली है और यह एशिया के प्रमुख देशों में से एक है।"
        ),
        ref="भारत एक सांस्कृतिक और भौगोलिक विविधता वाला एशियाई देश है जिसकी राजधानी नई दिल्ली है।",
    ),
    dict(
        lang="mr",
        src=(
            "भारत हा विविध संस्कृती, भाषा आणि धर्म असलेला एक विशाल देश आहे. "
            "उत्तरमध्ये हिमालय पर्वतरांग आणि दक्षिणमध्ये समुद्रकिनारे असून या देशात नैसर्गिक संपत्ती विपुल आहे. "
            "नवी दिल्ली ही भारताची राजधानी आहे आणि तो आशियातील महत्त्वाचा देश आहे."
        ),
        ref="भारत हा नैसर्गिक आणि सांस्कृतिक विविधतेने समृद्ध देश असून त्याची राजधानी नवी दिल्ली आहे.",
    ),
    dict(
        lang="en",
        src=(
            "India is a vast and diverse country known for its many languages, cultures, and traditions. "
            "From the snowy peaks of the Himalayas in the north to the coastal plains of the south, "
            "the nation’s geography is rich and varied. The capital city is New Delhi, an important center in Asia."
        ),
        ref="India is a diverse Asian nation with rich culture and its capital is New Delhi.",
    ),
    dict(
        lang="fr",
        src=(
            "L’Inde est un pays vaste et diversifié, connu pour ses nombreuses langues et cultures. "
            "Des montagnes de l’Himalaya au nord jusqu’aux plages du sud, le pays offre une grande variété de paysages. "
            "Sa capitale, New Delhi, joue un rôle majeur en Asie."
        ),
        ref="L’Inde, pays asiatique diversifié, a pour capitale New Delhi.",
    ),
]


def summarize(model, text, max_input_length=512, max_new_tokens=80, num_beams=4):
    """Generate a summary of ``text`` with ``model`` using beam search.

    Uses the module-level ``tokenizer`` to encode/decode and the module-level
    ``device`` as the target for the encoded inputs.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=..., num_beams=...)``.
        text: Source text to summarize.
        max_input_length: Maximum number of input tokens kept by the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens skipped.
    """
    # truncation=True only takes effect when a max_length is supplied; without it
    # the tokenizer warns and falls back to no truncation at all.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
        padding=True,
    ).to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens, num_beams=num_beams)
    return tokenizer.decode(output[0], skip_special_tokens=True)

import pandas as pd


def _unicode_tokenize(text):
    """Lower-case ``text``, turn Unicode punctuation into spaces, split on whitespace."""
    import unicodedata

    cleaned = "".join(
        " " if unicodedata.category(ch).startswith("P") else ch
        for ch in text.lower()
    )
    return cleaned.split()


def _score_summary(prediction, reference, lang):
    """Score one prediction against one reference.

    Args:
        prediction: Generated summary text.
        reference: Reference summary text.
        lang: Language code forwarded to the BERTScore metric.

    Returns:
        A ``(rouge_l, bleu, bert_f1)`` tuple.
    """
    # The metric's default tokenizer is not suitable for Devanagari text
    # (hi/mr scored exactly 0.0 with it), so a script-agnostic one is supplied.
    rouge_l = rouge.compute(
        predictions=[prediction],
        references=[reference],
        tokenizer=_unicode_tokenize,
    )["rougeL"]

    if prediction.strip():
        # Unsmoothed BLEU collapses to 0 as soon as any n-gram order has no
        # match, which is the norm for a single short sentence.
        bleu_score = bleu.compute(
            predictions=[prediction],
            references=[[reference]],
            smooth=True,
        )["bleu"]
    else:
        # Nothing to match against: report 0 instead of scoring empty text.
        bleu_score = 0.0

    # NOTE(review): the `lang` code appears to select the underlying scoring
    # model (the run log shows two different checkpoints being downloaded), so
    # BERT scores should only be compared within a language — confirm.
    bert_f1 = bertscore.compute(
        predictions=[prediction],
        references=[reference],
        lang=lang,
    )["f1"][0]
    return rouge_l, bleu_score, bert_f1


results = []

for sample in eval_data:
    lang = sample["lang"]
    base_out = summarize(base_model, sample["src"])
    ft_out = summarize(ft_model, sample["src"])

    # Same scoring routine for both checkpoints so the columns stay comparable.
    rouge_base, bleu_base, bert_base = _score_summary(base_out, sample["ref"], lang)
    rouge_ft, bleu_ft, bert_ft = _score_summary(ft_out, sample["ref"], lang)

    # Every Δ column is base minus fine-tuned: positive means the fine-tuned
    # model scored lower than the base model.
    results.append({
        "lang": lang,
        "base_ROUGE": rouge_base,
        "ft_ROUGE": rouge_ft,
        "ΔROUGE": rouge_base - rouge_ft,
        "base_BLEU": bleu_base,
        "ft_BLEU": bleu_ft,
        "ΔBLEU": bleu_base - bleu_ft,
        "base_BERT": bert_base,
        "ft_BERT": bert_ft,
        "ΔBERT": bert_base - bert_ft
    })

# One row per language; left as the cell's last expression so the notebook
# renders it as a table instead of line-wrapped text.
df = pd.DataFrame(results)
df


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  lang  base_ROUGE  ft_ROUGE    ΔROUGE  base_BLEU  ft_BLEU  base_BERT  \
0   hi    0.000000  0.000000  0.000000        0.0      0.0   0.692693   
1   mr    0.000000  0.000000  0.000000        0.0      0.0   0.632831   
2   en    0.181818  0.233333 -0.051515        0.0      0.0   0.825898   
3   fr    0.266667  0.187500  0.079167        0.0      0.0   0.603273   

    ft_BERT     ΔBERT  
0  0.527638  0.165055  
1  0.546008  0.086823  
2  0.866458 -0.040560  
3  0.702960 -0.099687  
