In [None]:
import os, torch, time, numpy as np, pandas as pd
import warnings
warnings.filterwarnings("ignore")

import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

try:
    from peft import PeftModel
except:
    PeftModel = None

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(" Running on device:", DEVICE)

BASE_MODEL = "./mT5-multilingual-XLSum"
HF_VANILLA = "suryakantmani/mt5-vanilla-finetune-summarization"


tokenizer_vanilla = AutoTokenizer.from_pretrained(HF_VANILLA)
vanilla_model_obj = AutoModelForSeq2SeqLM.from_pretrained(HF_VANILLA).to(DEVICE).eval()



model_paths = {
    "mt5_base": "./mt5-multilingual-XLSum",
    "vanilla": HF_VANILLA,                
    "lora": "./model-lora-finetuned_2",
    "langanchor": "./model-langanchor-finetuned",
}

rouge_m = evaluate.load("rouge")
bleu_m = evaluate.load("sacrebleu")
bert_m = evaluate.load("bertscore")

test_data = [
    {"lang": "en", "text": "The Indian economy is growing steadily this year.", "summary": "India's economy is expanding."},
    {"lang": "fr", "text": "Le marché mondial du pétrole a chuté récemment.", "summary": "Le prix du pétrole a baissé."},
    {"lang": "hi", "text": "प्रधानमंत्री ने नई शिक्षा नीति की घोषणा की।", "summary": "नई शिक्षा नीति घोषित की गई।"},
    {"lang": "es", "text": "El clima está cambiando rápidamente en todo el mundo.", "summary": "El cambio climático se acelera."},
]

# Model loader (supports direct HF, local, PEFT LoRA)
def try_load_model(path):
    try:
        m = AutoModelForSeq2SeqLM.from_pretrained(path)
        print("  - Loaded HF/Local model:", path)
        return m
    except:
        if PeftModel is not None:
            try:
                base_m = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
                m = PeftModel.from_pretrained(base_m, path)
                print("  - Loaded PEFT adapter:", path)
                return m
            except:
                print("  - Failed adapter load:", path)
                return None
        return None

def do_summary(m, tok, txt, max_len=80):
    enc = tok(txt, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
    with torch.no_grad():
        out_ids = m.generate(**enc, max_length=max_len, num_beams=4)
    return tok.decode(out_ids[0], skip_special_tokens=True)

def get_ppl(m, tok, texts):
    vals = []
    for t in texts:
        x = tok(t, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
        with torch.no_grad():
            l = m(**x, labels=x["input_ids"]).loss
        vals.append(torch.exp(l).item())
    return np.mean(vals)


def evaluate_one(model_name, model_dir):
    print(f"\n>>> Evaluating: {model_name}")

    if model_name == "vanilla":
        m = vanilla_model_obj
        tok = tokenizer_vanilla
    else:

        if os.path.exists(os.path.join(model_dir, "config.json")):
            tok = AutoTokenizer.from_pretrained(model_dir)
        else:
            tok = AutoTokenizer.from_pretrained(BASE_MODEL)


        m = try_load_model(model_dir)
        if m is None:
            print("  !! Skipping:", model_name)
            return None
        m = m.to(DEVICE).eval()

    preds, refs = [], []
    t0 = time.time()

    for s in test_data:
        p = do_summary(m, tok, s["text"])
        preds.append(p)
        refs.append(s["summary"])
        print(f"[{s['lang']}] → {p}")

    # Metrics
    r = rouge_m.compute(predictions=preds, references=refs)
    b = bleu_m.compute(predictions=preds, references=[[x] for x in refs])
    bert_s = bert_m.compute(predictions=preds, references=refs, lang="en")
    ppl_v = get_ppl(m, tok, [x["text"] for x in test_data])
    elapsed = round(time.time() - t0, 2)

    return {
        "Model": model_name,
        "ROUGE-1": r["rouge1"],
        "ROUGE-2": r["rouge2"],
        "ROUGE-L": r["rougeL"],
        "BLEU": b["score"],
        "BERTScore": float(np.mean(bert_s["f1"])),
        "Perplexity": ppl_v,
        "Time(s)": elapsed
    }

# Final evaluation
all_rows = []
for nm, pth in model_paths.items():
    res = evaluate_one(nm, pth)
    if res:
        all_rows.append(res)

df = pd.DataFrame(all_rows)
print("\n=== Final Results ===")
print(df.round(4))
df.to_csv("multilingual_eval_results.csv", index=False)

>> Running on device: cpu


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.



>>> Evaluating: mt5_base


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  - Loaded HF/Local model: ./mt5-multilingual-XLSum
[en] → India's economy is at its highest rate in more than a decade.
[fr] → Le prix du pétrole a chuté à un niveau record.
[hi] → प्रधानमंत्री नरेंद्र मोदी ने नई शिक्षा नीति की घोषणा की है.
[es] → El cambio climático está cambiando en todo el mundo.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



>>> Evaluating: vanilla
[en] →  Indian economy is growing steadily this year . Indian economy is growing steadily this year . Indian economy is growing steadily this year .
[fr] →  Le marché international du pétrole a chuté récemment . Le marché international du pétrole a chuté récemment .
[hi] →  PM 'tई शिक्षा नीति' घोषणा in UAE . PM 'tई शिक्षा नीति' घोषणा in UAE .
[es] →  El clima is changing rápidamente en all el continent en all el continent . El clima is changing rápidamente en all el continent .

>>> Evaluating: lora


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  - Loaded PEFT adapter: ./model-lora-finetuned_2
[en] → India's economy is at its highest rate in more than a decade.
[fr] → Le prix du pétrole a chuté à un niveau record.
[hi] → प्रधानमंत्री नरेंद्र मोदी ने नई शिक्षा नीति की घोषणा की है.
[es] → El clima está cambiando rápidamente en todo el mundo.

>>> Evaluating: langanchor


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  - Loaded HF/Local model: ./model-langanchor-finetuned
[en] → The Indian economy is growing sharply in the past few years.
[fr] → Le marché mondial du pétrole a chuté à un niveau record.
[hi] → प्रधानमंत्री नरेंद्र मोदी ने नई शिक्षा नीति की घोषणा की है.
[es] → El clima está cambiando rápidamente en todo el mundo.

=== Final Results ===
        Model  ROUGE-1  ROUGE-2  ROUGE-L     BLEU  BERTScore  Perplexity  \
0    mt5_base   0.4126   0.3676   0.4126  17.1513     0.9317      3.3336   
1     vanilla   0.1477   0.0808   0.1477   2.6565     0.8726     11.7661   
2        lora   0.3188   0.2604   0.3188  15.0482     0.9177      2.5572   
3  langanchor   0.2326   0.1295   0.2326   7.2771     0.9079      2.2652   

   Time(s)  
0     9.06  
1     6.98  
2     7.96  
3     8.98  
