# Evaluating the Performance of Qwen 2.5 3B Instruct Model

## Evaluation Metrics

The evaluation metrics that will be used to evaluate the text summarization performance of the Large Language Models (LLMs) are:
1. METEOR (Metric for Evaluation of Translation with Explicit Ordering)
2. ROUGE-N (Recall-Oriented Understudy for Gisting Evaluation)
3. BERTScore
4. BLEU (BiLingual Evaluation Understudy)
5. G-Eval
6. FactCC
7. Model's Inference Time

In [None]:
import os
import torch
import pandas as pd

from datasets import load_from_disk
from transformers.utils import is_flash_attn_2_available
from unsloth import FastLanguageModel

import evaluate

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "XXXXXXXXXXXXXXXXXXXXX"

device = "cuda"
torch.cuda.empty_cache()

dataset = load_from_disk("../datasets/xsum_dataset.hf")
dataset

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 3750
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 3750
    })
})

In [None]:
def load_peft_model():
    
    if (is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8)):
        attn_implementation = "flash_attention_2"
    else:
        attn_implementation = "sdpa"
    
    print(f"[INFO] Using attention implementation: {attn_implementation}")

    model_id = "unsloth/qwen2.5-3B-Instruct"
    print(f"[INFO] Using model_id: {model_id}")

    peft_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id,
        max_seq_length = 8192,
        dtype = None,
        load_in_4bit = True,
        token = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    )

    peft_model.to(device)

    return peft_model, tokenizer

peft_model, tokenizer = load_peft_model()

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: unsloth/Qwen2.5-3B-Instruct
==((====))==  Unsloth 2024.12.4: Fast Qwen2 patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


In [3]:
dataset = dataset['test']

articles = dataset['document'][0:50]
human_summaries = dataset['summary'][0:50]
generated_summaries = []

for idx, article in enumerate(articles):
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following text.

### Input:
{article}

### Response:
"""
    input_ids = tokenizer(prompt, return_tensors='pt')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = input_ids.to(device)
    human_baseline_text_output = human_summaries[idx]
    FastLanguageModel.for_inference(peft_model)
    peft_model_output = peft_model.generate(**input_ids, max_new_tokens = 8192, temperature = 0.1)
    prompt_length = input_ids['input_ids'].shape[1]
    peft_model_text_output = tokenizer.decode(peft_model_output[0][prompt_length:], skip_special_tokens = True)
    generated_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(articles, human_summaries, generated_summaries))

In [4]:
df = pd.DataFrame(zipped_summaries, columns = ['Article', 'Human Summary', 'Generated Summary'])
df

Unnamed: 0,Article,Human Summary,Generated Summary
0,"The Newport man faces other charges, including...",A 22-year-old man has been charged with causin...,The text describes an incident where a silver ...
1,"Staff at RSPCA Gonsal Farm animal centre, in D...",A Shropshire charity has designated October 'B...,The text discusses the situation at RSPCA Gons...
2,"According to CNN, the former FBI director and ...",Now that Hurricane Junior has blown through Wa...,The text discusses the potential involvement o...
3,Former leader Nick Paget-Brown resigned on 30 ...,The new leader of Kensington and Chelsea Counc...,The text discusses the resignation of Nick Pag...
4,That makes it more serious than a technical co...,"The index of the UK's biggest 100 companies, t...",The text discusses the current state of the UK...
5,Gorse fires have been big news this week and t...,"A ""river of filth"", a spate of gorse fires, an...",The provided text covers a wide range of local...
6,"David Davies, Ian Lucas, Albert Owen and Gerai...",Four Welsh MPs are standing for election as ch...,The text discusses the upcoming changes in lea...
7,Three auctioneers at Hotel Drouot also receive...,A French court has jailed 35 porters at the co...,The text discusses a scandal involving porters...
8,The Financial Conduct Authority (FCA) said tha...,"Investors must be quoted an ""all-in fee"" to ma...",The text discusses the findings of the Financi...
9,Yonhap news agency quoted a South Korean offic...,North Korean leader Kim Jong-il is paying his ...,The text discusses North Korean leader Kim Jon...


In [5]:
df.to_pickle("../generated_results/qwen_2_5_3B_results.pkl")

In [6]:
df = pd.read_pickle("../generated_results/qwen_2_5_3B_results.pkl")
df

Unnamed: 0,Article,Human Summary,Generated Summary
0,"The Newport man faces other charges, including...",A 22-year-old man has been charged with causin...,The text describes an incident where a silver ...
1,"Staff at RSPCA Gonsal Farm animal centre, in D...",A Shropshire charity has designated October 'B...,The text discusses the situation at RSPCA Gons...
2,"According to CNN, the former FBI director and ...",Now that Hurricane Junior has blown through Wa...,The text discusses the potential involvement o...
3,Former leader Nick Paget-Brown resigned on 30 ...,The new leader of Kensington and Chelsea Counc...,The text discusses the resignation of Nick Pag...
4,That makes it more serious than a technical co...,"The index of the UK's biggest 100 companies, t...",The text discusses the current state of the UK...
5,Gorse fires have been big news this week and t...,"A ""river of filth"", a spate of gorse fires, an...",The provided text covers a wide range of local...
6,"David Davies, Ian Lucas, Albert Owen and Gerai...",Four Welsh MPs are standing for election as ch...,The text discusses the upcoming changes in lea...
7,Three auctioneers at Hotel Drouot also receive...,A French court has jailed 35 porters at the co...,The text discusses a scandal involving porters...
8,The Financial Conduct Authority (FCA) said tha...,"Investors must be quoted an ""all-in fee"" to ma...",The text discusses the findings of the Financi...
9,Yonhap news agency quoted a South Korean offic...,North Korean leader Kim Jong-il is paying his ...,The text discusses North Korean leader Kim Jon...


### METEOR (Metric for Evaluation of Translation with Explicit Ordering)

In [7]:
meteor = evaluate.load("meteor")

peft_model_meteor_results = meteor.compute(
    predictions = generated_summaries,
    references = human_summaries[0:len(generated_summaries)]
)

print(peft_model_meteor_results)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.1690536194569562}


### ROUGE-N (Recall-Oriented Understudy for Gisting Evaluation)

In [8]:
rouge = evaluate.load("rouge")

peft_model_rouge_results = rouge.compute(
    predictions = generated_summaries,
    references = human_summaries[0:len(generated_summaries)],
    use_aggregator = True,
    use_stemmer = True
)

print(peft_model_rouge_results)

{'rouge1': 0.11408670620746059, 'rouge2': 0.027356397486617057, 'rougeL': 0.08139195422125922, 'rougeLsum': 0.0820312132955069}


### BERTScore

In [9]:
from statistics import mean

bert_score = evaluate.load("bertscore")

peft_model_bert_score_results = bert_score.compute(
    predictions = df['Generated Summary'],
    references = df['Human Summary'][0:len(df['Generated Summary'])],
    lang = "en"
)

print(mean(peft_model_bert_score_results['precision']))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.8122566175460816


### BLEU (BiLingual Evaluation Understudy)

In [10]:
bleu_score = evaluate.load("bleu")

peft_model_bleu_score_results = bleu_score.compute(
    predictions = generated_summaries,
    references = human_summaries[0:len(generated_summaries)]
)

print(peft_model_bleu_score_results)

{'bleu': 0.0010697191667544998, 'precisions': [0.009149300599486154, 0.0019282959577203256, 0.0004716981132075472, 0.00015734515806036333], 'brevity_penalty': 1.0, 'length_ratio': 56.866883116883116, 'translation_length': 70060, 'reference_length': 1232}


### Average Inference Time

In [11]:
import time

inference_times = []

for idx, article in enumerate(articles):
    input_ids = tokenizer(prompt, return_tensors='pt')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids.to(device)

    FastLanguageModel.for_inference(peft_model)
    inference_start_time = time.time()
    model_output = peft_model.generate(**input_ids, max_new_tokens = 8192, temperature = 0.1)
    prompt_length = input_ids['input_ids'].shape[1]
    model_text_output = tokenizer.decode(model_output[0][prompt_length:], skip_special_tokens = True)
    inference_end_time = time.time()
    inference_time = inference_end_time - inference_start_time
    inference_times.append(inference_time)

mean_inference_time = mean(inference_times)
print(f"Average Inference Time: {mean_inference_time}")

Average Inference Time: 158.33024220466615
