In [1]:
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import re
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
from tqdm import tqdm
from evaluate import load
import numpy as np
import yaml
from pprint import pprint


# Set CUDA device
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True"

# Load metrics
rouge = load('rouge')
bleu = load('bleu')
meteor = load('meteor')
bertscore = load('bertscore')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/infres/abounhar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/infres/abounhar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/infres/abounhar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Helper functions

## Compute metrics

In [18]:
@torch.no_grad()
def compute_metrics_causal_lm(eval_pred, tokenizer):
    """Compute ROUGE and BLEU scores for evaluation."""
    predictions, references = eval_pred

    # Clip token IDs to the valid range
    vocab_size = tokenizer.vocab_size

    def clip_token_ids(token_ids):
        """Clip token IDs to the valid range [0, vocab_size - 1]."""
        return [min(max(token_id, 0), vocab_size - 1) for token_id in token_ids]

    # Decode predictions and references
    decoded_preds = [
        tokenizer.decode(clip_token_ids(pred), skip_special_tokens=True)
        for pred in predictions
    ]
    decoded_refs = [
        tokenizer.decode(clip_token_ids(ref), skip_special_tokens=True)
        for ref in references
    ]
    
    # Clean summaries
    def clean_summary(text):
        special_tokens = ["<|im_end|>", "<|assistant|>", "<|user|>", "<|system|>"]
        for token in special_tokens:
            text = text.replace(token, "")
        return re.sub(r"\s+", " ", text).strip()
    
    pred_summaries = []
    for pred in decoded_preds:
        if "<|assistant|>" in pred:
            summary = pred.split("<|assistant|>")[-1].strip()
            summary = clean_summary(summary)
            pred_summaries.append(summary)
        else:
            summary = pred.strip()
            summary = clean_summary(summary)
            pred_summaries.append(summary)
            
    # apply the same to the references
    ref_summaries = []
    for ref in decoded_refs:
        if "<|assistant|>" in ref:
            summary = ref.split("<|assistant|>")[-1].strip()
            summary = clean_summary(summary)
            ref_summaries.append(summary)
        else:
            summary = ref.strip()
            summary = clean_summary(summary)
            ref_summaries.append(summary)
            
    # print(f'0 - ref_summaries[0]: {ref_summaries[0]}')
    
    # Convert to token IDs
    pred_token_ids = [tokenizer.encode(p, add_special_tokens=False) for p in pred_summaries]
    ref_token_ids = [tokenizer.encode(r, add_special_tokens=False) for r in ref_summaries]

    # Use the exact same metric function from training
    eval_pred = (pred_token_ids, ref_token_ids)
    
    predictions, references = eval_pred

    # Clip token IDs to the valid range
    vocab_size = tokenizer.vocab_size

    # Decode predictions and references in batches
    decoded_preds = tokenizer.batch_decode([clip_token_ids(pred) for pred in predictions], skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode([clip_token_ids(ref) for ref in references], skip_special_tokens=True)
    
    # Print decoded examples to inspect issues
    print(f'decoded_preds[0]: {decoded_preds[0]}')
    print(f'decoded_refs[0]: {decoded_refs[0]}')

    # Compute ROUGE, BLEU and BERT scores
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_refs, use_stemmer=True)
    bleu_results = bleu.compute(predictions=decoded_preds, references=decoded_refs)
    bertscore_results = bertscore.compute(
        predictions=decoded_preds, 
        references=decoded_refs, 
        lang='ar'
    )

    # save metrics
    metrics = {key: rouge_results[key] * 100 for key in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}
    metrics["bleu"] = bleu_results["bleu"] * 100
    metrics["bertscore_precision"] = sum(bertscore_results['precision']) / len(bertscore_results['precision']) * 100,
    metrics["bertscore_recall"] = sum(bertscore_results['recall']) / len(bertscore_results['recall']) * 100,
    metrics["bertscore_f1"] = sum(bertscore_results['f1']) / len(bertscore_results['f1']) * 100

    return metrics


@torch.no_grad()
def compute_metrics_seq2seq(eval_pred, tokenizer):
    preds, labels = eval_pred
    
    print(preds)
    print(labels)
    # # Clip token IDs to valid range
    # preds = np.clip(preds, 0, tokenizer.vocab_size - 1)
    # labels = np.clip(labels, 0, tokenizer.vocab_size - 1)
    
    # # Ensure labels are not masked
    # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode predictions and labels directly using batch_decode
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    print(f'text_preds[0]: {text_preds[0]}')
    print(f'text_labels[0]: {text_labels[0]}')
    
    # Compute ROUGE, BLEU and BERT scores
    rouge_results = rouge.compute(predictions=text_preds, references=text_labels, use_stemmer=True)
    bleu_results = bleu.compute(predictions=text_preds, references=text_labels)
    bertscore_results = bertscore.compute(
        predictions=text_preds, 
        references=text_labels, 
        lang='ar'
    )

    # save metrics
    metrics = {key: rouge_results[key] * 100 for key in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}
    metrics["bleu"] = bleu_results["bleu"] * 100
    metrics["bertscore_precision"] = sum(bertscore_results['precision']) / len(bertscore_results['precision']) * 100,
    metrics["bertscore_recall"] = sum(bertscore_results['recall']) / len(bertscore_results['recall']) * 100,
    metrics["bertscore_f1"] = sum(bertscore_results['f1']) / len(bertscore_results['f1']) * 100
    
    return metrics

## Batch summarization functions

In [19]:
def summarize_dataset(dataset, model, tokenizer, model_name, is_causal, max_length=1024, max_new_tokens=256, batch_size=16, device="cuda"):
    """Summarize all texts in the dataset using the trained model."""
    model.eval()
    model.to(device)

    # Summarize in batches
    summaries = []
    
    # Get the actual text column name from the dataset
    text_column = 'text'  # adjust if your column name is different
    
    for i in tqdm(range(0, len(dataset), batch_size)):
        # Get a batch of examples
        batch = dataset[i:i + batch_size]
        
        # Extract the text content properly
        batch_texts = batch[text_column]
        
        # causal models were trained in SFT mode with chat template
        if is_causal:
            # Prepare the messages for the model using the tokenizer's chat template
            messages = [
                [{"role": "user", "content": text}] for text in batch_texts
            ]
            
            # Apply the chat template
            input_ids = tokenizer.apply_chat_template(
                messages, 
                truncation=True,
                max_length=max_length,  # adjust based on your model's context window
                add_generation_prompt=True,
                return_tensors="pt",
                padding=True
            )
            # Create attention mask based on non-zero tokens
            attention_mask = (input_ids != tokenizer.pad_token_id).long()
        else:
            
            # Tokenize with explicit padding to max_length
            inputs = tokenizer(batch_texts, max_length=max_length, truncation=True, padding='max_length')
            input_ids = torch.LongTensor(inputs['input_ids'])
            attention_mask = torch.LongTensor(inputs['attention_mask'])

            # Fix 1: Add explicit position IDs clamping
            position_ids = torch.arange(0, input_ids.size(-1), dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

            # Access max_position_embeddings directly from the model's config
            max_position_embeddings = max_length

            position_ids = position_ids.clamp(max=max_position_embeddings - 1)

            # Fix 2: Ensure correct truncation length
            truncation_length = max_position_embeddings
            if input_ids.shape[1] > truncation_length:
                input_ids = input_ids[:, :truncation_length]
                attention_mask = attention_mask[:, :truncation_length]


        
        # Move tensors to device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Summarize the batch
        batch_summaries = summarize_batch(
            input_ids,
            attention_mask,
            model,
            tokenizer,
            max_new_tokens,
            device
        )
        
        # Save summaries
        summaries.extend(batch_summaries)
        
    # Add summaries to the dataset
    dataset = dataset.add_column(f"summary_{model_name}", summaries)
    return dataset

def summarize_batch(input_ids, attention_mask, model, tokenizer, max_new_tokens, device):
    generation_config = model.generation_config
    
    # Generate summaries
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            bos_token_id=generation_config.bos_token_id,
            eos_token_id=generation_config.eos_token_id,
            pad_token_id=generation_config.pad_token_id,
            # num_beams=3,
            # do_sample=True,
            # temperature=0.7,
            # top_k=50,
            # top_p=0.95
        )
    
    # Decode the generated outputs
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    # Extract summaries (text after <|assistant|>)
    summaries = []
    for text in generated_texts:
        if "<|assistant|>" in text:
            summaries.append(text.split("<|assistant|>")[-1].strip())
        else:
            summaries.append(text.strip())
    
    # Clean summaries
    def clean_summary(text):
        special_tokens = ["<|im_end|>", "<|assistant|>", "<|user|>", "<|system|>"]
        for token in special_tokens:
            text = text.replace(token, "")
        return re.sub(r"\s+", " ", text).strip()
    
    cleaned_summaries = [clean_summary(summary) for summary in summaries]
    
    return cleaned_summaries

## Chat template creation and tokenization

In [20]:
def create_conversation(example):
    """
    Transform the dataset into a conversational format.
    The user provides the text, and the assistant provides the summary.
    """
    # Create a conversation with user and assistant roles
    messages = [
        {"role": "user", "content": example["text"]},  # User provides the text
    ]
    # Return the conversation as a dictionary
    return {"messages": messages}

def apply_chat_template(example, tokenizer):
    """ Apply the chat template to the dataset. """
    example["text"] = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return example

def preprocess_function(examples, tokenizer):
    return tokenizer(examples['text'], padding=True, truncation=True, return_tensors="pt")

In [21]:
MODELS_DICT = {
    ##############################                 QWEN models                #############################################
    # full mixed precision finetuning
    "BounharAbdelaziz/Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5": {
        "batch_size": 64,
        "is_causal": True,
        "max_len": 2048,
    },
    "BounharAbdelaziz/Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5":{
        "batch_size": 64,
        "is_causal": True,
        "max_len": 2048,
    },
    # LoRA finetuned models
    "BounharAbdelaziz/Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5": {
        "batch_size": 16,
        "is_causal": True,
        "max_len": 2048,
    },
    "BounharAbdelaziz/Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5":{
        "batch_size": 16,
        "is_causal": True,
        "max_len": 2048,
    },
    ###############################                 Falcon models                #############################################
    # full mixed precision finetuning
    "BounharAbdelaziz/Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5": {
        "batch_size": 32,
        "is_causal": True,
        "max_len": 1024,
    },
    "BounharAbdelaziz/Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5":{
        "batch_size": 32,
        "is_causal": True,
        "max_len": 1024,
    },
    # LoRA finetuned models
    "BounharAbdelaziz/Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5": {
        "batch_size": 32,
        "is_causal": True,
        "max_len": 1024,
    },
    "BounharAbdelaziz/Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5":{
        "batch_size": 32,
        "is_causal": True,
        "max_len": 1024,
    },
    ###############################                 mT5 models                #############################################
    # full mixed precision finetuning
    "BounharAbdelaziz/mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5": {
        "batch_size": 128,
        "is_causal": False,
        "max_len": 1024,
    },
    "BounharAbdelaziz/mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5":{
        "batch_size": 128,
        "is_causal": False,
        "max_len": 1024,
    },
    "BounharAbdelaziz/mt5-small-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5": {
        "batch_size": 128,
        "is_causal": False,
        "max_len": 1024,
    },
    "BounharAbdelaziz/mt5-base-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5": {
        "batch_size": 128,
        "is_causal": False,
        "max_len": 1024,
    },
    # ###############################                 GPT2 models                #############################################
    # # full mixed precision finetuning
    # "BounharAbdelaziz/gpt2-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-SFT-mx-1024-v5": {
    #     "batch_size": 1,
    #     "is_causal": True,
    #     "max_len": 1024,
    # },
    # "BounharAbdelaziz/gpt2-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-8-gnm-1.0-FP16-SFT-mx-1024-v5":{
    #     "batch_size": 1,
    #     "is_causal": True,
    #     "max_len": 1024,
    # },
    # "BounharAbdelaziz/gpt2-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-mx-1024-v5": {
    #     "batch_size": 1,
    #     "is_causal": True,
    #     "max_len": 1024,
    # },
}

In [6]:
# Load dataset
eval_dataset = load_dataset("BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Filtered", split='test')

In [8]:
eval_dataset

Dataset({
    features: ['text', 'summary', 'summary_model_name', 'tokenizer_name', 'dataset_source', 'sequence_length'],
    num_rows: 444
})

In [12]:
# chat template for SFT models
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# precision
torch_dtype = torch.float16

# inference device
device = "cuda" if torch.cuda.is_available() else "cput"

# maximum number of tokens for generate()
MAX_NEW_TOKENS = 256

In [None]:
# initialize and empty dataframe to store the results
metrics_df = pd.DataFrame()

for model_path, config in MODELS_DICT.items():
    
    BATCH_SIZE = config['batch_size']
    IS_CAUSAL_LM = config['is_causal']
    MAX_LEN = config['max_len']

    if IS_CAUSAL_LM:
        # load model
        if "gpt2" in model_path:
            # GPT2 models surprisingly don't work. They raise a cuda error that I wasn't able to debug.
            # With the fact that other models already performed better on the evaluation set, I didn't include them in this work.
            # Also, they generate a similar error when trained with a batch size larger than 1! 
            model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
        else:
            model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype).to("cuda")
        model.use_cache = True
        
        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
            
        # Set chat template
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
        
        if "gpt2" in model_path:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        
    else:
        # load model
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype=torch_dtype).to("cuda")
        model.use_cache = True
        
        # load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # set padding side deppending on model type
    tokenizer.padding_side= 'left' if IS_CAUSAL_LM else 'right'

    # Set reasonable default for models without max length
    tokenizer.model_max_length = MAX_LEN

    # Set pad_token_id equal to the eos_token_id if not set
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        
    # get model name -> used in column name for saving summaries predictions in dataset
    model_name = model_path.split('/')[-1].strip()
    
    # run summarization and saving of dataset
    eval_dataset = summarize_dataset(eval_dataset, model, tokenizer, model_name, is_causal=IS_CAUSAL_LM, max_length=MAX_LEN, max_new_tokens=MAX_NEW_TOKENS, batch_size=BATCH_SIZE, device=device)
    
    # get predictions and references
    predictions = eval_dataset[f"summary_{model_path.split('/')[-1]}"]
    references = eval_dataset["summary"]

    # Convert to token IDs like during training
    pred_token_ids = [tokenizer.encode(p) for p in predictions]
    ref_token_ids = [tokenizer.encode(r) for r in references]

    # Use the exact same metric function from training
    eval_pred = (pred_token_ids, ref_token_ids)
    
    # compute metrics
    if IS_CAUSAL_LM:
        metrics = compute_metrics_causal_lm(eval_pred, tokenizer)
    else:
        metrics = compute_metrics_seq2seq(eval_pred, tokenizer)
        
    print(metrics)
    
    # save metrics
    metrics_df[model_name] = metrics
    
    # Clear GPU cache
    torch.cuda.empty_cache()
    
    print("-"*100)

In [43]:
eval_dataset

Dataset({
    features: ['text', 'summary', 'summary_model_name', 'tokenizer_name', 'dataset_source', 'sequence_length', 'summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5', 'summary_Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5', 'summary_mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5', 'summary_mt5-sma

In [46]:
metrics_df = metrics_df.transpose()

In [47]:
metrics_df

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu,bertscore_precision,bertscore_recall,bertscore_f1
Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5,10.154739,4.39905,10.042021,10.094154,2.364585,"(65.95857325974886,)","(72.82558612458341,)",69.142793
Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5,8.773724,3.949345,8.650295,8.765712,2.038522,"(73.20954905705409,)","(73.9764558570879,)",73.494251
Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5,13.796634,6.548504,13.531252,13.649635,2.952568,"(75.026213001829,)","(76.43683350032514,)",75.618538
Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5,13.618346,6.298045,13.241807,13.384698,2.987951,"(75.16128604745006,)","(76.52390929492744,)",75.737516
Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5,12.72673,5.657916,12.273076,12.344337,2.961368,"(68.77494174483661,)","(76.1501824130883,)",72.178532
Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5,13.231652,5.493227,12.838139,12.840043,2.95402,"(68.61420049592182,)","(76.11227748361794,)",72.074321
Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5,13.088594,5.741706,12.63974,12.649485,3.057522,"(68.65262965764012,)","(76.27777336417017,)",72.178872
Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5,13.387896,6.16999,12.880398,12.948814,3.018385,"(68.92468658116486,)","(76.25782343449893,)",72.30786
mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5,7.060375,1.708262,7.040035,7.073086,4.883842,"(69.3450996467659,)","(69.0343491128973,)",69.115218
mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5,5.048542,0.79461,4.940549,5.043148,4.353375,"(68.84305755028853,)","(67.02528249573064,)",67.814629


In [48]:
metrics_df.to_csv('test_metrics.csv')

In [None]:
Dataset.from_pandas(metrics_df).push_to_hub("BounharAbdelaziz/Arabic-Summarization-Eval-Metrics")

In [49]:
eval_dataset

Dataset({
    features: ['text', 'summary', 'summary_model_name', 'tokenizer_name', 'dataset_source', 'sequence_length', 'summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5', 'summary_Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5', 'summary_mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5', 'summary_mt5-sma

In [50]:
eval_dataset.push_to_hub("BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval", commit_message="Pushed model predictions.")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 22.91ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval/commit/02ecbfe1e3d1b1433ea6f7bbe023473a4bd56b39', commit_message='Pushed model predictions.', commit_description='', oid='02ecbfe1e3d1b1433ea6f7bbe023473a4bd56b39', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval'), pr_revision=None, pr_num=None)

In [51]:
eval_dataset.to_csv('test_summaries.csv')

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.45ba/s]


12320738

# LLM as a Judge Evaluation

In [107]:
def evaluate_quality_batch(model, tokenizer, texts, summaries, system_prompt, batch_size, max_new_tokens=512):
    """
    This function evaluates the quality of summaries for a batch of queries using the model.
    
    Args:
        model: the preloaded model to use
        tokenizer: the preloaded tokenizer to use
        texts: list of input texts (for context)
        summaries: list of model summaries to evaluate
        system_prompt: the system prompt for summarization quality evaluation
        batch_size: number of queries to process in parallel
        max_new_tokens: maximum number of new tokens to generate (not used in this case)
        
    Returns:
        list of dictionaries with quality scores and model names
    """
    
    # Set model to eval mode
    model.eval()
    
    # Results will store the evaluation scores
    results = []
    
    # Use torch.no_grad() to disable gradient computation
    with torch.no_grad():
        # Iterate through queries and summaries in batches
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            batch_summaries = summaries[i:i + batch_size]
            
            # Prepare messages for the entire batch
            batch_messages = [
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Text: {text}\nSummary: {summary}"}
                ] for text, summary in zip(batch_texts, batch_summaries)
            ]
            
            # Tokenize the batch with left padding
            batch_texts = [
                tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                ) for messages in batch_messages
            ]
            
            model_inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, padding_side='left').to(model.device)
            
            # Generate evaluations for the batch
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens
            )
            
            # Decode the generated tokens (which contains the score)
            generated_ids = [
                output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
            ]
            
            # Batch decode the results
            quality_scores = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            
            # Convert the quality scores to numeric values (assuming LLM outputs valid integers like "0", "1", or "2")
            for score in quality_scores:
                try:
                    results.append(int(score.strip()))
                except ValueError:
                    results.append(-1)  # Error handling if the response is invalid
    
    return results

def compute_average_scores(dataset, model_names):
    """
    Compute the average scores for each model in the dataset.
    
    Args:
        dataset: The dataset containing quality scores for each model
        model_names: List of model names whose average score needs to be computed
        
    Returns:
        A dictionary of average scores per model
    """
    avg_scores = {}
    for model_name in model_names:
        quality_scores = dataset[f'quality_score_summary_{model_name}']
        avg_scores[model_name] = sum(quality_scores) / len(quality_scores) if len(quality_scores) > 0 else 0
        
    # Convert to DataFrame
    avg_scores = pd.DataFrame(list(avg_scores.items()), columns=['Model', 'LLM Average Score'])
    return avg_scores

In [108]:
# device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# get configuration
with open('eval_config.yaml') as file:
    config = yaml.safe_load(file)
    
print('-'*50)
print("[INFO] Loaded configuration:")
print('-'*50)

eval_dataset = load_dataset('BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval', split='test')

# set the model to use
JUDGE_MODEL_NAME = config['JUDGE_MODEL_NAME']
print(f'[INFO] Using LLM Judge: {JUDGE_MODEL_NAME}')

# set the system prompt for quality evaluation
SYSTEM_PROMPT = config['SYSTEM_PROMPT']

# Batched processing
batch_size = config['BATCH_SIZE']

# load the model
model = AutoModelForCausalLM.from_pretrained(
    JUDGE_MODEL_NAME,
    torch_dtype=torch.float16,  # Use float16 for faster inference
    device_map="auto",
    attn_implementation="flash_attention_2",  # Enable Flash Attention 2 for faster inference
)
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL_NAME)

# Set padding side to left for decoder-only models
tokenizer.padding_side = 'left'

--------------------------------------------------
[INFO] Loaded configuration:
--------------------------------------------------
[INFO] Using LLM Judge: Qwen/Qwen2.5-7B-Instruct-AWQ


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.73s/it]


In [109]:
batch_size = 16

In [130]:
eval_dataset

Dataset({
    features: ['text', 'summary', 'summary_model_name', 'tokenizer_name', 'dataset_source', 'sequence_length', 'summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5', 'summary_Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5', 'summary_mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5', 'summary_mt5-sma

In [131]:
# Clear GPU cache
torch.cuda.empty_cache()

# dataset that we will update
updated_dataset = DatasetDict()

model_names = []  # To track the model names

# Prepare texts and summaries for batched processing
texts = eval_dataset['text']

# Process each model's summaries (you may iterate through all model summaries here)
for model_col in eval_dataset.column_names:
    if model_col.startswith("summary_") and model_col != "summary_model_name":
        print(f'model_col: {model_col}')
        summaries = eval_dataset[model_col]
        
        model_name = model_col.replace("summary_", "")
        model_names.append(model_name)  # Store the model name for later
        
        # Process in batches with progress bar
        updated_results = []
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Evaluating {model_col}"):
            batch_texts = texts[i:i + batch_size]
            batch_summaries = summaries[i:i + batch_size]
            batch_scores = evaluate_quality_batch(
                model,
                tokenizer,
                batch_texts,
                batch_summaries,
                SYSTEM_PROMPT,
                batch_size=batch_size
            )
            updated_results.extend(batch_scores)
        
        # Save results in the dataset
        eval_dataset = eval_dataset.add_column(f"quality_score_{model_col}", updated_results)


model_col: summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5


Evaluating summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5:   0%|          | 0/28 [00:00<?, ?it/s]

Evaluating summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5: 100%|██████████| 28/28 [03:37<00:00,  7.78s/it]


model_col: summary_Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5


Evaluating summary_Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5: 100%|██████████| 28/28 [03:19<00:00,  7.11s/it]


model_col: summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5


Evaluating summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5: 100%|██████████| 28/28 [03:20<00:00,  7.17s/it]


model_col: summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5


Evaluating summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5: 100%|██████████| 28/28 [03:22<00:00,  7.22s/it]


model_col: summary_Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5


Evaluating summary_Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5: 100%|██████████| 28/28 [02:28<00:00,  5.30s/it]


model_col: summary_Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5


Evaluating summary_Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5: 100%|██████████| 28/28 [02:29<00:00,  5.33s/it]


model_col: summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5


Evaluating summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5: 100%|██████████| 28/28 [02:32<00:00,  5.44s/it]


model_col: summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5


Evaluating summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5: 100%|██████████| 28/28 [02:27<00:00,  5.27s/it]


model_col: summary_mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5


Evaluating summary_mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5: 100%|██████████| 28/28 [02:09<00:00,  4.62s/it]


model_col: summary_mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5


Evaluating summary_mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5: 100%|██████████| 28/28 [02:04<00:00,  4.45s/it]


model_col: summary_mt5-small-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5


Evaluating summary_mt5-small-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5: 100%|██████████| 28/28 [02:12<00:00,  4.72s/it]


model_col: summary_mt5-base-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5


Evaluating summary_mt5-base-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5: 100%|██████████| 28/28 [02:02<00:00,  4.37s/it]


In [132]:
eval_dataset

Dataset({
    features: ['text', 'summary', 'summary_model_name', 'tokenizer_name', 'dataset_source', 'sequence_length', 'summary_Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5', 'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5', 'summary_Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5', 'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5', 'summary_mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5', 'summary_mt5-sma

In [133]:
model_names

['Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5',
 'Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-v5',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-256-a-128-v5',
 'Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5',
 'Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-v5',
 'Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-128-a-64-v5',
 'Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5',
 'mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5',
 'mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5',
 'mt5-small-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP16-mx-1024-v5',
 'mt5-base-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm-1.0-FP1

In [134]:
# Compute and display average scores for each model
avg_scores = compute_average_scores(eval_dataset, model_names)
print("[INFO] Average scores per model:")
print(avg_scores)

[INFO] Average scores per model:
                                                Model  LLM Average Score
0   Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-3...           1.168919
1   Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0...           1.074324
2   Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1...           1.637387
3   Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1...           1.632883
4   Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gac...           1.403153
5   Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...           1.398649
6   Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...           1.443694
7   Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...           1.436937
8   mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm...           0.520270
9   mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gn...           0.229730
10  mt5-small-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gn...           0.189189
11  mt5-base-bs-2-lr-0.005-ep-3-wp-0.1-gacc-16-gnm...           0.036036


In [135]:
avg_scores

Unnamed: 0,Model,LLM Average Score
0,Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-3...,1.168919
1,Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0...,1.074324
2,Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1...,1.637387
3,Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1...,1.632883
4,Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gac...,1.403153
5,Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...,1.398649
6,Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...,1.443694
7,Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...,1.436937
8,mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm...,0.52027
9,mt5-small-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gn...,0.22973


In [136]:
Dataset.from_pandas(avg_scores).push_to_hub("BounharAbdelaziz/Arabic-Summarization-Eval-LLM-as-a-Judge", private=True, commit_message="LLM as a judge evaluation scores.")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1960.87ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Arabic-Summarization-Eval-LLM-as-a-Judge/commit/8a61df3a3271ef62bb900b1e8a0c2d6f1d97fc50', commit_message='LLM as a judge evaluation scores.', commit_description='', oid='8a61df3a3271ef62bb900b1e8a0c2d6f1d97fc50', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Arabic-Summarization-Eval-LLM-as-a-Judge', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Arabic-Summarization-Eval-LLM-as-a-Judge'), pr_revision=None, pr_num=None)

# Model Quantization

As the best performing model is a 3B model, we quantize it and verify the scores of the quantized version.

In [144]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

In [146]:
best_model_path = "BounharAbdelaziz/Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5"
quant_path = 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

In [147]:
# Load model
model = AutoAWQForCausalLM.from_pretrained(
    best_model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(best_model_path, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

Fetching 13 files: 100%|██████████| 13/13 [00:00<00:00, 115033.65it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.41it/s]
Repo card metadata block was not found. Setting CardData to empty.
AWQ: 100%|██████████| 36/36 [09:37<00:00, 16.03s/it]


In [150]:
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

('Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq/tokenizer_config.json',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq/special_tokens_map.json',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq/vocab.json',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq/merges.txt',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq/added_tokens.json',
 'Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5-awq/tokenizer.json')

In [None]:
# Push using `huggingface_hub`
from huggingface_hub import HfApi

hub_path = "BounharAbdelaziz/Qwen2.5-3B-Instruct-Summarizer-AWQ"

api = HfApi()
api.create_repo(repo_id=hub_path, exist_ok=True)
api.upload_folder(repo_id=hub_path, folder_path=quant_path)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
tokenizer.json: 100%|██████████| 11.4M/11.4M [00:01<00:00, 9.68MB/s]


## Evaluate the quantized model

In [5]:
eval_dataset = load_dataset("BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval", split="test")
metrics_df = load_dataset("BounharAbdelaziz/Arabic-Summarization-Eval-Metrics", split="train").to_pandas()
avg_scores = load_dataset("BounharAbdelaziz/Arabic-Summarization-Eval-LLM-as-a-Judge", split="train").to_pandas()

In [87]:
# set new column name
metrics_df.columns = [['rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'bleu', 'bertscore_precision', 'bertscore_recall', 'bertscore_f1', 'Model']]
# change order
metrics_df = metrics_df[['Model', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'bleu', 'bertscore_precision', 'bertscore_recall', 'bertscore_f1']]

In [90]:
# Flatten MultiIndex columns (if they exist)
if isinstance(metrics_df.columns, pd.MultiIndex):
    metrics_df.columns = metrics_df.columns.get_level_values(0)  # Keep only the first level

In [10]:
quantized_model_path = "BounharAbdelaziz/Qwen2.5-3B-Instruct-Summarizer-AWQ"
torch_dtype = torch.float16

In [24]:
BATCH_SIZE = 32
IS_CAUSAL_LM = True
MAX_LEN = 2048

# load model
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype=torch_dtype).to("cuda")
model.use_cache = True

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
    
# Set chat template
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
    
# set padding side deppending on model type
tokenizer.padding_side= 'left'

# Set reasonable default for models without max length
tokenizer.model_max_length = MAX_LEN

# Set pad_token_id equal to the eos_token_id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    
# get model name -> used in column name for saving summaries predictions in dataset
model_name = quantized_model_path.split('/')[-1].strip()

# get predictions and references
predictions = eval_dataset[f"summary_{quantized_model_path.split('/')[-1]}"]
references = eval_dataset["summary"]

# Convert to token IDs like during training
pred_token_ids = [tokenizer.encode(p) for p in predictions]
ref_token_ids = [tokenizer.encode(r) for r in references]

# Use the exact same metric function from training
eval_pred = (pred_token_ids, ref_token_ids)

# compute metrics
metrics = compute_metrics_causal_lm(eval_pred, tokenizer)
print(metrics)

Token indices sequence length is longer than the specified maximum sequence length for this model (2228 > 2048). Running this sequence through the model will result in indexing errors


decoded_preds[0]: في صقلية قبل 1061 ميلاديًا، كانت الجزيرة مجزأة إلى خمس إمارات، وكان العرب والأشداء يتنافسون. بعد وفاة روجر الأول في 1101م، أصبحت باليرمو تحت حكم روجر الثاني. استمرت اللغة العربية في بلاط ملك النورمان حتى القرن 12. جورج الأنطاكي هو مثال على الشخصيات التي كانت تتحدث العربية في بلاطهم. أدرك فريدريك الثاني أهمية القصيدة العربية في بلاطه. بينما تظل صقلية جزيرة مسلمة في عهد الكلبيين، إلا أنها لم تفقد ثقافة العرب.
decoded_refs[0]: في 1061، كانت صقلية مجزأة إلى خمس إمارات مع تنافس عربي وأمازيغي. استطاع الملك النورماني روeger الأول السيطرة عليها، وأصبحت باليرمو عاصمتها عام 1072. رغم فقدان العرب للسلطة السياسية، ظلوا الثقافة الرئيسية، وازدهرت فيها الأدب والعلم حتى القرن الثاني عشر. بعد وفاة روeger الأول، استمر هذا الوضع مع ابنه روجر الثاني.
{'rouge1': 11.781275166703985, 'rouge2': 5.380885414174803, 'rougeL': 11.666032341378854, 'rougeLsum': 11.745504217336922, 'bleu': 2.677481236207293, 'bertscore_precision': (74.0169356736514,), 'bertscore_recall': (75.99423225667026,), 'bert

In [117]:
eval_dataset = eval_dataset.add_column(f"summary_{quantized_model_path.split('/')[-1]}", predictions)

In [119]:
# Clear GPU cache
torch.cuda.empty_cache()

# dataset that we will update
updated_dataset = DatasetDict()

model_names = []  # To track the model names

# Prepare texts and summaries for batched processing
texts = eval_dataset['text']

# Process each model's summaries (you may iterate through all model summaries here)
for model_col in eval_dataset.column_names:
    if model_col == f"summary_{quantized_model_path.split('/')[-1]}":
        print(f'model_col: {model_col}')
        summaries = eval_dataset[model_col]
        
        model_name = model_col.replace("summary_", "")
        model_names.append(model_name)  # Store the model name for later
        
        # Process in batches with progress bar
        updated_results = []
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Evaluating {model_col}"):
            batch_texts = texts[i:i + batch_size]
            batch_summaries = summaries[i:i + batch_size]
            batch_scores = evaluate_quality_batch(
                model,
                tokenizer,
                batch_texts,
                batch_summaries,
                SYSTEM_PROMPT,
                batch_size=batch_size
            )
            updated_results.extend(batch_scores)
        
        # Save results in the dataset
        eval_dataset = eval_dataset.add_column(f"quality_score_{model_col}", updated_results)


model_col: summary_Qwen2.5-3B-Instruct-Summarizer-AWQ


Evaluating summary_Qwen2.5-3B-Instruct-Summarizer-AWQ: 100%|██████████| 28/28 [03:14<00:00,  6.94s/it]


In [125]:
# do it only for the AWQ, we already have the others
model_names = ['Qwen2.5-3B-Instruct-Summarizer-AWQ']

In [129]:
# Compute and display average scores for each model
new_avg_scores = compute_average_scores(eval_dataset, model_names)
print(new_avg_scores)

                                Model  LLM Average Score
0  Qwen2.5-3B-Instruct-Summarizer-AWQ           1.536036


#### The AWQ model scores pretty well in the LLM as a Judge evaluation

In [135]:
new_avg_scores_df = pd.DataFrame(new_avg_scores)
all_avg_scores = pd.concat([new_avg_scores_df, avg_scores])
all_avg_scores

Unnamed: 0,Model,LLM Average Score
0,Qwen2.5-3B-Instruct-Summarizer-AWQ,1.536036
0,Qwen2.5-0.5B-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-3...,1.168919
1,Qwen2.5-0.5B-Instruct-bs-2-lr-0.0001-ep-3-wp-0...,1.074324
2,Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1...,1.637387
3,Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1...,1.632883
4,Falcon3-1B-Base-bs-1-lr-0.0001-ep-3-wp-0.1-gac...,1.403153
5,Falcon3-1B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...,1.398649
6,Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...,1.443694
7,Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1...,1.436937
8,mt5-base-bs-2-lr-0.001-ep-3-wp-0.1-gacc-16-gnm...,0.52027


In [136]:
all_avg_scores_ds = Dataset.from_pandas(all_avg_scores).push_to_hub("BounharAbdelaziz/Arabic-Summarization-Eval-LLM-as-a-Judge", commit_message="Added AWQ scores")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1711.96ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


In [138]:
eval_dataset = eval_dataset.remove_columns(['quality_score_summary_Qwen2.5-3B-Instruct-Summarizer-AWQ'])

In [141]:
# push new summaries
eval_dataset.push_to_hub('BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval', commit_message="Added AWQ summaries")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 22.85ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval/commit/9149d8806f459bf32008ea27d59d1ec488147d8f', commit_message='Added AWQ summaries', commit_description='', oid='9149d8806f459bf32008ea27d59d1ec488147d8f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Arabic-Synthetic-Summarization-Dataset-Eval'), pr_revision=None, pr_num=None)

# Human Eval

In [None]:
# this cell was run in another notebook, i just kept it here to see which columns (i.e. model) were used 

# select data for human eval
selected_for_human_eval = [
    'text', # we keep text to show it
    'summary_Qwen2.5-3B-Instruct-bs-2-lr-0.0001-ep-3-wp-0.1-gacc-32-gnm-1.0-FP16-SFT-mx-2048-r-128-a-64-v5', 
    'summary_Qwen2.5-3B-Instruct-Summarizer-AWQ', 
    'summary_Falcon3-3B-Instruct-bs-1-lr-0.0001-ep-3-wp-0.1-gacc-1-gnm-1.0-FP16-SFT-mx-1024-r-256-a-128-v5'
]

human_eval_dataset = eval_dataset.select_columns(selected_for_human_eval)

human_eval_dataset.push_to_hub('BounharAbdelaziz/Arabic-Summarization-Human-Eval-Summaries')