# Model Evaluation
This notebook evaluates a language model using the ROUGE metric. It loads a pre-trained model, applies a LoRA adapter, and evaluates the model's performance on a dataset.

## Load Model and Tokenizer
Load the pre-trained model and tokenizer, and apply the LoRA adapter.

In [None]:
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

# Load tokenizer & base model
model_id = "meta-llama/llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(model_id)

# Add LoRA adapter to base model
adapter_path = "../adapters/yugioh_phi_4"
lora_model = PeftModel.from_pretrained(base_model, adapter_path)

## Load Dataset
Load the evaluation dataset and prepare it for input to the model.

In [8]:
from datasets import load_dataset

# Load dataset
dataset_file = "../data/eval_chatml.jsonl"
dataset = load_dataset("json", data_files=dataset_file, split="train")

def extract(row):
    sys, user, assistant = row["messages"][:3]
    return {
        "inputs": [sys, user],
        "reference": assistant["content"]
    }

dataset = dataset.map(extract, remove_columns=["messages"])
inputs, references = dataset["inputs"], dataset["reference"]

Map: 100%|██████████| 4/4 [00:00<00:00, 1169.63 examples/s]


## Generate Model Outputs
Use the model to generate outputs for the reference inputs.

In [None]:
# Generate model outputs for the reference inputs
pipe = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer
)

outputs = pipe(inputs, max_length=200)

def extract_generated_text(row):
    return row[0]['generated_text'][2]['content']

outputs = [extract_generated_text(o) for o in outputs]

Device set to use mps:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForCausalLM', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausa

## Evaluate the Model
Evaluate the model's performance using the ROUGE metric.

In [19]:
from collections import defaultdict
from rouge_score import rouge_scorer

# Evaluate the model
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = []

# Calculate scores for each reference-prediction pair
for ref, pred in zip(references, outputs):
    score = scorer.score(ref, pred)
    scores.append(score)

# Print score result
aggregated = defaultdict(lambda: defaultdict(list))
for score in scores:
    for rouge_type, metrics in score.items():
        aggregated[rouge_type]['precision'].append(metrics.precision)
        aggregated[rouge_type]['recall'].append(metrics.recall)
        aggregated[rouge_type]['fmeasure'].append(metrics.fmeasure)

for rouge_type, metrics in aggregated.items():
    print(f"\n{rouge_type.upper()}:")
    for metric_name, values in metrics.items():
        print(f"  {metric_name.capitalize()}: {np.mean(values):.4f}")


ROUGE1:
  Precision: 0.4797
  Recall: 0.4842
  Fmeasure: 0.4243

ROUGE2:
  Precision: 0.2217
  Recall: 0.1661
  Fmeasure: 0.1573

ROUGEL:
  Precision: 0.4019
  Recall: 0.4065
  Fmeasure: 0.3465
