# Evaluating models

### Things the could should do
1. Load model
1. Qualitative eval (prompt it)
1. Eval metrics

In [15]:
from pathlib import Path
from copy import deepcopy
import importlib

import torch
from peft import PeftModel
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

from llm_explore import utils
from llm_explore.definitions import ROOT_DIR

In [2]:
# User sets dataset and model names
DATASET_NAME = "knkarthick/dialogsum"
MODEL_NAME = "google/flan-t5-base"
PEFT_MODEL_FILE = Path(ROOT_DIR, "models", "peft-dialogue-summary-training-2025-05-08_21-14-28/checkpoint-2000/")
ADAPTER_NAME = "chk-2000"

In [None]:
device = utils.get_torch_device()

Returned MPS device


In [4]:
# Dataset and Model Initialization
dataset = load_dataset(DATASET_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to(device)
 # Keep a copy of the original model for later use (keep on CPU for now)
model_orig = deepcopy(model).to(torch.device("cpu"))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

peft_model = PeftModel.from_pretrained(model,
    PEFT_MODEL_FILE,
    torch_device=device,
    adapter_name=ADAPTER_NAME,
    is_trainable=False)

In [5]:
# Load another adapter
path = Path(ROOT_DIR, "models", "peft-dialogue-summary-training-2025-05-05_08-03-52/checkpoint-1246/")
peft_model.load_adapter(path, adapter_name="chk-1246")

<All keys matched successfully>

In [6]:

# Tokenization and Dataset Preparation
def tokenize_function(example):
    """Tokenizes the input and output text for the model, 
    including a hardcoded prompt to summarize the conversation."""
    
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    output = tokenizer(prompt, truncation=True, padding='max_length', return_tensors='pt')
    output['labels'] = tokenizer(example['summary'], truncation=True, padding='max_length', return_tensors='pt').input_ids
    return output

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['id', 'topic', 'dialogue', 'summary'])


In [None]:
my_id = 102
prompt = utils.make_n_shot_summary_prompt(summarize_id=my_id, data=dataset)
print(prompt)


Summarize the following conversation.

#Person1#: Well, I'll see you later, Mrs. Todd. My wife is waiting for me to take her shopping.
#Person2#: I understand. There's a lot to get done at weekends, especially when you two work and the children are small.
#Person1#: That's right. Jane and I have been talking about visiting you. So when I saw you in the garden, I decided to come over and say hello.
#Person2#: I'm glad you did. In fact, I should have called on you first, since you have newly moved here.
#Person1#: By the way, do you need anything from the store?
#Person2#: No, but thanks for the offer. And thank you for coming over.
#Person1#: It's a pleasure.

Summary:



In [23]:
torch.mps.manual_seed(42)
sentence_encoded = tokenizer(prompt, return_tensors='pt').to('mps')  # Move the entire batch to MPS
model_orig = model_orig.to(device) 
peft_model.set_adapter("chk-2000")
completion = peft_model.generate(input_ids=sentence_encoded.input_ids,
                            num_beams=1,
                            do_sample=True,
                            max_new_tokens=1000,
                            generation_config=None)[0]  # No need to call .to('mps') again
tokenizer.decode(completion, skip_special_tokens=True)


"Jane wants to visit her son but Jane's wife's waiting to take her shopping. Mr. Todd encourages her. Mrs. Todd offers her a free gift."

In [24]:
peft_model.set_adapter("chk-1246")
completion = peft_model.generate(input_ids=sentence_encoded.input_ids,
                            num_beams=1,
                            do_sample=True,
                            max_new_tokens=1000,
                            generation_config=None)[0]  
tokenizer.decode(completion, skip_special_tokens=True)


'Mrs. Todd will see #Person1# at the weekend while the wife is busy shopping. Mrs. Todd decides to visit her in the garden.'

In [None]:
# PEFT model completion without sampling
completion = utils.get_model_completion(prompt, model=peft_model, tokenizer=tokenizer)
print(completion)

Mrs. Todd's wife is waiting for her shopping at weekends. Mrs. Todd's wife is waiting for her shopping.


In [None]:
# Get base model completion
completion = utils.get_model_completion(prompt, model=model_orig, tokenizer=tokenizer)
print(completion)

Person1#: I'm going to visit you later, Mrs. Todd.


- chk-1246 seems to be better than chk-2000, despite less training
- both seem to be better than base model