# Evaluating models

In [None]:
from pathlib import Path
from copy import deepcopy
import importlib
import pandas as pd
pd.options.display.max_colwidth = None

import torch
from peft import PeftModel
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate

from llm_explore import utils
from llm_explore.definitions import ROOT_DIR

In [2]:
# User-tunable configuration: dataset, base model, and the first PEFT adapter to load.
DATASET_NAME = "knkarthick/dialogsum"  # passed to `load_dataset`
MODEL_NAME = "google/flan-t5-base"  # used for both the base model and the tokenizer
# Directory of the adapter checkpoint, built relative to the project's ROOT_DIR.
PEFT_MODEL_FILE = Path(ROOT_DIR, "models", "peft-dialogue-summary-training-2025-05-08_21-14-28/checkpoint-2000/")
ADAPTER_NAME = "chk-2000"  # name under which that checkpoint is registered on the PEFT model

In [None]:
# Select the torch device through the project helper; it is used below when placing models and inputs.
device = utils.get_torch_device()

Returned MPS device


In [4]:
# Dataset and Model Initialization
dataset = load_dataset(DATASET_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load the base model without moving it yet, so the reference copy can be taken first.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
# Keep a copy of the original model for later use (stays on the CPU for now).
# The copy is made before `model` is wrapped by PEFT below, and before the device
# transfer so that two full copies never sit in accelerator memory at once.
model_orig = deepcopy(model)
model = model.to(device)

# Attach the first adapter checkpoint under ADAPTER_NAME, for inference only.
peft_model = PeftModel.from_pretrained(
    model,
    PEFT_MODEL_FILE,
    torch_device=device,
    adapter_name=ADAPTER_NAME,
    is_trainable=False,
)

In [5]:
# Load another adapter checkpoint onto the same PEFT model under its own name,
# so the checkpoints can be switched with `set_adapter`.
path = Path(ROOT_DIR, "models", "peft-dialogue-summary-training-2025-05-05_08-03-52/checkpoint-1246/")
peft_model.load_adapter(path, adapter_name="chk-1246")

<All keys matched successfully>

In [61]:

# Tokenization and Dataset Preparation
def tokenize_function(example):
    """Tokenize a batch of dialogues and their summaries.

    Each dialogue is wrapped in a fixed "summarize this conversation" prompt
    before tokenization. The token ids of the matching summaries are stored
    under the ``labels`` key of the returned encoding.

    Args:
        example: Batch mapping with a ``'dialogue'`` and a ``'summary'`` entry,
            each holding one string per example.

    Returns:
        The tokenizer output for the prompts, with ``labels`` added.
    """
    prefix = "Summarize the following conversation.\n\n"
    suffix = "\n\nSummary: "

    prompts = []
    for dialogue in example['dialogue']:
        prompts.append(prefix + dialogue + suffix)

    # Prompts and summaries are tokenized with identical settings.
    encoded = tokenizer(prompts, truncation=True, padding='max_length', return_tensors='pt')
    summary_encoding = tokenizer(example['summary'], truncation=True, padding='max_length', return_tensors='pt')
    encoded['labels'] = summary_encoding.input_ids
    return encoded

# Tokenize the dataset in batches, dropping the raw columns, then expose the result as torch tensors.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['id', 'topic', 'dialogue', 'summary'],
)
tokenized_dataset.set_format(type='torch')

## Qualitative Eval

In [62]:
# Build the summarization prompt for a single example and show it.
my_id = 105  # index of the example to inspect; also used later to look up its reference summary
prompt = utils.make_n_shot_summary_prompt(summarize_id=my_id, data=dataset)
print(prompt)


Summarize the following conversation.

#Person1#: What's the matter, Bill? You look kind of pale.
#Person2#: Oh, I'm just tired.
#Person1#: Why?
#Person2#: Well, I've been working until around ten every night this week.
#Person1#: You should go home at quitting time today and take it easy.
#Person2#: Yes. I think I will.
#Person1#: That's good. Say, how's your brother?
#Person2#: He's fine, but he is awfully busy. He went to the States on a business trip two weeks ago.
#Person1#: Oh, really? Is he back yet?
#Person2#: No, he won't come back for several more weeks.
#Person1#: Wow! He must have a lot to do there.
#Person2#: Yes, he does.
#Person1#: I want to be sure of the time because I'm going to meet a friend at five o'clock sharp.
#Person2#: Well, my watch says 4:30, and that time should be right. I set it with the radio yesterday.
#Person1#: Good.

Summary:



In [None]:
# Seed the default generators on all devices so the sampled completion is repeatable
# on whatever backend `device` refers to, not only on MPS.
torch.manual_seed(42)
sentence_encoded = tokenizer(prompt, return_tensors='pt').to(device)  # move the whole encoding to the model's device
model_orig = model_orig.to(device)  # base model is compared on the same device in later cells
peft_model.set_adapter(ADAPTER_NAME)
completion = peft_model.generate(input_ids=sentence_encoded.input_ids,
                            attention_mask=sentence_encoded.attention_mask,
                            num_beams=1,
                            do_sample=True,
                            max_new_tokens=1000,
                            generation_config=None)[0]  # first (only) sequence in the batch
tokenizer.decode(completion, skip_special_tokens=True)


In [32]:
# Re-seed before sampling so this completion is repeatable when the cell is re-run.
torch.manual_seed(42)
peft_model.set_adapter("chk-1246")
completion = peft_model.generate(input_ids=sentence_encoded.input_ids,
                            attention_mask=sentence_encoded.attention_mask,
                            num_beams=1,
                            do_sample=True,
                            max_new_tokens=1000,
                            generation_config=None)[0]  # first (only) sequence in the batch
tokenizer.decode(completion, skip_special_tokens=True)


"Bill asks Bill if he seems pale and #Person2#'s brother's busy. Bill recommends Bill to go home at quitting time today, and makes sure that his brother comes back from a business trip 2 weeks later."

In [33]:
# PEFT model completion without sampling
# NOTE(review): this runs with whichever adapter was most recently activated via
# `set_adapter` — confirm it is the one intended for this comparison.
completion = utils.get_model_completion(prompt, model=peft_model, tokenizer=tokenizer)
print(completion)

Bill is tired and wants to go home at quitting time today. Bill's brother is busy and will not come back for several more weeks.


In [37]:
# Get base model completion for the same prompt; the copy kept without adapters is moved to `device` first.
completion = utils.get_model_completion(prompt, model=model_orig.to(device), tokenizer=tokenizer)
print(completion)

#Person1#: Oh, I'm tired. #Person2#: Oh, I'm tired. #Person1#: I'm tired. #Person2#: I'm tired. #Person1#: I'm tired. #Person2#: I'm tired. #Person1#: I'm tired. #Person2#: I'm tired. #Person1#: I'm tired. #Person2#: I'm tired. #Person1#: I'm tired. #Person2#: I'm tired. #Person1#: I'm tired.


In [35]:
dataset['test'][my_id]['summary']  # Reference summary stored in the test split for example `my_id`

"Bill is tired. Bill and #Person1# talk about Bill's brother."

- On this single example, chk-1246 seems to be better than chk-2000, despite less training
- Both seem to be better than the base model

## Use ROUGE score to compare on test set

In [108]:
def get_batch_model_completion(batch, model, tokenizer, device=None, max_new_tokens=1000):
    """Generate and decode one completion per tokenized input, without sampling.

    Inputs are processed one at a time: each sequence is given a leading batch
    dimension, moved to ``device`` and passed to ``model.generate`` with
    ``num_beams=1`` and ``do_sample=False``.

    Args:
        batch: Iterable of 1-D token-id tensors, one per example.
        model: Object exposing a ``generate`` method that accepts ``input_ids``
            and generation keyword arguments.
        tokenizer: Tokenizer used to decode the generated ids. Its
            ``pad_token_id`` (when set) is used to mask padded positions.
        device: Device the inputs are moved to. When ``None`` (default), the
            device of the model's first parameter is used, so the function
            does not depend on a notebook-global variable.
        max_new_tokens: Upper bound on the number of generated tokens per input.

    Returns:
        list[str]: Decoded completions (special tokens skipped), in input order.
        An empty ``batch`` yields an empty list.
    """
    if device is None:
        device = next(model.parameters()).device

    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    # Only mask on the pad id when it is distinct from EOS; otherwise genuine
    # EOS tokens inside the input would be masked out as well.
    mask_padding = pad_token_id is not None and pad_token_id != eos_token_id

    decoded_completion_list = []
    with torch.no_grad():
        for token_ids in batch:
            input_ids = token_ids.unsqueeze(0).to(device)
            generate_kwargs = {}
            if mask_padding:
                # Make the padding mask explicit so padded positions are ignored.
                generate_kwargs["attention_mask"] = (input_ids != pad_token_id).long()
            completion = model.generate(input_ids=input_ids,
                                        num_beams=1,
                                        do_sample=False,
                                        max_new_tokens=max_new_tokens,
                                        generation_config=None,
                                        **generate_kwargs)
            decoded_completion_list.append(
                tokenizer.decode(completion[0], skip_special_tokens=True)
            )

    return decoded_completion_list

In [109]:
# Generate completions for the first `n_samples` test inputs with the chk-2000 adapter active.
peft_model.set_adapter("chk-2000")
n_samples = 15  # size of the test-set slice; reused by the following cells
chk_2000_completions = get_batch_model_completion(tokenized_dataset['test']['input_ids'][:n_samples],
    model=peft_model,
    tokenizer=tokenizer,
    device=device)
# Timing note: 124 samples took about 11 minutes,
# so only 15 samples are evaluated here.

In [110]:
# Same test-set slice, generated with the chk-1246 adapter active.
peft_model.set_adapter("chk-1246")
chk_1246_completions = get_batch_model_completion(tokenized_dataset['test']['input_ids'][:n_samples],
    model=peft_model,
    tokenizer=tokenizer,
    device=device)

In [None]:
# Same test-set slice, generated with the base model copy that has no adapters.
# NOTE(review): the inputs are moved to `device`, so `model_orig` must already be on
# that device — confirm an earlier cell has moved it before this one runs.
base_model_completions = get_batch_model_completion(tokenized_dataset['test']['input_ids'][:n_samples],
    model=model_orig,
    tokenizer=tokenizer,
    device=device)

In [140]:
colnames = ['human_summary', 'chk_2000_summary', 'chk_1246_summary', 'base_model_summary']
# One row per evaluated test example: the reference summary next to each model's completion.
summary_df = pd.DataFrame({
    'human_summary': dataset['test']['summary'][:n_samples],
    'chk_2000_summary': chk_2000_completions,
    'chk_1246_summary': chk_1246_completions,
    'base_model_summary': base_model_completions,
})
summary_df.index.name = 'id'

In [142]:
summary_df.head(2)  # preview the first two rows of the comparison table

Unnamed: 0_level_0,human_summary,chk_2000_summary,chk_1246_summary,base_model_summary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.,Ms. Dawson asks Ms. Dawson to take a dictation for her. Dawson tells #Person1# that all office communications are restricted to email correspondence and official memos. Dawson tells #Person2# that employees use Instant Messaging to communicate with clients. Dawson tells #Person1# that the memo should be distributed to all employees before 4 pm.,#Person1# asks Ms. Dawson to take a dictation for #Person1#.,#Person1#: I need to take a dictation for you.
1,"In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.",Ms. Dawson asks Ms. Dawson to take a dictation for her. Dawson tells #Person1# that all office communications are restricted to email correspondence and official memos. Dawson tells #Person2# that employees use Instant Messaging to communicate with clients. Dawson tells #Person1# that the memo should be distributed to all employees before 4 pm.,#Person1# asks Ms. Dawson to take a dictation for #Person1#.,#Person1#: I need to take a dictation for you.


In [None]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [135]:
# Score each model's summaries against the human references (aggregated scores, with stemming).
base_model_rouge = rouge.compute(
    predictions=summary_df['base_model_summary'].tolist(),
    references=summary_df['human_summary'].tolist(),
    use_aggregator=True,
    use_stemmer=True,
)
chk_2000_rouge = rouge.compute(
    predictions=summary_df['chk_2000_summary'].tolist(),
    references=summary_df['human_summary'].tolist(),
    use_aggregator=True,
    use_stemmer=True,
)
chk_1246_rouge = rouge.compute(
    predictions=summary_df['chk_1246_summary'].tolist(),
    references=summary_df['human_summary'].tolist(),
    use_aggregator=True,
    use_stemmer=True,
)

# Collect the scores into one table: one row per model, one column per ROUGE variant.
rouge_results = pd.DataFrame({
    'base_model_rouge': base_model_rouge,
    'chk_2000_rouge': chk_2000_rouge,
    'chk_1246_rouge': chk_1246_rouge,
}).T

In [136]:
rouge_results  # display the ROUGE comparison table

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
base_model_rouge,0.243815,0.094862,0.224181,0.22498
chk_2000_rouge,0.365717,0.131159,0.281413,0.281743
chk_1246_rouge,0.390178,0.157443,0.324932,0.325072


In [143]:
# Save the completions to a file.
summaries_csv_path = Path(ROOT_DIR, "data", "dialogue_summaries.csv")
# Create the output directory if needed so the write does not fail on a missing folder.
summaries_csv_path.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(summaries_csv_path, index=True)