In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_from_disk
import torch
from tqdm import tqdm

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)


Device: cuda


In [4]:
# Directory holding the saved model and tokenizer files.
model_path = "../model/final"

# The tokenizer and the model are loaded from the same directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model = model.to(device)

# Switch the module to evaluation mode before generating.
model.eval()
print("Model & tokenizer loaded.")


Model & tokenizer loaded.


In [5]:
from pathlib import Path

# Location of the dataset saved to disk, relative to the notebook.
dataset_path = Path("../data") / "tokenized_squad_small"
dataset = load_from_disk(str(dataset_path))

# Show the loaded object (splits, features, row counts) as a quick check.
print(dataset)
print("Dataset loaded.")


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})
Dataset loaded.


In [6]:
def predict_answer(example, max_length=64, num_beams=4):
    """Generate the model's answer text for one tokenized example.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        example: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries for a single example; a leading batch dimension is
            added here before generation.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
            Defaults to 64, the value that used to be hard-coded.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        The first generated sequence decoded to a string with special
        tokens skipped.
    """
    # unsqueeze(0) turns each per-example sequence into a batch of size one.
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=num_beams,
            # early_stopping only applies to beam search; keep it on for the
            # default (4 beams) and off when the caller asks for a single beam.
            early_stopping=num_beams > 1,
        )

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


In [7]:
# Sanity check: generate an answer for the first validation example and
# print it next to the decoded reference.
validation_split = dataset["validation"]
example = validation_split[0]
prediction = predict_answer(example)

# Drop label ids equal to -100 before decoding (presumably the ignore/padding
# marker written when the labels were tokenized -- TODO confirm).
labels = [token_id for token_id in example["labels"] if token_id != -100]
reference = tokenizer.decode(labels, skip_special_tokens=True)

print("Prediction:", prediction)
print("Reference :", reference)


Prediction: 1852
Reference : 1852


In [8]:
import re
import string
from collections import Counter

def normalize(text):
    """Normalize an answer string for comparison.

    Lowercases the text, removes every character in ``string.punctuation``,
    collapses runs of whitespace to a single space and strips both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Delete punctuation with a translation table rather than a regex class:
    # inside ``[...]`` the sequence ``\]`` from string.punctuation is read as
    # an escaped bracket, so a regex built that way never removes backslashes.
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", without_punct).strip()

def compute_exact(a, b):
    """Return 1 if ``a`` and ``b`` are identical after ``normalize``, else 0."""
    is_match = normalize(a) == normalize(b)
    return 1 if is_match else 0

def compute_f1(a, b):
    """Token-overlap F1 between two strings after ``normalize``.

    Both strings are normalized and split on whitespace. Precision is the
    shared-token count over the token count of ``a``; recall is the same
    count over the token count of ``b``.

    Returns:
        ``1``/``0`` (int) when either side has no tokens, depending on
        whether both are empty; ``0`` when no tokens are shared; otherwise
        the harmonic mean of precision and recall as a float.
    """
    a_words = normalize(a).split()
    b_words = normalize(b).split()

    # With an empty side, the score is 1 only if both sides are empty.
    if not a_words or not b_words:
        return int(a_words == b_words)

    # Multiset intersection counts each shared token at most as often as it
    # appears on both sides.
    overlap = Counter(a_words) & Counter(b_words)
    shared = sum(overlap.values())
    if shared == 0:
        return 0

    precision = shared / len(a_words)
    recall = shared / len(b_words)
    return 2 * precision * recall / (precision + recall)


In [9]:
# Per-example scores over the whole validation split.
exact_scores, f1_scores = [], []

for example in tqdm(dataset["validation"]):
    pred = predict_answer(example)

    # Decode the reference answer, skipping label ids equal to -100
    # (presumably the ignore/padding marker -- TODO confirm).
    labels = [token_id for token_id in example["labels"] if token_id != -100]
    ref = tokenizer.decode(labels, skip_special_tokens=True)

    exact_scores.append(compute_exact(pred, ref))
    f1_scores.append(compute_f1(pred, ref))

# Both metrics are plain means of the per-example scores.
exact_match = sum(exact_scores) / len(exact_scores)
f1 = sum(f1_scores) / len(f1_scores)

print("Exact Match:", exact_match)
print("F1 Score   :", f1)


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [04:09<00:00,  8.01it/s]

Exact Match: 0.5745
F1 Score   : 0.7570155827955984





In [10]:
# Print prediction and reference side by side for the first five
# validation examples.
validation_split = dataset["validation"]
for i in range(5):
    example = validation_split[i]
    pred = predict_answer(example)

    # Skip label ids equal to -100 before decoding the reference
    # (presumably the ignore/padding marker -- TODO confirm).
    labels = [token_id for token_id in example["labels"] if token_id != -100]
    ref = tokenizer.decode(labels, skip_special_tokens=True)

    print(f"\n--- Example {i+1} ---")
    print("Prediction:", pred)
    print("Reference :", ref)



--- Example 1 ---
Prediction: 1852
Reference : 1852

--- Example 2 ---
Prediction: 1962
Reference : 1962

--- Example 3 ---
Prediction: Horace Walpole
Reference : Horace Walpole

--- Example 4 ---
Prediction: Shimer College
Reference : several regional colleges and universities

--- Example 5 ---
Prediction: Jonathan Stewart
Reference : Jonathan Stewart
