In [None]:
from pathlib import Path

import evaluate
import torch
from datasets import load_dataset, load_from_disk
from tqdm import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer, pipeline

In [2]:
# Notebook configuration: the checkpoint identifier and where the on-disk dataset lives.
model_name = "bert-base-uncased"
# Resolved relative to the parent of the current working directory.
dataset_path = Path.cwd().parent.joinpath("data", "tokenized_squad")

In [3]:
# Instantiate the question-answering model and its matching tokenizer from the
# same checkpoint name so the vocabulary and the weights stay in sync.
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load the dataset stored on disk at `dataset_path` (presumably the tokenized
# SQuAD data, going by the directory name).
# NOTE(review): `tokenized_dataset` is not referenced by any other cell visible
# in this notebook; confirm it is still needed.
tokenized_dataset = load_from_disk(dataset_path)

In [None]:
# Build everything the evaluation loop needs: the SQuAD validation split, a
# question-answering pipeline and the SQuAD metric.
#
# `model_name`, `tokenizer` and `model` are the objects created in the cells
# above; the checkpoint is deliberately not loaded a second time here.
# NOTE(review): transformers warns that `qa_outputs.*` are newly initialized for
# this checkpoint, i.e. the QA head is untrained, so the scores produced below
# should be read as an un-fine-tuned baseline.

# Load SQuAD dataset
dataset = load_dataset("squad", split="validation")

# Use the first CUDA GPU when one is visible, otherwise fall back to CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Initialize QA pipeline
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Prepare evaluation
squad_metric = evaluate.load("squad")
predictions = []
references = []

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [None]:
# Run the QA pipeline over every sample and collect predictions/references in
# the record format passed to `squad_metric.compute`.

# Start from empty lists so that re-running this cell does not append a second
# copy of every sample on top of the results of a previous run.
predictions = []
references = []

# Wrapping the dataset lets tqdm derive the total from `len(dataset)` instead of
# a hard-coded row count, and the bar is closed automatically when the loop ends.
for sample in tqdm(dataset, desc="Evaluating"):
    # Generate prediction
    prediction = qa_pipeline(
        question=sample["question"],
        context=sample["context"],
        max_seq_len=384,
        doc_stride=128,
        handle_impossible_answer=False,
    )

    # Prediction record: sample id plus the answer text returned by the pipeline.
    predictions.append(
        {
            "id": sample["id"],
            "prediction_text": prediction["answer"],
        }
    )

    # Reference record: same id paired with the dataset's `answers` field.
    references.append(
        {
            "id": sample["id"],
            "answers": sample["answers"],
        }
    )

In [None]:
# Score the collected predictions against the references and report both
# metrics, one per line, rounded to two decimals.
results = squad_metric.compute(references=references, predictions=predictions)
print(
    f"F1 Score: {results['f1']:.2f}\n"
    f"Exact Match: {results['exact_match']:.2f}"
)