In [1]:
# Install packages
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from datasets import load_dataset

In [4]:
# Load LLMs
model_names = {
    "BERT": "bert-large-uncased-whole-word-masking-finetuned-squad",
    "RoBERTa": "deepset/roberta-base-squad2",
    "GPT-2": "gpt2",
    "T5": "t5-large"
}

tokenizers = {name: AutoTokenizer.from_pretrained(model) for name, model in model_names.items()}
models = {name: AutoModelForQuestionAnswering.from_pretrained(model) for name, model in model_names.items()}

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForQuestionAnswer

In [9]:
# Preprocess function
def preprocessor(examples, tokenizer):
    tokenized = tokenizer(
        examples["question"], examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Debugging: print lengths to ensure consistency
    input_lengths = [len(input_id) for input_id in tokenized["input_ids"]]
    print(f"Tokenized input lengths: {input_lengths}")
    return tokenized

# Evaluation function
def evaluate_model(model_name, model, tokenizer, dataset):
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    correct = 0
    total = 0
    for example in dataset["validation"]:
        result = qa_pipeline(question=example["question"], context=example["context"])
        if result["answer"].strip().lower() == example["answers"]["text"][0].strip().lower():
            correct += 1
        total += 1
    accuracy = correct / total
    print(f"Model: {model_name}, Accuracy: {accuracy:.2f}")
    return accuracy


In [10]:
# Load data
squad = load_dataset("squad_v2")

# Tokenize datasets
tokenized_datasets = {}
for name, tokenizer in tokenizers.items():
    try:
        tokenized_datasets[name] = squad.map(lambda x: preprocessor(x, tokenizer), batched=True, remove_columns=["id", "title"])
    except Exception as e:
        print(f"Error tokenizing dataset for {name}: {e}")


results = {}
for name, (model, tokenizer) in models.items():
    try:
        results[name] = evaluate_model(name, model, tokenizer, tokenized_datasets[name])
    except Exception as e:
        print(f"Error evaluating model {name}: {e}")

for model_name, accuracy in results.items():
    print(f"Model: {model_name}, Accuracy: {accuracy:.2f}")

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Tokenized input lengths: [384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384,

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Tokenized input lengths: [384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384,

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Error tokenizing dataset for GPT-2: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Tokenized input lengths: [384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384,

TypeError: cannot unpack non-iterable BertForQuestionAnswering object

In [3]:
# Perform comparisons: SQuAD
tokenized_datasets = {name: squad.map(lambda x: preprocessor(x, tokenizer), batched=True)
                      for name, tokenizer in tokenizers.items()}

results = {name: evaluate_model(name, model, tokenizer, squad) for name, (model, tokenizer) in models.items()}

for model_name, accuracy in results.items():
    print(f"Model: {model_name}, Accuracy: {accuracy:.2f}")

NameError: name 'tokenizers' is not defined

In [1]:
import transformers
import datasets
import torch

print(transformers.__version__)
print(datasets.__version__)
print(torch.__version__)


ModuleNotFoundError: No module named 'transformers'