# ❓ Fine-tune BERT on SQuAD v1.1

This notebook demonstrates how to fine-tune `bert-base-uncased` on the SQuAD v1.1 dataset using HuggingFace Transformers.

## 📦 Install dependencies

In [None]:
!pip install transformers datasets scikit-learn tqdm

## 📚 Load SQuAD dataset

In [None]:
from datasets import load_dataset

# Load the "squad" dataset; the result is indexed by split name below.
squad = load_dataset(path="squad")

# Show the first training record as this cell's output.
first_train_example = squad["train"][0]
first_train_example

## ✂️ Preprocessing: Tokenization and label alignment

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used to load the tokenizer for preprocessing.
TOKENIZER_CHECKPOINT = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def preprocess(example):
    """Tokenize a batch of question/context pairs and label answer token spans.

    Args:
        example: A batch (dict of lists) with "question", "context" and
            "answers" entries; each answers entry holds "answer_start" and
            "text" lists.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions"
        added (one value per tokenized feature). Features that have no answer,
        or whose context window does not fully contain the answer, point both
        positions at the CLS token.
    """
    stripped_questions = [question.strip() for question in example["question"]]
    inputs = tokenizer(
        stripped_questions,
        example["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # One entry per produced feature: the index of the source example and the
    # (char_start, char_end) offsets of every token in that feature.
    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    all_offsets = inputs.pop("offset_mapping")

    span_starts = []
    span_ends = []

    for feature_idx, token_offsets in enumerate(all_offsets):
        token_ids = inputs["input_ids"][feature_idx]
        no_answer_index = token_ids.index(tokenizer.cls_token_id)

        segment_ids = inputs.sequence_ids(feature_idx)
        gold = example["answers"][feature_to_example[feature_idx]]

        # No annotated answer: label the feature with the CLS position.
        if len(gold["answer_start"]) == 0:
            span_starts.append(no_answer_index)
            span_ends.append(no_answer_index)
            continue

        answer_begin = gold["answer_start"][0]
        answer_stop = answer_begin + len(gold["text"][0])

        # First and last token belonging to the context (sequence id 1).
        left = 0
        while segment_ids[left] != 1:
            left += 1
        right = len(token_ids) - 1
        while segment_ids[right] != 1:
            right -= 1

        # Answer not fully inside this context window: label with CLS as well.
        if token_offsets[left][0] > answer_begin or token_offsets[right][1] < answer_stop:
            span_starts.append(no_answer_index)
            span_ends.append(no_answer_index)
            continue

        # Advance past every token starting at or before the answer start;
        # the token just before the stopping point is the answer's first token.
        while left < len(token_offsets) and token_offsets[left][0] <= answer_begin:
            left += 1
        span_starts.append(left - 1)

        # Mirror image from the right for the answer's last token.
        while right >= 0 and token_offsets[right][1] >= answer_stop:
            right -= 1
        span_ends.append(right + 1)

    inputs["start_positions"] = span_starts
    inputs["end_positions"] = span_ends
    return inputs

# `preprocess` returns one row per tokenized feature, so the original
# per-example columns are dropped from the mapped dataset.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess,
    batched=True,
    remove_columns=raw_columns,
)


## 🏋️ Fine-tune BERT for QA

In [None]:
import inspect

from transformers import (
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed before the model is created so that any weights not present in the
# checkpoint (presumably the QA head) are initialized reproducibly.
set_seed(42)

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell does not pin a version, so use whichever keyword this
# installation's TrainingArguments actually accepts.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="./outputs/bert-qa",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=100,
    save_total_limit=1,
    **{eval_strategy_key: "epoch"},
)

# Same situation for Trainer: `tokenizer=` was superseded by `processing_class=`.
trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    **{tokenizer_key: tokenizer},
)

trainer.train()


## 📊 Evaluate on validation set

In [None]:
# Evaluate with the Trainer configured above and display what it returns.
eval_metrics = trainer.evaluate()
eval_metrics

## 🔍 Inference on a custom QA example

In [None]:
import torch

question = "Where did Barack Obama grow up?"
context = "Barack Obama was born in Hawaii and later lived in Indonesia before returning to Hawaii."

# Switch off dropout for prediction and put the inputs on the same device as
# the model (the Trainer may have moved it to an accelerator during training).
model.eval()
inputs = tokenizer(question, context, return_tensors="pt").to(model.device)

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

# Most likely start token and (exclusive) end token of the answer span.
# NOTE: the two argmaxes are independent; if end <= start the slice below is
# empty and the printed answer is an empty string.
start = int(torch.argmax(outputs.start_logits))
end = int(torch.argmax(outputs.end_logits)) + 1

answer_token_ids = inputs["input_ids"][0][start:end].tolist()
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_token_ids))

print(f"Answer: {answer}")
