In [None]:
!pip install transformers datasets evaluate
!pip install --upgrade transformers

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
from evaluate import load


In [None]:
# Checkpoint name shared by the tokenizer and the question-answering model
model_name = "bert-base-uncased"

# Tokenizer and model are both built from the same pre-trained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# SQuAD 2.0 dataset (the splits are indexed by name later in the notebook)
dataset = load_dataset("squad_v2")

In [None]:
# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Args:
        examples: batch mapping with "question", "context" and "answers"
            columns; each answers entry exposes "answer_start" and "text" lists.

    Returns:
        The tokenizer output (with "offset_mapping" removed) extended with
        "start_positions" and "end_positions": one token index per example.

    Labelling rules:
        * no recorded answer -> (0, 0)
        * answer not fully inside the tokenized context -> (0, 0)
        * otherwise -> indices of the first and last tokens covering the
          answer's character span (only the first listed answer is used).

    NOTE(review): the context is truncated to fit max_length=384 without any
    overflow windows, so an answer that falls in the truncated tail receives
    the same (0, 0) label as a genuinely unanswerable question.
    """
    cleaned_questions = [question.strip() for question in examples["question"]]
    encoded = tokenizer(
        cleaned_questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    all_offsets = encoded.pop("offset_mapping")
    all_answers = examples["answers"]
    start_positions, end_positions = [], []

    for row, offsets in enumerate(all_offsets):
        answer = all_answers[row]
        # Default label, used for "no answer" and for answers outside the context.
        first_token, last_token = 0, 0

        if len(answer["answer_start"]) != 0:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            segment_ids = encoded.sequence_ids(row)

            # The context is the run of tokens whose sequence id is 1.
            context_start = 0
            while segment_ids[context_start] != 1:
                context_start += 1
            cursor = context_start
            while cursor < len(segment_ids) and segment_ids[cursor] == 1:
                cursor += 1
            context_end = cursor - 1

            answer_in_context = (
                start_char >= offsets[context_start][0]
                and end_char <= offsets[context_end][1]
            )
            if answer_in_context:
                # Walk right past every token that starts at or before the
                # answer start; the one just before the stop point is the label.
                left = context_start
                while left <= context_end and offsets[left][0] <= start_char:
                    left += 1
                first_token = left - 1

                # Mirror image from the right for the answer end.
                right = context_end
                while right >= context_start and offsets[right][1] >= end_char:
                    right -= 1
                last_token = right + 1

        start_positions.append(first_token)
        end_positions.append(last_token)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

In [None]:
# Run the preprocessing over the dataset in batches; the original columns
# (named after the train split's columns) are dropped from the result.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Training configuration for the Trainer
training_args = TrainingArguments(
    # Where run outputs are written
    output_dir="./results",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both happen per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

In [None]:
# Load the "squad_v2" metric via evaluate.load.
# NOTE(review): nothing else visible in this notebook references `metric`;
# compute_metrics below reports plain start/end index accuracy instead.
metric = load("squad_v2")

def compute_metrics(eval_pred):
    """Report how often the arg-max start/end predictions equal the labels.

    Args:
        eval_pred: object whose ``predictions`` unpacks into
            (start_logits, end_logits) and whose ``label_ids`` unpacks into
            (start_labels, end_labels).

    Returns:
        dict with "start_accuracy" and "end_accuracy": the mean of the
        element-wise equality between predicted and labelled indices.
    """
    start_logits, end_logits = eval_pred.predictions
    start_labels, end_labels = eval_pred.label_ids

    # The predicted index is the highest-scoring entry along the last axis.
    predicted_start = start_logits.argmax(-1)
    predicted_end = end_logits.argmax(-1)

    start_hits = predicted_start == start_labels
    end_hits = predicted_end == end_labels

    scores = {}
    scores["start_accuracy"] = start_hits.mean()
    scores["end_accuracy"] = end_hits.mean()
    return scores

# Create Trainer instance
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# of Trainer is deprecated and dropped in transformers 5.x, and this notebook
# installs the latest transformers release (`pip install --upgrade`).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Mount Google Drive at /content/drive so files under it can be read and
# written from this notebook. `google.colab` is imported here, so this cell
# needs an environment that provides that package.
from google.colab import drive
drive.mount('/content/drive')

## First-time setup only: create (or wipe and recreate) the model output directory

In [None]:
# WARNING: running this cell erases whatever was previously saved in the
# model folder below.

import os
import shutil

model_saving_path = '/content/drive/MyDrive/NLP_BERT_Model'

# Start from an empty folder: drop the existing one (if any), then create it.
if os.path.exists(model_saving_path):
    shutil.rmtree(model_saving_path)
os.makedirs(model_saving_path)

## Trainer

In [None]:
# Train the model
trainer.train()

# Evaluate the model. The returned metrics are kept and printed explicitly:
# this call is not the last expression of the cell, so the notebook would
# otherwise not display its return value.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model
trainer.save_model(model_saving_path)