In [1]:
# install necessary libraries
!pip install -U transformers datasets huggingface_hub

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [2]:
# import everything
from huggingface_hub import login
from datasets import load_dataset
from transformers import ( AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator, pipeline)
import torch

In [3]:
# login to huggingface to access models
HF_TOKEN = os.getenv("HF_TOKEN")

In [4]:
# load 5000 samples from squad and split into 80/20 train-test
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [5]:
# load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# helps pad the batches automatically
data_collator = DefaultDataCollator()

In [7]:
# preprocess the dataset
def preprocess_function(examples):
  # clean up questions and tokenize them with context
    questions = [q.strip() for q in examples["question"]] # remove extra spaces from all questions
    # tokenize questions and contexts together
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=250,  # limit total length to save memory and stay consistent
        truncation="only_second", # cut context if it's too long, not the question
        return_offsets_mapping=True, # keep track of where each token came from in the original text
        padding="max_length",  # pad all sequences to 250 tokens so batches are uniform
    )

    # store offset mappings to align answer later
    offset_mapping = inputs["offset_mapping"]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    # in case there's no valid answer
    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # find where context tokens begin and end
        context_start = None
        context_end = None
        for idx, seq_id in enumerate(sequence_ids):
            if seq_id == 1 and context_start is None:
                context_start = idx
            if seq_id == 1:
                context_end = idx
         # if answer is outside context, label it as (0, 0)
        if (
            context_start is None or context_end is None or
            offsets[context_start][0] is None or offsets[context_end][1] is None or
            offsets[context_start][0] > end_char or offsets[context_end][1] < start_char
        ):
            start_positions.append(0)
            end_positions.append(0)
        else:
           # otherwise find the closest token positions
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["example_id"] = examples["id"]
    return inputs

In [8]:
# apply preprocessing to the dataset
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# setup training config
training_args = TrainingArguments(
    output_dir="distilbert_base_squad_model",
    per_device_train_batch_size=8, # small batch size to fit within Colab GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5, # standard fine-tuning LR for transformers
    weight_decay=0.01,  # slight regularisation to prevent overfitting
    push_to_hub=False,
    report_to="none"
)
# init trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"], # training split from SQuAD
    eval_dataset=tokenized_squad["test"],   # evaluation split from SQuAD
    tokenizer=tokenizer, # tokeniser for converting predictions
    data_collator=data_collator,
)

  trainer = Trainer(


In [10]:
trainer.train()

Step,Training Loss
500,3.0429
1000,1.5077
1500,1.0673


TrainOutput(global_step=1500, training_loss=1.8726406656901042, metrics={'train_runtime': 301.9743, 'train_samples_per_second': 39.738, 'train_steps_per_second': 4.967, 'total_flos': 765545508000000.0, 'train_loss': 1.8726406656901042, 'epoch': 3.0})

In [13]:
question_answerer = pipeline(
    "question-answering",
    model="distlbert_base_squad_model_final",
    tokenizer="distlbert_base_squad_model_final"
)

Device set to use cuda:0


In [14]:
# to test model predictions
def ask_question(question, context):
    print(f"\nQ: {question}")
    print(f"Context: {context}")
    answer = question_answerer(question=question, context=context)
    print(f"Answer: {answer['answer']}")

In [15]:
ask_question(
    "How many programming languages does BLOOM support?",
    "BLOOM has 176 billion parameters and can generate text in 46 natural languages and 13 programming languages."
)


Q: How many programming languages does BLOOM support?
Context: BLOOM has 176 billion parameters and can generate text in 46 natural languages and 13 programming languages.
Answer: 46 natural languages and 13


In [16]:
ask_question("When did Chomsky earn his PhD?",
             "During his postgraduate work in the Harvard Society of Fellows, Chomsky developed the theory of transformational grammar for which he earned his doctorate in 1955.")


Q: When did Chomsky earn his PhD?
Context: During his postgraduate work in the Harvard Society of Fellows, Chomsky developed the theory of transformational grammar for which he earned his doctorate in 1955.
Answer: 1955


In [17]:
ask_question("Where is the University of Hull?",
             "The main university campus is located in Hull and is home to the Hull York Medical School, a joint initiative with the University of York.")


Q: Where is the University of Hull?
Context: The main university campus is located in Hull and is home to the Hull York Medical School, a joint initiative with the University of York.
Answer: Hull York Medical School


In [18]:
ask_question("When did harlequin rasboras first become popular?",
             "The species became an instant favorite among aquarists after its introduction in the early 1900s and is the best known and most widely kept species among the rasboras.")


Q: When did harlequin rasboras first become popular?
Context: The species became an instant favorite among aquarists after its introduction in the early 1900s and is the best known and most widely kept species among the rasboras.
Answer: 1900s


In [19]:
ask_question("What does UNFCCC stand for?",
             "The 2023 United Nations Climate Change Conference or Conference of the Parties of the UNFCCC, more commonly referred to as COP28, is the 28th United Nations Climate Change conference, being held from 30 November until 12 December 2023 at Expo City, Dubai.")


Q: What does UNFCCC stand for?
Context: The 2023 United Nations Climate Change Conference or Conference of the Parties of the UNFCCC, more commonly referred to as COP28, is the 28th United Nations Climate Change conference, being held from 30 November until 12 December 2023 at Expo City, Dubai.
Answer: COP28


In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [21]:
from collections import OrderedDict
import numpy as np
import evaluate

# load the SQuAD evaluation metric
metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [22]:
# takes raw logits and turns them into readable text answers
def postprocess_qa_predictions(examples, features, raw_predictions, tokenizer):
    all_start_logits, all_end_logits = raw_predictions
    # map example ids to their index
    example_id_to_index = {k["id"]: i for i, k in enumerate(examples)}
    features_per_example = {}

     # group features by example
    for i, feature in enumerate(features):
        example_id = feature["example_id"]
        features_per_example.setdefault(example_id, []).append(i)

    predictions = OrderedDict()

    # go through each example and find best span based on scores
    for example_id, feature_indices in features_per_example.items():
        context = examples[example_id_to_index[example_id]]["context"]
        best_score = -float("inf")
        answer = ""

        for i in feature_indices:
            start_logits = all_start_logits[i]
            end_logits = all_end_logits[i]
            offset_mapping = features[i]["offset_mapping"]

            # get token positions with highest score
            start_idx = np.argmax(start_logits)
            end_idx = np.argmax(end_logits)

            if start_idx >= len(offset_mapping) or end_idx >= len(offset_mapping):  # skip if indices are invalid
                continue

            start_char = offset_mapping[start_idx][0]
            end_char = offset_mapping[end_idx][1]
            if start_char is None or end_char is None:
                continue

            current_answer = context[start_char:end_char] # extract predicted answer and calculate score
            score = start_logits[start_idx] + end_logits[end_idx]

            if score > best_score:
                best_score = score
                answer = current_answer
        # save best prediction
        predictions[example_id] = answer

    return predictions

In [23]:
def compute_metrics(eval_preds): # evaluation logic using EM and F1 from SQuAD script
    predictions, _ = eval_preds
    final_predictions = postprocess_qa_predictions(
        examples=squad["test"],
        features=tokenized_squad["test"],
        raw_predictions=predictions,
        tokenizer=tokenizer
    )
        # format predictions and references for the metric
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in squad["test"]]
  # return F1 and EM
    return metric.compute(predictions=formatted_predictions, references=references)

In [22]:
# set up trainer again but now with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
# run eval
results = trainer.evaluate()
print(results)

  trainer = Trainer(


{'eval_loss': 1.5190998315811157, 'eval_model_preparation_time': 0.0013, 'eval_exact_match': 49.9, 'eval_f1': 60.319026888912774, 'eval_runtime': 13.7098, 'eval_samples_per_second': 72.94, 'eval_steps_per_second': 9.118}


In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
trainer.save_model("/content/drive/MyDrive/distilbert_base_squad_model_final")