<a href="https://colab.research.google.com/github/surajverma0491/Question-Answering-System-Using-Transformers/blob/main/Question_Answering_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🛠️ STEP 1: Install Required Libraries

In [2]:
!pip install transformers==4.39.3 datasets evaluate accelerate


Collecting transformers==4.39.3
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.39.3)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.

# 🛠️ STEP 2: Import Modules

In [3]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.39.3
    Uninstalling transform

In [4]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AlbertTokenizerFast,
    AlbertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)

# 🧠 STEP 3: Load the ALBERT-base-v2 Model and Tokenizer

In [5]:
# A single checkpoint name drives both loads so the tokenizer and the weights
# always come from the same model. The load warning recorded below this cell
# reports that the `qa_outputs` head is newly initialised, i.e. it must be
# fine-tuned before its predictions are meaningful.
model_name = "albert-base-v2"
tokenizer, model = (
    AlbertTokenizerFast.from_pretrained(model_name),
    AlbertForQuestionAnswering.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 📦 STEP 4: Load the SQuAD2.0 Dataset

In [6]:
# Load SQuAD 2.0; the cells below read its "train" and "validation" splits.
dataset = load_dataset("squad_v2")

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

# 🧹 STEP 5: Preprocess the Dataset

In [8]:
def preprocess(example):
    """Tokenize a batch of SQuAD examples and label every window with its answer span.

    Long contexts are split into overlapping windows (``max_length=384``,
    ``stride=128``), so one example can yield several features. Each feature
    gets a ``start_positions`` / ``end_positions`` token index:

    * the answer's first/last token when the answer lies fully inside the window;
    * the ``[CLS]`` index when the example has no answer or the answer is not
      fully contained in the window.

    Args:
        example: batch dict with ``"question"``, ``"context"`` and ``"answers"``
            lists (``answers[i]`` holds ``"text"`` and ``"answer_start"`` lists).

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` removed and
        ``start_positions`` / ``end_positions`` added.
    """
    questions = [q.strip() for q in example["question"]]
    contexts = example["context"]
    answers = example["answers"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps every produced window back to the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples["offset_mapping"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)

        answer = answers[sample_mapping[i]]
        if len(answer["answer_start"]) == 0:
            # Unanswerable question: point both labels at [CLS].
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Only the first gold answer is used as the training target.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Token range occupied by the context (sequence id 1) in this window.
        context_positions = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_positions:
            # No context token in the window, so nothing can be labelled.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue
        context_start = context_positions[0]
        context_end = context_positions[-1]

        # If the answer is not fully inside the window, label CLS.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk inwards from both ends of the context. The scans are bounded by
        # the context range: [SEP] and padding tokens carry offset (0, 0), so an
        # unbounded scan would run through them when the answer starts on the
        # last context token and would label the final padding position.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Tokenize every split in batches; the raw text columns are dropped so that
# only the tokenizer outputs and span labels remain.
encoded_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [9]:
def preprocess_validation(example):
    """Tokenize a batch of validation examples and tag each window with its source id.

    Uses the same windowing options as training (``max_length=384``,
    ``stride=128``, padded to full length) but produces no span labels.

    Args:
        example: batch dict with ``"question"``, ``"context"`` and ``"id"`` lists.

    Returns:
        The tokenizer output (``overflow_to_sample_mapping`` is kept) with an
        added ``example_id`` list holding, for every window, the id of the
        example it was cut from.
    """
    tokenizer_options = dict(
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    stripped_questions = [question.strip() for question in example["question"]]
    features = tokenizer(stripped_questions, example["context"], **tokenizer_options)

    # One entry per window: look the window's source example up by position.
    features["example_id"] = [
        example["id"][sample_idx] for sample_idx in features["overflow_to_sample_mapping"]
    ]
    return features

# Replace the validation split with features that carry `example_id`
# (and no span labels), as produced by `preprocess_validation`.
encoded_dataset["validation"] = dataset["validation"].map(
    preprocess_validation,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [10]:
# Sanity check: list the columns of the first tokenized training feature.
print(encoded_dataset["train"][0].keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'])


In [11]:
# Sanity check: show the schema of the tokenized validation features.
print(encoded_dataset["validation"].features)


{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'offset_mapping': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), 'overflow_to_sample_mapping': Value(dtype='int64', id=None), 'example_id': Value(dtype='string', id=None)}


# 📐 STEP 6: Define Metrics (EM & F1)

In [12]:
# `evaluate` is installed by the setup cell at the top of the notebook, so it
# is only imported here instead of being re-installed mid-notebook.
import evaluate

# SQuAD v2 metric object used by `compute_metrics` below.
squad_metric = evaluate.load("squad_v2")

def compute_metrics(pred):
    """Score span predictions on the validation split with ``squad_metric``.

    A validation example can be split into several overlapping features, so
    feature indices and example indices do not line up. Each feature's greedy
    answer is therefore grouped by the feature's ``example_id`` and one answer
    is kept per example: the highest-scoring non-empty answer if any window
    produced one, otherwise the empty string (no answer).

    NOTE(review): the evaluation output recorded in this notebook contains no
    EM/F1, presumably because the validation features have no labels and the
    Trainer then skips this callback. Confirm, or call it directly with the
    result of ``trainer.predict(encoded_dataset["validation"])``.

    Args:
        pred: object whose ``predictions`` attribute unpacks into
            ``(start_logits, end_logits)``, one row per feature of
            ``encoded_dataset["validation"]``, in the same order.

    Returns:
        The value returned by ``squad_metric.compute``.
    """
    start_logits, end_logits = pred.predictions
    features = encoded_dataset["validation"]
    feature_example_ids = features["example_id"]

    # example id -> (has_text, score, text); tuples compare so that a non-empty
    # answer beats an empty one and a higher score breaks ties.
    best_by_example = {}

    for i in range(len(start_logits)):
        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if end_idx >= start_idx:
            # A span on [CLS] decodes to "" because special tokens are skipped.
            answer = tokenizer.decode(
                features[i]["input_ids"][start_idx:end_idx + 1],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        else:
            answer = ""

        score = float(start_logits[i][start_idx]) + float(end_logits[i][end_idx])
        candidate = (answer != "", score, answer)

        example_id = feature_example_ids[i]
        current = best_by_example.get(example_id)
        if current is None or candidate[:2] > current[:2]:
            best_by_example[example_id] = candidate

    n_best = []
    references = []
    for example in dataset["validation"]:
        best = best_by_example.get(example["id"])
        if best is None:
            # No feature of this example was part of `pred`; leave it unscored.
            continue
        n_best.append({
            "id": example["id"],
            "prediction_text": best[2],
            "no_answer_probability": 0.0
        })
        references.append({"id": example["id"], "answers": example["answers"]})

    return squad_metric.compute(predictions=n_best, references=references)

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

# 🧪 STEP 7: Training Arguments (Optimized for Colab GPU ~1.5 hours)

In [36]:
# Hyper-parameters for one fine-tuning epoch, with evaluation and a checkpoint
# at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results_albert_squad2",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    save_strategy="epoch",
    report_to="none",
    # Keep the Trainer's column pruning on: the tokenized splits also carry
    # `offset_mapping` / `example_id`, which must not be forwarded to the model.
    remove_unused_columns=True,
)



# 🏋️ STEP 8: Initialize Trainer

In [14]:
# Wire the model, the tokenized splits and the metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    # `tokenizer=` is deprecated in current transformers releases (presumably the
    # source of the truncated warning recorded under this cell); the replacement
    # keyword is `processing_class=`.
    processing_class=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


# 🚀 STEP 9: Train the Model

In [15]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8068,No log


TrainOutput(global_step=4124, training_loss=0.9903446922709246, metrics={'train_runtime': 3326.4743, 'train_samples_per_second': 39.669, 'train_steps_per_second': 1.24, 'total_flos': 2185600870665216.0, 'train_loss': 0.9903446922709246, 'epoch': 1.0})

# 💾 STEP 10: Save Fine-tuned Model & Tokenizer

In [16]:
# Persist the fine-tuned weights and the tokenizer files.
# Note that they are written to two separate directories.
model.save_pretrained("./albert-squad2-model")
tokenizer.save_pretrained("./albert-squad2-tokenizer")

('./albert-squad2-tokenizer/tokenizer_config.json',
 './albert-squad2-tokenizer/special_tokens_map.json',
 './albert-squad2-tokenizer/spiece.model',
 './albert-squad2-tokenizer/added_tokens.json',
 './albert-squad2-tokenizer/tokenizer.json')

# ✅ STEP 11: Evaluate & Print Final Metrics

In [17]:
# Evaluate on the Trainer's eval dataset and print the returned metrics dict.
# NOTE(review): the recorded output holds only runtime statistics (no loss, EM
# or F1), presumably because the validation features carry no labels; confirm.
metrics = trainer.evaluate()
print(metrics)

{'eval_runtime': 113.0032, 'eval_samples_per_second': 107.705, 'eval_steps_per_second': 3.372, 'epoch': 1.0}


# ⭐ STEP 12: Get Answer for the Given Context and Question

In [52]:
def get_answer(question, context, tokenizer, model, max_answer_len=30):
    """Extract the answer to ``question`` from ``context`` with a QA model.

    The context is split into overlapping windows (``max_length=384``,
    ``stride=128``) so that long contexts are searched in full. In every window
    only context tokens may start or end a span, the span must be ordered and
    at most ``max_answer_len`` tokens long, and the best span over all windows
    is compared with the lowest ``[CLS]`` ("no answer") score.

    Args:
        question: question text.
        context: passage the answer is extracted from.
        tokenizer: fast tokenizer matching ``model`` (offset mappings and
            ``sequence_ids`` are required).
        model: span-prediction model returning ``start_logits`` / ``end_logits``.
        max_answer_len: maximum answer length in tokens.

    Returns:
        The answer as a substring of ``context``, or ``""`` when the model
        prefers "no answer" or no valid span exists.
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=384,
        stride=128,
        # Without this flag `stride` has no effect and long contexts are cut off.
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Bookkeeping entries that must not be passed to the model.
    offset_mapping = inputs.pop("offset_mapping")
    inputs.pop("overflow_to_sample_mapping", None)
    num_windows = len(offset_mapping)

    # True where a token belongs to the context (sequence id 1), per window.
    context_mask = torch.tensor(
        [[seq_id == 1 for seq_id in inputs.sequence_ids(w)] for w in range(num_windows)],
        dtype=torch.bool,
    )
    cls_positions = [
        inputs["input_ids"][w].tolist().index(tokenizer.cls_token_id)
        for w in range(num_windows)
    ]

    # Move model and inputs to GPU if available
    if torch.cuda.is_available():
        model = model.to("cuda")
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Inference must run in eval mode (dropout off); restore the mode afterwards
    # so that a model in the middle of training is left as it was found.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    start_logits = outputs.start_logits.float().cpu()
    end_logits = outputs.end_logits.float().cpu()

    # valid_span[s, e] is True when s <= e and the span has <= max_answer_len tokens.
    seq_len = start_logits.size(1)
    positions = torch.arange(seq_len)
    span_offset = positions[None, :] - positions[:, None]
    valid_span = (span_offset >= 0) & (span_offset < max_answer_len)

    neg_inf = float("-inf")
    best_score = neg_inf
    best_span = None
    min_null_score = float("inf")

    for w in range(num_windows):
        cls_index = cls_positions[w]
        null_score = (start_logits[w, cls_index] + end_logits[w, cls_index]).item()
        min_null_score = min(min_null_score, null_score)

        # Question, special and padding tokens can never delimit an answer.
        starts = start_logits[w].masked_fill(~context_mask[w], neg_inf)
        ends = end_logits[w].masked_fill(~context_mask[w], neg_inf)
        span_scores = (starts[:, None] + ends[None, :]).masked_fill(~valid_span, neg_inf)

        start_index, end_index = divmod(torch.argmax(span_scores).item(), seq_len)
        score = span_scores[start_index, end_index].item()
        if score > best_score:
            best_score = score
            best_span = (w, start_index, end_index)

    # No usable span, or "no answer" outscores the best span.
    if best_span is None or min_null_score > best_score:
        return ""

    # Get character positions from offset
    window, start_index, end_index = best_span
    start_char = int(offset_mapping[window][start_index][0])
    end_char = int(offset_mapping[window][end_index][1])

    # Return extracted answer
    return context[start_char:end_char]

# 🧪 Example Usage:
context = (
    "The Great Wall of China is a series of fortifications built across northern China over several centuries to protect against invasions. "
    "Construction began as early as the 7th century BC, and various dynasties contributed to the expansion and reinforcement of the wall. "
    "The most well-known sections were built during the Ming dynasty. "
    "The wall stretches over 13,000 miles and is considered one of the most impressive architectural feats in history."
)
question = "During which dynasty were the most well-known sections of the Great Wall built?"

# Query the fine-tuned weights held by the trainer and show the extracted span.
answer = get_answer(question, context, tokenizer, trainer.model)
print(f"Answer: {answer}")


Answer: Ming dynasty


# ✅ STEP 13: Exact Match & F1-Score

In [48]:
def get_predictions(trainer, dataset, tokenizer):
    """Predict one answer per raw SQuAD example and build matching references.

    ``dataset`` is a *raw* split (columns ``id``, ``question``, ``context``,
    ``answers``). It is tokenized here into overlapping windows, the windows
    are scored with ``trainer.predict``, and the windows of each example are
    merged: the best ordered span (at most 30 tokens, limited to context
    tokens) is kept unless the lowest ``[CLS]`` score of the example is at
    least as high, in which case the prediction is the empty string.

    Args:
        trainer: object with a ``predict(dataset)`` method whose result exposes
            ``predictions`` starting with ``(start_logits, end_logits)``.
        dataset: raw examples; must provide ``map``, ``column_names`` and iteration.
        tokenizer: fast tokenizer matching the trainer's model.

    Returns:
        ``(predictions, references)`` in the SQuAD v2 metric layout. Ids are the
        dataset's own example ids, and references keep every gold answer
        (an empty list for unanswerable questions).
    """
    n_best = 20          # start/end candidates examined per window
    max_answer_len = 30  # maximum answer length in tokens

    def tokenize_examples(batch):
        # Same windowing options as the preprocessing used for training.
        encoded = tokenizer(
            [q.strip() for q in batch["question"]],
            batch["context"],
            truncation="only_second",
            max_length=384,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        sample_mapping = encoded.pop("overflow_to_sample_mapping")
        encoded["example_id"] = [batch["id"][s] for s in sample_mapping]
        # Blank the offsets of everything outside the context so that question,
        # special and padding tokens can never be chosen as part of an answer.
        encoded["offset_mapping"] = [
            [off if seq_id == 1 else None
             for off, seq_id in zip(offsets, encoded.sequence_ids(i))]
            for i, offsets in enumerate(encoded["offset_mapping"])
        ]
        return encoded

    features = dataset.map(
        tokenize_examples, batched=True, remove_columns=dataset.column_names
    )

    # Hand the model only tensors it accepts, whatever the Trainer's
    # `remove_unused_columns` setting is.
    raw_predictions = trainer.predict(
        features.remove_columns(["offset_mapping", "example_id"])
    )
    start_logits, end_logits = raw_predictions.predictions[:2]

    # example id -> indices of the windows cut from that example
    features_per_example = {}
    for feature_index, example_id in enumerate(features["example_id"]):
        features_per_example.setdefault(example_id, []).append(feature_index)

    predictions = []
    references = []

    for example in dataset:
        context = example["context"]
        best_score = None
        best_text = ""
        min_null_score = None

        for feature_index in features_per_example.get(example["id"], []):
            starts = start_logits[feature_index]
            ends = end_logits[feature_index]
            feature = features[feature_index]
            offsets = feature["offset_mapping"]

            # "No answer" score of this window; keep the lowest one.
            cls_index = list(feature["input_ids"]).index(tokenizer.cls_token_id)
            null_score = float(starts[cls_index]) + float(ends[cls_index])
            if min_null_score is None or null_score < min_null_score:
                min_null_score = null_score

            top_starts = np.argsort(starts)[-1:-n_best - 1:-1].tolist()
            top_ends = np.argsort(ends)[-1:-n_best - 1:-1].tolist()
            for start in top_starts:
                for end in top_ends:
                    # Ensure valid start and end positions
                    if start >= len(offsets) or end >= len(offsets):
                        continue
                    if offsets[start] is None or offsets[end] is None:
                        continue
                    if end < start or end - start + 1 > max_answer_len:
                        continue
                    score = float(starts[start]) + float(ends[end])
                    if best_score is None or score > best_score:
                        best_score = score
                        best_text = context[offsets[start][0]:offsets[end][1]]

        # No valid span, or "no answer" scores at least as high.
        if best_score is None or (min_null_score is not None and min_null_score >= best_score):
            best_text = ""

        predictions.append({
            "id": example["id"],
            "prediction_text": best_text,
            "no_answer_probability": 0.0,
        })
        references.append({"id": example["id"], "answers": example["answers"]})

    return predictions, references


In [53]:
# SQuAD 2.0 contains unanswerable questions, so predictions are scored with the
# `squad_v2` metric; the v1 `squad` metric has no notion of "no answer".
# Loading `squad_v2` also keeps `squad_metric` consistent with `compute_metrics`.
# NOTE(review): the AttributeError recorded under this cell mentions a reset
# `AcceleratorState`; presumably the TrainingArguments cell was re-run after the
# Trainer had been created. Rebuild the Trainer (or Restart & Run All) before
# running this cell; confirm.
import evaluate
squad_metric = evaluate.load("squad_v2")

# Run prediction on the raw validation split.
preds, refs = get_predictions(trainer, dataset["validation"], tokenizer)

# The v2 metric expects a `no_answer_probability` on every prediction; fill in a
# neutral default where the prediction does not already provide one.
preds = [{"no_answer_probability": 0.0, **prediction} for prediction in preds]

# Get Exact Match (EM) and F1 score (`squad_v2` reports EM under the key "exact").
results = squad_metric.compute(predictions=preds, references=refs)
print(f"Exact Match (EM): {results['exact']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

AttributeError: `AcceleratorState` object has no attribute `distributed_type`. This happens if `AcceleratorState._reset_state()` was called and an `Accelerator` or `PartialState` was not reinitialized.