<a href="https://colab.research.google.com/github/surajverma0491/Question-Answering-System-Using-Transformers/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 🛠️ STEP 1: Install Required Libraries

In [1]:
!pip install transformers==4.51.3 datasets evaluate accelerate


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manyl

## 🛠️ STEP 2: Import Modules

In [2]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AlbertTokenizerFast,
    AlbertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)

## 🧠 STEP 3: Load the ALBERT-base-v2 Model and Tokenizer

In [3]:
model_name = "albert-base-v2"
tokenizer = AlbertTokenizerFast.from_pretrained(model_name)
model = AlbertForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 📦 STEP 4: Load the SQuAD2.0 Dataset

In [4]:
dataset = load_dataset("squad_v2")

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

## 🧹 STEP 5: Preprocess the Dataset

In [5]:
def preprocess(example):
    questions = [q.strip() for q in example["question"]]
    contexts = example["context"]
    answers = example["answers"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples["offset_mapping"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        if len(answer["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Find token start and end index of the context
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # If the answer is not fully inside the span, label CLS
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # Otherwise, find the exact start and end token indices
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)

            while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

encoded_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [6]:
def preprocess_validation(example):
    questions = [q.strip() for q in example["question"]]
    contexts = example["context"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    tokenized_examples["example_id"] = []

    for i, mapping in enumerate(sample_mapping):
        tokenized_examples["example_id"].append(example["id"][mapping])

    return tokenized_examples

encoded_dataset["validation"] = dataset["validation"].map(
    preprocess_validation, batched=True, remove_columns=dataset["validation"].column_names)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [7]:
print(encoded_dataset["train"][0].keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'])


In [8]:
print(encoded_dataset["validation"].features)


{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'offset_mapping': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), 'overflow_to_sample_mapping': Value(dtype='int64', id=None), 'example_id': Value(dtype='string', id=None)}


## 📐 STEP 6: Define Metrics (EM & F1)

In [9]:
!pip install -q evaluate

import evaluate

squad_metric = evaluate.load("squad_v2")

def compute_metrics(pred):
    start_logits, end_logits = pred.predictions
    n_best = []

    for i in range(len(start_logits)):
        start_idx = np.argmax(start_logits[i])
        end_idx = np.argmax(end_logits[i])

        if end_idx >= start_idx:
            answer = tokenizer.decode(
                encoded_dataset["validation"][i]["input_ids"][start_idx:end_idx+1],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
        else:
            answer = ""

        n_best.append({
            "id": str(i),
            "prediction_text": answer,
            "no_answer_probability": 0.0
        })

    references = [{"id": str(i), "answers": dataset["validation"][i]["answers"]} for i in range(len(n_best))]

    return squad_metric.compute(predictions=n_best, references=references)

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

## 🧪 STEP 7: Training Arguments

In [10]:
training_args = TrainingArguments(
    output_dir="./results_albert_squad2",
    # Replace 'evaluation_strategy' with 'eval_strategy'
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
    report_to="none"
)

## 🏋️ STEP 8: Initialize Trainer

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


## 🚀 STEP 9: Train the Model

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.812,No log


TrainOutput(global_step=4124, training_loss=0.9930080780118097, metrics={'train_runtime': 3353.6521, 'train_samples_per_second': 39.348, 'train_steps_per_second': 1.23, 'total_flos': 2185600870665216.0, 'train_loss': 0.9930080780118097, 'epoch': 1.0})

## 💾 STEP 10: Save Fine-tuned Model & Tokenizer

In [23]:
model.save_pretrained("./albert-squad2-model")
tokenizer.save_pretrained("./albert-squad2-tokenizer")

('./albert-squad2-tokenizer/tokenizer_config.json',
 './albert-squad2-tokenizer/special_tokens_map.json',
 './albert-squad2-tokenizer/spiece.model',
 './albert-squad2-tokenizer/added_tokens.json',
 './albert-squad2-tokenizer/tokenizer.json')

## ✅ STEP 11: Evaluate & Print Final Metrics

In [24]:
metrics = trainer.evaluate()
print(metrics)

{'eval_runtime': 114.6125, 'eval_samples_per_second': 106.193, 'eval_steps_per_second': 3.324, 'epoch': 1.0}


In [30]:
from datasets import load_dataset

# Load the evaluation set from the SQuAD v2 dataset
dataset = load_dataset("squad_v2")
eval_dataset = dataset["validation"]


In [32]:
example = eval_dataset[0]

for key, value in example.items():
    print(f"{key}: {value}")



id: 56ddde6b9a695914005b9628
title: Normans
context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
question: In what country is Normandy located?
answers: {'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}


In [33]:
from transformers import AutoTokenizer

# Example tokenizer name (replace with your model’s tokenizer if needed)
tokenizer = AutoTokenizer.from_pretrained("./albert-squad2-tokenizer")

def preprocess(example):
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    inputs["offset_mapping"] = [
        offset if i == 0 else [(0, 0)] * len(offset)
        for i, offset in enumerate(inputs["offset_mapping"])
    ]

    # Re-map answers
    answers = []
    for i in sample_mapping:
        answer = example["answers"]
        answers.append(answer)

    inputs["answers"] = answers
    return inputs

# Apply preprocessing to evaluation dataset
tokenized_eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=eval_dataset.column_names)


Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

## ✅ STEP 13: SQUAD-v2-Metrics

In [62]:
import torch
import evaluate  # Corrected import

# Function to get answer for a single question-context pair
def get_answer(question, context, tokenizer, model):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=384,
        stride=128,
        return_offsets_mapping=True,
        padding="max_length"
    )
    offset_mapping = inputs.pop("offset_mapping")

    if torch.cuda.is_available():
        model = model.to("cuda")
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    start_index = torch.argmax(start_logits).item()
    end_index = torch.argmax(end_logits).item()

    start_char = offset_mapping[0][start_index][0]
    end_char = offset_mapping[0][end_index][1]

    return context[start_char:end_char]

# Function to generate predictions and references for evaluation
def get_predictions(trainer, tokenized_dataset, original_dataset, tokenizer):
    raw_predictions = trainer.predict(tokenized_dataset)
    start_logits, end_logits = raw_predictions.predictions

    predictions = []
    references = []

    for i, example in enumerate(original_dataset):
        offset = tokenized_dataset[i]["offset_mapping"]
        input_ids = tokenized_dataset[i]["input_ids"]
        context = example["context"]

        start = start_logits[i].argmax()
        end = end_logits[i].argmax()

        if start >= len(offset) or end >= len(offset) or start > end:
            pred_answer = ""
        else:
            pred_tokens = input_ids[start:end + 1]
            pred_answer = tokenizer.decode(pred_tokens, skip_special_tokens=True)

        predictions.append({
            "id": str(i),
            "prediction_text": pred_answer
        })

        # Handle unanswerable questions (e.g., in SQuAD2.0)
        if len(example["answers"]["text"]) > 0:
            answer_text = example["answers"]["text"][0]
            answer_start = example["answers"]["answer_start"][0]
        else:
            answer_text = ""
            answer_start = 0

        references.append({
            "id": str(i),
            "answers": {
                "answer_start": [answer_start],
                "text": [answer_text]
            }
        })

    return predictions, references

# Function to evaluate predictions using SQuAD metric
def evaluate_squad(predictions, references):
    metric = evaluate.load("squad_v2")  # Use "squad_v2" for SQuAD2.0

    # Modify predictions for unanswerable questions (where no answer is present)
    for pred in predictions:
        if pred["prediction_text"] == "":
            # If no answer is predicted, set the answer to be empty and no_answer_probability to 1
            pred["no_answer_probability"] = 1.0
        else:
            # If an answer is predicted, set no_answer_probability to 0
            pred["no_answer_probability"] = 0.0

    return metric.compute(predictions=predictions, references=references)


In [61]:
preds, refs = get_predictions(trainer, tokenized_eval_dataset, eval_dataset, tokenizer)
metrics = evaluate_squad(preds, refs)


NameError: name 'load_metric' is not defined

In [54]:
for k, v in metrics.items():
    print(f"{k}: {v:.2f}")

exact: 22.86
f1: 23.08
total: 11873.00
HasAns_exact: 22.86
HasAns_f1: 23.08
HasAns_total: 11873.00
best_exact: 22.86
best_exact_thresh: 1.00
best_f1: 23.08
best_f1_thresh: 1.00


## ✅ STEP 14: Predicting Answer For Given Context & Question

In [58]:
import torch
from IPython.display import display, HTML
import ipywidgets as widgets

# Input widgets
context_widget = widgets.Textarea(
    value='',
    placeholder='Enter the context here',
    description='Context:',
    layout=widgets.Layout(width='100%', height='145px'),
    style={'description_width': 'initial'}
)

question_widget = widgets.Text(
    value='',
    placeholder='Enter your question here',
    description='Question:',
    layout=widgets.Layout(width='100%'),
    style={'description_width': 'initial'}
)

# Output + Button
output_widget = widgets.Output()
predict_button = widgets.Button(
    description='Predict Answer',
    button_style='success',
    layout=widgets.Layout(width='30%')
)

# Display everything
display(context_widget)
display(question_widget)
display(predict_button)
display(output_widget)

# Prediction logic
def predicting_answer_QnA(question, context, tokenizer, model):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=384,
        stride=128,
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")

    if torch.cuda.is_available():
        model = model.to("cuda")
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    start_index = torch.argmax(start_logits).item()
    end_index = torch.argmax(end_logits).item()
    start_char = offset_mapping[0][start_index][0]
    end_char = offset_mapping[0][end_index][1]
    answer = context[start_char:end_char]
    return answer

# On-click behavior
def on_predict_button_click(b):
    with output_widget:
        output_widget.clear_output()
        context = context_widget.value
        question = question_widget.value

        if not context or not question:
            print("❗ Please enter both a context and a question.")
            return

        # Load model + tokenizer
        from transformers import AutoTokenizer, AutoModelForQuestionAnswering
        tokenizer = AutoTokenizer.from_pretrained("twmkn9/albert-base-v2-squad2")
        model = AutoModelForQuestionAnswering.from_pretrained("twmkn9/albert-base-v2-squad2")

        answer = predicting_answer_QnA(question, context, tokenizer, model)

        # Display styled HTML output
        display(HTML(f"<p><b>🔍 Predicted Answer:</b> <mark><b>{answer}</b></mark></p>"))

# Attach button event
predict_button.on_click(on_predict_button_click)


Textarea(value='', description='Context:', layout=Layout(height='145px', width='100%'), placeholder='Enter the…

Text(value='', description='Question:', layout=Layout(width='100%'), placeholder='Enter your question here', s…

Button(button_style='success', description='Predict Answer', layout=Layout(width='30%'), style=ButtonStyle())

Output()