In [None]:
# Building a question-answering model

In [None]:
# installing the required libraries

In [6]:
pip install transformers datasets tensorflow evaluate

Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import tensorflow as tf
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
import numpy as np
import collections
import re

In [None]:
# load the dataset

In [9]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from datasets import load_dataset
import pandas as pd

# Fetch the training split of the NewsQA dataset from the Hugging Face Hub.
dataset = load_dataset("elsayed2002/newsqa-dataset", split="train")

# Copy the records into a DataFrame so they are easy to inspect.
df = pd.DataFrame(dataset)

# Show the first rows, then the available column names.
print(df.head(), df.columns, sep="\n")

    id                                            context  \
0  0_0  A high court in northern India on Friday acqui...   
1  0_1  A high court in northern India on Friday acqui...   
2  0_2  A high court in northern India on Friday acqui...   
3  0_3  A high court in northern India on Friday acqui...   
4  0_4  A high court in northern India on Friday acqui...   

                                            question  \
0          What was the amount of children murdered?   
1               When was Pandher sentenced to death?   
2  The court aquitted Moninder Singh Pandher of w...   
3                                  who was acquitted   
4                                  who was sentenced   

                                             answers  
0           {'answer_start': [260], 'text': ['19 ']}  
1     {'answer_start': [231], 'text': ['February.']}  
2  {'answer_start': [582], 'text': ['rape and mur...  
3  {'answer_start': [165], 'text': ['Moninder Sin...  
4  {'answer_start': [

In [None]:
# renaming the columns to match pipeline

In [11]:
# The dataset loaded above already uses `context`, `question` and `answers`
# (see the printed column listing), so this mapping only takes effect for
# sources that still carry the legacy `text` / `answer` names. A legacy column
# is renamed only when it exists and its target name is not already taken.
legacy_names = {"text": "context", "answer": "answers"}
df = df.rename(
    columns={
        old: new
        for old, new in legacy_names.items()
        if old in df.columns and new not in df.columns
    }
)

# Fail loudly instead of letting a silent no-op rename hide a schema mismatch.
missing_columns = {"context", "question", "answers"} - set(df.columns)
if missing_columns:
    raise KeyError(f"dataset is missing required columns: {sorted(missing_columns)}")

In [12]:
def _as_squad_answer(answer, context):
    """Return ``answer`` as ``{'text': [...], 'answer_start': [...]}``.

    Entries that already have that structure are returned untouched, so
    re-running the cell does not nest them further. Any other value is treated
    as a plain answer string and located inside ``context``; when it is empty
    or cannot be found, both lists are left empty (no answer).
    """
    if isinstance(answer, dict) and {"text", "answer_start"} <= set(answer):
        return answer

    text = str(answer)
    start = context.find(text) if text else -1
    if start < 0:
        return {"text": [], "answer_start": []}
    return {"text": [text], "answer_start": [start]}


# Built with a list comprehension (not DataFrame.apply) so each dict is stored
# as a single cell value in the `answers` column.
df["answers"] = [
    _as_squad_answer(answer, context)
    for answer, context in zip(df["answers"], df["context"])
]

In [13]:
from datasets import Dataset

# Wrap the pandas frame back into a Hugging Face Dataset.
dataset_hf = Dataset.from_pandas(df=df)

In [14]:
# Load the training split again; `dataset` is the object the later
# tokenization cell maps over.
dataset = load_dataset(
    path="elsayed2002/newsqa-dataset",
    split="train",
)

In [None]:
# preparing the dataset

In [None]:
# Convert the Hugging Face dataset to a pandas DataFrame so its columns can be inspected and cleaned; each `answers` entry is a dictionary with `text` and `answer_start` lists.

In [16]:
import pandas as pd

# Rebuild `df` from the freshly loaded `dataset`; this replaces any frame
# that was bound to `df` before this cell.
df = pd.DataFrame(data=dataset)

# List the columns that are available.
print(df.columns)

Index(['id', 'context', 'question', 'answers'], dtype='object')


In [19]:
# Discard every row that lacks a context, a question or an answer.
# NOTE(review): this only cleans `df`; the `dataset` object is not touched here.
df = df.dropna(
    subset=['context', 'question', 'answers'],
)

In [None]:
# convert to question answer format

In [None]:
# apply formatting

In [33]:
# Display the column index of `df` as the cell output.
df.columns

Index(['id', 'context', 'question', 'answers'], dtype='object')

In [26]:
# Print the dataset summary followed by its first record.
print(dataset_hf, dataset_hf[0], sep="\n")

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 106819
})
{'id': '0_0', 'context': 'A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."Moninder Singh Pandher was sentenced to death by a lower court in February.The teen was one of 19 victims -- children and young women -- in one of the most gruesome serial killings in India in recent years.The Allahabad high court has acquitted Moninder Singh Pandher, his lawyer Sikandar B. Kochar told CNN.Pandher and his domestic employee Surinder Koli were sentenced to death in February by a lower court for the rape and murder of the 14-year-old.The high court upheld Koli\'s death sentence, Kochar said.The two were arrested two years ago after body parts packed in plastic bags were found near their home in Noida, a New Delhi suburb. Their home was later dubbed a "house of horrors" by the Indian media.Pandher

In [None]:
# tokenization

In [34]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to build the training features.
MODEL = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL)

# Windowing parameters passed to the tokenizer by prepare_train_features:
# `max_length` caps each window, `doc_stride` is handed over as its `stride`.
max_length = 300
doc_stride = 100

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# applying tokenization function to dataset

In [46]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label every window with its answer span.

    Each question/context pair is tokenized with truncation applied to the
    context only. Overflowing contexts are split into several overlapping
    windows, so one example can produce more than one feature.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and
    ``doc_stride`` variables.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` entries. Each answer is a mapping with ``"text"``
            and ``"answer_start"`` sequences; only the first element of each
            is used.

    Returns:
        The tokenizer output (without the overflow and offset mappings) plus
        ``"start_positions"`` and ``"end_positions"``, one pair per window.
        Windows with no usable answer, or whose context slice does not fully
        contain the answer, are labelled ``(0, 0)``.
    """
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Window -> source example index, and per-token character offsets.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        answer = examples["answers"][sample_index]

        # No answer text or no start offset: label the window as unanswerable.
        if len(answer["text"]) == 0 or len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Answer strings may carry surrounding whitespace (e.g. '19 '). Counting
        # it would move end_char past the real answer and can shift the end
        # label onto the following token, so trim it and shift the start offset
        # by the number of leading characters removed.
        raw_text = answer["text"][0]
        answer_text = raw_text.strip()
        if not answer_text:
            start_positions.append(0)
            end_positions.append(0)
            continue

        leading_whitespace = len(raw_text) - len(raw_text.lstrip())
        start_char = answer["answer_start"][0] + leading_whitespace
        end_char = start_char + len(answer_text)
        sequence_ids = tokenized.sequence_ids(i)

        # First and last token of the context (sequence id 1) in this window.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        token_end_index = len(sequence_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Answer not fully inside this window's context slice.
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Advance past the last token that starts at or before the answer,
            # then step back one to land on it.
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)

            # Walk back past every token that ends at or after the answer end,
            # then step forward one to land on the answer's last token.
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized


In [47]:
# Run the feature builder over the dataset in batches. The original columns
# ('id', 'context', ...) are dropped so only the function's output remains.
tokenized_dataset = dataset.map(
    function=prepare_train_features,
    remove_columns=dataset.column_names,
    batched=True,
)

Map:   0%|          | 0/106819 [00:00<?, ? examples/s]

In [None]:
# convert hugging face dataset to tensorflow dataset

In [48]:
import tensorflow as tf

# Number of tokenized windows per training batch.
batch_size = 5

# Expose the tokenized features as a shuffled, batched TensorFlow dataset:
# token ids and attention mask as inputs, answer span positions as labels.
tf_train_dataset = tokenized_dataset.to_tf_dataset(
    batch_size=batch_size,
    shuffle=True,
    columns=["input_ids", "attention_mask"],
    label_cols=["start_positions", "end_positions"],
)

In [None]:
# loading pretrained model using tensorflow itself

In [56]:
pip install transformers==4.30.2

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==4.30.2
  Using cached transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Using cached tokenizers-0.13.3.tar.gz (314 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Building wheels for collected packages: tokenizers
  Building wheel for tokenizers (pyproject.toml): started
  Building wheel for tokenizers (pyproject.toml): finished with status 'error'
Failed to build tokenizers
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Building wheel for tokenizers (pyproject.toml) did not run successfully.
  exit code: 1
  
  [62 lines of output]
  !!
  
          ********************************************************************************
          Please consider removing the following classifiers in favor of a SPDX license expression:
  
          License :: OSI Approved :: Apache Software License
  
          See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
          ********************************************************************************
  
  !!
    self._finalize_license_expression()
  running bdist_wheel
  running build
  running build_py
  creating build\lib.win-amd64-cpython-313\tokenizers
  copying py_src\tokenizers\__init__.py -> build\lib.win-amd64-cpython-313\tokenizers
  creating build\lib.win-amd64-cpython-313\tokenizers\models
  copying py_src\tokenizers\models\__init__.py -> build\lib.win-amd64-cpyth

In [58]:
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer

# Switch to a BERT checkpoint fine-tuned on SQuAD; this rebinds both MODEL and
# `tokenizer`.
MODEL = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'builtins.safe_open' object is not iterable"; presumably raised
# by the TensorFlow model load below — confirm.
model = TFAutoModelForQuestionAnswering.from_pretrained(MODEL)

TypeError: 'builtins.safe_open' object is not iterable

In [None]:
# Switched to PyTorch because the TensorFlow model could not be loaded, even after repeated attempts and after trying an older TensorFlow version.

In [61]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [63]:
import torch

# Report the installed PyTorch version and whether CUDA is usable.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.9.0+cpu
False


In [1]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# BERT-large checkpoint that is already fine-tuned on SQuAD.
MODEL = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Load the matching tokenizer and the PyTorch question-answering model.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL)
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path=MODEL)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# testing the model on some examples

In [4]:
import torch

# Longest answer, in tokens, that the span search is allowed to return.
MAX_ANSWER_TOKENS = 30

context = "Vidya is learning NLP using Hugging Face Transformers."
question = "What is Vidya learning?"

inputs = tokenizer(question, context, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Single example in the batch, so work with its logits directly.
start_logits = outputs.start_logits[0]
end_logits = outputs.end_logits[0]

# Only context tokens (sequence id 1) may be part of the answer; question and
# special tokens are excluded from the search.
in_context = torch.tensor([seq_id == 1 for seq_id in inputs.sequence_ids(0)])
start_logits = start_logits.masked_fill(~in_context, float("-inf"))
end_logits = end_logits.masked_fill(~in_context, float("-inf"))

# Score every (start, end) pair jointly. Taking the two argmaxes independently
# can give an end before the start, which decodes to an empty answer.
span_scores = start_logits[:, None] + end_logits[None, :]
token_positions = torch.arange(span_scores.size(0))
span_length = token_positions[None, :] - token_positions[:, None] + 1
valid_span = (span_length >= 1) & (span_length <= MAX_ANSWER_TOKENS)
span_scores = span_scores.masked_fill(~valid_span, float("-inf"))

# Row index of the flattened argmax is the start token, column index the end.
best_span = int(torch.argmax(span_scores))
start_index, end_index = divmod(best_span, span_scores.size(1))
end_index += 1  # slice end is exclusive, so include the last token

answer_tokens = inputs["input_ids"][0][start_index:end_index]
answer = tokenizer.decode(answer_tokens)  # decode token ids back to a string

print("Answer:", answer)

Answer: nlp
