I am building an LLM from scratch for question answering, using the Stanford Question Answering Dataset (SQuAD). The dataset was obtained from Kaggle's public repository and is also available on Hugging Face.

# Setting up the Environment

In [1]:
!pip install transformers
!pip install datasets
!pip install torch




Here, I install the dependencies that I will need for my LLM.

# Load and Preprocess Data

In [2]:
import pandas as pd
from datasets import load_dataset

# Load the SQuAD v1.1 training and validation files.
# The directory is kept in one constant so the notebook can be pointed at a
# different location by editing a single line.
DATA_DIR = "/content"
train_data = pd.read_json(f"{DATA_DIR}/train-v1.1.json")
validation_data = pd.read_json(f"{DATA_DIR}/dev-v1.1.json")


The dataset is in JSON format, so I load it with the pandas library by calling the `read_json` function.

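To manage computational resources, a random sample of 200 entries from the training data and 25 entries from the validation data is taken. Note that each entry is one article, which can contain several paragraphs and many question–answer pairs.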

In [3]:
# Draw fixed-size, seeded random subsets so the notebook stays cheap to run
# and selects the same rows on every execution.
SAMPLE_SEED = 0
N_TRAIN_ROWS = 200
N_VALIDATION_ROWS = 25

train_sample = (
    train_data
    .sample(n=N_TRAIN_ROWS, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
validation_sample = (
    validation_data
    .sample(n=N_VALIDATION_ROWS, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

In [4]:
# Show the first rows of each raw frame to see how the JSON was laid out
for raw_frame in (train_data, validation_data):
    print(raw_frame.head())

                                                data  version
0  {'title': 'University_of_Notre_Dame', 'paragra...      1.1
1  {'title': 'Beyoncé', 'paragraphs': [{'context'...      1.1
2  {'title': 'Montana', 'paragraphs': [{'context'...      1.1
3  {'title': 'Genocide', 'paragraphs': [{'context...      1.1
4  {'title': 'Antibiotics', 'paragraphs': [{'cont...      1.1
                                                data  version
0  {'title': 'Super_Bowl_50', 'paragraphs': [{'co...      1.1
1  {'title': 'Warsaw', 'paragraphs': [{'context':...      1.1
2  {'title': 'Normans', 'paragraphs': [{'context'...      1.1
3  {'title': 'Nikola_Tesla', 'paragraphs': [{'con...      1.1
4  {'title': 'Computational_complexity_theory', '...      1.1


The first five rows of the training and validation datasets are printed to give an idea of what each dataset contains.

In [5]:
# Summarize the sampled training frame: row count, columns, non-null counts, dtypes and memory usage

train_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   data     200 non-null    object 
 1   version  200 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.2+ KB


In [6]:
# Summarize the sampled validation frame: row count, columns, non-null counts, dtypes and memory usage

validation_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   data     25 non-null     object 
 1   version  25 non-null     float64
dtypes: float64(1), object(1)
memory usage: 528.0+ bytes


In [8]:
# Import the Necessary Libraries

from transformers import BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_metric
import torch


Next, I define a function that preprocesses the dataset and prepares it for the next stage, which is tokenization.

In [9]:
def prepare_data_for_tokenizer(data):
    """
    Flatten raw SQuAD articles into one row per question.

    Each row of `data` is expected to carry, under the 'data' column, a dict
    with a 'title' and a list of 'paragraphs'; every paragraph has a 'context'
    string and a list of 'qas'. Only the first answer of each question is kept,
    wrapped in single-element lists under 'answer_start' and 'text'.

    Returns a DataFrame with the columns
    'id', 'title', 'context', 'question' and 'answers'.
    """

    def iter_examples():
        # Walk article -> paragraph -> question and emit one tuple per question
        for _, record in data.iterrows():
            article = record['data']
            article_title = article['title']

            for para in article['paragraphs']:
                passage = para['context']

                for item in para['qas']:
                    question_id = item['id']
                    question_text = item['question']
                    first_answer = item['answers'][0]

                    yield (
                        question_id,
                        article_title,
                        passage,
                        question_text,
                        {
                            'answer_start': [first_answer['answer_start']],
                            'text': [first_answer['text']],
                        },
                    )

    # One list per output column, filled in lock-step from the tuples above
    column_names = ('id', 'title', 'context', 'question', 'answers')
    flattened = {name: [] for name in column_names}
    for example in iter_examples():
        for name, value in zip(column_names, example):
            flattened[name].append(value)

    return pd.DataFrame(flattened)

# Flatten both sampled frames into one-row-per-question tables
train_cleaned, validation_cleaned = map(
    prepare_data_for_tokenizer, (train_sample, validation_sample)
)


In [10]:
# Peek at the first rows of each flattened frame to confirm the expected columns

for cleaned_frame in (train_cleaned, validation_cleaned):
    print(cleaned_frame.head())

                         id   title  \
0  572e7c43cb0c0d14000f11a6  Cyprus   
1  572e7c43cb0c0d14000f11a7  Cyprus   
2  572e7c43cb0c0d14000f11a8  Cyprus   
3  572e7c43cb0c0d14000f11a9  Cyprus   
4  572e7c43cb0c0d14000f11aa  Cyprus   

                                             context  \
0  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
1  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
2  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
3  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
4  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   

                                            question  \
0                What is the official name of Cypus?   
1                           Where is Cyprus located?   
2                  What countries are nearby Cyprus?   
3  What is Cyprus' affiliation with the European ...   
4  Is Cyprus an island country or land-locked cou...   

                                             answers  
0  {'answer_start': [99], 'text': ['

Now the data is preprocessed and available for **tokenization**

Tokenization converts the text data into a format that the BERT model can process. `BertTokenizerFast` is used for this purpose.

In [11]:
from transformers import BertTokenizerFast

# Load the fast BERT tokenizer from the pretrained uncased checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)

def preprocess_function(data, custom_tokenizer=None, max_length=384):
    """
    Tokenize question/context pairs and label each answer span by token index.

    Args:
        data: DataFrame with 'question', 'context' and 'answers' columns. Each
            'answers' cell is a dict whose 'answer_start' and 'text' entries
            are lists; only their first element is used.
        custom_tokenizer: Optional tokenizer to call instead of the
            notebook-level `tokenizer`. It must accept the same keyword
            arguments and return an object supporting item access and
            `sequence_ids(i)`.
        max_length: Length every encoded pair is padded/truncated to.

    Returns:
        The tokenizer output with two extra entries, 'start_positions' and
        'end_positions': lists holding, per example, the index of the first
        and last token of the answer.
    """
    active_tokenizer = tokenizer if custom_tokenizer is None else custom_tokenizer

    inputs = active_tokenizer(
        data['question'].tolist(),
        data['context'].tolist(),
        max_length=max_length,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors='pt'
    )

    # Read answers by position so frames with a non-default index
    # (e.g. after filtering) line up with the tokenizer output.
    answers = data['answers'].tolist()

    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])

        # Plain Python pairs are much cheaper to compare than tensor elements.
        offset_row = inputs['offset_mapping'][i]
        offsets = offset_row.tolist() if hasattr(offset_row, 'tolist') else offset_row
        sequence_ids = inputs.sequence_ids(i)

        # First and last token that belong to the context (sequence id 1).
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            # The answer is not fully inside the (possibly truncated) context.
            # NOTE(review): such examples are labelled with the first context
            # token, as before; pointing them at token 0 instead would be a
            # labelling change, so it is left for a separate decision.
            start_positions.append(context_start)
            end_positions.append(context_start)
            continue

        # Search ONLY the context tokens. Question-token offsets are relative
        # to the question string, so scanning from token 0 can match a
        # question token whenever the answer starts early in the context.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= start_char:
            token_start += 1
        start_positions.append(token_start - 1)

        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= end_char:
            token_end -= 1
        end_positions.append(token_end + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

# Tokenize and label both flattened frames
train_encodings, val_encodings = map(
    preprocess_function, (train_cleaned, validation_cleaned)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]