I am building an LLM-based question-answering model by fine-tuning a pre-trained BERT model on the Stanford Question Answering Dataset (SQuAD), obtained from Kaggle's public repository and also available on Hugging Face.

# Setting up Environment

In [None]:
# Install the libraries used in this notebook: Hugging Face transformers
# and datasets, plus PyTorch.
!pip install transformers
!pip install datasets
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

Here, I install the dependencies that I will need for my LLM model.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the SQuAD JSON files stored under
# /content/drive/MyDrive can be read by the loading cell.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and Preprocess Data

The dataset is in .json format, and here I load it with the pandas library by calling the read_json function.

In [None]:
import pandas as pd
from datasets import load_dataset

# Loading the SQuAD v1.1 training and validation files from Google Drive.
# The resulting frames have two columns: 'data' (one article dict per row,
# with a 'title' and its 'paragraphs') and 'version'.
train_data = pd.read_json("/content/drive/MyDrive/train-v1.1.json")
validation_data = pd.read_json("/content/drive/MyDrive/dev-v1.1.json")


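To manage computational resources, a random sample of 200 rows from the training data and 25 rows from the validation data is taken. Each row is one article containing many paragraphs and questions, so the samples still expand to thousands of question-answer pairs once they are flattened.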

In [None]:
# Selecting a random sample of 200 rows from the training data.
# Each row holds one article's 'data' dict, so this samples whole articles
# rather than individual questions; random_state=0 makes the draw repeatable.
train_sample = train_data.sample(n=200, random_state=0).reset_index(drop=True)

# Selecting a random sample of 25 rows (articles) from the validation data
validation_sample = validation_data.sample(n=25, random_state=0).reset_index(drop=True)

In [None]:
# Printing the first five rows of each sample to inspect the structure
print(train_sample.head())
print(validation_sample.head())

                                                data  version
0  {'title': 'Cyprus', 'paragraphs': [{'context':...      1.1
1  {'title': 'Nonprofit_organization', 'paragraph...      1.1
2  {'title': 'Alsace', 'paragraphs': [{'context':...      1.1
3  {'title': 'Humanism', 'paragraphs': [{'context...      1.1
4  {'title': 'Iran', 'paragraphs': [{'context': '...      1.1
                                                data  version
0  {'title': 'Construction', 'paragraphs': [{'con...      1.1
1  {'title': 'Computational_complexity_theory', '...      1.1
2  {'title': 'Pharmacy', 'paragraphs': [{'context...      1.1
3  {'title': 'Private_school', 'paragraphs': [{'c...      1.1
4  {'title': 'Jacksonville,_Florida', 'paragraphs...      1.1


The first five rows of the training and validation samples are printed to give an insight into what the dataset contains.

In [None]:
# Summary of the training sample: row count, column names, non-null counts,
# dtypes and memory usage
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   data     200 non-null    object 
 1   version  200 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.2+ KB


In [None]:
# Summary of the validation sample: row count, column names, non-null counts,
# dtypes and memory usage
validation_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   data     25 non-null     object 
 1   version  25 non-null     float64
dtypes: float64(1), object(1)
memory usage: 528.0+ bytes


In [None]:
# Importing the Necessary Libraries
from transformers import BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_metric
import torch


I defined a function to preprocess and prepare the data for tokenization by extracting relevant information such as context, questions, and answers.

In [None]:
def prepare_data_for_tokenizer(data):
    """
    Flatten raw SQuAD rows into one record per question.

    Every row of ``data`` must carry a ``'data'`` entry holding an article
    dict with a ``'title'`` and a list of ``'paragraphs'``; each paragraph
    has a ``'context'`` string and a list of ``'qas'``. Only the first
    answer of each question is kept, stored as single-element lists under
    ``'answer_start'`` and ``'text'``.

    Returns a DataFrame with the columns ``id``, ``title``, ``context``,
    ``question`` and ``answers`` (one row per question).
    """

    # One list per output column; all five are appended to in lock-step
    columns = {
        'id': [],
        'title': [],
        'context': [],
        'question': [],
        'answers': [],
    }

    # Walk article -> paragraph -> question
    for _, article_row in data.iterrows():
        article = article_row['data']
        article_title = article['title']

        for section in article['paragraphs']:
            passage = section['context']

            for entry in section['qas']:
                entry_id = entry['id']
                entry_question = entry['question']
                # Keep the first listed answer only
                first_answer = entry['answers'][0]

                columns['id'].append(entry_id)
                columns['title'].append(article_title)
                columns['context'].append(passage)
                columns['question'].append(entry_question)
                columns['answers'].append({
                    'answer_start': [first_answer['answer_start']],
                    'text': [first_answer['text']],
                })

    return pd.DataFrame(columns)

# Preparing the data: flatten each sampled article into one row per question
train_cleaned = prepare_data_for_tokenizer(train_sample)
validation_cleaned = prepare_data_for_tokenizer(validation_sample)


In [None]:
# Displaying the first rows of the cleaned data to verify the structure
# (columns: id, title, context, question, answers)
print(train_cleaned.head())
print(validation_cleaned.head())


                         id   title  \
0  572e7c43cb0c0d14000f11a6  Cyprus   
1  572e7c43cb0c0d14000f11a7  Cyprus   
2  572e7c43cb0c0d14000f11a8  Cyprus   
3  572e7c43cb0c0d14000f11a9  Cyprus   
4  572e7c43cb0c0d14000f11aa  Cyprus   

                                             context  \
0  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
1  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
2  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
3  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   
4  Cyprus (i/ˈsaɪprəs/; Greek: Κύπρος IPA: [ˈcipr...   

                                            question  \
0                What is the official name of Cypus?   
1                           Where is Cyprus located?   
2                  What countries are nearby Cyprus?   
3  What is Cyprus' affiliation with the European ...   
4  Is Cyprus an island country or land-locked cou...   

                                             answers  
0  {'answer_start': [99], 'text': ['

Now the data is preprocessed and available for tokenization

# Tokenize Data

Tokenization converts the text data into a format that the BERT model can process. The BertTokenizerFast is used for this purpose.

In [None]:
from transformers import BertTokenizerFast

# Loading the fast BERT tokenizer for the 'bert-base-uncased' checkpoint.
# The fast variant is required by the preprocessing step, which requests
# character offset mappings from the tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def _find_answer_token_span(offsets, context_start, context_end, start_char, end_char):
    """Map a character span of the context onto inclusive token indices.

    Only tokens in ``context_start..context_end`` are inspected. Question
    tokens carry offsets relative to the question string, so including them
    could match the wrong token.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for one example.
        context_start: index of the first context token.
        context_end: index of the last context token (inclusive).
        start_char: character index where the answer begins in the context.
        end_char: character index one past the end of the answer.

    Returns:
        ``(start_token, end_token)``, both inclusive, with
        ``start_token <= end_token``.
    """
    # First context token that ends after the answer starts. This is the token
    # containing start_char, or the next one if start_char sits on whitespace.
    start_token = next(
        (idx for idx in range(context_start, context_end + 1)
         if offsets[idx][1] > start_char),
        context_end,
    )
    # Last context token that starts before the answer ends.
    end_token = next(
        (idx for idx in range(context_end, context_start - 1, -1)
         if offsets[idx][0] < end_char),
        context_start,
    )
    # Guard against a degenerate (empty or whitespace-only) answer span.
    return start_token, max(start_token, end_token)


def preprocess_function(data):
    """Tokenize question/context pairs and attach answer token positions.

    Args:
        data: DataFrame with ``'question'`` and ``'context'`` string columns
            and an ``'answers'`` column of dicts holding single-element
            ``'answer_start'`` and ``'text'`` lists.

    Returns:
        The tokenizer output (padded and truncated to 384 tokens, PyTorch
        tensors, including ``offset_mapping``) with two extra entries,
        ``'start_positions'`` and ``'end_positions'``: lists holding, for each
        example, the inclusive token indices of the answer span.
    """
    inputs = tokenizer(
        data['question'].tolist(),
        data['context'].tolist(),
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors='pt'
    )

    start_positions = []
    end_positions = []

    # Iterate the answers positionally so a non-default DataFrame index
    # cannot misalign answers and encodings.
    for i, answer in enumerate(data['answers']):
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        # Plain Python lists make the per-token comparisons below cheap.
        offset_mapping = inputs['offset_mapping'][i].tolist()
        sequence_ids = inputs.sequence_ids(i)

        # Context tokens are the ones whose sequence id is 1.
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        if offset_mapping[context_start][0] > start_char or offset_mapping[context_end][1] < end_char:
            # The answer is not fully inside the (possibly truncated) context:
            # label the example with the first context token.
            # NOTE(review): the Hugging Face QA example uses index 0 ([CLS])
            # for this case - confirm which convention is wanted.
            start_positions.append(context_start)
            end_positions.append(context_start)
        else:
            start_token, end_token = _find_answer_token_span(
                offset_mapping, context_start, context_end, start_char, end_char
            )
            start_positions.append(start_token)
            end_positions.append(end_token)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

# Tokenize both splits and attach the answer start/end token positions
train_encodings = preprocess_function(train_cleaned)
val_encodings = preprocess_function(validation_cleaned)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Prepare Dataset Class

I created a custom dataset class QADataset to handle the tokenized data, making it compatible with PyTorch's DataLoader.

In [None]:
from torch.utils.data import Dataset

class QADataset(Dataset):
    """Map-style PyTorch dataset over tokenized question-answering encodings.

    ``encodings`` is a mapping from field name to a per-example container
    (a tensor with one row per example, or a list with one entry per
    example). It must contain ``'input_ids'``, whose length defines the
    dataset size.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of freshly allocated tensors."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(tensor) emits a copy-construct UserWarning on
                # every call; clone().detach() is the recommended equivalent.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

# Wrap the tokenized encodings so they can be consumed as PyTorch datasets
train_dataset = QADataset(train_encodings)
val_dataset = QADataset(val_encodings)


# Fine-tune BERT Model

In [None]:
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments

# Loading the pre-trained BERT model with a question-answering head.
# The qa_outputs layer is not part of the checkpoint and is newly
# initialised, so it is learned during the fine-tuning below.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Defining the training arguments: checkpoints go to ./results, evaluation
# runs once per epoch, and training lasts 3 epochs with batches of 16.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initializing the Trainer.
# No compute_metrics callback is passed here, so the per-epoch evaluation
# reports the validation loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,1.5107,1.482655
2,1.1354,1.426388
3,0.8862,1.468098


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

TrainOutput(global_step=7788, training_loss=1.3071323545639124, metrics={'train_runtime': 3787.0433, 'train_samples_per_second': 32.903, 'train_steps_per_second': 2.056, 'total_flos': 2.441916177981696e+16, 'train_loss': 1.3071323545639124, 'epoch': 3.0})

I fine-tuned the pre-trained BERT model using the training dataset. The Trainer class from the transformers library simplifies the training process.

# Evaluate Model

I then evaluated the model using the validation dataset, and the results are printed.

In [None]:
from datasets import load_metric

# Loading the evaluation metric
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm before upgrading.
metric = load_metric("squad")

def compute_metrics(p):
    """Compute SQuAD exact-match and F1 from span logits on the validation set.

    Args:
        p: prediction object whose ``predictions`` attribute starts with
            ``(start_logits, end_logits)``, each indexable as
            ``[example, token]``. The examples must be the validation
            examples in their original order, because the offsets and
            reference answers are read from the module-level
            ``val_encodings`` and ``validation_cleaned``.

    Returns:
        The dict produced by the SQuAD metric.
    """
    start_logits, end_logits = p.predictions[:2]
    start_tokens = start_logits.argmax(axis=-1)
    end_tokens = end_logits.argmax(axis=-1)

    predictions = []
    references = []
    for i, (start_token, end_token) in enumerate(zip(start_tokens, end_tokens)):
        start_token = int(start_token)
        end_token = int(end_token)
        sequence_ids = val_encodings.sequence_ids(i)
        offsets = val_encodings['offset_mapping'][i]
        example = validation_cleaned.iloc[i]

        # Only a forward span that lies entirely inside the context can be
        # mapped back to text; anything else counts as an empty prediction.
        if (
            start_token <= end_token
            and sequence_ids[start_token] == 1
            and sequence_ids[end_token] == 1
        ):
            char_start = int(offsets[start_token][0])
            char_end = int(offsets[end_token][1])
            prediction_text = example['context'][char_start:char_end]
        else:
            prediction_text = ""

        predictions.append({'id': example['id'], 'prediction_text': prediction_text})
        references.append({'id': example['id'], 'answers': example['answers']})

    return metric.compute(predictions=predictions, references=references)

# Evaluating the model: one prediction pass yields both the loss statistics
# and the logits needed for the text-level SQuAD scores.
prediction_output = trainer.predict(val_dataset, metric_key_prefix="eval")
results = dict(prediction_output.metrics)
results.update(compute_metrics(prediction_output))
print(results)

  metric = load_metric("squad")


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

The repository for squad contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/squad.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] Y


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 1.4680976867675781, 'eval_runtime': 44.8276, 'eval_samples_per_second': 104.757, 'eval_steps_per_second': 6.558, 'epoch': 3.0}


# Example Question-Answer Pairs

To demonstrate the model's capability, I asked it an example question about a short context passage.

In [None]:
import torch

def answer_question(question, context, max_length=512):
    """Extract the answer to ``question`` from ``context`` with the fine-tuned model.

    Args:
        question: the question text.
        context: the passage expected to contain the answer.
        max_length: maximum number of tokens fed to the model. Longer
            contexts are truncated (the question is never truncated) so the
            input cannot exceed the model's position limit.

    Returns:
        The decoded text between the highest-scoring start token and the
        highest-scoring end token. This is an empty string when the predicted
        end precedes the predicted start.
    """
    inputs = tokenizer.encode_plus(
        question,
        context,
        max_length=max_length,
        truncation="only_second",
        return_tensors='pt',
    )
    input_ids = inputs['input_ids'].tolist()[0]

    # Ensuring the model and inputs are on the same device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert the 0-d tensors to plain ints before slicing the token id list;
    # the end index is exclusive, hence the +1.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer


In [None]:
# Example: ask one question about a short hand-written context and print
# the question together with the model's extracted answer
question = "When did the United Kingdom join the European Union?"
context = "The United Kingdom joined the European Union on January 1, 1973. It was a significant moment in British history, marking the beginning of the UK's integration into the European political and economic sphere."
print(f"Q: {question}\nA: {answer_question(question, context)}")


Q: When did the United Kingdom join the European Union?
A: january 1, 1973
