In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit->simpletransformers)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [11]:
import json
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset


In [12]:
# Read train data
with open(r"/kaggle/input/stanford-question-answering-dataset/train-v1.1.json", "r") as read_file:
    train = json.load(read_file)

In [13]:
# Read test data
with open(r"/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json", "r") as read_file:
    test = json.load(read_file)

In [14]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
config = {"max_length": 384}

# Preprocess function
def preprocess_function(question, context, answer_start_char, answer_end_char):
    inputs = tokenizer(
        question,
        context,
        max_length=config["max_length"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    offset = inputs.pop("offset_mapping")
    sequence_ids = inputs.sequence_ids()
    
    # Finding the start and end positions of the context in the tokenized input
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - sequence_ids[::-1].index(1)
    
    context_offsets = offset[context_start:context_end]
    
    # Create a mapping of character index to token index
    charcter_pos_to_token_pos = {}
    for token_pos, (char_start, char_end) in enumerate(context_offsets):
        for i in range(char_start, char_end):
            charcter_pos_to_token_pos[i] = token_pos + context_start
            
    start_pos = charcter_pos_to_token_pos.get(answer_start_char, 0)
    end_pos = charcter_pos_to_token_pos.get(
        answer_end_char - 1, 
        0 if start_pos == 0 else context_end - 1
    )
    
    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos
        
    return inputs

In [16]:
# Create dataset from JSON data
def create_dataset(data):
    input_ids_list = []
    attention_mask_list = []
    start_positions_list = []
    end_positions_list = []
    
    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer_start_char = qa['answers'][0]['answer_start']
                answer_text = qa['answers'][0]['text']
                answer_end_char = answer_start_char + len(answer_text)
                
                inputs = preprocess_function(question, context, answer_start_char, answer_end_char)
                
                input_ids_list.append(inputs["input_ids"])
                attention_mask_list.append(inputs["attention_mask"])
                start_positions_list.append(inputs["start_positions"])
                end_positions_list.append(inputs["end_positions"])
    
    return Dataset.from_dict({
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "start_positions": start_positions_list,
        "end_positions": end_positions_list,
    })

train_dataset = create_dataset(train)
eval_dataset = create_dataset(test)

**https://simpletransformers.ai/docs/qa-model/**

In [17]:
# Define token-level IoU custom metric
def token_level_iou(pred_start, pred_end, true_start, true_end):
    pred_range = set(range(pred_start, pred_end + 1))
    true_range = set(range(true_start, true_end + 1))
    
    intersection = len(pred_range & true_range)
    union = len(pred_range | true_range)
    
    return intersection / union if union != 0 else 0


In [19]:
train_dataset = train_dataset.select(range(1000))  # Train on a subset of 1000 examples
eval_dataset = eval_dataset.select(range(200))  

In [20]:

# Model initialization
model = BertForQuestionAnswering.from_pretrained("distilbert-base-uncased")


# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bi

Epoch,Training Loss,Validation Loss
1,No log,4.387908
2,No log,4.098793
3,No log,4.180162


TrainOutput(global_step=375, training_loss=4.517264322916667, metrics={'train_runtime': 136.2951, 'train_samples_per_second': 22.011, 'train_steps_per_second': 2.751, 'total_flos': 587917702656000.0, 'train_loss': 4.517264322916667, 'epoch': 3.0})

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Inference pipeline function
def inference_pipeline(question, context):
    # Encode the inputs
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    input_ids = inputs['input_ids']
    token_type_ids = inputs['token_type_ids']
    
    # Get the model's predictions
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=token_type_ids)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    # Debugging: Print the scores and input IDs
    print("Input IDs:", input_ids)
    print("Start Scores:", start_scores)
    print("End Scores:", end_scores)

    # Find the tokens with the highest start and end scores
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1

    # Debugging: Print the start and end positions
    print("Answer Start:", answer_start.item())
    print("Answer End:", answer_end.item())

    # Decode the tokens back to the answer text
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end])
    )

    return answer

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Example usage
question = "What is the capital of France?"
context = "Paris is the capital of France."
predicted_answer = inference_pipeline(question, context)
print(f"Predicted Answer: {predicted_answer}")

Input IDs: tensor([[ 101, 2054, 2003, 1996, 3007, 1997, 2605, 1029,  102, 3000, 2003, 1996,
         3007, 1997, 2605, 1012,  102]])
Start Scores: tensor([[ 0.1354, -0.3520, -0.0773,  0.2274,  0.5262, -0.0657,  0.2304, -0.1465,
          0.4132,  0.1155, -0.1289, -0.0262,  0.2893, -0.1477,  0.4441,  0.3073,
          0.3193]])
End Scores: tensor([[-0.3427, -0.3448, -0.0414,  0.1092,  0.0275,  0.0715, -0.1793, -0.2158,
         -0.0401,  0.1829, -0.0125,  0.1208,  0.3932,  0.1765,  0.0037,  0.2936,
          0.2919]])
Answer Start: 4
Answer End: 13
Predicted Answer: capital of france ? [SEP] paris is the capital
