In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
# Check if MPS is available and set device accordingly
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [2]:
import json

with open('train-v1.1.json') as train_file:
    train_data = json.load(train_file)

with open('dev-v1.1.json') as dev_file:
    dev_data = json.load(dev_file)

In [3]:
from datasets import Dataset

def format_qa_dataset(data):
    context_list = []
    question_list = []
    answer_list = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            context_text = paragraph['context']
            for qa_pair in paragraph['qas']:
                question_text = qa_pair['question']
                answer_dict = qa_pair['answers'][0]  # Taking the first answer only
                answer_dict['text'] = answer_dict['text']
                answer_dict['answer_start'] = answer_dict['answer_start']

                context_list.append(context_text)
                question_list.append(question_text)
                answer_list.append(answer_dict)
    
    return Dataset.from_dict({'context': context_list, 'question': question_list, 'answers': answer_list})

In [4]:
train_dataset_formatted = format_qa_dataset(train_data)
dev_dataset_formatted = format_qa_dataset(dev_data)

In [5]:
# Load model directly
from transformers import AutoTokenizer

pretrained_model_name = 'distilbert/distilbert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [6]:
def tokenize_and_align(examples):
    clean_questions = [q.strip() for q in examples['question']]
    tokenized_inputs = bert_tokenizer(
        clean_questions,
        examples['context'],
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt"
    )
    
    start_positions_list = []
    end_positions_list = []
    
    for idx, ans in enumerate(examples['answers']):
        start_positions_list.append(ans['answer_start'])
        end_positions_list.append(ans['answer_start'] + len(ans['text']))
    
    tokenized_inputs.update({
        "start_positions": start_positions_list,
        "end_positions": end_positions_list,
    })
    
    return tokenized_inputs

In [7]:
tokenized_train_dataset = train_dataset_formatted.map(tokenize_and_align, batched=True)
tokenized_dev_dataset = dev_dataset_formatted.map(tokenize_and_align, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

qa_model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name).to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_parameters = TrainingArguments(
    output_dir="./qa_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=False
)

In [10]:
model_trainer = Trainer(
    model=qa_model,
    args=training_parameters,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_dev_dataset
)

In [11]:
model_trainer.train()

  0%|          | 0/10950 [00:00<?, ?it/s]

{'loss': 5.9166, 'grad_norm': 3.2423887252807617, 'learning_rate': 1.9086757990867582e-05, 'epoch': 0.09}
{'loss': 5.7097, 'grad_norm': 8.277706146240234, 'learning_rate': 1.8173515981735163e-05, 'epoch': 0.18}
{'loss': 5.5541, 'grad_norm': 14.45805835723877, 'learning_rate': 1.726027397260274e-05, 'epoch': 0.27}
{'loss': 5.4709, 'grad_norm': 8.233311653137207, 'learning_rate': 1.634703196347032e-05, 'epoch': 0.37}
{'loss': 5.3793, 'grad_norm': 15.235696792602539, 'learning_rate': 1.54337899543379e-05, 'epoch': 0.46}
{'loss': 5.3542, 'grad_norm': 16.16097068786621, 'learning_rate': 1.4520547945205482e-05, 'epoch': 0.55}
{'loss': 5.3063, 'grad_norm': 19.32600212097168, 'learning_rate': 1.360730593607306e-05, 'epoch': 0.64}
{'loss': 5.24, 'grad_norm': 18.49815559387207, 'learning_rate': 1.2694063926940641e-05, 'epoch': 0.73}
{'loss': 5.2002, 'grad_norm': 22.82916259765625, 'learning_rate': 1.178082191780822e-05, 'epoch': 0.82}
{'loss': 5.1659, 'grad_norm': 30.131479263305664, 'learning_r

  0%|          | 0/661 [00:00<?, ?it/s]

{'eval_loss': 5.0377936363220215, 'eval_runtime': 177.0618, 'eval_samples_per_second': 59.697, 'eval_steps_per_second': 3.733, 'epoch': 1.0}
{'loss': 5.1149, 'grad_norm': 22.424560546875, 'learning_rate': 9.95433789954338e-06, 'epoch': 1.0}
{'loss': 4.9654, 'grad_norm': 21.65566062927246, 'learning_rate': 9.04109589041096e-06, 'epoch': 1.1}
{'loss': 4.9639, 'grad_norm': 34.950931549072266, 'learning_rate': 8.127853881278539e-06, 'epoch': 1.19}
{'loss': 4.9469, 'grad_norm': 15.26646614074707, 'learning_rate': 7.214611872146119e-06, 'epoch': 1.28}
{'loss': 4.9264, 'grad_norm': 16.952180862426758, 'learning_rate': 6.301369863013699e-06, 'epoch': 1.37}
{'loss': 4.9043, 'grad_norm': 21.248090744018555, 'learning_rate': 5.388127853881279e-06, 'epoch': 1.46}
{'loss': 4.863, 'grad_norm': 23.045116424560547, 'learning_rate': 4.4748858447488585e-06, 'epoch': 1.55}
{'loss': 4.8632, 'grad_norm': 15.300145149230957, 'learning_rate': 3.5616438356164386e-06, 'epoch': 1.64}
{'loss': 4.8221, 'grad_norm

  0%|          | 0/661 [00:00<?, ?it/s]

{'eval_loss': 4.827496528625488, 'eval_runtime': 176.7432, 'eval_samples_per_second': 59.804, 'eval_steps_per_second': 3.74, 'epoch': 2.0}
{'train_runtime': 10116.0486, 'train_samples_per_second': 17.319, 'train_steps_per_second': 1.082, 'train_loss': 5.1433682933789955, 'epoch': 2.0}


TrainOutput(global_step=10950, training_loss=5.1433682933789955, metrics={'train_runtime': 10116.0486, 'train_samples_per_second': 17.319, 'train_steps_per_second': 1.082, 'total_flos': 1.7167621364554752e+16, 'train_loss': 5.1433682933789955, 'epoch': 2.0})

In [13]:
from transformers import pipeline

qa_pipeline_model = pipeline("question-answering", model=qa_model, tokenizer=bert_tokenizer, device=0)

In [136]:
sample_context = "I am Sumeru. I like playing Poker."
sample_question = "What does Sumeru like?"

In [137]:
qa_result = qa_pipeline_model({
    'context': sample_context,
    'question': sample_question
})

In [138]:
print("Prediction:", qa_result)

Prediction: {'score': 0.021213460713624954, 'start': 28, 'end': 33, 'answer': 'Poker'}


In [139]:
predicted_start_idx = qa_result['start']
predicted_end_idx = qa_result['end']

In [140]:
actual_answer = "Poker"
actual_start_idx = sample_context.find(actual_answer)
actual_end_idx = actual_start_idx + len(actual_answer)

In [141]:
def calculate_iou(pred_span, true_span):
    pred_token_indices = set(range(pred_span['start_positions'], pred_span['end_positions']))
    true_token_indices = set(range(true_span['start_positions'], true_span['end_positions']))
    
    if not pred_token_indices or not true_token_indices:
        return 0.0
    
    intersection_len = len(pred_token_indices & true_token_indices)
    union_len = len(pred_token_indices | true_token_indices)
    
    return intersection_len / union_len if union_len != 0 else 0

In [142]:
predicted_span = {'start_positions': predicted_start_idx, 'end_positions': predicted_end_idx}
true_span = {'start_positions': actual_start_idx, 'end_positions': actual_end_idx}

In [143]:
iou_score_value = calculate_iou(predicted_span, true_span)
print("Predicted Answer:", qa_result['answer'])
print("True Answer:", actual_answer)
print("Token-level IoU:", iou_score_value)

Predicted Answer: Poker
True Answer: Poker
Token-level IoU: 1.0
