In [1]:
#dataset's link.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

In [2]:
import os
import requests

def get_data(url):
    """get the dataset from url and save it locally."""
    #check if files exist.
    if not os.path.exists('squad/train-v2.0.json') and not os.path.exists('squad/dev-v2.0.json'):
        if not os.path.exists('squad'):
            os.mkdir('squad')

        for file in ['train-v2.0.json', 'dev-v2.0.json']:
            res = requests.get(f'{url}{file}')
            with open(f'./squad/{file}', 'wb') as f:
                for chunk in res.iter_content(chunk_size=4):
                    f.write(chunk)

In [3]:
import json

def read_data(path):
    """get the dataset then return the data in a list of dict."""
    get_data(url)
    
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    #dict to store contexts, questions, and answers.
    datas = {'question': [], 'id': [], 'answers': [], 'is_impossible': [], 'context': []}

    #iterate through all data in squad.
    for data in squad_dict['data']:
        for paragraph in data['paragraphs']:
            context = paragraph['context'].strip()
            for qa in paragraph['qas']:
                question = qa['question'].strip()
                
                datas['question'].append(question)
                datas['id'].append(qa['id'])
                datas['answers'].append(qa['answers'])
                datas['is_impossible'].append(qa['is_impossible'])
                datas['context'].append(context)
                
    #return formatted lists of data.
    return datas

In [4]:
train_data = read_data('./squad/train-v2.0.json')
test_data = read_data('./squad/dev-v2.0.json')

In [5]:
from transformers import BertTokenizerFast, BertForQuestionAnswering
#name of bert model to use.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [6]:
max_length = 384 #the maximum length of a feature (question and context) - 3/4 of the model's maximum length.
doc_stride = 128 #the overlap between two part of the context when splittingis needed - since answer may lie at the splitting point.

In [7]:
def encode(data, max_length, doc_stride):
    encodings = tokenizer(data['question'],
                          data['context'],
                          max_length=max_length,
                          truncation='only_second',
                          stride=doc_stride,
                          return_overflowing_tokens=True,
                          return_offsets_mapping=True,
                          padding='max_length')
    return encodings

In [8]:
train_encodings = encode(train_data, max_length, doc_stride)
test_encodings = encode(test_data, max_length, doc_stride)

In [9]:
train_sample_mapping = train_encodings.pop('overflow_to_sample_mapping')
train_offset_mapping = train_encodings.pop('offset_mapping')

test_sample_mapping = test_encodings.pop('overflow_to_sample_mapping')
test_offset_mapping = test_encodings.pop('offset_mapping')

In [10]:
def add_position_tokens(data, encodings, sample_mapping, offset_mapping):
    start_positions = []
    end_positions = []

    for i, mapping_idx in enumerate(sample_mapping):
        start_pos = []
        end_pos = []
        answer = data['answers'][mapping_idx]
        offset = offset_mapping[i]
        if len(answer): #has an answer.
            answer = answer[0] #training data has at most 1 answer for each question.
            start_char = answer['answer_start']
            end_char = start_char + len(answer['text'])
            sequence_ids = encodings.sequence_ids(i)

            #find the start and end of the answer in context.
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            #if the answer is not fully inside the context, label it (0,0).
            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                #otherwise it's the start and end token positions.
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)
        else: #no answers, label with (0,0).
            start_positions.append(0)
            end_positions.append(0)
            
    encodings['start_positions'] = start_positions
    encodings['end_positions'] = end_positions

In [11]:
add_position_tokens(train_data, train_encodings, train_sample_mapping, train_offset_mapping)

In [12]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [13]:
train_dataset = SquadDataset(train_encodings)
test_dataset = SquadDataset(test_encodings)

In [14]:
import numpy as np
import collections

def process_predictions(raw_predictions, test_data, test_encodings, test_sample_mapping, test_offset_mapping, n_best_size=20, max_answer_length=30):
    predictions = []

    i = 0
    while i < len(raw_predictions):
        valid_answers = []
        min_null_score = None #if is_impossible, the correct answer is set to (0,0), this will compare the score at (0,0) with the best score produced.
        start_logits = []
        end_logits = []
        offset_mappings = []
        sequence_ids = []

        idx = 0 #store the number of features map to this test_data['answers'].
        #store all the logits belong to this mapping index in test_data['answers'].
        while test_sample_mapping[i+idx]==test_sample_mapping[i]:
            start_logits.append(raw_predictions[i+idx][0].cpu().numpy()) #get all start logits.
            end_logits.append(raw_predictions[i+idx][1].cpu().numpy()) #get all end logits.
            offset_mappings.append(test_offset_mapping[i+idx]) #get all offsets.
            sequence_ids.append(test_encodings.sequence_ids(i+idx))
            idx += 1
            if i + idx >= len(raw_predictions): #the very last iteration.
                break
        
        #go through the features map to this test_data['answers'].
        for j in range(idx):
            #update minimum null prediction.
            cls_index = test_encodings['input_ids'][test_sample_mapping[i+j]].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[j][cls_index] + end_logits[j][cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            #a list of possible start/end indexes.
            start_indexes = np.argsort(start_logits[j])[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits[j])[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    #out of length or not in context indexes.
                    if (start_index >= len(offset_mappings[j]) or 
                        end_index >= len(offset_mappings[j]) or 
                        sequence_ids[j][start_index] != 1 or 
                        sequence_ids[j][end_index] != 1):
                        continue
                    #negative length or length greater than the set max length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mappings[j][start_index][0]
                    end_char = offset_mappings[j][end_index][1]
                    valid_answers.append({'score': start_logits[j][start_index] + end_logits[j][end_index],
                                          'text': test_data['context'][test_sample_mapping[i+j]][start_char:end_char]})
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x['score'], reverse=True)[0]
        else:
            best_answer = {'text': "", 'score': 0.0} #dummy for rare edge case.

        predictions.append(best_answer['text'] if best_answer['score'] > min_null_score else "")
        
        #skip idx iterations since it has already been dealt with.
        i += idx
    return predictions

In [15]:
import re
import string

#the below functions are from evaluation script at: https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    if not s: return []
    return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def get_raw_scores(data, preds):
    exact_scores = []
    f1_scores = []
    for i, answers in enumerate(test_data['answers']):
        gold_answers = [a['text'] for a in answers if normalize_answer(a['text'])]
        if not gold_answers:
            gold_answers = ['']
        exact_scores.append(max(compute_exact(a, preds[i]) for a in gold_answers))
        f1_scores.append(max(compute_f1(a, preds[i]) for a in gold_answers))
    return exact_scores, f1_scores

In [17]:
def evaluate(model, test_dataloader):
    #evaluation mode.
    model.eval()

    #store variables.
    raw_predictions = []

    for batch in tqdm(test_dataloader):   
        #load into device.
        batch = tuple(batch[b].to(device) for b in batch if b != 'token_type_ids')

        #define inputs.
        inputs = {'input_ids':       batch[0],
                  'attention_mask':  batch[1]}

        #compute logits.
        with torch.no_grad():
            outputs = model(**inputs)

        #get predictions.
        start_pred = outputs.start_logits
        end_pred = outputs.end_logits
        
        for i in range(len(start_pred)):
            prediction = (start_pred[i], end_pred[i])
            #for computing accuracy.  
            raw_predictions.append(prediction)

    return raw_predictions

In [18]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

batch_size = 8
#use data loader.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#set up optimizer and scheduler for fine-tuning model.
optimizer = AdamW(model.parameters(),
                  lr=3e-5,
                  eps=1e-8,
                  weight_decay=1e-2)
epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader)*epochs)

#setup GPU/CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
#move model to device.
model.to(device)

for epoch in range(1, epochs+1):
    #set model to training mode.
    model.train()
    
    #setting initial loss to 0.
    loss_train_total = 0
    
    #setting up the progress bar.
    progress_bar = tqdm(train_dataloader,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    
    for batch in progress_bar:
        #set gradient to 0.
        model.zero_grad()

        #the batch will be a tuple of 4.
        batch = tuple(batch[b].to(device) for b in batch if b != 'token_type_ids')

        #dictionary of inputs.
        inputs = {'input_ids':       batch[0],
                  'attention_mask':  batch[1],
                  'start_positions': batch[2],
                  'end_positions':   batch[3]}
        
        #unpack the dict straight into inputs.
        outputs = model(**inputs)
        #extract loss.
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()

        #gradient clipping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        
    #check if path exists.
    if not os.path.exists('models'):
        os.mkdir('models')
    torch.save(model.state_dict(), f'models/Bert_qa_ft_epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_ave = loss_train_total/len(train_dataloader)
    tqdm.write(f'Training loss: {loss_train_ave}')
    
    raw_predictions = evaluate(model, test_dataloader)
    predictions = process_predictions(raw_predictions, 
                                      test_data,
                                      test_encodings, 
                                      test_sample_mapping, 
                                      test_offset_mapping, 
                                      n_best_size=20, 
                                      max_answer_length=30)
    exact_scores, f1_scores = get_raw_scores(test_data, predictions)
    exact_score, f1_score = np.average(exact_scores), np.average(f1_scores)
    tqdm.write(f'Exact match: {exact_score}')
    tqdm.write(f'F1 score: {f1_score}')

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 2070 Super


                                                                                                                       


Epoch 1
Training loss: 1.3042786912306121


100%|██████████████████████████████████████████████████████████████████████████████| 1517/1517 [03:12<00:00,  7.90it/s]


Exact match: 0.6717762991661753
F1 score: 0.705992944644306


                                                                                                                       


Epoch 2
Training loss: 0.7602272240335317


100%|██████████████████████████████████████████████████████████████████████████████| 1517/1517 [03:14<00:00,  7.80it/s]


Exact match: 0.7064768803166849
F1 score: 0.7374936603764435


                                                                                                                       


Epoch 3
Training loss: 0.47304157943793634


100%|██████████████████████████████████████████████████████████████████████████████| 1517/1517 [03:11<00:00,  7.90it/s]


Exact match: 0.6976332856059968
F1 score: 0.7326860566593234
