In [1]:
import json
import numpy as np
import pandas as pd
import os

In [2]:
def extract_df(file):
    # initialize dictionary
    diction = {'id':[],'questions':[],'context_id':[],'contexts':[],'answers':[]}
    data = pd.read_json(file)
    context_id = 0
    for i in range(len(data)):
        dp = data.iloc[[i]]
        dt = dp.data[i]
        title = dt['title']
        para = dt['paragraphs']
        for el in para:
            context = el['context']
            context_id+=1
            qas = el['qas']
            for qa in qas:
                ans = qa['answers'][0]
                answers = ans
                question = qa['question']
                id = qa['id']               
                diction['id'].append(id)
                diction['questions'].append(question)
                diction['context_id'].append(context_id)
                diction['contexts'].append(context)
                diction['answers'].append(answers)   
      
    # dictionary to dataframe
    df = pd.DataFrame(diction)
    return df

In [3]:
training_df = extract_df("squad1.1/train-v1.1.json")
validation_df = extract_df("squad1.1/dev-v1.1.json")

In [15]:
print(f"There are {len(training_df)} questions in the training dataset")
print(f"There are {len(validation_df)} questions in the validation dataset")

There are 87599 questions in the training dataset
There are 10570 questions in the validation dataset


In [16]:
print(f"There are {len(training_df.contexts.unique())} unique contexts in the training dataset")
print(f"There are {len(validation_df.contexts.unique())} unique contexts in the validation dataset")

There are 18891 unique contexts in the training dataset
There are 2067 unique contexts in the validation dataset


In [263]:
train_contexts, train_questions, train_answers = training_df['contexts'].to_list(),training_df['questions'].to_list(),training_df['answers'].to_list()
val_contexts, val_questions, val_answers = validation_df['contexts'].to_list(),validation_df['questions'].to_list(),validation_df['answers'].to_list()

In [264]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        answer_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(answer_text)
        if context[start_idx:end_idx] == answer_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == answer_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

In [271]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [272]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [273]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
            go_back +=1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [274]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [275]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

EPOCHS = 3

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optim = AdamW(model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(EPOCHS):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
    
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
    
        loss = outputs[0]
        loss.backward()
     
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [278]:
from sklearn import metrics
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
val_loader = DataLoader(val_dataset, batch_size=16)

acc = []
predicted = []
ground_truth = []
# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        # predict the f1 and other metrics
        predict = list(start_pred.cpu().numpy()+ end_pred.cpu().numpy())
        gt = list(start_true.cpu().numpy() + end_true.cpu().numpy())
        predicted.extend(predict)
        ground_truth.extend(gt)
# calculate average accuracy in total
acc = sum(acc)/len(acc)
bert_report =  metrics.classification_report(ground_truth,predicted)
print(bert_report)

100%|██████████| 661/661 [00:37<00:00, 17.82it/s]


In [280]:
print(f"accuracy is {acc * 100}%")

accuracy is 64.19629349560672%


In [57]:
'''SQuAD2.0'''
training_path = "squad2.0/train-v2.0.json"
validation_path = "squad2.0/dev-v2.0.json"

In [58]:
def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa['answers']:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers


def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        answer_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(answer_text)
        if context[start_idx:end_idx] == answer_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == answer_text:
                   
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n


def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):

        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
    
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-go_back)
            go_back +=1

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [59]:
train_contexts, train_questions, train_answers = read_squad(training_path)
val_contexts, val_questions, val_answers = read_squad(validation_path)

In [60]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [61]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [62]:
# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [63]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [65]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [66]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

EPOCHS = 5
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(EPOCHS):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Epoch 0: 100%|██████████| 5427/5427 [15:33<00:00,  5.81it/s, loss=1.46] 
Epoch 1: 100%|██████████| 5427/5427 [15:32<00:00,  5.82it/s, loss=1.18] 
Epoch 2: 100%|██████████| 5427/5427 [15:32<00:00,  5.82it/s, loss=0.977]
Epoch 3: 100%|██████████| 5427/5427 [15:35<00:00,  5.80it/s, loss=0.367] 
Epoch 4: 100%|██████████| 5427/5427 [15:33<00:00,  5.81it/s, loss=0.738] 
Epoch 5: 100%|██████████| 5427/5427 [15:34<00:00,  5.81it/s, loss=0.651]  
Epoch 6: 100%|██████████| 5427/5427 [15:37<00:00,  5.79it/s, loss=0.121]  
Epoch 7: 100%|██████████| 5427/5427 [15:38<00:00,  5.78it/s, loss=0.209]  
Epoch 8: 100%|██████████| 5427/5427 [15:36<00:00,  5.80it/s, loss=0.326]  
Epoch 9: 100%|██████████| 5427/5427 [15:36<00:00,  5.80it/s, loss=0.16]   
Epoch 10: 100%|██████████| 5427/5427 [15:39<00:00,  5.77it/s, loss=0.14]   
Epoch 11: 100%|██████████| 5427/5427 [15:33<00:00,  5.82it/s, loss=0.28]   
Epoch 12: 100%|██████████| 5427/5427 [15:31<00:00,  5.82it/s, loss=0.0266] 
Epoch 13: 100%|██████████| 542

('models/distilbert-custom/tokenizer_config.json',
 'models/distilbert-custom/special_tokens_map.json',
 'models/distilbert-custom/vocab.txt',
 'models/distilbert-custom/added_tokens.json',
 'models/distilbert-custom/tokenizer.json')

In [67]:
from sklearn import metrics
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
val_loader = DataLoader(val_dataset, batch_size=16)

acc = []
predicted = []
ground_truth = []
# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        # predict the f1 and other metrics
        predict = list(start_pred.cpu().numpy()+ end_pred.cpu().numpy())
        gt = list(start_true.cpu().numpy() + end_true.cpu().numpy())
        predicted.extend(predict)
        ground_truth.extend(gt)
# calculate average accuracy in total
acc = sum(acc)/len(acc)
bert_report =  metrics.classification_report(ground_truth,predicted)
print(bert_report)

100%|██████████| 1269/1269 [01:10<00:00, 17.90it/s]

              precision    recall  f1-score   support

           2       0.66      0.88      0.76        95
           3       0.73      0.76      0.74        91
           4       0.58      0.61      0.59       138
           5       0.65      0.76      0.70       131
           6       0.49      0.49      0.49       140
           7       0.55      0.56      0.56        96
           8       0.42      0.51      0.46        93
           9       0.56      0.54      0.55       100
          10       0.60      0.59      0.60        95
          11       0.63      0.67      0.65        78
          12       0.69      0.62      0.65        92
          13       0.74      0.61      0.67        69
          14       0.54      0.62      0.58       100
          15       0.55      0.59      0.57        71
          16       0.56      0.68      0.61       133
          17       0.59      0.54      0.56       104
          18       0.48      0.59      0.52        82
          19       0.49    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
