In [21]:
!nvidia-smi

Fri Apr 16 13:16:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    34W / 250W |   8671MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [22]:
!pip install --quiet transformers
!pip install --quiet tqdm

In [23]:
import os 
import json
import requests 
import pprint
import torch

from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

## Download the SQuAD v2 dataset and store it in the Google Drive Folder

In [4]:
PATH = "/content/drive/MyDrive/SQuAD"
res = requests.get(f'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json')

In [None]:
# loop through
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # make the request to download data over HTTP
    res = requests.get(f'{url}{file}')
    # write to file
    with open(f'{PATH}/{file}', 'wb') as f:
        for chunk in res.iter_content(chunk_size=4):
            f.write(chunk)

## There are three parts to the dataset i.e. *Questions*, *Contexts*, *Answers*

1. Questions - strings containing the questionwhich we will ask our model.
2. Contexts - larger segments of text which contain the answers to the questions
3. Answers - extracts from the contexts that provide an answer to the question


In [5]:
def read_data(path):
  """
  Reads in the data and returns the contexts, questions and answers from the JSON object
  @params: path
  Path to the JSON data 
  """
  # open JSON file and load intro dictionary
  with open(path, 'rb') as f:
      squad_dict = json.load(f)

  # initialize lists for contexts, questions, and answers
  contexts = []
  questions = []
  answers = []
  # iterate through all data in squad data
  for group in squad_dict['data']:
      for passage in group['paragraphs']:
          context = passage['context']
          for qa in passage['qas']:
              question = qa['question']
              # check if we need to be extracting from 'answers' or 'plausible_answers'
              if 'plausible_answers' in qa.keys():
                  access = 'plausible_answers'
              else:
                  access = 'answers'
              for answer in qa[access]:
                  # append data to lists
                  contexts.append(context)
                  questions.append(question)
                  answers.append(answer)
  # return formatted data lists
  return contexts, questions, answers

In [6]:
# Extracting the contexts, questions and answers from the paths
train_contexts, train_questions, train_answers = read_data(f'{PATH}/train-v2.0.json')
val_contexts, val_questions, val_answers = read_data(f'{PATH}/dev-v2.0.json')
assert all(len(lst) > 0 for lst in [train_contexts, train_questions, train_answers])
assert all(len(lst) > 0 for lst in [val_contexts, val_questions, val_answers])

In [7]:
# Multiple questions and answers can stem from the same context!
for values in zip(train_contexts, train_questions, train_answers):
  pprint.pprint(f"Context: {values[0]}\nQuestion: {values[1]}\nAnswer: {values[2]}\n")
  break

('Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born '
 'September 4, 1981) is an American singer, songwriter, record producer and '
 'actress. Born and raised in Houston, Texas, she performed in various singing '
 'and dancing competitions as a child, and rose to fame in the late 1990s as '
 "lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew "
 "Knowles, the group became one of the world's best-selling girl groups of all "
 "time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in "
 'Love (2003), which established her as a solo artist worldwide, earned five '
 'Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy '
 'in Love" and "Baby Boy".\n'
 'Question: When did Beyonce start becoming popular?\n'
 "Answer: {'text': 'in the late 1990s', 'answer_start': 269}\n")


## The model will be trained to find the start and the end of the answer within the context so there needs to be a value to signify the end of the answer called *answer_end*

In [8]:
def add_end_idx(answers, contexts):
  """
  Adds the position of where the end of the answer is ,called "answer_end" to the context based on the answer
  @params: answers, contexts
  answers is the list of answers from the JSON data
  contexts is the list of contexts from the JSON data
  """
  # loop through each answer-context pair
  for answer, context in zip(answers, contexts):
      # gold_text refers to the answer we are expecting to find in context
      gold_text = answer['text']
      # we already know the start index
      start_idx = answer['answer_start']
      # and ideally this would be the end index...
      end_idx = start_idx + len(gold_text)

      # some of the answers are off by a character or two
      if context[start_idx:end_idx] == gold_text:
          # if the answer is not off :)
          answer['answer_end'] = end_idx
      else:
          # this means the answer is off by 1-2 tokens
          for n in [1, 2]:
              if context[start_idx-n:end_idx-n] == gold_text:
                  answer['answer_start'] = start_idx - n
                  answer['answer_end'] = end_idx - n

In [9]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

## Now the answers contain where the particular answer ends 

In [10]:
train_answers[:10]

[{'answer_end': 286, 'answer_start': 269, 'text': 'in the late 1990s'},
 {'answer_end': 226, 'answer_start': 207, 'text': 'singing and dancing'},
 {'answer_end': 530, 'answer_start': 526, 'text': '2003'},
 {'answer_end': 180, 'answer_start': 166, 'text': 'Houston, Texas'},
 {'answer_end': 286, 'answer_start': 276, 'text': 'late 1990s'},
 {'answer_end': 335, 'answer_start': 320, 'text': "Destiny's Child"},
 {'answer_end': 524, 'answer_start': 505, 'text': 'Dangerously in Love'},
 {'answer_end': 374, 'answer_start': 360, 'text': 'Mathew Knowles'},
 {'answer_end': 286, 'answer_start': 276, 'text': 'late 1990s'},
 {'answer_end': 301, 'answer_start': 290, 'text': 'lead singer'}]

## Tokenization of the strings

In [11]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




## Now the context-question pairs are merged in the format expected by BERT/DistillBert where the the questions and context are concatenated but separeted by the [SEP] token



In [12]:
pprint.pprint(tokenizer.decode(train_encodings['input_ids'][0]))

('[CLS] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( '
 'born september 4, 1981 ) is an american singer, songwriter, record producer '
 'and actress. born and raised in houston, texas, she performed in various '
 'singing and dancing competitions as a child, and rose to fame in the late '
 "1990s as lead singer of r & b girl - group destiny's child. managed by her "
 "father, mathew knowles, the group became one of the world's best - selling "
 "girl groups of all time. their hiatus saw the release of beyonce's debut "
 'album, dangerously in love ( 2003 ), which established her as a solo artist '
 'worldwide, earned five grammy awards and featured the billboard hot 100 '
 'number - one singles " crazy in love " and " baby boy ". [SEP] when did '
 'beyonce start becoming popular? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

## The issue with the tokenizer is that it doesn't produce the answer start-end token positions

In [13]:
def add_token_positions(encodings, answers):
  """
  Adds two more attributes called "start_positions" and "end_positions" to the Encoding objects
  as the tokenizer does not include them.
  "start_positions" and "end_positions" are just lists that contain the start/end token positions of the answer
  that corresponds to their respective question-context pairs. 
  @params: encodings, answers
  encodings is the list of encodings 
  answers is the list of answers 
  """
  # initialize lists to contain the token indices of answer start/end
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
      # append start/end token position using char_to_token method
      start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
      end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

      # if start position is None, the answer passage has been truncated
      if start_positions[-1] is None:
          start_positions[-1] = tokenizer.model_max_length
      # end position cannot be found, char_to_token found space, so shift position until found
      shift = 1
      while end_positions[-1] is None:
          end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
          shift += 1
  # update our encodings object with the new token-based start/end positions
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [14]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [15]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

## Initialize the Dataset

In [16]:
class SquadDataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings
  
  def __getitem__(self, idx):
    return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

  def __len__(self):
    return len(self.encodings.input_ids)

In [17]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [18]:
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [19]:
# Use the GPU if available else use the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Set the model to the appropriate device
model.to(device)
# activate training mode 
model.train()
# initialize the AdamW optimizer
optim = AdamW(model.parameters(), lr=5e-5)
# number of epochs 
EPOCH = 3
# initialize the data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(EPOCH):
  model.train()
  # setup the progress bar
  loop = tqdm(train_loader, leave=True)
  for batch in loop:
    # initialize the calculated gradients 
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    # train the model on the batch and return the output with the loss 
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    # extract the loss 
    loss = outputs[0]
    # calculate loss for every parameter that needs grad update
    loss.backward()
    # update parameters
    optim.step()
    # print relevant info to progress bar
    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 8145/8145 [59:34<00:00,  2.28it/s, loss=0.979]
Epoch 1: 100%|██████████| 8145/8145 [59:36<00:00,  2.28it/s, loss=1.08]
Epoch 2: 100%|██████████| 8145/8145 [59:34<00:00,  2.28it/s, loss=1.42]


In [20]:
model.save_pretrained(PATH)
tokenizer.save_pretrained(PATH)

('/content/drive/MyDrive/SQuAD/tokenizer_config.json',
 '/content/drive/MyDrive/SQuAD/special_tokens_map.json',
 '/content/drive/MyDrive/SQuAD/vocab.txt',
 '/content/drive/MyDrive/SQuAD/added_tokens.json')

In [24]:
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
val_loader = DataLoader(val_dataset, batch_size=16)

acc = []

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
# calculate average accuracy in total
acc = sum(acc)/len(acc)

100%|██████████| 1640/1640 [03:54<00:00,  6.98it/s]


In [38]:
print(acc)

0.6412538109756097


In [39]:
print("T/F\tstart\tend\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	158	161
pred	67	68

true	158	161
pred	67	68

true	158	161
pred	67	68

true	158	161
pred	67	68

true	2	4
pred	2	4

true	18	21
pred	18	21

true	50	53
pred	50	53

true	100	103
pred	99	103

