In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Apr 15 13:47:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [34]:
import os 
import json
import requests 
import pprint

## Download the SQuAD v2 dataset and store it in the Google Drive Folder

In [26]:
# Folder the SQuAD files are written to.
# NOTE(review): assumes Google Drive is already mounted at /content/drive and
# that this folder exists - confirm before running on a fresh runtime.
PATH = "/content/drive/MyDrive/SQuAD"
# Base URL of the official SQuAD explorer site; file names are appended to it.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

# loop through the train and dev splits
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # stream the download over HTTP so the whole file is never held in memory
    with requests.get(f'{url}{file}', stream=True, timeout=60) as res:
        # fail loudly instead of saving an HTML error page as the dataset
        res.raise_for_status()
        # write to file in 1 MiB chunks
        with open(f'{PATH}/{file}', 'wb') as f:
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

## There are three parts to the dataset i.e. *Questions*, *Contexts*, *Answers*

1. Questions - strings containing the question which we will ask our model.
2. Contexts - larger segments of text which contain the answers to the questions
3. Answers - extracts from the contexts that provide an answer to the question


In [29]:
def read_data(path):
    """
    Flattens a SQuAD-style JSON file into three parallel lists.

    @params: path
    Path to the JSON data

    Returns a (contexts, questions, answers) tuple with one entry per answer,
    so a context or question is repeated for every answer attached to it.
    For a question that carries a 'plausible_answers' key, those entries are
    used instead of 'answers'.
    """
    # parse the whole JSON document into a dictionary
    with open(path, 'rb') as handle:
        squad = json.load(handle)

    # collect (context, question, answer) rows first, split them afterwards
    rows = []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for item in paragraph['qas']:
                prompt = item['question']
                # pick the field that holds the answer entries for this question
                field = 'plausible_answers' if 'plausible_answers' in item else 'answers'
                for entry in item[field]:
                    rows.append((passage_text, prompt, entry))

    # split the rows into the three parallel lists
    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]
    return contexts, questions, answers

In [36]:
# Load the training and validation splits from the downloaded JSON files
train_contexts, train_questions, train_answers = read_data(f'{PATH}/train-v2.0.json')
val_contexts, val_questions, val_answers = read_data(f'{PATH}/dev-v2.0.json')
# Sanity check: none of the six lists may be empty
assert min(len(train_contexts), len(train_questions), len(train_answers)) > 0
assert min(len(val_contexts), len(val_questions), len(val_answers)) > 0

In [44]:
# Multiple questions and answers can stem from the same context!
# Only the first (context, question, answer) triple is shown.
for context, question, answer in zip(train_contexts, train_questions, train_answers):
  pprint.pprint(f"Context: {context}\nQuestion: {question}\nAnswer: {answer}\n")
  break

('Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born '
 'September 4, 1981) is an American singer, songwriter, record producer and '
 'actress. Born and raised in Houston, Texas, she performed in various singing '
 'and dancing competitions as a child, and rose to fame in the late 1990s as '
 "lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew "
 "Knowles, the group became one of the world's best-selling girl groups of all "
 "time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in "
 'Love (2003), which established her as a solo artist worldwide, earned five '
 'Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy '
 'in Love" and "Baby Boy".\n'
 'Question: When did Beyonce start becoming popular?\n'
 "Answer: {'text': 'in the late 1990s', 'answer_start': 269}\n")


## The model will be trained to find the start and the end of the answer within the context, so each answer needs an additional value, *answer_end*, that marks where the answer ends

In [45]:
def add_end_idx(answers, contexts):
    """
    Adds an 'answer_end' character offset to every answer dict, in place.

    @params: answers, contexts
    answers is the list of answer dicts from the JSON data; each one must
    have a 'text' and an 'answer_start' entry
    contexts is the list of context strings, aligned index-by-index with answers

    The annotated start offset is sometimes one or two characters too large.
    When the text found at 'answer_start' does not equal the answer text, the
    span is moved left by the smallest shift (1, then 2) at which it does, and
    'answer_start' is corrected together with 'answer_end'.

    If no shift matches, 'answer_start' is left as annotated and 'answer_end'
    is set to answer_start + len(text), so every answer ends up with both keys.

    Returns None; the dicts in `answers` are modified directly.
    """
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes the answers are off by a character or two
        shift = 0
        if context[start_idx:end_idx] != gold_text:
            for n in (1, 2):
                # a negative start would wrap around to the end of the context
                # and could "match" unrelated text there, so skip such shifts
                if start_idx - n < 0:
                    continue
                if context[start_idx - n:end_idx - n] == gold_text:
                    # keep the closest matching position and stop looking
                    shift = n
                    break

        # shift stays 0 both for an exact match and when nothing matched
        answer['answer_start'] = start_idx - shift
        answer['answer_end'] = end_idx - shift

In [46]:
# Annotate the answers of both splits with their end offsets (training split first)
for split_answers, split_contexts in (
    (train_answers, train_contexts),
    (val_answers, val_contexts),
):
    add_end_idx(split_answers, split_contexts)

In [47]:
# Inspect the first ten training answers; after add_end_idx has run, each should include 'answer_end'
train_answers[:10]

[{'answer_end': 286, 'answer_start': 269, 'text': 'in the late 1990s'},
 {'answer_end': 226, 'answer_start': 207, 'text': 'singing and dancing'},
 {'answer_end': 530, 'answer_start': 526, 'text': '2003'},
 {'answer_end': 180, 'answer_start': 166, 'text': 'Houston, Texas'},
 {'answer_end': 286, 'answer_start': 276, 'text': 'late 1990s'},
 {'answer_end': 335, 'answer_start': 320, 'text': "Destiny's Child"},
 {'answer_end': 524, 'answer_start': 505, 'text': 'Dangerously in Love'},
 {'answer_end': 374, 'answer_start': 360, 'text': 'Mathew Knowles'},
 {'answer_end': 286, 'answer_start': 276, 'text': 'late 1990s'},
 {'answer_end': 301, 'answer_start': 290, 'text': 'lead singer'}]