## preprocess
The SQuAD 2.0 dataset has three important parts:
1. Question: a string containing the question we will ask the model.
2. Context: a snippet of text that contains the answer to our question.
3. Answer: a shorter string that is an "excerpt" from the given context that provides the answer to our question.

First, read the data from the .json file to see what the dataset looks like.

In [15]:
import pandas as pd
import json
# Load the SQuAD 2.0 dev split from disk.
with open(r'../data/dev-v2.0.json', 'r', encoding='utf-8') as f:
    squad_data = json.load(f)

# The top-level 'data' entry is the list of records that read_dataset() walks.
data = squad_data['data']
def read_dataset(data):
    """Flatten SQuAD-style records into a DataFrame with one row per answer.

    Args:
        data: Iterable of article dicts. Each article has a 'paragraphs'
            list; each paragraph has a 'context' string and a 'qas' list;
            each qa entry has 'question', 'id' and 'answers' (a list of
            dicts with 'text' and 'answer_start').

    Returns:
        A pandas DataFrame with the columns id, question, context, answer,
        answer_start and answer_end, where answer_end is
        answer_start + len(answer). A question whose 'answers' list is
        empty contributes no rows.
    """
    columns = ['id', 'question', 'context', 'answer', 'answer_start', 'answer_end']
    rows = []
    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # Named question_id so the builtin id() is not shadowed.
                question_id = qa['id']
                for answer in qa['answers']:
                    text = answer['text']
                    answer_start = answer['answer_start']
                    rows.append({
                        'id': question_id,
                        'question': question,
                        'context': context,
                        'answer': text,
                        'answer_start': answer_start,
                        # Exclusive end offset of the answer inside the context.
                        'answer_end': answer_start + len(text),
                    })

    return pd.DataFrame(rows, columns=columns)
# Flatten the dev split into one row per answer and preview the first rows.
dev=read_dataset(data)
dev.head(5)

(20302, 6)


Unnamed: 0,id,question,context,answer,answer_start,answer_end
0,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159,165
1,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159,165
2,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159,165
3,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159,165
4,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,10th and 11th centuries,94,117


In [19]:
# Compare the row count before and after dropping fully identical rows.
print("before drop the duplicates:",dev.shape)
dev1=dev.drop_duplicates()
print("After drop the duplicates:",dev1.shape)

before drop the duplicates: (20302, 6)
After drop the duplicates: (10388, 6)


In [20]:
# Load the SQuAD 2.0 training split and flatten it to one row per answer.
with open(r'../data/train-v2.0.json', 'r', encoding='utf-8') as f:
    squad_data = json.load(f)
train_data = squad_data['data']
train = read_dataset(train_data)
print("before drop the duplicates:",train.shape)
# No `subset` argument: whole rows are compared. `subset` expects column
# labels, not a DataFrame.
train1 = train.drop_duplicates()
print("After drop the duplicates:",train1.shape)

before drop the duplicates: (86821, 6)
After drop the duplicates: (86821, 6)


The training data has no duplicate rows, but the dev data does: as the preview above shows, the same dev question can carry several identical answers.


**step1**: We need to check whether the answer should be extracted from 'answers' or 'plausible_answers':

If 'plausible_answers' is present in a 'qas' entry, extract the answer from 'plausible_answers'; otherwise extract it from 'answers'.

In [24]:
def read_squad(path):
    """Read a SQuAD-format JSON file into three parallel lists.

    Args:
        path: Location of the JSON file to load.

    Returns:
        A (contexts, questions, answers) tuple of equal-length lists with
        one entry per answer. Answers are taken from 'plausible_answers'
        when a qa entry has that key, otherwise from 'answers'; each answer
        is the dict exactly as stored in the file.
    """
    with open(path, 'rb') as handle:
        squad_dict = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                # Prefer the plausible answers whenever the entry carries them.
                source_key = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                for entry in qa[source_key]:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(entry)

    return contexts, questions, answers

# Load parallel context/question/answer lists for the train and dev files.
# NOTE(review): the val_* lists built here are reassigned by the random
# sampling cell further down — confirm whether the dev file is still needed.
train_contexts, train_questions, train_answers = read_squad('../data/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('../data/dev-v2.0.json')

**step2**:  Randomly split the train dataset into train_set and val_set:

In [25]:
import random
# Hold out 5000 randomly chosen training examples as the validation set.
sample_indexes = random.sample(range(len(train_contexts)), 5000)
val_contexts, val_questions, val_answers = \
     [train_contexts[i] for i in sample_indexes],[train_questions[i] for i in
sample_indexes],[train_answers[i] for i in sample_indexes]

# Remove the held-out examples from the training lists so the two sets are
# disjoint; otherwise the model is validated on data it was trained on.
# The right-hand side is built in full before any name is rebound, so all
# three lists are filtered against the same original indexes.
# NOTE: re-running this cell without reloading shrinks the training lists again.
held_out = set(sample_indexes)
train_contexts, train_questions, train_answers = (
    [c for i, c in enumerate(train_contexts) if i not in held_out],
    [q for i, q in enumerate(train_questions) if i not in held_out],
    [a for i, a in enumerate(train_answers) if i not in held_out],
)

**step3**: The answer is contained in "text", and the start of the answer in context is provided in "answer_start". We need to train the model to find the beginning and end of an answer in context so we also need to add an "answer_end" value.

In [26]:
def add_end_idx(answers, contexts):
    """Add an 'answer_end' character offset to every answer dict, in place.

    The labelled 'answer_start' is checked against the context. If the
    answer text is not found there, offsets one and two characters earlier
    are tried, and 'answer_start' is corrected when one of them matches.

    Args:
        answers: List of dicts with 'text' and 'answer_start'; each dict is
            mutated to gain 'answer_end' (exclusive) and may have
            'answer_start' shifted.
        contexts: List of context strings, index-aligned with answers.

    Returns:
        None. The answer dicts are updated in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Fallback: always record an end offset so later steps never hit a
        # missing 'answer_end' key, even when no alignment can be verified.
        answer['answer_end'] = end_idx

        # Try the labelled position first, then the nearest earlier shifts,
        # and stop at the first one that matches.
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # A negative slice start would wrap around to the end of
                # the context instead of looking further left.
                break
            if context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break
# Annotate the answer dicts of both sets in place.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

**step 4**: Convert the strings to tokens, and then convert the answer start and answer end indexes from character positions to token positions.

In [27]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Tokenize each context together with its question (context passed first),
# with truncation and padding both enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True,
padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True,
padding=True)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


KeyboardInterrupt: 

In [None]:
def add_token_positions(encodings, answers):
    """Attach token-level 'start_positions' and 'end_positions' to encodings.

    Args:
        encodings: Tokenizer output that provides
            char_to_token(batch_index, char_index) and update(); it is
            modified in place.
        answers: Answer dicts with 'answer_start' and 'answer_end'
            character offsets, index-aligned with the encodings.

    Returns:
        None. The two position lists are added to encodings.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']

        start_token = encodings.char_to_token(i, start_char)
        if start_token is None:
            # The answer start has no token, which means the passage was
            # truncated. Point both positions at the maximum length so the
            # end can never precede the start.
            # NOTE(review): relies on the notebook-level `tokenizer`.
            start_positions.append(tokenizer.model_max_length)
            end_positions.append(tokenizer.model_max_length)
            continue

        # 'answer_end' is an exclusive offset, so the last answer character
        # sits at end_char - 1. Walk backwards past characters that have no
        # token (whitespace, truncated tail), never going before the start.
        end_token = None
        probe = end_char - 1
        while end_token is None and probe >= start_char:
            end_token = encodings.char_to_token(i, probe)
            probe -= 1
        if end_token is None:
            # Nothing in the answer span maps to a token; fall back to a
            # single-token span at the start.
            end_token = start_token

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Update the encodings object with the new token-based start/end positions.
    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions})
# Add token-level start/end positions to both encodings (updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

**step 5**: The data is now ready; we just need to convert it to the correct format for training with PyTorch by constructing a dataset object.

In [None]:
import torch


class SquadDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings as per-example tensors.

    Args:
        encodings: Mapping from field name (including 'input_ids') to a
            per-example sequence of values, such as a tokenizer output.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict mapping each field name to a tensor for example idx."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, taken from the 'input_ids' field."""
        # Item access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings['input_ids'])
# Wrap both sets of encodings as SquadDataset objects.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)