#  SQuAD Data Preprocessing

  ### Step 1 : SQuAD data structure
  
    input : List[ entry : DICT ]
    entry : DICT[title:str , paragraphs: LIST]
    paragraphs : LIST[ paragraph: DICT]
    paragraph : DICT[context:str, qas: LIST]
    qas : LIST[ qa : DICT]
    qa : DICT[id:str , question:str , answers: LIST[DICT]]
    answers : LIST[ answer : DICT]
    answer : DICT[ answer_start : INT, text:str]

![](squad_json.png)

### 1.1 Load the SQuAD file (stored as a .json file)

    * Note : "answer_start" is a character-based offset into the paragraph context

In [None]:
import json

# Read the SQuAD JSON file; the list of entries sits under the top-level "data" key.
with open('data/squad_.json', 'r', encoding='utf-8') as reader:
    input_data = json.load(reader)["data"]

# Walk the nested structure (entry -> paragraphs -> qas -> answers) and print it.
for entry in input_data:
    # An entry carries a "title" and a list of "paragraphs".
    print('Entry "{}"\n'.format(entry['title']))

    for paragraph in entry['paragraphs']:

        # A paragraph holds several question/answer records under "qas".
        for q_idx, qa in enumerate(paragraph['qas']):
            question_header = "     * question #{} with qa_id : ".format(q_idx)
            print(question_header, qa['id'])
            print("               : ", qa['question'])

            # Every answer has its text and a start offset into the context.
            for a_idx, answer in enumerate(qa['answers']):
                answer_line = "     * answer # {} : '{}'. Start position : {}\n".format(
                    a_idx, answer['text'], answer['answer_start'])
                print(answer_line)

        # Show only the first 500 characters of the paragraph context.
        context_preview = paragraph['context'][:500]
        print('\n     * Context of paragraph \n{}\n\n'.format(context_preview))

    # Stop after the first entry; this cell is only a structural preview.
    break


### Step 2 : Define a class to store SQuAD samples
    
    * All positions are word based.

In [None]:
class SquadExample(object):
    """One SQuAD question/answer sample over a tokenized paragraph.

    Args:
        qas_id: Identifier of the question/answer record.
        question_text: The question string.
        doc_tokens: The paragraph as a sequence of string tokens.
        orig_answer_text: The answer text, or None when no answer is given.
        start_position: Word-based index of the first answer token in
            ``doc_tokens``, or None when no answer is given.
        end_position: Word-based index of the last answer token in
            ``doc_tokens``, or None when no answer is given.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        # Note: the start and end positions store word-based (not
        # character-based) indices.
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        """Return the same multi-line summary as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Return a multi-line, human-readable summary of the sample.

        A position line is emitted only when that position is set. The check
        is ``is not None`` rather than truthiness because 0 is a valid word
        index (an answer that starts at the first token).
        """
        parts = [
            "  - qas_id: {}".format(self.qas_id),
            "  - question_text: {}".format(self.question_text),
            "  - answer_text  : {}".format(self.orig_answer_text),
        ]
        if self.start_position is not None:
            parts.append("  - start_position: {}".format(self.start_position))
        # Gate the end position on its own attribute, not on start_position.
        if self.end_position is not None:
            parts.append("  - end_position: {}".format(self.end_position))
        # The tokens are always shown; they do not depend on answer positions.
        parts.append(
            "  - doc_tokens: \n\n     {}".format(" ".join(self.doc_tokens)))
        return "\n".join(parts)
        

### Step 3 : Load as a list of SquadExamples

  * Conversion from char offset to word offset 

In [None]:
# Use the first paragraph of the first entry as the running example.
entry = input_data[0]
paragraph = entry['paragraphs'][0]

# Flag read by the answer-extraction step further down.
is_training = True

### 3.1 Compute the char-to-word offset map

In [None]:
# Raw paragraph text; each of its characters is mapped to a word index below.
paragraph_text = paragraph["context"]

# doc_tokens          : whitespace-delimited tokens of the paragraph.
# char_to_word_offset : for every character position, the index in doc_tokens
#                       of the most recently started token (-1 for whitespace
#                       that precedes the first token).
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True

# Single pass over the characters: split into tokens and record the mapping.
for c in paragraph_text:
    # Space, tab, CR, LF and U+202F are treated as token separators.
    if c in (" ", "\t", "\r", "\n", "\u202f"):
        prev_is_whitespace = True
    elif prev_is_whitespace:
        # First non-whitespace character after a separator opens a new token.
        doc_tokens.append(c)
        prev_is_whitespace = False
    else:
        # Otherwise the character extends the token currently being built.
        doc_tokens[-1] += c
        prev_is_whitespace = False

    # Whitespace characters inherit the index of the token before them.
    char_to_word_offset.append(len(doc_tokens) - 1)

print(paragraph_text[:27])
print(char_to_word_offset[:27])

### 3.2 Extract question 

In [None]:
# paragraph['qas'] holds several question/answer records;
# the first one is used as the running example.
qa = paragraph['qas'][0]

# Identifier and question string of that record.
qas_id, question_text = qa['id'], qa['question']

### 3.3 Extract Answers

In [None]:
# Training data has answers too; these stay None when is_training is False.
start_position = None
end_position = None
orig_answer_text = None

if is_training:

    # Answer extraction: only the first listed answer is used.
    answer = qa['answers'][0]
    orig_answer_text = answer['text']

    # "answer_start" is a character offset. Map the first and the last
    # character of the answer to word indices via char_to_word_offset.
    answer_offset = answer['answer_start']
    answer_length = len(orig_answer_text)
    start_position = char_to_word_offset[answer_offset]
    end_position = char_to_word_offset[answer_offset + answer_length - 1]

    # Sanity check: the tokens in [start_position, end_position], joined by
    # single spaces, must contain the whitespace-normalized answer text.
    actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
    # str.split() with no argument drops leading/trailing whitespace and
    # returns [] for blank text, so no separate empty-string case is needed.
    orig_answer_tokens = orig_answer_text.split()
    cleaned_answer_text = " ".join(orig_answer_tokens)
    if cleaned_answer_text not in actual_text:
        print("this text is not used")
        
        

In [None]:
# Bundle everything extracted above into one SquadExample and display it.
example = SquadExample(
    qas_id=qas_id,
    question_text=question_text,
    doc_tokens=doc_tokens,
    orig_answer_text=orig_answer_text,
    start_position=start_position,
    end_position=end_position,
)

print(example, '\n')