# SQuAD Data Preprocessing

  ### Step1 : Look at the  SQuAD data
  
    input : List[ entry : DICT ]
    entry : DICT[title:str , paragraphs: LIST]
    paragraphs : LIST[ paragraph: DICT]
    paragraph : DICT[context:str, qas: LIST]
    qas: DICT[ qa : DICT]
    qa : DICT[id:str , question:str , answers: LIST[DICT]]
    answers : LIST[ answer : DICT]
    answer : DICT[ answer_start : INT, text:str]

![](squad_json.png)

In [12]:
# Load the json file 
import json

with open('squad_.json','r',encoding = 'utf-8')as reader:
    input_data = json.load(reader)["data"]

In [19]:
for entry in input_data:
    
    print('Entry "{}"\n'.format(entry['title']))
    
    for paragraph in entry['paragraphs']:
        
        
        for qa_id , qa in enumerate(paragraph['qas']):
            
            print("     * question #{} with qa_id : ".format(qa_id), qa['id'])
            
            print("               : ", qa['question'])
            
            for ans_id , answer in enumerate(qa['answers']):
            
                print("     * answer # {} : {}\n".format(ans_id, answer['text']))
        
        print('\n     * Context of paragraph \n{}\n\n'.format( paragraph['context'][:500]))
    
    # look at just one entry
    break


Entry "University_of_Notre_Dame"

     * question #0 with qa_id :  5733be284776f41900661182
               :  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
     * answer # 0 : Saint Bernadette Soubirous

     * question #1 with qa_id :  5733be284776f4190066117f
               :  What is in front of the Notre Dame Main Building?
     * answer # 0 : a copper statue of Christ

     * question #2 with qa_id :  5733be284776f41900661180
               :  The Basilica of the Sacred heart at Notre Dame is beside to which structure?
     * answer # 0 : the Main Building

     * question #3 with qa_id :  5733be284776f41900661181
               :  What is the Grotto at Notre Dame?
     * answer # 0 : a Marian place of prayer and reflection

     * question #4 with qa_id :  5733be284776f4190066117e
               :  What sits on top of the Main Building at Notre Dame?
     * answer # 0 : a golden statue of the Virgin Mary


     * Context of paragraph 
Architecturally, th

### Step2. Define a class to store SQuAD samples

In [20]:
class SquadExample(object):
    
    def __init__(self,
                qas_id,
                question_text,
                doc_tokens,
                orig_answer_text=None,
                start_position=None,
                end_position=None):
        
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        
    def __str__(self):
        
        return self.__repr__()
    
    def __repr__(self):
        s=""
        s+="  - qas_id: {}".format(self.qas_id)
        s+="\n  - question_text: {}".format(self.question_text)
        
        if self.start_position:
            s += "\n  - start_position: {}".format(self.start_position)
        if self.start_position:
            s += "\n  - end_position: {}".format(self.end_position)
            s+="\n  - doc_tokens: \n\n     {}".format(" ".join(self.doc_tokens))
        return s
        

### Step 3 : Read SQuAD data and Load as a list of SquadExamples

In [21]:
# utility function 
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a peice of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

def is_whitespace(c):
    if c== " " or c== "\t" or c =="\r" or c =="\n" or ord(c) == 0x202F:
        return True
    return False

In [18]:
is_training=True

squad_examples =[]

for entry in input_data:
    
    for paragraph in entry["paragraphs"]:
        
        # original text in paragraph
        paragraph_text = paragraph["context"]
        
        doc_tokens = []
        char_to_word_offset =[]
        prev_is_whitespace = True

        for c in paragraph_text:

            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) -1)
        #break

        for qa in paragraph["qas"]:
            qas_id =qa['id']
            question_text = qa['question']
            start_position = None
            end_position = None
            orig_answer_text = None
            if is_training:
                
                if len(qa['answers']) != 1:
                    raise ValueError(
                        "For training each question has 1 anser.")
                answer = qa['answers'][0]
                
                orig_answer_text = answer['text']
                answer_offset = answer['answer_start']
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset + answer_length -1]
                
                actual_text = " ".join(doc_tokens[start_position:(end_position +1)])
                cleaned_answer_text = " ".join(
                    whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    continue
            example = SquadExample(
                 qas_id = qas_id,
                 question_text = question_text,
                 doc_tokens = doc_tokens,
                 orig_answer_text = orig_answer_text,
                 start_position = start_position,
                 end_position = end_position)
            squad_examples.append(example)
                
print(squad_examples[0],'\n')
print(squad_examples[1])

  - qas_id: 5733be284776f41900661182
  - question_text: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
  - start_position: 90
  - end_position: 92
  - doc_tokens: 

     Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. 

  - qas_id: 5733be284776f4190066117f
  - question_text: What is in front of the Notre Dame Main Building?
 

### Step 4 : Convert SquadExamples to features

#### Initial Paramters

In [7]:

 
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',do_lower_case=True)
# The maximum total input sequence length after WordPiece tokenization
max_seq_length = 384

#When splitting up a long document into chunks, how much stride to take between chunks
doc_stride = 128

#The maximum number of tokens for the question.
max_query_length = 64 


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# A look at BERT Tokenizer
# divides tokens into sub_tokens
for (i, token) in enumerate(example.doc_tokens):
    print("Token : ", token)
    sub_tokens = tokenizer.tokenize(token)
    for sub_token in sub_tokens:
        print("     Sub-Token : ",sub_token)
    break

    tok_to_orig_index = []
    orig_to_tok_index =[]
     # list to store tokens
    all_doc_tokens =[]
  
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)
        
    print(tok_to_orig_index)
    print(orig_to_tok_index)
    print(all_doc_tokens)

    tok_start_position = None
    tok_end_positin = None
    
    if is_training:
        
        tok_start_position = orig_to_tok_index[example.start_position]
        
        # if document is shorter than the given end_position 
        if example.end_position < len(example.doc_tokens) -1:
            tok_end_position = orig_to_tok_index[example.end_position+1] -1
        else:
            tok_end_position = len(all_doc_tokens) -1
        

    tok_start_position = None
    tok_end_positin = None
    
    if is_training:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) -1:
            tok_end_position = orig_to_tok_index[example.end_position+1] -1
        else:
            tok_end_position = len(all_doc_tokens) -1
        
        (tok_start_position,tok_end_position) = improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)

tok_start_position

In [8]:
def improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we can
    # often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The original whitespace-tokenized answer will be "(1895-1943).". However
    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
    # the exact answer, 1895.
    #
    # However, this is not always possible. Consider the following:
    #
    #   Question: What country is the top exporter of electornics?
    #   Context: The Japanese electronics industry is the lagest in the world.
    #   Answer: Japan
    #
    # In this case, the annotator chose "Japan" as a character sub-span of
    # the word "Japanese". Since our WordPiece tokenizer does not split
    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
    # in SQuAD, but does happen.
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

import collections
print("doc_stride", doc_stride)
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
start_offset=0
_DocSpan = collections.namedtuple(
        "DocSpan",["start","length"])
doc_spans = []
start_offset = 0
print("all_doc_tokens", all_doc_tokens)
print("start_offset", start_offset)
while start_offset < len(all_doc_tokens):
    print(start_offset)
    length = len(all_doc_tokens) - start_offset
    if length > max_tokens_for_doc:
        length = max_tokens_for_doc
    doc_spans.append(_DocSpan(start=start_offset, length=length))
    if start_offset + length == len(all_doc_tokens):
        break
    start_offset += min(length, doc_stride)
doc_spans

In [9]:
def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #  Doc: the man went to the store and bought a gallon of milk
    #  Span A: the man went to the
    #  Span B: to the store and bought
    #  Span C: and bought a gallon of
    #  ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    return cur_span_index == best_span_index

In [10]:
class InputFeatures(object):
    """
    A single set of rfeatures of data
    """
    
    def __init__(self,
                unique_id,
                example_index,
                doc_span_index,
                tokens,
                token_to_orig_map,
                token_is_max_context,
                input_ids,
                input_mask,
                segment_ids,
                start_position=None,
                end_position=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position
        

In [11]:
import collections
unique_id = 1000000000

train_features =[]

for (example_index, example) in enumerate(examples):
    
    # 1. tokenize question text
    query_tokens = tokenizer.tokenize(example.question_text)
    
    # truncate query tokens if larger than max_length
    if len(query_tokens) > max_query_length:
        query_tokens = query_tokens[0:max_query_length]
    
    
    # 2. Tokenize the document text
     # list as indexer
    tok_to_orig_index = []
    orig_to_tok_index =[]
     # list to store tokens
    all_doc_tokens =[]
  
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)
    
    
    # match example position to token position
    tok_start_position = None
    tok_end_positin = None
    
    if is_training:
        # start position in the new_token
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) -1:
            tok_end_position = orig_to_tok_index[example.end_position+1] -1
        else:
            tok_end_position = len(all_doc_tokens) -1
        
        (tok_start_position,tok_end_position) = improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)
        
    # maximum allowed token for document should account for
    # question and  [CLS] [SEP] and [SEP]
    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
    
    
    # if document is longer than the maximum sequence length
    # method : sliding windows approach
    # take chunks of the up to max_length with a stride of 'doc_stride'
    
    _DocSpan = collections.namedtuple(
        "DocSpan",["start","length"])
    
    doc_spans = []
    start_offset = 0
    # make sliding window of spans
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        
        doc_spans.append(_DocSpan(start=start_offset,length=length))
        if start_offset + length ==len(all_doc_tokens):
            break
        start_offset += min(length,doc_stride)
    
    
    for (doc_span_index, doc_span) in enumerate(doc_spans):
        tokens =[]
        token_to_orig_map ={}
        token_is_max_context ={}
        segment_ids =[]
        # initial symbol
        tokens.append("[CLS]")
        segment_ids.append(0)
        # question tokens
        for token in query_tokens:
            tokens.append(token)
            segment_ids.append(0)
        # separator token
        tokens.append("[SEP]")
        segment_ids.append(0)
        
        # document tokens
        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            # should make another mapping because the text is sliding window part
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
            
            is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                  split_token_index)
            token_is_max_context[len(tokens)] = is_max_context
            tokens.append(all_doc_tokens[split_token_index])
            segment_ids.append(1)
        #Separator token
        tokens.append("[SEP]")
        segment_ids.append(1)
        
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        
        # Mask 
        # real tokens : 1  , padding tokens : 0
        
        input_mask =[1]*len(input_ids)
        
        #Zero-pad up to the sequence length
        while len(input_ids)< max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        
        start_position = None
        end_position = None
        if is_training:
            # for training if document chunk does not contain an annotation 
            # we throw it out, since there is nothing to predict
            doc_start = doc_span.start
            doc_end   = doc_span.start + doc_span.length -1
            
            if (example.start_position < doc_start or
                   example.end_position <doc_start or
                   example.start_position > doc_end or 
                   example.end_position > doc_end):
                continue
            
            # [CLS] + question tokens + [SEP]
            doc_offset = len(query_tokens) + 2
            #               start position new_token   window_position 
            start_position = tok_start_position         - doc_start    + doc_offset
            end_position = tok_end_position - doc_start + doc_offset
        
        if example_index < 20:
            print("*** Example ***")
            print("unique_id: ",unique_id)
            print("example_index: ", example_index)
            print("doc_span_index: ",doc_span_index)
            print("tokens: " ," ".join(tokens))
            print("tokens_to_origin_map: ", " ".join([
                "{}:{}".format(x,y) for (x,y) in token_to_orig_map.items()]))
            print("token_is_max_content: "," ".join([
                  "{}:{}".format(x,y) for (x,y) in token_is_max_context.items()]))
            print("input_ids: ", " ".join([str(x) for x in input_ids]))
            print("input_mask: ", " ".join([str(x) for x in input_mask]))
            print("segment_ids: ", " ".join([str(x) for x in segment_ids]))
            if is_training:
                answer_text = " ".join(tokens[start_position:(end_position +1)])
                print("start_position : ", start_position)
                print("end_position : ", end_position)
                print("answer : ",answer_text)
                
        train_features.append(
            InputFeatures(
                unique_id = unique_id,
                example_index = example_index,
                doc_span_index = doc_span_index,
                tokens = tokens,
                token_to_orig_map=token_to_orig_map,
                token_is_max_context=token_is_max_context,
                input_ids = input_ids,
                input_mask = input_mask,
                segment_ids=segment_ids,
                start_position = start_position,
                end_position = end_position))
        
        
    

*** Example ***
unique_id:  1000000000
example_index:  0
doc_span_index:  0
tokens:  [CLS] to whom did the vir ##gin mar ##y allegedly appear in 1858 in lo ##urde ##s franc ##e ? [SEP] architectural ##ly , the school has a cat ##hol ##ic character . ato ##p the main building ' s gold dome is a golden statue of the vir ##gin mar ##y . immediately in front of the main building and facing it , is a copper statue of ch ##rist with arms up ##rais ##ed with the legend " ven ##ite ad me om ##nes " . next to the main building is the basilica of the sacred heart . immediately behind the basilica is the gr ##otto , a mari ##an place of prayer and reflect ##ion . it is a replica of the gr ##otto at lo ##urde ##s , franc ##e where the vir ##gin mar ##y rep ##uted ##ly appeared to saint bern ##ade ##tte sou ##bir ##ous in 1858 . at the end of the main drive ( and in a direct line that connects through 3 statues and the gold dome ) , is a simple , modern stone statue of mar ##y . [SEP]
tokens_to_ori

In [10]:
import torch
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
print("# of GPUs :" ,n_gpu)

# of GPUs : 4


In [11]:
# Total batch size for training
train_batch_size = 8
num_train_epochs = 3.0

import random
import numpy as np
seed = 42
fp16 = True
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)
    
output_dir = '/tmp/squad'

train_examples = examples

num_train_steps = int(len(train_examples) 
                      / train_batch_size 
                      * num_train_epochs)
print("Number of train steps : ", num_train_steps)


Number of train steps :  32849


In [12]:
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
bert_model = 'bert-base-uncased'
model = BertForQuestionAnswering.from_pretrained(
            bert_model,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1))

model.to(device)
model = torch.nn.DataParallel(model)

#prepare optimizer
param_optimizer = list(model.named_parameters())

#hack to remove "pooler"  which is not used
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

In [13]:
from pytorch_pretrained_bert.optimization import BertAdam

no_decay =['bias','LayerNorm.bias','LayerNorm.weight']
optimizer_grouped_parameters =[
    {'params':[ p for n , p in param_optimizer 
               if not any(nd in n for nd in no_decay)],'weight_decay':0.01},
    {'params':[p for n, p in param_optimizer
              if any(nd in n for nd in no_decay)],'weigth_decay':0.0}
    ]

t_total = num_train_steps

optimizer = BertAdam(optimizer_grouped_parameters,
                    lr=5e-5,
                    warmup=0.1,
                    t_total = t_total)


### Train


In [14]:
import pickle
global_step = 0
cached_train_features_file = 'data/train-v1.1.json_{}_{}_{}_{}'.format(
    list(filter(None,bert_model.split('/'))).pop(),str(max_seq_length),
    str(doc_stride),str(max_query_length))


#train_features = None
try:
    with open(cached_train_features_file,'rb') as reader:
        train_features = pickle.load(reader)
except:
     with open(cached_train_features_file,'wb') as writer:
        #train_features = features
        pickle.dump(train_features,writer)
   

In [15]:
print(" number of orig examples = ",len(train_examples))
print(" number of split examples = ", len(train_features))
print(" Batch size = ", train_batch_size)
print(" number of steps = ", num_train_steps)

 number of orig examples =  87599
 number of split examples =  88246
 Batch size =  8
 number of steps =  32849


In [16]:
all_input_ids = torch.tensor([f.input_ids for f in train_features],
                            dtype = torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features],
                             dtype = torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                              dtype = torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features],
                                  dtype = torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features],
                               dtype = torch.long)


In [17]:
from torch.utils.data import TensorDataset,\
                              DataLoader, \
                              RandomSampler, \
                              SequentialSampler

train_data = TensorDataset(all_input_ids,
                           all_input_mask,
                           all_segment_ids,
                           all_start_positions,
                           all_end_positions)

train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size = train_batch_size)




In [18]:
model.train()
from tqdm import tqdm, trange

for _ in trange(int(num_train_epochs), desc="Epoch"):
    for step, batch in enumerate(tqdm(train_dataloader,desc="Iteration")):
        
        batch = tuple(t.to(device) for t in batch)
        
        input_ids, input_mask,segment_ids,start_positions,end_positions=batch
        
        loss = model(input_ids,
                     segment_ids,
                     input_mask,
                     start_positions,
                     end_positions)
        
        loss = loss.mean()
        
        loss.backward()
        
        lr_this_step = args.learning_rate \
                           * warmup_linear(global_step/t_total,0.1)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step +=1

model_to_save = model.module if hasatt(model,'module') else model
output_model_file = os.path.join(output_dir,"pytorch_model.bin")
torch.save(model_to_save.state_dict(),output_model_file)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/11031 [00:00<?, ?it/s][A


RuntimeError: Creating MTGP constants failed. at /pytorch/aten/src/THC/THCTensorRandom.cu:35

In [None]:
python3.__version__

In [None]:
print(__version__)

In [None]:
import sys
sys.version

In [None]:
torch.__version__