## Import packages

In [7]:
import numpy as np
import json
import pickle
import nltk
from tqdm import tqdm, tqdm_notebook
import spacy

nlp = spacy.blank("en")

## Load GloVe pre-trained word vectors 

In [8]:
def file_len(fname):
    with open(fname, 'r', encoding="utf-8") as f:
        for i, l in enumerate(f):
            pass
    return i + 1

def load_GloVe(filename, d_size=300):
    num_lines = file_len(filename)
    vocab = dict()
    emb_mat = np.zeros((num_lines + 1, d_size))

    print("Start Loading...")
    pbar = tqdm_notebook(total=num_lines)
    
    with open(filename, 'r', encoding="utf-8") as f:
        
        emb_mat[-1, :] = np.random.uniform(size=d_size)
        vocab["--OOV--"] = num_lines
        
        for index, line in enumerate(f):
            row = line.strip().split(' ')

            try:
                emb_mat[index, :] = np.asarray(row[1:])
                vocab[row[0]] = index
            except:
                print("Missing", row[0])
                continue
        
            pbar.update(1)
    
    
    pbar.close()

    print("Loading finished...")
    return vocab, emb_mat

In [9]:
DATA_PATH = 'dataset/'

In [10]:
glove_file = DATA_PATH + 'glove.840B.300d.txt'
vocab, embd = load_GloVe(glove_file)

Start Loading...


Missing 0.20785
Missing 0.39511
Missing 0.13211
Missing -0.38024
Missing -0.0033421
Missing 0.14608
Missing -0.36288
Missing 0.5478
Missing 0.59759

Loading finished...


In [11]:
np.save(DATA_PATH + "word_embedding_matrix", embd)
with open(DATA_PATH + 'vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
glove_char_file = DATA_PATH + 'glove.840B.300d-char.txt'
char_vocab, char_embd = load_GloVe(glove_char_file)

Start Loading...



Loading finished...


In [13]:
np.save(DATA_PATH + "char_embedding_matrix", char_embd)
with open(DATA_PATH + 'char_vocabulary.pickle', 'wb') as handle:
    pickle.dump(char_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Preprocess training data

### Load SQuAD 2.0 training & developing set

In [16]:
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

def get_cleaned_dataset(dataset):
    
    contexts = dict()
    questions = dict()
    context_counter = 0
    question_counter = 0
    
    for article in tqdm_notebook(dataset):
        for para in article["paragraphs"]:
            
            contexts[context_counter] = word_tokenize(para["context"])
            
            for q in para["qas"]:
                questions[question_counter] = {
                                                "context_id": context_counter,
                                                "question": word_tokenize(q["question"]),
                                                "answers": [{'text': word_tokenize(ans["text"]),
                                                             'answer_start': ans["answer_start"]} 
                                                            for ans in q["answers"]],
                                                "is_impossible": q["is_impossible"],
                                                }
                question_counter += 1

            context_counter += 1

    return contexts, questions

In [3]:
file_name = DATA_PATH + "train-v2.0.json"
with open(file_name) as json_data:
    training_set = json.load(json_data)
    training_set = training_set['data']

In [17]:
train_c, train_q = get_cleaned_dataset(training_set)

In [18]:
with open(DATA_PATH + 'trainset_context.pickle', 'wb') as handle:
    pickle.dump(train_c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(DATA_PATH + 'trainset_question.pickle', 'wb') as handle:
    pickle.dump(train_q, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
file_name = DATA_PATH + "dev-v2.0.json"
with open(file_name) as json_data:
    dev_set = json.load(json_data)
    dev_set = dev_set['data']

In [22]:
dev_c, dev_q = get_cleaned_dataset(dev_set)

In [23]:
with open(DATA_PATH + 'devset_context.pickle', 'wb') as handle:
    pickle.dump(dev_c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(DATA_PATH + 'devset_question.pickle', 'wb') as handle:
    pickle.dump(dev_q, handle, protocol=pickle.HIGHEST_PROTOCOL)