## Import packages

In [4]:
import numpy as np
import json
import pickle
import nltk
from tqdm import tqdm, tqdm_notebook
from tensorflow import keras

  from ._conv import register_converters as _register_converters


## Load GloVe pre-trained word vectors 

In [5]:
def load_GloVe(filename, d_size=300):
    """Load GloVe vectors for the tokens that occur in the Keras IMDB word index.

    Only tokens present in ``keras.datasets.imdb.get_word_index()`` are kept.
    Index 0 is reserved for ``"<PAD>"`` (all-zero row) and index 1 for
    ``"--OOV--"`` (random uniform row). Every retained token receives the next
    free index, starting at 2, in file order, and its vector is stored in the
    row with that same index.

    Lines whose vector cannot be parsed into exactly ``d_size`` floats are
    reported and skipped; loading continues with the next line.

    Args:
        filename: Path to a UTF-8 text file with one ``token v1 ... vN`` entry
            per line, fields separated by single spaces.
        d_size: Number of vector components per token.

    Returns:
        A ``(vocab, emb_mat)`` tuple. ``vocab`` maps each token to its row
        index in ``emb_mat``. ``emb_mat`` is a float array with
        ``len(imdb_words) + 4`` rows and ``d_size`` columns; rows past the
        last loaded token stay zero.
    """

    # get common word set from imdb dataset
    imdb = keras.datasets.imdb
    common_word_set = set(imdb.get_word_index().keys())

    num_lines = len(common_word_set) + 2
    print('Length of vocabulary: {}'.format(num_lines))

    vocab = dict()
    vocab["<PAD>"] = 0
    vocab["--OOV--"] = 1 # unknown

    # num_lines already counts the two special tokens; the extra two rows are
    # kept so previously saved matrices and this one have the same shape.
    emb_mat = np.zeros((num_lines + 2, d_size))
    emb_mat[vocab["--OOV--"], :] = np.random.uniform(size=d_size)

    print("Start Loading...")
    pbar = tqdm_notebook(total=num_lines)
    try:
        with open(filename, 'r', encoding="utf-8") as f:
            for line in f:
                # Split from the right so a token that itself contains spaces
                # stays in one piece instead of polluting the vector fields.
                row = line.strip().rsplit(' ', d_size)
                word = row[0]

                # Skip tokens outside the IMDB vocabulary and repeated tokens
                # (a repeat would otherwise consume a second row).
                if word not in common_word_set or word in vocab:
                    continue

                try:
                    vector = np.asarray(row[1:], dtype=emb_mat.dtype)
                    # Reject short rows explicitly; a length-1 vector would
                    # otherwise broadcast silently across the whole row.
                    if vector.shape != (d_size,):
                        raise ValueError(
                            "expected {} values, got {}".format(d_size, vector.size))
                except ValueError as e:
                    print(e)
                    print("Missing", word)
                    continue

                # The vocabulary index and the matrix row must be the same
                # number, offset past the two reserved rows.
                index = len(vocab)
                emb_mat[index, :] = vector
                vocab[word] = index

                pbar.update(1)
    finally:
        # Close the progress bar even when opening/reading the file fails.
        pbar.close()

    print("Loading finished...")
    return vocab, emb_mat

In [6]:
# Base directory for every input and output file used in this notebook.
# Paths are built by plain string concatenation, so the trailing slash is required.
DATA_PATH = 'dataset/'

In [36]:
# Build the word vocabulary and embedding matrix from the GloVe text file
# (load_GloVe's default of 300 components per vector is used).
glove_file = DATA_PATH + 'glove.840B.300d.txt'
vocab, embd = load_GloVe(glove_file)

Length of vocabulary: 88586
Start Loading...


HBox(children=(IntProgress(value=0, max=88586), HTML(value='')))

Loading finished...


In [37]:
# Persist the word embedding matrix with NumPy and the vocabulary dict
# returned by load_GloVe as a pickle, both under DATA_PATH.
np.save(DATA_PATH + "word_embedding_matrix", embd)
with open(DATA_PATH + 'vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
# Run the same loader on the '-char' GloVe file (presumably character-level
# vectors — TODO confirm).
# NOTE(review): load_GloVe keeps only tokens that appear in the IMDB *word*
# index, so entries of this file that are not also IMDB words are dropped —
# confirm this filtering is intended for the character vocabulary.
glove_char_file = DATA_PATH + 'glove.840B.300d-char.txt'
char_vocab, char_embd = load_GloVe(glove_char_file)

Length of vocabulary: 88586
Start Loading...


HBox(children=(IntProgress(value=0, max=88586), HTML(value='')))

Loading finished...


In [39]:
# Persist the matrix and vocabulary loaded from the '-char' GloVe file,
# mirroring how the word-level pair is saved.
np.save(DATA_PATH + "char_embedding_matrix", char_embd)
with open(DATA_PATH + 'char_vocabulary.pickle', 'wb') as handle:
    pickle.dump(char_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Preprocess training data

### Load SQuAD 2.0 training & development sets

In [7]:
def get_clean_dataset(dataset):
    """Flatten SQuAD-style articles into a context lookup and a flat question list.

    Paragraphs and questions are each numbered in encounter order; both
    numberings start at 0 and run across all articles.

    Args:
        dataset: Iterable of article dicts. Each article has a "paragraphs"
            list whose items carry a "context" entry and a "qas" list.

    Returns:
        A ``(contexts, questions)`` tuple. ``contexts`` maps a context id to
        the paragraph's "context" value. ``questions`` is a list of dicts with
        the keys "question_id", "context_id", "question", "answers" (each
        answer reduced to "text" and "answer_start") and "is_impossible".
    """
    contexts = {}
    questions = []

    for article in tqdm_notebook(dataset):
        for paragraph in article["paragraphs"]:
            # Ids are the number of items collected so far, which yields the
            # same 0-based running numbering as explicit counters.
            ctx_id = len(contexts)
            contexts[ctx_id] = paragraph["context"]

            for qa in paragraph["qas"]:
                record = {"question_id": len(questions), "context_id": ctx_id}
                record["question"] = qa["question"]
                # Keep only the two answer fields; any other keys are dropped.
                record["answers"] = [
                    {field: answer[field] for field in ("text", "answer_start")}
                    for answer in qa["answers"]
                ]
                record["is_impossible"] = qa["is_impossible"]
                questions.append(record)

    return contexts, questions

In [8]:
# Read the SQuAD 2.0 training file and keep only its top-level "data" entry
# (the collection of articles that get_clean_dataset iterates over).
file_name = DATA_PATH + "train-v2.0.json"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (matches how load_GloVe opens its input).
with open(file_name, encoding="utf-8") as json_data:
    training_set = json.load(json_data)
    training_set = training_set['data']

In [9]:
# train_c: context id -> paragraph context; train_q: flat list of question records.
train_c, train_q = get_clean_dataset(training_set)

HBox(children=(IntProgress(value=0, max=442), HTML(value='')))




In [48]:
# Persist the flattened training contexts and questions as two separate pickles.
with open(DATA_PATH + 'trainset_context.pickle', 'wb') as handle:
    pickle.dump(train_c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(DATA_PATH + 'trainset_question.pickle', 'wb') as handle:
    pickle.dump(train_q, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [49]:
# Read the SQuAD 2.0 dev file and keep only its top-level "data" entry
# (the collection of articles that get_clean_dataset iterates over).
file_name = DATA_PATH + "dev-v2.0.json"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (matches how load_GloVe opens its input).
with open(file_name, encoding="utf-8") as json_data:
    dev_set = json.load(json_data)
    dev_set = dev_set['data']

In [50]:
# dev_c: context id -> paragraph context; dev_q: flat list of question records.
dev_c, dev_q = get_clean_dataset(dev_set)

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))

In [51]:
# Persist the flattened dev contexts and questions as two separate pickles.
with open(DATA_PATH + 'devset_context.pickle', 'wb') as handle:
    pickle.dump(dev_c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(DATA_PATH + 'devset_question.pickle', 'wb') as handle:
    pickle.dump(dev_q, handle, protocol=pickle.HIGHEST_PROTOCOL)