In [2]:
import numpy as np
import json
import pickle

In [2]:
# Location of the GloVe vectors file passed to loadGloVe below.
# NOTE(review): hard-coded absolute path; presumably the 50-dimensional
# GloVe 6B vectors judging by the file name -- confirm and adjust per machine.
filename = '/mnt/glove.6B.50d.txt'

def loadGloVe(filename):
    """Read a whitespace-separated word-vector text file.

    Each non-blank line is expected to look like ``word v1 v2 ... vn``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    vocab : dict
        Maps each word (first field of a line) to the row index of its
        vector in ``embd``. If a word occurs more than once, the last
        occurrence wins.
    embd : list of list of str
        The remaining fields of every line, kept as strings exactly as they
        appear in the file (no numeric conversion is done here).
    """
    vocab = dict()
    embd = []
    # Explicit UTF-8 keeps the read independent of the platform's default
    # encoding; the ``with`` block closes the file even if parsing fails.
    with open(filename, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of readlines() so the whole file is not
        # held in memory next to the parsed rows.
        for line in file:
            row = line.strip().split(' ')
            if row == ['']:
                # Blank line (e.g. trailing newline): skipping it avoids an
                # empty-string vocab entry and a ragged, empty vector row.
                continue
            # Index by the number of rows stored so far so that vocab
            # indices stay aligned with ``embd`` when lines are skipped.
            vocab[row[0]] = len(embd)
            embd.append(row[1:])
    print('Loaded GloVe!')
    return vocab,embd

vocab, embd = loadGloVe(filename)
# loadGloVe returns the vector components as strings; request a float dtype
# explicitly so the result is a numeric matrix rather than a NumPy
# unicode-string array.
embedding = np.asarray(embd, dtype=np.float32)

Loaded GloVe!


In [3]:
# Write the embedding matrix to disk; np.save appends the ".npy" extension
# because the given file name does not already end with it.
np.save(file="word_embedding_matrix", arr=embedding)

In [3]:
# Persist the word -> row-index mapping built by loadGloVe.
with open('vocabulary.pickle', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file, pickle.HIGHEST_PROTOCOL)

Load training and testing sets

In [21]:
# JSON text is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's default locale encoding.
file_name = "training.json"
with open(file_name, encoding="utf-8") as json_data:
    training_set = json.load(json_data)

In [22]:
# JSON text is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's default locale encoding.
file_name = "testing.json"
with open(file_name, encoding="utf-8") as json_data:
    testing_set = json.load(json_data)

Load documents

In [23]:
# JSON text is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's default locale encoding.
file_name = "documents.json"
with open(file_name, encoding="utf-8") as json_data:
    documents = json.load(json_data)

Preprocess the loaded data (tokenize, lemmatize, and mark answer spans for training)

In [19]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


def _find_answer_span(context_tokens, answer_tokens):
    """Return the first context index where the whole answer aligns, else None.

    An answer token aligns with a context token when it is contained in it
    (``answer_token in context_token``), so e.g. "1990" aligns with "1990s".
    Only positions where the complete answer fits inside the context are
    considered, and an empty answer never matches.
    """
    span = len(answer_tokens)
    if span == 0:
        return None
    # The range is empty when the answer is longer than the context.
    for pos in range(len(context_tokens) - span + 1):
        window = context_tokens[pos:pos + span]
        if all(ans_tok in ctx_tok for ans_tok, ctx_tok in zip(answer_tokens, window)):
            return pos
    return None


def generate_dataset(dataset, docs=None):
    """Build tokenized question/context examples with one-hot answer spans.

    Parameters
    ----------
    dataset : iterable of dict
        Each item must provide the keys "question", "text" (the answer),
        "docid" and "answer_paragraph".
    docs : optional
        Collection indexed by "docid" whose entries expose the paragraphs
        under the "text" key. Defaults to the notebook-level ``documents``
        variable, which is what this function always used before.

    Returns
    -------
    list of dict
        One dict per item whose answer could be located, with keys
        "question" and "context" (lists of lower-cased, lemmatized tokens)
        and "start" / "end" (lists of 0/1 with the same length as the
        context, holding a single 1 at the first / last answer token).
        Items whose answer cannot be located are reported with a
        "Cannot match answer:" message and left out.
    """
    if docs is None:
        # Fall back to the notebook-level variable for existing callers.
        docs = documents

    lemmatizer = WordNetLemmatizer()

    def lemmatized_tokens(text):
        # Lower-case, tokenize, then lemmatize every token.
        return [lemmatizer.lemmatize(token) for token in word_tokenize(text.lower())]

    data_list = []
    for index, x in enumerate(dataset):
        curr_dct = dict()
        curr_dct["question"] = lemmatized_tokens(x["question"])

        tokenized_ans = lemmatized_tokens(x["text"])

        docid = x["docid"]
        answer_para = x["answer_paragraph"]
        tokenized_c = lemmatized_tokens(docs[docid]["text"][answer_para])
        curr_dct["context"] = tokenized_c

        # find the position that answer occur
        pos = _find_answer_span(tokenized_c, tokenized_ans)
        if pos is None:
            print("Cannot match answer:", index)
            continue

        start = [0] * len(tokenized_c)
        end = [0] * len(tokenized_c)
        start[pos] = 1
        # Always in range: the helper only returns positions where the
        # complete answer fits inside the context.
        end[pos + len(tokenized_ans) - 1] = 1
        curr_dct["start"] = start
        curr_dct["end"] = end
        data_list.append(curr_dct)

    return data_list

In [25]:
# Build the training examples; every item whose answer cannot be located in
# its paragraph is reported below as "Cannot match answer: <index>" and is
# left out of training_data.
training_data = generate_dataset(training_set)

Cannot match answer: 452
Cannot match answer: 937
Cannot match answer: 1582
Cannot match answer: 1666
Cannot match answer: 7606
Cannot match answer: 7947
Cannot match answer: 9889
Cannot match answer: 9952
Cannot match answer: 10124
Cannot match answer: 11984
Cannot match answer: 12128
Cannot match answer: 12357
Cannot match answer: 13041
Cannot match answer: 13088
Cannot match answer: 13928
Cannot match answer: 14433
Cannot match answer: 14929
Cannot match answer: 15535
Cannot match answer: 15959
Cannot match answer: 16135
Cannot match answer: 18772
Cannot match answer: 19744
Cannot match answer: 19764
Cannot match answer: 21408
Cannot match answer: 21657
Cannot match answer: 22284
Cannot match answer: 23610
Cannot match answer: 26402
Cannot match answer: 28405
Cannot match answer: 30647
Cannot match answer: 30741
Cannot match answer: 32413
Cannot match answer: 32442
Cannot match answer: 32542
Cannot match answer: 32794
Cannot match answer: 33658
Cannot match answer: 35677
Cannot matc

In [12]:
# Persist the preprocessed training examples.
with open('training_data.pickle', 'wb') as training_file:
    pickle.dump(training_data, training_file, pickle.HIGHEST_PROTOCOL)

In [35]:
lemmatizer = WordNetLemmatizer()
testing_data = []
# Tokenized context per document id, so a document shared by several
# questions is tokenized and lemmatized only once.
tokenized_docs = dict()
for index, x in enumerate(testing_set):
    curr_dct = dict()

    q = x["question"].lower()
    curr_dct["question"] = [lemmatizer.lemmatize(i) for i in word_tokenize(q)]

    # Use the question's own document as context (this previously always
    # read documents[0], leaving docid unused).
    docid = x["docid"]

    if docid not in tokenized_docs:
        # Join paragraphs with a space so the last word of one paragraph is
        # not fused with the first word of the next before tokenizing.
        c = " ".join(para.lower() for para in documents[docid]["text"])
        tokenized_docs[docid] = [lemmatizer.lemmatize(i) for i in word_tokenize(c)]

    # Copy so every example owns its own context list, as before.
    curr_dct["context"] = list(tokenized_docs[docid])

    testing_data.append(curr_dct)

In [39]:
# Persist the preprocessed testing examples.
with open('testing_data.pickle', 'wb') as testing_file:
    pickle.dump(testing_data, testing_file, pickle.HIGHEST_PROTOCOL)