In [1]:
import numpy as np
import json
import pickle
import nltk

In [2]:
# Path of the GloVe vectors text file that is passed to loadGloVe() in this cell.
filename = '/mnt/glove.6B.50d.txt'

def loadGloVe(filename):
    """Parse a GloVe text file into a vocabulary index and its vector rows.

    Each non-blank line is split on single spaces: the first field is the
    token, the remaining fields are its vector components.

    Args:
        filename: Path of the GloVe ``.txt`` file; it is read as UTF-8.

    Returns:
        A ``(vocab, embd)`` tuple. ``vocab`` maps each token to the index of
        its row in ``embd``; ``embd`` is a list of rows, each row being the
        list of vector components still as strings (callers convert them).
    """
    vocab = dict()
    embd = []
    # Explicit UTF-8 avoids depending on the platform's locale encoding, and
    # the context manager closes the file even if parsing raises.
    with open(filename, 'r', encoding='utf-8') as file:
        # Stream the file line by line instead of loading it all with readlines().
        for line in file:
            row = line.strip().split(' ')
            if row == ['']:
                # Blank line (e.g. trailing newline): skip it so it does not
                # create a '' token with an empty vector.
                continue
            # Index by the current row count so vocab stays aligned with embd
            # even when blank lines are skipped.
            vocab[row[0]] = len(embd)
            embd.append(row[1:])
    print('Loaded GloVe!')
    return vocab, embd

vocab, embd = loadGloVe(filename)
# loadGloVe returns the vector components as strings; cast explicitly so the
# matrix is numeric instead of a unicode-string array.
embedding = np.asarray(embd, dtype=np.float32)

Loaded GloVe!


In [3]:
# Persist the embedding matrix; np.save appends ".npy" to a name given without that extension.
np.save("word_embedding_matrix", embedding)

In [3]:
# Serialize the token -> row-index mapping built by loadGloVe.
with open('vocabulary.pickle', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file, protocol=pickle.HIGHEST_PROTOCOL)

## Preprocess training data

Load training set

In [2]:
file_name = "training.json"
# Read explicitly as UTF-8 (the standard JSON encoding) rather than the
# platform's locale default; the records contain non-ASCII characters.
with open(file_name, encoding="utf-8") as json_data:
    training_set = json.load(json_data)

In [7]:
# Display one training record to inspect its fields.
training_set[9]

{'answer_paragraph': 2,
 'docid': 0,
 'question': 'How much energy is contained in the light to which human eyes are most sensitive?',
 'text': '7005216000000000000♠216 kj/mol'}

Load documents

In [2]:
file_name = "/mnt/documents.json"
# Read explicitly as UTF-8 (the standard JSON encoding) rather than the
# platform's locale default.
with open(file_name, encoding="utf-8") as json_data:
    documents = json.load(json_data)

Preprocess the list of dictionaries

In [33]:
from nltk import word_tokenize

def sublist(lst1, lst2):
    """Return True when every distinct element of lst1 also occurs in lst2.

    Order and multiplicity are ignored: this is a set-containment check,
    not a contiguous-sublist test.
    """
    wanted = set(lst1)
    available = set(lst2)
    return wanted.issubset(available)

def _find_answer_span(answer_tokens, context):
    """Return the (start, end) token indices of the first answer match, or None.

    A position matches when every answer token is a substring of the context
    token aligned with it. The alignment stops at the end of ``context``, so a
    match near the tail may cover only a prefix of the answer; the end index
    is clamped to the last context token in that case.
    """
    last = len(context) - 1
    for pos in range(len(context)):
        aligned = zip(answer_tokens, context[pos:])
        if all(ans_tok in ctx_tok for ans_tok, ctx_tok in aligned):
            return pos, min(pos + len(answer_tokens) - 1, last)
    return None


def generate_dataset(dataset, docs=None):
    """Turn raw QA records into tokenized examples with one-hot answer spans.

    For each record the answer paragraph is lower-cased and split into
    tokenized sentences; the first sentence containing every answer token
    becomes the context, and the answer position inside it is located.

    Args:
        dataset: Iterable of dicts with the keys "question", "text" (the
            answer), "docid" and "answer_paragraph".
        docs: Collection indexed by docid whose items hold a "text" list of
            paragraphs. Defaults to the notebook-level ``documents``.

    Returns:
        A list of dicts with the keys "question" and "context" (token lists)
        and "start" / "end" (0/1 lists as long as the context, with a single 1
        at the first / last answer token).

    Records are dropped, and counted in the printed total, when the answer
    tokenizes to nothing, when no sentence contains all answer tokens, or
    when no aligned position matches.
    """
    if docs is None:
        # Backward-compatible default: fall back to the notebook global.
        docs = documents

    data_list = []
    abandon_count = 0
    for x in dataset:
        question_tokens = word_tokenize(x["question"].lower())
        tokenized_ans = word_tokenize(x["text"].lower())

        if not tokenized_ans:
            # An empty answer would trivially "match" at position 0 and put
            # the end marker on the last token (index -1); drop it instead.
            abandon_count += 1
            continue

        para = docs[x["docid"]]["text"][x["answer_paragraph"]].lower()
        tokenized_sents = [word_tokenize(sent) for sent in nltk.sent_tokenize(para)]

        # First sentence that contains every answer token (order-insensitive).
        context = next(
            (sent for sent in tokenized_sents if sublist(tokenized_ans, sent)),
            None,
        )
        if not context:
            abandon_count += 1
            continue

        span = _find_answer_span(tokenized_ans, context)
        if span is None:
            abandon_count += 1
            continue

        start_pos, end_pos = span
        start = [0] * len(context)
        end = [0] * len(context)
        start[start_pos] = 1
        end[end_pos] = 1

        data_list.append({
            "question": question_tokens,
            "context": context,
            "start": start,
            "end": end,
        })

    print('The number of instances which have been removed:', abandon_count)
    return data_list

In [34]:
# Build the tokenized training examples, then persist them for later use.
training_data = generate_dataset(training_set)

with open('training_data.pickle', 'wb') as out_file:
    pickle.dump(training_data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

The number of instances which have been removed: 1797


## Tf-idf term weighting

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

In [4]:
file_name = "testing.json"
# Read explicitly as UTF-8 (the standard JSON encoding) rather than the
# platform's locale default.
with open(file_name, encoding="utf-8") as json_data:
    testing_set = json.load(json_data)

# Show only the first few records to keep the cell output short.
print(testing_set[:10])

[{'question': 'Modern browser support standards-based and defacto what?', 'docid': 410, 'id': 0}, {'question': 'What do people typically call a web browser?', 'docid': 410, 'id': 1}, {'question': 'What is it called when content is changed from markup to an interactive document?', 'docid': 410, 'id': 2}, {'question': 'What platform is a browser used on?', 'docid': 410, 'id': 3}, {'question': 'When was Firefox released?', 'docid': 410, 'id': 4}, {'question': 'Who released the Internet Explorer browser?', 'docid': 410, 'id': 5}, {'question': 'When was the first browser created?', 'docid': 410, 'id': 6}, {'question': 'HTTP Secure is supported by what?', 'docid': 410, 'id': 7}, {'question': 'Who released Mosaic?', 'docid': 410, 'id': 8}, {'question': 'Who invented the first browser?', 'docid': 410, 'id': 9}]


In [5]:
# Print the first document record to inspect its structure.
print(documents[0])

{'docid': 0, 'text': ['First recognized in 1900 by Max Planck, it was originally the proportionality constant between the minimal increment of energy, E, of a hypothetical electrically charged oscillator in a cavity that contained black body radiation, and the frequency, f, of its associated electromagnetic wave. In 1905 the value E, the minimal energy increment of a hypothetical oscillator, was theoretically associated by Einstein with a "quantum" or minimal element of the energy of the electromagnetic wave itself. The light quantum behaved in some respects as an electrically neutral particle, as opposed to an electromagnetic wave. It was eventually called the photon.', 'Classical statistical mechanics requires the existence of h (but does not define its value). Eventually, following upon Planck\'s discovery, it was recognized that physical action cannot take on an arbitrary value. Instead, it must be some multiple of a very small quantity, the "quantum of action", now called the Plan

## Get the top-N sentences for each question

In [12]:
# Number of top-ranked sentences concatenated into each question's text.
TOP_N = 2

# Group the questions by document once, instead of rescanning the whole
# testing set for every document.
questions_by_doc = {}
for question in testing_set:
    questions_by_doc.setdefault(question['docid'], []).append(question)

testing_data = []

# NOTE(review): 410 is hard-coded; the testing_set sample shows docid 410, so
# presumably the test documents start at that index -- confirm.
for doc in documents[410:]:
    doc_questions = questions_by_doc.get(doc['docid'])
    if not doc_questions:
        # Nothing to answer for this document: skip fitting a vectorizer.
        continue

    tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize, lowercase=True)

    # Rank at sentence level: flatten every paragraph into its sentences.
    raw_text = []
    for para in doc['text']:
        raw_text.extend(nltk.sent_tokenize(para))

    td_mat = tfidf.fit_transform(raw_documents=raw_text)

    for question in doc_questions:
        temp_dict = dict()

        query = tfidf.transform([question['question']])
        # Use the sparse matrix product; np.dot is not sparse-aware.
        doc_rank = query.dot(td_mat.T).toarray()

        # Indices of the TOP_N highest-scoring sentences, best first.
        idx = (-doc_rank).argsort()[0][:TOP_N]

        temp_dict["question"], temp_dict["id"] = question['question'], question["id"]
        # Each sentence keeps a leading space, so the stored text format is unchanged.
        temp_dict["text"] = "".join(" " + raw_text[i] for i in idx)

        testing_data.append(temp_dict)

In [13]:
# Persist the ranked test records for later use.
with open('testing_data.pickle', 'wb') as out_file:
    pickle.dump(testing_data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Spot-check one generated test record (index 3000 is presumably an arbitrary sample).
print(testing_data[3000])

{'question': 'Who has an active community of journalists and publishers?', 'id': 3000, 'text': ' Though situated in a remote part of the country, Himachal Pradesh has an active community of journalists and publishers. Virbhadra Singh who has held the top office in Himachal five times in the past, was administered the oath of office and secrecy by Governor Urmila Singh at an open ceremony at the historic Ridge Maidan in Shimla.'}


In [16]:
import re

# Write one "id,answer" row per test question.
with open('test.csv', 'w', encoding='utf-8') as f:
    f.write("id,answer\n")
    for t in testing_data:
        # t['text'] is a single string, so indexing it with [0] would yield
        # only its first character (a space) and every answer would be empty.
        # Use the whole retrieved text instead.
        # NOTE(review): confirm the full text, not just the top sentence, is
        # the intended answer.
        ans = re.sub(r'[^\w\s]', '', t['text'])
        # Punctuation (including commas) is removed above; collapsing all
        # whitespace keeps each record on a single two-column line.
        ans = ' '.join(ans.split())
        f.write(str(t['id']) + ',' + ans + '\n')