In [1]:
import json
import pickle
import re

import nltk
import numpy as np
from tqdm import tqdm

In [2]:
# Read the pickled test examples from the working directory into `testing_data`.
with open("testing_data.pickle", "rb") as testing_fh:
    testing_data = pickle.load(testing_fh)

In [None]:
# Load the word-embedding matrix as float64. `np.float` was only an alias for
# the builtin `float` and has been removed from NumPy (1.24+), where it raises
# AttributeError, so the dtype is named explicitly.
emb_mat = np.load("word_embedding_matrix.npy").astype(np.float64)

# Load the vocabulary object that is used to look up rows of `emb_mat`.
# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open("vocabulary.pickle", "rb") as input_file:
    voc = pickle.load(input_file)

In [None]:
# Read the pickled training examples from the working directory into `training_data`.
with open("training_data.pickle", "rb") as training_fh:
    training_data = pickle.load(training_fh)

In [None]:
# Shuffle the training examples in place. A fixed seed makes the resulting
# order identical on every "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(training_data)

In [None]:
# Show the first training example.
first_training_example = training_data[0]
print(first_training_example)

In [None]:
def get_word_embedding(word, voc, e_mat):
    """Return the row of ``e_mat`` that belongs to ``word``.

    ``voc[word]`` is used as the row index when ``word`` is in ``voc``;
    any other word falls back to row 0.
    """
    row = voc[word] if word in voc else 0
    return e_mat[row, :]

def get_tokenize_sentences(documents):
    """Split documents into sentences and word-tokenize each sentence.

    Args:
        documents: iterable of text strings.

    Returns:
        A list with one entry per sentence, in document order. Each entry is
        the list of word tokens of that sentence. A sentence that ends up
        with no tokens is kept as an empty list, so entry ``i`` still lines
        up with the ``i``-th sentence that ``nltk.sent_tokenize`` yields
        over the same documents.
    """
    tokenized = []

    for doc in documents:
        for sent in nltk.sent_tokenize(doc):
            # Drop leading/trailing periods and a few punctuation marks
            # before word-tokenizing.
            sent = sent.strip(".")
            sent = re.sub(r'[,;":\']', '', sent)
            # append, not extend: callers score and index the result
            # sentence by sentence, so tokens must stay grouped.
            tokenized.append(nltk.word_tokenize(sent))

    return tokenized

def get_sent_embedding(sent, voc, emb_mat):
    """Embed a tokenized sentence as the mean of its word embeddings.

    Args:
        sent: sequence of word tokens.
        voc: vocabulary, passed through to ``get_word_embedding``.
        emb_mat: 2-D embedding matrix; its second dimension is the
            embedding size.

    Returns:
        1-D float array of length ``emb_mat.shape[1]``. An empty ``sent``
        yields an all-zero vector rather than the NaNs that averaging zero
        rows would produce.
    """
    # Take the width from the matrix instead of assuming 50 columns.
    emb_dim = emb_mat.shape[1]
    if len(sent) == 0:
        return np.zeros(emb_dim)

    word_embeddings = np.zeros((len(sent), emb_dim))
    for i, word in enumerate(sent):
        word_embeddings[i, :] = get_word_embedding(word, voc, emb_mat)

    return np.mean(word_embeddings, axis=0)
    
def cos_sim(a, b):
    """Cosine similarity of two 1-D vectors.

    Returns ``dot(a, b) / (|a| * |b|)``. When that is undefined because the
    product of the norms is zero or not finite (an all-zero vector, or NaN /
    inf components), 0.0 is returned instead of NaN, so that a later
    ``np.argmax`` over similarities is not hijacked by a NaN entry.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0 or not np.isfinite(denom):
        return 0.0
    return np.dot(a, b) / denom

In [None]:
test_ans = []

# For each example, pick the passage sentence whose embedding is most similar
# (cosine) to the question embedding. Only the first three test examples are
# processed here.
for t in testing_data[:3]:
    ans = dict()
    tokenize_sentences = get_tokenize_sentences(t['text'])
    tokenize_question = get_tokenize_sentences([t['question']])

    # Raw sentences, used to turn the best-scoring index back into text.
    sentences = []
    for para in t['text']:
        sentences.extend(nltk.sent_tokenize(para))

    ans["id"] = t['id']
    ans['question'] = t['question']

    if not tokenize_question or not tokenize_sentences or not sentences:
        # Nothing to compare: tokenize_question[0] would raise IndexError on
        # an empty question and np.argmax raises ValueError on an empty array.
        ans["text"] = ""
        test_ans.append(ans)
        continue

    q_emb = get_sent_embedding(tokenize_question[0], voc, emb_mat)

    # NOTE(review): sims[i] is used below to index `sentences`, which assumes
    # entry i of tokenize_sentences corresponds to sentences[i] -- confirm.
    sims = np.zeros((len(tokenize_sentences)))
    for i, sent in enumerate(tokenize_sentences):
        s_emb = get_sent_embedding(sent, voc, emb_mat)
        sims[i] = cos_sim(q_emb, s_emb)

    print(sims)

    ans["text"] = sentences[np.argmax(sims)]
    test_ans.append(ans)

In [None]:
# Show the test example stored at index 2000.
inspected_example = testing_data[2000]
print(inspected_example)

In [None]:
def traverse(tree):
    """Collect the text of every node labelled "NP" in a tree.

    A node whose ``label()`` equals "NP" contributes a single string: the
    first element of each of its children, joined by spaces. Such a node is
    not searched any further. Every other labelled node is searched
    recursively through its children. An object without a ``label``
    attribute contributes nothing.

    Returns:
        List of the collected strings, in traversal order.
    """
    if not (hasattr(tree, 'label') and tree.label):
        return []

    if tree.label() == "NP":
        return [' '.join(child[0] for child in tree)]

    found = []
    for subtree in tree:
        found += traverse(subtree)
    return found

In [1]:
import spacy
# Load the spaCy pipeline that later cells call as `nlp(...)`.
spacy_model_name = 'en_core_web_lg'
nlp = spacy.load(spacy_model_name)

In [4]:
def check_query(ent_lst, q):
    """Return True if any keyword or phrase in ``ent_lst`` occurs in ``q``.

    ``q`` is word-tokenized with NLTK. Each entry of ``ent_lst`` is split on
    whitespace and has to appear as a run of consecutive tokens of ``q``.
    Single-word entries therefore behave like a plain token-membership test,
    and multi-word entries such as "how many" can match as well (a membership
    test against single tokens never matches them). Matching is
    case-sensitive.

    Args:
        ent_lst: iterable of keyword / phrase strings.
        q: the query string.

    Returns:
        bool
    """
    q_tokens = nltk.word_tokenize(q)

    def occurs(phrase):
        words = phrase.split()
        width = len(words)
        if width == 0:
            return False
        last_start = len(q_tokens) - width
        return any(q_tokens[i:i + width] == words for i in range(last_start + 1))

    return any(occurs(entry) for entry in ent_lst)

In [6]:
import nltk
import re


# Ordered (question keywords, accepted entity labels) rules; the first rule
# whose keywords match the question decides which entities are kept.
# NOTE(review): recent spaCy English models appear to emit "FAC" rather than
# "FACILITY" -- confirm against the installed model.
QUESTION_RULES = [
    (["who", "organization"], {"ORG", "PERSON", "NORP"}),
    (["when", "time", "month", "day", "year"], {"DATE", "TIME", "CARDINAL"}),
    (["where", "place", "city", "country"], {"GPE", "LOC", "FACILITY", "ORG"}),
    (["how much", "how many"], {"PERCENT", "QUANTITY", "CARDINAL", "MONEY"}),
]

test_ans = []

for t in tqdm(testing_data):
    query = t["question"].lower()
    query_tokens = set(nltk.word_tokenize(query))

    doc = nlp(t["text"])

    # next() stops at the first matching rule, like the if/elif chain it replaces.
    wanted_labels = next(
        (labels for keywords, labels in QUESTION_RULES if check_query(keywords, query)),
        None,
    )

    if wanted_labels is None:
        # No rule matched: keep only entities whose label is empty.
        # NOTE(review): this probably never selects anything if every entity
        # carries a label -- confirm whether "all entities" was intended.
        candidates = [ent.text for ent in doc.ents if not ent.label_]
    else:
        candidates = [ent.text for ent in doc.ents if ent.label_ in wanted_labels]

    if not candidates:
        # Fall back to noun chunks. The text added must be the chunk's own
        # text, not the leftover `ent` variable from an earlier loop.
        candidates = [chunk.text for chunk in doc.noun_chunks]

    # De-duplicate in document order (joining a set gives a run-dependent
    # order), then delete the entities which already appear in the query.
    # The query was lower-cased, so the comparison is made case-insensitively.
    kept = [
        text for text in dict.fromkeys(candidates)
        if text.lower() not in query_tokens
    ]

    answer = dict()
    answer["id"] = t['id']
    answer["text"] = " ".join(kept)

    test_ans.append(answer)
    

100%|██████████| 3618/3618 [01:25<00:00, 42.49it/s]


In [8]:
# Show the test example stored at index 2205.
example_2205 = testing_data[2205]
print(example_2205)

{'question': 'If one samples a continental group, what do the clusters become?', 'id': 2205, 'text': ' When one samples continental groups, the clusters become continental; if one had chosen other sampling patterns, the clustering would be different.'}


In [7]:
import re

# Write test.csv: a header row, then one "id, answer" row per entry of test_ans.
# The encoding is explicit because \w keeps non-ASCII word characters, which
# the platform default encoding may not be able to write.
with open('test.csv', 'w', encoding='utf-8') as f:
    f.write("id,answer\n")
    for t in test_ans:
        # Strip everything that is neither a word character nor whitespace
        # (this removes commas too, so an answer cannot add extra columns).
        ans = re.sub(r'[^\w\s]', '', t["text"])
        # \s also keeps line breaks; fold them into spaces so that each
        # record stays on a single row.
        ans = re.sub(r'[\r\n]+', ' ', ans)
        f.write(str(t['id']) + ', ' + ans + '\n')