In [1]:
import nltk
import json
import numpy as np
import pickle

In [2]:
# Load the word-embedding matrix as float64. `np.float` was only an alias of
# the builtin float (i.e. float64); it was deprecated in NumPy 1.20 and
# removed in 1.24, so the explicit dtype is used instead.
emb_mat = np.load("word_embedding_matrix.npy").astype(np.float64)

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
with open("vocabulary.pickle", "rb") as input_file:
    voc = pickle.load(input_file)

with open("testing_data.pickle", "rb") as input_file:
    testing_data = pickle.load(input_file)

In [22]:
from nltk.corpus import stopwords

def get_word_embedding(word, voc, e_mat):
    """Return the embedding row for ``word``.

    Args:
        word: token to look up.
        voc: mapping from token to row index in ``e_mat``.
        e_mat: 2-D embedding matrix, indexed as ``e_mat[row, :]``.

    Returns:
        ``e_mat[voc[word], :]`` when ``word`` is in ``voc``; otherwise row 0,
        which serves as the fallback vector (presumably the unknown-word
        entry -- confirm against how the matrix was built).
    """
    row = voc[word] if word in voc else 0
    return e_mat[row, :]

def get_tokenize_sentences(documents):
    """Split documents into sentences and each sentence into word tokens.

    Args:
        documents: iterable of text strings.

    Returns:
        A flat list with one entry per sentence (in document order); each
        entry is the token list produced by ``nltk.word_tokenize``.
    """
    tokenized = []
    for document in documents:
        for sentence in nltk.sent_tokenize(document):
            tokenized.append(sentence)
    return [nltk.word_tokenize(sentence) for sentence in tokenized]

def get_sent_embedding(sent, voc, emb_mat):
    """Average the word embeddings of a tokenized sentence.

    Args:
        sent: sequence of tokens (``len(sent)`` must be defined).
        voc: mapping from token to row index in ``emb_mat``.
        emb_mat: 2-D embedding matrix; its second dimension is the
            embedding width.

    Returns:
        1-D array of length ``emb_mat.shape[1]`` holding the mean of the
        per-token vectors. An empty sentence yields an all-zero vector
        instead of the NaN that averaging zero rows would produce.
    """
    # Take the width from the matrix rather than assuming 50 dimensions.
    emb_dim = emb_mat.shape[1]
    if len(sent) == 0:
        return np.zeros(emb_dim)

    sent_embedding = np.zeros((len(sent), emb_dim))
    for i, word in enumerate(sent):
        sent_embedding[i, :] = get_word_embedding(word, voc, emb_mat)

    return np.mean(sent_embedding, axis=0)
    
def cos_sim(a, b):
    """Cosine similarity between two 1-D vectors.

    Args:
        a: first vector.
        b: second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        ratio is undefined (0/0 -> NaN); 0.0 is returned instead so that the
        result can be compared and ranked safely.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [23]:
# For every test record, pick the sentence of its text whose averaged word
# embedding is most similar (cosine) to the question's averaged embedding.
test_ans = []
# NOTE(review): stop_words is not used by any cell visible here.
stop_words = set(stopwords.words('english'))

for t in testing_data:
    ans = dict()

    # Sentence-split each paragraph once; `sentences` (raw text) and
    # `tokenize_sentences` (tokens) stay index-aligned with `sims`.
    sentences = []
    for para in t['text']:
        sentences.extend(nltk.sent_tokenize(para))
    tokenize_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    # word_tokenize sentence-splits internally and returns one flat token
    # list, so a question that the sentence splitter breaks into several
    # parts is used in full, and an empty question no longer raises.
    question_tokens = nltk.word_tokenize(t['question'])
    q_emb = get_sent_embedding(question_tokens, voc, emb_mat)

    sims = np.zeros((len(tokenize_sentences)))
    for i, sent in enumerate(tokenize_sentences):
        s_emb = get_sent_embedding(sent, voc, emb_mat)
        sims[i] = cos_sim(q_emb, s_emb)

    # np.argmax treats NaN as the maximum, so an undefined similarity
    # (zero-norm embedding) must not be allowed to win the ranking.
    sims[np.isnan(sims)] = -np.inf

    ans["id"] = t['id']
    # np.argmax raises ValueError on an empty array; fall back to "".
    ans["text"] = sentences[np.argmax(sims)] if sentences else ""

    test_ans.append(ans)

In [32]:
def ie_preprocess(document):
    """Run the NLTK pipeline that prepares a document for entity extraction.

    The document is split into sentences, each sentence is word-tokenized
    and POS-tagged, and the tagged sentences are passed to
    ``nltk.ne_chunk_sents`` with ``binary=True`` (entities are labelled with
    the single tag ``NE``).

    Args:
        document: text string to process.

    Returns:
        The iterable of chunked sentence trees returned by
        ``nltk.ne_chunk_sents``.
    """
    tagged_sentences = [
        nltk.pos_tag(tokens)
        for tokens in [
            nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(document)
        ]
    ]
    return nltk.ne_chunk_sents(tagged_sentences, binary=True)

def find_entities(chunks, verbose=True):
    """Collect the unique named entities from NE-chunked sentences.

    Args:
        chunks: iterable of chunked sentence trees (e.g. the result of
            ``ie_preprocess``). Each tree is iterated for its children.
        verbose: when True (the default, which keeps the previous
            behaviour) every chunk is printed as it is processed. Pass
            False to silence the output, e.g. when looping over a whole
            data set.

    Returns:
        List of entity strings without duplicates. Entities are sorted
        within each sentence, and sentences contribute in input order.
    """

    def traverse(tree):
        """Recursively collect the text of every 'NE' subtree of ``tree``."""
        # Leaf nodes (indexed below as child[0] for the word) have no
        # label() method and therefore contribute nothing.
        if not callable(getattr(tree, 'label', None)):
            return []
        if tree.label() == 'NE':
            return [' '.join(child[0] for child in tree)]

        entity_names = []
        for child in tree:
            entity_names.extend(traverse(child))
        return entity_names

    named_entities = []
    seen = set()  # constant-time duplicate check instead of scanning the list

    for chunk in chunks:
        if verbose:
            print(chunk)
        entities = sorted({name for tree in chunk for name in traverse(tree)})
        for e in entities:
            if e not in seen:
                seen.add(e)
                named_entities.append(e)

    return named_entities

In [24]:
# Load the development set. The encoding is given explicitly because JSON
# text is UTF-8, while open() would otherwise use the platform default
# (which is not UTF-8 on every system).
file_name = "devel.json"
with open(file_name, encoding="utf-8") as json_data:
    devel_set = json.load(json_data)

In [26]:
# Inspect the first test record; its printed form shows the keys
# 'question', 'id' and 'text' (a list of paragraph strings).
print(testing_data[0])

{'question': 'Modern browser support standards-based and defacto what?', 'id': 0, 'text': ['Early web browsers supported only a very simple version of HTML. The rapid development of proprietary web browsers led to the development of non-standard dialects of HTML, leading to problems with interoperability. Modern web browsers support a combination of standards-based and de facto HTML and XHTML, which should be rendered in the same way by all browsers.', 'In 1998, Netscape launched what was to become the Mozilla Foundation in an attempt to produce a competitive browser using the open source software model. That browser would eventually evolve into Firefox, which developed a respectable following while still in the beta stage of development; shortly after the release of Firefox 1.0 in late 2004, Firefox (all versions) accounted for 7% of browser use. As of August 2011, Firefox has a 28% usage share.', 'Available web browsers range in features from minimal, text-based user interfaces with 

In [33]:
# Demo: run named-entity extraction on each paragraph of the first test
# record only (the [:1] slice also tolerates an empty data set).
# The returned entity list is discarded; what this cell displays is the
# chunk tree printed inside find_entities.
for t in testing_data[:1]:
    for para in t['text']:
        find_entities(ie_preprocess(para))

(S
  Early/JJ
  web/NN
  browsers/NNS
  supported/VBD
  only/RB
  a/DT
  very/RB
  simple/JJ
  version/NN
  of/IN
  (NE HTML/NNP)
  ./.)
(S
  The/DT
  rapid/JJ
  development/NN
  of/IN
  proprietary/JJ
  web/NN
  browsers/NNS
  led/VBD
  to/TO
  the/DT
  development/NN
  of/IN
  non-standard/JJ
  dialects/NNS
  of/IN
  (NE HTML/NNP)
  ,/,
  leading/VBG
  to/TO
  problems/NNS
  with/IN
  interoperability/NN
  ./.)
(S
  (NE Modern/NNP)
  web/NN
  browsers/NNS
  support/VBP
  a/DT
  combination/NN
  of/IN
  standards-based/JJ
  and/CC
  de/FW
  facto/FW
  (NE HTML/NNP)
  and/CC
  (NE XHTML/NNP)
  ,/,
  which/WDT
  should/MD
  be/VB
  rendered/VBN
  in/IN
  the/DT
  same/JJ
  way/NN
  by/IN
  all/DT
  browsers/NNS
  ./.)
(S
  In/IN
  1998/CD
  ,/,
  (NE Netscape/NNP)
  launched/VBD
  what/WP
  was/VBD
  to/TO
  become/VB
  the/DT
  (NE Mozilla/NNP Foundation/NNP)
  in/IN
  an/DT
  attempt/NN
  to/TO
  produce/VB
  a/DT
  competitive/JJ
  browser/NN
  using/VBG
  the/DT
  open/JJ
  source/N

In [21]:
import csv
import re

# Write one "id,answer" row per selected sentence. The answer is the named
# entities found in the sentence (space-joined), or the sentence itself when
# none are found, with every non-word, non-whitespace character removed.
#
# csv.writer is used instead of manual string concatenation: the regex keeps
# whitespace, so a newline inside a fallback sentence would otherwise split
# the row. newline='' hands line-ending control to the csv module and
# lineterminator='\n' keeps the same line endings as before.
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(["id", "answer"])
    for t in test_ans:
        sentence = t['text'].strip()
        entities = find_entities(ie_preprocess(sentence))
        ans = " ".join(entities) if entities else sentence
        ans = re.sub(r'[^\w\s]', '', ans)
        writer.writerow([t['id'], ans])