In [63]:
import numpy as np
import json
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

stop_words = set(stopwords.words('english'))
def removeStopw(tokenized_sent):
    return [word for word in tokenized_sent if word.lower() not in stop_words]

<b>Load data</b>

In [71]:
f = open("./data/training.json", encoding='utf-8')
js = json.load(f)
train_qs = [ [ word for word in word_tokenize(item['question'])] for item in js]
train_texts = [item['text'] for item in js]
train_aps = [item['answer_paragraph'] for item in js]
train_docids = [item['docid'] for item in js]
f.close()

In [7]:
f = open("./data/testing.json", encoding='utf-8')
js = json.load(f)
test_qs = [item['question'] for item in js]
test_docids = [item['docid'] for item in js]
f.close()
print(test_qs[0])

Modern browser support standards-based and defacto what?


In [29]:
f = open("./data/documents.json", encoding='utf-8')
js = json.load(f)
docs = [ item['text'] for item in js]
docids = [item['docid'] for item in js]
f.close()
print(docs[0][0])

First recognized in 1900 by Max Planck, it was originally the proportionality constant between the minimal increment of energy, E, of a hypothetical electrically charged oscillator in a cavity that contained black body radiation, and the frequency, f, of its associated electromagnetic wave. In 1905 the value E, the minimal energy increment of a hypothetical oscillator, was theoretically associated by Einstein with a "quantum" or minimal element of the energy of the electromagnetic wave itself. The light quantum behaved in some respects as an electrically neutral particle, as opposed to an electromagnetic wave. It was eventually called the photon.


<b>Locate answer paragraph with TF-IDF</b>

In [173]:
TfidfVectorizers = []
tfidf_doc_mats = []
for doc in docs:
    tfidf = TfidfVectorizer(tokenizer=word_tokenize, lowercase=True)
    tfidf_doc_mats.append(tfidf.fit_transform(doc))
    TfidfVectorizers.append(tfidf)

test_context_sents = []
for i,query in enumerate(test_qs):
    docid = test_docids[i]
    tfidf_query = TfidfVectorizers[docid].transform([query])
    relativities = np.dot(tfidf_query, tfidf_doc_mats[docid].T)
    index = np.argmax(relativities)
    test_context_sents.append(docs[docid][index])
print(test_context_sents[0])

Early web browsers supported only a very simple version of HTML. The rapid development of proprietary web browsers led to the development of non-standard dialects of HTML, leading to problems with interoperability. Modern web browsers support a combination of standards-based and de facto HTML and XHTML, which should be rendered in the same way by all browsers.


In [56]:
#contexts = [word_tokenize(test_context) for test_context in test_contexts]
queries = [word_tokenize(query) for query in test_qs]
print(queries[0])

['Modern', 'browser', 'support', 'standards-based', 'and', 'defacto', 'what', '?']


<b>Identify NER in test contexts</b>

In [174]:
answers = []
for context in test_context_sents:
    doc = nlp(context)
    
    answer = set()
    for ent in doc.ents:
        answer.add(ent.text)
    answers.append(answer)
answers

[{'HTML', 'XHTML'},
 set(),
 {'CSS', 'HTML', 'Java', 'XML'},
 set(),
 {'1998',
  '28%',
  '7%',
  'August 2011',
  'Firefox',
  'Firefox 1.0',
  'Netscape',
  'late 2004',
  'the Mozilla Foundation'},
 {'1995', '2002', 'Microsoft', 'Mosaic', 'Windows', 'first', 'over 95%'},
 {'1990',
  'Berners-Lee',
  'Nexus',
  'Tim Berners-Lee',
  'WorldWideWeb',
  'first',
  'the World Wide Web Consortium',
  'the World Wide Web Foundation'},
 {'HTTP Secure'},
 {'1993',
  '1993 –',
  '1994',
  '90%',
  'Andreesen',
  'Andreessen',
  'Marc Andreessen',
  'Mosaic',
  'NCSA',
  'National Center for Supercomputing Applications',
  'Netscape',
  'Netscape Navigator',
  'first',
  'the 1990s',
  'the Mosaic-',
  'the World Wide Web'},
 {'1990',
  'Berners-Lee',
  'Nexus',
  'Tim Berners-Lee',
  'WorldWideWeb',
  'first',
  'the World Wide Web Consortium',
  'the World Wide Web Foundation'},
 {'January 2009',
  'Microsoft',
  'Microsoft Corp v Commission',
  'Windows',
  'the European Commission'},
 {'199

In [168]:
def in_query(tips, i):
    for tip in tips:
        if test_qs[i].find(tip) != -1:
            return True
    return False

answers = []
for i, context in enumerate(test_contexts):
    doc = nlp(context)
    
    answer = {}
    answer['text'] = set()
    
    if in_query(["who", "organization"], i):
        for ent in doc.ents:
            if ent.label_ in {"ORG", "PERSON", "NORP"}:
                answer['text'].add(ent.text)
    elif in_query(["when", "time", "month", "day", "year", "was", "did", "how"], i):
        for ent in doc.ents:
            if ent.label_ in {"DATE", "TIME", "CARDINAL"}:
                answer['text'].add(ent.text)
    elif in_query(["where", "place", "city", "country"], i):
        for ent in doc.ents:
            if ent.label_ in {"GPE", "LOC", "FACILITY", "ORG"}:
                answer['text'].add(ent.text)
    elif in_query(["much", "many"], i):
        for ent in doc.ents:
            if ent.label_ in {"PERCENT", "QUANTITY", "CARDINAL", "MONEY"}:
                answer['text'].add(ent.text)
    else:
        for ent in doc.ents:
            if not ent.label_:
                answer['text'].add(ent.text)

    if not answer['text']:
        for chunk in doc.noun_chunks:
            answer['text'].add(ent.text)
    
    answer["text"] = answer
    # delete the entities which already appear in query
    answer["text"] = " ".join(list(answer["text"] - set(queries[i])))
        
    answers.append(answer)
answers

TypeError: 'set' object is not subscriptable

In [167]:
import csv
import re
i=0
with open('result.csv', 'w') as f:
    f.write("id,answer\n")
    for t in answers:
        ans = re.sub(r'[^\w\s]', '', t["text"])
        f.write(str(i) + ', ' + ans + '\n')
        i+=1