## Rule based Question - Answering

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re, math
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import gensim
from gensim.models import Doc2Vec
from nltk.corpus import stopwords

In [2]:
def remove_stopwords(passage, questions):
    stop_words = set(stopwords.words("english"))
    word_passage = word_tokenize(passage)
    questions = "".join(questions)
    word_question = word_tokenize(questions)
    filtered_passage = []
    filtered_questions = []
    for w in word_passage:
        if w not in stop_words:
            filtered_passage.append(w)
    for q in word_question:
        if q not in stop_words:
            filtered_questions.append(q)
    filt_ques_str = " ".join(filtered_questions)
    filt_ques_str = [ques for ques in filt_ques_str.split("?") if ques!='']
    return " ".join(filtered_passage), filt_ques_str

In [3]:
def create_vector_passage(filt_passage):
    sent_passage = [word_tokenize(sent) for sent in sent_tokenize(filt_passage)]
    tagged_words = [nltk.pos_tag(word) for word in sent_passage]
    tagged_wordlist = [word for sentence in tagged_words for word in sentence]
    words, tags = zip(*tagged_wordlist)
    documents = gensim.models.doc2vec.TaggedDocument(list(words), list(tags))
    doc2vec_model = Doc2Vec(documents=[documents], min_count=1)
    docvec = [0] * 100
    passage_vec = []
    for i in sent_tokenize(filt_passage):
        for word in word_tokenize(i):
            docvec += doc2vec_model[word]
        vec = docvec / len(word_tokenize(i))
        passage_vec.append(np.sum(vec)/(np.shape(vec)[0]))
    return passage_vec

In [4]:
def create_vector_question(filt_question):
    sent_passage = [word_tokenize(sent) for sent in filt_question]
    tagged_words = [nltk.pos_tag(word) for word in sent_passage]
    tagged_wordlist = [word for sentence in tagged_words for word in sentence]
    words, tags = zip(*tagged_wordlist)
    documents = gensim.models.doc2vec.TaggedDocument(list(words), list(tags))
    doc2vec_model = Doc2Vec(documents=[documents], min_count=1)
    docvec = [0] * 100
    question_vec = []
    for i in filt_question:
        for word in word_tokenize(i):
            docvec += doc2vec_model[word]
        vec = docvec / len(word_tokenize(i))
        question_vec.append(np.sum(vec)/(np.shape(vec)[0]))
    return question_vec

In [5]:
def get_cosine(sentence, question):
    vec1 = Counter(word_tokenize(sentence))
    vec2 = Counter(word_tokenize(question))
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [6]:
def own_filter(question, wh):
    for key in wh.keys():
        if key in question:
            filt = wh.get(key)
    return filt

In [7]:
wh = {"What":     ["GPE", "NUMBER", "PERSON", "ORGANIZATION"], 
      "Which":    ["GPE", "ORGANIZATION"] , 
      "How many": ["NUMBER"], 
      "How much": ["NP"],
      "Who":      ["PERSON"],
      "When":     ["DATE", "TIME"] ,
      "Where":    ["LOCATION"]
     }

In [8]:
def get_candidate_answers(passage, questions):
    sentences_in_passage = sent_tokenize(passage)
    question_answer_similarity = {}
    for question in questions:
        for sentence in sentences_in_passage: 
            cosine_similarity = get_cosine(question, sentence)
            if question not in question_answer_similarity:
                question_answer_similarity[question] = (sentence, cosine_similarity)
            elif question_answer_similarity[question][1] < cosine_similarity:
                question_answer_similarity[question] = (sentence, cosine_similarity)
    return question_answer_similarity

In [9]:
def __get_candidate_answers__(passage, questions, passage_vector, question_vector):
    sentences_in_passage = sent_tokenize(passage)
    question_answer_similarity = {}
    j = 0
    for question in questions:
        i = 0
        for sentence in sentences_in_passage: 
            cosine_similarity = abs(np.subtract(passage_vector[i], question_vector[j]))
            if question not in question_answer_similarity:
                question_answer_similarity[question] = (sentence, cosine_similarity)
            elif question_answer_similarity[question][1] > cosine_similarity:
                question_answer_similarity[question] = (sentence, cosine_similarity)
            i += 1
        j += 1
    return question_answer_similarity

In [10]:
def chunk_candidate_answers(question_answers_map):
    named_entity_chunks = {}
    a = []
    for question in candidate_answers:
        answer = candidate_answers[question][0]
        a.append(answer)
        tagged_answer = nltk.pos_tag(word_tokenize(answer))
        ne_chunks = nltk.ne_chunk(tagged_answer)
        grammar = r"""
        NUMBER: {<CD>}
        NP:     {<RB><NN>}
        """
        regexp_parser = nltk.RegexpParser(grammar)
        ne_chunks = regexp_parser.parse(ne_chunks)
        named_entity_chunks[question] = ne_chunks
    for i in a:
        print(i)
    print()
    return named_entity_chunks

In [41]:
def extract_answers(chunked_answers):
    extracted_answer = []
    for question in chunked_answers:
        node = chunked_answers[question]
        filtered_answer = own_filter(question, wh)
        subtrees = list(node.subtrees(filter=lambda t: t.label() in filtered_answer))
        answer = []
        answer = [' '.join(list(zip(*tree.leaves()))[0]) for tree in subtrees]
        extracted_answer.append(answer)
    return extracted_answer

### Passage 1

In [42]:
passage_1 = """Oxygen is a chemical element.
Oxygen symbol is O.
Atomic number of periodic table for oxygen is 8.
Oxygen is a member of the chalcogen group on the periodic table.
Oxygen is a highly reactive nonmetal and oxidizing agent.
Oxygen readily forms compounds with most elements.
Hydrogen is the most abundant element in the universe.
Oxygen is the third-most abundant element in the universe.
Two atoms of the element bind to form dioxygen.
Oxygen is a colorless and odorless diatomic gas.
Diatomic oxygen gas constitutes 20.8% of the Earth atmosphere.
Oxygen is the most abundant element by mass in the Earth's crust.
Oxygen makes up almost half of the crust's mass."""

In [43]:
questions_1 = ["What is the most abundant element?",
             "What is the atomic number of the periodic table for oxygen?",
             "How many atoms combine to form dioxygen?",
             "How much oxygen makes up the Earth crust?",
             "Which gas makes up 20.8% of the Earth's atmosphere?",
             "What element makes up almost half of the earth's crust by mass?",
             "What is the atomic number of the element oxygen?"]

### By approach 1

In [44]:
filt_passage, filt_question = remove_stopwords(passage_1, questions_1)
passage_vector = create_vector_passage(filt_passage)
question_vector = create_vector_question(filt_question)
candidate_answers = __get_candidate_answers__(passage_1, questions_1, passage_vector, question_vector)
chunk_answer = chunk_candidate_answers(candidate_answers)
extracted_answers = extract_answers(chunk_answer)
for i in range(len(extracted_answers)):
    print(questions_1[i])
    print(extracted_answers[i])

Atomic number of periodic table for oxygen is 8.
Oxygen is a colorless and odorless diatomic gas.
Oxygen symbol is O.
Oxygen is a chemical element.
Atomic number of periodic table for oxygen is 8.
Atomic number of periodic table for oxygen is 8.
Oxygen is a chemical element.

What is the most abundant element?
['Atomic', '8']
What is the atomic number of the periodic table for oxygen?
['Oxygen']
How many atoms combine to form dioxygen?
[]
How much oxygen makes up the Earth crust?
['8']
Which gas makes up 20.8% of the Earth's atmosphere?
['Atomic', '8']
What element makes up almost half of the earth's crust by mass?
['Oxygen']
What is the atomic number of the element oxygen?
['Oxygen']


### By approach 2

In [45]:
candidate_answers = get_candidate_answers(passage_1, questions_1)
chunk_answer = chunk_candidate_answers(candidate_answers)
extracted_answers = extract_answers(chunk_answer)
for i in range(len(extracted_answers)):
    print(questions_1[i])
    print(extracted_answers[i])

Oxygen makes up almost half of the crust's mass.
Hydrogen is the most abundant element in the universe.
Oxygen makes up almost half of the crust's mass.
Hydrogen is the most abundant element in the universe.
Atomic number of periodic table for oxygen is 8.
Two atoms of the element bind to form dioxygen.
Diatomic oxygen gas constitutes 20.8% of the Earth atmosphere.

What is the most abundant element?
['Oxygen']
What is the atomic number of the periodic table for oxygen?
['Hydrogen']
How many atoms combine to form dioxygen?
['almost half']
How much oxygen makes up the Earth crust?
['Two']
Which gas makes up 20.8% of the Earth's atmosphere?
['Atomic', '8']
What element makes up almost half of the earth's crust by mass?
['Hydrogen']
What is the atomic number of the element oxygen?
['Diatomic']


### Passage 2

In [36]:
passage_3 = """Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. 
The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. 
The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. 
As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50."""

In [37]:
questions_3 = ["Which NFL team represented the AFC at Super Bowl 50?",
               "Where was the game played?", 
               "What was the theme of 50th Super Bowl?"]

### By approach 1

In [38]:
filt_passage, filt_question = remove_stopwords(passage_3, questions_3)
passage_vector = create_vector_passage(filt_passage)
question_vector = create_vector_question(filt_question)
candidate_answers = __get_candidate_answers__(passage_3, questions_3, passage_vector, question_vector)
chunk_answer = chunk_candidate_answers(candidate_answers)
extracted_answers = extract_answers(chunk_answer)
for i in range(len(extracted_answers)):
    print(questions_3[i])
    print(extracted_answers[i])

Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.
As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.

Which NFL team represented the AFC at Super Bowl 50?
['American', 'National Football League', 'NFL']
Where was the game played?
[]
What was the theme of 50th Super Bowl?
['7', '2016', 'Levi', 'San Francisco Bay Area', 'California']


### By approach 2

In [39]:
candidate_answers = get_candidate_answers(passage_3, questions_3)
chunk_answer = chunk_candidate_answers(candidate_answers)
extracted_answers = extract_answers(chunk_answer)
for i in range(len(extracted_answers)):
    print(questions_3[i])
    print(extracted_answers[i])  #Denver Broncos # Santa Clara, California # "golden anniversary"

Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.
Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.
As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.

Which NFL team represented the AFC at Super Bowl 50?
['American', 'National Football League', 'NFL']
Where was the game played?
[]
What was the theme of 50th Super Bowl?
['Super Bowl', 'Roman', 'Super Bowl', 'Arabic', '50']


### Paasage 3

In [40]:
passage_4 = """Steam engines are external combustion engines, where the working fluid is separate from the combustion products. 
Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. 
The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle.
In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. 
When expanded through pistons or turbines, mechanical work is done. 
The reduced-pressure steam is then condensed and pumped back into the boiler."""

In [26]:
questions_4 = ["What is a non-combustion heat source, along with geothermal and nuclear ?",
               "What types of engines are steam engines?"]

### By approach 1

In [27]:
filt_passage, filt_question = remove_stopwords(passage_4, questions_4)
passage_vector = create_vector_passage(filt_passage)
question_vector = create_vector_question(filt_question)
candidate_answers = __get_candidate_answers__(passage_4, questions_4, passage_vector, question_vector)
chunk_answer = chunk_candidate_answers(candidate_answers)
extracted_answers = extract_answers(chunk_answer)
for i in range(len(extracted_answers)):
    print(questions_4[i])
    print(extracted_answers[i])

Steam engines are external combustion engines, where the working fluid is separate from the combustion products.
Steam engines are external combustion engines, where the working fluid is separate from the combustion products.

What is a non-combustion heat source, along with geothermal and nuclear ?
['Steam']
What types of engines are steam engines?
['Steam']


### By approach 2

In [28]:
candidate_answers = get_candidate_answers(passage_4, questions_4)
chunk_answer = chunk_candidate_answers(candidate_answers)
extracted_answers = extract_answers(chunk_answer)
for i in range(len(extracted_answers)):
    print(questions_4[i])
    print(extracted_answers[i])

In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure.
Steam engines are external combustion engines, where the working fluid is separate from the combustion products.

What is a non-combustion heat source, along with geothermal and nuclear ?
[]
What types of engines are steam engines?
['Steam']
