In [361]:
import pandas as pd
import numpy as np
import re, string, random

#packages
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import Tree

# LDA Model
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
from gensim.models import CoherenceModel
import spacy

#sklearn & gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



### 1. Building Question Classifier

In [362]:
training_data = pd.read_csv("Question_Classification_Dataset.csv")
training_data = training_data[["Questions", "Category0"]]
training_data = training_data.rename(columns={"Category0": "class"})
training_data

Unnamed: 0,Questions,class
0,How did serfdom develop in and then leave Russ...,DESCRIPTION
1,What films featured the character Popeye Doyle ?,ENTITY
2,How can I find a list of celebrities ' real na...,DESCRIPTION
3,What fowl grabs the spotlight after the Chines...,ENTITY
4,What is the full form of .com ?,ABBREVIATION
...,...,...
5447,What 's the shape of a camel 's spine ?,ENTITY
5448,What type of currency is used in China ?,ENTITY
5449,What is the temperature today ?,NUMERIC
5450,What is the temperature for cooking ?,NUMERIC


In [363]:
def produce_tdm(df, specific_class):
    D_docs = [row['Questions'] for index,row in training_data.iterrows() if row['class'] == specific_class]


    vec_D = CountVectorizer()
    X_D = vec_D.fit_transform(D_docs)
    tdm_D = pd.DataFrame(X_D.toarray(), columns=vec_D.get_feature_names())

    return tdm_D, vec_D, X_D

In [364]:
tdm_D, vec_D, X_D  = produce_tdm(training_data, "DESCRIPTION")
tdm_E, vec_E, X_E = produce_tdm(training_data, "ENTITY")
tdm_A, vec_A, X_A = produce_tdm(training_data, "ABBREVIATION")
tdm_H, vec_H, X_H = produce_tdm(training_data, "HUMAN")
tdm_N, vec_N, X_N = produce_tdm(training_data, "NUMERIC")
tdm_L, vec_L, X_L = produce_tdm(training_data, "LOCATION")

In [365]:
def produce_freq(vec, X):
    word_list = vec.get_feature_names()
    count_list = X.toarray().sum(axis=0) 
    freq = dict(zip(word_list,count_list))
    freq

    return freq, count_list, word_list

In [366]:
freq_D, count_list_D, word_list_D = produce_freq(vec_D, X_D)
freq_E, count_list_E, word_list_E = produce_freq(vec_E, X_E)
freq_A, count_list_A, word_list_A = produce_freq(vec_A, X_A)
freq_H, count_list_H, word_list_H = produce_freq(vec_H, X_H)
freq_N, count_list_N, word_list_N = produce_freq(vec_N, X_N)
freq_L, count_list_L, word_list_L = produce_freq(vec_L, X_L)

In [367]:
def get_prob(count_list, word_list):
    prob = []
    for count in count_list:
        prob.append(count/len(word_list))
    return dict(zip(word_list, prob))

In [368]:
prob_D = get_prob(count_list_D, word_list_D)
prob_E = get_prob(count_list_E, word_list_E)
prob_A = get_prob(count_list_A, word_list_A)
prob_H = get_prob(count_list_H, word_list_H)
prob_N = get_prob(count_list_N, word_list_N)
prob_L = get_prob(count_list_L, word_list_L)


In [369]:
docs = [row['Questions'] for index,row in training_data.iterrows()]

vec = CountVectorizer()
X = vec.fit_transform(docs)

total_features = len(vec.get_feature_names())
total_features

8412

In [370]:
total_cnts_features_D = count_list_D.sum(axis=0)
total_cnts_features_E = count_list_E.sum(axis=0)
total_cnts_features_A = count_list_A.sum(axis=0)
total_cnts_features_H = count_list_H.sum(axis=0)
total_cnts_features_N = count_list_N.sum(axis=0)
total_cnts_features_L = count_list_L.sum(axis=0)

In [371]:
def get_prob_with_qns(new_word_list, freq, total_cnts_features, total_features):
    prob_with_ls = []
    for word in new_word_list:
        if word in freq.keys():
            count = freq[word]
        else:
            count = 0
        prob_with_ls.append((count + 1)/(total_cnts_features + total_features))
    output = dict(zip(new_word_list,prob_with_ls))
    value_list = output.values()
    value_list
    
    prob = 1
    for each in value_list:
        prob *= each
    return prob

In [372]:
def classify_qns(qns):
    new_word_list = word_tokenize(qns)
    
    prob_D = get_prob_with_qns(new_word_list, freq_D, total_cnts_features_D, total_features)
    prob_E = get_prob_with_qns(new_word_list, freq_E, total_cnts_features_E, total_features)
    prob_A = get_prob_with_qns(new_word_list, freq_A, total_cnts_features_A, total_features)
    prob_H = get_prob_with_qns(new_word_list, freq_H, total_cnts_features_H, total_features)
    prob_N = get_prob_with_qns(new_word_list, freq_N, total_cnts_features_N, total_features)
    prob_L = get_prob_with_qns(new_word_list, freq_L, total_cnts_features_L, total_features)

    prob = [prob_D, prob_E, prob_A, prob_H, prob_N, prob_L]
    classes = ["DESCRIPTION", "ENTITY", "ABBREVIATION", 'HUMAN', "NUMERIC", "LOCATION"]
    return(classes[prob.index(max(prob))], max(prob))

In [373]:
classify_qns('How much is the book')

('NUMERIC', 1.2506703038627668e-14)

In [374]:
classify_qns('where is singapore')

('LOCATION', 2.135747124966801e-08)

In [375]:
classify_qns('who is beyonce')

('HUMAN', 1.777338870239113e-08)

In [376]:
classify_qns('what colour is the sky')

('DESCRIPTION', 1.7397241521486881e-13)

In [377]:
classify_qns('How much should i invest in The Bank of England')

('NUMERIC', 2.8536498134118726e-35)

### 2. Formulating Query

In [464]:
def formulate_query(qns):
    qns_head = qns.split()[0]
    ner_gpe = get_continuous_chunks(qns, "GPE")
    ner_person = get_continuous_chunks(qns, "PERSON")
    ner_org = get_continuous_chunks(qns, "ORGANIZATION")
    ans_type = classify_qns(qns)
    return [[qns_head], ner_gpe, ner_person, ner_org, ans_type]
#     return {"qns_head":qns_head,
#             "ner_gpe": ner_gpe,
#             "ner_person": ner_person,
#             "ner_org": ner_org,
#             "ans_type": ans_type
#     }

In [381]:
def get_continuous_chunks(text, label):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
#     print(chunked)
    prev = None
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree and subtree.label() == label:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
#             print('current_chunk', current_chunk)
        if current_chunk:
            named_entity = " ".join(current_chunk)
#             print('named', named_entity)
#             print('continuous', continuous_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

In [461]:
query = formulate_query('How much is the annual salary in New York city?')
query

{'qns_head': 'How',
 'ner_gpe': ['New York'],
 'ner_person': [],
 'ner_org': [],
 'ans_type': ('NUMERIC', 4.522772464857608e-36)}

### 3. Answer Retrieval by Cosine Similarity

In [419]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations and special characters

In [420]:
def compute_similarity(cleaned_sent_lower):
    count_vectorizer = CountVectorizer(stop_words='english')
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(cleaned_sent_lower)
    
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names())
    cosim = cosine_similarity(df, df)
    return cosim

In [421]:
def get_top3(cosim):
    top3prob = np.sort(cosim[-1])[::-1][1:4]
    top3docs = []
    for prob in top3prob:
        doc_num = np.where(cosim[-1] == prob)[0][0]
#         print("Doc:", doc_num, ", Cosine:", prob)
#         print(cleaned_sent_lower[doc_num])
        top3docs.append(cleaned_sent_lower[doc_num])
    return top3docs


### 4. Evaluate answer 

In [422]:
def evaluate_ans_1(query_ans_type, top3docs):
    output = {0:0, 1:0, 2:0}
    if query_ans_type[0] == 'NUMERIC':
        index = 0
        for each in top3docs:
            r1 = re.findall(r"[0-9]+",each)
    #         print(r1)
            if r1!=[]:
                output[index] = 1
            index +=1

    elif query_ans_type[0] == 'LOCATION':
        index = 0
        for each in top3docs:
            if get_continuous_chunks(each, "GPE") != []:
                output[index] = 1
            index +=1

    elif query_ans_type[0] == 'HUMAN':
        index = 0
        for each in top3docs:
            if get_continuous_chunks(each, "PEOPLE") != []:
                output[index] = 1
            index +=1
    return output    

In [423]:
def evaluate_ans_2(query_keywords, top3docs, output):
    for each in query_keywords:
        index = 0
        for doc in top3docs:
            if each in doc:
                output[index] += 1
            index += 1
    return output

In [443]:
def get_final_doc(top3docs, output):
    max_value = max(output.values())  # maximum value
    max_keys = [k for k, v in output.items() if v == max_value] # getting all keys containing the `maximum`
    return top3docs[max_keys[0]]

### Discourse

### 4. Shorten answer

### 5. Generate answer template

### 6. Check for Semantic answer 

## Testing with a random financial article context

In [457]:
def ans_qns(context, qns):
    text = str(context.read())
    sentences = text.split(".")

    #qns analysis
    query = formulate_query(qns)
    query_keywords = query[1] + query[2] + query[3]
    query_ans_type = query[4]
    query_ans_type
    
    # Remove trailing \n
    cleaned_sent_lower = [sent.replace("\n", "") for sent in sentences]

    #add test_doc
    cleaned_sent_lower.append(qns)

    #compute similarity
    cosim = compute_similarity(cleaned_sent_lower)
    
    #gettop3 docs
    top3docs = get_top3(cosim)

    # get evaluated ans I
    output = evaluate_ans_1(query_ans_type, top3docs)
#     print(output)
    
     # get evaluated ans II
    output = evaluate_ans_2(query_keywords, top3docs, output)
#     print(output)
    
    #get ans
    final_doc = get_final_doc(top3docs, output)
    return final_doc

In [458]:
context = open("context.txt", encoding="utf8")
qns = 'How much is the annual salary in New York city?' #works when qns and ask are near to each other
ans_qns(context, qns)

' An annual salary of $35,000 in New York City, for example, would leave you with around $27,490 after federal taxes without exemptions for the 2020-2021 filing season—about $2,291 a month'

In [465]:
context = open("context.txt", encoding="utf8")
qns = 'What is Disability income insurance?' #works when qns and ask are near to each other
ans_qns(context, qns)

' Disability income insurance protects your greatest asset—the ability to earn an income—by providing you with a steady income if you ever become unable to work for an extended period of time due to illness or injury'

In [466]:
context = open("context.txt", encoding="utf8")
qns = 'Why should I start saving for retirement?' #limitations: pick up line that is most similar but not neccesarily the ans
ans_qns(context, qns)

' Fifthly, Start Saving for Retirement'

In [467]:
context = open("context.txt", encoding="utf8")

qns = 'What insurance will I get when I am employed?' #limitations: you're, I perspective
ans_qns(context, qns)

" Read the policy carefully to see what's covered and what isn't"