In [1]:
import json
import nltk
import re
import operator
from nltk import word_tokenize
from math import log
from collections import defaultdict, Counter 
from nltk.corpus import stopwords

# Shared NLTK helper objects, built once when this cell runs.
# `tokenizer` and this module-level `lemmatizer` are not referenced by the
# functions visible in this notebook (lemmatize() builds its own instance).
tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus module
# to a plain set of English stop words; from here on `stopwords` is the set.
stopwords = set(stopwords.words('english'))

def load_json(file):
    '''
    Parse the JSON document stored at path `file` and return the resulting object.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default text encoding.
    '''
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

''' 
PROCESS SENTENCES 
for each doc, tokenize terms, remove stop words and non-alphanumeric characters,
then stem and lemmatize each term
'''
def lemmatize(word):
    '''
    Return the WordNet lemma of `word`.

    The word is lemmatized as a verb first; only when that leaves it
    unchanged is the noun lemma returned instead.
    '''
    wordnet = nltk.stem.wordnet.WordNetLemmatizer()
    as_verb = wordnet.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return wordnet.lemmatize(word, 'n')

def lemmatize_document(document):
    '''
    Lower-case and lemmatize every alphanumeric token of `document`.

    Tokens for which str.isalnum() is false (punctuation, hyphenated words,
    empty strings, ...) are dropped, so the result may be shorter than the input.
    '''
    return [lemmatize(token.lower()) for token in document if token.isalnum()]

def remove_non_letters(doc):
    '''
    Replace every run of characters other than ASCII letters, digits,
    '-' and '%' with a single space, then trim the ends of the string.
    '''
    return re.sub(r'[^a-zA-Z0-9\-%]+', ' ', doc).strip()

def process_token(token):
    '''Normalise one token: lower-case it, Porter-stem it, then lemmatize the stem.'''
    stemmed = stemmer.stem(token.lower())
    return lemmatize(stemmed)

def extract_terms(doc):
    '''
    Turn raw text into a list of normalised index terms.

    Steps: strip unwanted characters, tokenize, drop stop words, then
    lower-case / stem / lemmatize each remaining token via process_token().

    The stop-word test is done on the lower-cased token. The stop-word set is
    all lower-case, so a case-sensitive test lets capitalised stop words such
    as a sentence-initial "The" through, which process_token() would then turn
    into the term "the". Side effect: all-caps tokens that spell a stop word
    (e.g. "IT") are now dropped as well.
    '''
    cleaned = remove_non_letters(doc)
    return [
        process_token(token)
        for token in word_tokenize(cleaned)
        if token.lower() not in stopwords
    ]

def get_term_frequencies(documents):
    '''
    Count term occurrences per sentence for a collection.

    Returns a pair (tf, total_docs) where tf[term][doc_id] is the number of
    times `term` occurs in documents[doc_id] and total_docs is len(documents).

    doc_id is the position of the sentence in `documents`. It is taken from
    enumerate() rather than documents.index(doc): index() is a linear scan per
    sentence and always returns the first position of a repeated sentence,
    which left later duplicates without postings and inflated the counts of
    the first one.
    '''
    tf = defaultdict(dict)
    for doc_id, doc in enumerate(documents):
        for term in extract_terms(doc):
            tf[term][doc_id] = tf[term].get(doc_id, 0) + 1
    return tf, len(documents)

def get_tfidf(tf, total_docs):
    '''
    Build TF*IDF weights from a term-frequency index.

    tf          maps term -> {doc_id: raw frequency}
    total_docs  is N, the number of documents in the collection
    The weight of a term in a document is freq * log(N / df), with df the
    number of documents containing the term. Returns term -> {doc_id: weight}.
    '''
    weights = defaultdict(dict)
    for term, postings in tf.items():
        document_frequency = len(postings)
        for doc_id, freq in postings.items():
            inverse_df = log(total_docs / document_frequency)
            weights[term][doc_id] = float(freq) * inverse_df
    return weights

def retrieve_sentence(tfidf, query):
    '''
    Rank documents against `query`.

    `tfidf` maps term -> {doc_id: weight}. The query is normalised with
    extract_terms(); each document's score is the sum of the weights of the
    query terms it contains. Returns a list of (doc_id, score) tuples, highest
    score first; documents sharing no term with the query are absent.
    '''
    totals = {}
    for term in extract_terms(query):
        if term not in tfidf:
            continue
        for doc_id, weight in tfidf[term].items():
            totals[doc_id] = totals.get(doc_id, 0) + weight
    return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

def get_okapibm25(tf, total_docs, documents):
    '''
    Calculate and return term weights based on Okapi BM25.

    tf          maps term -> {doc_id: raw frequency}
    total_docs  is N, the number of documents in the collection
    documents   is the collection itself; a document's length is len() of its
                entry (characters, when the entries are raw sentence strings)

    weight(term, doc) = idf * tf_part * query_part, where
        idf        = log((N - df + 0.5) / (df + 0.5))
        tf_part    = (k1 + 1) * f / (k1 * ((1 - b) + b * dl / avdl) + f)
        query_part = (k3 + 1) * qtf / (k3 + qtf)

    Returns term -> {doc_id: weight}; an empty mapping when the collection is
    empty or every document has length zero.

    NOTE: with this idf a term occurring in more than half of the documents
    gets a negative weight.
    '''
    k1, b, k3 = 1.5, 0.5, 0
    okapibm25 = defaultdict(dict)

    # Without any document length there is no average to normalise by
    # (and previously an empty collection raised ZeroDivisionError).
    total_length = sum(len(d) for d in documents)
    if total_length == 0:
        return okapibm25
    avg_doc_length = total_length / len(documents)

    # The query term frequency is fixed rather than counted from the question,
    # so the query-side factor is the same for every term and is computed once.
    qtf = 1.2
    query_part = ((k3 + 1) * qtf) / (k3 + qtf)

    for term, doc_list in tf.items():
        df = len(doc_list)
        # Parenthesised denominator: the 0.5 smoothing belongs to df.
        idf = log((total_docs - df + 0.5) / (df + 0.5))
        for doc_id, freq in doc_list.items():
            length_norm = (1 - b) + b * (len(documents[doc_id]) / avg_doc_length)
            # k1 scales only the length normalisation; freq is added outside it.
            tf_part = ((k1 + 1) * freq) / (k1 * length_norm + freq)
            okapibm25[term][doc_id] = idf * tf_part * query_part

    return okapibm25

In [3]:

from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger
from nltk import word_tokenize
from nltk import sent_tokenize
import timeit
import time

# Locations of the Stanford NER jar and the English CRF model file,
# both given relative to the current working directory.
jar = './stanford-ner.jar'
model = './english.all.3class.distsim.crf.ser.gz'
# Shared tagger instance; entity_recognize() uses it through the global `st`.
st = StanfordNERTagger(model, jar)

def check_uppercase(word):
    '''Return True when the first character of `word` is an upper-case letter.'''
    return word[0].isupper()

def entity_recognize(sentences):
    '''
    Tag every token of every sentence with an answer-type label.

    Sentences have '/' removed, are tokenized and sent to the Stanford tagger
    `st` in one batch. The tagger's labels are then adjusted per token:
      * a capitalised token tagged 'O' that is not the first token -> 'OTHER'
      * an all-digit token                                        -> 'NUMBER'
      * 'ORGANIZATION'                                            -> 'OTHER'
      * anything else keeps the tagger's label
    Returns one list of (word, label) tuples per input sentence.
    '''
    tokenized = [word_tokenize(sent.replace('/','')) for sent in sentences]

    result = []
    for tagged_sentence in st.tag_sents(tokenized):
        relabelled = []
        for position, (word, tag) in enumerate(tagged_sentence):
            # The capitalisation rule is skipped for the first token, which is
            # capitalised simply because it starts the sentence.
            if position != 0 and check_uppercase(word) and tag == 'O':
                label = 'OTHER'
            elif word.isdigit():
                label = 'NUMBER'
            elif tag == 'ORGANIZATION':
                label = 'OTHER'
            else:
                label = tag
            relabelled.append((word, label))
        result.append(relabelled)
    return result

def rechunk(ner_output):
    '''
    Merge runs of adjacent tokens that share an entity label into one entity.

    `ner_output` is a list of sentences, each a list of (word, tag) tuples.
    Consecutive tokens tagged 'PERSON', 'OTHER', 'LOCATION' or 'NUMBER' with
    the same tag are joined with spaces into a single (phrase, tag) tuple;
    every other token is passed through unchanged.
    '''
    mergeable = ('PERSON', 'OTHER', 'LOCATION', 'NUMBER')
    rechunked = []
    for sentence in ner_output:
        runs = []
        last_tag = None
        for pair in sentence:
            _, current_tag = pair
            if current_tag == last_tag and current_tag in mergeable:
                # Grow the current run: (w1, t, w2, t, ...).
                runs[-1] += pair
            else:
                runs.append(pair)
            last_tag = current_tag

        merged = []
        for run in runs:
            if len(run) == 2:
                merged.append(run)
            else:
                # Words sit at the even positions of a run, the tag at the end.
                merged.append((' '.join(run[::2]), run[-1]))
        rechunked.append(merged)
    return rechunked


In [4]:
from pprint import pprint
import timeit
import operator
from itertools import groupby
import math


# (cue word, expected answer tag) pairs consulted by detect_answer_type().
# ORDER MATTERS: every rule is tried and the LAST one whose cue word occurs in
# the question decides the tag, so later entries override earlier ones.
rules = [
    # explicit type words
    ('person', 'PERSON'), ('location', 'LOCATION'),
    # question words and generic cues
    ('who', 'PERSON'), ('why', 'OTHER'), ('are', 'OTHER'),
    # places
    ('from', 'LOCATION'), ('country', 'LOCATION'), ('capital', 'LOCATION'),
    ('city', 'LOCATION'), ('where', 'LOCATION'),
    # quantities and dates
    ('when', 'NUMBER'), ('many', 'NUMBER'), ('long', 'NUMBER'),
    ('high', 'NUMBER'), ('year', 'NUMBER'), ('decade', 'NUMBER'),
    ('time', 'NUMBER'), ('cost', 'NUMBER'), ('population', 'NUMBER'),
    ('number', 'NUMBER'),
]
  

def detect_answer_type(question):
    '''
    Guess the entity tag the answer to `question` should carry.

    Each cue word in `rules` is looked for as a substring of the lower-cased
    question; the tag of the last matching rule is returned, or 'OTHER' when
    no rule matches.
    '''
    lowered = question.lower()
    matched_tags = [tag for cue, tag in rules if cue in lowered]
    return matched_tags[-1] if matched_tags else 'OTHER'
    
    
def check_word_in_query(word_tag, lemmatized_query):
    '''
    Tell whether the entity in `word_tag` already appears in the question.

    word_tag          tuple whose first item is the entity text
    lemmatized_query  collection of lower-cased, lemmatized question tokens

    An exact match of the entity text counts, as before. In addition the
    entity is lower-cased and split on whitespace, and it counts as present
    when every one of its words is in `lemmatized_query`. Without this,
    capitalised entities ("Paris") and multi-word entities ("New York") could
    never match the lower-cased single-token query entries.
    Entity words are only lower-cased, not lemmatized.
    '''
    word = word_tag[0]
    if word in lemmatized_query:
        return True
    parts = word.lower().split()
    return bool(parts) and all(part in lemmatized_query for part in parts)

def get_open_class_word(query):
    '''
    Return the tokens of `query` that are not English stop words.

    Despite the name, no part-of-speech filtering takes place: the selection
    is purely "not a stop word". The earlier nltk.pos_tag() call was removed
    because its tags were never used. The stop-word test is done on the
    lower-cased token so that capitalised stop words (e.g. the "What" or "The"
    that starts a question) are filtered too; tokens are returned in their
    original case.
    '''
    return [token for token in word_tokenize(query) if token.lower() not in stopwords]

def _entity_position(entity_text, tokens):
    '''Index of `entity_text` (or of its first word, for a phrase) in `tokens`, else None.'''
    if entity_text in tokens:
        return tokens.index(entity_text)
    # A re-chunked multi-word entity is not a single token: fall back to its first word.
    pieces = word_tokenize(entity_text)
    if pieces and pieces[0] in tokens:
        return tokens.index(pieces[0])
    return None


def first_pass(tagged_sent, query, sentence):
    '''
    Score each tagged entity of a sentence by its distance to the query keywords.

    tagged_sent  list of (word, tag) tuples for the sentence
    query        the question text
    sentence     the raw sentence text

    For every entity the score is the sum of |entity position - keyword
    position| over the query keywords that occur in the sentence. Returns a
    list of (entity text, tag, total distance); entities with distance 0,
    stop words and bare punctuation are left out.

    Entity positions and keyword positions are both measured on the same
    token list (the sentence with '/' removed, then tokenized). Keyword
    positions used to be taken from a punctuation-stripped list, which shifted
    them relative to the entity positions. An entity that cannot be located in
    the sentence is skipped instead of raising ValueError.
    '''
    tokenized_sentence = word_tokenize(sentence.replace('/', ''))
    lemmatized_keywords = lemmatize_document(get_open_class_word(query))

    # First token position of every lemma in the sentence. lemmatize_document()
    # yields nothing for non-alphanumeric tokens, so those keep their slot in
    # the numbering but never match a keyword.
    first_position = {}
    for position, token in enumerate(tokenized_sentence):
        for lemma in lemmatize_document([token]):
            first_position.setdefault(lemma, position)

    index_of_keywords = [
        first_position[keyword]
        for keyword in lemmatized_keywords
        if keyword in first_position
    ]

    ignored_tokens = [':', ',', '(', ')', '.', '-', '[', ']', '``', '""', '%', "'"]

    result = []
    for word_tag in tagged_sent:
        word, tag = word_tag[0], word_tag[1]

        index_of_word = _entity_position(word, tokenized_sentence)
        if index_of_word is None:
            continue

        total_distance = sum(abs(index_of_word - index) for index in index_of_keywords)

        # Distance 0 means the entity is itself the only keyword found, or that
        # no keyword was found at all; neither makes a useful candidate.
        if total_distance == 0 or word in stopwords or word in ignored_tokens:
            continue

        # Strip bracket and colon residue left inside merged phrases.
        cleaned = re.sub(r' \)', '', word)
        cleaned = re.sub(r' \(', '', cleaned)
        cleaned = re.sub(':', '', cleaned)
        result.append((cleaned, tag, total_distance))

    return result

def second_pass(first_pass_result, query):
    '''
    Reorder candidates so that those whose tag equals the expected answer
    type of `query` come first. Relative order inside each of the two groups
    is kept as it was in `first_pass_result`.
    '''
    expected_tag = detect_answer_type(query)
    matching, others = [], []
    for candidate in first_pass_result:
        bucket = matching if candidate[1] == expected_tag else others
        bucket.append(candidate)
    return matching + others
    
def third_pass(question, second_pass_result):
    '''
    Pick the final answer text from the ranked candidates.

    Candidates are taken in consecutive runs of equal tag. Inside each run
    they are ordered by ascending distance, with candidates that already
    occur in the question moved behind those that do not. The text of the
    first candidate overall is returned.

    Raises IndexError when `second_pass_result` is empty.
    '''
    query_lemmas = lemmatize_document(word_tokenize(question))

    ranked = []
    for _, same_tag in groupby(second_pass_result, key=operator.itemgetter(1)):
        # Two stable sorts: distance first, then "appears in question" so that
        # distance order is preserved inside each half.
        by_distance = sorted(same_tag, key=operator.itemgetter(2))
        ranked += sorted(
            by_distance,
            key=lambda candidate: check_word_in_query(candidate, query_lemmas),
        )

    return ranked[0][0]


In [6]:
import csv

def print_csv(input_path='QA_test.json', output_path='output.csv'):
    '''
    Answer every question in the test set and write the answers to a CSV file.

    input_path   JSON file holding a list of articles, each with a 'sentences'
                 list and a 'qa' list of {'id', 'question'} entries
    output_path  CSV file to create, with the columns 'id' and 'answer'

    For each question the best-matching sentence is retrieved with BM25 and
    the answer is extracted from it by the three ranking passes. 'Not sure' is
    written when no sentence matches or no candidate entity survives.
    '''
    data = load_json(input_path)

    # newline='' is what the csv module expects of files it writes to; the
    # with-block closes the file even when a question raises.
    with open(output_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['id', 'answer'])

        for article in data:
            collection = article['sentences']
            tf, total_docs = get_term_frequencies(collection)
            # The weights depend only on the article, so build them once per
            # article rather than once per question.
            okapibm25 = get_okapibm25(tf, total_docs, collection)
            tagged_sentences = rechunk(entity_recognize(collection))

            for qa in article['qa']:
                question = qa['question']
                answer = 'Not sure'

                ranked_sentences = retrieve_sentence(okapibm25, question)
                if ranked_sentences:
                    doc_id = ranked_sentences[0][0]
                    candidates = first_pass(tagged_sentences[doc_id], question, collection[doc_id])
                    if candidates:
                        answer = third_pass(question, second_pass(candidates, question))

                writer.writerow([qa['id'], answer])

# Run the whole pipeline: read QA_test.json and write the answers to output.csv.
print_csv()