In [2]:
'''
    Team: 
        QAOutliers
    
    Memebers: 
        Zhanyi Qi
        Lianyu Zeng
        Yiheng Wang
        
    This cell contains def that we create for our Basic Q&A System. 
    Our Basic Q&A System has five parts:
        I.   Sentence Retrieval by algorithm BM25
        II.  Question Classification with simple rules
        III. Named Entity Recognition processing by Stanford NER Tagger
        IV.  Answer Ranking rules
        V.   Output 
'''
import csv
import json
import copy
import os
import nltk
import math
import re
import sys
import string
from math import log
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from collections import defaultdict, Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine as cos_distance
from sklearn.decomposition import TruncatedSVD
from nltk.tag import StanfordNERTagger
'''
    Part I. Sentence Retrieval by algorithm BM25
    There are 7 def in this part.
    1. get_BOW()
                input:  a list of word
                output: a dictionary of these words
    2. lemmatize()
                input:  a word
                output: lemmatized word
    3. input_data()
                input:  None 
                output: train, test, and dev data sets
    4. transform_text()
                input:  a sentence
                output: a list of words from this sentence by tokenizing, lemmatizing, filtering stopwords and 
                        punctuations
    5. get_Docfrequency_SentenceBOW()
                input:  a data set from train, test or dev
                output: 5 values which are list, dictionary, dictionary, list and list. These values will be used
                        in the calculating process in BM25
    6. find_max_score_sentence()
                input:  the index of this article, the index of this question, k1, k2 and b which are 
                        the coefficients in BM25 algorithm
                output: the best match sentence id
    7. BM25()
                input:  the index of this article, k1, k2 and b which are the coefficients in BM25 algorithm
                output: the list of sentence id that predicted by BM25
'''
def get_BOW(text):
    BOW = {}
    for word in text:
        BOW[word] = BOW.get(word,0) + 1
    return BOW
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma
def input_data():
    base_path = os.path.join('data/')
    train_file = base_path + 'QA_train.json'
    train_data = json.load(open(train_file))
    test_file = base_path + 'QA_test.json'
    test_data = json.load(open(test_file))
    dev_file = base_path + 'QA_dev.json'
    dev_data = json.load(open(dev_file))
    return train_data,test_data,dev_data
def transform_text(text):
    text = nltk.word_tokenize(text)
    text = [lemmatize(word.lower()) for word in text]
    result = []
    for word in text:
        if word not in stopwords and word not in punctuations:
            result.append(word)
    return result
def get_Docfrequency_SentenceBOW(dataset):
    #save dics, each dictionary contains document frequencies for all terms in the same article
    question_list = []
    #save lists, each list represent an article, saving sentences' bow
    total_sentence_bow = []
    #save lists, each list represent an article, saving questions' bow
    total_question_bow = []
    #save lists, each list represent all sentences' lengthes.
    sent_lengthes = []
    #save a list, each item represents the average length of sentences
    avg_lengthes = []  
    for article in dataset:
        #Docfrequency
        article_dic = defaultdict(list)
        keyterms = [] #save all distinct terms in questions      
        #SentenceBOW
        bow_list = []     
        #QuestionBOW
        que_list = []      
        #SentenceLength
        sent_len = []    
        #TotalLength
        total_len = 0     
        qas = article['qa']
        sentences = article['sentences']
        for qa in qas:
            question = qa['question']
            newquestion = transform_text(question)
            #QuestionBOW
            que_list.append(get_BOW(newquestion))          
            keyterms.extend(newquestion)
        keyterms = set(keyterms)       
        #save sentences' BOW in list sen_BOW
        sen_words = []
        for sent in sentences:
            sent = transform_text(sent)
            #Docfrequency
            sen_words.append(sent)           
            #SentenceBOW
            bow_list.append(get_BOW(sent))            
            #SentenceLength
            sent_len.append(len(sent))       
            #TotalLength
            total_len += len(sent)       
        #calculate doc frequency    
        for term in keyterms:
            for i,bow in enumerate(sen_words):
                if term in bow:
                    article_dic[term].append(i)                   
        #Docfrequency
        question_list.append(article_dic)
        #SentenceBOW
        total_sentence_bow.append(bow_list)
        #QuestionBOW
        total_question_bow.append(que_list)
        #SentenceLength
        sent_lengthes.append(sent_len)
        #AverageLength
        avg_lengthes.append(float(total_len)/len(sentences))  
    return question_list, total_sentence_bow, total_question_bow, sent_lengthes, avg_lengthes
def find_max_score_sentence(articles_index,index,k1,k2,b):
    #Get the dictionary of this question from QuestionBOW
    query_dict = total_question_bow[articles_index][index]
    max_score = 0
    guess_sentence = 0
    for index in range(len(total_sentence_bow[articles_index])):     
        score = 0  
        #Get the dictionary of this sentence from SentenceBOW
        sentence_dict = total_sentence_bow[articles_index][index]
        #Calculate the score of each word in question query
        for word in query_dict:
            document_fre_list = question_list[articles_index].get(word,None)
            N = len(total_sentence_bow[articles_index])
            n_qi = 0
            if document_fre_list != None:
                n_qi = len(document_fre_list)
            else:
                n_qi = 0
            fi = sentence_dict.get(word,0)
            qfi = query_dict.get(word,0)
            dl = sent_lengthes[articles_index][index]
            avgdl = avg_lengthes[articles_index]            
            K = k1*(1-b+b*(float(dl)/avgdl))
            W = math.log((N-n_qi+0.5)/(n_qi+0.5))
            R = (fi*(k1+1))/(fi+K)*qfi*(k2+1)/(qfi+k2)
            score += W*R
        if score > max_score:
            max_score = score
            guess_sentence = index
    return guess_sentence
def BM25(articles_index,k1,k2,b):
    result_id = []
    for index in range(len(total_question_bow[articles_index])):
        #Predict the sentence id of each question query
        guess_id = find_max_score_sentence(articles_index,index,k1,k2,b)
        result_id.append(guess_id)
    return result_id
'''
    Part II. Question Classification with simple rules
    There are 2 def in this part.
    1. transfer_pos_question()
                input:  pos of a question query
                output: edited pos of this question query
    2. get_continuous_chunks()
                input:  a question query
                output: chunk of this question query, like [('WHEN','when ...')...]
'''
def transfer_pos_question(pos):
    new_pos = []
    #Edit the pos tag from nltk pos_tag to customized tag
    for (word,wtype) in pos:
        if word.lower() == 'what' or word.lower() == 'what\'s':
            new_pos.append((word,'WHAT'))
        elif word.lower() == 'do' or word.lower() == 'does' or word.lower() == 'did':
            new_pos.append((word,'DO'))
        #Manully tag some word from number_list as 'TIME' to classisify the question type
        elif word.lower() in number_list:
            new_pos.append((word,'TIME'))
        #Manully tag some word from location_list as 'LOC' to classisify the question type
        elif word.lower() in location_list:
            new_pos.append((word,'LOC'))
        #Manully tag some word from name_list as 'NAME' to classisify the question type
        elif word.lower() in name_list:
            new_pos.append((word,'NAME'))
        elif word.lower() == 'is' or word.lower() == 'was' or word.lower() == 'are' or word.lower() == 'were' or word.lower() == 'be':
            new_pos.append((word,'BE'))
        elif word.lower() == 'when':
            new_pos.append((word,'WHEN'))
        elif word.lower() == 'where':
            new_pos.append((word,'WHERE'))
        elif word.lower() == 'how':
            new_pos.append((word,'HOW'))
        elif word.lower() == 'who' or word.lower() == 'whom' or word.lower() == 'whose'  or word.lower() == 'whos':
            new_pos.append((word,'WHO'))
        elif word.lower() == 'which':
            new_pos.append((word,'WHICH'))
        elif word.lower() == 'why' or word.lower() == 'wy':
            new_pos.append((word,'WHY'))
        else:
            new_pos.append((word,wtype))
    return new_pos
def get_continuous_chunks(text):
    #Get pos tag by nltk pos tagger of this qyestion query
    pos = pos_tag(nltk.word_tokenize(text))
    #Edit the pos tag to customized pos tag
    pos = transfer_pos_question(pos)
    #Define different grammar for different types of question
    grammar = r"""
                WHAT: 
                    {<WHAT>}
                    {<WHICH>}
                WHO:
                    {<WHO>}
                    {<WHAT><BE>?<DT>?<JJ|RB>*<NAME>}
                    {<WHAT><JJ|RB>*<NN>+<NAME>}
                WHEN:
                    {<WHICH><TIME>}
                    {<HOW><TIME>}
                    {<WHAT><BE>?<DT>?<JJ>?<NN>*<JJ>?<TIME>}
                    {<WHEN>}
                WHERE:
                    {<WHERE>}
                    {<WHAT><LOC>}
                HOW:
                    {<DO>}
                    {<WHY>}
                    {<HOW>}

                """
    #Load nltk RegexpParser to analyse the grammar
    cp = nltk.RegexpParser(grammar) 
    result = []
    tree = cp.parse(pos)
    #Parse the grammar tree
    for subtree in tree.subtrees():
        if subtree.label() != 'S':
            phrase = u''
            for word,pos in subtree.leaves():
                if word == ',':
                    phrase = phrase + word
                else:
                    phrase = phrase + u' '
                    phrase = phrase + word
            result.append((subtree.label(),phrase[1:]))         
    return result
'''
    Part III. Named Entity Recognition processing by Stanford NER Tagger
    There are 5 def in this part.
    1. input_NER()
                input:  None
                output: StanfordNER model file, StanfordNER jar file
    2. analyse_NER()
                input:  list of sentences that already tagged by edited NER tool
                output: remove 'O' tag and combine continuous same tags as one entity
    3. parse_NER()
                input:  list of sentences that already tagged by Stanford NER tool 
                output: list of sentences that correct the NER result manully(by some rules)
    4. extract_NER()
                input:  NER sentence and the model number
                output: result of orderred entities from this NER sentence 
    5. parse_token()
                input:  a token of a sentence
                output: correct mistake in tokens, and return the new tokens 
'''
def input_NER():
    stanford_dir = os.path.join('stanford-ner-2016-10-31')
    jarfile = os.path.join(stanford_dir,'stanford-ner.jar')
    modelfile = os.path.join(stanford_dir,'classifiers/english.all.3class.distsim.crf.ser.gz')
    return modelfile,jarfile
def analyse_NER(ner_sentences):
    result_sentences = []
    for ner_sentence in ner_sentences:
        result_sentence = []
        perv_type = u'O'
        word = u''
        conjunction = u''
        conjunc_flag = False
        for index,(entity,etype) in enumerate(ner_sentence):
            if perv_type == u'O' and etype != u'O':              
                perv_type = etype
                word = entity + u' '
            elif word != u'':
                if etype == u'O':
                    if entity not in conjunction_word:
                        result_sentence.append((word[:-1],perv_type))
                        word = u''
                        perv_type = u'O'
                        if conjunction != u'':
                            conjunction = u''
                            conjunc_flag = False
                    else:
                        if conjunction != u'':
                            conjunction = u''
                            conjunc_flag = False
                        else:
                            conjunction = entity
                            conjunc_flag = True
                elif etype != perv_type:
                    result_sentence.append((word[:-1],perv_type))
                    word = entity + u' '
                    perv_type = etype
                    conjunction = u''
                    conjunc_flag = False
                elif etype == perv_type:
                    if conjunc_flag:
                        if conjunction == u',':
                            word = word[:-1] + conjunction + u' ' + entity + u' '
                        else:
                            word = word + conjunction + u' ' + entity + u' '
                        conjunction = u''
                        conjunc_flag = False
                    else:
                        if entity in ['%'] or word == u'$ ':
                            word = word[:-1] + entity + u' '
                        else:
                            word = word + entity + u' '
        if word != u'':
            result_sentence.append((word[:-1],perv_type))
        result_sentences.append(result_sentence)      
    return result_sentences
def parse_NER(ner_sentences):
    pattern_number = re.compile(r'([0-9]+|\%|\$)')
    result_sentences = []
    for ner_sentence in ner_sentences:
        result_sentence = []
        for index,(entity,etype) in enumerate(ner_sentence):
            if entity != u'':
                entity.replace(u'\u2013',u'-')
                entity.replace(u'\u2014',u'-')
                entity.replace(u'\u2212',u'-')
                entity.replace(u'\u2044',u'%')
                if etype == u'O':
                    if pattern_number.search(entity) or entity in time_word:
                        result_sentence.append((entity,u'NUMBER'))
                    elif u'-' in entity:
                        word_seperate = entity.split(u'-')
                        for word in word_seperate:
                            if word in time_word:
                                result_sentence.append((entity,u'NUMBER'))
                                break
                    elif entity in location_word:
                        result_sentence.append((entity,u'LOCATION'))
                    elif index == 0 and entity.lower() not in stopwords:
                        result_sentence.append((entity,u'ORGANIZATION'))
                    elif index != 0 and entity[0].isupper():
                        result_sentence.append((entity,u'ORGANIZATION'))
                    else:
                        result_sentence.append((entity,etype))
                elif entity in ['(',')']:
                    result_sentence.append((entity,u'O'))
                else:
                    result_sentence.append((entity,etype))
        result_sentences.append(result_sentence)     
    return analyse_NER(result_sentences)
def extract_NER(parse_ner_sentence,mode):
    result = []
    if mode == 0:
        #PERSON
        for entity,etype in parse_ner_sentence:
            if etype == u'PERSON':
                result.append(entity)
    elif mode == 1:
        #NUMBER
        for entity,etype in parse_ner_sentence:
            if etype == u'NUMBER':
                result.append(entity)
    elif mode == 2:
        #LOCATION
        for entity,etype in parse_ner_sentence:
            if etype == u'LOCATION':
                result.append(entity)
    elif mode == 3:
        #ORGANIZATION
        for entity,etype in parse_ner_sentence:
            if etype == u'ORGANIZATION':
                result.append(entity)
    return result
def parse_token(token_sentence):
    result = []
    for index,word in enumerate(token_sentence):
        if index != 0 and index != (len(token_sentence)-1) and word == u'.':
            last_word = result[-1]
            last_word = last_word + u'.'
            result = result[:-1]
            result.append(last_word)
        else:
            result.append(word)
    return result
'''
    Part IV. Answer Ranking rules
    There are 3 def in this part.
    1. get_open_class_words()
                input:  token of the question query
                output: removed stopword and punctuation of the list
    2. rank_rule_1()
                input:  entity and the question query
                output: score of rule 1
    3. rank_rule_3()
                input:  answer sentence, token of the answer snetence, entity and the token of question query 
                output: score of rule 3
'''
def get_open_class_words(query):
    result = []
    for index in range(len(query)):
        if query[index] not in stopwords:
            if query[index] not in string.punctuation:
                result.append(query[index])
    return result
def rank_rule_1(entity,query):
    #lower scores for content words also appear in the query
    count = 0
    length = len(entity)
    for word in entity:
        word = lemmatize(word.lower())
        if word not in stopwords:
            if word in query:
                count += 1
    if length == 0:
        score = 0
    else:
        score = 1 - float(count)/length
    return score
def rank_rule_3(answer_sentence,sentence,entity,query):
    #higher scores for closer distance between an entity and the headword
    #step 1: using a filter to extract "useful" open-class words
    results = get_open_class_words(query)
    sent = sentence
    original_sent = answer_sentence
    entity_loc = []
    query_loc = []
    for word in entity:
        if word in original_sent:
            entity_loc.append(original_sent.index(word))
    for q in results:
        if q in sent:
            query_loc.append(sent.index(q))
    min_dist = len(original_sent)
    if query_loc != []:
        for i in query_loc:
            for j in entity_loc:
                dist = abs(i - j)
                if dist < min_dist:
                    min_dist = dist                   
    return 1 - float(min_dist)/len(original_sent)
'''
    Part V. Output
    There are 1 def in this part.
    1. output_result()
                input:  filename
                output: putting result in this file
'''
def output_result(filename):
    predictions_file = open(filename, "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["id","answer"])
    for i in range(len(answers)):
        open_file_object.writerow([ids[i], answers[i].encode("utf-8")])
    predictions_file.close()

In [5]:
'''
    This cell is for initializing dataset and tools.
'''
#Initialize datasets and tools
train,test,dev = input_data()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
model,jar = input_NER()
st = StanfordNERTagger(model,jar)

#Define dataset for customized function
punctuations = [',','\'\'','?','\'','.','%','(',')',';','``']
'''
NER
'''
#Customized feature words in question
location_list = ['country','county','district']
number_list = ['version','size','msa','far','much','year','era','ration','years','time','many','population','large','percent','average','day','decade','big','long']
name_list = ['name','center','president','denominations','denomination','film','broadcaster','pitcher','commentator']
#Customized feature words in sentence
time_word = [
    'one','two','three','four','five','six','seven','eight','nine',
    'January','February','March','April','May','June','July','August','September','October','November','December',
    'million','billion',
    'minutes','hours','years','times',
    'mm','miles','inches','foot','feet',
    'late','early','around','over',
    'persons','seasons','square',
    'spring','summer','fall','autumn','winter'
]
location_word = [
    'southwest','southeast','northwest','northeast'
]
conjunction_word = ['and','of']
#Initialize data for BM25 processing
question_list, total_sentence_bow, total_question_bow, sent_lengthes, avg_lengthes = get_Docfrequency_SentenceBOW(dev)

In [6]:
'''
    This cell is for BM25 processing.
'''
#Set the range of k1, k2 and b
k1_list = [0.78]
k2_list = [0]
b_list = [0.5]
#Store the predicting result by BM25 in result_sentences
result_sentences = []
test_length = len(dev)
for k1 in k1_list:
    for k2 in k2_list:
        for b in b_list:
            for i in range(0,test_length):
                correct_id = BM25(i,k1,k2,b)
                result_sentences.append(correct_id)

In [22]:
'''
    This cell is for 
        I.   Question Classification
        II.  Named Entity Recognition processing
        III. Answer Ranking
        IV.  Output.
''' 
ids = []
answers = []

count_contains = 0
count_similar = 0.0
count_correct = 0
total_question = 0

for index in range(5):
    print index
    article = dev[index]
    qas = article['qa']
    sentences = article['sentences']
    token_sentences = copy.deepcopy(sentences)
    #Tag the sentence by NER processing
    for i in range(len(sentences)):
        token_sentences[i] = parse_token(nltk.word_tokenize(token_sentences[i]))
    ner_sentences = st.tag_sents(token_sentences)
    parse_ner_sentences = parse_NER(ner_sentences)
    #Predict the answer of each question
    total_question += len(qas)
    for i in range(len(qas)):
        qa = qas[i]
        #Extract the sentence of answer
        answer_sentence = sentences[result_sentences[index][i]]
        #Extract the id of sentence of answer
        answer_sentence_id = result_sentences[index][i]
        #Extract question query
        text_question = qa['question']
        #Get the grammar(POS) result of question 
        result = get_continuous_chunks(text_question)
        #id = qa['id']
        if result != []:
            wtype,word = result[0]
            #According to the different types of question, we have different solutions.
            if wtype == 'WHEN':
                #If question type is 'WHEN', extract NER 'NUMBER' from sentence
                answer_list = extract_NER(parse_ner_sentences[answer_sentence_id],1)
                if answer_list == []:
                    #If nothing is 'NUMBER', get 'ORGANIZATION' from sentence
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],3)
            elif wtype == 'WHO':
                #If question type is 'WHO', extract NER 'PERSON' from sentence
                answer_list = extract_NER(parse_ner_sentences[answer_sentence_id],0)
                if answer_list == []:
                    #If nothing is 'PERSON', get 'ORGANIZATION' and 'LOCATION' from sentence
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],2)
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],3)
            elif wtype == 'WHERE':
                #If question type is 'WHERE', extract NER 'LOCATION' from sentence
                answer_list = extract_NER(parse_ner_sentences[answer_sentence_id],2)
                if answer_list == []:
                    #If nothing is 'LOCATION', get 'ORGANIZATION' and 'PERSON' from sentence
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],0)
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],3)
            else:
                #For type 'WHAT' or 'HOW', first extract NER 'ORGANIZATION' from sentence
                answer_list = extract_NER(parse_ner_sentences[answer_sentence_id],3)
                if answer_list == []:
                    #If nothing is 'ORGANIZATION', get 'LOCATION', 'NUMBER' and 'PERSON' from sentence
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],0)
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],1)
                    answer_list += extract_NER(parse_ner_sentences[answer_sentence_id],2)                    
        if answer_list != []:
            if qa['answer'] in answer_list:
                count_contains += 1
            #For question that have answer, ranking the answer by rule 1&3
            query = copy.deepcopy(text_question)
            query = nltk.word_tokenize(query)
            #token the answer sentence and copy it for further usage
            answer_sentence = nltk.word_tokenize(answer_sentence)
            sentence = copy.deepcopy(answer_sentence)         
            for query_index in range(len(query)):
                query[query_index] = lemmatize(query[query_index].lower())
            for sent_index in range(len(sentence)):
                sentence[sent_index] = lemmatize(sentence[sent_index].lower())                
            scores_1 = []
            scores_3 = []
            scores = []
            if len(answer_list) == 1:
                answer = answer_list[0]
            else:
                for entity in answer_list:
                    entity = nltk.word_tokenize(entity) 
                    score1 = rank_rule_1(entity,query)
                    scores_1.append(score1)
                    #answer_sentence is the original version and sentence is preprocessed
                    score3 = rank_rule_3(answer_sentence,sentence,entity,query)
                    scores_3.append(score3)
                    w1 = 0.2
                    w3 = 1 - w1
                    if score1 == 0:
                        score3 = 0
                    total = w1 * score1 + w3 * score3
                    scores.append(total)
                answer = answer_list[scores.index(max(scores))]
            
        else:
            #For question that doesn't have answer, return the total sentence
            answer = answer_sentence
        #ids.append(id)
        if answer == qa['answer']:
            count_correct += 1
            count_similar += 1
        else:
            answer_token = nltk.word_tokenize(answer)
            c_answer_token = nltk.word_tokenize(qa['answer'])
            if len(answer_token) > len(c_answer_token):
                mark_answer = 0
                mark_c_answer = 0
                match = 0
                while(True):
                    if answer_token[mark_answer] == c_answer_token[mark_c_answer]:
                        mark_answer += 1
                        mark_c_answer += 1
                        match += 1
                    else:
                        mark_answer -= (match-1)
                        match = 0
                        mark_c_answer = 0
                    if match == len(c_answer_token):
                        #print answer,'&',qa['answer']
                        count_similar += float(len(c_answer_token))/len(answer_token)
                        break
                    if mark_answer >= len(answer_token):
                        break                                        
            else:
                mark_answer = 0
                mark_c_answer = 0
                match = 0
                while(True):
                    if answer_token[mark_answer] == c_answer_token[mark_c_answer]:
                        mark_answer += 1
                        mark_c_answer += 1
                        match += 1
                    else:
                        mark_c_answer -= (match-1)
                        match = 0
                        mark_answer = 0
                    if match == len(answer_token):
                        #print answer,'&',qa['answer']
                        count_similar += float(len(answer_token))/len(c_answer_token)
                        break
                    if mark_c_answer >= len(c_answer_token):
                        break     
                
        #answers.append(answer)
print float(count_contains)/total_question, count_similar/total_question, float(count_correct)/total_question
                
# '''
#     IV. Output
# '''
# for i in range(len(answers)):
#     answers[i] = answers[i].replace(',','-COMMA-')
#     answers[i] = answers[i].replace('"','')
# output_result("result_basic_final.csv")

0
1
2
3
4
0.270812437312 0.278342693024 0.211634904714
