In [1]:
'''
    Team: 
        QAOutliers
    
    Members: 
        Zhanyi Qi
        Lianyu Zeng
        Yiheng Wang

    This cell contains def that we create for our Enhanced Q&A System. 
    Our Enhanced Q&A System has seven parts:
        I.   Sentence Retrieval by algorithm BM25
        II.  Extracting Headword from train dataset
        III. Question Classification with trained headwords
        IV.  Named Entity Recognition processing
        V.   Tagging sentence by POS Tagger
        VI.  Answer Ranking rules
        VII. Output
'''
import csv
import json
import copy
import os
import nltk
import sys 
import math
import operator
import re
import string
from math import log
from collections import defaultdict, Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine as cos_distance
from sklearn.decomposition import TruncatedSVD
from nltk.tag import StanfordNERTagger
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
'''
    Part I. Sentence Retrieval by algorithm BM25
    There are 7 def in this part.
    1. get_BOW()
                input:  a list of word
                output: a dictionary of these words
    2. lemmatize()
                input:  a word
                output: lemmatized word
    3. input_data()
                input:  None 
                output: train, test, and dev data sets
    4. transform_text()
                input:  a sentence
                output: a list of words from this sentence by tokenizing, lemmatizing, filtering stopwords and 
                        punctuations
    5. get_Docfrequency_SentenceBOW()
                input:  a data set from train, test or dev
                output: 5 values which are list, dictionary, dictionary, list and list. These values will be used
                        in the calculating process in BM25
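    6. find_max_n_sentences()
                input:  the index of this article, the index of this question, k1, k2 and b which are 
                        the coefficients in BM25 algorithm, and n, the number of sentences to keep
                output: the n best matching (sentence id, score) pairs, best first
    7. BM25_n()
                input:  the index of this article, k1, k2 and b which are the coefficients in BM25 algorithm,
                        and n, the number of sentences to keep for each question
                output: for every question, the list of sentence ids predicted by BM25 and the list of their scores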
'''
def get_BOW(text):
    '''
        Count how many times every word occurs.
        input:  a list of words
        output: a dictionary mapping each distinct word to its number of occurrences
    '''
    counts = defaultdict(int)
    for token in text:
        counts[token] += 1
    #Hand back a plain dictionary, not the defaultdict used for counting
    return dict(counts)
def lemmatize(word):
    '''
        Lemmatize a word with the global lemmatizer, trying the verb reading first.
        input:  a word
        output: the verb lemma when it differs from the word, otherwise the noun lemma
    '''
    as_verb = lemmatizer.lemmatize(word,'v')
    if as_verb != word:
        return as_verb
    #The verb reading changed nothing, so fall back to the noun reading
    return lemmatizer.lemmatize(word,'n')
def input_data():
    '''
        Load the three data sets from the 'data/' directory.
        input:  None
        output: train, test, and dev data sets, i.e. the parsed content of QA_train.json,
                QA_test.json and QA_dev.json (in that order)
    '''
    base_path = os.path.join('data/')
    loaded = []
    for name in ('QA_train.json','QA_test.json','QA_dev.json'):
        #The context manager closes every file handle, even when the JSON is malformed
        with open(base_path + name) as data_file:
            loaded.append(json.load(data_file))
    train_data,test_data,dev_data = loaded
    return train_data,test_data,dev_data
def transform_text(text):
    '''
        Turn a sentence into its list of content words.
        input:  a sentence
        output: the lower-cased, lemmatized tokens of the sentence without the tokens found in
                the global stopwords and punctuations lists
    '''
    lemmas = [lemmatize(token.lower()) for token in nltk.word_tokenize(text)]
    return [lemma for lemma in lemmas if not (lemma in stopwords or lemma in punctuations)]
def get_Docfrequency_SentenceBOW(dataset):
    '''
        Pre-compute, article by article, the statistics that BM25 needs.
        input:  a data set from train, test or dev; every article is a dictionary with a 'sentences'
                list and a 'qa' list whose items hold a 'question'
        output: five lists with one entry per article
                question_list:      dictionary mapping every question term to the indexes of the
                                    sentences that contain it
                total_sentence_bow: list of the bags of words of the sentences
                total_question_bow: list of the bags of words of the questions
                sent_lengthes:      list of the sentence lengths, counted in transformed words
                avg_lengthes:       average sentence length (0.0 for an article without sentences)
    '''
    question_list = []
    total_sentence_bow = []
    total_question_bow = []
    sent_lengthes = []
    avg_lengthes = []
    for article in dataset:
        #QuestionBOW, plus the distinct terms used by all questions of this article
        que_list = []
        keyterms = set()
        for qa in article['qa']:
            newquestion = transform_text(qa['question'])
            que_list.append(get_BOW(newquestion))
            keyterms.update(newquestion)
        #SentenceBOW, SentenceLength and Docfrequency are gathered in one pass over the sentences
        sentences = article['sentences']
        bow_list = []
        sent_len = []
        article_dic = defaultdict(list)
        for i,sent in enumerate(sentences):
            words = transform_text(sent)
            bow = get_BOW(words)
            bow_list.append(bow)
            sent_len.append(len(words))
            #Only terms that occur in some question are indexed; sentence ids stay in ascending order
            for term in bow:
                if term in keyterms:
                    article_dic[term].append(i)
        #An article without sentences must not abort the whole pre-computation
        if sentences:
            average = float(sum(sent_len))/len(sentences)
        else:
            average = 0.0
        question_list.append(article_dic)
        total_sentence_bow.append(bow_list)
        total_question_bow.append(que_list)
        sent_lengthes.append(sent_len)
        avg_lengthes.append(average)
    return question_list, total_sentence_bow, total_question_bow, sent_lengthes, avg_lengthes
def find_max_n_sentences(articles_index,index,k1,k2,b,n):
    '''
        Score every sentence of an article against one question with BM25 and keep the best n.
        input:  the index of the article, the index of the question, the BM25 coefficients k1, k2
                and b, and n, the number of sentences to keep
        output: a list of at most n (sentence id, score) tuples, highest score first
    '''
    #Everything that only depends on the article or on the question is looked up once
    query_dict = total_question_bow[articles_index][index]
    sentence_bows = total_sentence_bow[articles_index]
    doc_frequency = question_list[articles_index]
    lengths = sent_lengthes[articles_index]
    avgdl = avg_lengthes[articles_index]
    N = len(sentence_bows)
    #The weight of a query word is the same for every sentence of the article
    word_weights = {}
    for word in query_dict:
        n_qi = len(doc_frequency.get(word,()))
        word_weights[word] = math.log((N-n_qi+0.5)/(n_qi+0.5))
    scores = []
    for sent_id,sentence_dict in enumerate(sentence_bows):
        dl = lengths[sent_id]
        #avgdl is 0 when every sentence of the article is empty; use a neutral ratio instead of dividing by zero
        if avgdl:
            length_ratio = float(dl)/avgdl
        else:
            length_ratio = 0.0
        K = k1*(1-b+b*length_ratio)
        score = 0
        for word,qfi in query_dict.items():
            fi = sentence_dict.get(word,0)
            if fi+K == 0:
                #The term-frequency part would be 0/0; the word contributes nothing to this sentence
                continue
            R = (fi*(k1+1))/(fi+K)*qfi*(k2+1)/(qfi+k2)
            score += word_weights[word]*R
        scores.append((sent_id,score))
    #The sort is stable, so sentences with equal scores keep their document order
    scores.sort(key=operator.itemgetter(1),reverse=True)
    return scores[:n]
def BM25_n(articles_index,k1,k2,b,n):
    '''
        Run the BM25 retrieval for every question of one article.
        input:  the index of the article, the BM25 coefficients k1, k2 and b, and n, the number of
                sentences to keep for each question
        output: two lists with one entry per question: the ids of the retrieved sentences and
                their scores, both ordered from the best to the worst sentence
    '''
    correct_id = []
    correct_id_weight = []
    for question_pos,_ in enumerate(total_question_bow[articles_index]):
        ranked = find_max_n_sentences(articles_index,question_pos,k1,k2,b,n)
        #Split the (sentence id, score) pairs into two parallel lists
        correct_id.append([pair[0] for pair in ranked])
        correct_id_weight.append([pair[1] for pair in ranked])
    return correct_id,correct_id_weight
'''
    Part II. Extracting Headword from train dataset
    There is 1 def in this part.
    1. getHeadWord()
                input:  the question query
                output: the list of headwords of this query
'''
def getHeadWord(text):
    '''
        Extract the headwords of a question with a chunk grammar built around the word 'what'.
        input:  the question query
        output: the list of headwords of this query, one per HEAD chunk (the last word of the chunk)
    '''
    ascii_text = text.encode('ascii','replace')
    tagged =  pos_tag(word_tokenize(ascii_text))
    #Edit the POS tag of a leading 'Which'
    if tagged[0]==('Which', 'JJ'):
        tagged[0] = ('Which', 'WHICH')
    #Give 'what' and the adjective reading of 'which' their own tags
    for position,(token,tag) in enumerate(tagged):
        lowered = token.lower()
        if lowered=='what' and tag in ('WP','WDT'):
            tagged[position] = ('what', 'WHAT')
        elif lowered=='which' and tag=='JJ':
            tagged[position] = ('which', 'WHICH')
    #Define the Chunk grammar of question
    grammar = r"""
                V: {<V.*>}          # Verb
                HEAD:
                    {<IN>?<WHAT><V>?<DT>?<JJ.*|CD>*<V>?<IN>?<NN.*>+}
                    {<IN>?<WHAT><V>?<DT>?<JJ.*|CD>*<V>?<IN>?<VBG.*>+}
                    }<IN>?<WHAT><V>?<DT>?<JJ.*|CD>*<V>?<IN>?{  
                """
    chunker = nltk.RegexpParser(grammar)
    tree = chunker.parse(tagged)
    #Collect the last word of every HEAD chunk
    headwords = []
    for subtree in tree.subtrees():
        if subtree.label()!='HEAD':
            continue
        phrase = u' '.join([leaf_word for leaf_word,leaf_tag in subtree.leaves()])
        pieces = phrase.split()
        if len(pieces)>1:
            headwords.append(pieces[-1])
        else:
            headwords.append(phrase)
    return headwords
'''
    Part III. Question Classification with trained headwords
    There are 2 def in this part.
    1. transfer_pos_question()
                input:  pos of a question query
                output: edited pos of this question query
    2. get_continuous_chunks()
                input:  a question query
                output: chunk of this question query, like [('WHEN','when ...')...]
'''
def transfer_pos_question(pos):
    '''
        Replace the nltk POS tags of a question by the customized tags used by the question grammar.
        input:  pos of a question query, a list of (word, tag) tuples
        output: edited pos of this question query; words that match no rule keep their tag
    '''
    #Checked before the trained headword lists (case-insensitive)
    early_tags = {
        'what': 'WHAT', 'what\'s': 'WHAT',
        'do': 'DO', 'does': 'DO', 'did': 'DO',
    }
    #Checked after the trained headword lists (case-insensitive)
    late_tags = {
        'is': 'BE', 'was': 'BE', 'are': 'BE', 'were': 'BE', 'be': 'BE',
        'when': 'WHEN',
        'where': 'WHERE',
        'can': 'CAN',
        'how': 'HOW',
        'who': 'WHO', 'whom': 'WHO', 'whose': 'WHO', 'whos': 'WHO',
        'which': 'WHICH',
        'define': 'DEFINE',
        'should': 'SHOULD',
        'why': 'WHY', 'wy': 'WHY',
    }
    retagged = []
    for (token,tag) in pos:
        lowered = token.lower()
        if lowered in early_tags:
            new_tag = early_tags[lowered]
        #Tag the word from the training result (the word is compared as written, not lower-cased)
        elif token in number_list:
            new_tag = 'NUMBER'
        elif token in year_list:
            new_tag = 'YEAR'
        elif token in name_list:
            new_tag = 'NAME'
        elif token in location_list:
            new_tag = 'LOC'
        else:
            new_tag = late_tags.get(lowered,tag)
        retagged.append((token,new_tag))
    return retagged
def get_continuous_chunks(text):
    '''
        Classify a question by chunking its customized POS tags.
        input:  a question query
        output: chunk of this question query, a list of (question type, chunk text) tuples
                like [('WHEN','when ...')...]
    '''
    #Tag the question with nltk, then switch to the customized tags
    tagged = transfer_pos_question(pos_tag(nltk.word_tokenize(text)))
    #Define different grammar for different types of question
    grammar = r"""
                WHAT: 
                    {<WHAT>}
                    {<WHICH>}
                    {<DEFINE>}
                WHO:
                    {<WHO>}
                    {<WHAT><BE>?<DT>?<JJ|RB>*<NAME>}
                    {<WHAT><JJ|RB>*<NN>+<NAME>}
                NUMBER:
                    {<WHICH><NUMBER>}
                    {<HOW><NUMBER>}
                    {<WHAT><BE>?<DT>?<JJ>?<NN>*<JJ>?<NUMBER>}
                WHEN:
                    {<WHICH><YEAR>}
                    {<WHAT><BE>?<DT>?<JJ>?<NN>*<JJ>?<YEAR>}
                    {<WHEN>}
                WHERE:
                    {<WHERE>}
                    {<WHAT><LOC>}
                    {<WHAT><BE>?<DT>?<RBS>?<JJ>?<LOC>}
                HOW:
                    {<CAN>}
                    {<DO>}
                    {<SHOULD>}
                    {<WHY>}
                    {<HOW>}

                """
    #Load nltk RegexpParser to analyse the grammar
    chunker = nltk.RegexpParser(grammar)
    tree = chunker.parse(tagged)
    chunks = []
    #Every subtree except the root 'S' is one recognised chunk
    for subtree in tree.subtrees():
        label = subtree.label()
        if label == 'S':
            continue
        #Commas are glued to the previous word, every other word is preceded by a space;
        #the first character of the joined text is then dropped
        pieces = [leaf if leaf == ',' else u' ' + leaf for leaf,leaf_tag in subtree.leaves()]
        chunks.append((label,u''.join(pieces)[1:]))
    return chunks
'''
    Part IV. Named Entity Recognition processing
    There are 5 def in this part.
    1. input_NER()
                input:  None
                output: StanfordNER model file, StanfordNER jar file
    2. analyse_NER()
                input:  list of sentences that already tagged by edited NER tool
                output: remove 'O' tag and combine continuous same tags as one entity
    3. parse_NER()
                input:  list of sentences that already tagged by Stanford NER tool 
                output: list of sentences with the NER result corrected manually (by some rules)
    4. extract_NER()
                input:  NER sentence and the model number
                output: ordered list of the entities of the requested type in this NER sentence 
    5. parse_token()
                input:  a token of a sentence
                output: correct mistake in tokens, and return the new tokens 
'''
def input_NER():
    '''
        Build the paths of the Stanford NER resources.
        input:  None
        output: StanfordNER model file, StanfordNER jar file (both inside 'stanford-ner-2016-10-31')
    '''
    stanford_dir = 'stanford-ner-2016-10-31'
    model_path = os.path.join(stanford_dir,'classifiers/english.all.3class.distsim.crf.ser.gz')
    jar_path = os.path.join(stanford_dir,'stanford-ner.jar')
    return model_path,jar_path
def analyse_NER(ner_sentences):
    '''
        Merge consecutive tokens that carry the same entity tag into one entity and drop the
        tokens tagged u'O'.
        input:  list of sentences, each one a list of (token, entity type) tuples
        output: list of sentences, each one a list of (entity text, entity type) tuples in
                order of appearance
    '''
    result_sentences = []
    for ner_sentence in ner_sentences:
        result_sentence = []
        #Type of the entity that is currently being built (u'O' when there is none)
        perv_type = u'O'
        #Text of the entity being built; every assignment leaves one trailing space
        word = u''
        #A word of conjunction_word seen right after the entity, waiting for the next token
        conjunction = u''
        conjunc_flag = False
        for index,(entity,etype) in enumerate(ner_sentence):
            if perv_type == u'O' and etype != u'O':              
                #No entity in progress and a tagged token arrives: start a new entity
                perv_type = etype
                word = entity + u' '
            elif word != u'':
                if etype == u'O':
                    if entity not in conjunction_word:
                        #An ordinary untagged token ends the entity; a pending conjunction is discarded
                        result_sentence.append((word[:-1],perv_type))
                        word = u''
                        perv_type = u'O'
                        if conjunction != u'':
                            conjunction = u''
                            conjunc_flag = False
                    else:
                        #A conjunction word keeps the entity open
                        if conjunction != u'':
                            #Second conjunction word in a row: forget the pending one
                            conjunction = u''
                            conjunc_flag = False
                        else:
                            conjunction = entity
                            conjunc_flag = True
                elif etype != perv_type:
                    #The tag changes: close the current entity and start the next one
                    result_sentence.append((word[:-1],perv_type))
                    word = entity + u' '
                    perv_type = etype
                    conjunction = u''
                    conjunc_flag = False
                elif etype == perv_type:
                    if conjunc_flag:
                        #Same tag on both sides of the conjunction: keep it inside the entity
                        if conjunction == u',':
                            #NOTE(review): only reachable if ',' is added to conjunction_word
                            word = word[:-1] + conjunction + u' ' + entity + u' '
                        else:
                            word = word + conjunction + u' ' + entity + u' '
                        conjunction = u''
                        conjunc_flag = False
                    else:
                        #'%' is glued to the text before it and the text after a lone '$' is glued to it
                        if entity in ['%'] or word == u'$ ':
                            word = word[:-1] + entity + u' '
                        else:
                            word = word + entity + u' '
        #Flush the entity that is still open at the end of the sentence
        if word != u'':
            result_sentence.append((word[:-1],perv_type))
        result_sentences.append(result_sentence)      
    return result_sentences
def parse_NER(ner_sentences):
    '''
        Correct the output of the Stanford NER tagger with hand-written rules.
        input:  list of sentences that are already tagged by the Stanford NER tool, each one a
                list of (token, entity type) tuples
        output: list of sentences with the same structure; empty tokens are removed, dash-like
                characters are normalised and tokens tagged u'O' may be retagged as
                YEAR, NUMBER, LOCATION or ORGANIZATION
    '''
    pattern_number = re.compile(r'([0-9]+|\%|\$)')
    year_number = re.compile(r'([0-9]{4}s?)')
    #Dash-like characters become '-', the fraction slash becomes '%'
    replacements = (
        (u'\u2013',u'-'),
        (u'\u2014',u'-'),
        (u'\u2212',u'-'),
        (u'\u2044',u'%'),
    )
    result_sentences = []
    for ner_sentence in ner_sentences:
        result_sentence = []
        for index,(entity,etype) in enumerate(ner_sentence):
            if entity == u'':
                continue
            #str.replace returns a new string, so the result has to be kept
            for old,new in replacements:
                entity = entity.replace(old,new)
            if etype != u'O':
                #Brackets are never part of an entity; every other tagged token keeps its tag
                if entity in ['(',')']:
                    new_type = u'O'
                else:
                    new_type = etype
            elif year_number.search(entity):
                new_type = u'YEAR'
            elif pattern_number.search(entity) or entity in time_word:
                new_type = u'NUMBER'
            elif u'-' in entity:
                #A hyphenated token is a NUMBER when one of its parts is a time word;
                #otherwise it stays in the sentence with its tag instead of being dropped
                if any(part in time_word for part in entity.split(u'-')):
                    new_type = u'NUMBER'
                else:
                    new_type = etype
            elif entity in location_word:
                new_type = u'LOCATION'
            elif index == 0 and entity.lower() not in stopwords:
                new_type = u'ORGANIZATION'
            elif index != 0 and entity[0].isupper():
                new_type = u'ORGANIZATION'
            else:
                new_type = etype
            result_sentence.append((entity,new_type))
        result_sentences.append(result_sentence)
    return result_sentences
def extract_NER(parse_ner_sentence,mode):
    '''
        Collect the entities of one type from a NER sentence.
        input:  NER sentence, a list of (entity, entity type) tuples, and the mode number
                (0: PERSON, 1: NUMBER, 2: LOCATION, 3: ORGANIZATION, 4: YEAR)
        output: the entities of the requested type in sentence order; an empty list for any other mode
    '''
    #The position of a label in this tuple is its mode number
    labels = (u'PERSON',u'NUMBER',u'LOCATION',u'ORGANIZATION',u'YEAR')
    selected = [label for code,label in enumerate(labels) if mode == code]
    if not selected:
        return []
    wanted = selected[0]
    return [entity for entity,etype in parse_ner_sentence if etype == wanted]
def parse_token(token_sentence):
    '''
        Glue a period that is neither the first nor the last token back onto the token before it.
        input:  the tokens of a sentence
        output: the corrected list of tokens
    '''
    last_index = len(token_sentence)-1
    merged = []
    for position,token in enumerate(token_sentence):
        if token == u'.' and 0 < position < last_index:
            #An inner period belongs to the previous token (e.g. an abbreviation)
            merged[-1] = merged[-1] + u'.'
        else:
            merged.append(token)
    return merged
'''
    Part V. Tagging sentence by POS Tagger
    There are 2 def in this part.
    1. transfer_pos_sentence()
                input:  POS tag of the sentence
                output: Edited POS tag of the sentence
    2. get_continuous_chunks_sentence()
                input:  the sentence, the type of question (0: WHAT)
                output: the POS tag, and the result by loading customized chunk grammar 
'''
def transfer_pos_sentence(pos):
    '''
        Replace the POS tags of a few function words by the customized tags of the sentence grammar.
        input:  POS tag of the sentence, a list of (word, tag) tuples
        output: Edited POS tag of the sentence; every other word keeps its tag
    '''
    #Matched on the lower-cased word
    lowercase_tags = {'and': 'POSICC', 'or': 'POSICC', 'with': 'WITH', 'a': 'A', 'an': 'A'}
    #Matched on the word exactly as written
    exact_tags = {'"': '"', 'around': 'AROUND'}
    retagged = []
    for (token,tag) in pos:
        lowered = token.lower()
        if lowered in lowercase_tags:
            retagged.append((token,lowercase_tags[lowered]))
        else:
            retagged.append((token,exact_tags.get(token,tag)))
    return retagged
def get_continuous_chunks_sentence(text,texttype):
    '''
        Chunk a candidate answer sentence with the grammar of the given question type.
        input:  the sentence, the type of question (0: WHAT, the only type with a grammar)
        output: the customized POS tags of the sentence, and the list of (chunk label, chunk text)
                tuples found at the top level of the parse tree
        raises: ValueError when texttype is not 0
    '''
    if texttype != 0:
        #No grammar exists for other types; this used to fail later with an unbound 'grammar'
        raise ValueError('unsupported texttype %r: only 0 (WHAT) has a grammar' % (texttype,))
    #WHAT
    pos = transfer_pos_sentence(pos_tag(nltk.word_tokenize(text)))
    grammar = r"""
                    J:
                        {<JJ.*><VBN>}
                        {<JJ.*><POSICC><JJ.*>}   
                        {<JJ.*>+}
                        {<NN.*><POS>}
                    N:
                        {<CD>+<NN.*>}
                        {<A>?<NN.*>?<J>?<NN.*>+}
                        <\">{<A>?<J>?<NN.*>+}<\">
                    COMBON:
                        {(<N><,>)*<N><,>?<POSICC><N>}
                    NWC:
                        {<N><WITH><COMBON>}
                    """
    cp = nltk.RegexpParser(grammar)
    #Keep an independent copy of the tagged tokens to return next to the chunks
    poss = copy.deepcopy(pos)
    tree = cp.parse(pos)
    result = []
    for node in tree:
        #Plain (word, tag) tuples are tokens that matched no chunk rule
        if isinstance(node,tuple):
            continue
        if node.label() != 'S':
            #Commas are glued to the previous word, every other word is preceded by a space;
            #the first character of the joined text is then dropped
            pieces = [leaf if leaf == ',' else u' ' + leaf for leaf,leaf_tag in node.leaves()]
            result.append((node.label(),u''.join(pieces)[1:]))
    return poss,result
'''
    Part VI. Answer Ranking rules
    There are 4 def in this part.
    1. get_open_class_words()
                input:  token of the question query
                output: removed stopword and punctuation of the list
    2. rank_rule_1()
                input:  entity and the question query
                output: score of rule 1
    3. rank_rule_3()
                input:  answer sentence, token of the answer sentence, entity and the token of question query 
                output: score of rule 3
    4. screen_out_answer_WHAT()
                input:  the list of alternative answers
                output: the list of alternative answers after filtering
'''  
def rank_rule_1(entity,query):
    '''
        Lower the score of an entity whose content words also appear in the query.
        input:  entity (a sequence of words) and the question query
        output: score of rule 1: 1 minus the share of entity words whose lemma is in the query
                and is not a stopword; 0 for an empty entity
    '''
    length = len(entity)
    if length == 0:
        return 0
    shared = 0
    for raw_word in entity:
        lemma = lemmatize(raw_word.lower())
        if lemma in query and lemma not in stopwords:
            shared += 1
    return 1 - float(shared)/length
def get_open_class_words(query):
    '''
        Keep the open-class words of a tokenized question.
        input:  token of the question query
        output: the tokens that are neither in the global stopwords list nor contained in
                string.punctuation, in their original order
    '''
    return [token for token in query
            if token not in stopwords and token not in string.punctuation]
def rank_rule_3(answer_sentence,sentence,entity,query):
    '''
        Give higher scores to an entity that is close to an open-class word of the query.
        input:  answer sentence, token of the answer sentence, entity and the token of question query
        output: score of rule 3: 1 minus the smallest distance between an entity word and a query
                word, divided by the length of the answer sentence; 0.0 when nothing matches or
                when the answer sentence is empty
    '''
    total = len(answer_sentence)
    if total == 0:
        #Nothing can be located in an empty sentence, and its length cannot be used as a divisor
        return 0.0
    #Step 1: using a filter to extract "useful" open-class words
    open_words = get_open_class_words(query)
    #Step 2: first position of every entity word and of every query word
    #NOTE(review): entity words are located in answer_sentence and query words in sentence;
    #the distance assumes both are indexed the same way - confirm against the caller
    entity_loc = [answer_sentence.index(word) for word in entity if word in answer_sentence]
    query_loc = [sentence.index(q) for q in open_words if q in sentence]
    #Step 3: smallest distance over all pairs, capped by the sentence length
    distances = [abs(i - j) for i in query_loc for j in entity_loc]
    min_dist = min(distances + [total])
    return 1 - float(min_dist)/total
def screen_out_answer_WHAT(result):
    '''
        Filter the alternative answers of a WHAT question.
        input:  the list of alternative answers as (chunk type, text) tuples
        output: the texts of all chunks whose type is not 'J'
    '''
    return [text for chunk_type,text in result if chunk_type != 'J']

'''
    Part VII. Output
    There is 1 def in this part.
    1. output_result()
                input:  filename
                output: putting result in this file
'''
def output_result(filename):
    '''
        Write the global answers and their ids to a CSV file.
        input:  filename
        output: None; the file receives an "id","answer" header followed by one row per answer,
                the answers encoded as UTF-8
    '''
    #Binary mode is kept from the original call (the mode the Python 2 csv writer expects);
    #the context manager closes the file even if a row fails
    with open(filename, "wb") as predictions_file:
        open_file_object = csv.writer(predictions_file)
        open_file_object.writerow(["id","answer"])
        for i,answer in enumerate(answers):
            open_file_object.writerow([ids[i], answer.encode("utf-8")])

In [2]:
'''
    This cell is for initializing dataset and tools.
'''
#Initialize datasets and tools
#The three data sets are read from the 'data/' directory by input_data()
train,test,dev = input_data()
#Paths of the Stanford NER model and jar, then the tagger built from them
model,jar = input_NER()
st = StanfordNERTagger(model,jar)
#Globals read by lemmatize() and by every function that filters stopwords
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

#Define dataset for customized function
#Tokens removed by transform_text() in addition to the stopwords
punctuations = [',','\'\'','?','\'','.','%','(',')',';','``']
#Customized feature words in sentence
#parse_NER() tags an untagged token as NUMBER when it (or one of its hyphen-separated parts) is in this list
time_word = [
    'one','two','three','four','five','six','seven','eight','nine',
    'January','February','March','April','May','June','July','August','September','October','November','December',
    'million','billion',
    'minutes','hours','years','times',
    'mm','miles','inches','foot','feet',
    'late','early','around','over',
    'persons','seasons','square',
    'spring','summer','fall','autumn','winter'
]
#parse_NER() tags an untagged token as LOCATION when it is in this list
location_word = [
    'southwest','southeast','northwest','northeast'
]
#analyse_NER() lets these untagged words join two entities of the same type
conjunction_word = ['and','of']
#Trained Headword
#One dictionary per answer type, key: headword; value: number of training questions it was seen in
year_headword = {}
organization_headword = {}
person_headword = {}
number_headword = {}
location_headword = {}
#Threshold to avoid overfitting (the doc frequency of the headword)
#Headwords whose count is below this value are deleted after training
threshold = 1
#Initialize data for BM25 processing
#These five globals are read by find_max_n_sentences() and BM25_n(); they describe the test set
question_list, total_sentence_bow, total_question_bow, sent_lengthes, avg_lengthes = get_Docfrequency_SentenceBOW(test)
#Initialize the out put file
filename = "result_enhance.csv"

In [3]:
'''
    This cell is for BM25 processing.
'''
#Set the range of k1, k2 and b
#Each list holds a single value here, so the nested loops below run exactly one combination
k1_list = [0.78]
k2_list = [0]
b_list = [0.5]
#Store the predicting result by BM25 in result_sentences
#One entry per article: for every question, the ids of the retrieved sentences / their scores
result_sentences = []
result_sentences_weight = []
test_length = len(test)
#NOTE(review): the results of every (k1, k2, b) combination are appended to the same two lists,
#so with more than one value per list an entry no longer lines up with its article index
for k1 in k1_list:
    for k2 in k2_list:
        for b in b_list:
            for i in range(0,test_length):
                #the amount of extract sentences
                amount = 5
                correct_id,correct_id_weight = BM25_n(i,k1,k2,b,amount)
                result_sentences.append(correct_id)
                result_sentences_weight.append(correct_id_weight)

In [4]:
'''
    This cell is for Extracting Headword from train dataset
''' 
print 'Start Extracting Headwords from train dataset'
total = len(train)
count = 0
for article in train:
    print count,'/',total,' trains Start...'
    count += 1
    qas = article['qa']
    sentences = article['sentences']
    token_sentences = copy.deepcopy(sentences)
    for i in range(len(sentences)):
        token_sentences[i] = nltk.word_tokenize(token_sentences[i])
    ner_sentences = st.tag_sents(token_sentences)
    parse_ner_sentences = parse_NER(ner_sentences)
    #Get the NER of the answer in the corresponding sentence
    for qa in qas:
        rank = {}
        ner_sentence = parse_ner_sentences[qa['answer_sentence']]
        for word in nltk.word_tokenize(qa['answer']):
            for w,t in ner_sentence:
                if t != u'O' and w == word:
                    rank[t] = rank.get(t,0) + 1
        result = sorted(rank.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
        if result != []:        
            kind = result[0][0]
            text = qa['question']
            #According to the NER of answer, save headwords into different dictionary key: headword; value: doc frequency
            if kind == u'NUMBER':
                for word in getHeadWord(text):
                    number_headword[word] = number_headword.get(word,0) + 1
            elif kind == u'YEAR':
                for word in getHeadWord(text):
                    year_headword[word] = year_headword.get(word,0) + 1
            elif kind == u'ORGANIZATION':
                for word in getHeadWord(text):
                    organization_headword[word] = organization_headword.get(word,0) + 1
            elif kind == u'PERSON':
                for word in getHeadWord(text):
                    person_headword[word] = person_headword.get(word,0) + 1
            elif kind == u'LOCATION':
                for word in getHeadWord(text):
                    location_headword[word] = location_headword.get(word,0) + 1

#If two dictionary have the same headword, which one has the higher doc frequency will keep the result, while the other will delete this record.
for (w,n) in year_headword.items():
    if w in organization_headword:
        if n >= organization_headword.get(w):
            del organization_headword[w]
        else:
            del year_headword[w]
for (w,n) in year_headword.items():
    if w in person_headword:
        if n >= person_headword.get(w):
            del person_headword[w]
        else:
            del year_headword[w]
for (w,n) in year_headword.items():
    if w in number_headword:
        if n >= number_headword.get(w):
            del number_headword[w]
        else:
            del year_headword[w]
for (w,n) in year_headword.items():
    if w in location_headword:
        if n >= location_headword.get(w):
            del location_headword[w]
        else:
            del year_headword[w]
            
for (w,n) in organization_headword.items():
    if w in person_headword:
        if n >= person_headword.get(w):
            del person_headword[w]
        else:
            del organization_headword[w]
for (w,n) in organization_headword.items():
    if w in number_headword:
        if n >= number_headword.get(w):
            del number_headword[w]
        else:
            del organization_headword[w]
for (w,n) in organization_headword.items():
    if w in location_headword:
        if n >= location_headword.get(w):
            del location_headword[w]
        else:
            del organization_headword[w]
            
for (w,n) in person_headword.items():
    if w in number_headword:
        if n >= number_headword.get(w):
            del number_headword[w]
        else:
            del person_headword[w]
for (w,n) in person_headword.items():
    if w in location_headword:
        if n >= location_headword.get(w):
            del location_headword[w]
        else:
            del person_headword[w]

for (w,n) in number_headword.items():
    if w in location_headword:
        if n >= location_headword.get(w):
            del location_headword[w]
        else:
            del number_headword[w]
            
#Filtering the result to avoid overfitting
# Every headword whose value is below `threshold` is removed from its
# dictionary.  The dictionaries are processed in the same order as before:
# year, organization, person, number, location.
for headword_table in (year_headword, organization_headword, person_headword,
                       number_headword, location_headword):
    # Iterate over a snapshot because entries are deleted inside the loop.
    for (w, n) in list(headword_table.items()):
        if n < threshold:
            del headword_table[w]

# Turn the surviving headwords of each answer type into plain lists and add
# a few hand-picked headwords.  list(...) materialises the keys explicitly so
# the concatenation does not depend on keys() returning a list.
# organization_list receives no hand-picked additions.
location_list = list(location_headword) + [
    'country', 'county', 'district', 'city',
]
number_list = list(number_headword) + [
    'version', 'size', 'msa', 'far', 'much', 'ration', 'time', 'many',
    'population', 'large', 'percent', 'average', 'day', 'decade', 'big',
    'long',
]
organization_list = list(organization_headword)
name_list = list(person_headword) + [
    'name', 'center', 'president', 'denominations', 'denomination', 'film',
    'broadcaster', 'pitcher', 'commentator',
]
year_list = list(year_headword) + ['years', 'year', 'era']
                

Start Extracting Headwords from train dataset
0 / 360  trains Start...
1 / 360  trains Start...
2 / 360  trains Start...
3 / 360  trains Start...
4 / 360  trains Start...
5 / 360  trains Start...
6 / 360  trains Start...
7 / 360  trains Start...
8 / 360  trains Start...
9 / 360  trains Start...
10 / 360  trains Start...
11 / 360  trains Start...
12 / 360  trains Start...
13 / 360  trains Start...
14 / 360  trains Start...
15 / 360  trains Start...
16 / 360  trains Start...
17 / 360  trains Start...
18 / 360  trains Start...
19 / 360  trains Start...
20 / 360  trains Start...
21 / 360  trains Start...
22 / 360  trains Start...
23 / 360  trains Start...
24 / 360  trains Start...
25 / 360  trains Start...
26 / 360  trains Start...
27 / 360  trains Start...
28 / 360  trains Start...
29 / 360  trains Start...
30 / 360  trains Start...
31 / 360  trains Start...
32 / 360  trains Start...
33 / 360  trains Start...
34 / 360  trains Start...
35 / 360  trains Start...
36 / 360  trains Start...
37

In [5]:
'''
    This cell is for 
        I.   Question Classification
        II.  Named Entity Recognition processing
        III. Answer Ranking
        IV.  Output.

    For every article in `test`, all sentences are NER-tagged once.  Then,
    for every question, the first `amount` retrieved sentences (taken from
    `result_sentences` / `result_sentences_weight`) are inspected: candidate
    entities are extracted according to the question type, each entity is
    scored with ranking rules 1 and 3, and the best entity score is scaled by
    the relative retrieval weight of its sentence.  The answer of the best
    scoring sentence is stored in `answers`, the question id in `ids`.
''' 
ids = []
answers = []
count = 1

#the amount of extract sentences
amount = 4
#weights of ranking rule 1 and ranking rule 3 in the combined entity score
w1 = 0.2
w3 = 1 - w1
#question type -> (category tried first, categories tried when the first one
#yields nothing).  The integers are the category arguments of extract_NER().
#NOTE(review): the meaning of each integer is defined by extract_NER in an
#earlier cell - confirm there before changing this table.
ner_preferences = {
    'NUMBER': (1, (3,)),
    'WHO': (0, (2, 3)),
    'WHERE': (2, (0, 3)),
    'WHEN': (4, ()),
}
#what, or other types
default_ner_preference = (3, (2, 0, 1, 4))

for index, article in enumerate(test):
    qas = article['qa']
    sentences = article['sentences']
    #Tag the sentence by NER processing
    token_sentences = [parse_token(nltk.word_tokenize(raw_sentence))
                       for raw_sentence in sentences]
    ner_sentences = st.tag_sents(token_sentences)
    parse_ner_sentences = parse_NER(ner_sentences)
    #Predict the answer of each question
    for i, qa in enumerate(qas):
        anss = []
        scs = []
        question_id = qa['id']
        #Extract question query
        text_question = qa['question']
        #Get the grammar(POS) result of question; it only depends on the
        #question, so it is computed once instead of once per sentence
        result = get_continuous_chunks(text_question)
        candidate_ids = result_sentences[index][i]
        candidate_weights = result_sentences_weight[index][i]
        weight_total = sum(candidate_weights)
        for ii in range(amount):
            #Relative retrieval weight of this candidate sentence
            if candidate_weights[ii] != 0:
                weight = float(candidate_weights[ii]) / weight_total
            else:
                weight = 0
            #Extract the id of sentence of answer
            answer_sentence_id = candidate_ids[ii]
            #Extract the sentence of answer
            answer_sentence = sentences[answer_sentence_id]
            ner_sentence = parse_ner_sentences[answer_sentence_id]

            #Start from an empty list for every candidate.  Without this
            #reset an unclassified question (empty `result`) would reuse the
            #entities of the previously processed sentence.
            answer_list = []
            if result:
                wtype = result[0][0]
                is_other_type = wtype not in ner_preferences
                primary, fallbacks = ner_preferences.get(
                    wtype, default_ner_preference)
                answer_list = extract_NER(ner_sentence, primary)
                if not answer_list:
                    if is_other_type:
                        #what, or other types: start from the chunks of the
                        #sentence itself before adding the other categories
                        wpos, wresult = get_continuous_chunks_sentence(
                            answer_sentence, 0)
                        answer_list = screen_out_answer_WHAT(wresult)
                    for category in fallbacks:
                        answer_list += extract_NER(ner_sentence, category)

            if answer_list:
                #For question that have answer, ranking the answer by rule 1&3
                query = [lemmatize(token.lower())
                         for token in nltk.word_tokenize(text_question)]
                #token the answer sentence; the lemmatized copy is used next
                #to the original tokens by rule 3
                sentence_tokens = nltk.word_tokenize(answer_sentence)
                sentence = [lemmatize(token.lower())
                            for token in sentence_tokens]
                scores = []
                for entity_text in answer_list:
                    entity = nltk.word_tokenize(entity_text)
                    score1 = rank_rule_1(entity, query)
                    #sentence_tokens is the original version and sentence is
                    #preprocessed
                    score3 = rank_rule_3(sentence_tokens, sentence, entity,
                                         query)
                    #An entity rejected by rule 1 gets a total score of 0
                    if score1 == 0:
                        score3 = 0
                    scores.append(w1 * score1 + w3 * score3)
                best_score = max(scores)
                answer = answer_list[scores.index(best_score)]
                sc = best_score * weight
            else:
                #For question that doesn't have answer, return the total sentence
                answer = answer_sentence
                sc = 0
            anss.append(answer)
            scs.append(sc)
        #Keep the answer of the best scoring candidate sentence
        ans = anss[scs.index(max(scs))]
        ids.append(question_id)
        answers.append(ans)
    #Single-argument form: prints the same text under Python 2 and Python 3
    print('article  %s  finished.' % count)
    count += 1
'''
    IV. Output
    Commas in the answers are replaced by the token -COMMA- and double quotes
    are removed, then output_result() is called with the target file name.
'''
#Single-argument print form: same text under Python 2 and Python 3
print('Output result...')
filename = "result_enhance.csv"
#Slice assignment updates the existing list object in place, exactly like the
#element-wise assignment it replaces.
#NOTE(review): output_result() presumably reads the module-level `ids` and
#`answers` - confirm against its definition in an earlier cell.
answers[:] = [answer_text.replace(',', '-COMMA-').replace('"', '')
              for answer_text in answers]
output_result(filename)
print('Results are stored in  %s' % filename)

article  1  finished.
article  2  finished.
article  3  finished.
article  4  finished.
article  5  finished.
article  6  finished.
article  7  finished.
article  8  finished.
article  9  finished.
article  10  finished.
article  11  finished.
article  12  finished.
article  13  finished.
article  14  finished.
article  15  finished.
article  16  finished.
article  17  finished.
article  18  finished.
article  19  finished.
article  20  finished.
article  21  finished.
article  22  finished.
article  23  finished.
article  24  finished.
article  25  finished.
article  26  finished.
article  27  finished.
article  28  finished.
article  29  finished.
article  30  finished.
article  31  finished.
article  32  finished.
article  33  finished.
article  34  finished.
article  35  finished.
article  36  finished.
article  37  finished.
article  38  finished.
article  39  finished.
article  40  finished.
article  41  finished.
article  42  finished.
Output result...
Results are stored in  res