In [None]:
# PartI: Question Processing
# - Answer Type Detection
# -- build a supervised ML classifier to predict the answer type for each question in the test set
# -- use questions in train dataset as the training questions
# -- use ner(answer) as the labelled class *
# -- use POS tag of each word in the question as the features 
# -- // alternatives: tokens in the question(BOW) ; NER of each word in question ; synset ID of each word in question
# - Query Formulation
import json
import re
import nltk
import string
from time import ctime
from math import log
from nltk import ngrams
from collections import defaultdict, Counter
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger
from sklearn.feature_extraction import DictVectorizer
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation 
from sklearn.metrics import accuracy_score, classification_report

#printing start time of the script
#print("Start Time:",ctime())
# Tagger handles shared by all later cells. Only model file names are given
# here, not full paths.
# NOTE(review): presumably NLTK resolves the jar/model locations from
# environment variables (CLASSPATH / STANFORD_MODELS) - confirm on your setup.
stanford_ner_tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
stanford_pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# build classifier
### get class label
# NOTE(review): absolute, user-specific path - the script only runs where this
# file exists.
with open('/Users/zhanglufan/QA_train.json') as data_file:
    data = json.load(data_file)
training_data = []  # (question, answer) pairs, in file order
training_answers = []  # answer strings only
training_questions = []  # question strings only
training_answers_ner = [] # (answer,NER type); appended to by get_label_array

# getting list of english punctuation marks to clean out sentences
# The five characters removed below are deliberately kept in the text
# (comma, hyphen, full stop, apostrophe, percent sign); every other
# punctuation character in this set is stripped by the get_answer* functions.
PunctuationExclude = set(string.punctuation)
PunctuationExclude.remove(',')
PunctuationExclude.remove('-')
PunctuationExclude.remove('.')
PunctuationExclude.remove('\'')
PunctuationExclude.remove('%')

# Each top-level item of the JSON file carries a 'qa' list whose entries have
# 'question' and 'answer' keys; flatten them into the three parallel lists.
for ques_article in data:
    for each in ques_article['qa']:
        training_answers.append(each['answer']) # punctuations are useful in determining correct answers-keep punctuations
        training_questions.append(each['question'])
        training_data.append((each['question'],each['answer']))
#print training_answers[:30]
#print training_questions[:30]
#print training_data[:30]
#print training_answers_ner[:30]
def find_number(answer):
    """Tell whether ``answer`` begins with a digit.

    ``re.match`` anchors at the start of the string, so only a leading run
    of digits counts ("1990s" matches, "in 1990" does not).

    Args:
        answer: the answer text to inspect.

    Returns:
        True when the text starts with at least one digit, False otherwise.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    return re.match(r"\d+", answer) is not None
def find_organization(ner_list):
    """Tell whether any tagged token carries the 'ORGANIZATION' label.

    Args:
        ner_list: a list of tagged sentences, each a sequence of items whose
            second element is the NER label (the shape produced by
            ``tag_sents`` elsewhere in this file).

    Returns:
        True if at least one token is labelled 'ORGANIZATION', else False.
    """
    return any(
        token[1] == 'ORGANIZATION'
        for sentence in ner_list
        for token in sentence
    )
    
def find_person(ner_list):
    """Tell whether any tagged token carries the 'PERSON' label.

    Args:
        ner_list: a list of tagged sentences, each a sequence of items whose
            second element is the NER label (the shape produced by
            ``tag_sents`` elsewhere in this file).

    Returns:
        True if at least one token is labelled 'PERSON', else False.
    """
    return any(
        token[1] == 'PERSON'
        for sentence in ner_list
        for token in sentence
    )
#find_person('William Herschel')
#print '======'
def find_location(ner_list):
    """Tell whether any tagged token carries the 'LOCATION' label.

    Args:
        ner_list: a list of tagged sentences, each a sequence of items whose
            second element is the NER label (the shape produced by
            ``tag_sents`` elsewhere in this file).

    Returns:
        True if at least one token is labelled 'LOCATION', else False.
    """
    return any(
        token[1] == 'LOCATION'
        for sentence in ner_list
        for token in sentence
    )

def get_label_array(training_answers):
    """Label every answer and record the result in ``training_answers_ner``.

    The label of each answer comes from ``get_label`` so that the
    classification rules live in one place only.

    Args:
        training_answers: iterable of answer strings.

    Returns:
        The module-level ``training_answers_ner`` list, extended in place
        with one ``(answer, label)`` pair per input answer. Pairs from
        earlier calls are kept, so calling this twice duplicates entries.
    """
    for answer in training_answers:
        training_answers_ner.append((answer, get_label(answer)))
    return training_answers_ner

def get_label(answer):
    """Return the coarse answer type of ``answer``.

    The checks are applied in a fixed priority order: a leading digit wins,
    then ORGANIZATION, LOCATION and PERSON entities found by the NER tagger.

    Args:
        answer: text to classify; it is split on whitespace before tagging.

    Returns:
        One of 'NUMBER', 'ORGANIZATION', 'LOCATION', 'PERSON' or 'OTHER'.
    """
    # The numeric check needs no tagging, so do it first and skip the tagger
    # call entirely when it already decides the label.
    if find_number(answer):
        return 'NUMBER'

    ner_list = stanford_ner_tagger.tag_sents([answer.split()])
    if find_organization(ner_list):
        return 'ORGANIZATION'
    if find_location(ner_list):
        return 'LOCATION'
    if find_person(ner_list):
        return 'PERSON'
    return 'OTHER'

def get_BOW(text):
    """Build a bag-of-words dict mapping each item of ``text`` to its count.

    ``text`` is iterated as given: a list yields one entry per distinct
    token, a plain string yields one entry per distinct character.
    """
    tally = Counter()
    for item in text:
        tally[item] += 1
    # Callers get a plain dict, not a Counter.
    return dict(tally)

# Word counts over the training questions; the condition is the word length.
cfdist = ConditionalFreqDist()
def get_all_words():
    """Add every token of every training question to ``cfdist``.

    Tokens are grouped under their character length. The module-level
    ``cfdist`` is updated in place and returned, so repeated calls keep
    adding to the same counts.
    """
    all_tokens = (
        token
        for question in training_questions
        for token in word_tokenize(question)
    )
    for token in all_tokens:
        cfdist[len(token)][token] += 1
    return cfdist

def get_all_words2():
    """Return a fresh FreqDist of all tokens in the training questions."""
    counts = FreqDist()
    all_tokens = (
        token
        for question in training_questions
        for token in word_tokenize(question)
    )
    for token in all_tokens:
        counts[token] += 1
    return counts
        

#print word_features
# Vocabulary used as feature names; filled on first use by
# _top_question_words() so the training set is tokenised only once.
_WORD_FEATURES_CACHE = []


def _top_question_words(limit=200):
    """Return the ``limit`` most frequent tokens of the training questions."""
    if not _WORD_FEATURES_CACHE:
        _WORD_FEATURES_CACHE.extend(
            word for word, _count in get_all_words2().most_common(limit)
        )
    return _WORD_FEATURES_CACHE


def feature_extractor(question):
    """Turn a question into a bag-of-words feature dict.

    Previously the returned values were corpus-wide frequencies, so every
    question produced the same vector; the values now describe the question
    itself.

    Args:
        question: the question text.

    Returns:
        A dict with one key per vocabulary word (the 200 most frequent
        training-question tokens). Each value is how many times that word
        occurs in ``question``, 0 when it is absent.

    Note:
        The vocabulary is computed on the first call and then reused; it
        does not pick up later changes to ``training_questions``.
    """
    question_counts = Counter(word_tokenize(question))
    return {word: question_counts[word] for word in _top_question_words()}
#print feature_extractor('red')        

def prepare_feature_data(feature_extractor, limit=500):
    """Build the feature matrix and label list for classifier training.

    Args:
        feature_extractor: callable mapping a question string to a feature
            dict.
        limit: number of leading ``training_data`` pairs to use. The default
            of 500 keeps the former hard-coded behaviour; pass ``None`` to
            use every pair.

    Returns:
        A ``(dataset, classifications)`` tuple: the matrix produced by
        ``DictVectorizer.fit_transform`` and the list of ``get_label``
        results for the corresponding answers, in the same order.
    """
    examples = training_data if limit is None else training_data[:limit]

    feature_matrix = []
    classifications = []
    for question, answer in examples:
        feature_matrix.append(feature_extractor(question))
        classifications.append(get_label(answer))

    vectorizer = DictVectorizer()
    dataset = vectorizer.fit_transform(feature_matrix)
    return dataset, classifications

# Build features and labels, then get 10-fold cross-validated predictions
# from a RandomForestClassifier with default settings.
# NOTE(review): cross_val_predict presumably works on clones, leaving `clf`
# itself unfitted - confirm before using `clf` to predict.
# NOTE(review): the `sklearn.cross_validation` module was removed in later
# scikit-learn releases in favour of `sklearn.model_selection` - confirm
# against the installed version.
dataset,classifications = prepare_feature_data(feature_extractor)
#print dataset._shape
clf = RandomForestClassifier()
predictions = cross_validation.cross_val_predict(clf, dataset,classifications, cv=10)
def check_results(predictions, classifications):
    """Print the accuracy and the per-class report of the predictions.

    ``classifications`` holds the true labels and ``predictions`` the
    predicted ones; nothing is returned.
    """
    print("accuracy")
    score = accuracy_score(classifications, predictions)
    print(score)
    report = classification_report(classifications, predictions)
    print(report)
    
#check_results(predictions, classifications)
#featuresets = [(document_features(get_document(q)), get_label(a)) for (q,a) in training_data]
#print featuresets
#train_set, test_set = featuresets[100:], featuresets[:100]
#classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# PartII: Passage Retrieval
from collections import defaultdict, Counter
from math import log

# NOTE(review): absolute, user-specific path - the script only runs where this
# file exists.
with open('/Users/zhanglufan/QA_dev.json') as data_file2:
    data2 = json.load(data_file2)
dev_data = data2[:10]  # only the first ten top-level items are kept
dev_answers = []  # not filled anywhere in the visible code
dev_questions = []  # question strings, filled by get_questions
dev_sentences = {}  # sentence index -> sentence text, filled by get_doc
dev_q_a = []  # (question, answer) pairs, filled by get_ques_ans

def get_ques_ans(i):
    """Append the (question, answer) pairs of dev item ``i`` to ``dev_q_a``.

    Returns the module-level ``dev_q_a`` list, which keeps growing across
    calls.
    """
    dev_q_a.extend(
        (entry['question'], entry['answer']) for entry in dev_data[i]['qa']
    )
    return dev_q_a
get_ques_ans(0)

def get_questions(i):
    """Append the questions of dev item ``i`` to ``dev_questions``.

    Returns the module-level ``dev_questions`` list, which keeps growing
    across calls.
    """
    dev_questions.extend(entry['question'] for entry in dev_data[i]['qa'])
    return dev_questions
get_questions(0)

def get_doc(i):
    """Store the sentences of dev item ``i`` in ``dev_sentences``.

    Each sentence is keyed by its 0-based position. Existing keys are
    overwritten, but keys beyond the new sentence count are left untouched.
    Returns the module-level ``dev_sentences`` dict.
    """
    dev_sentences.update(enumerate(dev_data[i]['sentences']))
    return dev_sentences
get_doc(0)

#print dev_questions
#print dev_sentences

# English stopword list used when indexing sentences (see extract_term_freqs).
stopwords = set(nltk.corpus.stopwords.words('english'))
# Error analysis showed that many answers contain 'the' and 'of'
# (e.g. "The President"), so these two words are not treated as stopwords.
stopwords.remove('the')
stopwords.remove('of')

# Unmodified copy of the full stopword list; not referenced in the visible code.
stopwordsAll = set(nltk.corpus.stopwords.words('english'))
#stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
# Porter stemmer shared by indexing, querying and question/sentence matching.
stemmer = nltk.stem.PorterStemmer() 

def extract_term_freqs(doc):
    """Count the stemmed, lower-cased, non-stopword tokens of ``doc``.

    Args:
        doc: raw text of one sentence/document.

    Returns:
        A Counter mapping each stemmed term to its frequency in ``doc``.
    """
    tfs = Counter()
    for token in nltk.word_tokenize(doc):
        lowered = token.lower()
        # Compare the lower-cased form: the stopword list is lower case, so
        # testing the raw token let capitalised stopwords through.
        if lowered not in stopwords:
            tfs[stemmer.stem(lowered)] += 1
    return tfs
#print extract_term_freqs(data2[0]['qa'][0]['question'])
def compute_doc_freqs(doc_term_freqs):
    """Count, for every term, the number of documents that contain it.

    ``doc_term_freqs`` maps a document id to that document's term-frequency
    mapping; only the presence of a term matters, not its frequency.
    """
    document_counts = Counter()
    for term_freqs in doc_term_freqs.values():
        # Each distinct term of the document adds exactly one.
        document_counts.update(term_freqs.keys())
    return document_counts

# Term statistics for the retrieval model; every sentence in dev_sentences is
# treated as one "document".
doc_term_freqs = {}  # sentence id -> Counter of stemmed terms
for docid,sent in dev_sentences.items():
    #print q
    term_freqs = extract_term_freqs(sent)
    doc_term_freqs[docid] = term_freqs
M = len(doc_term_freqs)  # number of documents, used in the idf term
doc_freqs = compute_doc_freqs(doc_term_freqs)  # term -> number of documents containing it
#print doc_term_freqs

# build inverted index
# Maps each term to a list of [docid, weight] postings, where weight is the
# document's tf*idf value for the term divided by the document vector length.
vsm_inverted_index = defaultdict(list)
for docid, term_freqs in doc_term_freqs.items():
    N = sum(term_freqs.values())
    length = 0

    # find tf*idf values and accumulate sum of squares
    tfidf_values = []
    for term, count in term_freqs.items():
        tfidf = float(count) / N * log(M / float(doc_freqs[term]))
        tfidf_values.append((term, tfidf))
        length += tfidf ** 2

    # normalise documents by length and insert into index
    length = length ** 0.5
    if length == 0:
        # Every weight is zero (an empty document, or all of its terms occur
        # in every document). Nothing useful can be indexed, and dividing by
        # the zero length would raise ZeroDivisionError.
        continue
    for term, tfidf in tfidf_values:
        # note the inversion of the indexing, to be term -> (doc_id, score)
        vsm_inverted_index[term].append([docid, tfidf / length])

# ensure posting lists are in sorted order (less important here cf above)
for term, docids in vsm_inverted_index.items():
    docids.sort()
    
def query_vsm(query, index, k=1):
    """Rank documents against a query using the inverted index.

    Args:
        query: iterable of already normalised query terms.
        index: mapping term -> list of ``[docid, weight]`` postings.
        k: number of top-scoring documents to return.

    Returns:
        A list of up to ``k`` ``(docid, score)`` tuples, best first, where
        the score is the sum of the weights of the matching query terms.
        The list is empty when no query term occurs in the index.
    """
    accumulator = Counter()
    for term in query:
        # .get() instead of index[term]: subscripting a defaultdict would
        # insert an empty posting list for every unseen query term.
        for docid, weight in index.get(term, ()):
            accumulator[docid] += weight
    return accumulator.most_common(k)

#results = query_vsm([stemmer.stem(term.lower()) for term in 'What technology is used by night-vision devices?'.split()], vsm_inverted_index)
#print results


def get_relevant_sentences(question,query):
    """Retrieve the best-matching sentence(s) for ``question``.

    Args:
        question: the question text.
        query: ranking callable invoked as ``query(terms, index)``, for
            example ``query_vsm``.

    Returns:
        Whatever ``query`` returns for the normalised question terms and
        the module-level ``vsm_inverted_index``.
    """
    terms = []
    for raw_term in question.split():
        # Splitting on whitespace leaves punctuation glued to words
        # ("devices?"), which can never match an indexed term; trim it from
        # both ends and drop tokens that were punctuation only.
        cleaned = raw_term.strip(string.punctuation)
        if cleaned:
            terms.append(stemmer.stem(cleaned.lower()))
    return query(terms, vsm_inverted_index)
#print get_relevant_sentences('What technology is used by night-vision devices?',query_vsm)
#dev_sentences[1]

In [None]:
# PartIII: Answer Extraction
# process the relevant sentence 
# to get corresponding answer type entity
from nltk import ngrams
#get relevant sentences
# Run retrieval for every dev question. Each iteration overwrites
# sent_candidates, so after the loop only the result for the last question
# remains.
for question in dev_questions:
    sent_candidates = get_relevant_sentences(question,query_vsm)
    #print sent_candidates

def get_possible_answer(question,sent):
    """NER-tag the top retrieved sentence and label the question.

    ``sent`` is a retrieval result whose first entry starts with the
    sentence id. Returns a tuple of the NER-tagged sentence (as returned by
    ``tag_sents``) and the label ``get_label`` assigns to ``question``.
    """
    top_docid = sent[0][0]
    tokens = dev_sentences[top_docid].split()
    tagged_sentence = stanford_ner_tagger.tag_sents([tokens])
    return tagged_sentence, get_label(question)
# Sample query kept for manual inspection: `an` holds the retrieval result
# for this single hard-coded question.
q = "What notable warming effect does the presence of infrared absorbers contribute to?"
an = get_relevant_sentences(q,query_vsm)
#print an

def retrieve_answer(sent_candidates,answer_type):
    """NER- and POS-tag every candidate sentence.

    NOTE(review): this looks unfinished. ``answer_type`` and
    ``all_possible_answers`` are never used, the tagging results are
    discarded, and the function returns None.

    Args:
        sent_candidates: iterable of retrieval results whose first element
            is a key into ``dev_sentences``.
        answer_type: unused.
    """
    all_possible_answers = []
    for each in sent_candidates:
        sent = dev_sentences[each[0]]
        # Both results are overwritten on every iteration and never read.
        answer_ner_list = stanford_ner_tagger.tag_sents([sent.split()])
        answer_pos_list = stanford_pos_tagger.tag_sents([sent.split()])
        

# Tag one hard-coded sentence with both taggers.
# NOTE(review): index 93 raises KeyError when the loaded document has fewer
# than 94 sentences; this looks like leftover exploration code - confirm.
sent = dev_sentences[93]
answer_ner_list = stanford_ner_tagger.tag_sents([sent.split()])
answer_pos_list = stanford_pos_tagger.tag_sents([sent.split()])
#print answer_ner_list, answer_pos_list

# basic assumption that answer term should not be explicitly in the question
def check_question(question,pos_list):
    """Check that no question token occurs in the tagged sentence.

    This implements the assumption stated above: an answer term should not
    appear explicitly in the question. The old loop returned during its
    first iteration, so only the first question token was ever tested.

    Args:
        question: the question text.
        pos_list: tagger output whose first entry is the tagged sentence, a
            sequence of (word, tag) items.

    Returns:
        False if any token of ``question`` is a word of the sentence, True
        otherwise (including for a question with no tokens).
    """
    sentence_words = set(tagged[0] for tagged in pos_list[0])
    return not any(
        token in sentence_words for token in nltk.word_tokenize(question)
    )
#pos = stanford_pos_tagger.tag_sents(['new applicatons are used money'.split()])
#print check_question('What reflectance is measured?',pos)    
def get_rid_q(question,sent):
    """Return the set of sentence tokens that are not question tokens.

    Both texts are tokenised with ``nltk.word_tokenize``. The result is a
    set, so the order and repetition of sentence words are not preserved.
    """
    sentence_tokens = set(nltk.word_tokenize(sent))
    return sentence_tokens.difference(nltk.word_tokenize(question))
   

def get_first_nn(q,pos_list):
    """Return the first noun of the sentence that is not a question token.

    Args:
        q: the question text.
        pos_list: tagger output whose first entry is the tagged sentence, a
            sequence of (word, tag) items.

    Returns:
        The first word tagged exactly 'NN' or 'NNP' that is not among the
        tokens of ``q``, or None when there is no such word.
    """
    # Tokenise the question once instead of once per sentence word.
    question_tokens = set(nltk.word_tokenize(q))
    for tagged in pos_list[0]:
        if tagged[0] not in question_tokens and tagged[1] in ('NN', 'NNP'):
            return tagged[0]
    return None

def check_nn(ngram_pos_list):
    """Count the tokens tagged exactly 'NN' or 'NNP' in a tagged n-gram.

    ``ngram_pos_list`` is tagger output; only its first entry, a sequence of
    (word, tag) items, is inspected.
    """
    return sum(
        1 for tagged in ngram_pos_list[0]
        if tagged[1] == 'NN' or tagged[1] == 'NNP'
    )
    
def get_ngram_nn(n,q,sent):
    """Return the n-gram of ``sent`` that contains the most nouns.

    Args:
        n: n-gram size.
        q: the question text (currently unused).
        sent: sentence text; it is split on whitespace.

    Returns:
        A tuple with the words of the chosen n-gram. When several n-grams
        share the highest noun count the last one wins, as before. An empty
        tuple is returned when the sentence has fewer than ``n`` tokens,
        which previously raised IndexError.
    """
    grams = list(ngrams(sent.split(), n))
    if not grams:
        return ()

    # One tag_sents call for all n-grams instead of one call per n-gram.
    tagged_grams = stanford_pos_tagger.tag_sents(grams)

    best_tagged = None
    best_count = -1
    for tagged in tagged_grams:
        noun_count = check_nn([tagged])
        if noun_count >= best_count:  # >= keeps the last of equal candidates
            best_tagged = tagged
            best_count = noun_count
    return list(zip(*best_tagged))[0]


print(get_ngram_nn(3,'What reflectance is measured?','The reflectance of light is measured'))
#print get_first_nn('Along with industrial and medical, in what applications is infrared radiation used?',pos)
#print get_label('What technology is used by night-vision devices?')
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#note: by locating index of the same term occurring in both q&a, we can further explore number of words we return
def find_match(q,s):
    """Find the positions of sentence tokens that also occur in the question.

    Tokens are compared after lower-casing and stemming.

    Args:
        q: list of question tokens.
        s: list of sentence tokens.

    Returns:
        The indices into ``s``, in ascending order, of every token whose
        stem equals the stem of some question token.
    """
    question_stems = set(stemmer.stem(w.lower()) for w in q)
    # enumerate gives the true position of each token; list.index() returned
    # the first occurrence, so repeated words were reported at the wrong
    # place.
    return [
        position
        for position, word in enumerate(s)
        if stemmer.stem(word.lower()) in question_stems
    ]
def find_nn(s,s_index):
    """Return the sentence tokens after the last match, up to the first noun.

    Args:
        s: list of sentence tokens.
        s_index: indices of tokens of ``s`` shared with the question (see
            ``find_match``); only the last index is used.

    Returns:
        The tokens following position ``s_index[-1]`` up to and including
        the first one tagged exactly 'NN' or 'NNP'. None is returned when
        ``s_index`` is empty, when nothing follows the last match, or when
        no noun is found.
    """
    if not s_index:
        # No shared token at all; s_index[-1] would raise IndexError.
        return None
    tail = s[s_index[-1]+1:]
    if not tail:
        return None
    s_pos = stanford_pos_tagger.tag_sents([tail])
    # Use the position inside the tagged output instead of tail.index(word),
    # which points at the first occurrence of a repeated word.
    for position, tagged in enumerate(s_pos[0]):
        if tagged[1] in ('NN', 'NNP'):
            return tail[:position+1]
    return None
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def answer_type_filter(ans_type, ner_list):
    """Return the first word of the tagged sentence labelled ``ans_type``.

    ``ner_list`` is tagger output; only its first entry, a sequence of
    (word, label) items, is searched. None is returned when no label
    matches.
    """
    matching_words = (
        tagged[0] for tagged in ner_list[0] if ans_type == tagged[1]
    )
    return next(matching_words, None)
            
def get_answer(question):
    """Answer ``question`` from the best-matching dev sentence.

    The question and the retrieved sentence are stripped of the characters
    in ``PunctuationExclude``. The answer type comes from ``get_label``
    applied to the cleaned question. For 'OTHER' the first noun of the
    sentence that is not in the question is returned; otherwise the first
    entity of the predicted type is returned.

    Args:
        question: the question text.

    Returns:
        The answer word, or None when no sentence is retrieved or no
        suitable word is found.
    """
    questionText = ''.join(w for w in question if w not in PunctuationExclude)
    sent_candidate = get_relevant_sentences(questionText,query_vsm)
    if not sent_candidate:
        # Nothing retrieved; sent_candidate[0] would raise IndexError.
        return None
    sent = dev_sentences[sent_candidate[0][0]]
    sent2 = ''.join(w for w in sent if w not in PunctuationExclude)
    # Detach commas and full stops so they become separate tokens.
    sent2 = sent2.replace(",", " ,").replace(".", " .")
    tokens = sent2.split()

    predict_answer_type = get_label(questionText)
    # Run only the tagger that the chosen branch needs.
    if predict_answer_type == 'OTHER':
        answer_pos_list = stanford_pos_tagger.tag_sents([tokens])
        return get_first_nn(questionText,answer_pos_list)
    print('--------')
    answer_ner_list = stanford_ner_tagger.tag_sents([tokens])
    return answer_type_filter(predict_answer_type, answer_ner_list)
            
#print get_answer('What reflectance is measured from a semiconductor wafer\'s surface to determine the index of refraction?')     

def get_answer2(question):
    """Return the cleaned text of the sentence that best matches ``question``.

    The NER/POS tagging and answer-type prediction that used to run here
    were never used in the result, so they are no longer computed.

    Args:
        question: the question text.

    Returns:
        The retrieved sentence with the characters in ``PunctuationExclude``
        removed and commas/full stops detached into separate tokens, or None
        when no sentence is retrieved.
    """
    questionText = ''.join(w for w in question if w not in PunctuationExclude)
    # '?' is already stripped above, so only ',' and '.' need detaching.
    questionText = questionText.replace(",", " ,").replace(".", " .")
    sent_candidate = get_relevant_sentences(questionText,query_vsm)
    if not sent_candidate:
        # Nothing retrieved; sent_candidate[0] would raise IndexError.
        return None
    sent = dev_sentences[sent_candidate[0][0]]
    sent2 = ''.join(w for w in sent if w not in PunctuationExclude)
    return sent2.replace(",", " ,").replace(".", " .")


def get_answer3(question):
    """Answer ``question`` with the words following its last shared term.

    The best-matching dev sentence is retrieved and cleaned. The function
    looks for the sentence tokens shared with the question and returns the
    tokens after the last shared one, up to the first noun (``find_nn``).
    When that yields nothing it falls back to the first noun of the sentence
    that is not in the question (``get_first_nn``).

    Args:
        question: the question text.

    Returns:
        A list of tokens from ``find_nn``, a single word from the fallback,
        or None when no sentence is retrieved or no noun is found.
    """
    questionText = ''.join(w for w in question if w not in PunctuationExclude)
    # '?' is already stripped above, so only ',' and '.' need detaching.
    questionText = questionText.replace(",", " ,").replace(".", " .")
    sent_candidate = get_relevant_sentences(questionText,query_vsm)
    if not sent_candidate:
        # Nothing retrieved; sent_candidate[0] would raise IndexError.
        return None
    sent = dev_sentences[sent_candidate[0][0]]
    sent3 = ''.join(w for w in sent if w not in PunctuationExclude)
    sent3 = sent3.replace(",", " ,").replace(".", " .").split()

    i_list = find_match(questionText.split(),sent3)
    # With no shared token there is no anchor for find_nn; go to the fallback.
    result = find_nn(sent3,i_list) if i_list else None
    if result is None:
        print('--------')
        # POS-tag the whole sentence only when the fallback is needed.
        answer_pos_list = stanford_pos_tagger.tag_sents([sent3])
        result = get_first_nn(questionText,answer_pos_list)
        print(result)
    return result

In [None]:
# evaluation
# get answer for dev_dataset

# Number of (question, answer) pairs that will be evaluated.
print len(dev_q_a) 

    
def check_accuracy(guess,answer):
    """Return True when ``guess`` equals ``answer``, otherwise False."""
    return bool(guess == answer)
    
def get_evaluation_result(q_a_file):
    """Answer every question with ``get_answer3`` and report the accuracy.

    Each question, its expected answer and the guess are printed. A guess
    that comes back as a list of tokens is joined with single spaces before
    it is printed and compared, because the expected answers are strings.

    Args:
        q_a_file: sequence of (question, answer) pairs.

    Returns:
        The fraction of questions whose guess equals the expected answer
        exactly, as a float; 0.0 for an empty input. The function used to
        return None because the scoring code was commented out.
    """
    if not q_a_file:
        return 0.0

    counter = 0
    for each in q_a_file:
        guess = get_answer3(each[0])
        if isinstance(guess, (list, tuple)):
            guess = ' '.join(guess)
        print('%s %s' % (each[0], each[1]))
        print(guess)
        if check_accuracy(guess, each[1]):
            counter += 1
    # float() avoids integer division truncating the ratio to 0.
    return float(counter) / len(q_a_file)

print(get_evaluation_result(dev_q_a))
#sent3 = 'Night-vision devices using active near-infrared illumination allow people or animals to be observed without the observer being detected .'.split()
#q3 = 'What technology is used by night-vision devices ?'.split()
#i_list = find_match(q3,sent3)
#result = find_nn(sent3,i_list)
#print result
#answer_pos_list2 = stanford_pos_tagger.tag_sents([sent3.split()])  
#print answer_pos_list2 
#for each in sent3:
    #print sent3.index(each)
    #print stemmer.stem(each.lower())
    #print answer_pos_list2.index(each)