In [4]:
import json
import re
import nltk
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine as cos_distance
from sklearn.decomposition import TruncatedSVD
with open('QA_train.json', 'r') as f:
    data = json.load(f)
#for i in range(0,len(data)):
sample = []
punctuation = [',','.','(',')',':','``','\'\'',';','&']
stopwords = nltk.corpus.stopwords.words('english')

def seperate_sentence(sentence):
    #print '----------------------------'
    #print sentence
    combined_word = ''
    result = re.split('[,() \'"]+',sentence)
    segment = []
    for index,word in enumerate(result):
        if word != '':
            if index == len(result)-1:
                segment.append(word[:-1])
            else:
                segment.append(word)
    #print segment
    return segment
    
        

for i in range(0,1):
    qa_list = data[i].get('qa')
    sentences_list = data[i].get('sentences')
    for sentence in sentences_list:
        seperate_sentence(sentence)
    '''for j in range(0,len(qa_list)):
        qa = qa_list[j]
        sentence_id = qa.get('answer_sentence')
        #print "#################################"
        #print "Q:",qa.get('question')
        #print "A:",qa.get('answer')
        #print "S:",sentences_list[sentence_id]
        sentence = sentences_list[sentence_id]
        print seperate_sentence(sentence)'''
        
        #sample.append((qa.get('question'),qa.get('answer'),sentences_list[sentence_id]))

In [33]:
import os
import csv
import json
import nltk
import copy
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine as cos_distance
from sklearn.decomposition import TruncatedSVD
from nltk.tag import StanfordNERTagger

def create_process_tools():
    stopwords = nltk.corpus.stopwords.words('english')
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    v = DictVectorizer(sparse=False)
    transformer = TfidfTransformer(smooth_idf=False,norm=None)
    svd = TruncatedSVD(n_components=260)
    return stopwords,lemmatizer,v,transformer,svd

def input_data():

    base_path = os.path.join('')
    train_file = base_path + 'QA_train.json'
    train_data = json.load(open(train_file))
    test_file = base_path + 'QA_test.json'
    test_data = json.load(open(test_file))
    dev_file = base_path + 'QA_dev.json'
    dev_data = json.load(open(dev_file))

    return train_data,test_data,dev_data

def output_result(filename):

    predictions_file = open(filename, "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["id","answer"])
    # open_file_object.writerows(zip(ID, output))
    predictions_file.close()

def get_BOW(text):
    BOW = {}
    for word in text:
        BOW[word] = BOW.get(word,0) + 1
    return BOW

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def transform_doc(doc):
    return svd.fit_transform(transformer.fit_transform(v.fit_transform(doc)))

def transform_query(query_text):
    query_text = nltk.word_tokenize(query_text)
    query_text = [lemmatize(word.lower()) for word in query_text]
    for word in query_text:
        if word in stopwords or word in punctuation:
            query_text.remove(word)
    return svd.transform(transformer.transform(v.transform([get_BOW(query_text)])))[0]

def process_sentence(sentence):
    #token, lower, remove stopwords, get bag of words
    sentence = nltk.word_tokenize(sentence)
    sentence = [lemmatize(word.lower()) for word in sentence]
    for word in sentence:
        if word in stopwords or word in punctuation:
            sentence.remove(word)
    sentence = get_BOW(sentence)
    return sentence

def divide_data(data):
    #achieve sentences, questions, answers and the corresponding indexs.
    queries = []
    indexs = []
    answers = []
    qas = data['qa']
    sentences = data['sentences']
    for qa in qas:
        query = qa['question']
        queries.append(query)
        index = qa['answer_sentence']
        indexs.append(index)
        answer = qa['answer']
        answers.append(answer)
    for i in range(len(sentences)):
        sentences[i] = process_sentence(sentences[i])
    sentences = transform_doc(sentences)
    return sentences,indexs,queries,answers

def get_best_doc_num(query,sentences):
    #get the most likely documents according to the query
    min = 1
    index = 0
    for i in range(sentences.shape[0]):
        dist = cos_distance(query,sentences[i])
        if dist < min:
            min = dist
            index = i
    return index

def evaluate_article(predicts,indexs):
    #output the accuracy of predicted documents
    count = 0
    for i in range(len(predicts)):
        if predicts[i] != indexs[i]:
            count += 1
    result = 1 - float(count)/len(predicts)
    return result

def show_wrong_result(original_article,predicts,indexs):
    sentences_list = original_article.get('sentences')
    for i in range(len(predicts)):
        if predicts[i] != indexs[i]:
            print "############################"
            print "Question: ",original_article.get('qa')[i].get('question')
            print "Guess: ",sentences_list[predicts[i]]
            print "Right: ",sentences_list[indexs[i]]

def sentence_retrieval(article):
    #achieve the original sentences which are retrieved
    original_article = copy.deepcopy(article)
    sentences,indexs,queries,answers = divide_data(article)
    original_query = copy.deepcopy(queries)

    predicts = []
    for i in range(len(queries)):
        queries[i] = transform_query(queries[i])
        predict = get_best_doc_num(queries[i],sentences)
        predicts.append(predict)
    result = evaluate_article(predicts,indexs)
    #print '(TF-IDF) retrieval accuracy is: ',
    #show_wrong_result(original_article,predicts,indexs)

    sentences_retrieval = []
    for num in predicts:
        sent = original_article['sentences'][num]
        sentences_retrieval.append(sent)
    #return sentences_retrieval,original_query
    return result

def input_NER():
    stanford_dir = os.path.join('stanford-ner-2016-10-31')
    jarfile = os.path.join(stanford_dir,'stanford-ner.jar')
    modelfile = os.path.join(stanford_dir,'classifiers\english.muc.7class.distsim.crf.ser.gz')
    return modelfile,jarfile

'''input data'''
train,test,dev = input_data()
'''input data'''

punctuation = [',','.','(',')',':','``','\'\'',';','&']

'''get processing tools'''
stopwords,lemmatizer,v,transformer,svd = create_process_tools()
'''get processing tools'''

'''retrieve sentences'''
model,jar = input_NER()
st = StanfordNERTagger(model,jar)

In [34]:
result = 0.0
for articals in train:
    result += sentence_retrieval(articals)
print "Average accurancy is ", result/len(train)

ValueError: n_components must be < n_features; got 260 >= 229

In [22]:
import math
def BM25(articals,k1,k2,b): 
    sentences,queries,sentence_id = parse_data(articals)      
    IDF_dict,avgdl,sentences_token = create_IDF_dict(sentences)
    total_queries = len(queries)
    count = 0
    
    for k11 in k1:
        for k22 in k2:
            for bb in b:
                for index in range(total_queries):
                    query = queries[index]
                    answer_id = sentence_id[index]
                    guess_id = find_max_score_sentence(query,sentences_token,IDF_dict,k1,k2,b,avgdl)
                    if answer_id == guess_id:
                        count += 1   
                accurancy = float(count)/total_queries
                
    #print "Accurancy of BM25 with k1:",k1," k2:",k2," b:",b
    #print "  is ", accurancy
    return accurancy

def find_max_score_sentence(query,sentences_token,IDF_dict,k1,k2,b,avgdl):
    query_text = nltk.word_tokenize(query)
    query_text = [lemmatize(word.lower()) for word in query_text]
    for word in query_text:
        if word in stopwords or word in punctuation:
            query_text.remove(word)
    query_dict = get_BOW(query_text)
    max_score = 0
    guess_sentence = 0
    for index in range(len(sentences_token)):
        
        score = 0
        sentence = sentences_token[index]         
        sentence_dict = get_BOW(sentence)
        for word in query_dict:
            N = len(sentences_token)
            n_qi = IDF_dict.get(word,0)
            fi = sentence_dict.get(word,0)
            qfi = query_dict.get(word,0)
            dl = len(sentence)
            K = k1*(1-b+b*(float(dl)/avgdl))
            
            W = math.log((N-n_qi+0.5)/(n_qi+0.5))
            R = (fi*(k1+1))/(fi+K)*qfi*(k2+1)/(qfi+k2)
            score += W*R
        if score > max_score:
            max_score = score
            guess_sentence = index
        #print index,score
    return guess_sentence
      
def parse_data(articals):
    sentences = articals.get("sentences")
    qa = articals.get("qa")
    queries = []
    sentence_id = []
    for index in range(len(qa)):
        queries.append(qa[index].get("question"))
        sentence_id.append(qa[index].get("answer_sentence"))
    return sentences,queries,sentence_id

def create_IDF_dict(sentences):
    IDF_dict = {}
    total_length = 0
    
    sentences_token = []
    for sentence in sentences:
        sentence = nltk.word_tokenize(sentence)
        sentence = [lemmatize(word.lower()) for word in sentence]
        for word in sentence:
            if word in stopwords or word in punctuation:
                sentence.remove(word)
        total_length += len(sentence)
        for word in set(sentence):
            IDF_dict[word] = IDF_dict.get(word,0) + 1
        sentences_token.append(sentence)
    return IDF_dict, float(total_length)/len(sentences),sentences_token

In [38]:
with open('QA_train.json', 'r') as f:
    data = json.load(f)
print len(data)
test_length = len(data)
accurancy = 0.0

k1 = 1.2
k2 = 100
for b in [0.1,0.125,0.15,0.175]:
    accurancy = 0.0
    for i in range(0,test_length):
        accurancy += BM25(data[i],k1,k2,b)
    average_accurancy = accurancy/test_length
    
    
    
#(k1: 1.2  k2: 100  b: 0.1 )   average_accurancy:  0.66457691951
#(k1: 1.2  k2: 100  b: 0.125 ) average_accurancy:  0.664785305326
#(k1: 1.2  k2: 100  b: 0.2 )   average_accurancy:  0.664918 1.2,100,0.2
#(k1: 1.2  k2: 100  b: 0.3 )   average_accurancy:  0.664284 1.2,100,0.3
#(k1: 1.2  k2: 100  b: 0.4 )   average_accurancy:  0.658218 1.2,100,0.4


360
(k1: 1.2  k2: 100  b: 0.1 ) average_accurancy:  0.66457691951
(k1: 1.2  k2: 100  b: 0.125 ) average_accurancy:  0.664785305326


KeyboardInterrupt: 

In [34]:
for i in range(0,1):
    qa_list = data[i].get('qa')
    sentences_list = data[i].get('sentences')
    for sentence in sentences_list[11:20]:
        print st.tag(seperate_sentence(sentence))

[(u'Phonautograms', u'O'), (u'of', u'O'), (u'singing', u'O'), (u'and', u'O'), (u'speech', u'O'), (u'made', u'O'), (u'by', u'O'), (u'Scott', u'PERSON'), (u'in', u'O'), (u'1860', u'DATE'), (u'were', u'O'), (u'played', u'O'), (u'back', u'O'), (u'as', u'O'), (u'sound', u'O'), (u'for', u'O'), (u'the', u'O'), (u'first', u'O'), (u'time', u'O'), (u'in', u'O'), (u'2008', u'DATE')]
[(u'Along', u'O'), (u'with', u'O'), (u'a', u'O'), (u'tuning', u'O'), (u'fork', u'O'), (u'tone', u'O'), (u'and', u'O'), (u'unintelligible', u'O'), (u'snippets', u'O'), (u'recorded', u'O'), (u'as', u'O'), (u'early', u'O'), (u'as', u'O'), (u'1857', u'DATE'), (u'these', u'O'), (u'are', u'O'), (u'the', u'O'), (u'earliest', u'O'), (u'known', u'O'), (u'recordings', u'O'), (u'of', u'O'), (u'sound', u'O')]
[(u'In', u'O'), (u'1877', u'DATE'), (u'Thomas', u'ORGANIZATION'), (u'Edison', u'ORGANIZATION'), (u'invented', u'O'), (u'the', u'O'), (u'phonograph', u'O')]
[(u'Unlike', u'O'), (u'the', u'O'), (u'phonautograph', u'O'), (u'it'