In [1]:
import pickle
import re
import random
import numpy as np
from gensim.models import word2vec
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def readFile(fname):
    f = open(fname, "rb")
    result = pickle.load(f)
    f.close()
    
    return result

In [3]:
def trecPreprocessing(fname):
    with open(fname, 'r') as f:
        content = f.readlines()
    content = [x.strip() for x in content] 
    content = [w.replace('\t', ' ') for w in content]

    trec = []

    tmp_trec = []
    tmp_dic = {"p": [], "n": []}

    for index in range(len(content)):
        if ("<QApairs id=" in content[index] and index != 1):
            # If questions have no positive or no negative ans, discard it.
            if ((len(tmp_dic["p"]) != 0) and (len(tmp_dic["n"]) != 0)):
                tmp_trec.append(tmp_dic)
                trec.append(tmp_trec)

            tmp_trec = []
            tmp_dic = {"p": [], "n": []}

        elif (content[index] == "<question>"):
            tmp_trec.append(content[index + 1].lower())
        elif (content[index] == "<positive>"):
            tmp_dic["p"].append(content[index + 1].lower())
        elif (content[index] == "<negative>"):
            tmp_dic["n"].append(content[index + 1].lower())
            
    # Last round
    if ((len(tmp_dic["p"]) != 0) and (len(tmp_dic["n"]) != 0)):
        tmp_trec.append(tmp_dic)
        trec.append(tmp_trec)
    
    return trec

In [4]:
def tokenize(text):
    pattern = re.compile('[a-zA-Z]+')
    return (match.group(0) for match in pattern.finditer(text))

def makeValidationFeatuers(trec_val, vec_model, word_index):
    
    q_docs = []
    a_docs = []
    a_label = []
    
    for e in trec_val:
        tmp_q_docs = []
        all_a_docs = []
        all_a_label = []
        
        q = list(tokenize(e[0]))
        
        p = e[1]['p']
        #p = p[random.randrange(len(p))]
        #p = list(tokenize(p))
        
        n = e[1]['n']
        #n = n[random.randrange(len(n))]
        #n = list(tokenize(n))
        
        for token in q:
            if (token in vec_model.wv.vocab):
                
                tmp_q_docs.append(word_index[token])
                
        for sentence in p:
            tmp_a_docs = []
            all_a_label.append("P")
            current_sentence = list(tokenize(sentence))
            for token in current_sentence:
                if (token in vec_model.wv.vocab):
                    tmp_a_docs.append(word_index[token])
            all_a_docs.append(tmp_a_docs)    
            
        for sentence in n:
            tmp_a_docs = []
            all_a_label.append("N")
            current_sentence = list(tokenize(sentence))
            for token in current_sentence:
                if (token in vec_model.wv.vocab):
                    tmp_a_docs.append(word_index[token])
            all_a_docs.append(tmp_a_docs)
            
        q_docs.append(tmp_q_docs)
        a_docs.append(all_a_docs)
        a_label.append(all_a_label)
        
    return q_docs, a_docs, a_label

In [5]:
def pad(q_docs, a_docs):
    
    a_pad = []
    max_length = 40
    
    q_pad = pad_sequences(q_docs, maxlen=max_length, padding='post')
    for single_doc in a_docs:
        tmp_a_pad = pad_sequences(single_doc, maxlen=max_length, padding='post')
        a_pad.append(tmp_a_pad)
    return q_pad, a_pad

In [6]:
trec_validation = trecPreprocessing("dev-less-than-40.manual-edit.xml")

In [7]:
word_index = readFile("word_index_dic.pkl")
vec_model = word2vec.Word2Vec.load("word2vec.model")

In [8]:
#q_docs, p_docs, n_docs = makeValidationFeatuers(trec_validation, vec_model, word_index)
q_docs, a_docs, a_label = makeValidationFeatuers(trec_validation, vec_model, word_index)
#q_pad, p_pad, n_pad = pad(q_docs, p_docs, n_docs)
q_pad, a_pad = pad(q_docs, a_docs)

In [9]:
model = load_model("weights_best_2.hdf5") 

In [10]:
#model.predict([q_pad, p_pad, n_pad])
for i in range(len(q_pad)):
    all_pos_cos_sim = []
    for single_a in a_pad[i]:
        pos_cos_sim = model.predict([np.reshape(q_pad[0], (1, 40)), np.reshape(single_a, (1, 40)), np.reshape(single_a, (1, 40))])[1][0][0]
        all_pos_cos_sim.append(pos_cos_sim)
    all_pos_cos_sim, a_label[i] = zip(*sorted(zip(all_pos_cos_sim, a_label[i]), reverse=True))

In [36]:
def calMAPAndMRR(a_label):
    MAP = []
    MRR = []
    
    for i in range(len(a_label)):
        positive_count = a_label[i].count("P")
        positive_index = [index for index, value in enumerate(a_label[i]) if value == "P"]
        # MAP
        tmp_map = 0
        for j in range(len(positive_index)):
            tmp_map = tmp_map + ((j + 1) / (positive_index[j] + 1))
        tmp_map = tmp_map / positive_count
        MAP.append(tmp_map)
        
        #MRR 
        # First occurrences of positive
        positive_first_count = a_label[i].index("P") + 1
        tmp_mrr = 1 / positive_first_count
        #print("tmp_mrr", tmp_mrr)
        MRR.append(tmp_mrr)
        
    return MAP, MRR    

In [37]:
MAP, MRR = calMAPAndMRR(a_label)

In [40]:
# MAP of the whole data
sum(MAP) / len(MAP)

0.42267853150720824

In [41]:
# MRR of the whole data
sum(MRR) / len(MRR)

0.44990903449703146