In [34]:
import numpy as np
import math
import pandas as pd
import string
import json
import random
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.similarities import Similarity
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim.models import LdaModel
from stop_words import get_stop_words
from pandas import Series, DataFrame
from sklearn.utils import shuffle
from sklearn import linear_model
from sklearn.externals import joblib
from pandas import DataFrame

In [2]:
data = pd.read_csv('../data/FiQA_train_question_doc_final.tsv', sep='\t')
load_questions = pd.read_csv('../data/FiQA_train_question_final.tsv', sep='\t')
load_docs = pd.read_csv('../data/FiQA_train_doc_final.tsv', sep='\t')
load_docs = load_docs.dropna()

questions = load_questions['question']
docs = load_docs['doc']

In [3]:
for item in data['docid']:
    if item not in load_docs['docid'].values:
        data = data.drop(axis=0, index=data[data.docid == item].index)
                
# data = data[data.docid == data['docid'].isin(load_docs['docid'])]
# print(len(data))

In [4]:
def splitData(input_data):
    #create positive and negative data
    negative_qid = data['qid'].sample(10000)
    negative_docid = []
    for item in negative_qid:
        negative_docid.append(data[data.qid != item]['docid'].sample(1).values[0]) 
    
    qid = np.hstack((data['qid'], negative_qid))
    docid  = np.hstack((data['docid'], negative_docid))
    
    postive_y = np.ones(len(data['qid']))
    negative_y = np.zeros(len(negative_qid))
    y = np.hstack((postive_y, negative_y))
    new_data = {'qid': qid, 'docid': docid, 'y': y}
    new_data = DataFrame(data = new_data)
    new_data = shuffle(new_data)               
    
    train_data = new_data[0:15000]
    test_data = new_data[15000:] 
    
    return [train_data, test_data]


In [5]:
def splitWord(documents):
    #split the sentence into word and remove the stop word
    texts = []
    stoplist=set('for a of the and to in at after with do i was am an Do its so need on if be were are is who we fca'.split())  
    for document in documents:
        document = document.translate(str.maketrans('','',string.punctuation))
        tmp = []
        for word in document.lower().split():
            if word not in stoplist:
                tmp.append(word)
        texts.append(tmp)
    return texts

In [6]:
def create_corpus(load_questions, load_docs):
    questions = load_questions['question']
    docs = load_docs['doc']
    
    texts = np.hstack((questions, docs))
    texts = splitWord(texts)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_len = len(dictionary)
    tfidf = models.TfidfModel(corpus)
    print(corpus[0])
    
    question_vec = [tfidf[text] for text in corpus[0: len(questions)]]
    load_questions['tfidf_vector'] = question_vec
    doc_vec = [tfidf[text] for text in corpus[len(questions):]]
    load_docs['tfidf_vector'] = doc_vec
    return [load_questions, load_docs]

In [7]:
def data_process(load_questions, load_docs, data):
    cos_sim_list = []
    euclidean_dist_list = []
    kl_list = []
    word_count_list = []
    for qid, docid in zip(data['qid'], data['docid']):
        q_vec = dict(list(load_questions[load_questions.qid == qid]['tfidf_vector'])[0])
#         print(docid)
#         print(load_docs[load_docs.docid == docid])
        doc_vec = dict(list(load_docs[load_docs.docid == docid]['tfidf_vector'])[0])
        cos_sim = 0
        euclidean_dist = 0
        kl = 0
        word_count = 0
        for word in q_vec:
            if word in doc_vec:
                vec1 = q_vec[word]
                vec2 = doc_vec[word]
                cos_sim = vec1*vec2
                euclidean_dist += pow((vec1 - vec2),2)
                kl += vec1 * np.log(vec1 / vec2)
                word_count += 1
            else: 
                euclidean_dist += pow(q_vec[word], 2)
                    
        euclidean_dist = math.sqrt(euclidean_dist)
        cos_sim /= (np.linalg.norm(list(q_vec.values())) * np.linalg.norm(list(doc_vec.values())))
        cos_sim_list.append(cos_sim)
        euclidean_dist_list.append(euclidean_dist)
        kl_list.append(kl)
        word_count_list.append(word_count)
    data['cos_sim_list'] = cos_sim_list
    data['euclidean_dist_list'] = euclidean_dist_list
    data['kl_list'] = kl_list
    data['word_count_list'] = word_count_list
    return data

In [8]:
train_data, test_data = splitData(data)
load_questions, load_docs = create_corpus(load_questions, load_docs)

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1)]


In [9]:
train_data_process = data_process(load_questions, load_docs, train_data)
test_data_process = data_process(load_questions, load_docs, test_data)

In [10]:
train_data_process = train_data_process.dropna()
test_data_process = test_data_process.dropna()
train_x = train_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values
train_y = train_data_process['y'].values
test_x = test_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values
test_y = test_data_process['y'].values

In [12]:
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(train_x, train_y)
# joblib.dump(logreg, "../model/logic_regression_model.m")

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [13]:
predict_y = logreg.predict(test_x)
predict_proba_y = logreg.predict_proba(test_x)
acc = logreg.score(test_x,test_y)
print(acc)

0.8103876739562624


In [14]:
print(np.argmax(predict_proba_y, 1))
print(test_y)

[1 0 1 ... 0 0 0]
[1. 0. 1. ... 0. 1. 0.]


In [None]:
test_qid = data['qid'].sample(1).values[0]
all_docid = load_docs['docid'].values
all_qid = [test_qid]*len(all_docid)
all_data = DataFrame(data = {'qid': all_qid, 'docid': all_docid})
all_data_process = data_process(load_questions, load_docs, all_data)

In [None]:
all_data_process = all_data_process.dropna()
all_test_x = all_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values

In [None]:
all_test_y = logreg.predict_proba(all_test_x)[:, 1]
all_data_process['y'] = all_test_y
new_all_data_process = all_data_process.sort_values('y', ascending=False)
expect_docid = data[data.qid == all_data['qid'][0]]['docid']
new_all_data_process =  new_all_data_process.reset_index(drop=True)

for docid in expect_docid:
    print(docid)
    print(new_all_data_process[new_all_data_process.docid == docid].index)

# result = np.where(all_test_x==np.max(all_test_x))
# new_dataframe = DataFrame(data = {'y':all_test_y, 'docid': all_docid})

In [50]:
print(data['qid'].values[0])
print(data['docid'].values[0])

0
18850


In [64]:
def create_test_sample(data, test_len, qs_len):
    result = []
    for i in range(0, test_len):
        tmp_qs = np.array([data['qid'].values[i]]*qs_len)
        tmp_answers = np.hstack((data['docid'].values[i],  random.sample(list(data['docid'].values), qs_len - 1)))
        tmp_data = DataFrame(data = {'qid': tmp_qs, 'docid': tmp_answers})
        data_process = data_process(load_questions, load_docs, tmp_data)
        data_process = data_process.dropna()
        x = all_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values
        result.append(_x)
    return result

def test_accuracy(test_sample, prediction_model, recall_len):
    accuracy = 0;
    for item in test_sample:
        result = prediction_model.predict_proba(item)[:, 1]
        if np.argmax(result) < recall_len:
            accuracy += 1
    accuracy /= len(test_sample) 
    print(accuracy)
    return accuracy

In [63]:
recall = 1
print('recall:', recall)
print('answer_length: 10')
test_sample_01 = create_test_sample(test_data, 100, 10)
accuracy = test_accuracy(test_sample_01, logreg, recall)
print('answer_length: 20')
test_sample_02 = create_test_sample(test_data, 100, 20)
accuracy = test_accuracy(test_sample_02, logreg, recall)
print('answer_length: 30')
test_sample_03 = create_test_sample(test_data, 100, 30)
accuracy = test_accuracy(test_sample_03, logreg, recall)
print('answer_length: 40')
test_sample_04 = create_test_sample(test_data, 100, 40)
accuracy = test_accuracy(test_sample_04, logreg, recall)
print('answer_length: 50')
test_sample_05 = create_test_sample(test_data, 100, 50)
accuracy = test_accuracy(test_sample_05, logreg, recall)
print('answer_length: 100')
test_sample_06 = create_test_sample(test_data, 100, 100)
accuracy = test_accuracy(test_sample_06, logreg, recall)

recall: 1
answer_length: 10
0.49
answer_length: 20
0.42
answer_length: 30
0.39
answer_length: 40
0.4
answer_length: 50
0.35
answer_length: 100
0.3


In [65]:
recall = 2
print('recall:', recall)
print('answer_length: 10')
accuracy = test_accuracy(test_sample_01, logreg, recall)
print('answer_length: 20')
accuracy = test_accuracy(test_sample_02, logreg, recall)
print('answer_length: 30')
accuracy = test_accuracy(test_sample_03, logreg, recall)
print('answer_length: 40')
accuracy = test_accuracy(test_sample_04, logreg, recall)
print('answer_length: 50')
accuracy = test_accuracy(test_sample_05, logreg, recall)
print('answer_length: 100')
accuracy = test_accuracy(test_sample_06, logreg, recall)

recall: 2
answer_length: 10
0.55
answer_length: 20
0.44
answer_length: 30
0.41
answer_length: 40
0.41
answer_length: 50
0.35
answer_length: 100
0.31


In [68]:
recall = 3
print('recall:', recall)
print('answer_length: 10')
accuracy = test_accuracy(test_sample_01, logreg, recall)
print('answer_length: 20')
accuracy = test_accuracy(test_sample_02, logreg, recall)
print('answer_length: 30')
accuracy = test_accuracy(test_sample_03, logreg, recall)
print('answer_length: 40')
accuracy = test_accuracy(test_sample_04, logreg, recall)
print('answer_length: 50')
accuracy = test_accuracy(test_sample_05, logreg, recall)
print('answer_length: 100')
accuracy = test_accuracy(test_sample_06, logreg, recall)

recall: 3
answer_length: 10
0.58
answer_length: 20
0.47
answer_length: 30
0.42
answer_length: 40
0.43
answer_length: 50
0.35
answer_length: 100
0.31


In [69]:
recall = 4
print('recall:', recall)
print('answer_length: 10')
accuracy = test_accuracy(test_sample_01, logreg, recall)
print('answer_length: 20')
accuracy = test_accuracy(test_sample_02, logreg, recall)
print('answer_length: 30')
accuracy = test_accuracy(test_sample_03, logreg, recall)
print('answer_length: 40')
accuracy = test_accuracy(test_sample_04, logreg, recall)
print('answer_length: 50')
accuracy = test_accuracy(test_sample_05, logreg, recall)
print('answer_length: 100')
accuracy = test_accuracy(test_sample_06, logreg, recall)

recall: 4
answer_length: 10
0.65
answer_length: 20
0.51
answer_length: 30
0.42
answer_length: 40
0.45
answer_length: 50
0.36
answer_length: 100
0.32
