In [1]:
import numpy as np
import math
import pandas as pd
import string
import json
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.similarities import Similarity
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim.models import LdaModel
from stop_words import get_stop_words
from pandas import Series, DataFrame
from sklearn.utils import shuffle
from sklearn import linear_model
from sklearn.externals import joblib

In [2]:
data = pd.read_csv('../data/FiQA_train_question_doc_final.tsv', sep='\t')
load_questions = pd.read_csv('../data/FiQA_train_question_final.tsv', sep='\t')
load_docs = pd.read_csv('../data/FiQA_train_doc_final.tsv', sep='\t')
load_docs = load_docs.dropna()

questions = load_questions['question']
docs = load_docs['doc']

In [3]:
for item in data['docid']:
    if item not in load_docs['docid'].values:
        data = data.drop(axis=0, index=data[data.docid == item].index)
                
# data = data[data.docid == data['docid'].isin(load_docs['docid'])]
# print(len(data))

In [4]:
def splitData(input_data):
    #create positive and negative data
    negative_qid = data['qid'].sample(10000)
    negative_docid = []
    for item in negative_qid:
        negative_docid.append(data[data.qid != item]['docid'].sample(1).values[0]) 
    
    qid = np.hstack((data['qid'], negative_qid))
    docid  = np.hstack((data['docid'], negative_docid))
    
    postive_y = np.ones(len(data['qid']))
    negative_y = np.zeros(len(negative_qid))
    y = np.hstack((postive_y, negative_y))
    new_data = {'qid': qid, 'docid': docid, 'y': y}
    new_data = DataFrame(data = new_data)
    new_data = shuffle(new_data)               
    
    train_data = new_data[0:15000]
    test_data = new_data[15000:] 
    
    return [train_data, test_data]


In [5]:
def splitWord(documents):
    #split the sentence into word and remove the stop word
    texts = []
    stoplist=set('for a of the and to in at after with do i was am an Do its so need on if be were are is who we fca'.split())  
    for document in documents:
        document = document.translate(str.maketrans('','',string.punctuation))
        tmp = []
        for word in document.lower().split():
            if word not in stoplist:
                tmp.append(word)
        texts.append(tmp)
    return texts

In [6]:
def create_corpus(load_questions, load_docs):
    questions = load_questions['question']
    docs = load_docs['doc']
    
    texts = np.hstack((questions, docs))
    texts = splitWord(texts)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_len = len(dictionary)
    tfidf = models.TfidfModel(corpus)
    print(corpus[0])
    
    question_vec = [tfidf[text] for text in corpus[0: len(questions)]]
    load_questions['tfidf_vector'] = question_vec
    doc_vec = [tfidf[text] for text in corpus[len(questions):]]
    load_docs['tfidf_vector'] = doc_vec
    return [load_questions, load_docs]

In [8]:
def data_process(load_questions, load_docs, data):
    cos_sim_list = []
    euclidean_dist_list = []
    kl_list = []
    word_count_list = []
    for qid, docid in zip(data['qid'], data['docid']):
        q_vec = dict(list(load_questions[load_questions.qid == qid]['tfidf_vector'])[0])
#         print(docid)
#         print(load_docs[load_docs.docid == docid])
        doc_vec = dict(list(load_docs[load_docs.docid == docid]['tfidf_vector'])[0])
        cos_sim = 0
        euclidean_dist = 0
        kl = 0
        word_count = 0
        for word in q_vec:
            if word in doc_vec:
                vec1 = q_vec[word]
                vec2 = doc_vec[word]
                cos_sim = vec1*vec2
                euclidean_dist += pow((vec1 - vec2),2)
                kl += vec1 * np.log(vec1 / vec2)
                word_count += 1
            else: 
                euclidean_dist += pow(q_vec[word], 2)
                    
        euclidean_dist = math.sqrt(euclidean_dist)
        cos_sim /= (np.linalg.norm(list(q_vec.values())) * np.linalg.norm(list(doc_vec.values())))
        cos_sim_list.append(cos_sim)
        euclidean_dist_list.append(euclidean_dist)
        kl_list.append(kl)
        word_count_list.append(word_count)
    data['cos_sim_list'] = cos_sim_list
    data['euclidean_dist_list'] = euclidean_dist_list
    data['kl_list'] = kl_list
    data['word_count_list'] = word_count_list
    return data

In [9]:
train_data, test_data = splitData(data)
load_questions, load_docs = create_corpus(load_questions, load_docs)

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1)]


In [10]:
train_data_process = data_process(load_questions, load_docs, train_data)
test_data_process = data_process(load_questions, load_docs, test_data)

In [17]:
train_data_process = train_data_process.dropna()
test_data_process = test_data_process.dropna()
train_x = train_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values
train_y = train_data_process['y'].values
test_x = test_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values
test_y = test_data_process['y'].values

In [18]:
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(train_x, train_y)
joblib.dump(logreg, "../model/logic_regression_model.m")

['../model/logic_regression_model.m']

In [19]:
predict_y = logreg.predict(test_x)
predict_proba_y = logreg.predict_proba(test_x)
acc = logreg.score(test_x,test_y)
print(acc)

0.8103048376408217


In [16]:
print(np.argmax(predict_proba_y, 1))
print(test_y)

[1 1 1 ... 0 1 0]
[1. 1. 1. ... 0. 1. 1.]


In [27]:
test_qid = data['qid'].sample(1).values[0]
all_docid = load_docs['docid'].values
all_qid = [test_qid]*len(all_docid)
all_data = DataFrame(data = {'qid': all_qid, 'docid': all_docid})
all_data_process = data_process(load_questions, load_docs, all_data)

In [14]:
all_data_process = all_data_process.dropna()
all_test_x = all_data_process.loc[:, ['cos_sim_list', 'word_count_list']].values

In [30]:
all_test_y = logreg.predict_proba(all_test_x)[:, 1]
all_data_process['y'] = all_test_y
new_all_data_process = all_data_process.sort_values('y', ascending=False)
expect_docid = data[data.qid == all_data['qid'][0]]['docid']
new_all_data_process =  new_all_data_process.reset_index(drop=True)

for docid in expect_docid:
    print(docid)
    print(new_all_data_process[new_all_data_process.docid == docid].index)

# result = np.where(all_test_x==np.max(all_test_x))
# new_dataframe = DataFrame(data = {'y':all_test_y, 'docid': all_docid})

457667
Int64Index([16116], dtype='int64')
402174
Int64Index([20575], dtype='int64')
7774
Int64Index([29692], dtype='int64')
284526
Int64Index([5558], dtype='int64')
400859
Int64Index([20607], dtype='int64')
268035
Int64Index([52761], dtype='int64')
591130
Int64Index([28160], dtype='int64')
473644
Int64Index([18497], dtype='int64')
76782
Int64Index([48540], dtype='int64')
71986
Int64Index([46885], dtype='int64')
447597
Int64Index([9832], dtype='int64')
492735
Int64Index([30928], dtype='int64')
144894
Int64Index([41888], dtype='int64')
261378
Int64Index([54345], dtype='int64')
309853
Int64Index([55743], dtype='int64')
259463
Int64Index([54813], dtype='int64')
590375
Int64Index([28120], dtype='int64')
