In [None]:
import sys

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Load the pretrained word2vec model and expose its vectors as a frozen,
# row-normalized TensorFlow embedding table.
print("read word2vec pretrained by using gensim:")
filename_word2vec = "../data/GoogleNews-vectors-negative300_unigrams_alphabetic.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(filename_word2vec, binary=True)

print("gensim to numpy embedding matrix:")
vector_dim = 300
# Row i holds the vector of the word whose gensim vocabulary index is i.
# np.zeros defaults to float64, so the whole table is float64.
embedding_matrix = np.zeros((len(model.vocab), vector_dim))
for i, word in enumerate(model.index2word):
    # model[word] raises KeyError for an unknown word; it never returns None,
    # so no None check is needed before the assignment.
    embedding_matrix[i] = model[word]
print(embedding_matrix.shape)

print("numpy embedding matrix to embedding tensors:")
saved_embeddings = tf.constant(embedding_matrix)
# trainable=False keeps the pretrained vectors fixed during optimization.
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)

print("normalize embeddings:")
# Divide every row by its L2 norm so that dot products between rows are cosine similarities.
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
normalized_embeddings = embedding / norm

read word2vec pretrained by using gensim:
gensim to numpy embedding matrix:
(869549, 300)
numpy embedding matrix to embedding tensors:
normalize embeddings:


In [3]:
# List vocabulary entries that occur at most 3 times in the training corpus.
# Such words are used later in the notebook as padding tokens
# ("RAFFAELE" for queries, "tricorne" for document fields).
print("find words with no effect:")
for word, entry in model.vocab.items():
    if entry.count <= 3:
        print("%s %s" % (word, entry.count))

find words with no effect:
VISIONPAD 2
tricorne 3
RAFFAELE 1


In [4]:
# size of network:
n_qw, n_d, n_f, n_fw = 5, 20, 3, 7
doc_words_shape = [n_d, n_f, n_fw]
print "# of query words, # of documents, # of fields, # of field words:", n_qw, n_d, n_f, n_fw

# of query words, # of documents, # of fields, # of field words: 5 20 3 7


In [5]:
# Word IDs of the (padded) query; the actual IDs are supplied through
# feed_dict in the sess.run calls, the zeros are only an initial value.
print("define query embedding tensors:")
query_words_tf = tf.Variable(tf.zeros([n_qw], dtype=tf.int32))
# Look up one normalized embedding row per query word.
query_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, query_words_tf)
print(query_words_embeddings.shape)

define query embedding tensors:
(5, 300)


In [6]:
# Word IDs of one batch of fielded documents; the actual IDs are supplied
# through feed_dict in the sess.run calls.
print("fielded document words to embedding tensors:")
doc_words_tf = tf.Variable(tf.zeros(doc_words_shape, dtype=tf.int32))
doc_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, doc_words_tf)
# With n_d = 20 a batch holds 10 (positive, negative) tuples, i.e. 10 positive
# plus 10 negative documents.
old_shape = doc_words_embeddings.shape
print("old_shape: %s" % (old_shape,))
# Flatten (documents, fields, field words) into a single axis so that one
# matmul can compare every query word with every document word.
# as_list() yields plain ints, so tf.reshape receives an ordinary integer shape.
old_dims = old_shape.as_list()
new_shape = [int(np.prod(old_dims[:3])), old_dims[3]]
doc_words_embeddings = tf.reshape(doc_words_embeddings, new_shape)
print("new_shape: %s" % (doc_words_embeddings.shape,))

fielded document words to embedding tensors:
old_shape: (20, 3, 7, 300)
new_shape: (420, 300)


In [7]:
print("Get Similarity of Words in the Query and all the Documents:")
# Both operands come from the row-normalized embedding table, so each entry of
# the product is the cosine similarity between a query word and a document word.
similarity = tf.matmul(query_words_embeddings, doc_words_embeddings, transpose_b=True)
print(similarity.shape)
# Restore the (document, field, field word) structure on the second axis.
similarity = tf.reshape(similarity, [n_qw, n_d, n_f, n_fw])
print("shape of initial similarity tensor %s" % (similarity.shape,))

Get Similarity of Words in the Query and all the Documents:
(5, 420)
shape of initial similarity tensor (5, 20, 3, 7)


In [8]:
print("Split Similarity Tensor to Docs and Fields:")
# Each split corresponds to a field (axis 2 of the similarity tensor).
# Only the field axis is squeezed: a bare tf.squeeze would also drop the
# query / document / word axes whenever one of them has size 1.
similarity_splits = [tf.squeeze(part, axis=2) for part in tf.split(similarity, n_f, 2)]
print("number of splits (number of fields): %d" % len(similarity_splits))
print("shape of fields for all documents: %s" % ([part.shape for part in similarity_splits],))

Split Similarity Tensor to Docs and Fields:
number of splits (number of documents): 3
shape of fields for all documents: [TensorShape([Dimension(5), Dimension(20), Dimension(7)]), TensorShape([Dimension(5), Dimension(20), Dimension(7)]), TensorShape([Dimension(5), Dimension(20), Dimension(7)])]


In [9]:
def _field_shapes(tensors):
    """Return the static shapes of a list of tensors, one per field."""
    return [t.shape for t in tensors]


print("First Fully Connected Layers:")
print(similarity_splits[0].shape)

# One trainable weight per document slot, for each field.
# NOTE(review): these weights start at zero, so every field input (and hence
# every score) is zero until training has updated them -- confirm that this
# initialisation is intended.
att = [tf.Variable(tf.zeros(n_d, dtype=tf.float64)) for _ in range(n_f)]

# Put the document axis first: (n_qw, n_d, n_fw) -> (n_d, n_qw, n_fw).
similarity_fc1 = [tf.transpose(t, perm=(1, 0, 2)) for t in similarity_splits]

# Scale each document's similarity block by its weight; the weight vector is
# expanded to (n_d, 1, 1) so it broadcasts over query words and field words.
similarity_fc1 = [
    tf.multiply(tf.expand_dims(tf.expand_dims(weight, 1), 1), field_sim)
    for weight, field_sim in zip(att, similarity_fc1)
]
print("shape of fields for all documents: %s" % (_field_shapes(similarity_fc1),))

# Flatten every document's (query word, field word) grid into one feature vector.
similarity_fc1 = [tf.reshape(t, (n_d, n_qw * n_fw)) for t in similarity_fc1]
print("shape of fields for all documents: %s" % (_field_shapes(similarity_fc1),))

# A separate dense layer per field maps the feature vector to one score.
similarity_fc1 = [tf.layers.dense(t, 1) for t in similarity_fc1]
print("shape of fields for all documents: %s" % (_field_shapes(similarity_fc1),))

similarity_fc1 = [tf.tanh(t) for t in similarity_fc1]
print("shape of fields for all documents: %s" % (_field_shapes(similarity_fc1),))

First Fully Connected Layers:
(5, 20, 7)
shape of fields for all documents: [TensorShape([Dimension(20), Dimension(5), Dimension(7)]), TensorShape([Dimension(20), Dimension(5), Dimension(7)]), TensorShape([Dimension(20), Dimension(5), Dimension(7)])]
shape of fields for all documents: [TensorShape([Dimension(20), Dimension(35)]), TensorShape([Dimension(20), Dimension(35)]), TensorShape([Dimension(20), Dimension(35)])]
shape of fields for all documents: [TensorShape([Dimension(20), Dimension(1)]), TensorShape([Dimension(20), Dimension(1)]), TensorShape([Dimension(20), Dimension(1)])]
shape of fields for all documents: [TensorShape([Dimension(20), Dimension(1)]), TensorShape([Dimension(20), Dimension(1)]), TensorShape([Dimension(20), Dimension(1)])]


In [10]:
print("2nd Fully Connected Layers:")
# Stack the per-field scores side by side: one row per document, one column per field.
similarity_fc2 = tf.concat(similarity_fc1, axis=1)
print("shape of tensor at the input of fc2: %s" % (similarity_fc2.shape,))
# Combine the field scores into one document score; the kernel starts at all ones.
similarity_fc2 = tf.layers.dense(similarity_fc2, 1, kernel_initializer=tf.ones_initializer())
similarity_fc2 = tf.tanh(similarity_fc2)
print("shape of tensor at the output of fc2: %s" % (similarity_fc2.shape,))
# Drop only the trailing unit axis so the document axis always survives.
similarity_fc2 = tf.squeeze(similarity_fc2, axis=1)
print("shape of tensor at the output of fc2: %s" % (similarity_fc2.shape,))

First Fully Connected Layers:
shape of tensor at the input of fc2: (20, 3)
shape of tensor at the output of fc2: (20, 1)
shape of tensor at the output of fc2: (20,)


In [17]:
# read and preprocess files:

def _vocab_tokens(text):
    """Split text on whitespace and keep only the tokens known to the word2vec model."""
    return [token for token in text.split() if token in model.vocab]


def _fit_length(tokens, length, pad_token):
    """Cut tokens down to length, or right-pad them with pad_token up to length."""
    if len(tokens) > length:
        return tokens[:length]
    return tokens + [pad_token] * (length - len(tokens))


print("read the query:")
queries_pd = pd.read_csv("../configs/queries/gov.tsv", names=["qid", "qtext"], delimiter='\t')
print("read the qrels")
qrels_pd = pd.read_csv("../configs/qrels/gov.txt", names=["qid", "nl", "docid", "rel"], delimiter=' ')
print(qrels_pd.head(10))
print("read documents:")
txt_len = n_fw
docs_pd = pd.read_csv("../../mfnn-data/govs_mod.csv", names=["docid", "title", "body", "meta"])
# Missing fields become empty strings so that they can be tokenized like any other text.
docs_pd = docs_pd.replace(np.nan, '', regex=True)

# query numbers:
qnumbers = queries_pd.qid.values.tolist()

# initialization of dictionaries with keys as query numbers:
query = dict()            # padded / cut query tokens
query_words_ids = dict()  # vocabulary indices of those tokens
qrels = dict()            # docid -> relevance label
doc_words_ids_d = dict()  # docid -> array of word IDs, one row per field
docs_d = dict()           # docid -> [title tokens, body tokens, meta tokens]
docs_pd_d = dict()        # documents of the query merged with their qrels rows
docs_att = dict()         # per-field length-based weights (see note below)

# Field order used for docs_d / doc_words_ids_d.
doc_fields = ["title", "body", "meta"]

for qn in qnumbers:
    # One-line progress indicator.
    sys.stdout.write("%s   " % qn)

    # extract a specific query, keep in-vocabulary words, then pad / cut to n_qw words
    query_text = queries_pd[queries_pd.qid == qn]["qtext"].values[0]
    query_tokens = [token for token in query_text.split(' ') if token in model.vocab]
    query[qn] = _fit_length(query_tokens, n_qw, "RAFFAELE")
    query_words_ids[qn] = [model.vocab[token].index for token in query[qn]]

    # extracted qrels for a specific query
    qrels_pd_ = qrels_pd[qrels_pd.qid == qn]
    qrels[qn] = dict(qrels_pd_[["docid", "rel"]].values.tolist())

    # remove documents not in qrels; copy() so the column assignments below act
    # on an independent frame instead of a slice of docs_pd
    docs_pd_ = docs_pd[docs_pd['docid'].isin(qrels_pd_.docid.values)].copy()

    # remove words not in word2vec vocabulary
    for field in doc_fields:
        docs_pd_[field] = docs_pd_[field].apply(_vocab_tokens)

    # attention weights, computed on the unpadded token lists
    # NOTE(review): docs_att is ordered [meta, body, title] while docs_d and
    # doc_words_ids_d use [title, body, meta]; docs_att is not read anywhere
    # else in the visible notebook -- confirm the intended order before using it.
    att_fun = lambda tokens: (1.0 + txt_len) / (1 + len(tokens))
    docs_att[qn] = [docs_pd_[field].apply(att_fun).values.tolist()
                    for field in ("meta", "body", "title")]

    # padding / cutting every field to exactly txt_len tokens
    for field in doc_fields:
        docs_pd_[field] = docs_pd_[field].apply(
            lambda tokens: _fit_length(tokens, txt_len, 'tricorne'))

    # document fields dictionary
    docs_pd_ = docs_pd_.reset_index(drop=True)
    docs_ids = list(docs_pd_.docid)
    docs = docs_pd_[doc_fields].values.tolist()
    docs_d[qn] = dict(zip(docs_ids, docs))

    # merge docs and qrels:
    docs_pd_d[qn] = pd.merge(docs_pd_, qrels_pd_, how='left', on=['docid'])

    # find documents word IDs
    doc_words_ids_d[qn] = {
        docid: np.array([[model.vocab[token].index for token in field_tokens]
                         for field_tokens in doc])
        for docid, doc in docs_d[qn].items()
    }

sys.stdout.write("\n")

read the query:
read the qrels
   qid  nl           docid  rel
0  551   0  G14-77-3709129    0
1  551   0  G08-22-1623396    0
2  551   0  G00-62-3810067    0
3  551   0  G03-16-2715908    0
4  551   0  G03-04-3678813    0
5  551   0  G00-82-0922843    0
6  551   0  G14-20-2024537    0
7  551   0  G02-47-3151627    0
8  551   0  G18-28-0513470    0
9  551   0  G22-94-3703003    1
read documents:
551   552   553   554   555   556   557   558   559   560   561   562   563   564   565   566   567   568   569   570   571   572   573   574   575   576   577   578   579   580   581   582   583   584   585   586   587   588   589   590   591   592   593   594   595   596   597   598   599   600  


In [12]:
def _select_queries(per_query, selected):
    """Return the entries of per_query whose key (query number) is in selected."""
    return {qn: value for qn, value in per_query.items() if qn in selected}


print("generate tuplets for the hinge loss:")
seq_ids = dict()
seq_names = dict()
seq_rels = dict()
for qn in qnumbers:
    known_docs = doc_words_ids_d[qn]
    rel_table = docs_pd_d[qn]

    dpos = [docid for docid, rel in qrels[qn].items() if rel == 1]
    dneg = [docid for docid, rel in qrels[qn].items() if rel == 0]
    # Keep a (positive, negative) pair only when both documents were loaded.
    # A real list is built because the pairs are traversed twice.
    pairs = [pair for pair in zip(dpos, dneg)
             if pair[0] in known_docs and pair[1] in known_docs]

    # All positives first, then all negatives, in matching pair order.
    seq_names_ = [pair[0] for pair in pairs] + [pair[1] for pair in pairs]
    seq_ids_ = [known_docs[docid] for docid in seq_names_]
    seq_rels_ = [rel_table[rel_table.docid == docid].rel.values[0] for docid in seq_names_]

    # Extract a window of n_d documents centred on the positive/negative
    # boundary, so the first half stays positive and the second half negative
    # (the hinge loss splits the batch in two). Floor division keeps the slice
    # bounds integral. Queries with fewer than n_d usable documents are skipped.
    n_d_ = len(seq_ids_)
    if n_d_ >= n_d:
        window = slice(n_d_ // 2 - n_d // 2, n_d_ // 2 + n_d // 2)
        seq_ids[qn] = seq_ids_[window]
        seq_names[qn] = seq_names_[window]
        seq_rels[qn] = seq_rels_[window]

# Shuffle with a local, seeded generator so the train/test split is the same
# on every run and the global NumPy random state is left untouched.
split_seed = 0
qnumbers_ = qnumbers[:]
np.random.RandomState(split_seed).shuffle(qnumbers_)
n_test = len(qnumbers_) // 10
train_qnumbers = qnumbers_[n_test:]
test_qnumbers = qnumbers_[:n_test]
print("%s %s" % (train_qnumbers, test_qnumbers))

train_set = set(train_qnumbers)
test_set = set(test_qnumbers)

# training data 9/10
training_seq_ids = _select_queries(seq_ids, train_set)
training_seq_names = _select_queries(seq_names, train_set)
training_seq_rels = _select_queries(seq_rels, train_set)

# testing data 1/10
testing_seq_ids = _select_queries(seq_ids, test_set)
testing_seq_names = _select_queries(seq_names, test_set)
testing_seq_rels = _select_queries(seq_rels, test_set)


generate tuplets for the hinge loss:
[568, 587, 592, 579, 599, 576, 563, 559, 566, 562, 583, 596, 554, 569, 570, 557, 567, 590, 586, 558, 560, 595, 584, 555, 564, 588, 578, 571, 574, 565, 575, 593, 597, 556, 594, 553, 585, 561, 589, 572, 581, 552, 573, 600, 577] [598, 551, 580, 591, 582]


In [13]:
# Pairwise hinge loss: every positive document should outscore its negative
# counterpart by at least `margin`.
margin = 0.5
y_pred = similarity_fc2
# Split the batch scores into two equal halves (positives, then negatives)
# and cast both to float32.
y_pos, y_neg = [tf.cast(part, dtype=tf.float32) for part in tf.split(y_pred, 2)]
loss = tf.maximum(0., margin + y_neg - y_pos)
# Average the per-pair losses into a single scalar objective.
hinge_loss = tf.reduce_mean(loss)

In [14]:
# Plain gradient descent on the hinge loss, with a learning rate of 0.5.
optimizer = tf.train.GradientDescentOptimizer(0.5)
train_step = optimizer.minimize(hinge_loss)

In [15]:
# test function:
def test():
    for qn in testing_seq_ids.keys():
        _, loss, sim = sess.run([train_step, hinge_loss, similarity_fc2], feed_dict={doc_words_tf:testing_seq_ids[qn], 
                                                                             query_words_tf:query_words_ids[qn]})
        ranks = (-np.squeeze(sim)).argsort()
        print "-"*100
        print "query:", ' '.join(query[qn])
        print "-"*10
        print "rank\tscore\tdocument"
        print testing_seq_names[qn]
        for i, r in enumerate(ranks):
            docname = testing_seq_names[qn][r]
            if docs_pd_d[qn][docs_pd_d[qn].docid == docname].rel.values[0] == 1:
                docs_pd_d_ = docs_pd_d[qn][docs_pd_d[qn].docid == docname][["title", "body", "meta"]].values[0]
                print '%d\t%.3f\t%s' % (i, sim[r], ', '.join([' '.join(x) for x in docs_pd_d_]))
        print "-"*10
        print "loss:", loss
        print "-"*100

In [16]:
# initialize tensors
init = tf.global_variables_initializer()

# create a tensorflow session
with tf.Session() as sess:
    sess.run(init)

    # test before training
    test()

    # training
    losses = []
    for epoch in range(5):
        for qn in training_seq_ids.keys():
            feed = {doc_words_tf: training_seq_ids[qn],
                    query_words_tf: query_words_ids[qn]}
            # One gradient step on this query's batch. The fetched loss goes
            # into its own name so the graph tensor `loss` is not overwritten.
            step_results = sess.run([train_step, hinge_loss], feed_dict=feed)
            loss_value = step_results[1]
            losses.append(loss_value)
            # Running mean over every step so far (all epochs, all queries).
            print("%s %s %s" % (epoch, qn, np.mean(losses)))

    # test after training
    test()


----------------------------------------------------------------------------------------------------
query: education Indian reservations RAFFAELE RAFFAELE
----------
rank	score	document
['G04-81-0432546', 'G17-03-3343615', 'G01-89-1605282', 'G19-70-2900269', 'G11-44-0571147', 'G19-71-0508626', 'G38-14-3429707', 'G01-29-0109956', 'G28-18-0755298', 'G02-27-0967960', 'G20-65-3639560', 'G00-44-1599468', 'G10-85-0735685', 'G13-20-3368630', 'G03-95-1936748', 'G43-88-4060816', 'G00-68-2377713', 'G00-84-2692556', 'G05-37-0002575', 'G44-77-3997443']
0	0.000	code talk tricorne tricorne tricorne tricorne tricorne, american indian rehabilitation research training center affiliated, adobe win tricorne tricorne tricorne tricorne tricorne
9	0.000	remarks prepared for delivery by the honorable, department the interior tricorne tricorne tricorne tricorne, microsoft frontpage office the secretary remarks honorable
10	0.000	american educations economics tricorne tricorne tricorne tricorne, guide usda pr

0 552 0.48669353
0 554 0.48729116
0 557 0.48864904
0 558 0.48620066
0 559 0.48770705
0 560 0.48381272
0 561 0.48039463
0 563 0.47760636
0 564 0.47458902
0 565 0.46859735
0 566 0.46701148
0 567 0.46083054
0 568 0.45314118
0 569 0.43582016
0 570 0.42305544
0 572 0.40510926
0 574 0.38876984
0 575 0.37313548
0 576 0.35758448
0 578 0.34482127
0 583 0.32840124
0 584 0.3134739
0 586 0.30065796
0 589 0.28966454
0 593 0.27807796
0 594 0.26738265
0 595 0.25747958
0 596 0.249743
0 597 0.24113117
0 599 0.23309347
0 600 0.22557433
1 552 0.21852511
1 554 0.21190314
1 557 0.2056707
1 558 0.19979438
1 559 0.19424455
1 560 0.18899469
1 561 0.18402115
1 563 0.17930266
1 564 0.1748201
1 565 0.17055619
1 566 0.1667822
1 567 0.16290355
1 568 0.15920119
1 569 0.155974
1 570 0.15258327
1 572 0.14939284
1 574 0.14628051
1 575 0.14329518
1 576 0.14042929
1 578 0.13833922
1 583 0.13567886
1 584 0.13311887
1 586 0.13065371
1 589 0.12849462
1 593 0.12620007
1 594 0.12398603
1 595 0.12184834
1 596 0.1200276
1 597 

----------------------------------------------------------------------------------------------------
query: intellectual property RAFFAELE RAFFAELE RAFFAELE
----------
rank	score	document
['G37-06-3325628', 'G16-46-0232450', 'G01-65-0142653', 'G04-87-2877981', 'G44-68-4097332', 'G02-55-3134570', 'G26-03-2591182', 'G25-88-4165498', 'G15-05-2150481', 'G22-94-3703003', 'G03-73-1026843', 'G14-59-0807875', 'G02-09-3570681', 'G15-83-3965981', 'G31-19-3607797', 'G00-68-2200781', 'G15-66-3795795', 'G26-00-1785848', 'G00-43-1754673', 'G03-88-2255698']
0	0.916	computer crime intellectual property section tricorne tricorne, computer crime intellectual property section department justice, melvin murray computer crime intellectual property section
1	0.802	research library legal information tricorne tricorne tricorne, information by subject legal information intellectual property, legal information legal intellectual property disability patent
2	0.751	prosecuting intellectual property crimes tricorn