In [1]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

## Data Loadings

### word 2 vec repre

In [2]:
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))

In [3]:
w2i_map = {}
for i, key in enumerate(w2v_map.keys()):
    w2i_map[key] = i

In [4]:
pkl.dump(w2i_map, open('data/word_idx_map.pkl','wb'))

### map Q idx to context

In [5]:
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
counter = 0
for _, val in w2v_map.items():
    w2v_matrix[counter] = val
    counter += 1

In [6]:
pkl.dump(w2i_map, open('data/w2v_matrix.pkl','wb'))

In [7]:
def w2v(w):
    return w2v_matrix[w2i_map[w]]

In [8]:
def sen2w(sen):
    processed = []
    sen = re.sub(r'[!#\'(),/:?\{}]', ' ', sen).strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        # ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            if len(set(separated)) == 1:
                continue
            if separated.count('*') > 3 or separated.count('=') > 3:
                continue
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [9]:
# fixed context len = 125
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        context = line.strip().split('\t')
        qid = context.pop(0)
        if len(context) == 1:
            context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
        else:
            len_title = len(context[0])
            if len_title >= 125:
                context_repre[int(qid)] = {'t':sen2w(context[0])[:125], 'b': None}
            else:
                context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])[:125 - len_title]}

### len of context ??

In [10]:
all_lens = []
for k, v in context_repre.items():
    t, b = v['t'], v['b']
    if not v['b']:
        b = []
    all_lens += [len(t)+len(b)]

In [11]:
all(np.array(all_lens) < 125)

True

In [12]:
train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])

In [13]:
train_df.head()

Unnamed: 0,Q,Q+,Q-
0,262144,211039,227387 413633 113297 356390 256881 145638 2962...
1,491522,65911,155119 402211 310669 383107 131731 299465 1633...
2,240299,168608 390642,368007 70009 48077 376760 438005 228888 142340...
3,196614,205184,334471 163710 376791 441664 159963 406360 4300...
4,360457,321532,151863 501857 217578 470017 125838 31836 42066...


In [14]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

In [15]:
train_idx_set = build_set_pair_with_idx(train_df)

In [46]:
train_idx_set[240299]

{'neg': array([368007,  70009,  48077, 376760, 438005, 228888, 142340, 220049,
        195789,  25591, 503498,  35125, 282665, 350677,  67132, 492121,
        521770, 482854, 314882, 460162, 474768,  47095, 441111, 430424,
         55776, 371296, 471245, 523727, 472488,  79961,  76016, 245183,
        212299, 417533, 328855, 405600, 342727, 437437, 392462, 351849,
        161396, 497477, 183319, 393544, 293781, 205739, 450857, 362082,
        297814, 228122, 145335, 223978, 235373, 429337, 421932, 385761,
        214356, 158411, 498088, 416208, 518985, 163666, 282313, 306557,
        202189, 207846, 128929,  64066, 110792, 516967, 288842, 101977,
        128678, 402999, 199440, 281229, 447477, 210418,  47234, 224765,
        168359, 286331,  34844, 369064, 420539, 349599, 472814, 335544,
        450513, 312684, 505425,  81283, 174562, 456927, 328250, 165416,
         46207, 269400, 468467, 215983]), 'pos': array([168608, 390642])}

## Process Batch

In [16]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
    v = np.zeros( (len(title) + len(body), 200) )
    counter = 0
    
    for t in title:
        v[counter] = w2v(t)
        counter += 1

    for b in body:
        v[counter] = w2v(b)
        counter += 1
    
    return v

In [17]:
def process_batch(qids, idx_set):
    
    total_pos_len = 0
    for qid in qids:
        total_pos_len += len(idx_set[qid]['pos'])
    
    # per batch element x: vstack, [query_Q x 1; pos_Q x 1; neg_Q x 20]
    # per batch element y: vstack, [query_Q=-1; pos_Q=1; neg_Q=0]
    batch_x = np.zeros(( total_pos_len * 22, 124, 200 ))
    seq_len = np.zeros(total_pos_len * 22)
    
    counter = 0
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']

        # usually one sample
        for qid_pos in q_pos:
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            # query Q
            if not q_body:
                q_seq_len = len(q_title)
            else:
                q_seq_len = len(q_title) + len(q_body)
            seq_len[counter] = q_seq_len
            batch_x[counter, :q_seq_len] = contxt2vec(q_title, q_body)
            counter += 1
            # pos Q
            if not body:
                pos_q_seq_len = len(title)
            else:
                pos_q_seq_len = len(title) + len(body)
            seq_len[counter] = pos_q_seq_len
            batch_x[counter, :pos_q_seq_len] = contxt2vec(title, body)
            counter += 1
        
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            # neg Q
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                if not body:
                    neg_q_seq_len = len(title)
                else:
                    neg_q_seq_len = len(title) + len(body)
                seq_len[counter] = neg_q_seq_len
                batch_x[counter, : neg_q_seq_len] = contxt2vec(title, body)
                counter += 1
    
    return batch_x, seq_len

In [None]:
dev = read_annotations('data/dev.txt')
dev_data = {}
for item in dev:
    qid = int(item[0])
    dev_data[qid] = {}
    dev_data[qid]['q'] = list(map(int, item[1]))
    dev_data[qid]['label'] = item[2]

# create eval batch 
def process_eval_batch(qid, data):
    qid_dict = data[qid]
    qs = qid_dict['q']
    batch_x = np.zeros(( len(qs)+1, 124, 200 ))
    seq_len = np.zeros(len(qs)+1)
    counter = 0
    for qid_ in [qid] + qs:
        title, body = context_repre[qid_]['t'], context_repre[qid_]['b']
        if not body:
            q_seq_len = len(title)
        else:
            q_seq_len = len(title) + len(body)
        seq_len[counter] = q_seq_len
        batch_x[counter, : q_seq_len] = contxt2vec(title, body)
        counter += 1
    return batch_x, seq_len

def evaluate(embeddings): # (n x 240)
    qs = embeddings[0]
    qs_ = embeddings[1:]
    cos_scores = cos_sim(qs.expand(len(embeddings)-1, 240), qs_)
    return cos_scores

## Evaluation

In [18]:
def read_annotations(path, K_neg=20, prune_pos_cnt=10):
    lst = [ ]
    with open(path) as fin:
        for line in fin:
            parts = line.split("\t")
            pid, pos, neg = parts[:3]
            pos = pos.split()
            neg = neg.split()
            if len(pos) == 0 or (len(pos) > prune_pos_cnt and prune_pos_cnt != -1): continue
            if K_neg != -1:
                np.random.shuffle(neg)
                neg = neg[:K_neg]
            s = set()
            qids = [ ]
            qlabels = [ ]
            for q in neg:
                if q not in s:
                    qids.append(q)
                    qlabels.append(0 if q not in pos else 1)
                    s.add(q)
            for q in pos:
                if q not in s:
                    qids.append(q)
                    qlabels.append(1)
                    s.add(q)
            lst.append((pid, qids, qlabels))

    return lst

In [None]:
def precision(at, labels):
    res = []
    for item in labels:
        tmp = item[:at]
        res.append(np.sum(tmp) / at if len(tmp) != 0 else 0.0)
    return sum(res)/len(res) if len(res) != 0 else 0.0

def MAP(labels):
    scores = []
    missing_MAP = 0
    for item in labels:
        temp = []
        count = 0.0
        for i,val in enumerate(item):
            
            if val == 1:
                count += 1.0
                temp.append(count/(i+1))
            if len(temp) > 0:
                scores.append(sum(temp) / len(temp))
            else:
                missing_MAP += 1
    return sum(scores)/len(scores) if len(scores) > 0 else 0.0
    
def MRR(labels):
    scores = []
    for item in labels:
        for i,val in enumerate(item):
            if val == 1:
                scores.append(1.0/(i+1))
                break
    return sum(scores)/len(scores) if len(scores) > 0 else 0.0
    

In [19]:
class Evaluation():
    
    def __init__(self, data):
        self.data = data

    def Precision(self, precision_at):
        scores = []
        for item in self.data:
            temp = item[:precision_at]
            if any(val==1 for val in item):
                scores.append(sum([1 if val==1 else 0 for val in temp])*1.0 / len(temp) if len(temp) > 0 else 0.0)
        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

    def MAP(self):
        scores = []
        missing_MAP = 0
        for item in self.data:
            temp = []
            count = 0.0
            for i,val in enumerate(item):
                if val == 1:
                    count += 1.0
                    temp.append(count/(i+1))
                if len(temp) > 0:
                    scores.append(sum(temp) / len(temp))
                else:
                    missing_MAP += 1
        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

    def MRR(self):
        scores = []
        for item in self.data:
            for i,val in enumerate(item):
                if val == 1:
                    scores.append(1.0/(i+1))
                    break

        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

## Models

In [195]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type):
        
        super(EmbeddingLayer, self).__init__()
        
        self.hidden_size = hidden_size
        
        if layer_type == 'lstm':
            self.embedding_layer = nn.LSTM(input_size, hidden_size)#, batch_first=True)
        elif layer_type == 'cnn':
            pass
        
        self.tanh = nn.Tanh()

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(batch_size, 1, self.hidden_size)), Variable(torch.zeros(batch_size, 1, self.hidden_size)))

    def forward(self, context, seq_len):
        lstm_out, self.hidden = self.embedding_layer(context, (self.tanh(self.hidden[0]), self.tanh(self.hidden[1])))
        mask = build_mask(seq_len)
        embeddings = torch.sum(lstm_out * Variable(torch.FloatTensor(mask)), dim=1) \
            / Variable(torch.FloatTensor(np.sum(mask, axis=1)))
        return embeddings

In [21]:
def cos_sim(qv, qv_):
    return torch.sum(qv * qv_, dim=1) / (torch.sqrt(torch.sum(qv ** 2, dim=1)) * torch.sqrt(torch.sum(qv_ ** 2, dim=1)))

In [188]:
cos_sim(torch.FloatTensor([[1,1,1,1],[2,3,4,5]]), torch.FloatTensor([[2,1,1,1],[3,4,5,6]]))


 0.9449
 0.9978
[torch.FloatTensor of size 2]

In [94]:
def criterion(embeddings):
    
    # a batch of embeddings
    num_block = embeddings.size()[0] // 22
    loss = 0
    for i in range(num_block):
        block_embeddings = embeddings[ i * 22: (i + 1) * 22 ]
        qs = block_embeddings[0]
        qs_ = block_embeddings[1:22]
        cos_scores = cos_sim(qs.expand(21, 240), qs_)
        pos_score = cos_scores[0]
        neg_score = torch.max(cos_scores[1:])
        
        diff = neg_score - pos_score + 1 # margin=1
        if diff.data[0] > 0:
            loss += diff
            
    return loss / num_block # , cos_scores

In [23]:
def build_mask(seq_len):
    mask = []
    for i, s in enumerate(seq_len):
        s_mask = np.zeros((124, 1))
        s_mask[:int(s)] = np.ones((int(s), 1))
        mask += [s_mask]
    return mask

In [196]:
# mdl = EmbeddingLayer(200, 240, 'lstm')
def train(layer_type, mdl, batch_size=64, num_epoch=100, eval=True):
    
    if layer_type == 'lstm':
        embedding_layer = mdl
    elif layer_type == 'cnn':
        pass
        
    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.005)
    
    qids = list(train_idx_set.keys())
    num_batch = len(qids) // batch_size
    
    for epoch in range(1, num_epoch + 1):
        
        for batch_idx in range(1, num_batch + 1):
            
            batch_x_qids = qids[(batch_idx - 1) * batch_size: batch_idx * batch_size]
#             print(batch_x_qids)
            start = time.time()
            print ('processing batch {}'.format(batch_idx))
            padded_batch_x, seq_len = process_batch(batch_x_qids, train_idx_set)
#             print(padded_batch_x.shape, len(seq_len))
            
            
            print ('processing batch costs:', time.time() - start)
            embedding_layer.hidden = embedding_layer.init_hidden(padded_batch_x.shape[0])
            optimizer.zero_grad()
            
            start = time.time()
            qs = Variable(torch.FloatTensor(padded_batch_x))
            embeddings = embedding_layer(qs, seq_len)
            print ('embedding costs:', time.time() - start)
            start = time.time()
            
#             print(embeddings.size())
            
            
            
            print ('accumulating loss costs:', time.time() - start)
            loss = criterion(embeddings)
            
            print ('-------------------------------')
            print ('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, batch_idx, num_batch, loss.data[0]))
            print ('-------------------------------')
            start = time.time()
            print('gradient befor back:', loss.grad)
            loss.backward()
            print('gradient after back:', loss.grad)
            print ('backprop costs:', time.time() - start)
            optimizer.step()
            tmp = embedding_layer
            if eval:
                labels = []
                for qid_ in dev_data.keys():
                    dev_batch, dev_len = process_eval_batch(qid_, dev_data)
                    embedding_layer.hidden = embedding_layer.init_hidden(dev_batch.shape[0])
                    qs_ = Variable(torch.FloatTensor(dev_batch))
                    embeddings = embedding_layer(qs_, dev_len)
                    cos_scores = evaluate(embeddings)
                    labels.append(np.array(dev_data[qid_]['label'])[np.argsort(cos_scores.data.numpy())][::-1])
                print ('Dev Performance P@5', precision(5, labels))
                print ('Dev Performance P@1', precision(1, labels))
                print ('Dev Performance MAP', MAP(labels))
                print ('Dev Performance MRR', MRR(labels))
                
            

In [179]:

    
#     def MAP(self):
#         scores = []
#         missing_MAP = 0
#         for item in self.data:
#             temp = []
#             count = 0.0
#             for i,val in enumerate(item):
#                 if val == 1:
#                     count += 1.0
#                     temp.append(count/(i+1))
#                 if len(temp) > 0:
#                     scores.append(sum(temp) / len(temp))
#                 else:
#                     missing_MAP += 1
#         return sum(scores)/len(scores) if len(scores) > 0 else 0.0

#     def MRR(self):
#         scores = []
#         for item in self.data:
#             for i,val in enumerate(item):
#                 if val == 1:
#                     scores.append(1.0/(i+1))
#                     break

#         return sum(scores)/len(scores) if len(scores) > 0 else 0.0
# scores = []
# for qid_ in dev_data.keys():
#     dev_batch, dev_len = process_eval_batch(qid_, dev_data)
#     mdl.hidden = mdl.init_hidden(dev_batch.shape[0])
#     qs_ = Variable(torch.FloatTensor(dev_batch))
#     embeddings = mdl(qs_, dev_len)
#     cos_scores = evaluate(embeddings)
#     scores.append(cos_scores)

# precision(5, scores, dev_data)

In [184]:
labels = []
qids = dev_data.keys()
for qid_ in qids:
    dev_batch, dev_len = process_eval_batch(qid_, dev_data)
    mdl.hidden = mdl.init_hidden(dev_batch.shape[0])
    qs_ = Variable(torch.FloatTensor(dev_batch))
    embeddings = mdl(qs_, dev_len)
    cos_scores = evaluate(embeddings)
    labels.append(np.array(dev_data[qid_]['label'])[np.argsort(cos_scores.data.numpy())][::-1])
print ('Dev Performance P@5', precision(5, labels))
print ('Dev Performance P@1', precision(1, labels))
print ('Dev Performance MAP', MAP(labels))
print ('Dev Performance MRR', MRR(labels))

KeyboardInterrupt: 

In [172]:
labels[2].shape

(20,)

In [None]:
def Precision(self, precision_at):
    scores = []
    for item in self.data:
        temp = item[:precision_at]
        if any(val==1 for val in item):
            scores.append(sum([1 if val==1 else 0 for val in temp])*1.0 / len(temp) if len(temp) > 0 else 0.0)
    return sum(scores)/len(scores) if len(scores) > 0 else 0.0

In [194]:
mdl = EmbeddingLayer(200, 240, 'lstm')
train('lstm', mdl, batch_size=25, num_epoch=1)

processing batch 1
processing batch costs: 0.3981361389160156
embedding costs: 11.348710060119629
accumulating loss costs: 1.1920928955078125e-06
-------------------------------
epoch:1/1, batch:1/10, loss:1.0000015497207642
-------------------------------
gradient befor back: None
gradient after back: None
backprop costs: 6.78510594367981
Dev Performance P@5 0.289440993789
Dev Performance P@1 0.304347826087
Dev Performance MAP 0.5013439828151541
Dev Performance MRR 0.5040220409319308
processing batch 2
processing batch costs: 0.47588086128234863
embedding costs: 10.744858980178833
accumulating loss costs: 9.5367431640625e-07
-------------------------------
epoch:1/1, batch:2/10, loss:0.9999990463256836
-------------------------------
gradient befor back: None
gradient after back: None
backprop costs: 6.25277304649353
Dev Performance P@5 0.285714285714
Dev Performance P@1 0.285714285714
Dev Performance MAP 0.49561478558654337
Dev Performance MRR 0.49428217826098725
processing batch 3
p

In [56]:
read_annotations('data/dev.txt')[0]

('501754',
 ['399513',
  '456861',
  '144028',
  '269913',
  '491992',
  '23596',
  '428574',
  '396630',
  '50146',
  '313023',
  '419839',
  '416899',
  '438505',
  '459246',
  '23731',
  '386606',
  '218016',
  '17630',
  '462024',
  '400442'],
 [1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [99]:
tmp = 0
for i in dev_data.keys():
    tmp += 1
print(tmp)

161


In [70]:
dev = read_annotations('data/dev.txt')
dev_data = {}
for item in dev:
    qid = int(item[0])
    dev_data[qid] = {}
    dev_data[qid]['q'] = list(map(int, item[1]))
    dev_data[qid]['label'] = item[2]

In [71]:
dev_data[501754]

{'label': [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
 'q': [428574,
  400442,
  218016,
  491992,
  419839,
  17630,
  144028,
  399513,
  416899,
  23731,
  459246,
  386606,
  23596,
  438505,
  462024,
  50146,
  396630,
  313023,
  269913,
  456861]}

In [146]:
dev = read_annotations('data/dev.txt')
dev_data = {}
for item in dev:
    qid = int(item[0])
    dev_data[qid] = {}
    dev_data[qid]['q'] = list(map(int, item[1]))
    dev_data[qid]['label'] = item[2]

# create eval batch 
def process_eval_batch(qid, data):
    qid_dict = data[qid]
    qs = qid_dict['q']
    batch_x = np.zeros(( len(qs)+1, 124, 200 ))
    seq_len = np.zeros(len(qs)+1)
    counter = 0
    for qid_ in [qid] + qs:
        title, body = context_repre[qid_]['t'], context_repre[qid_]['b']
        if not body:
            q_seq_len = len(title)
        else:
            q_seq_len = len(title) + len(body)
        seq_len[counter] = q_seq_len
        batch_x[counter, : q_seq_len] = contxt2vec(title, body)
        counter += 1
    return batch_x, seq_len

def evaluate(embeddings): # (n x 240)
    qs = embeddings[0]
    qs_ = embeddings[1:]
    cos_scores = cos_sim(qs.expand(len(embeddings)-1, 240), qs_)
    return cos_scores

In [105]:
dev_batch = process_eval_batch(501754, dev_data)
dev_batch[0].shape, dev_batch[1].shape

((21, 124, 200), (21,))