In [579]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

Using TensorFlow backend.
  return f(*args, **kwds)


## Data Loadings

### word 2 vec repre

In [2]:
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))

In [3]:
w2i_map = {}
for i, key in enumerate(w2v_map.keys()):
    w2i_map[key] = i

In [4]:
pkl.dump(w2i_map, open('data/word_idx_map.pkl','wb'))

### map Q idx to context

In [5]:
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
counter = 0
for _, val in w2v_map.items():
    w2v_matrix[counter] = val
    counter += 1

In [6]:
pkl.dump(w2i_map, open('data/w2v_matrix.pkl','wb'))

In [7]:
def w2v(w):
    return w2v_matrix[w2i_map[w]]

In [8]:
def sen2w(sen):
    processed = []
    sen = re.sub(r'[!#\'(),/:?\{}]', ' ', sen).strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        # ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            if len(set(separated)) == 1:
                continue
            if separated.count('*') > 3 or separated.count('=') > 3:
                continue
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [9]:
# fixed context len = 125
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        context = line.strip().split('\t')
        qid = context.pop(0)
        if len(context) == 1:
            context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
        else:
            len_title = len(context[0])
            if len_title >= 125:
                context_repre[int(qid)] = {'t':sen2w(context[0])[:125], 'b': None}
            else:
                context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])[:125 - len_title]}

### len of context ??

In [631]:
all_lens = []
for k, v in context_repre.items():
    t, b = v['t'], v['b']
    if not v['b']:
        b = []
    all_lens += [len(t)+len(b)]

In [633]:
all(np.array(all_lens) < 125)

True

In [10]:
train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])

In [11]:
train_df.head()

Unnamed: 0,Q,Q+,Q-
0,262144,211039,227387 413633 113297 356390 256881 145638 2962...
1,491522,65911,155119 402211 310669 383107 131731 299465 1633...
2,240299,168608 390642,368007 70009 48077 376760 438005 228888 142340...
3,196614,205184,334471 163710 376791 441664 159963 406360 4300...
4,360457,321532,151863 501857 217578 470017 125838 31836 42066...


In [12]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

In [13]:
train_idx_set = build_set_pair_with_idx(train_df)

## Process Batch

In [612]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
#     v = np.zeros( (len(title) + len(body), 200) )
#     counter = 0
    title_v = np.zeros( (len(title), 200) )
    
    for i, t in enumerate(title):
        title_v[i] = w2v(t)
    
    if len(body) > 0:
        body_v = np.zeros( (len(body), 200) )
        for i, b in enumerate(body):
            body_v[i] = w2v(b)
    
        return title_v, body_v
    
    return title_v, None
    
#     for t in title:
#         v[counter] = w2v(t)
#         counter += 1

#     for b in body:
#         v[counter] = w2v(b)
#         counter += 1
    
#     return v

In [617]:
def process_contxt_batch(qids, idx_set):
    
    batch_title, batch_body = [], []
    max_title_len, max_body_len = 0, 0
    title_len, body_len = [], []
    counter = 0
    
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']

        for qid_pos in q_pos:

            # query Q
            title_len += [len(q_title)]
            batch_title += [ q_title ]
            max_title_len = max(max_title_len, len(q_title))
            if not q_body:
                body_len += [0]
                batch_body += [ [] ]
            else:
                batch_body += [ q_body ]
                body_len += [len(q_body)]
                max_body_len = max(max_body_len, len(q_body))
                
            # pos Q
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            title_len += [len(title)]
            batch_title += [ title ]
            max_title_len = max(max_title_len, len(title))
            if not body:
                body_len += [0]
                batch_body += [ [] ]
            else:
                batch_body += [ body ]
                body_len += [len(body)]
                max_body_len = max(max_body_len, len(body))
        
            # neg Q
            
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                title_len += [len(title)]
                batch_title += [ title ]
                max_title_len = max(max_title_len, len(title))
                if not body:
                    body_len += [0]
                    batch_body += [ [] ]
                else:
                    batch_body += [ body ]
                    body_len += [len(body)]
                    max_body_len = max(max_body_len, len(body))
    
    # (max_seq_len, batch_size, feature_len)
    padded_batch_title = np.zeros(( max_title_len, len(batch_title), 200)) 
    padded_batch_body = np.zeros(( max_body_len, len(batch_body),  200))
    
    for i, (title, body) in enumerate(zip(batch_title, batch_body)):
        title_repre, body_repre = contxt2vec(title, body)
        padded_batch_title[:title_len[i], i] = title_repre
        if len(body) > 0:
            padded_batch_body[:body_len[i], i] = body_repre
    
    return padded_batch_title, padded_batch_body, np.array(title_len).reshape(-1,1), np.array(body_len).reshape(-1,1)

In [15]:
def process_batch(qids, idx_set):
    
    total_pos_len = 0
    for qid in qids:
        total_pos_len += len(idx_set[qid]['pos'])
    
    # per batch element x: vstack, [query_Q x 1; pos_Q x 1; neg_Q x 20]
    # per batch element y: vstack, [query_Q=-1; pos_Q=1; neg_Q=0]
    batch_x = np.zeros(( total_pos_len * 22, 124, 200 ))
    seq_len = np.zeros(total_pos_len * 22)
    
    counter = 0
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']

        # usually one sample
        for qid_pos in q_pos:
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            # query Q
            if not q_body:
                q_seq_len = len(q_title)
            else:
                q_seq_len = len(q_title) + len(q_body)
            seq_len[counter] = q_seq_len
            batch_x[counter, :q_seq_len] = contxt2vec(q_title, q_body)
            counter += 1
            # pos Q
            if not body:
                pos_q_seq_len = len(title)
            else:
                pos_q_seq_len = len(title) + len(body)
            seq_len[counter] = pos_q_seq_len
            batch_x[counter, :pos_q_seq_len] = contxt2vec(title, body)
            counter += 1
        
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            # neg Q
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                if not body:
                    neg_q_seq_len = len(title)
                else:
                    neg_q_seq_len = len(title) + len(body)
                seq_len[counter] = neg_q_seq_len
                batch_x[counter, : neg_q_seq_len] = contxt2vec(title, body)
                counter += 1
    
    return batch_x, seq_len

## Evaluation

In [40]:
def read_annotations(path, K_neg=20, prune_pos_cnt=10):
    lst = [ ]
    with open(path) as fin:
        for line in fin:
            parts = line.split("\t")
            pid, pos, neg = parts[:3]
            pos = pos.split()
            neg = neg.split()
            if len(pos) == 0 or (len(pos) > prune_pos_cnt and prune_pos_cnt != -1): continue
            if K_neg != -1:
                np.random.shuffle(neg)
                neg = neg[:K_neg]
            s = set()
            qids = [ ]
            qlabels = [ ]
            for q in neg:
                if q not in s:
                    qids.append(q)
                    qlabels.append(0 if q not in pos else 1)
                    s.add(q)
            for q in pos:
                if q not in s:
                    qids.append(q)
                    qlabels.append(1)
                    s.add(q)
            lst.append((pid, qids, qlabels))

    return lst

In [34]:
class Evaluation():
    
    def __init__(self, data):
        self.data = data

    def Precision(self, precision_at):
        scores = []
        for item in self.data:
            temp = item[:precision_at]
            if any(val==1 for val in item):
                scores.append(sum([1 if val==1 else 0 for val in temp])*1.0 / len(temp) if len(temp) > 0 else 0.0)
        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

    def MAP(self):
        scores = []
        missing_MAP = 0
        for item in self.data:
            temp = []
            count = 0.0
            for i,val in enumerate(item):
                if val == 1:
                    count += 1.0
                    temp.append(count/(i+1))
                if len(temp) > 0:
                    scores.append(sum(temp) / len(temp))
                else:
                    missing_MAP += 1
        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

    def MRR(self):
        scores = []
        for item in self.data:
            for i,val in enumerate(item):
                if val == 1:
                    scores.append(1.0/(i+1))
                    break

        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

## Models

In [625]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type, kernel_size=None):
        
        super(EmbeddingLayer, self).__init__()
        
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        
        if layer_type == 'lstm':
            self.layer_type = 'lstm'
            self.embedding_layer = nn.LSTM(input_size, hidden_size)
            self.tanh = nn.Tanh()
        elif layer_type == 'cnn':
            self.layer_type = 'cnn'
            self.embedding_layer = nn.Sequential(
                        nn.Conv1d(in_channels = 200,
                                  out_channels = self.hidden_size,
                                  kernel_size = kernel_size),
                        nn.Tanh())

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(batch_size, 1, self.hidden_size)), \
                Variable(torch.zeros(batch_size, 1, self.hidden_size)))

    def forward(self, context, seq_len):
            
        if self.layer_type == 'lstm':
            
            
            lstm_out, self.hidden = self.embedding_layer(context, (self.tanh(self.hidden[0]), \
                                                                   self.tanh(self.hidden[1])))
            mask = build_mask(seq_len, 124)
            embeddings = torch.sum(lstm_out * Variable(torch.FloatTensor(mask)), dim=1) \
                / Variable(torch.FloatTensor(np.sum(mask, axis=1)))
        
        elif self.layer_type == 'cnn':
            
            cnn_out = self.embedding_layer(context.view(-1, context.size()[2], context.size()[1]))
            mask = build_mask(seq_len - self.kernel_size + 1, 124 - self.kernel_size + 1)
            embeddings = torch.sum(cnn_out.view(-1, 124 - self.kernel_size + 1, self.hidden_size) \
                                   * Variable(torch.FloatTensor(mask)), dim=1) \
                / Variable(torch.FloatTensor(np.sum(mask, axis=1)))
        
        return embeddings

In [626]:
def cos_sim(qv, qv_):
    return torch.sum(qv * qv_, dim=1) / (torch.sqrt(torch.sum(qv ** 2, dim=1)) * torch.sqrt(torch.sum(qv_ ** 2, dim=1)))

In [627]:
def criterion(embeddings):
    
    # a batch of embeddings
    num_block = embeddings.size()[0] // 22
    loss = 0
    for i in range(num_block):
        block_embeddings = embeddings[ i * 22: (i + 1) * 22 ]
        qs = block_embeddings[0]
        qs_ = block_embeddings[1:22]
        cos_scores = cos_sim(qs.expand(21, 240), qs_)
        pos_score = cos_scores[0]
        neg_score = torch.max(cos_scores[1:])
        diff = neg_score - pos_score + 1 # margin=1
        if diff.data[0] > 0:
            loss += diff
    return loss / num_block

In [628]:
def build_mask(seq_len, max_len):
    mask = []
    for i, s in enumerate(seq_len):
        s_mask = np.zeros((max_len, 1))
        s_mask[:int(s)] = np.ones((int(s), 1))
        mask += [s_mask]
    return mask

In [629]:
def train(layer_type, batch_size=25, num_epoch=100):
    
    if layer_type == 'lstm':
        embedding_layer = EmbeddingLayer(200, 240, 'lstm')
    elif layer_type == 'cnn':
        embedding_layer = EmbeddingLayer(200, 240, 'cnn', kernel_size=3)
        
    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.005)
    
    qids = list(train_idx_set.keys())[:640]
    num_batch = len(qids) // batch_size
    
    for epoch in range(1, num_epoch + 1):
        
        for batch_idx in range(1, num_batch + 1):
            
            batch_x_qids = qids[ ( batch_idx - 1 ) * batch_size: batch_idx * batch_size]
            start = time.time()
            print ('processing batch {}'.format(batch_idx))
            padded_batch_x, seq_len = process_batch(batch_x_qids, train_idx_set)
            print ('processing batch costs:', time.time() - start)
            if layer_type == 'lstm':
                embedding_layer.hidden = embedding_layer.init_hidden(padded_batch_x.shape[0])
            optimizer.zero_grad()
            
            start = time.time()
            qs = Variable(torch.FloatTensor(padded_batch_x))
            embeddings = embedding_layer(qs, seq_len)
            print ('embedding costs:', time.time() - start)

            loss = criterion(embeddings)
            print ('-------------------------------')
            print ('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, batch_idx, num_batch, loss.data[0]))
            print ('-------------------------------')
            start = time.time()
            loss.backward()
            print ('backprop costs:', time.time() - start)
            optimizer.step()

In [630]:
train('lstm', num_epoch=5)

processing batch 1


ValueError: could not broadcast input array from shape (2) into shape (73,200)

## Test CNN

In [74]:
qids = list(train_idx_set.keys())
batch_x_qids = qids[:10]
batch_x, seq_len = process_batch(batch_x_qids, train_idx_set)

In [519]:
batch_x[0][:int(seq_len[0])]

array([[ 0.101999, -0.104434, -0.012801, ...,  0.034353, -0.013605,
        -0.037034],
       [ 0.00388 , -0.07965 , -0.044619, ..., -0.021587,  0.023161,
        -0.135637],
       [-0.026436,  0.013091, -0.037213, ..., -0.059916,  0.027431,
         0.020814],
       ..., 
       [-0.026436,  0.013091, -0.037213, ..., -0.059916,  0.027431,
         0.020814],
       [ 0.00388 , -0.07965 , -0.044619, ..., -0.021587,  0.023161,
        -0.135637],
       [-0.026436,  0.013091, -0.037213, ..., -0.059916,  0.027431,
         0.020814]])

In [280]:
qs = Variable(torch.FloatTensor(batch_x))

In [465]:
embedding_layer = EmbeddingLayer(200, 240, 'lstm')
embedding_layer.hidden = embedding_layer.init_hidden(batch_x.shape[0])

In [467]:
pooled = embedding_layer(qs, seq_len)

In [576]:
mask = build_mask(seq_len, 124)

In [361]:
cnn = nn.Sequential(
            nn.Conv1d(in_channels = 200,
                      out_channels = 240,
                      kernel_size = 3),
            nn.Tanh())

In [362]:
cnn_embedded = cnn(qs.view(-1, 200, 124)).view(-1, 122, 240)

In [363]:
mask = build_mask(seq_len - 3 + 1, 122)

In [364]:
torch.sum(cnn_embedded * Variable(torch.FloatTensor(mask)), dim=1) \
                / Variable(torch.FloatTensor(np.sum(mask, axis=1)))

Variable containing:
-1.2164e-03 -2.9757e-03 -4.3598e-03  ...   5.5581e-03  2.8719e-03  8.5967e-03
-9.0396e-04 -4.5738e-03  1.3033e-03  ...  -5.9795e-03 -1.0130e-03  5.0258e-03
-4.3827e-03  4.5129e-03 -9.6274e-03  ...  -5.7902e-03  5.3599e-03  2.4672e-03
                ...                   ⋱                   ...                
-3.2514e-03 -2.0969e-04 -4.0796e-03  ...   3.6529e-03 -3.3661e-05  8.5487e-03
 1.2756e-03  1.2373e-04  5.6213e-04  ...  -9.9583e-04 -2.3388e-04  9.8959e-03
-6.0902e-03 -2.7618e-03 -8.5951e-03  ...  -6.0784e-03 -2.5088e-03 -1.8907e-03
[torch.FloatTensor of size 242x240]

In [414]:
Variable(torch.FloatTensor(np.sum(mask, axis=1)))

Variable containing:
   71
   63
   95
   82
   89
   79
   46
   79
   79
   82
   88
   74
   75
   44
   90
   54
   94
   97
   76
   44
   94
   74
   68
   40
   79
   98
   74
   68
   97
  102
   78
   70
   51
   94
   91
   95
   92
   92
   58
   80
   67
   72
   64
   66
   93
   80
   47
   55
   72
   90
   13
   85
   82
   88
   99
   66
   61
   35
   72
   86
   79
   77
   47
   83
   38
   56
   93
   37
   23
   56
   84
   91
   99
   27
  101
   13
   79
   79
   47
   40
   85
   83
   47
   78
   85
   59
   64
   87
   60
   58
   81
   72
  102
   89
   93
   52
   91
   56
   31
   92
   68
   72
   81
   61
   31
   92
   74
   82
   86
   63
   79
   76
   94
   94
   63
   96
   83
   92
   92
   58
   35
  107
   86
   60
   88
   45
   70
   83
   42
   55
   68
   23
   70
   53
   60
   83
   99
   59
   75
   75
   71
   61
   90
   98
   47
   94
   32
   66
   60
   59
   54
   49
   99
   94
   33
   40
   72
   76
   83
   94
   90
  102
  102
 

In [420]:
a = Variable(torch.FloatTensor(242, 20))

In [421]:
a

Variable containing:
-2.0489e-28  2.8026e-45  1.5785e-36  ...   1.4013e-45  8.2920e-38  1.4013e-45
 1.5785e-36  1.4013e-45 -2.9426e-35  ...   1.4013e-45  1.5785e-36  1.4013e-45
-2.9432e-35  1.4013e-45  8.2920e-38  ...   1.4013e-45 -2.9429e-35  1.4013e-45
                ...                   ⋱                   ...                
-2.9876e-35  1.4013e-45  8.2920e-38  ...   1.4013e-45 -2.9877e-35  1.4013e-45
 8.2920e-38  1.4013e-45  1.5785e-36  ...   1.4013e-45  8.2920e-38  1.4013e-45
 1.5785e-36  1.4013e-45 -2.9878e-35  ...   1.4013e-45  1.5785e-36  1.4013e-45
[torch.FloatTensor of size 242x20]

In [423]:
a / Variable(torch.FloatTensor(np.sum(mask, axis=1)))

Variable containing:
-2.8857e-30  0.0000e+00  2.2232e-38  ...   0.0000e+00  1.1679e-39  0.0000e+00
 2.5055e-38  0.0000e+00 -4.6708e-37  ...   0.0000e+00  2.5055e-38  0.0000e+00
-3.0981e-37  0.0000e+00  8.7284e-40  ...   0.0000e+00 -3.0978e-37  0.0000e+00
                ...                   ⋱                   ...                
-3.3569e-37  0.0000e+00  9.3169e-40  ...   0.0000e+00 -3.3570e-37  0.0000e+00
 9.0131e-40  0.0000e+00  1.7157e-38  ...   0.0000e+00  9.0131e-40  0.0000e+00
 4.9327e-38  0.0000e+00 -9.3368e-37  ...   0.0000e+00  4.9327e-38  0.0000e+00
[torch.FloatTensor of size 242x20]

In [471]:
torch.max(cos_sim(pooled[0].expand(21,240), pooled[1:22])[1:]) - cos_sim(pooled[0].expand(21,240), pooled[1:22])[0]

Variable containing:
1.00000e-05 *
 -6.2883
[torch.FloatTensor of size 1]

In [566]:
pooled.req

AttributeError: 'Variable' object has no attribute 'req'

In [586]:
torch.sum(out, dim=0)

Variable containing:
 3.6748e+00  3.1938e+00  6.7675e+00  ...   1.0466e+00 -2.3588e+00 -3.7777e+00
 4.8605e+00  3.8674e+00  9.5440e+00  ...   1.3729e+00 -3.6389e+00 -5.2017e+00
 6.8265e+00  4.0363e+00  1.0813e+01  ...   2.2500e+00 -3.9408e+00 -6.3748e+00
                ...                   ⋱                   ...                
 1.2466e+01  3.2535e+00  1.1807e+01  ...   3.0170e+00 -4.1371e+00 -6.4224e+00
 1.2466e+01  3.2535e+00  1.1807e+01  ...   3.0170e+00 -4.1371e+00 -6.4224e+00
 1.2466e+01  3.2535e+00  1.1807e+01  ...   3.0170e+00 -4.1371e+00 -6.4224e+00
[torch.FloatTensor of size 124x240]

In [604]:
torch.sum(Variable(torch.FloatTensor([[[1,2],[3,4]],[[5,6],[7,8]]])) * Variable(torch.FloatTensor([[[1],[0]],[[0],[1]]])), dim=1) / Variable(torch.FloatTensor([[2],[3]]))

Variable containing:
 0.5000  1.0000
 2.3333  2.6667
[torch.FloatTensor of size 2x2]