In [2]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

In [None]:
# global
HIDDEN_DIM = 120

In [3]:
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))

w2i_map = {}
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
for i, (key, val) in enumerate(w2v_map.items()):
    w2i_map[key] = i
    w2v_matrix[i] = val

def w2v(w):
    return w2v_matrix[w2i_map[w]]

def sen2w(sen):
    processed = []
    sen = sen.strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        #ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            if len(set(separated)) == 1:
                continue
            if separated.count('*') > 3 or separated.count('=') > 3:
                continue
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [4]:
# fixed context len = 125
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        context = line.strip().split('\t')
        qid = context.pop(0)
        if len(context) == 1:
            context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
        else:
            context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])}

In [5]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])
train_idx_set = build_set_pair_with_idx(train_df)

In [6]:
train_df.head()

Unnamed: 0,Q,Q+,Q-
0,262144,211039,227387 413633 113297 356390 256881 145638 2962...
1,491522,65911,155119 402211 310669 383107 131731 299465 1633...
2,240299,168608 390642,368007 70009 48077 376760 438005 228888 142340...
3,196614,205184,334471 163710 376791 441664 159963 406360 4300...
4,360457,321532,151863 501857 217578 470017 125838 31836 42066...


In [8]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
    title_v = np.zeros( (len(title), 200) )
    
    for i, t in enumerate(title):
        title_v[i] = w2v(t)
    
    if len(body) > 0:
        body_v = np.zeros( (len(body), 200) )
        for i, b in enumerate(body):
            body_v[i] = w2v(b)
    
        return title_v, body_v
    
    return title_v, None

In [595]:
def process_contxt_batch(qids, idx_set, batch_first=False):
    
    batch_title, batch_body = [], []
    max_title_len, max_body_len = 0, 0
    title_len, body_len = [], []
    
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']
        
        if len(q_pos) > 20:
            q_pos = q_pos[:20]

        for qid_pos in q_pos:
            # query Q
            title_len += [len(q_title)]
            batch_title += [ q_title ]
            max_title_len = max(max_title_len, len(q_title))
            if not q_body:
                body_len += [len(q_title)]
                batch_body += [ q_title ]
            else:
                batch_body += [ q_body ]
                body_len += [len(q_body)]
                max_body_len = max(max_body_len, len(q_body))
                
            # pos Q
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            title_len += [len(title)]
            batch_title += [ title ]
            max_title_len = max(max_title_len, len(title))
            if not body:
                body_len += [len(title)]
                batch_body += [ title ]
            else:
                batch_body += [ body ]
                body_len += [len(body)]
                max_body_len = max(max_body_len, len(body))
            # neg Q
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                title_len += [len(title)]
                batch_title += [ title ]
                max_title_len = max(max_title_len, len(title))
                if not body:
                    body_len += [len(title)]
                    batch_body += [ title ]
                else:
                    batch_body += [ body ]
                    body_len += [len(body)]
                    max_body_len = max(max_body_len, len(body))
    
    if batch_first:
        # for CNN
        padded_batch_title = np.zeros(( len(batch_title), max_title_len, 200)) 
        padded_batch_body = np.zeros(( len(batch_body),  max_body_len, 200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[i, :title_len[i]] = title_repre
            padded_batch_body[i, :body_len[i]] = body_repre
    else:
        # for LSTM
        # (max_seq_len, batch_size, feature_len)
        padded_batch_title = np.zeros(( max_title_len, len(batch_title), 200)) 
        padded_batch_body = np.zeros(( max_body_len, len(batch_body),  200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[:title_len[i], i] = title_repre
            padded_batch_body[:body_len[i], i] = body_repre

    return padded_batch_title, padded_batch_body, \
                np.array(title_len).reshape(-1,1), np.array(body_len).reshape(-1,1)

# Eval

In [616]:
def read_annotations(path, K_neg=20, prune_pos_cnt=20):
    lst = [ ]
    with open(path) as fin:
        for line in fin:
            parts = line.split("\t")
            pid, pos, neg = parts[:3]
            pos = pos.split()
            neg = neg.split()
            if len(pos) == 0 or (len(pos) > prune_pos_cnt and prune_pos_cnt != -1): continue
            if K_neg != -1:
                np.random.shuffle(neg)
                neg = neg[:K_neg]
            s = set()
            qids = [ ]
            qlabels = [ ]
            for q in neg:
                if q not in s:
                    qids.append(q)
                    qlabels.append(0 if q not in pos else 1)
                    s.add(q)
            for q in pos:
                if q not in s:
                    qids.append(q)
                    qlabels.append(1)
                    s.add(q)
            lst.append((pid, qids, qlabels))

    return lst

def cos_sim(qv, qv_):
    return torch.sum(qv * qv_, dim=1) / (torch.sqrt(torch.sum(qv ** 2, dim=1)) * torch.sqrt(torch.sum(qv_ ** 2, dim=1)))
    
# create eval batch 
def process_eval_batch(qid, data, batch_first=False):
    qid_dict = data[qid]
    qs = qid_dict['q']
    max_title_len, max_body_len = 0, 0
    title_len, body_len = [], []
    batch_title, batch_body = [], []
    for qid_ in [qid] + qs:
        title, body = context_repre[qid_]['t'], context_repre[qid_]['b']
        title_len += [len(title)]
        batch_title += [ title ]
        max_title_len = max(max_title_len, len(title))
        if not body:
            body_len += [len(title)]
            batch_body += [ title ]
        else:
            batch_body += [ body ]
            body_len += [len(body)]
            max_body_len = max(max_body_len, len(body))
            
    if batch_first:
        padded_batch_title = np.zeros(( len(batch_title), max_title_len, 200)) 
        padded_batch_body = np.zeros(( len(batch_body),  max_body_len, 200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[i, :title_len[i]] = title_repre
            padded_batch_body[i, :body_len[i]] = body_repre
    else:
        padded_batch_title = np.zeros(( max_title_len, len(batch_title), 200)) 
        padded_batch_body = np.zeros(( max_body_len, len(batch_body),  200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[:title_len[i], i] = title_repre
            padded_batch_body[:body_len[i], i] = body_repre
    
    return padded_batch_title, padded_batch_body, \
           np.array(title_len).reshape(-1,1), np.array(body_len).reshape(-1,1) 
    
def evaluate(embeddings): # (n x 240)
    qs = embeddings[0]
    qs_ = embeddings[1:]
    cos_scores = cos_sim(qs.expand(len(embeddings)-1, qs.size(0)), qs_)
    return cos_scores

def precision(at, labels):
    res = []
    for item in labels:
        tmp = item[:at]
        if any(val==1 for val in item):
            res.append(np.sum(tmp) / len(tmp) if len(tmp) != 0 else 0.0)
    return sum(res)/len(res) if len(res) != 0 else 0.0

def MAP(labels):
    scores = []
    missing_MAP = 0
    for item in labels:
        temp = []
        count = 0.0
        for i,val in enumerate(item):
            
            if val == 1:
                count += 1.0
                temp.append(count/(i+1))
            if len(temp) > 0:
                scores.append(sum(temp) / len(temp))
            else:
                missing_MAP += 1
    return sum(scores)/len(scores) if len(scores) > 0 else 0.0
    
def MRR(labels):
    scores = []
    for item in labels:
        for i,val in enumerate(item):
            if val == 1:
                scores.append(1.0/(i+1))
                break
    return sum(scores)/len(scores) if len(scores) > 0 else 0.0

In [617]:
# DEV SET
dev = read_annotations('data/dev.txt')
dev_data = {}
for item in dev:
    qid = int(item[0])
    dev_data[qid] = {}
    dev_data[qid]['q'] = list(map(int, item[1]))
    dev_data[qid]['label'] = item[2]

# TEST SET
test = read_annotations('data/test.txt')
test_data = {}
for item in test:
    qid = int(item[0])
    test_data[qid] = {}
    test_data[qid]['q'] = list(map(int, item[1]))
    test_data[qid]['label'] = item[2]

In [618]:
def do_eval(embedding_layer, eval_name, batch_first=False):
    
    if eval_name == 'Dev':
        eval_data = dev_data
        eval_map = {}
        for qid_ in dev_data.keys():
            eval_map[qid_] = process_eval_batch(qid_, dev_data, batch_first=batch_first)
            
    elif eval_name == 'Test':
        eval_data = test_data
        eval_map = {}
        for qid_ in test_data.keys():
            eval_map[qid_] = process_eval_batch(qid_, test_data, batch_first=batch_first)
    
    labels = []
    
    for qid_ in eval_map.keys():
        
        eval_title_batch, eval_body_batch, eval_title_len, eval_body_len = eval_map[qid_] # process_eval_batch(qid_, eval_data)
        embedding_layer.title_hidden = embedding_layer.init_hidden(eval_title_batch.shape[1])
        embedding_layer.body_hidden = embedding_layer.init_hidden(eval_body_batch.shape[1])
        eval_title_qs = Variable(torch.FloatTensor(eval_title_batch))
        eval_body_qs = Variable(torch.FloatTensor(eval_body_batch))
        embeddings = embedding_layer(eval_title_qs, eval_body_qs, eval_title_len, eval_body_len)
        cos_scores = evaluate(embeddings)
        labels.append(np.array(eval_data[qid_]['label'])[np.argsort(cos_scores.data.numpy())][::-1])
    
    print (eval_name + ' Performance MAP', MAP(labels))
    print (eval_name + ' Performance MRR', MRR(labels))
    print (eval_name + ' Performance P@1', precision(1, labels))
    print (eval_name + ' Performance P@5', precision(5, labels))

# Train Utility

In [619]:
def build_mask3d(seq_len, max_len):
    mask = np.zeros((max_len, len(seq_len), 1))
    for i, s in enumerate(seq_len):
        # only one word
        if int(s) == -1:
            mask[0, i] = 1
        # only two word
        elif int(s) == 0:
            mask[:2, i] = np.ones((2, 1))
        else: 
            mask[:int(s), i] = np.ones((int(s), 1))
    return mask

def multi_margin_loss(hidden, margin=0.50):
    
    def loss_func(embeddings):
        # a batch of embeddings
        blocked_embeddings = embeddings.view(-1, 22, hidden)
        q_vecs = blocked_embeddings[:,0,:]
        
        pos_vecs = blocked_embeddings[:,1,:]
        neg_vecs = blocked_embeddings[:,2:,:]

        pos_scores = torch.sum(q_vecs * pos_vecs, dim=1) / (torch.sqrt(torch.sum(q_vecs ** 2, dim=1)) \
                                                   * torch.sqrt(torch.sum(pos_vecs ** 2, dim=1)))
        neg_scores = torch.sum(torch.unsqueeze(q_vecs, dim=1) * neg_vecs, dim=2) \
        / (torch.unsqueeze(torch.sqrt(torch.sum(q_vecs ** 2, dim=1)),dim=1) * torch.sqrt(torch.sum( neg_vecs ** 2, dim=2)))
        neg_scores = torch.max(neg_scores, dim=1)[0]
        
        diff = neg_scores - pos_scores + margin
        loss = torch.mean((diff > 0).float() * diff)
        return loss

    return loss_func

# Model

In [627]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type, num_layer=1, kernel_size=3):
        
        super(EmbeddingLayer, self).__init__()
        
        self.num_layer = num_layer
        
        self.layer_type = layer_type
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        
        self.tanh = nn.Tanh()
        
        if self.layer_type == 'lstm':
            
            self.embedding_layer = nn.LSTM(self.input_size, hidden_size, bidirectional=True)
        
        elif self.layer_type == 'cnn':

            self.embedding_layer = nn.Sequential(
                        nn.Conv1d(in_channels = self.input_size,
                                  out_channels = self.hidden_size,
                                  kernel_size = kernel_size),
                        self.tanh)

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size)), \
                Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size)))

    def forward(self, title, body, title_len, body_len):
        
            
        if self.layer_type == 'lstm':
            
            title_mask = Variable(torch.FloatTensor(build_mask3d(title_len, np.max(seq_len))))
            body_mask = Variable(torch.FloatTensor(build_mask3d(body_len, np.max(body_len))))
            
            
            title_out, self.title_hidden = self.embedding_layer(title, (self.tanh(self.title_hidden[0]), \
                                                                   self.tanh(self.title_hidden[1])))
            body_out, self.body_hidden = self.embedding_layer(body, (self.tanh(self.body_hidden[0]), \
                                                                   self.tanh(self.body_hidden[1])))
        
        if self.layer_type == 'cnn':
            # batch first input
            title_mask = Variable(torch.FloatTensor(build_mask3d(title_len - self.kernel_size + 1,\
                                                                 np.max(title_len) - self.kernel_size + 1)))
            body_mask = Variable(torch.FloatTensor(build_mask3d(body_len - self.kernel_size + 1, \
                                                                np.max(body_len) - self.kernel_size + 1)))
            
            title = torch.transpose(title, 1, 2)
            body = torch.transpose(body, 1, 2)
#             title = title.view(-1, title.size(2), title.size(1))
#             body = body.view(-1, body.size(2), body.size(1))
#             print ('after transpose:', title.size())
            title_out =  self.embedding_layer(title)
            body_out =  self.embedding_layer(body)
#             print ('after embedding:', title_out.size())
            title_out = torch.transpose(title_out, 1, 2)
            body_out = torch.transpose(body_out, 1, 2)
#             print ('after transpose:', title_out.size())
            title_out = torch.transpose(title_out, 0, 1)
            body_out = torch.transpose(body_out, 0, 1)
#             print ('after transpose:', title_out.size())
#             title_out = title_out.view(title_out.size(1), -1, title_out.size(2))
#             body_out = body_out.view(body_out.size(1), -1, body_out.size(2))
#             print ('after view embedding :', title_out.size())
        
        #print ('mask size:', title_mask.size())
        title_embeddings = torch.sum(title_out * title_mask, dim=0) / torch.sum(title_mask, dim=0)
        body_embeddings = torch.sum(body_out * body_mask, dim=0) / torch.sum(body_mask, dim=0)
        embeddings = ( title_embeddings + body_embeddings ) / 2
        
        return embeddings#title_embeddings, body_embeddings

In [592]:
def save_model(mdl, path):
    # saving model params
    torch.save(mdl.state_dict(), path)

def restore_model(mdl_skeleton, path):
    # restoring params to the mdl skeleton
    mdl_skeleton.load_state_dict(torch.load(path))
    return mdl

# Train

In [625]:
def train(layer_type, embedding_layer, batch_size=25, num_epoch=100, id_set=train_idx_set, eval=True):
    
    if layer_type == 'lstm':
        
        embedding_layer = EmbeddingLayer(200, 240, 'lstm')
        criterion = multi_margin_loss(hidden=embedding_layer.hidden_size * 2)
    
    elif layer_type == 'cnn':
        
        embedding_layer = EmbeddingLayer(200, 667, 'cnn')
        criterion = multi_margin_loss(hidden=embedding_layer.hidden_size)
    
    
    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.001)
    
    qids = list(id_set.keys())
    num_batch = len(qids) // batch_size
    
    for epoch in range(1, num_epoch + 1):
        
        for batch_idx in range(1, num_batch + 1):
            
            batch_x_qids = qids[ ( batch_idx - 1 ) * batch_size: batch_idx * batch_size ]
            
            if layer_type == 'lstm':
                batch_title, batch_body, title_len, body_len = process_contxt_batch(batch_x_qids, \
                                                                                train_idx_set)
                embedding_layer.title_hidden = embedding_layer.init_hidden(batch_title.shape[1])
                embedding_layer.body_hidden = embedding_layer.init_hidden(batch_body.shape[1])
            else:
                batch_title, batch_body, title_len, body_len = process_contxt_batch(batch_x_qids, \
                                                                                train_idx_set, batch_first=True)
            
            title_qs = Variable(torch.FloatTensor(batch_title))
            body_qs = Variable(torch.FloatTensor(batch_body))
            
            embeddings = embedding_layer(title_qs, body_qs, title_len, body_len)

            loss = criterion(embeddings)

            print ('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, batch_idx, num_batch, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
            if eval and batch_idx % 5 == 0: # lstm for now
                print ('evaluating ....')
                if layer_type == 'lstm':
                    do_eval(embedding_layer, 'Dev')
                    print ('------------------')
                    do_eval(embedding_layer, 'Test')
                elif layer_type == 'cnn':
                    do_eval(embedding_layer, 'Dev', batch_first=True)
                    print ('------------------')
                    do_eval(embedding_layer, 'Test', batch_first=True)

In [None]:
model = EmbeddingLayer(200, 667, 'cnn') # loss margin = 0.5
train('cnn', model, batch_size=25, num_epoch=10)

In [147]:
save_model(model, 'models/lstm_bi_epoch=4.5_margin=.5_hidden=120')

In [161]:
model_margin_p3 = EmbeddingLayer(200, HIDDEN_DIM, 'lstm')
train('lstm', model, batch_size=25, num_epoch=10)

epoch:1/10, batch:1/508, loss:0.06204039976000786
epoch:1/10, batch:2/508, loss:0.06698644161224365
epoch:1/10, batch:3/508, loss:0.08204396069049835
epoch:1/10, batch:4/508, loss:0.07271836698055267
epoch:1/10, batch:5/508, loss:0.11348847299814224
epoch:1/10, batch:6/508, loss:0.12347452342510223
epoch:1/10, batch:7/508, loss:0.07170307636260986
epoch:1/10, batch:8/508, loss:0.05405590310692787
epoch:1/10, batch:9/508, loss:0.13060428202152252
epoch:1/10, batch:10/508, loss:0.07260049134492874
epoch:1/10, batch:11/508, loss:0.08073266595602036
epoch:1/10, batch:12/508, loss:0.14527538418769836
epoch:1/10, batch:13/508, loss:0.05466698110103607
epoch:1/10, batch:14/508, loss:0.08360808342695236
epoch:1/10, batch:15/508, loss:0.0836087018251419
epoch:1/10, batch:16/508, loss:0.09609977155923843
epoch:1/10, batch:17/508, loss:0.10398983210325241
epoch:1/10, batch:18/508, loss:0.1288212686777115
epoch:1/10, batch:19/508, loss:0.03824116662144661
epoch:1/10, batch:20/508, loss:0.034473661

epoch:1/10, batch:129/508, loss:0.07693225145339966
epoch:1/10, batch:130/508, loss:0.13850216567516327
epoch:1/10, batch:131/508, loss:0.1792794018983841
epoch:1/10, batch:132/508, loss:0.09958145767450333
epoch:1/10, batch:133/508, loss:0.08008226007223129
epoch:1/10, batch:134/508, loss:0.1331702023744583
epoch:1/10, batch:135/508, loss:0.07687525451183319
epoch:1/10, batch:136/508, loss:0.08159011602401733
epoch:1/10, batch:137/508, loss:0.08214867860078812
epoch:1/10, batch:138/508, loss:0.08653660118579865
epoch:1/10, batch:139/508, loss:0.08022867143154144
epoch:1/10, batch:140/508, loss:0.0887611135840416
epoch:1/10, batch:141/508, loss:0.12374649196863174
epoch:1/10, batch:142/508, loss:0.08957083523273468
epoch:1/10, batch:143/508, loss:0.07674036175012589
epoch:1/10, batch:144/508, loss:0.07492111623287201
epoch:1/10, batch:145/508, loss:0.06743984669446945
epoch:1/10, batch:146/508, loss:0.09372374415397644
epoch:1/10, batch:147/508, loss:0.07385239005088806
epoch:1/10, bat

epoch:1/10, batch:255/508, loss:0.06415783613920212
epoch:1/10, batch:256/508, loss:0.11833930760622025
epoch:1/10, batch:257/508, loss:0.1142667680978775
epoch:1/10, batch:258/508, loss:0.12293071299791336
epoch:1/10, batch:259/508, loss:0.13567987084388733
epoch:1/10, batch:260/508, loss:0.13079805672168732
epoch:1/10, batch:261/508, loss:0.0925537720322609
epoch:1/10, batch:262/508, loss:0.04757543280720711
epoch:1/10, batch:263/508, loss:0.07988748699426651
epoch:1/10, batch:264/508, loss:0.14379945397377014
epoch:1/10, batch:265/508, loss:0.055405762046575546
epoch:1/10, batch:266/508, loss:0.11227753013372421
epoch:1/10, batch:267/508, loss:0.11142270267009735
epoch:1/10, batch:268/508, loss:0.11688664555549622
epoch:1/10, batch:269/508, loss:0.07529149204492569
epoch:1/10, batch:270/508, loss:0.09170898795127869
epoch:1/10, batch:271/508, loss:0.11453507095575333
epoch:1/10, batch:272/508, loss:0.07868916541337967
epoch:1/10, batch:273/508, loss:0.07169362157583237
epoch:1/10, b

epoch:1/10, batch:381/508, loss:0.05859388783574104
epoch:1/10, batch:382/508, loss:0.1103350892663002
epoch:1/10, batch:383/508, loss:0.08043211698532104
epoch:1/10, batch:384/508, loss:0.06245983764529228
epoch:1/10, batch:385/508, loss:0.11905223876237869
epoch:1/10, batch:386/508, loss:0.07976298034191132
epoch:1/10, batch:387/508, loss:0.0702722817659378
epoch:1/10, batch:388/508, loss:0.09807973355054855
epoch:1/10, batch:389/508, loss:0.13630197942256927
epoch:1/10, batch:390/508, loss:0.06581863760948181
epoch:1/10, batch:391/508, loss:0.06663747876882553
epoch:1/10, batch:392/508, loss:0.08755485713481903
epoch:1/10, batch:393/508, loss:0.11267269402742386
epoch:1/10, batch:394/508, loss:0.10319242626428604
epoch:1/10, batch:395/508, loss:0.17395727336406708
epoch:1/10, batch:396/508, loss:0.1303078532218933
epoch:1/10, batch:397/508, loss:0.07981802523136139
epoch:1/10, batch:398/508, loss:0.07730397582054138
epoch:1/10, batch:399/508, loss:0.14149557054042816
epoch:1/10, bat

epoch:1/10, batch:507/508, loss:0.08553814888000488
epoch:1/10, batch:508/508, loss:0.06603405624628067
epoch:2/10, batch:1/508, loss:0.0494852289557457
epoch:2/10, batch:2/508, loss:0.09168995916843414
epoch:2/10, batch:3/508, loss:0.1366259753704071
epoch:2/10, batch:4/508, loss:0.06433810293674469
epoch:2/10, batch:5/508, loss:0.11617773026227951
epoch:2/10, batch:6/508, loss:0.10645077377557755
epoch:2/10, batch:7/508, loss:0.06478830426931381
epoch:2/10, batch:8/508, loss:0.057500630617141724
epoch:2/10, batch:9/508, loss:0.09177706390619278
epoch:2/10, batch:10/508, loss:0.05528249964118004
epoch:2/10, batch:11/508, loss:0.08676209300756454
epoch:2/10, batch:12/508, loss:0.13217806816101074
epoch:2/10, batch:13/508, loss:0.039482150226831436
epoch:2/10, batch:14/508, loss:0.08643946796655655
epoch:2/10, batch:15/508, loss:0.06677962094545364
epoch:2/10, batch:16/508, loss:0.06086498126387596
epoch:2/10, batch:17/508, loss:0.09507714956998825
epoch:2/10, batch:18/508, loss:0.11380

KeyboardInterrupt: 

# Debugging

In [210]:
qids = list(train_idx_set.keys())[:25]
t, b, tl, bl = process_contxt_batch(qids, train_idx_set, batch_first=True)

In [574]:
embedding_layer = EmbeddingLayer(200, 667, 'cnn')
criterion = multi_margin_loss(hidden=embedding_layer.hidden_size)
# embedding_layer.title_hidden = embedding_layer.init_hidden(t.shape[1])
# embedding_layer.body_hidden = embedding_layer.init_hidden(b.shape[1])

In [556]:
title_qs = Variable(torch.FloatTensor(t))
body_qs = Variable(torch.FloatTensor(b))
title_qs.size()

torch.Size([902, 30, 200])

In [575]:
emb = embedding_layer(title_qs, body_qs, tl, bl)

after transpose: torch.Size([902, 200, 30])
after embedding: torch.Size([902, 667, 28])
after transpose: torch.Size([902, 28, 667])
after transpose: torch.Size([28, 902, 667])


In [559]:
# tmask = Variable(torch.FloatTensor(build_mask3d(tl - 3 + 1, np.max(tl) - 3 + 1)))
# bmask = Variable(torch.FloatTensor(build_mask3d(bl - 3 + 1, np.max(bl) - 3 + 1)))

In [None]:
# # te 
# # tmask = Variable(torch.FloatTensor(build_mask3d(tl - 3 + 1, np.max(tl) - 3 + 1)))
# # bmask = Variable(torch.FloatTensor(build_mask3d(bl - 3 + 1, np.max(bl) - 3 + 1)))
# temb = (torch.sum(torch.transpose(te, 0,1) * tmask, dim=0) / torch.sum(tmask, dim=0))
# bemb = (torch.sum(torch.transpose(be, 0,1) * bmask, dim=0) / torch.sum(bmask, dim=0))
# emb = (temb + bemb) / 2
# # torch.sum(te * tmask, dim=1)[17*22: 18*22]
# # te[17*22:18*22] * tmask[17*22:18*22]
# # tmask[17*22:18*22], 
# # tl[17*22:18*22]
# criterion(emb)

In [175]:
target = Variable(torch.LongTensor([1,1] + [0]*20))

In [176]:
loss = torch.nn.MultiMarginLoss()

In [179]:
loss(embeddings[:22], target)

Variable containing:
 1.0260
[torch.FloatTensor of size 1]

In [196]:
blocked_embeddings = embeddings.view(-1, 22, 240)

In [198]:
q_vecs = blocked_embeddings[:,0,:]
pos_vecs = blocked_embeddings[:,1,:]
neg_vecs = blocked_embeddings[:,2:,:]

In [199]:
pos_scores = torch.sum(q_vecs * pos_vecs, dim=1) / (torch.sqrt(torch.sum(q_vecs ** 2, dim=1)) \
                                    * torch.sqrt(torch.sum(pos_vecs ** 2, dim=1)))
neg_scores = torch.sum(torch.unsqueeze(q_vecs, dim=1) * neg_vecs, dim=2) \
/ (torch.unsqueeze(torch.sqrt(torch.sum(q_vecs ** 2, dim=1)), dim=1) * torch.sqrt(torch.sum( neg_vecs ** 2, dim=2)))

In [203]:
pos_scores[0], neg_scores[0]

(Variable containing:
  0.9956
 [torch.FloatTensor of size 1], Variable containing:
  0.9919
  0.9907
  0.9932
  0.9865
  0.9866
  0.9890
  0.9923
  0.9835
  0.9876
  0.9871
  0.9884
  0.9846
  0.9868
  0.9855
  0.9879
  0.9884
  0.9886
  0.9864
  0.9892
  0.9893
 [torch.FloatTensor of size 20])

In [223]:
q_vecs = blocked_embeddings[:,0,:]

In [224]:
pn_vecs = blocked_embeddings[:,1:,:]

In [225]:
scores = torch.sum(torch.unsqueeze(q_vecs, dim=1) * pn_vecs, dim=2) \
/ (torch.unsqueeze(torch.sqrt(torch.sum(q_vecs ** 2, dim=1)), dim=1) * torch.sqrt(torch.sum( pn_vecs ** 2, dim=2)))

In [229]:
scores

Variable containing:

Columns 0 to 9 
 0.9956  0.9919  0.9907  0.9932  0.9865  0.9866  0.9890  0.9923  0.9835  0.9876
 0.9903  0.9830  0.9841  0.9789  0.9809  0.9809  0.9718  0.9705  0.9728  0.9721
 0.9930  0.9897  0.9855  0.9897  0.9890  0.9820  0.9798  0.9854  0.9760  0.9834
 0.9867  0.9819  0.9823  0.9876  0.9819  0.9851  0.9798  0.9759  0.9894  0.9874
 0.9920  0.9864  0.9870  0.9819  0.9849  0.9806  0.9822  0.9809  0.9870  0.9651
 0.9929  0.9910  0.9900  0.9828  0.9876  0.9922  0.9914  0.9906  0.9916  0.9895
 0.9964  0.9934  0.9812  0.9916  0.9866  0.9881  0.9790  0.9898  0.9876  0.9864
 0.9941  0.9806  0.9755  0.9790  0.9821  0.9760  0.9773  0.9871  0.9782  0.9830
 0.9853  0.9871  0.9801  0.9824  0.9804  0.9677  0.9870  0.9842  0.9805  0.9844
 0.9833  0.9861  0.9755  0.9871  0.9849  0.9815  0.9762  0.9896  0.9809  0.9874
 0.9978  0.9925  0.9840  0.9818  0.9911  0.9924  0.9908  0.9857  0.9918  0.9911
 0.9929  0.9882  0.9847  0.9853  0.9857  0.9858  0.9930  0.9908  0.9923  0.9881
 0

In [230]:
criterion = torch.nn.MultiMarginLoss()

In [234]:
target = Variable(torch.zeros(scores.size(0)).type(torch.LongTensor)) 
criterion(scores, target)

Variable containing:
 0.9469
[torch.FloatTensor of size 1]