In [1]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

In [2]:
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))

w2i_map = {}
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
for i, (key, val) in enumerate(w2v_map.items()):
    w2i_map[key] = i
    w2v_matrix[i] = val

def w2v(w):
    return w2v_matrix[w2i_map[w]]

def sen2w(sen):
    processed = []
    sen = sen.strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        #ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            if len(set(separated)) == 1:
                continue
            if separated.count('*') > 3 or separated.count('=') > 3:
                continue
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [3]:
# fixed context len = 125
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        context = line.strip().split('\t')
        qid = context.pop(0)
        if len(context) == 1:
            context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
        else:
            context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])}

In [4]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])
train_idx_set = build_set_pair_with_idx(train_df)

In [5]:
train_df.head()

Unnamed: 0,Q,Q+,Q-
0,262144,211039,227387 413633 113297 356390 256881 145638 2962...
1,491522,65911,155119 402211 310669 383107 131731 299465 1633...
2,240299,168608 390642,368007 70009 48077 376760 438005 228888 142340...
3,196614,205184,334471 163710 376791 441664 159963 406360 4300...
4,360457,321532,151863 501857 217578 470017 125838 31836 42066...


In [6]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
    title_v = np.zeros( (len(title), 200) )
    
    for i, t in enumerate(title):
        title_v[i] = w2v(t)
    
    if len(body) > 0:
        body_v = np.zeros( (len(body), 200) )
        for i, b in enumerate(body):
            body_v[i] = w2v(b)
    
        return title_v, body_v
    
    return title_v, None

In [190]:
def process_contxt_batch(qids, idx_set):
    
    batch_title, batch_body = [], []
    max_title_len, max_body_len = 0, 0
    title_len, body_len = [], []
    counter = 0
    y = []
    
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']

        for qid_pos in q_pos:

            # query Q
            title_len += [len(q_title)]
            batch_title += [ q_title ]
            max_title_len = max(max_title_len, len(q_title))
            if not q_body:
                body_len += [len(q_title)]
                batch_body += [ q_title ]
            else:
                batch_body += [ q_body ]
                body_len += [len(q_body)]
                max_body_len = max(max_body_len, len(q_body))
            y += [1]
            # pos Q
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            title_len += [len(title)]
            batch_title += [ title ]
            max_title_len = max(max_title_len, len(title))
            if not body:
                body_len += [len(title)]
                batch_body += [ title ]
            else:
                batch_body += [ body ]
                body_len += [len(body)]
                max_body_len = max(max_body_len, len(body))
            y += [1]
            # neg Q
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                title_len += [len(title)]
                batch_title += [ title ]
                max_title_len = max(max_title_len, len(title))
                if not body:
                    body_len += [len(title)]
                    batch_body += [ title ]
                else:
                    batch_body += [ body ]
                    body_len += [len(body)]
                    max_body_len = max(max_body_len, len(body))
                y += [0]
    # (max_seq_len, batch_size, feature_len)
    padded_batch_title = np.zeros(( max_title_len, len(batch_title), 200)) 
    padded_batch_body = np.zeros(( max_body_len, len(batch_body),  200))
    
    for i, (title, body) in enumerate(zip(batch_title, batch_body)):
        title_repre, body_repre = contxt2vec(title, body)
        padded_batch_title[:title_len[i], i] = title_repre
        padded_batch_body[:body_len[i], i] = body_repre
    #np.array(y).reshape(-1,1)
    return padded_batch_title, padded_batch_body, y,\
                np.array(title_len).reshape(-1,1), np.array(body_len).reshape(-1,1)

# Train Utility

In [191]:
def build_mask(seq_len):
    mask = []
    for i, s in enumerate(seq_len):
        s_mask = np.zeros((np.max(seq_len), 1))
        s_mask[:int(s)] = np.ones((int(s), 1))
        mask += [s_mask]
    return mask

def build_mask3d(seq_len):
    mask = np.zeros((np.max(seq_len), len(seq_len), 1))
    for i, s in enumerate(seq_len):
        mask[:int(s), i] = np.ones((int(s), 1))
    return mask

# def cos_sim(qv, qv_):
#     return torch.sum(qv * qv_, dim=1) / (torch.sqrt(torch.sum(qv ** 2, dim=1)) * torch.sqrt(torch.sum(qv_ ** 2, dim=1)))

def criterion(embeddings):
    
    # a batch of embeddings
#     num_block = embeddings.size()[0] // 22
    blocked_embeddings = embeddings.view(-1, 22, 240)
    q_vecs = blocked_embeddings[:,0,:]
    pos_vecs = blocked_embeddings[:,1,:]
    neg_vecs = blocked_embeddings[:,2:,:]
    
    pos_scores = torch.sum(q_vecs * pos_vecs, dim=1) / (torch.sqrt(torch.sum(q_vecs ** 2, dim=1)) \
                                               * torch.sqrt(torch.sum(pos_vecs ** 2, dim=1)))

    neg_scores = torch.sum(torch.unsqueeze(q_vecs, dim=1) * neg_vecs, dim=2) \
    / (torch.unsqueeze(torch.sqrt(torch.sum(q_vecs ** 2, dim=1)),dim=1) * torch.sqrt(torch.sum( neg_vecs ** 2, dim=2)))
    neg_scores = torch.max(neg_scores, dim=1)[0]
    
    print (torch.mean(neg_scores - pos_scores))
    
    diff = neg_scores - pos_scores + 1.
    loss = torch.mean((diff > 0).float() * diff)
#     for i in range(num_block):
#         block_embeddings = embeddings[ i * 22: (i + 1) * 22 ]
#         qs = block_embeddings[0]
#         qs_ = block_embeddings[1: 22]
#         cos_scores = cos_sim(qs.expand(21, 240), qs_)
#         pos_score = cos_scores[0]
#         neg_score = torch.max(cos_scores[1:])[0]
#         diff = neg_score - pos_score + 1 # margin=1
#         if diff.data[0] > 0:
#             loss += diff
            
    return loss #loss / num_block

# Model

In [192]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type, kernel_size=None):
        
        super(EmbeddingLayer, self).__init__()
        
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        
        if layer_type == 'lstm':
            
            self.layer_type = 'lstm'
            #self.title_embedding_layer = nn.LSTM(input_size, hidden_size)
            #self.body_embedding_layer = nn.LSTM(input_size, hidden_size)
            self.embedding_layer = nn.LSTM(input_size, hidden_size)
            self.tanh = nn.Tanh()
        
        elif layer_type == 'cnn':
            self.layer_type = 'cnn'
            self.embedding_layer = nn.Sequential(
                        nn.Conv1d(in_channels = 200,
                                  out_channels = self.hidden_size,
                                  kernel_size = kernel_size),
                        nn.Tanh())

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(1, batch_size, self.hidden_size)), \
                Variable(torch.zeros(1, batch_size, self.hidden_size)))

    def forward(self, title, body, title_len, body_len):
            
        if self.layer_type == 'lstm':
            
            
            title_lstm_out, self.title_hidden = self.embedding_layer(title, (self.tanh(self.title_hidden[0]), \
                                                                   self.tanh(self.title_hidden[1])))
            
            body_lstm_out, self.body_hidden = self.embedding_layer(body, (self.tanh(self.body_hidden[0]), \
                                                                   self.tanh(self.body_hidden[1])))
            
            
            title_mask = Variable(torch.FloatTensor(build_mask3d(title_len)))
            title_embeddings = torch.sum(title_lstm_out * title_mask, dim=0) / torch.sum(title_mask, dim=0)
            
            body_mask = Variable(torch.FloatTensor(build_mask3d(body_len)))
            body_embeddings = torch.sum(body_lstm_out * body_mask, dim=0) / torch.sum(body_mask, dim=0)
            
            embeddings = ( title_embeddings + body_embeddings ) / 2
        
            return embeddings

# Train

In [None]:
def train(layer_type, batch_size=25, num_epoch=100, id_set=train_idx_set):
    
    if layer_type == 'lstm':
        embedding_layer = EmbeddingLayer(200, 240, 'lstm')
    elif layer_type == 'cnn':
        embedding_layer = EmbeddingLayer(200, 240, 'cnn', kernel_size=3)
        
    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.005)
    criterion = torch.nn.MultiMarginLoss()
    
    qids = list(id_set.keys())
    num_batch = len(qids) // batch_size
    
    for epoch in range(1, num_epoch + 1):
        
        for batch_idx in range(1, num_batch + 1):
            
            batch_x_qids = qids[ ( batch_idx - 1 ) * batch_size: batch_idx * batch_size ]
            batch_title, batch_body, y, title_len, body_len = process_contxt_batch(batch_x_qids, train_idx_set)
            
            if layer_type == 'lstm':
                embedding_layer.title_hidden = embedding_layer.init_hidden(batch_title.shape[1])
                embedding_layer.body_hidden = embedding_layer.init_hidden(batch_body.shape[1])
            
            title_qs = Variable(torch.FloatTensor(batch_title))
            body_qs = Variable(torch.FloatTensor(batch_body))
            
            embeddings = embedding_layer(title_qs, body_qs, title_len, body_len)
            target = Variable(torch.LongTensor(y))
            
            optimizer.zero_grad()
            loss = criterion(embeddings, target)

            print ('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, batch_idx, num_batch, loss.data[0]))
            
            loss.backward()
            optimizer.step()

In [195]:
train('lstm', num_epoch=10)

-------------------------------
epoch:1/10, batch:1/198, loss:0.9868218898773193
-------------------------------
-------------------------------
epoch:1/10, batch:2/198, loss:0.8651137351989746
-------------------------------
-------------------------------
epoch:1/10, batch:3/198, loss:0.635169267654419
-------------------------------
-------------------------------
epoch:1/10, batch:4/198, loss:0.44764813780784607
-------------------------------


KeyboardInterrupt: 

In [154]:
torch.nn.MultiMarginLoss()

MultiMarginLoss (
)

In [155]:
qids = list(train_idx_set.keys())[:25]
t, b, tl, bl = process_contxt_batch(batch_x_qids, train_idx_set)

In [157]:
embedding_layer = EmbeddingLayer(200, 240, 'lstm')
embedding_layer.title_hidden = embedding_layer.init_hidden(t.shape[1])
embedding_layer.body_hidden = embedding_layer.init_hidden(b.shape[1])

In [158]:
title_qs = Variable(torch.FloatTensor(t))
body_qs = Variable(torch.FloatTensor(b))
embeddings = embedding_layer(title_qs, body_qs, tl, bl)

In [175]:
target = Variable(torch.LongTensor([1,1] + [0]*20))

In [176]:
loss = torch.nn.MultiMarginLoss()

In [179]:
loss(embeddings[:22], target)

Variable containing:
 1.0260
[torch.FloatTensor of size 1]

In [178]:
embeddings

Variable containing:
-3.1115e-02 -7.0960e-03  2.7456e-02  ...  -3.1113e-02 -1.8998e-03 -6.6966e-03
-3.2506e-02 -8.2851e-03  3.1229e-02  ...  -2.8928e-02  1.6568e-04 -7.4809e-03
-3.2401e-02 -1.1434e-02  3.2729e-02  ...  -2.8260e-02  1.9398e-03 -8.8022e-03
                ...                   ⋱                   ...                
-3.4694e-02 -5.4238e-03  2.4188e-02  ...  -2.7358e-02  3.9551e-03 -1.2894e-02
-3.6714e-02 -7.3036e-03  3.6655e-02  ...  -3.0296e-02 -5.7206e-03 -1.5181e-03
-3.5976e-02 -7.6397e-03  3.0960e-02  ...  -2.5265e-02 -7.2806e-03 -7.7941e-03
[torch.FloatTensor of size 902x240]