In [1]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

In [105]:
# Parse the pruned word-vector dump: every line is "<word> <float> <float> ...".
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    for line in src.read().strip().split('\n'):
        word, *wv = line.strip().split(' ')
        w2v_map[word] = np.array([float(x) for x in wv])

# Row number of every word, following the insertion order of w2v_map.
w2i_map = {key: i for i, key in enumerate(w2v_map)}

# Dense (vocabulary size, 200) matrix; row i holds the vector of the i-th word.
w2v_matrix = np.zeros((len(w2v_map), 200))
for row, val in enumerate(w2v_map.values()):
    w2v_matrix[row] = val

def w2v(w):
    """Look up the embedding of word `w`.

    The word is resolved to its row number through `w2i_map` and that row of
    `w2v_matrix` is returned. A word missing from `w2i_map` raises KeyError.
    """
    row = w2i_map[w]
    return w2v_matrix[row]

def sen2w(sen):
    """Reduce a whitespace-tokenized sentence to the words known to `w2i_map`.

    Only the first 100 whitespace-separated tokens are considered. Tokens that
    start like a date (digits-digits-digits) or a time (digits:digits) are
    dropped. A token found in `w2i_map` is kept unchanged; any other token is
    split into runs of letters, runs of digits and single symbols, and the
    pieces that are in `w2i_map` are kept. Such a token is skipped entirely
    when all of its pieces are identical or when it contains more than three
    '*' or more than three '=' pieces.

    Returns the kept words as a list, in their original order.
    """
    kept = []
    for tok in sen.strip().split()[:100]:
        # date-like or time-like token: ignore
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', tok) or re.match(r'\d{1,}:\d{1,}', tok):
            continue

        if tok in w2i_map:
            kept.append(tok)
            continue

        pieces = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", tok)
        if len(set(pieces)) == 1:
            continue
        if pieces.count('*') > 3 or pieces.count('=') > 3:
            continue
        kept.extend(piece for piece in pieces if piece in w2i_map)
    return kept

In [106]:
# Tokenized questions keyed by integer id: {'t': title words, 'b': body words or None}.
# Each input line is "<qid>\t<title>" or "<qid>\t<title>\t<body>".
# NOTE(review): this cell used to be labelled "fixed context len = 125", while sen2w
# caps each field at its first 100 raw tokens -- confirm which figure is current.
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    for line in src.read().strip().split('\n'):
        qid, *fields = line.strip().split('\t')
        title_words = sen2w(fields[0])
        body_words = None if len(fields) == 1 else sen2w(fields[1])
        context_repre[int(qid)] = {'t': title_words, 'b': body_words}

In [15]:
def build_set_pair_with_idx(df):
    """Index the positive and negative candidate ids of every query.

    `df` must provide the columns 'Q', 'Q+' and 'Q-', where 'Q+' and 'Q-' are
    strings of integer ids separated by single spaces.

    Returns a dict mapping each 'Q' value to
    {'pos': int array parsed from 'Q+', 'neg': int array parsed from 'Q-'}.
    """
    def parse_ids(field):
        return np.array([int(tok) for tok in field.split(' ')])

    pairs = {}
    for _, rec in df.iterrows():
        pairs[rec['Q']] = {'pos': parse_ids(rec['Q+']), 'neg': parse_ids(rec['Q-'])}
    return pairs

# Training triples, tab-separated: query id, positive ids, negative ids.
train_df = pd.read_csv(
    'data/train_random.txt',
    delimiter='\t',
    header=None,
    names=['Q', 'Q+', 'Q-'],
)
train_idx_set = build_set_pair_with_idx(train_df)

In [16]:
def contxt2vec(title, body=None):
    """Turn the word lists of a question into per-word embedding matrices.

    Parameters
    ----------
    title : sequence of str
        Title words; each one is looked up with `w2v`.
    body : sequence of str or None, optional
        Body words. None is treated like an empty body.

    Returns
    -------
    (title_v, body_v)
        `title_v` is a (len(title), 200) float array with one row per word.
        `body_v` is the corresponding (len(body), 200) array, or None when the
        body is missing or empty.
    """
    def embed(words):
        vectors = np.zeros((len(words), 200))
        for i, word in enumerate(words):
            vectors[i] = w2v(word)
        return vectors

    title_v = embed(title)

    # Identity test instead of `== None`: the equality form is evaluated
    # element-wise when an array is passed and cannot serve as a condition.
    if body is None or len(body) == 0:
        return title_v, None

    return title_v, embed(body)

In [113]:
def process_contxt_batch(qids, idx_set):
    """Assemble one padded batch of word embeddings for training.

    For every query id in `qids` and each of its positive ids, 22 questions
    are appended in this order: the query, the positive question, then 20
    negatives sampled from the query's negative ids. Sampling uses
    `np.random.choice` with its default replacement, so a negative can be
    drawn more than once. A question without body words reuses its title
    words as body.

    Parameters
    ----------
    qids : iterable of int
        Query ids; each must be a key of `idx_set` and of `context_repre`.
    idx_set : dict
        Maps a query id to {'pos': ids, 'neg': ids}; 'neg' must support
        indexing with an integer array.

    Returns
    -------
    padded_batch_title : ndarray, (longest title, batch, 200)
    padded_batch_body : ndarray, (longest body, batch, 200)
        Zero-padded word embeddings, time-major.
    title_len, body_len : ndarray, (batch, 1)
        Number of real (unpadded) rows of every batch entry.
    """
    num_neg = 20

    batch_title, batch_body = [], []

    def add_question(question_id):
        """Append the title and (body or title fallback) words of one question."""
        words = context_repre[question_id]
        title, body = words['t'], words['b']
        batch_title.append(title)
        batch_body.append(body if body else title)

    for qid in qids:
        for qid_pos in idx_set[qid]['pos']:
            add_question(qid)
            add_question(qid_pos)

            q_neg = idx_set[qid]['neg']
            # Sample over the actual pool size instead of assuming 100 negatives.
            sample = np.random.choice(len(q_neg), size=num_neg)
            for qid_neg in q_neg[sample]:
                add_question(qid_neg)

    title_len = [len(title) for title in batch_title]
    body_len = [len(body) for body in batch_body]

    # Derive the padded lengths from the final lists, so that a title standing
    # in for a missing body is counted as well. Previously that case did not
    # update the maximum and the body array could be too short for it.
    max_title_len = max(title_len, default=0)
    max_body_len = max(body_len, default=0)

    # (max_seq_len, batch_size, feature_len)
    padded_batch_title = np.zeros((max_title_len, len(batch_title), 200))
    padded_batch_body = np.zeros((max_body_len, len(batch_body), 200))

    for i, (title, body) in enumerate(zip(batch_title, batch_body)):
        title_repre, body_repre = contxt2vec(title, body)
        padded_batch_title[:title_len[i], i] = title_repre
        # contxt2vec returns None for an empty body; that column stays all-zero.
        if body_repre is not None:
            padded_batch_body[:body_len[i], i] = body_repre

    return (padded_batch_title, padded_batch_body,
            np.array(title_len).reshape(-1, 1), np.array(body_len).reshape(-1, 1))

# Train Utility

In [203]:
def build_mask(seq_len):
    """Build one 0/1 padding mask per sequence.

    Parameters
    ----------
    seq_len : array-like of int
        Sequence lengths, flat or as a column such as the (batch, 1) length
        arrays returned by `process_contxt_batch`.

    Returns
    -------
    list of numpy.ndarray
        One float array of shape (max(seq_len), 1) per sequence, holding ones
        in its first `length` rows and zeros below. Empty input gives [].
    """
    # Flatten once so every length is a plain integer; this also avoids
    # converting 1-element arrays to int one by one.
    lengths = np.asarray(seq_len).reshape(-1)
    if lengths.size == 0:
        return []
    lengths = lengths.astype(int)

    # The longest length is the same for every row, so compute it only once.
    max_len = int(lengths.max())
    steps = np.arange(max_len).reshape(1, -1, 1)
    mask = (steps < lengths.reshape(-1, 1, 1)).astype(float)
    return list(mask)

def cos_sim(qv, qv_, eps=1e-8):
    """Row-wise cosine similarity of two equally shaped 2-D tensors.

    Parameters
    ----------
    qv, qv_ : tensors of shape (rows, features)
    eps : float, optional
        Lower bound applied to each row norm. It keeps the result finite (0)
        when a row is all zeros instead of producing NaN from 0 / 0.

    Returns
    -------
    Tensor with one similarity value per row.
    """
    dot = torch.sum(qv * qv_, dim=1)
    # Clamp the squared norms before the square root so that neither the
    # value nor the gradient blows up for an all-zero row.
    norm_a = torch.sqrt(torch.clamp(torch.sum(qv ** 2, dim=1), min=eps * eps))
    norm_b = torch.sqrt(torch.clamp(torch.sum(qv_ ** 2, dim=1), min=eps * eps))
    return dot / (norm_a * norm_b)

def criterion(embeddings, block_size=22, margin=1):
    """Max-margin ranking loss over blocks of question embeddings.

    `embeddings` is read in consecutive blocks of `block_size` rows. In each
    block row 0 is the query, row 1 the positive question and the remaining
    rows are negatives. The block loss is
    max(0, max_negative_cosine - positive_cosine + margin); the result is the
    mean over all complete blocks. Rows after the last complete block are
    ignored.

    Parameters
    ----------
    embeddings : tensor of shape (rows, hidden)
    block_size : int, optional
        Rows per block; 22 matches the layout built by `process_contxt_batch`.
    margin : number, optional

    Returns
    -------
    Tensor holding the mean loss. It is always a tensor, including when no
    block violates the margin, so it can be backpropagated unconditionally.

    Raises
    ------
    ValueError
        If `block_size` is below 3 or `embeddings` has no complete block.
    """
    if block_size < 3:
        raise ValueError('block_size must be at least 3 (query, positive, negative)')

    num_block = embeddings.size()[0] // block_size
    if num_block == 0:
        raise ValueError(
            'need at least {} embeddings for one block, got {}'.format(
                block_size, embeddings.size()[0]))

    loss = 0
    for i in range(num_block):
        block = embeddings[i * block_size: (i + 1) * block_size]
        query, candidates = block[0], block[1:]
        # Repeat the query to the candidates' shape; no hard-coded hidden size.
        cos_scores = cos_sim(query.unsqueeze(0).expand_as(candidates), candidates)
        pos_score = cos_scores[0]
        neg_score = torch.max(cos_scores[1:])
        # Hinge via clamp: same value as adding the difference only when it is
        # positive, but the accumulated loss stays a tensor in every case.
        loss = loss + torch.clamp(neg_score - pos_score + margin, min=0)

    return loss / num_block

# Model

In [187]:
class EmbeddingLayer(nn.Module):
    """Encode a padded, time-major batch of word vectors into one vector per sequence.

    The encoder is an LSTM ('lstm') or a Conv1d followed by Tanh ('cnn'). In
    both cases the per-step outputs are averaged over the real (unpadded)
    steps of every sequence.

    Parameters
    ----------
    input_size : int
        Size of one word vector.
    hidden_size : int
        Size of the produced embedding.
    layer_type : {'lstm', 'cnn'}
    kernel_size : int, optional
        Convolution width; required for 'cnn'.
    """

    def __init__(self, input_size, hidden_size, layer_type, kernel_size=None):

        super(EmbeddingLayer, self).__init__()

        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        # LSTM state (h, c) used as the starting state of the next forward call.
        # Callers may assign it (see init_hidden); forward creates a zero state
        # when it is unset or sized for a different batch.
        self.hidden = None

        if layer_type == 'lstm':
            self.layer_type = 'lstm'
            self.embedding_layer = nn.LSTM(input_size, hidden_size)
            self.tanh = nn.Tanh()
        elif layer_type == 'cnn':
            if kernel_size is None:
                raise ValueError("layer_type 'cnn' requires kernel_size")
            self.layer_type = 'cnn'
            self.embedding_layer = nn.Sequential(
                        # in_channels follows input_size instead of a fixed 200
                        nn.Conv1d(in_channels = input_size,
                                  out_channels = self.hidden_size,
                                  kernel_size = kernel_size),
                        nn.Tanh())
        else:
            raise ValueError(
                "layer_type must be 'lstm' or 'cnn', got {!r}".format(layer_type))

    def init_hidden(self, batch_size):
        """Return a zero (h, c) state pair, each of shape (1, batch_size, hidden_size)."""
        return (Variable(torch.zeros(1, batch_size, self.hidden_size)), \
                Variable(torch.zeros(1, batch_size, self.hidden_size)))

    def _length_mask(self, valid_len, features):
        """Return a (steps, batch, 1) 0/1 mask with ones on the first `valid_len[b]` steps."""
        steps = np.arange(features.size()[0]).reshape(-1, 1, 1)
        lengths = np.asarray(valid_len).reshape(1, -1, 1)
        mask = (steps < lengths).astype(np.float32)
        return Variable(torch.from_numpy(mask)).type_as(features)

    def forward(self, context, seq_len):
        """Embed a batch.

        Parameters
        ----------
        context : tensor of shape (max_seq_len, batch, input_size)
            Zero-padded word vectors, time-major.
        seq_len : array-like of int
            Real length of every sequence, flat or of shape (batch, 1).

        Returns
        -------
        Tensor of shape (batch, hidden_size): masked mean of the encoder outputs.
        """
        if self.layer_type == 'lstm':
            batch_size = context.size()[1]
            hidden = self.hidden
            if hidden is None or hidden[0].size()[1] != batch_size:
                hidden = self.init_hidden(batch_size)
            # NOTE(review): the state left by the previous call seeds this one,
            # so two consecutive calls (title, then body) are not independent.
            # Confirm that this carry-over is intended.
            features, self.hidden = self.embedding_layer(
                context, (self.tanh(hidden[0]), self.tanh(hidden[1])))
            valid_len = np.asarray(seq_len).reshape(-1)
        else:
            # Conv1d expects (batch, channels, steps); afterwards go back to
            # (steps, batch, hidden) so both encoders share the pooling code.
            conv_in = context.permute(1, 2, 0).contiguous()
            features = self.embedding_layer(conv_in).permute(2, 0, 1)
            # A sequence of length n yields n - kernel_size + 1 windows that
            # contain no padding.
            valid_len = np.asarray(seq_len).reshape(-1) - self.kernel_size + 1

        # The mask is built directly in (steps, batch, 1) layout. Building it
        # batch-major and calling .view() only reinterprets memory, it does
        # not swap the axes, which is how the mask used to get scrambled.
        mask = self._length_mask(valid_len, features)
        embeddings = torch.sum(features * mask, dim=0) / (torch.sum(mask, dim=0) + 1e-8)

        return embeddings

# Train

In [189]:
def train(layer_type, batch_size=25, num_epoch=100):
    """Train an EmbeddingLayer on `train_idx_set` and return it.

    The query ids of `train_idx_set` are consumed in consecutive slices of
    `batch_size`; a trailing incomplete slice is skipped. For each slice the
    title and body batches are encoded, averaged, scored with `criterion` and
    optimised with Adam (lr=0.005). Timing and loss are printed per batch.

    Parameters
    ----------
    layer_type : {'lstm', 'cnn'}
    batch_size : int, optional
        Number of query ids per batch.
    num_epoch : int, optional

    Returns
    -------
    EmbeddingLayer
        The trained encoder. Earlier nothing was returned, so the model was
        lost as soon as the function finished.

    Raises
    ------
    ValueError
        If `layer_type` is neither 'lstm' nor 'cnn'.
    """
    if layer_type == 'lstm':
        embedding_layer = EmbeddingLayer(200, 240, 'lstm')
    elif layer_type == 'cnn':
        embedding_layer = EmbeddingLayer(200, 240, 'cnn', kernel_size=3)
    else:
        raise ValueError("layer_type must be 'lstm' or 'cnn', got {!r}".format(layer_type))

    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.005)

    qids = list(train_idx_set.keys())
    num_batch = len(qids) // batch_size

    for epoch in range(1, num_epoch + 1):

        for batch_idx in range(1, num_batch + 1):

            batch_x_qids = qids[(batch_idx - 1) * batch_size: batch_idx * batch_size]
            start = time.time()
            print('processing batch {}'.format(batch_idx))
            batch_title, batch_body, title_len, body_len = process_contxt_batch(batch_x_qids, train_idx_set)
            print('processing batch costs:', time.time() - start)

            if layer_type == 'lstm':
                # fresh zero state for every batch
                embedding_layer.hidden = embedding_layer.init_hidden(batch_title.shape[1])

            start = time.time()

            title_qs = Variable(torch.FloatTensor(batch_title))
            body_qs = Variable(torch.FloatTensor(batch_body))

            title_embeddings = embedding_layer(title_qs, title_len)
            body_embeddings = embedding_layer(body_qs, body_len)

            contxt_embeddings = (title_embeddings + body_embeddings) / 2
            print('embedding costs:', time.time() - start)

            optimizer.zero_grad()
            loss = criterion(contxt_embeddings)

            # criterion may hand back a plain number when no block violates
            # the margin; there is nothing to backpropagate in that case.
            trainable = hasattr(loss, 'backward')
            # view(-1)[0] reads the value from a 1-element as well as a 0-dim loss.
            loss_value = float(loss.data.view(-1)[0]) if trainable else float(loss)

            print('-------------------------------')
            print('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, batch_idx, num_batch, loss_value))
            print('-------------------------------')

            if trainable:
                start = time.time()
                loss.backward()
                print('backprop costs:', time.time() - start)
                optimizer.step()

    return embedding_layer

In [190]:
# Start LSTM training with the defaults of train (batch_size=25, num_epoch=100).
# The recorded output below stops with a KeyboardInterrupt while processing batch 5.
train('lstm')

processing batch 1
processing batch costs: 0.49100685119628906
embedding costs: 10.933600902557373
-------------------------------
epoch:1/100, batch:1/508, loss:1.000967025756836
-------------------------------
backprop costs: 8.534204244613647
processing batch 2
processing batch costs: 1.0649118423461914
embedding costs: 7.769730091094971
-------------------------------
epoch:1/100, batch:2/508, loss:1.000191330909729
-------------------------------
backprop costs: 5.0320963859558105
processing batch 3
processing batch costs: 0.8480250835418701
embedding costs: 13.439337968826294
-------------------------------
epoch:1/100, batch:3/508, loss:1.000032901763916
-------------------------------
backprop costs: 9.811716079711914
processing batch 4
processing batch costs: 1.7102789878845215
embedding costs: 8.348220825195312
-------------------------------
epoch:1/100, batch:4/508, loss:1.0000029802322388
-------------------------------
backprop costs: 5.67284893989563
processing batch 5
p

KeyboardInterrupt: 

In [36]:
# First 25 query ids (the default batch size of train) for the manual checks below.
qids = list(train_idx_set.keys())[0:25]

In [168]:
# Padded title/body embedding arrays plus the per-question length columns for these ids.
batch_title, batch_body, title_len, body_len = process_contxt_batch(qids, train_idx_set)

In [169]:
# Wrap the numpy batches as float tensors, the same way train does before encoding.
title_qs = Variable(torch.FloatTensor(batch_title))
body_qs = Variable(torch.FloatTensor(batch_body))

In [178]:
# Untrained LSTM encoder with a zero initial state sized to the batch.
embedding_layer = EmbeddingLayer(200, 240, 'lstm')
embedding_layer.hidden = embedding_layer.init_hidden(batch_title.shape[1])

# NOTE(review): `hidden` is not reset between the two calls, so the body pass
# starts from the state that the title pass stored in `embedding_layer.hidden`.
title_embeddings = embedding_layer(title_qs, title_len)
body_embeddings = embedding_layer(body_qs, body_len)

In [179]:
# Question embedding = mean of title and body embeddings (same combination as in train).
c = (title_embeddings + body_embeddings) / 2

In [180]:
# Last block of 22 rows: query, its positive, then 20 sampled negatives.
c[-22:]

Variable containing:
1.00000e-02 *
-0.0380  1.2984  0.2760  ...  -2.6460  2.4250 -0.6759
 0.2273  1.2945  0.4926  ...  -2.7629  2.4335 -0.6174
 0.0910  1.7504  0.5326  ...  -2.3675  2.3776 -0.8221
          ...             ⋱             ...          
 0.1120  1.7249  0.7047  ...  -2.5319  2.4124 -0.5000
 0.4159  1.1858  0.5571  ...  -2.8192  2.3473 -0.1939
-0.2448  1.8618  0.8149  ...  -2.5857  2.4927 -0.8622
[torch.FloatTensor of size 22x240]

In [98]:
# NOTE(review): the output recorded under this cell is a list of 25 ids, not the
# dict that build_set_pair_with_idx returns -- the output looks stale; re-run to confirm.
train_idx_set

[262144,
 491522,
 240299,
 196614,
 360457,
 425996,
 163842,
 393230,
 393231,
 491536,
 327698,
 121116,
 397172,
 294938,
 32795,
 32798,
 33,
 34,
 229411,
 37,
 294951,
 496988,
 196650,
 294955,
 73107]

In [132]:
# Positive and negative candidate ids stored for query 73107.
train_idx_set[73107]

{'neg': array([ 66935, 426300,  70518, 232599, 523074, 210772, 229721, 421797,
        124315,  69105, 248750,  70052, 206788, 248606, 294643,   2340,
         57833, 237004, 245933, 256280, 346767,   5084,  16776,  55312,
        100586,  53443, 136515, 132076, 340324, 157574, 378966,  94040,
        342049, 286386, 256085, 406437, 426281, 367553, 416229, 276123,
        397066, 116051, 231825, 306014, 253084,  29443, 110394,   6424,
        407490,  53489, 489072, 205811,  21303, 377515, 208037,  66606,
        206277, 246931, 335508, 507425, 393560, 429556, 352150, 331768,
         67155,   6307, 192036, 396483, 315431, 376003, 448796,  87677,
        391839, 160160, 278122, 360817, 510617,  73323, 331227,  70037,
        417418, 367338,  92541, 520528, 237451, 230574, 248240, 322035,
        445612, 124040, 516399, 150639, 453936, 405564, 452362, 326452,
         82445, 167025, 104373, 486989]), 'pos': array([82395])}

In [112]:
# Body words of question 73107 as kept by sen2w, joined back into text.
' '.join(context_repre[73107]['b'])

'after upgrading to 11.10 i ca n t change the launcher icon size to smaller . i tried to change it using ccsm like described in how can i configure unity but the changes take no effects restarted tried sudo - still big icons . in the previous version of ubuntu this solution worked . how can i change the icon size btw how can i know which unity 2d vd 3d i running and switch between . edit looks like i using unity 2d'

In [135]:
# Body words of question 407490, one of the negative ids listed for query 73107.
' '.join(context_repre[407490]['b'])

'i using ubuntu one my laptop for over a half of year now and i upgraded it to 13.10 a month ago . it did n t have any major problems before . but today after few hours of work it stopped responding completly and i had to restart it . i logged on and about minute later it freezed again . now it happens every time laptop boots up . keyboard and mouse does n t work either .'

In [116]:
# Embed the title and body words of question 405564 (another negative id of query 73107).
t, b = contxt2vec(context_repre[405564]['t'], context_repre[405564]['b'])

In [117]:
# Each matrix is (number of kept words, 200).
t.shape, b.shape

((10, 200), (98, 200))

In [197]:
# Loss over the last 132 rows, i.e. 6 blocks of 22.
criterion(c[-132:])

Variable containing:
 1.0008
[torch.FloatTensor of size 1]

In [145]:
# Row index of the token 'doesnot' in the word-vector matrix.
w2i_map['doesnot']

20951

In [158]:
# Padded body matrices of two batch columns: -22 (query of the last block)
# and -19 (the second sampled negative of that block).
batch_body[:,-22],batch_body[:,-19]

(array([[ 0.066099, -0.072827,  0.07531 , ..., -0.043089,  0.053407,
          0.004687],
        [ 0.066099, -0.072827,  0.07531 , ..., -0.043089,  0.053407,
          0.004687],
        [ 0.029121, -0.017085, -0.048705, ...,  0.000808,  0.03283 ,
         -0.099112],
        ..., 
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ]]),
 array([[ 0.091238, -0.121687, -0.018059, ...,  0.16513 , -0.005217,
          0.059096],
        [ 0.091238, -0.121687, -0.018059, ...,  0.16513 , -0.005217,
          0.059096],
        [ 0.06426 , -0.095106,  0.043716, ..., -0.013996, -0.030572,
         -0.026798],
        ..., 
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
        [ 0.      ,  0.  

In [201]:
# Highest cosine similarity between the last block's query (row -22) and its
# 21 candidates (the positive plus 20 negatives).
loss = 0
loss += torch.max(cos_sim(c[-22].expand(21,240), c[-21:]))

In [202]:
# Show the value accumulated in the previous cell.
loss

Variable containing:
 0.9984
[torch.FloatTensor of size 1]