In [2]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

## Data Loadings

### word 2 vec repre

In [3]:
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))

In [4]:
w2i_map = {}
for i, key in enumerate(w2v_map.keys()):
    w2i_map[key] = i

In [5]:
pkl.dump(w2i_map, open('data/word_idx_map.pkl','wb'))

### map Q idx to context

In [6]:
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
counter = 0
for _, val in w2v_map.items():
    w2v_matrix[counter] = val
    counter += 1

In [7]:
pkl.dump(w2i_map, open('data/w2v_matrix.pkl','wb'))

In [8]:
def w2v(w):
    return w2v_matrix[w2i_map[w]]

In [11]:
def sen2w(sen):
    processed = []
    sen = re.sub(r'[!#\'(),/:?\{}]', ' ', sen).strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        # ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            if len(set(separated)) == 1:
                continue
            if separated.count('*') > 3 or separated.count('=') > 3:
                continue
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [12]:
# fixed context len = 125
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        context = line.strip().split('\t')
        qid = context.pop(0)
        if len(context) == 1:
            context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
        else:
            len_title = len(context[0])
            if len_title >= 125:
                context_repre[int(qid)] = {'t':sen2w(context[0])[:125], 'b': None}
            else:
                context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])[:125 - len_title]}

### len of context ??

In [631]:
all_lens = []
for k, v in context_repre.items():
    t, b = v['t'], v['b']
    if not v['b']:
        b = []
    all_lens += [len(t)+len(b)]

In [633]:
all(np.array(all_lens) < 125)

True

In [13]:
train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])

In [14]:
train_df.head()

Unnamed: 0,Q,Q+,Q-
0,262144,211039,227387 413633 113297 356390 256881 145638 2962...
1,491522,65911,155119 402211 310669 383107 131731 299465 1633...
2,240299,168608 390642,368007 70009 48077 376760 438005 228888 142340...
3,196614,205184,334471 163710 376791 441664 159963 406360 4300...
4,360457,321532,151863 501857 217578 470017 125838 31836 42066...


In [15]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

In [16]:
train_idx_set = build_set_pair_with_idx(train_df)

## Process Batch

In [17]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
    v = np.zeros( (len(title) + len(body), 200) )
    counter = 0
    
    for t in title:
        v[counter] = w2v(t)
        counter += 1

    for b in body:
        v[counter] = w2v(b)
        counter += 1
    
    return v

In [18]:
def process_batch(qids, idx_set):
    
    total_pos_len = 0
    for qid in qids:
        total_pos_len += len(idx_set[qid]['pos'])
    
    # per batch element x: vstack, [query_Q x 1; pos_Q x 1; neg_Q x 20]
    # per batch element y: vstack, [query_Q=-1; pos_Q=1; neg_Q=0]
    batch_x = np.zeros(( total_pos_len * 22, 124, 200 ))
    seq_len = np.zeros(total_pos_len * 22)
    
    counter = 0
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']

        # usually one sample
        for qid_pos in q_pos:
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            # query Q
            if not q_body:
                q_seq_len = len(q_title)
            else:
                q_seq_len = len(q_title) + len(q_body)
            seq_len[counter] = q_seq_len
            batch_x[counter, :q_seq_len] = contxt2vec(q_title, q_body)
            counter += 1
            # pos Q
            if not body:
                pos_q_seq_len = len(title)
            else:
                pos_q_seq_len = len(title) + len(body)
            seq_len[counter] = pos_q_seq_len
            batch_x[counter, :pos_q_seq_len] = contxt2vec(title, body)
            counter += 1
        
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            # neg Q
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                if not body:
                    neg_q_seq_len = len(title)
                else:
                    neg_q_seq_len = len(title) + len(body)
                seq_len[counter] = neg_q_seq_len
                batch_x[counter, : neg_q_seq_len] = contxt2vec(title, body)
                counter += 1
    
    return batch_x, seq_len

## Models

In [19]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type):
        
        super(EmbeddingLayer, self).__init__()
        
        self.hidden_size = hidden_size
        
        if layer_type == 'lstm':
            self.embedding_layer = nn.LSTM(input_size, hidden_size)
        elif layer_type == 'cnn':
            pass
        
        self.tanh = nn.Tanh()
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (Variable(torch.zeros(1, 1, self.hidden_size)), Variable(torch.zeros(1, 1, self.hidden_size)))

    def forward(self, context, seq_len):
        lstm_out, self.hidden = self.embedding_layer(context, (self.tanh(self.hidden[0]), self.tanh(self.hidden[1])))
        mask = build_mask(seq_len)
        embeddings = torch.sum(lstm_out * Variable(torch.FloatTensor(mask)), dim=1) \
            / Variable(torch.FloatTensor(np.sum(mask, axis=1)))
        return embeddings

In [20]:
def cos_sim(qv, qv_):
    return torch.sum(qv * qv_, dim=1) / (torch.sqrt(torch.sum(qv ** 2, dim=1)) * torch.sqrt(torch.sum(qv_ ** 2, dim=1)))

In [21]:
def criterion(embeddings):
    
#     embeddings = torch.sum(embeddings * Variable(torch.FloatTensor(mask)), dim=1) \
#             / Variable(torch.FloatTensor(np.sum(mask, axis=1)))
    # a batch of embeddings
    num_block = embeddings.size()[0] // 22
    loss = 0
    for i in range(num_block):
        block_embeddings = embeddings[ i * 22: (i + 1) * 22 ]
        qs = block_embeddings[0]
        qs_ = block_embeddings[1:22]
        pos_score = cos_sim(qs.expand(21, 240), qs_)[0]
        neg_score = torch.max(cos_sim(qs.expand(21, 240), qs_)[1:])
        diff = neg_score - pos_score + 1 # margin=1
        if diff.data[0] > 0:
            loss += diff
            
    return torch.mean(loss)

In [22]:
def build_mask(seq_len):
    mask = []
    for i, s in enumerate(seq_len):
        s_mask = np.zeros((124, 1))
        s_mask[:int(s)] = np.ones((int(s), 1))
        mask += [s_mask]
    return mask

In [23]:
def train(layer_type, batch_size=10, num_epoch=100):
    
    if layer_type == 'lstm':
        embedding_layer = EmbeddingLayer(200, 240, 'lstm')
    elif layer_type == 'cnn':
        pass
        
    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.005)
    
    qids = list(train_idx_set.keys())
    # num_batch = len(qids) // batch_size
    num_batch = 10
    for epoch in range(1, num_epoch + 1):
        
        for batch_idx in range(1, num_batch + 1):
            
            batch_x_qids = qids[batch_idx * batch_size: (batch_idx + 1) * batch_size]
            start = time.time()
            print ('processing batch {}'.format(batch_idx))
            padded_batch_x, seq_len = process_batch(batch_x_qids, train_idx_set)
            mask = build_mask(seq_len)
            print ('processing batch costs:', time.time() - start)
            start = time.time()
        
            qs = Variable(torch.FloatTensor(padded_batch_x))
            embeddings = embedding_layer(qs, mask)
            print ('embedding costs:', time.time() - start)
            optimizer.zero_grad()
            start = time.time()
            
            print ('accumulating loss costs:', time.time() - start)
            loss = criterion(embeddings, mask)
        
            print ('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, batch_idx, num_batch, loss.data[0]))
            start = time.time()
            loss.backward()
            print ('backprop costs:', time.time() - start)
            optimizer.step()

In [24]:
train('lstm', num_epoch=10)

processing batch 1
processing batch costs: 0.12594103813171387


TypeError: only length-1 arrays can be converted to Python scalars