In [4]:
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time

## Data Loadings

### word 2 vec repre

In [5]:
w2v_map = {}
with open('data/vectors_pruned.200.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))

In [6]:
w2i_map = {}
for i, key in enumerate(w2v_map.keys()):
    w2i_map[key] = i

In [101]:
pkl.dump(w2i_map, open('data/word_idx_map.pkl','wb'))

### map Q idx to context

In [7]:
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
counter = 0
for _, val in w2v_map.items():
    w2v_matrix[counter] = val
    counter += 1

In [102]:
pkl.dump(w2i_map, open('data/w2v_matrix.pkl','wb'))

In [8]:
def w2v(w):
    return w2v_matrix[w2i_map[w]]

In [9]:
def sen2w(sen):
    processed = []
    sen = re.sub(r'[!#\'(),/:?\{}]', ' ', sen).strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        # ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [10]:
context_repre = {}
with open('data/text_tokenized.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        context = line.strip().split('\t')
        qid = context.pop(0)
        if len(context) == 1:
            context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
        else:
            context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])}

### pair building --- (q, q+), (q, q-)

In [11]:
train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])

In [12]:
train_df.head()

Unnamed: 0,Q,Q+,Q-
0,262144,211039,227387 413633 113297 356390 256881 145638 2962...
1,491522,65911,155119 402211 310669 383107 131731 299465 1633...
2,240299,168608 390642,368007 70009 48077 376760 438005 228888 142340...
3,196614,205184,334471 163710 376791 441664 159963 406360 4300...
4,360457,321532,151863 501857 217578 470017 125838 31836 42066...


In [13]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

In [14]:
train_idx_set = build_set_pair_with_idx(train_df)

## Process Batch

In [15]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
    v = np.zeros( (len(title) + len(body), 200) )
    counter = 0
    
    for t in title:
        v[counter] = w2v(t)
        counter += 1

    for b in body:
        v[counter] = w2v(b)
        counter += 1
    
    return v

In [16]:
def process_batch(qids, idx_set):
    
    # per batch element x: vstack, [query_Q x 1; pos_Q x 1; neg_Q x 20]
    # per batch element y: vstack, [query_Q=-1; pos_Q=1; neg_Q=0]
    batch_x = []
    batch_y = []
    
    max_len = 0
    
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']

        # usually one sample
        for qid_pos in q_pos:
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            max_len = max(max_len, len(title) + len(body))
            # query Q
            batch_x += [ (q_title, q_body) ]
            batch_y += [-1]
            # pos Q
            batch_x += [ (title, body) ]
            batch_y += [1]
        
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            # neg Q
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                max_len = max(max_len, len(title) + len(body))
                batch_x += [ (title, body) ]
                batch_y += [0]

    seq_len = np.zeros((len(batch_x), ), dtype=np.int32)
    batch_y = np.array(batch_y)
    
    # padding
    padded_batch_x = np.zeros(( len(batch_x), max_len, 200 ))
    for i, (title, body) in enumerate(batch_x):
        qv = contxt2vec(title, body)
        seq_len[i] = int(qv.shape[0])
        padded_batch_x[i,:qv.shape[0]] = qv
    
    return padded_batch_x, batch_y, seq_len

In [17]:
qids = list(train_idx_set.keys())[:10]

In [18]:
batch_x, batch_y, seq_len = process_batch(qids, train_idx_set)

In [19]:
batch_y.shape

(242,)

## Models

In [151]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type):
        
        super(EmbeddingLayer, self).__init__()
        
        self.hidden_size = hidden_size
        
        if layer_type == 'lstm':
            self.embedding_layer = nn.LSTM(input_size, hidden_size)
        elif layer_type == 'cnn':
            pass
        
        self.tanh = nn.Tanh()
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (Variable(torch.zeros(1, 1, self.hidden_size)), Variable(torch.zeros(1, 1, self.hidden_size)))

    def forward(self, context):
        lstm_out, self.hidden = self.embedding_layer(context, (self.tanh(self.hidden[0]), self.tanh(self.hidden[1])))
        return lstm_out

In [23]:
def cos_sim(qv, qv_):
    return torch.dot(qv, qv_) / ( torch.sqrt(torch.sum(qv ** 2)) * torch.sqrt(torch.sum(qv_ ** 2) ) )

In [148]:
def criterion(pos_scores, neg_scores):
    diff = neg_scores - pos_scores + 1 # margin=1
    loss = torch.mean((diff > 0).float() * diff)
    return loss

In [143]:
embedding_layer = EmbeddingLayer(200, 240, 'lstm')

In [144]:
qs = Variable(torch.FloatTensor(batch_x))

In [159]:
def train():
    
    embedding_layer = EmbeddingLayer(200, 240, 'lstm')
    optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.005)
    
    start = time.time()
    qs = Variable(torch.FloatTensor(batch_x))
    embeddings = embedding_layer(qs)
    print ('embedding costs:', time.time() - start)
    optimizer.zero_grad()
    loss = 0
    start = time.time()
    
    pos_scores = torch.zeros(len(batch_y) // 22, 1)
    neg_scores = torch.zeros(len(batch_y) // 22, 1)
    
    for i, y in enumerate(batch_y):

        if y == -1:
            qv = torch.mean(embeddings[i][:seq_len[i]], dim=0)
            sub_neg_scores = torch.Tensor(20, 1)
            neg_counter = 0
        elif y == 1:
            pos_qv_ = torch.mean(embeddings[i][:seq_len[i]], dim=0)
            pos_scores[ i // 22 ] = cos_sim(qv, pos_qv_).data[0]
        else:
            neg_qv_ = torch.mean(embeddings[i][:seq_len[i]], dim=0)
            sub_neg_scores[neg_counter] = cos_sim(qv, neg_qv_).data[0]
            neg_counter += 1
        if (i + 1) % 22 == 0 and i > 0:
            neg_scores[ i // 22 ] = torch.max(sub_neg_scores)
    print (pos_scores, neg_scores)
    print ('accumulating loss costs:', time.time() - start)
    loss = criterion(Variable(pos_scores, requires_grad=True), Variable(neg_scores, requires_grad=True))
    print ('loss:', loss)
    start = time.time()
    loss.backward()
    print ('backprop costs:', time.time() - start)
    optimizer.step()

In [160]:
train()

embedding costs: 1.6369600296020508

 0.9935
 0.9980
 0.9990
 0.9982
 0.9984
 0.9992
 0.9980
 0.9973
 0.9947
 0.9982
 0.9995
[torch.FloatTensor of size 11x1]
 
 0.9822
 0.9973
 0.9986
 0.9988
 0.9982
 0.9986
 0.9981
 0.9979
 0.9955
 0.9975
 0.9981
[torch.FloatTensor of size 11x1]

accumulating loss costs: 0.036016225814819336
loss: Variable containing:
 0.9988
[torch.FloatTensor of size 1]

backprop costs: 0.0003552436828613281
