In [43]:
% matplotlib inline
import pandas as pd
import pickle as pkl
import string
import numpy as np; np.random.seed(7)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import re
import time
import numpy as np

In [6]:
w2v_map = {}
with open('data/glove.6B.200d.txt', 'r') as src:
    src = src.read().strip().split('\n')
    for line in src:
        wv = line.strip().split(' ')
        word = wv.pop(0)
        w2v_map[word] = np.array(list(map(float, wv)))
        
w2i_map = {}
w2v_matrix = np.zeros(( len((w2v_map.keys())), 200 ))
for i, (key, val) in enumerate(w2v_map.items()):
    w2i_map[key] = i
    w2v_matrix[i] = val

def w2v(w):
    return w2v_matrix[w2i_map[w]]
    
def sen2w(sen):
    processed = []
    sen = sen.strip().split()
    if len(sen) > 100:
        sen = sen[:100]
    for w in sen:
        #ignore date
        if re.match(r'\d{1,}-\d{1,}-\d{1,}', w):
            continue
        if re.match(r'\d{1,}:\d{1,}', w):
            continue
        
        if w in w2i_map:
            processed += [w]
        else:
            separated = re.findall(r"[^\W\d_]+|\d+|[=`%$\^\-@;\[&_*>\].<~|+\d+]", w)
            if len(set(separated)) == 1:
                continue
            if separated.count('*') > 3 or separated.count('=') > 3:
                continue
            for separate_w in separated:
                if separate_w in w2i_map:
                    processed += [separate_w]
    return processed

In [9]:
def build_context_repre(path):
    context_repre = {}
    with open('data/' + path, 'r') as src:
        src = src.read().strip().split('\n')
        for line in src:
            context = line.strip().split('\t')
            qid = context.pop(0)
            if len(context) == 1:
                context_repre[int(qid)] = {'t': sen2w(context[0]), 'b': None}
            else:
                context_repre[int(qid)] = {'t':sen2w(context[0]), 'b': sen2w(context[1])}
    return context_repre

In [14]:
def build_set_pair_with_idx(df):
    idx_set = {}
    for idx, row in df.iterrows():
        idx_set[row['Q']] = {'pos': np.array(list(map(int, row['Q+'].split(' ')))), \
                             'neg': np.array(list(map(int, row['Q-'].split(' '))))}
    return idx_set

In [31]:
def read_android_set(pos_path, neg_path):
    
    idx_set = {}
    
    pos_file = open('data/' + pos_path, 'r')
    pos_src = pos_file.read().strip().split('\n')
    
    neg_file = open('data/' + neg_path, 'r')
    neg_src = neg_file.read().strip().split('\n')
    
    for pos in pos_src:
        pos = list(map(int, pos.split(' ')))
        if pos[0] in idx_set:
            idx_set[pos[0]]['pos'] += [pos[1]]
        else:
            idx_set[pos[0]] = {}
            idx_set[pos[0]]['pos'] = [pos[1]]
            idx_set[pos[0]]['neg'] = []
         
    for neg in neg_src:
        neg = list(map(int, neg.split(' ')))
        idx_set[neg[0]]['neg'] += [neg[1]]

    
    pos_file.close()
    neg_file.close()
    
    return idx_set

In [36]:
def contxt2vec(title, body=None):
    
    if body == None:
        body = []
    
    title_v = np.zeros( (len(title), 200) )
    
    for i, t in enumerate(title):
        title_v[i] = w2v(t)
    
    if len(body) > 0:
        body_v = np.zeros( (len(body), 200) )
        for i, b in enumerate(body):
            body_v[i] = w2v(b)
    
        return title_v, body_v
    
    return title_v, None

In [69]:
# create random batch
def sample_contxt_batch(context_repre, sample_size=128, batch_first=False):
    
    sampled_qids = np.random.choice(list(context_repre.keys()), sample_size)
    
    batch_title, batch_body = [], []
    max_title_len, max_body_len = 0, 0
    title_len, body_len = [], []
    
    for qid in sampled_qids:
        
        title, body = context_repre[qid]['t'], context_repre[qid]['b']
        
        title_len += [len(title)]
        batch_title += [ title ]
        max_title_len = max(max_title_len, len(title))
        
        if not body:
            body_len += [len(title)]
            batch_body += [ title ]
        else:
            batch_body += [ body ]
            body_len += [len(body)]
            max_body_len = max(max_body_len, len(body))
        
    if batch_first:
        # for CNN
        padded_batch_title = np.zeros(( len(batch_title), max_title_len, 200)) 
        padded_batch_body = np.zeros(( len(batch_body),  max_body_len, 200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[i, :title_len[i]] = title_repre
            padded_batch_body[i, :body_len[i]] = body_repre
    else:
        # for LSTM
        # (max_seq_len, batch_size, feature_len)
        padded_batch_title = np.zeros(( max_title_len, len(batch_title), 200)) 
        padded_batch_body = np.zeros(( max_body_len, len(batch_body),  200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[:title_len[i], i] = title_repre
            padded_batch_body[:body_len[i], i] = body_repre

    return padded_batch_title, padded_batch_body, \
                np.array(title_len).reshape(-1,1), np.array(body_len).reshape(-1,1)

In [93]:
# create batch with order
def process_contxt_batch(qids, idx_set, context_repre, batch_first=False):
    
    batch_title, batch_body = [], []
    max_title_len, max_body_len = 0, 0
    title_len, body_len = [], []
    
    for qid in qids:
        
        q_title, q_body = context_repre[qid]['t'], context_repre[qid]['b']
        q_pos = idx_set[qid]['pos']
        
        if len(q_pos) > 20:
            q_pos = q_pos[:20]

        for qid_pos in q_pos:
            # query Q
            title_len += [len(q_title)]
            batch_title += [ q_title ]
            max_title_len = max(max_title_len, len(q_title))
            if not q_body:
                body_len += [len(q_title)]
                batch_body += [ q_title ]
            else:
                batch_body += [ q_body ]
                body_len += [len(q_body)]
                max_body_len = max(max_body_len, len(q_body))
                
            # pos Q
            title, body = context_repre[qid_pos]['t'], context_repre[qid_pos]['b']
            title_len += [len(title)]
            batch_title += [ title ]
            max_title_len = max(max_title_len, len(title))
            if not body:
                body_len += [len(title)]
                batch_body += [ title ]
            else:
                batch_body += [ body ]
                body_len += [len(body)]
                max_body_len = max(max_body_len, len(body))
            # neg Q
            q_neg = idx_set[qid]['neg']
            q_neg_sample_indices = np.random.choice(range(100), size=20)
            q_random_neg = q_neg[q_neg_sample_indices]
            
            for qid_neg in q_random_neg:
                title, body = context_repre[qid_neg]['t'], context_repre[qid_neg]['b']
                title_len += [len(title)]
                batch_title += [ title ]
                max_title_len = max(max_title_len, len(title))
                if not body:
                    body_len += [len(title)]
                    batch_body += [ title ]
                else:
                    batch_body += [ body ]
                    body_len += [len(body)]
                    max_body_len = max(max_body_len, len(body))
    
    if batch_first:
        # for CNN
        padded_batch_title = np.zeros(( len(batch_title), max_title_len, 200)) 
        padded_batch_body = np.zeros(( len(batch_body),  max_body_len, 200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[i, :title_len[i]] = title_repre
            padded_batch_body[i, :body_len[i]] = body_repre
    else:
        # for LSTM
        # (max_seq_len, batch_size, feature_len)
        padded_batch_title = np.zeros(( max_title_len, len(batch_title), 200)) 
        padded_batch_body = np.zeros(( max_body_len, len(batch_body),  200))
        for i, (title, body) in enumerate(zip(batch_title, batch_body)):
            title_repre, body_repre = contxt2vec(title, body)
            padded_batch_title[:title_len[i], i] = title_repre
            padded_batch_body[:body_len[i], i] = body_repre

    return padded_batch_title, padded_batch_body, \
                np.array(title_len).reshape(-1,1), np.array(body_len).reshape(-1,1)

# Model

In [132]:
LAMDA = 1e-3

In [133]:
class GradReverse(torch.autograd.Function):
    def forward(self, x):
        return x.view_as(x)

    def backward(self, grad_output):
        return (grad_output * -LAMDA) # need tune

def grad_reverse(x):
    return GradReverse()(x)

class DomainClassifer(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_classes):
        
        super(DomainClassifer, self).__init__()
        
        self.hidden_size = hidden_size
        self.domain_classifier = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
            nn.LogSoftmax()
        )

    def forward(self, embedding):
        embedding = grad_reverse(embedding)
        return self.domain_classifier(embedding)

In [102]:
class EmbeddingLayer(nn.Module):
    
    def __init__(self, input_size, hidden_size, layer_type, num_layer=1, kernel_size=3):
        
        super(EmbeddingLayer, self).__init__()
        
        self.num_layer = num_layer
        
        self.layer_type = layer_type
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        
        self.tanh = nn.Tanh()
        
        if self.layer_type == 'lstm':
            
            self.embedding_layer = nn.LSTM(self.input_size, hidden_size, bidirectional=True)
        
        elif self.layer_type == 'cnn':

            self.embedding_layer = nn.Sequential(
                        nn.Conv1d(in_channels = self.input_size,
                                  out_channels = self.hidden_size,
                                  kernel_size = self.kernel_size),
                        self.tanh)

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size)), \
                Variable(torch.zeros(self.num_layer*2, batch_size, self.hidden_size)))

    def forward(self, title, body, title_len, body_len):
        
            
        if self.layer_type == 'lstm':
            
            title_mask = Variable(torch.FloatTensor(build_mask3d(title_len, np.max(title_len))))
            body_mask = Variable(torch.FloatTensor(build_mask3d(body_len, np.max(body_len))))
            
            
            title_out, self.title_hidden = self.embedding_layer(title, (self.tanh(self.title_hidden[0]), \
                                                                   self.tanh(self.title_hidden[1])))
            body_out, self.body_hidden = self.embedding_layer(body, (self.tanh(self.body_hidden[0]), \
                                                                   self.tanh(self.body_hidden[1])))
        
        if self.layer_type == 'cnn':
            # batch first input
            title_mask = Variable(torch.FloatTensor(build_mask3d(title_len - self.kernel_size + 1,\
                                                                 np.max(title_len) - self.kernel_size + 1)))
            body_mask = Variable(torch.FloatTensor(build_mask3d(body_len - self.kernel_size + 1, \
                                                                np.max(body_len) - self.kernel_size + 1)))
            
            title = torch.transpose(title, 1, 2)
            body = torch.transpose(body, 1, 2)

            title_out =  self.embedding_layer(title)
            body_out =  self.embedding_layer(body)

            title_out = torch.transpose(title_out, 1, 2)
            body_out = torch.transpose(body_out, 1, 2)
        
            title_out = torch.transpose(title_out, 0, 1)
            body_out = torch.transpose(body_out, 0, 1)


        title_embeddings = torch.sum(title_out * title_mask, dim=0) / torch.sum(title_mask, dim=0)
        body_embeddings = torch.sum(body_out * body_mask, dim=0) / torch.sum(body_mask, dim=0)
        embeddings = ( title_embeddings + body_embeddings ) / 2
        
        return embeddings

In [103]:
def build_mask3d(seq_len, max_len):
    mask = np.zeros((max_len, len(seq_len), 1))
    for i, s in enumerate(seq_len):
        # only one word
        if int(s) == -1:
            mask[0, i] = 1
        # only two word
        elif int(s) == 0:
            mask[:2, i] = np.ones((2, 1))
        else: 
            mask[:int(s), i] = np.ones((int(s), 1))
    return mask

def multi_margin_loss(hidden, margin=0.50):
    
    def loss_func(embeddings):
        # a batch of embeddings
        blocked_embeddings = embeddings.view(-1, 22, hidden)
        q_vecs = blocked_embeddings[:,0,:]
        
        pos_vecs = blocked_embeddings[:,1,:]
        neg_vecs = blocked_embeddings[:,2:,:]

        pos_scores = torch.sum(q_vecs * pos_vecs, dim=1) / (torch.sqrt(torch.sum(q_vecs ** 2, dim=1)) \
                                                   * torch.sqrt(torch.sum(pos_vecs ** 2, dim=1)))
        neg_scores = torch.sum(torch.unsqueeze(q_vecs, dim=1) * neg_vecs, dim=2) \
        / (torch.unsqueeze(torch.sqrt(torch.sum(q_vecs ** 2, dim=1)),dim=1) * torch.sqrt(torch.sum( neg_vecs ** 2, dim=2)))
        neg_scores = torch.max(neg_scores, dim=1)[0]
        
        diff = neg_scores - pos_scores + margin
        loss = torch.mean((diff > 0).float() * diff)
        return loss

    return loss_func

In [135]:
def train( 
    embedding_layer, domain_classifier, 
    emb_batch_size=25, dc_batch_size=25,
    num_epoch=100, lamda=1e-3,
    id_set=None,train_from=None,sample_from=None,
    eval=True
    ):
    
    if embedding_layer.layer_type == 'lstm':
        
        margin_criterion = multi_margin_loss(hidden=embedding_layer.hidden_size * 2)
    
    elif embedding_layer.layer_type == 'cnn':
        
        margin_criterion = multi_margin_loss(hidden=embedding_layer.hidden_size)
        
    domain_criterion = torch.nn.NLLLoss()
    
    emb_optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=0.001)
    domain_optimizer = torch.optim.Adam(domain_classifier.parameters(), lr=0.001)
    
    qids = list(id_set.keys())
    num_batch = len(qids) // emb_batch_size
    
    for epoch in range(1, num_epoch + 1):
        
        for batch_idx in range(1, num_batch + 1):
            
            batch_x_qids = qids[ ( batch_idx - 1 ) * emb_batch_size: batch_idx * emb_batch_size ]
            
            ## Minimize margin loss
            if embedding_layer.layer_type == 'lstm':
                batch_title, batch_body, title_len, body_len = process_contxt_batch(batch_x_qids, \
                                                                                id_set, train_from)
                embedding_layer.title_hidden = embedding_layer.init_hidden(batch_title.shape[1])
                embedding_layer.body_hidden = embedding_layer.init_hidden(batch_body.shape[1])
            else:
                batch_title, batch_body, title_len, body_len = process_contxt_batch(batch_x_qids, \
                                                                                id_set, train_from, batch_first=True)
            
            src_title_qs = Variable(torch.FloatTensor(batch_title))
            src_body_qs = Variable(torch.FloatTensor(batch_body))
            src_embeddings = embedding_layer(src_title_qs, src_body_qs, title_len, body_len) # class label = ubuntu
            
            margin_loss = margin_criterion(src_embeddings)
            
            ## Domain classification
            if embedding_layer.layer_type == 'lstm':
                batch_title, batch_body, title_len, body_len = sample_contxt_batch(sample_from, \
                                                                                   sample_size=dc_batch_size)
                embedding_layer.title_hidden = embedding_layer.init_hidden(batch_title.shape[1])
                embedding_layer.body_hidden = embedding_layer.init_hidden(batch_body.shape[1])
            else:
                batch_title, batch_body, title_len, body_len = sample_contxt_batch(sample_from, \
                                                                                   batch_first=True)
            
            # sample title, body from android dataset
            target_title_qs = Variable(torch.FloatTensor(batch_title))
            target_body_qs = Variable(torch.FloatTensor(batch_body))
            # class label = android
            target_embeddings = embedding_layer(target_title_qs, target_body_qs, title_len, body_len) 
            embedding_X = torch.cat((src_embeddings[:dc_batch_size], target_embeddings), 0)
            
            src_label = torch.zeros(dc_batch_size).type(torch.LongTensor)
            target_label = torch.ones(dc_batch_size).type(torch.LongTensor)
            embedding_Y = torch.cat((src_label, target_label), 0)
            
            # prepare for shuffle
            train_loader = DataLoader(TensorDataset(embedding_X.data, embedding_Y), \
                                      batch_size=embedding_Y.size(0), shuffle=True)
            
            for x, y in train_loader:
                domain_loss = domain_criterion(domain_classifier(Variable(x)), Variable(y))
            
            
            loss = margin_loss - lamda * domain_loss
            print ('epoch:{}/{}, batch:{}/{}, loss:{}'.format(epoch, num_epoch, \
                                                              batch_idx, num_batch, loss.data[0]))
            
            emb_optimizer.zero_grad()
            domain_optimizer.zero_grad()
            loss.backward()
            emb_optimizer.step()
            domain_optimizer.step()

In [107]:
ubuntu_train_df = pd.read_csv('data/train_random.txt', header=None, delimiter='\t', names=['Q','Q+','Q-'])
ubuntu_train_idx_set = build_set_pair_with_idx(train_df)

In [34]:
android_dev_idx_set = read_android_set('android/dev.pos.txt', 'android/dev.neg.txt')
android_test_idx_set = read_android_set('android/test.pos.txt', 'android/test.neg.txt')

In [10]:
ubuntu_context_repre = build_context_repre('text_tokenized.txt')

In [11]:
android_context_repre =  build_context_repre('android/corpus.tsv')

In [None]:
embedding_layer = EmbeddingLayer(200, 120, 'lstm')
domain_classifier = DomainClassifer(240, hidden_size=128, num_classes=2)
train( 
    embedding_layer, 
    domain_classifier, 
    lamda = LAMDA,
    id_set=ubuntu_train_idx_set,
    train_from=ubuntu_context_repre,
    sample_from=android_context_repre
)

epoch:1/100, batch:1/508, loss:0.5144846439361572
epoch:1/100, batch:2/508, loss:0.5101043581962585
epoch:1/100, batch:3/508, loss:0.5098358392715454
epoch:1/100, batch:4/508, loss:0.5008068680763245
epoch:1/100, batch:5/508, loss:0.5049743056297302
epoch:1/100, batch:6/508, loss:0.5006970167160034
epoch:1/100, batch:7/508, loss:0.4971328377723694
epoch:1/100, batch:8/508, loss:0.4975408613681793
epoch:1/100, batch:9/508, loss:0.49646589159965515
epoch:1/100, batch:10/508, loss:0.4962424337863922
epoch:1/100, batch:11/508, loss:0.5003198981285095
epoch:1/100, batch:12/508, loss:0.5003091096878052
epoch:1/100, batch:13/508, loss:0.4962831437587738
epoch:1/100, batch:14/508, loss:0.4965842366218567
epoch:1/100, batch:15/508, loss:0.4998915493488312
epoch:1/100, batch:16/508, loss:0.49837401509284973
epoch:1/100, batch:17/508, loss:0.4994613230228424
epoch:1/100, batch:18/508, loss:0.49879372119903564
epoch:1/100, batch:19/508, loss:0.49664756655693054
epoch:1/100, batch:20/508, loss:0.49