# Propose Master Triplet Loss

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Auxiliary methods

## Configurações Globais

In [6]:
# Padded token length of a bug title (trailing 40 is presumably an earlier setting - TODO confirm)
MAX_SEQUENCE_LENGTH_T = 100 # 40
# Padded token length of a bug description (trailing 200 is presumably an earlier setting - TODO confirm)
MAX_SEQUENCE_LENGTH_D = 500 # 200
# Width of each word vector
EMBEDDING_DIM = 300
# Vocabulary cap; only referenced from a commented-out argument in the visible cells
MAX_NB_WORDS = 20000

### Parse preprocessed bugs

In [7]:
# Bug-tracker dataset the notebook runs on; interpolated into every path below
DOMAIN = 'eclipse'
# Identifier of the approach evaluated in this notebook
METHOD = 'propose_master_triplet_loss'
# Dataset paths (all relative to the working directory)
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Directory holding the pretrained word-embedding files
EMBED_DIR='data/embed'
# Names used when saving the trained models.
# NOTE(review): '@number_of_epochs@' looks like a placeholder substituted later - TODO confirm
SAVE_PATH = 'propose_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'propose_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

# Whether to extract the corpus (flag is not read by any visible cell)
EXTRACT_CORPUS = False

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [9]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=322339), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39545), HTML(value='')))




#### Loading bug ids in memory

In [10]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


361006

#### Dicionário de títulos e descrições

In [11]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 2min 17s, sys: 4.39 s, total: 2min 21s
Wall time: 2min 17s


#### Hashing bugs by buckets

In [12]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=321536), HTML(value='')))




#### Prepare the train and test

In [13]:
%%time

experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

Reading train data
Reading bug ids
CPU times: user 2min 37s, sys: 29.7 ms, total: 2min 37s
Wall time: 2min 37s


#### Recover bug ids from train

In [14]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

#### Display a random bug

In [15]:
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

{'bug_severity': '0\n',
 'bug_status': '2\n',
 'component': '527\n',
 'creation_ts': '2009-03-24 04:03:00 -0400',
 'delta_ts': '2009-05-08 10:06:15 -0400',
 'description': 'type in any method code return this this string no highlight as error but this error correct code next return this',
 'description_word': array([  61,   10,  255,   22,   81,  296,   23,   23,  118,  117, 1855,
          56,   51,   92,   23,   51,  700,   81,  344,  296,   23,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0

### Generating the batch test

In [16]:
"Train ", len(baseline.dup_sets_train)

('Train ', 34882)

In [17]:
 # data - path
# batch_size - 128
# n_neg - 1

import random

def get_neg_bug(invalid_bugs, bug_ids, issues_by_buckets):
    """Draw a random bug id that can serve as the negative of a triplet.

    The first candidate is drawn from the keys of `issues_by_buckets`; while the
    candidate is contained in `invalid_bugs` or is not a key of
    `issues_by_buckets`, a new one is drawn from `bug_ids`.

    Args:
        invalid_bugs: container of ids that must not be returned, or a single
            value that does not support `in` (it is then wrapped in a list).
        bug_ids: sequence of candidate ids used for re-draws.
        issues_by_buckets: mapping whose keys are the only acceptable results.

    Returns:
        A key of `issues_by_buckets` that is not in `invalid_bugs`.

    Note:
        The function loops until an acceptable candidate is drawn, so it does
        not terminate if no element of `bug_ids` qualifies.
    """
    neg_bug = random.choice(list(issues_by_buckets.keys()))
    try:
        while neg_bug in invalid_bugs or neg_bug not in issues_by_buckets:
            neg_bug = random.choice(bug_ids)
    except TypeError:
        # `invalid_bugs` is a bare value that does not support membership tests.
        # Only TypeError is handled: a bare `except:` would also swallow
        # KeyboardInterrupt and genuine errors and then spin in the loop below.
        invalid_bugs = [invalid_bugs]
        while neg_bug in invalid_bugs or neg_bug not in issues_by_buckets:
            neg_bug = random.choice(bug_ids)
    return neg_bug

def batch_iterator(baseline, data, dup_sets, bug_train_ids, batch_size, n_neg, issues_by_buckets):
    """Build one batch of (anchor, positive, negative) triplets plus the masters of anchor and negative.

    `data` is shuffled IN PLACE, then its first `batch_size` pairs are used
    (fewer when `data` is shorter, instead of raising IndexError). For each
    pair a negative is drawn with `baseline.get_neg_bug`, and the master of the
    anchor and of the negative is looked up through `issues_by_buckets`.

    Args:
        baseline: object providing `get_neg_bug`, `bug_set` and `read_batch_bugs`.
        data: list of pairs; element 0 is used as anchor id, element 1 as positive id.
        dup_sets: mapping from anchor id to the ids passed to `get_neg_bug` as invalid.
        bug_train_ids: candidate ids passed to `baseline.get_neg_bug`.
        batch_size: maximum number of triplets to build.
        n_neg: accepted for interface compatibility; not used.
        issues_by_buckets: mapping from a bug id to the id of its master.

    Returns:
        Tuple `(batch_triplets, input_sample, input_pos, input_neg,
        master_input_sample, master_neg, sim)`. The five dicts map 'title',
        'description' and 'info' to numpy arrays. `batch_triplets` holds
        `[anchor_id, pos_id, neg_id, master_anchor, master_neg]`, where the last
        two entries are the records taken from `baseline.bug_set`, not ids.
        `sim` holds n//2 ones followed by n//2 zeros (a single random 0/1 when
        the batch has fewer than two triplets).
    """
    random.shuffle(data)

    fields = ('title', 'desc', 'info')
    batch_input, batch_pos, batch_neg, master_batch_input, master_batch_neg = (
        dict((field, []) for field in fields) for _ in range(5))

    # Never index past the end of `data` when it holds fewer pairs than requested.
    n_samples = min(batch_size, len(data))

    batch_triplets = []
    for offset in range(n_samples):
        anchor, pos = data[offset][0], data[offset][1]
        neg = baseline.get_neg_bug(dup_sets[anchor], bug_train_ids)

        # Master records of the anchor's and the negative's bucket
        master_anchor = baseline.bug_set[issues_by_buckets[anchor]]
        master_neg = baseline.bug_set[issues_by_buckets[neg]]

        baseline.read_batch_bugs(batch_input, baseline.bug_set[anchor])
        baseline.read_batch_bugs(batch_pos, baseline.bug_set[pos])
        baseline.read_batch_bugs(batch_neg, baseline.bug_set[neg])
        baseline.read_batch_bugs(master_batch_input, master_anchor)
        baseline.read_batch_bugs(master_batch_neg, master_neg)

        batch_triplets.append([anchor, pos, neg, master_anchor, master_neg])

    def _as_model_input(batch):
        # The accumulators use the key 'desc'; the returned dicts expose it as 'description'.
        return {'title': np.array(batch['title']),
                'description': np.array(batch['desc']),
                'info': np.array(batch['info'])}

    n_half = len(batch_triplets) // 2
    if n_half > 0:
        pos_labels = np.full((1, n_half), 1)
        neg_labels = np.full((1, n_half), 0)
        sim = np.concatenate([pos_labels, neg_labels], -1)[0]
    else:
        sim = np.array([np.random.choice([1, 0])])

    return (batch_triplets,
            _as_model_input(batch_input),
            _as_model_input(batch_pos),
            _as_model_input(batch_neg),
            _as_model_input(master_batch_input),
            _as_model_input(master_batch_neg),
            sim)

### Train ids

In [18]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [19]:
%%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, \
                            valid_master_sample, valid_master_neg, valid_sim = batch_iterator(baseline, baseline.train_data, 
                                                                                          baseline.dup_sets_train,
                                                                                          bug_train_ids,
                                                                                          batch_size_test, 1, issues_by_buckets)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]
# Max sequence title
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

CPU times: user 50.4 ms, sys: 773 µs, total: 51.2 ms
Wall time: 50.8 ms


In [20]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((128, 100), (128, 500), (128, 1682), (128,))

### Validar entrada

In [21]:
%%time 

baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

***Title***: the add implemented methods hover should be resizeable or should have at least a scrollbar
***Title***: no scroll bars in completion box help panel
***Description***: the add implemented methods dialog should be resizeable or should have at least a scrollbar otherwise it would be enough to know e g i have to implement methods because currently i can not see the full signature of the methods
***Description***: build id i steps to reproduce open completion box in the java editor by typing for example system more information there are no scroll bars in the yellow help popup beside the completion box when the javadoc description is larger than the window i am using ubuntu linux with eclipse rc i this used to work with previous versions
***similar = 1
########################
***Title***: action filter providers extension has attribute based on unknown interface
***Title***: can not select java class in log listener extension
***Description***: the class attribute of the action

## Pre-trained embeddings

Loading pretrained word vectors

### Fasttext

In [22]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed_fasttext.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

vocabulary loaded


In [23]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 92499'

In [24]:
import io

def generating_embed(baseline, EMBED_DIR, EMBEDDING_DIM):
    """Build the embedding matrix for the dataset vocabulary and store it on `baseline`.

    Reads `crawl-300d-2M.vec` from `EMBED_DIR` into a word -> vector index, then
    fills one matrix row per entry of the vocabulary loaded from
    `baseline.DIR/vocab_embed_fasttext.pkl`: the pretrained vector when the word
    is in the index, otherwise the vector stored with the vocabulary entry.

    Args:
        baseline: object providing `load_vocabulary` and `DIR`; receives the
            result as `baseline.embedding_matrix`.
        EMBED_DIR: directory containing `crawl-300d-2M.vec`.
        EMBEDDING_DIM: number of columns of the embedding matrix.

    Returns:
        None. The matrix is assigned to `baseline.embedding_matrix`.
    """
    embeddings_index = {}
    embed_path = os.path.join(EMBED_DIR, 'crawl-300d-2M.vec')

    # One text-mode handle owned by `with`; the extra binary handle that used to
    # be opened and immediately overwritten was never closed.
    with io.open(embed_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Consume the two-integer header line so the loop below only sees vectors.
        _n_vectors, _dim = map(int, f.readline().split())

        vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed_fasttext.pkl'))
        vocab_size = len(vocab)

        # Initialize uniform the vector considering the Tanh activation
        embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, EMBEDDING_DIM))
        # NOTE(review): row 0 is zeroed here but is overwritten by the first
        # vocabulary entry in the fill loop below - confirm that entry is the padding token.
        embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

        loop = tqdm(f)
        loop.set_description("Loading FastText")
        # Iterating the tqdm wrapper already advances the bar; a manual
        # `update(1)` per line would count every line twice.
        for line in loop:
            tokens = line.rstrip().split(' ')
            embeddings_index[tokens[0]] = np.asarray(list(map(float, tokens[1:])), dtype='float32')
        loop.close()

    print('Total %s word vectors in FastText 42B 300d.' % len(embeddings_index))

    # Manual-mode bar: there is no iterable to wrap, so it is advanced explicitly.
    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    for row, (word, embed) in enumerate(vocab.items()):
        if word in embeddings_index:
            embedding_matrix[row] = embeddings_index[word]
        else:
            embedding_matrix[row] = np.asarray(embed, dtype='float32')
        loop.update(1)
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [25]:
%%time

generating_embed(baseline, EMBED_DIR=EMBED_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

vocabulary loaded


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1999995 word vectors in FastText 42B 300d.


HBox(children=(IntProgress(value=0, max=92499), HTML(value='')))


CPU times: user 2min 1s, sys: 3.5 s, total: 2min 5s
Wall time: 2min 2s


## Experiment

## Propose

https://github.com/tqtg/DuplicateBugFinder

In [26]:
from keras.initializers import RandomUniform, RandomNormal, Ones

### Embedding layer

In [27]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable, name):
    """Create a Keras `Embedding` layer initialised from a pretrained matrix.

    Args:
        embeddings: initial weight matrix, passed as the layer's only weight.
        num_words: input dimension (vocabulary size) of the layer.
        embedding_dim: output dimension of the layer.
        max_sequence_length: accepted but not used; `input_length` is left as None.
        trainable: whether the embedding weights are updated during training.
        name: suffix of the layer name ('embedding_layer_<name>').

    Returns:
        The configured `Embedding` layer (not yet applied to any input).
    """
    layer_options = dict(
        name='embedding_layer_{}'.format(name),
        weights=[embeddings],
        # Max-norm constraint of 1 applied along axis 0 of the weight matrix
        embeddings_constraint=MaxNorm(max_value=1, axis=0),
        input_length=None,
        trainable=trainable,
    )
    return Embedding(num_words, embedding_dim, **layer_options)

### CNN Dilated

In [28]:
from keras.constraints import max_norm
from keras.layers import MaxPooling1D
import math

def DC_CNN_Block(nb_filter, filter_length, dilation, l2_layer_reg):
    """Return a function that applies one dilated causal convolution block.

    Args:
        nb_filter: number of convolution filters.
        filter_length: convolution kernel size.
        dilation: dilation rate of the convolution.
        l2_layer_reg: accepted but not used (the regulariser is disabled).

    Returns:
        A callable mapping a rank-3 tensor to `(block_out, skip_out)`:
        `block_out` is the input plus a linear projection of the tanh-activated
        convolution, `skip_out` is a second linear projection of the same
        activation. Both projections have as many units as the input's last axis.
    """
    def block(block_input):
        # Size of the input's last axis; both projections map back to it so the
        # residual addition is shape-compatible.
        n_channels = K.int_shape(block_input)[2]

        conv_out = Conv1D(
            filters=nb_filter,
            kernel_size=filter_length,
            dilation_rate=dilation,
            activation='linear',
            padding='causal',
            use_bias=False,
        )(block_input)
        activated = Activation('tanh')(conv_out)

        skip_out = Dense(n_channels, activation='linear', use_bias=False)(activated)
        projected = Dense(n_channels, activation='linear', use_bias=False)(activated)

        return Add()([block_input, projected]), skip_out
    return block

def cnn_dilated_model(units, number_of_layers, embedding_layer, title_layer, max_sequence_length, name):
    """Build a feature extractor made of stacked dilated causal convolution blocks.

    Args:
        units: number of filters in every `DC_CNN_Block`.
        number_of_layers: number of stacked blocks; block `i` (1-based) uses
            dilation 2**i.
        embedding_layer: layer applied to the integer sequence input.
        title_layer: optional model whose `.input` / `.output` feed a parallel
            stack of blocks (kernel size 3) merged into the main stack; pass
            None to disable it.
        max_sequence_length: length of the input sequence.
        name: suffix for the input and model names.

    Returns:
        A Keras `Model` producing a 300-unit tanh feature vector. Its inputs are
        `[sequence_input]`, or `[sequence_input, title_input]` when
        `title_layer` is given.
    """
    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput_CNND_{}'.format(name))
    embedded_sequences = embedding_layer(sequence_input)

    # Decide once, with an identity test. The original re-tested `!= None` on a
    # name that had been rebound to a tensor, where `!=` is not a reliable
    # "is it missing" check.
    has_title = title_layer is not None
    if has_title:
        title_input = title_layer.input
        la_title = title_layer.output

    la = embedded_sequences
    skip_outputs, title_skip_outputs = [], []
    for index in range(1, number_of_layers + 1):
        dilation = int(math.pow(2, index))
        # Description branch
        la, lb = DC_CNN_Block(units, 5, dilation, 0.01)(la)
        # Title branch: its skip output is added into the description skip output
        if has_title:
            la_title, lb_title = DC_CNN_Block(units, 3, dilation, 0.01)(la_title)
            lb = Add()([lb_title, lb])
        skip_outputs.append(lb)
        if has_title:
            title_skip_outputs.append(lb_title)

    attention_layer = Add()(skip_outputs)
    if has_title:
        title_attention = Add()(title_skip_outputs)
        attention_layer = Add()([attention_layer, title_attention])

    layer = Activation('tanh')(attention_layer)
    layer = GlobalAveragePooling1D()(layer)
    layer = Dense(300, activation='tanh')(layer)

    inputs = [sequence_input, title_input] if has_title else [sequence_input]

    return Model(inputs=inputs, outputs=[layer],
                 name='FeatureCNNDilatedGenerationModel_{}'.format(name))

### CNN with filter 3,4,5

In [29]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D, TimeDistributed

def cnn_model(embedding_layer, title_input, title_layer, max_sequence_length):
    """Build a multi-kernel CNN feature extractor (kernel sizes 3, 4 and 5).

    Each kernel size gets a 64-filter `Conv1D` followed by max pooling with a
    pool size equal to the kernel size; the pooled sequences are concatenated
    along the time axis.

    Args:
        embedding_layer: layer applied to the integer sequence input.
        title_input: input tensor added to the model inputs when `title_layer`
            is given.
        title_layer: optional tensor that is flattened and concatenated with the
            pooled convolution features; pass None to disable it.
        max_sequence_length: length of the input sequence.

    Returns:
        A Keras `Model` named 'FeatureCNNGenerationModel' producing a 300-unit
        tanh feature vector.
    """
    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput_CNN')
    embedded_sequences = embedding_layer(sequence_input)

    # Decide once, with an identity test: `title_layer` is rebound to a tensor
    # below, and `tensor != None` is not a reliable "is it missing" check.
    has_title = title_layer is not None

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    filter_sizes = [3, 4, 5]
    n_filters = 64

    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=n_filters, kernel_size=filter_size)(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=filter_size)(l_conv)
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    if has_title:
        # Extra convolution over the merged features, pooled and joined with the flattened title
        conv = Conv1D(filters=32, kernel_size=5)(l_merge)
        title_layer = Flatten()(title_layer)
        layer = GlobalAveragePooling1D()(conv)
        layer = Concatenate()([layer, title_layer])
    else:
        layer = GlobalAveragePooling1D()(l_merge)
    layer = Dense(300, activation='tanh')(layer)

    inputs = [sequence_input, title_input] if has_title else [sequence_input]

    return Model(inputs=inputs, outputs=[layer], name='FeatureCNNGenerationModel')

### Bi-LSTM

In [30]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D, Permute, Dot

def bilstm_model(embedding_layer, max_sequence_length):
    """Build the recurrent feature extractor and also return its per-step tensor.

    A 50-unit LSTM reads the embedded sequence; a second 50-unit LSTM with
    `go_backwards=True` reads the first LSTM's output sequence. Both sequences
    are added, reduced to one value per time step by `TimeDistributed(Dense(1))`,
    flattened, and projected to 300 tanh units.

    NOTE(review): the second LSTM consumes the first LSTM's output rather than
    the embeddings, and Keras documents `go_backwards` as returning the reversed
    sequence, so the addition may pair different time steps - confirm this is intended.

    Args:
        embedding_layer: layer applied to the integer sequence input.
        max_sequence_length: length of the input sequence.

    Returns:
        `(model, per_step)`: the Keras `Model` named 'FeatureLstmGenerationModel'
        and the tensor produced by the `TimeDistributed(Dense(1))` layer.
    """
    lstm_units = 50

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded = embedding_layer(sequence_input)

    forward_seq = LSTM(lstm_units, return_sequences=True)(embedded)
    backward_seq = LSTM(lstm_units, return_sequences=True, go_backwards=True)(forward_seq)

    summed = Add()([forward_seq, backward_seq])
    per_step = TimeDistributed(Dense(1))(summed)

    features = Dense(300, activation='tanh')(Flatten()(per_step))

    lstm_feature_model = Model(inputs=[sequence_input], outputs=[features],
                               name='FeatureLstmGenerationModel')
    return lstm_feature_model, per_step

### MLP

In [31]:
def mlp_model(input_size):
    """Build a one-layer perceptron over the categorical ('info') features.

    Args:
        input_size: number of input columns.

    Returns:
        A Keras `Model` named 'FeatureMlpGenerationModel' mapping the input to a
        300-unit tanh feature vector.
    """
    hidden_units = 300

    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    features = Dense(hidden_units, activation='tanh')(info_input)

    return Model(inputs=[info_input], outputs=[features], name='FeatureMlpGenerationModel')

### Siamese model

In [32]:
from keras import backend as K
import tensorflow as tf

'''
    Some loss ideas
    hinge loss Kullback-Leibler
    https://stackoverflow.com/questions/53581298/custom-combined-hinge-kb-divergence-loss-function-in-siamese-net-fails-to-genera
'''

def normalize(x, axis):
    """Return `x` unchanged together with its L2 norm along `axis`.

    The norm is reduced without keeping `axis` and is clamped from below at
    `K.epsilon()` so callers can divide by it. `x` itself is NOT rescaled.
    """
    squared_sum = K.sum(K.square(x), axis=axis, keepdims=False)
    return x, K.maximum(K.sqrt(squared_sum), K.epsilon())
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    """Mean rescaled cosine similarity between two tensors.

    Despite the name, the value computed is a similarity: the cosine of the
    angle along the last axis, mapped from [-1, 1] to [0, 1] by `(cos + 1) / 2`
    (1 means same direction). The result is then averaged over the last
    remaining axis - presumably the batch axis for rank-2 inputs (TODO confirm).

    Args:
        inputs: pair `(x, y)` of tensors with matching shapes.

    Returns:
        The averaged, rescaled cosine similarity.
    """
    left, right = inputs
    left, left_norm = normalize(left, axis=-1)
    right, right_norm = normalize(right, axis=-1)
    cosine = K.sum(left * right, axis=-1) / (left_norm * right_norm)
    rescaled = (cosine + K.constant(1)) / K.constant(2)
    return K.mean(rescaled, axis=-1, keepdims=False)

def euclidean_distance(vects):
    """Mean Euclidean-based similarity between two tensors.

    The Euclidean distance is taken along axis 1 (the squared sum is clamped at
    `K.epsilon()` before the square root) and turned into a similarity with
    `1 / (1 + distance)`; the mean over all elements is returned.
    See https://stats.stackexchange.com/questions/53068/euclidean-distance-score-and-similarity

    Args:
        vects: pair `(x, y)` of tensors with matching shapes.

    Returns:
        Scalar tensor with the mean similarity.
    """
    x, y = vects
    squared_sum = K.sum(K.square(x - y), axis=1, keepdims=True)
    distance = K.sqrt(K.maximum(squared_sum, K.epsilon()))
    similarity = K.constant(1) / (K.constant(1) + distance)
    return K.mean(similarity, keepdims=False)

# https://jdhao.github.io/2017/03/13/some_loss_and_explanations/
def triplet_loss(y_true, y_pred):
    """Hinge triplet loss `mean(max(0, pos - neg + 1))`.

    `y_pred[0]` is read as the positive term and `y_pred[1]` as the negative
    term; the loss is zero once `neg` exceeds `pos` by the margin of 1.
    `y_true` is ignored.
    """
    margin = K.constant(1.0)
    pos, neg = y_pred[0], y_pred[1]
    hinge = K.maximum(0.0, pos - neg + margin)
    return K.mean(hinge)

def custom_margin_loss(y_true, y_pred):
    """Max-margin loss `mean(max(0, 1 - pos + neg))`.

    `y_pred[0]` is read as the positive term and `y_pred[1]` as the negative
    term; the loss is zero once `pos` exceeds `neg` by the margin of 1.
    `y_true` is ignored.
    """
    margin = K.constant(1.0)
    pos, neg = y_pred[0], y_pred[1]
    hinge = K.maximum(0.0, margin - pos + neg)
    return K.mean(hinge, keepdims=False)

# https://www.kaggle.com/c/quora-question-pairs/discussion/33631
# https://www.researchgate.net/figure/Illustration-of-triplet-loss-contrastive-loss-for-negative-samples-and-binomial_fig2_322060548
def contrastive_loss(y_true, y_pred):
    '''Contrastive-style loss after Hadsell et al. '06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Computes `mean(pos * neg**2 + (1 - pos) * max(1 - neg, 0)**2)` with
    `pos = y_pred[0]` and `neg = y_pred[1]`. `y_true` is ignored.

    NOTE(review): the referenced formulation weights the two terms by the label;
    here `y_pred[0]` takes that role - confirm this is intended.
    '''
    margin = 1
    pos, neg = y_pred[0], y_pred[1]
    attract = pos * K.square(neg)
    repel = (1 - pos) * K.square(K.maximum(margin - neg, 0))
    return K.mean(attract + repel)

def pos_distance(y_true, y_pred):
    """Metric: report the positive term, i.e. element 0 of `y_pred` (`y_true` is ignored)."""
    positive_term = y_pred[0]
    return positive_term

def neg_distance(y_true, y_pred):
    """Metric: report the negative term, i.e. element 1 of `y_pred` (`y_true` is ignored)."""
    negative_term = y_pred[1]
    return negative_term

def stack_tensors(vects):
    """Stack a list of tensors along a new last axis so they travel as one model output."""
    stacked = K.stack(vects, axis=-1)
    return stacked

#### Propose

In [33]:
from keras.initializers import TruncatedNormal
from keras.regularizers import l2
from keras.layers import Average, Dot, Maximum, Permute, Reshape

def residual_bug():
    """Return a function applying one dense residual block to a rank-2 tensor.

    The returned callable maps `block_input` to `(block_out, skip_out)`:
    `block_out` is the input plus a linear `Dense` projection of it, and
    `skip_out` is a second, independent linear `Dense` projection. Both
    projections keep the input width (axis 1).
    """
    def block(block_input):
        width = K.int_shape(block_input)[1]

        projected = Dense(width)(block_input)
        skip_out = Dense(width)(block_input)

        return Add()([block_input, projected]), skip_out
    return block

In [34]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum, Subtract, Average, AveragePooling1D
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
    """Build the encoder for one bug: title, description and categorical features concatenated.

    Args:
        title_feature_model: model applied to the title input.
        desc_feature_model: model applied to the description input.
        categorical_feature_model: model applied to the categorical ('info') input.
        sequence_length_info: width of the categorical input.
        sequence_length_t: length of the title input.
        sequence_length_d: length of the description input.
        name: suffix for the input names ('title_', 'desc_', 'info_') and for
            the merge layer / model name ('merge_features_<name>').

    Returns:
        A Keras `Model` with inputs `[title, desc, info]` whose output is the
        concatenation, in this order, of the info, title and description features.
    """
    inputs_by_field = [
        Input(shape=(length, ), name='{}_{}'.format(prefix, name))
        for prefix, length in (('title', sequence_length_t),
                               ('desc', sequence_length_d),
                               ('info', sequence_length_info))
    ]
    bug_t, bug_d, bug_i = inputs_by_field

    title_features = title_feature_model(bug_t)
    desc_features = desc_feature_model(bug_d)
    info_features = categorical_feature_model(bug_i)

    merged_name = 'merge_features_{}'.format(name)
    merged = concatenate([info_features, title_features, desc_features], name=merged_name)

    return Model(inputs=[bug_t, bug_d, bug_i], outputs=[merged], name=merged_name)

In [35]:
from keras.layers import Average

def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, 
                             master_anchor, master_negative, master_positive, decay_lr=1):
    """Assemble and compile the similarity model trained with `custom_margin_loss`.

    Four (positive, negative) pairs of `cosine_distance` values are computed:
    anchor vs. positive/negative bug, anchor vs. positive/negative master,
    positive bug vs. positive/negative master, and negative bug vs. its own
    (negative) master / the positive master. Each pair is stacked; the three
    master pairs are averaged, and that average is averaged with the bug pair
    to form the single model output.

    Args:
        encoded_anchor, encoded_positive, encoded_negative: encoder models of the triplet.
        master_anchor: encoder model of the anchor's master. Only its inputs
            are wired into the model; its output takes part in no distance.
        master_negative, master_positive: encoder models of the masters.
            Note the parameter order (negative before positive) differs from
            the input order below.
        decay_lr: multiplier applied to the base learning rate of 1e-3.

    Returns:
        The compiled Keras `Model` named 'Similarity_Model'. Its inputs are the
        inputs of anchor, positive, negative, master anchor, master positive
        and master negative, in that order.
    """
    def _input_tensors(model):
        # `Model.input` is a list for multi-input models and a single tensor otherwise.
        tensors = model.input
        return list(tensors) if isinstance(tensors, (list, tuple)) else [tensors]

    def _similarity(layer_name, left, right):
        return Lambda(cosine_distance, name=layer_name, output_shape=[1])([left, right])

    def _stack_pair(layer_name, pos_d, neg_d):
        # Loss function only works with a single output
        return Lambda(stack_tensors, name=layer_name, output_shape=(2, 1))([pos_d, neg_d])

    # Plain list concatenation: symbolic tensors are kept out of NumPy, which
    # would otherwise try to coerce them into an array.
    inputs = []
    for model in (encoded_anchor, encoded_positive, encoded_negative,
                  master_anchor, master_positive, master_negative):
        inputs.extend(_input_tensors(model))

    anchor = encoded_anchor.output
    positive = encoded_positive.output
    negative = encoded_negative.output
    m_negative = master_negative.output
    m_positive = master_positive.output

    # Distance bugs
    positive_d = _similarity('pos_cosine_distance', anchor, positive)
    negative_d = _similarity('neg_cosine_distance', anchor, negative)
    # Distance masters anchor
    master_anchor_positive_d = _similarity('pos_master_cosine_distance', anchor, m_positive)
    master_anchor_negative_d = _similarity('neg_master_cosine_distance', anchor, m_negative)
    # Distance master positive
    master_pos_positive_d = _similarity('pos_master_pos_cosine_distance', positive, m_positive)
    master_pos_negative_d = _similarity('neg_master_pos_cosine_distance', positive, m_negative)
    # Distance master negative (the negative bug's own master plays the "positive" role)
    master_neg_positive_d = _similarity('pos_master_neg_cosine_distance', negative, m_negative)
    master_neg_negative_d = _similarity('neg_master_neg_cosine_distance', negative, m_positive)

    output_bug = _stack_pair('stack-distances-bug', positive_d, negative_d)
    output_master = _stack_pair('stack-distances-master-anchor',
                                master_anchor_positive_d, master_anchor_negative_d)
    output_master_pos = _stack_pair('stack-distances-master-pos',
                                    master_pos_positive_d, master_pos_negative_d)
    output_master_neg = _stack_pair('stack-distances-master-neg',
                                    master_neg_positive_d, master_neg_negative_d)

    # The bug pair weighs as much as the three master pairs together.
    output_avg_master = Average()([output_master, output_master_pos, output_master_neg])
    output = Average()([output_bug, output_avg_master])

    similarity_model = Model(inputs = inputs, outputs = [output], name = 'Similarity_Model')

    optimizer = Adam(lr=1e-3 * decay_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    # setup the optimization process
    similarity_model.compile(optimizer=optimizer, loss=custom_margin_loss, 
                                 metrics=[pos_distance, neg_distance])

    return similarity_model

In [36]:
#%%time
# Build the triplet similarity model (bug encoders + master encoders), train it one batch
# per loop iteration, evaluate retrieval recall on the last iteration, and save both the
# similarity model and the anchor encoder.
import keras

import os
# Presumably meant to force CPU execution by hiding every CUDA device.
# NOTE(review): keras was already imported in an earlier cell; confirm that setting the
# variable this late still takes effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Inspired by https://pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
# Discard Keras state left over from a previous run of this cell.
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings: both layers use the same pre-computed matrix and are frozen; they differ
# only in sequence length and name.
desc_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix,
                                       num_words=len(baseline.embedding_matrix),
                                       embedding_dim=EMBEDDING_DIM,
                                       max_sequence_length=MAX_SEQUENCE_LENGTH_D,
                                       trainable=False, name='desc')
title_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix,
                                        num_words=len(baseline.embedding_matrix),
                                        embedding_dim=EMBEDDING_DIM,
                                        max_sequence_length=MAX_SEQUENCE_LENGTH_T,
                                        trainable=False, name='title')

# Feature models. Builders listed by the original author as candidates:
#     cnn_dilated_model, arcii_model, cnn_model, lstm_model, bilstm_model
title_feature_model, title_layer = bilstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
desc_feature_model = cnn_model(desc_embedding_layer, None, None, MAX_SEQUENCE_LENGTH_D)
#title_feature_model = cnn_dilated_model(50, 6, title_embedding_layer, None, MAX_SEQUENCE_LENGTH_T, 'title')
#desc_feature_model = cnn_dilated_model(128, 6, desc_embedding_layer, title_feature_model, MAX_SEQUENCE_LENGTH_D, 'desc')
categorical_feature_model = mlp_model(number_of_columns_info)


def _build_encoder(name):
    """Return a siamese encoder over the shared feature models, tagged with `name`."""
    return siamese_model(title_feature_model, desc_feature_model, categorical_feature_model,
                         number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, name)


# Similarity model (creation order kept as in the original cell).
encoded_anchor = _build_encoder('in')
encoded_positive = _build_encoder('pos')
encoded_negative = _build_encoder('neg')
# Master model
master_anchor = _build_encoder('master_in')
master_positive = _build_encoder('master_pos')
master_negative = _build_encoder('master_neg')

# NOTE(review): master_negative is passed before master_positive here; the signature of
# max_margin_objective is not visible from this cell, so confirm the order is intended.
similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative,
                                        master_anchor, master_negative, master_positive, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


# Configuration
epochs = 100
# Start from +inf so that the first observed loss always becomes the best one, even when
# the loss is >= 1.
best_loss = float('inf')
best_epoch = 0
verbose = 0
# Placeholder so the final summary still prints if the loop body never runs.
recall = float('nan')

# Experiment: each "epoch" below draws one batch and performs a single train_on_batch step.
for epoch in range(epochs):
    (batch_triplet_train,
     train_input_sample, train_input_pos, train_input_neg,
     train_master_input, train_master_neg,
     train_sim) = batch_iterator(baseline, baseline.train_data, baseline.dup_sets_train,
                                 bug_train_ids, batch_size, 1, issues_by_buckets)

    # Flatten to [title, description, info] per input group, in the same order as before.
    # NOTE(review): train_master_input is fed twice (second and third master slots are
    # master input and master neg); verify this matches the input order of similarity_model.
    train_batch = [group[field]
                   for group in (train_input_sample, train_input_pos, train_input_neg,
                                 train_master_input, train_master_input, train_master_neg)
                   for field in ('title', 'description', 'info')]

#     if epoch == 10:
#         similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=0.1)

    # h[0] is the loss; h[1] and h[2] are the two metrics the model was compiled with.
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)

    message = "Epoch: {} Loss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}".format(
        epoch+1, h[0], h[1], h[2])
    if epoch+1 == epochs:  # (epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        # `_` is deliberately kept as the target name: a later cell reads `_[:20]`.
        recall, _, debug = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor,
                                                               issues_by_buckets, bug_train_ids)
        message += ", recall@25: {:.2f}".format(recall)
    print(message)

    loss = h[0]
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
info_pos (InputLayer)           (None, 1682)         0                                            
__________________________________________________________________________________________________
title_pos 

Epoch: 1 Loss: 0.91, pos_cosine: 0.90, neg_cosine: 0.82
Epoch: 2 Loss: 0.93, pos_cosine: 0.89, neg_cosine: 0.82
Epoch: 3 Loss: 0.95, pos_cosine: 0.89, neg_cosine: 0.84
Epoch: 4 Loss: 0.96, pos_cosine: 0.90, neg_cosine: 0.86
Epoch: 5 Loss: 0.92, pos_cosine: 0.90, neg_cosine: 0.82
Epoch: 6 Loss: 0.94, pos_cosine: 0.90, neg_cosine: 0.84
Epoch: 7 Loss: 0.91, pos_cosine: 0.90, neg_cosine: 0.81
Epoch: 8 Loss: 0.93, pos_cosine: 0.90, neg_cosine: 0.83
Epoch: 9 Loss: 0.90, pos_cosine: 0.91, neg_cosine: 0.81
Epoch: 10 Loss: 0.91, pos_cosine: 0.92, neg_cosine: 0.83
Epoch: 11 Loss: 0.91, pos_cosine: 0.91, neg_cosine: 0.82
Epoch: 12 Loss: 0.94, pos_cosine: 0.90, neg_cosine: 0.84
Epoch: 13 Loss: 0.87, pos_cosine: 0.90, neg_cosine: 0.77
Epoch: 14 Loss: 0.89, pos_cosine: 0.89, neg_cosine: 0.78
Epoch: 15 Loss: 0.92, pos_cosine: 0.91, neg_cosine: 0.83
Epoch: 16 Loss: 0.91, pos_cosine: 0.91, neg_cosine: 0.82
Epoch: 17 Loss: 0.96, pos_cosine: 0.90, neg_cosine: 0.86
Epoch: 18 Loss: 0.93, pos_cosine: 0.92, 

In [37]:
# Display the recall@25 value returned by the evaluation on the final training iteration.
recall

0.2

In [38]:
# `_` here is the second value unpacked from evaluate_validation_test in the training cell
# (a user assignment to `_` takes precedence over IPython's last-output variable).
# Preview its first 20 entries.
_[:20]

['327681:324658|382274:0.9399709813296795,361301:0.9398491233587265,397407:0.938922606408596,364809:0.937847413122654,358912:0.9373695477843285,398885:0.9371131956577301,323220:0.9368166476488113,358938:0.9290433377027512,329254:0.928930789232254,296664:0.9220841601490974,421647:0.9196240156888962,351068:0.9189987406134605,401574:0.9182261228561401,363830:0.9159024655818939,321346:0.9151357784867287,340903:0.914868175983429,393566:0.9140876084566116,378868:0.9105135351419449,359609:0.908665731549263,394186:0.9084839820861816,407704:0.9084819778800011,421713:0.907542034983635,401029:0.9047767594456673,366850:0.9045613408088684,323040:0.9042341634631157,355440:0.9035104140639305,366294:0.9029841423034668,339356:0.9015799090266228,382193:0.9014610052108765',
 '324658:327681|361734:0.8739428222179413,358287:0.8705032467842102,370853:0.8679444342851639,360484:0.8657918274402618,378700:0.863791361451149,360929:0.8617623448371887,361575:0.859798014163971,355014:0.8596422523260117,357184:0.858

In [39]:
# recall, exported_rank, debug = experiment.evaluate_validation_test(experiment, retrieval, verbose, 
#                                                         encoded_anchor, issues_by_buckets, evaluate_validation_test)
# test_vectorized, queries_test_vectorized, annoy, X_test, distance_test, indices_test = debug
# "recall@25 last epoch:", recall

### Retrieval evaluation

In [40]:
# Report how many entries retrieval.test holds, i.e. the number of queries to evaluate.
print("Total of queries:", len(retrieval.test))

Total of queries: 4641


#### Loading the trained model

In [41]:
# Name under which the anchor encoder was saved, with the epoch count substituted in.
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'propose_feature_100epochs_64batch(eclipse)'

In [42]:
# Resolve the epoch-specific name of the saved feature model, then ask the experiment
# for the corresponding vectorizer model (presumably loaded from disk; confirm).
feature_model_path = SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))
model = experiment.get_model_vectorizer(path=feature_model_path)



In [43]:
# Print the layer-by-layer summary of the model obtained in the previous cell.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          504900      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [44]:
# Re-run the retrieval evaluation with the model from get_model_vectorizer. The literal 0
# occupies the position where the training cell passed `verbose`.
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 0, model, issues_by_buckets, bug_train_ids)

In [45]:
# Destination of the exported ranking: a per-method text file inside the processed-data directory.
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/eclipse/exported_rank_propose_master_triplet_loss.txt'

In [46]:
# Persist the ranking, one row per line (the file is truncated if it already exists).
with open(EXPORT_RANK_PATH, 'w') as rank_file:
    rank_file.writelines(rank_row + "\n" for rank_row in exported_rank)

In [47]:
# Evaluate the ranking file written above and display the resulting report.
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.14,
 '2 - recall_at_10': 0.16,
 '3 - recall_at_15': 0.17,
 '4 - recall_at_20': 0.19,
 '5 - recall_at_25': 0.2}