# Propose Master Triplet Loss

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
# Put the parent of the current working directory on the import path
# (presumably so the project-local `methods` package imported below resolves).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Auxiliary methods

## Global settings

In [6]:
# Global hyper-parameters. The two sequence lengths are passed to Baseline(...)
# in a later cell and are overwritten afterwards from the validation batch shapes.
MAX_SEQUENCE_LENGTH_T = 50 # title sequence length; author's alternative value: 40
MAX_SEQUENCE_LENGTH_D = 50 # description sequence length; author's alternative value: 200
EMBEDDING_DIM = 300  # column count of the embedding matrix built by generating_embed
MAX_NB_WORDS = 20000  # NOTE(review): not referenced by any active code in the visible cells

### Parse preprocessed bugs

In [7]:
# --- Experiment identity ----------------------------------------------------
DOMAIN = 'eclipse'
METHOD = 'propose_master_triplet_loss'

# --- Input locations (relative to the working directory) ---------------------
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# The dataset CSV sits in the normalized directory and is named after the domain.
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))
# Directory holding the pre-trained embedding files.
EMBED_DIR = 'data/embed'

# --- Output names; '@number_of_epochs@' is left as a literal placeholder here --
SAVE_PATH = 'propose_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'propose_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

# Flag for corpus extraction (not read by any cell visible in this part).
EXTRACT_CORPUS = False

In [8]:
# Instantiate the project-local helpers. `baseline` receives the data paths and
# the title/description sequence lengths; `experiment` wraps baseline + evaluation.
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [9]:
# Attach the retrieval helper to the experiment for this domain; the recorded
# output shows this step printing "Creating the buckets...".
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))




#### Loading bug ids in memory

In [10]:
# Load the bug ids, then display how many ids `baseline.bug_ids` holds
# (presumably populated by load_ids(); the recorded run shows 361006).
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


361006

#### Dictionary of titles and descriptions

In [11]:
%%time

# Load the bug records, then display the size of `baseline.sentence_dict`
# (presumably filled by load_bugs() -- confirm in methods.experiments).
experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 40.8 s, sys: 4.17 s, total: 45 s
Wall time: 42.8 s


#### Hashing bugs by buckets

In [12]:
# Mapping indexed later as issues_by_buckets[bug_id]; batch_iterator uses the
# looked-up value as the id of the bug's "master" record.
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




#### Prepare the train and test

In [None]:
%%time

# Prepare the train/test split from the named 'train_chronological' and
# 'test_chronological' sources (presumably file names resolved by the project code).
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

Reading train data


#### Recover the bug ids from the training set

In [None]:
# Ids of the bugs that occur in the training data; later passed to the batch
# builder as the pool from which negative examples are drawn.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

#### Display a random bug

In [None]:
# Draw one bug id at random (no seed is set, so the pick changes per run)
# and display its record from `baseline.bug_set`.
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

### Generating the batch test

In [None]:
# Displays a ("Train ", count) tuple with the number of entries in dup_sets_train.
"Train ", len(baseline.dup_sets_train)

In [None]:
 # data - path
# batch_size - 128
# n_neg - 1

import random

def get_neg_bug(invalid_bugs, bug_ids, issues_by_buckets):
    """Pick a random bug id that can serve as a negative example.

    Parameters
    ----------
    invalid_bugs : collection of ids, or a single id
        Ids that must not be returned. A single id (including a string id)
        is treated as a one-element collection.
    bug_ids : sequence
        Pool that replacement candidates are drawn from.
    issues_by_buckets : dict
        Only ids that are keys of this mapping are accepted.

    Returns
    -------
    An id that is not in `invalid_bugs` and is a key of `issues_by_buckets`.

    Notes
    -----
    The loop does not terminate if `bug_ids` holds no acceptable id.
    """
    # Normalise a scalar id up-front. The original relied on a bare `except:`
    # to detect this case, which also swallowed unrelated errors, and a string
    # id was matched by substring (`'1' in '12'`), rejecting valid candidates.
    is_text = isinstance(invalid_bugs, (str, bytes))
    is_collection = hasattr(invalid_bugs, '__contains__') or hasattr(invalid_bugs, '__iter__')
    if is_text or not is_collection:
        invalid_bugs = [invalid_bugs]

    # The first candidate comes from the bucket keys, as in the original;
    # replacements are drawn from `bug_ids`.
    neg_bug = random.choice(list(issues_by_buckets.keys()))
    while neg_bug in invalid_bugs or neg_bug not in issues_by_buckets:
        neg_bug = random.choice(bug_ids)
    return neg_bug

def _empty_bug_batch():
    """Return a fresh accumulator with one list per bug field."""
    return {'title': [], 'desc': [], 'info': []}


def _as_model_input(batch):
    """Turn an accumulator into the dict of numpy arrays returned to the caller."""
    return {
        'title': np.array(batch['title']),
        'description': np.array(batch['desc']),
        'info': np.array(batch['info']),
    }


def batch_iterator(baseline, data, dup_sets, bug_train_ids, batch_size, n_neg, issues_by_buckets):
    """Build one batch of (anchor, positive, negative) bugs plus their masters.

    Parameters
    ----------
    baseline : object
        Must provide `bug_set` (id -> bug record), `get_neg_bug(invalid, ids)`
        and `read_batch_bugs(batch, bug)`.
    data : list
        Pairs whose first two items are (anchor id, positive id). The list is
        shuffled IN PLACE, exactly as the original did.
    dup_sets : dict
        anchor id -> ids that must not be picked as that anchor's negative.
    bug_train_ids : sequence
        Pool of ids from which negatives are drawn.
    batch_size : int
        Requested number of triplets. It is clamped to `len(data)`; the
        original raised IndexError when fewer pairs were available.
    n_neg : int
        Unused; kept for interface compatibility.
    issues_by_buckets : dict
        bug id -> id of its master bug.

    Returns
    -------
    tuple
        (batch_triplets, input_sample, input_pos, input_neg,
        master_input_sample, master_neg, sim). Each input is a dict with the
        keys 'title', 'description' and 'info'. Each entry of `batch_triplets`
        is [anchor id, positive id, negative id, master anchor record,
        master negative record]. `sim` is n//2 ones followed by n//2 zeros
        (one random 0/1 when n < 2), so for odd n it is one element shorter
        than the batch.
    """
    random.shuffle(data)

    batch_input, batch_pos, batch_neg = _empty_bug_batch(), _empty_bug_batch(), _empty_bug_batch()
    master_batch_input, master_batch_neg = _empty_bug_batch(), _empty_bug_batch()
    batch_triplets = []

    # Never index past the end of `data`.
    n_samples = min(batch_size, len(data))
    for offset in range(n_samples):
        anchor, pos = data[offset][0], data[offset][1]
        neg = baseline.get_neg_bug(dup_sets[anchor], bug_train_ids)

        # Master records of the anchor and of the negative.
        master_anchor = baseline.bug_set[issues_by_buckets[anchor]]
        master_neg = baseline.bug_set[issues_by_buckets[neg]]

        baseline.read_batch_bugs(batch_input, baseline.bug_set[anchor])
        baseline.read_batch_bugs(batch_pos, baseline.bug_set[pos])
        baseline.read_batch_bugs(batch_neg, baseline.bug_set[neg])
        baseline.read_batch_bugs(master_batch_input, master_anchor)
        baseline.read_batch_bugs(master_batch_neg, master_neg)

        batch_triplets.append([anchor, pos, neg, master_anchor, master_neg])

    # Similarity targets: first half 1, second half 0.
    n_half = len(batch_triplets) // 2
    if n_half > 0:
        ones = np.full((1, n_half), 1)
        zeros = np.full((1, n_half), 0)
        sim = np.concatenate([ones, zeros], -1)[0]
    else:
        sim = np.array([np.random.choice([1, 0])])

    return (batch_triplets,
            _as_model_input(batch_input),
            _as_model_input(batch_pos),
            _as_model_input(batch_neg),
            _as_model_input(master_batch_input),
            _as_model_input(master_batch_neg),
            sim)

### Train ids

In [None]:
# NOTE(review): same call as in the "Recovery bug ids from train" cell above;
# re-running it simply rebinds bug_train_ids.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [None]:
%%time

# Batch sizes: 64 is kept for later use, 128 is the size of the validation batch.
batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
# NOTE(review): the batch is drawn from baseline.train_data, which batch_iterator
# shuffles in place; no random seed is set, so the group differs between runs.
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, \
                            valid_master_sample, valid_master_neg, valid_sim = batch_iterator(baseline, baseline.train_data, 
                                                                                          baseline.dup_sets_train,
                                                                                          bug_train_ids,
                                                                                          batch_size_test, 1, issues_by_buckets)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]
# Max sequence title
# The two globals below are overwritten with the widths actually observed in
# the validation batch; the model-building cell reads these updated values.
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

In [None]:
# Display the shapes of the validation title, description, info and sim arrays.
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

### Validate the input

In [None]:
%%time 

# Sanity check of the training input through the project helper; the trailing 5
# is presumably the number of examples to show -- confirm in methods.baseline.
baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

## Pre-trained embeddings

Loading pretrained word vectors

### Fasttext

In [None]:
# Load the pickled vocabulary stored next to the processed data. generating_embed
# iterates it with .items(), so it is expected to be a dict-like word -> vector map.
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed_fasttext.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

In [None]:
# Display the number of entries in the loaded vocabulary.
"Total vocabulary: {}".format(len(vocab))

In [None]:
import io

def generating_embed(baseline, EMBED_DIR, EMBEDDING_DIM):
    """Build the embedding matrix for the dataset vocabulary.

    Reads every vector from `<EMBED_DIR>/crawl-300d-2M.vec`, then fills one
    matrix row per vocabulary entry: the pre-trained vector when the word is
    in that file, otherwise the vector stored with the word in the vocabulary.
    The result is stored on `baseline.embedding_matrix`; nothing is returned.

    Parameters
    ----------
    baseline : object
        Must provide `DIR` and `load_vocabulary(path)`, which has to return a
        mapping word -> vector (it is iterated with `.items()`).
    EMBED_DIR : str
        Directory that contains `crawl-300d-2M.vec`.
    EMBEDDING_DIM : int
        Number of columns of the matrix; vectors must have this length.

    Notes
    -----
    The whole vector file is kept in memory while the matrix is filled.
    """
    embed_path = os.path.join(EMBED_DIR, 'crawl-300d-2M.vec')
    embeddings_index = {}

    # One handle, closed even on error. The original also opened the file in
    # binary mode first and dropped that handle without closing it.
    with io.open(embed_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Header line: "<number of vectors> <dimension>".
        n_vectors, _dim = map(int, f.readline().split())

        vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed_fasttext.pkl'))
        vocab_size = len(vocab)

        # Uniform init in [-1, 1], chosen by the author with a tanh activation in mind.
        embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, EMBEDDING_DIM))
        embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

        # Iterating the wrapped file already advances the bar; the original
        # called update(1) on top of that and counted every line twice.
        loop = tqdm(f, total=n_vectors)
        loop.set_description("Loading FastText")
        for line in loop:
            tokens = line.rstrip().split(' ')
            embeddings_index[tokens[0]] = np.asarray(list(map(float, tokens[1:])), dtype='float32')
        loop.close()

    print('Total %s word vectors in FastText 42B 300d.' % len(embeddings_index))

    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    # NOTE(review): rows are assigned from index 0, so the zero row written
    # above is replaced by the first vocabulary entry -- confirm that this
    # entry is the padding token.
    for i, (word, embed) in enumerate(vocab.items()):
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]
        else:
            embedding_matrix[i] = np.asarray(embed, dtype='float32')
        loop.update(1)
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [25]:
%%time

# Build the embedding matrix and store it on baseline.embedding_matrix
# (about two minutes of wall time in the recorded run).
generating_embed(baseline, EMBED_DIR=EMBED_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS


Total 1999995 word vectors in FastText 42B 300d.


HBox(children=(IntProgress(value=0, max=173904), HTML(value='')))


CPU times: user 2min 9s, sys: 4.38 s, total: 2min 14s
Wall time: 2min 10s


## Experiment

## Propose

https://github.com/tqtg/DuplicateBugFinder

In [26]:
from keras.initializers import RandomUniform, RandomNormal, Ones

### Embedding layer

In [27]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable, name):
    """Create a Keras Embedding layer initialised from a pre-trained matrix.

    The layer is named 'embedding_layer_<name>', starts from `embeddings`, and
    constrains its weights with MaxNorm(max_value=1, axis=0).
    `max_sequence_length` is accepted but not used: `input_length` is left as
    None, so the layer does not fix the sequence length.
    """
    layer_options = dict(
        name='embedding_layer_{}'.format(name),
        weights=[embeddings],
        embeddings_constraint=MaxNorm(max_value=1, axis=0),
        input_length=None,
        trainable=trainable,
    )
    return Embedding(num_words, embedding_dim, **layer_options)

### CNN Dilated

In [28]:
from keras.constraints import max_norm
import math

def DC_CNN_Block(nb_filter, filter_length, dilation, l2_layer_reg):
    """Return a callable that applies one dilated causal convolution block.

    The returned `block(tensor)` yields `(block_out, skip_out)`:
    * `block_out` is the input plus a 1x1-convolved projection of the
      tanh-activated dilated convolution (residual path);
    * `skip_out` is a separate 1x1 convolution of the same activation.

    `l2_layer_reg` is accepted but currently not applied to any layer.
    """
    def block(block_input):
        # Bias-free dilated causal convolution followed by tanh.
        dilated = Conv1D(filters=nb_filter, kernel_size=filter_length,
                         dilation_rate=dilation,
                         activation='linear', padding='causal', use_bias=False)(block_input)
        activated = Activation('tanh')(dilated)

        # Two independent single-filter 1x1 projections: skip path first,
        # residual path second (same creation order as before).
        skip_out = Conv1D(1, 1, activation='linear', use_bias=False)(activated)
        projected = Conv1D(1, 1, activation='linear', use_bias=False)(activated)

        return Add()([block_input, projected]), skip_out
    return block

def cnn_dilated_model(embedding_layer, title_layer, max_sequence_length):
    """Build the dilated-CNN description encoder that also consumes the title model.

    Parameters
    ----------
    embedding_layer : keras layer
        Applied to the description token ids.
    title_layer : keras Model
        Title feature model; its `.input` becomes the second input of the
        returned model and its `.output` feeds a parallel stack of blocks.
    max_sequence_length : int
        Length of the description input.

    Returns
    -------
    keras Model with inputs [description ids, title input]. Its output is the
    tanh of the summed skip connections, averaged over the time axis.

    Notes
    -----
    Title and description skip outputs are added element-wise, so both
    sequences must have compatible lengths.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already imported this name.
    from keras.layers import GlobalAveragePooling1D

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput_CNND')
    embedded_sequences = embedding_layer(sequence_input)

    units = 128
    number_of_layers = 6

    title_input = title_layer.input
    title_stream = title_layer.output
    desc_stream = embedded_sequences

    merged_skips, title_skips = [], []
    for index in range(1, number_of_layers + 1):
        dilation = 2 ** index  # 2, 4, ..., 64
        # Description branch (kernel 5), then title branch (kernel 3).
        desc_stream, desc_skip = DC_CNN_Block(units, 5, dilation, 0.01)(desc_stream)
        title_stream, title_skip = DC_CNN_Block(units, 3, dilation, 0.01)(title_stream)
        merged_skips.append(Add()([title_skip, desc_skip]))
        title_skips.append(title_skip)

    # Sum the skips of all levels; the title skips are added a second time on
    # their own, as in the original.
    attention_layer = Add()(merged_skips)
    title_attention = Add()(title_skips)
    attention_layer = Add()([attention_layer, title_attention])

    layer = Activation('tanh')(attention_layer)
    layer = GlobalAveragePooling1D()(layer)

    cnn_dilated_feature_model = Model(inputs=[sequence_input, title_input], outputs=[layer],
                                      name='FeatureCNNDilatedGenerationModel')
    return cnn_dilated_feature_model

### CNN with filter 3,4,5

In [29]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):
    """Build a multi-width CNN text encoder (after Yoon Kim, arXiv:1408.5882).

    Three Conv1D branches (kernel sizes 3, 4, 5; 32 filters each) run over the
    embedded sequence. Each is average-pooled with a pool size equal to its
    kernel size, the results are concatenated along the time axis, and a
    global average over time gives the final 32-dimensional feature.

    Parameters
    ----------
    embedding_layer : keras layer
        Applied to the token-id input.
    max_sequence_length : int
        Length of the token-id input.

    Returns
    -------
    keras Model named 'FeatureCNNGenerationModel'.
    """
    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput_CNN')
    embedded_sequences = embedding_layer(sequence_input)

    filter_sizes = [3, 4, 5]
    n_filters = 32

    pooled_branches = []
    for filter_size in filter_sizes:
        branch = Conv1D(filters=n_filters, kernel_size=filter_size,
                        kernel_initializer='random_uniform',
                        bias_initializer='zeros')(embedded_sequences)
        pooled_branches.append(AveragePooling1D(pool_size=filter_size)(branch))

    l_merge = Concatenate(axis=1)(pooled_branches)

    # NOTE(review): the original also built Conv1D(96, 5)(l_merge) here but
    # pooled `l_merge` instead of that result, so the layer never reached the
    # model output. The dead layer is removed; the output is unchanged. Wire
    # it in deliberately if the extra convolution was intended.
    layer = GlobalAveragePooling1D()(l_merge)

    cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer],
                              name='FeatureCNNGenerationModel')
    return cnn_feature_model

### LSTM

In [30]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D

def lstm_model(embedding_layer, max_sequence_length):
    """Build an LSTM encoder that returns the full sequence of hidden states.

    The model maps token ids of length `max_sequence_length` through
    `embedding_layer` and a 100-unit LSTM with `return_sequences=True`, so no
    pooling is applied here.
    """
    number_lstm_units = 100
    # Dropout knobs kept from the original; neither is passed to the layer.
    rate_drop_lstm = 0
    recurrent_dropout = 0

    token_ids = Input(shape=(max_sequence_length, ), name='Feature_BugInput_LSTM')
    embedded = embedding_layer(token_ids)

    hidden_states = LSTM(number_lstm_units,
                         return_sequences=True,
                         kernel_initializer='random_uniform',
                         bias_initializer='zeros')(embedded)

    return Model(inputs=[token_ids], outputs=[hidden_states], name='FeatureLstmGenerationModel')

### Bi-LSTM

In [31]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D, Permute, Dot

def bilstm_model(embedding_layer, max_sequence_length):
    """Build a bidirectional recurrent encoder that returns per-step states.

    Despite the name, the recurrent cell is a GRU (50 units, tanh). The two
    directions are merged with merge_mode='ave' and the full sequence is
    returned, so callers pool it themselves.
    """
    number_lstm_units = 50
    # Dropout knobs kept from the original; neither is passed to the layer.
    rate_drop_lstm = 0
    recurrent_dropout = 0

    token_ids = Input(shape=(max_sequence_length, ), name='Feature_BugInput_bilstm')
    embedded = embedding_layer(token_ids)

    # Earlier experiments (plain LSTM, dot-product attention against the title,
    # pooling / Dense / GRU heads) were left disabled by the author.
    recurrent_cell = GRU(number_lstm_units, activation='tanh', return_sequences=True)
    encoder = Bidirectional(recurrent_cell, merge_mode='ave')
    states = encoder(embedded)

    return Model(inputs=[token_ids], outputs=[states], name='FeatureBiLstmGenerationModel')

### MLP

In [32]:
def mlp_model(input_size):
    """Build the encoder for the categorical ("info") vector.

    A single tanh Dense layer maps an input of width `input_size` to 300
    features.
    """
    hidden_units = 300

    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    features = Dense(hidden_units, activation='tanh')(info_input)

    return Model(inputs=[info_input], outputs=[features], name='FeatureMlpGenerationModel')

### Siamese model

In [33]:
from keras import backend as K
import tensorflow as tf

'''
    Some loss ideas
    hinge loss Kullback-Leibler
    https://stackoverflow.com/questions/53581298/custom-combined-hinge-kb-divergence-loss-function-in-siamese-net-fails-to-genera
'''

def normalize(x, axis):
    """Return `(x, norm)`: `x` unchanged and its L2 norm along `axis`.

    The norm is floored at K.epsilon() so it is safe to divide by. The reduced
    axis is dropped (keepdims=False).
    """
    sum_of_squares = K.sum(K.square(x), axis=axis, keepdims=False)
    safe_norm = K.maximum(K.sqrt(sum_of_squares), K.epsilon())
    return x, safe_norm
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    """Cosine similarity of two tensors, rescaled from [-1, 1] to [0, 1].

    Despite the name, larger values mean MORE similar vectors. The rescaled
    per-row value is then averaged over its last remaining axis, so for
    (batch, features) inputs the result is one scalar for the whole batch.
    """
    x, y = inputs
    _, x_norm = normalize(x, axis=-1)
    _, y_norm = normalize(y, axis=-1)
    cosine = K.sum(x * y, axis=-1) / (x_norm * y_norm)
    rescaled = (cosine + K.constant(1)) / K.constant(2)
    return K.mean(rescaled, axis=-1, keepdims=False)

def euclidean_distance(vects):
    """Similarity derived from the Euclidean distance, averaged to a scalar.

    The row-wise distance d (squared sum floored at K.epsilon() before the
    root) is mapped to 1 / (1 + d), see
    https://stats.stackexchange.com/questions/53068/euclidean-distance-score-and-similarity
    and the mean over all elements is returned.
    """
    x, y = vects
    squared_sum = K.sum(K.square(x - y), axis=1, keepdims=True)
    distance = K.sqrt(K.maximum(squared_sum, K.epsilon()))
    similarity = K.constant(1) / (K.constant(1) + distance)
    return K.mean(similarity, keepdims=False)

# https://jdhao.github.io/2017/03/13/some_loss_and_explanations/
def triplet_loss(y_true, y_pred):
    """Hinge on `pos - neg + 1`, where pos = y_pred[0] and neg = y_pred[1].

    `y_true` is ignored. This form penalises a positive score that is not at
    least one margin BELOW the negative score, i.e. it treats the scores as
    distances.
    """
    margin = K.constant(1.0)
    positive_score, negative_score = y_pred[0], y_pred[1]
    violation = positive_score - negative_score + margin
    return K.mean(K.maximum(0.0, violation))

def custom_margin_loss(y_true, y_pred):
    """Hinge on `1 - pos + neg`, where pos = y_pred[0] and neg = y_pred[1].

    `y_true` is ignored. The loss is zero once the positive score exceeds the
    negative score by at least the margin, i.e. it treats the scores as
    similarities.
    """
    margin = K.constant(1.0)
    positive_score, negative_score = y_pred[0], y_pred[1]
    violation = margin - positive_score + negative_score
    return K.mean(K.maximum(0.0, violation), keepdims=False)

# https://www.kaggle.com/c/quora-question-pairs/discussion/33631
# https://www.researchgate.net/figure/Illustration-of-triplet-loss-contrastive-loss-for-negative-samples-and-binomial_fig2_322060548
def contrastive_loss(y_true, y_pred):
    '''Contrastive-style loss modelled on Hadsell et al. '06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    NOTE(review): `y_true` is ignored; y_pred[0] takes the place of the label
    and y_pred[1] the place of the distance -- confirm this is intended.
    '''
    margin = 1
    label_like, distance_like = y_pred[0], y_pred[1]
    similar_term = label_like * K.square(distance_like)
    dissimilar_term = (1 - label_like) * K.square(K.maximum(margin - distance_like, 0))
    return K.mean(similar_term + dissimilar_term)

def pos_distance(y_true, y_pred):
    """Metric: the first entry of the prediction (the anchor/positive score)."""
    positive_score = y_pred[0]
    return positive_score

def neg_distance(y_true, y_pred):
    """Metric: the second entry of the prediction (the anchor/negative score)."""
    negative_score = y_pred[1]
    return negative_score

def stack_tensors(vects):
    """Stack the given tensors along a new last axis."""
    stacked = K.stack(vects, axis=-1)
    return stacked

#### Propose

In [34]:
from keras.initializers import TruncatedNormal
from keras.regularizers import l2
from keras.layers import Average, Dot, Maximum, Permute

def residual_bug():
    """Return a callable applying a residual refinement block to a 2-D feature tensor.

    `block(tensor)` expects a (batch, features) tensor and returns
    `(block_out, skip_out)`:
    * the features are viewed as a sequence of `features` steps of width 1,
      run through a 100-unit GRU, averaged over the steps and passed through
      Dense(50) and Dense(features), all with ReLU;
    * `block_out` is ReLU(input) plus that result;
    * `skip_out` is one more Dense(features) applied on top of that result.
    """
    # `Reshape` is not imported by any visible notebook cell; without this
    # import the block raises NameError the first time it is used.
    from keras.layers import Reshape

    def block(block_input):
        n_features = K.int_shape(block_input)[1]

        residual = Activation('relu')(block_input)

        # Treat every feature as one timestep of a length-`n_features` sequence.
        layer_out = Reshape((n_features, 1))(block_input)
        layer_out = GRU(100, activation='relu', return_sequences=True)(layer_out)
        layer_out = GlobalAveragePooling1D()(layer_out)
        layer_out = Dense(50, activation='relu')(layer_out)
        layer_out = Dense(n_features, activation='relu', use_bias=True,
                          kernel_initializer='random_uniform')(layer_out)
        skip_out = Dense(n_features, activation='relu', use_bias=True,
                         kernel_initializer='random_uniform')(layer_out)

        block_out = Add()([residual, layer_out])
        return block_out, skip_out
    return block

In [35]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum, Subtract, Average
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
    """Build one tower that encodes a bug (title, description, info) into a vector.

    The three feature models are shared objects passed in by the caller; only
    the Input layers, named 'title_<name>', 'desc_<name>' and 'info_<name>',
    are new per tower. The description model is called with both the
    description and the title inputs. The title model's sequence output is
    averaged over time, then info, title and description features are
    concatenated.

    Returns
    -------
    keras Model with inputs [title, description, info], named
    'merge_features_<name>'.
    """
    title_in = Input(shape=(sequence_length_t, ), name='title_{}'.format(name))
    desc_in = Input(shape=(sequence_length_d, ), name='desc_{}'.format(name))
    info_in = Input(shape=(sequence_length_info, ), name='info_{}'.format(name))

    title_states = title_feature_model(title_in)
    desc_features = desc_feature_model([desc_in, title_in])
    info_features = categorical_feature_model(info_in)

    title_features = GlobalAveragePooling1D()(title_states)

    # Residual blocks, dropout, batch-norm and extra Dense heads were tried
    # here by the author and are currently disabled; plain concatenation is used.
    tower_name = 'merge_features_{}'.format(name)
    merged = concatenate([info_features, title_features, desc_features], name=tower_name)

    return Model(inputs=[title_in, desc_in, info_in], outputs=[merged], name=tower_name)

In [36]:
from keras.layers import Average

def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, 
                             master_anchor, master_negative, decay_lr=1):
    """Assemble and compile the triplet similarity model over bugs and masters.

    Parameters
    ----------
    encoded_anchor, encoded_positive, encoded_negative : keras Model
        Towers for the anchor, positive and negative bug.
    master_anchor, master_negative : keras Model
        Towers for the master of the anchor and of the negative.
    decay_lr : number, default 1
        Multiplier applied to the base learning rate 1e-3.

    Returns
    -------
    keras Model named 'Similarity_Model', compiled with Adam (amsgrad) and
    `custom_margin_loss`. Its inputs are the inputs of the five towers, in the
    order listed above. Its output is the average of the stacked
    [positive, negative] scores of the bug pair and of the master pair.
    `pos_distance` / `neg_distance` are reported as metrics.
    """
    # Flatten the tower inputs with plain Python lists. The original pushed
    # the symbolic tensors through np.concatenate(...).tolist(), which only
    # works as long as numpy treats each tensor as an opaque object.
    inputs = []
    for tower in (encoded_anchor, encoded_positive, encoded_negative,
                  master_anchor, master_negative):
        tower_inputs = tower.input
        if not isinstance(tower_inputs, (list, tuple)):
            tower_inputs = [tower_inputs]
        inputs.extend(tower_inputs)

    encoded_anchor = encoded_anchor.output
    encoded_positive = encoded_positive.output
    encoded_negative = encoded_negative.output
    master_anchor = master_anchor.output
    master_negative = master_negative.output
    # The "positive master" is synthesised as anchor + positive encodings.
    master_positive = Add()([encoded_anchor, encoded_positive])

    # Scores between bugs
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([encoded_anchor, encoded_positive])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([encoded_anchor, encoded_negative])

    # Scores between masters
    master_positive_d = Lambda(cosine_distance, name='pos_master_cosine_distance', output_shape=[1])([master_anchor, master_positive])
    master_negative_d = Lambda(cosine_distance, name='neg_master_cosine_distance', output_shape=[1])([master_anchor, master_negative])

    # A Keras loss sees a single output, so each pair of scores is stacked and
    # the loss reads them back as y_pred[0] / y_pred[1].
    output_bug = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances-bug',
        output_shape=(2, 1)
    )([positive_d, negative_d])

    output_master = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances-master',
        output_shape=(2, 1)
    )([master_positive_d, master_negative_d])

    output = Average()([output_bug, output_master])

    similarity_model = Model(inputs = inputs, outputs = [output], name = 'Similarity_Model')

    optimizer = Adam(lr=1e-3 * decay_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    similarity_model.compile(optimizer=optimizer, loss=custom_margin_loss, 
                                 metrics=[pos_distance, neg_distance])

    return similarity_model

In [37]:
#%%time
import keras

# Inspired by https://pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs

# Reset the Keras backend session before any layer of this cell is created.
keras.backend.clear_session()

# Clear GPU memory (disabled)
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings: the description and title layers are built from the same
# embedding matrix with trainable=False; only the sequence length and the
# name differ between the two calls.
embedding_options = dict(embeddings=baseline.embedding_matrix,
                         num_words=len(baseline.embedding_matrix),
                         embedding_dim=EMBEDDING_DIM,
                         trainable=False)
desc_embedding_layer = embedding_layer(max_sequence_length=MAX_SEQUENCE_LENGTH_D,
                                       name='desc', **embedding_options)
title_embedding_layer = embedding_layer(max_sequence_length=MAX_SEQUENCE_LENGTH_T,
                                        name='title', **embedding_options)

# Feature models. Builders listed in the original note:
#   cnn_dilated_model, arcii_model, cnn_model, lstm_model, bilstm_model
title_feature_model = bilstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
desc_feature_model = cnn_dilated_model(desc_embedding_layer, title_feature_model, MAX_SEQUENCE_LENGTH_D)
categorical_feature_model = mlp_model(number_of_columns_info)


def _build_siamese_encoder(suffix):
    """Call siamese_model on the three feature models above.

    `suffix` is forwarded unchanged as the last positional argument; it appears
    to end up in the input layer names printed by the summary (title_in,
    title_pos, ...).
    """
    return siamese_model(title_feature_model, desc_feature_model, categorical_feature_model,
                         number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D,
                         suffix)


# Similarity model: anchor / positive / negative bug encoders
encoded_anchor = _build_siamese_encoder('in')
encoded_positive = _build_siamese_encoder('pos')
encoded_negative = _build_siamese_encoder('neg')
# Master model: anchor / negative master encoders
master_anchor = _build_siamese_encoder('master_in')
master_negative = _build_siamese_encoder('master_neg')

similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative,
                                        master_anchor, master_negative, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


# Configuration
# Number of iterations of the training loop below; each iteration draws one
# batch and calls train_on_batch once.
epochs = 100
# Start from +inf so the first finite batch loss always becomes the best one.
# The previous sentinel of 1 silently ignored every loss >= 1 and could leave
# best_epoch at 0 for the whole run.
best_loss = float('inf')
best_epoch = 0
# Forwarded to experiment.evaluate_validation_test on the last iteration.
verbose = 0

# Experiment: every iteration draws one batch of triplets and trains on it once.
for epoch in range(epochs):
    (batch_triplet_train,
     train_input_sample, train_input_pos, train_input_neg,
     train_master_input, train_master_neg,
     train_sim) = batch_iterator(baseline, baseline.train_data, baseline.dup_sets_train,
                                 bug_train_ids, batch_size, 1, issues_by_buckets)

    # Flat input list: anchor, positive, negative, master anchor, master negative,
    # with title / description / info for each of them, in that order.
    train_batch = [bug_input[field]
                   for bug_input in (train_input_sample, train_input_pos, train_input_neg,
                                     train_master_input, train_master_neg)
                   for field in ('title', 'description', 'info')]

    # Disabled experiment (rebuild the model with a lower learning rate at epoch 10):
    #     if epoch == 10:
    #         similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=0.1)

    # h[0] is printed as the loss, h[1] / h[2] as pos_cosine / neg_cosine.
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)

    is_final_epoch = (epoch + 1 == epochs)
    if not is_final_epoch:
        print("Epoch: {} Loss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}".format(
            epoch + 1, h[0], h[1], h[2]))
    else:
        # Retrieval evaluation runs only once, after the last batch.
        # `_` and `debug` keep these exact names: a later cell reads `_`.
        recall, _, debug = experiment.evaluate_validation_test(
            retrieval, verbose, encoded_anchor, issues_by_buckets, bug_train_ids)
        print("Epoch: {} Loss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}, recall@25: {:.2f}".format(
            epoch + 1, h[0], h[1], h[2], recall))

    # Remember the lowest batch loss seen so far and the (1-based) epoch it came from.
    loss = h[0]
    if loss < best_loss:
        best_loss, best_epoch = loss, epoch + 1

# Fill the '@number_of_epochs@' placeholder of both save names with the epoch count.
epochs_label = str(epochs)
similarity_model_name = SAVE_PATH.replace('@number_of_epochs@', epochs_label)
feature_model_name = SAVE_PATH_FEATURE.replace('@number_of_epochs@', epochs_label)

# Save the full triplet model and, separately, the anchor encoder.
experiment.save_model(similarity_model, similarity_model_name)
experiment.save_model(encoded_anchor, feature_model_name, verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_in (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
title_pos (InputLayer)          (None, 50)           0                                            
__________________________________________________________________________________________________
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
FeatureBiLstmGenerationModel (M (None, 50, 50)       52276500    title_in[0][0]                   
                                                                 title_pos[0][0]                  
          

Epoch: 1 Loss: 0.90, pos_cosine: 0.95, neg_cosine: 0.84
Epoch: 2 Loss: 0.87, pos_cosine: 0.93, neg_cosine: 0.80
Epoch: 3 Loss: 0.84, pos_cosine: 0.90, neg_cosine: 0.74
Epoch: 4 Loss: 0.80, pos_cosine: 0.89, neg_cosine: 0.69
Epoch: 5 Loss: 0.78, pos_cosine: 0.87, neg_cosine: 0.65
Epoch: 6 Loss: 0.77, pos_cosine: 0.88, neg_cosine: 0.64
Epoch: 7 Loss: 0.75, pos_cosine: 0.86, neg_cosine: 0.61
Epoch: 8 Loss: 0.72, pos_cosine: 0.87, neg_cosine: 0.59
Epoch: 9 Loss: 0.71, pos_cosine: 0.85, neg_cosine: 0.56
Epoch: 10 Loss: 0.71, pos_cosine: 0.85, neg_cosine: 0.55
Epoch: 11 Loss: 0.67, pos_cosine: 0.86, neg_cosine: 0.53
Epoch: 12 Loss: 0.69, pos_cosine: 0.87, neg_cosine: 0.56
Epoch: 13 Loss: 0.66, pos_cosine: 0.87, neg_cosine: 0.54
Epoch: 14 Loss: 0.68, pos_cosine: 0.86, neg_cosine: 0.54
Epoch: 15 Loss: 0.64, pos_cosine: 0.86, neg_cosine: 0.51
Epoch: 16 Loss: 0.67, pos_cosine: 0.85, neg_cosine: 0.52
Epoch: 17 Loss: 0.67, pos_cosine: 0.85, neg_cosine: 0.52
Epoch: 18 Loss: 0.69, pos_cosine: 0.84, 

In [38]:
# `_` is the second value unpacked from experiment.evaluate_validation_test(...)
# in the training cell (`recall, _, debug = ...`); show its first 20 entries.
# NOTE(review): this only works if the training cell ran in this kernel session.
# Consider binding that value to a descriptive name there instead of `_`.
_[:20]

['327681:324658|324658:0.27195924520492554,411777:0.22257781028747559,396773:0.2167147397994995,415435:0.21363812685012817,366816:0.2121729850769043,366854:0.20301556587219238,326856:0.2024645209312439,326553:0.20181787014007568,386639:0.2013646960258484,412693:0.2002018690109253,422917:0.19991284608840942,416382:0.19975131750106812,396454:0.19882571697235107,335074:0.19796502590179443,350622:0.19767892360687256,390614:0.19766765832901,382251:0.1975499987602234,340509:0.19745266437530518,349802:0.19684088230133057,419469:0.19666951894760132,301623:0.19603610038757324,323714:0.1957383155822754,423262:0.19561082124710083,330866:0.1954137086868286,407957:0.1933695673942566,338114:0.19254660606384277,325671:0.19210469722747803,374285:0.19185179471969604,360277:0.1918412446975708',
 '324658:327681|327681:0.27195924520492554,322824:0.22683972120285034,319609:0.20662236213684082,328507:0.19532734155654907,386069:0.19233673810958862,326293:0.18514788150787354,409278:0.18476182222366333,411777:

In [39]:
# recall, exported_rank, debug = experiment.evaluate_validation_test(experiment, retrieval, verbose, 
#                                                         encoded_anchor, issues_by_buckets, evaluate_validation_test)
# test_vectorized, queries_test_vectorized, annoy, X_test, distance_test, indices_test = debug
# "recall@25 last epoch:", recall

### Retrieval evaluation

In [40]:
# Number of entries in retrieval.test, reported as the number of queries.
query_count = len(retrieval.test)
print("Total of queries:", query_count)

Total of queries: 4641


#### Loading the trained model

In [41]:
# Name of the saved feature model, with the epoch placeholder filled in.
trained_feature_model_name = SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))
trained_feature_model_name

'propose_feature_100epochs_64batch(eclipse)'

In [42]:
# Same name that the training cell passed to experiment.save_model for the anchor encoder.
feature_model_path = SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))
model = experiment.get_model_vectorizer(path=feature_model_path)



In [43]:
# Print the layer summary of the model returned by experiment.get_model_vectorizer.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_in (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
FeatureBiLstmGenerationModel (M (None, 50, 50)       52276500    title_in[0][0]                   
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
FeatureMlp

In [44]:
# Run the retrieval evaluation with the model returned by get_model_vectorizer.
# The literal 0 sits in the position where the training cell passes `verbose`.
# This rebinds `recall` and `debug` from the training cell and names the second
# return value `exported_rank`, which is written to disk further down.
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 0, model, issues_by_buckets, bug_train_ids)

In [45]:
# File inside DIR that receives the exported ranking for this METHOD.
rank_filename = 'exported_rank_{}.txt'.format(METHOD)
EXPORT_RANK_PATH = os.path.join(DIR, rank_filename)
EXPORT_RANK_PATH

'data/processed/eclipse/exported_rank_propose_master_triplet_loss.txt'

In [46]:
# Write one ranking row per line, overwriting any previous export.
with open(EXPORT_RANK_PATH, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [47]:
# Score the exported ranking file and display the resulting report.
evaluator = experiment.evaluation
report = evaluator.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.32,
 '2 - recall_at_10': 0.4,
 '3 - recall_at_15': 0.45,
 '4 - recall_at_20': 0.49,
 '5 - recall_at_25': 0.51}