# Bug triage with Deep Learning

https://github.com/AdrianUng/keras-triplet-loss-mnist/blob/master/Triplet_loss_KERAS_semi_hard_from_TF.ipynb

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 20 # 100
MAX_SEQUENCE_LENGTH_D = 20 # 500
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000
'''
    Configuration
'''
epochs = 1000
freeze_train = .1 # 10% with freeze weights
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'openoffice'
METHOD = 'baseline_{}'.format(epochs)
PREPROCESSING = 'bert'
TOKEN = 'bert'
# Dataset paths
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Glove embeddings
GLOVE_DIR='data/embed'
# Save model
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

In [8]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [9]:
from keras_bert import load_vocabulary

token_dict = load_vocabulary(vocab_path)

In [10]:
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D,
                   token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [11]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

#### Loading bug ids in memory

In [12]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


98070

#### Dicionário de títulos e descrições

In [13]:
%%time

experiment.load_bugs(TOKEN)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 9.77 s, sys: 986 ms, total: 10.8 s
Wall time: 10.5 s


#### Hashing bugs by buckets

In [14]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




#### Prepare the train and test

In [15]:
# path_train='train_chronological', path_test='test_chronological'
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

In [16]:
baseline.train_data[:10]

[[6036, 35039],
 [26311, 15091],
 [5142, 34523],
 [8323, 9594],
 [76676, 88775],
 [59725, 71546],
 [121756, 116964],
 [32936, 30911],
 [15257, 16297],
 [99719, 106395]]

#### Recovery bug ids from train

In [17]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Export the corpus train

In [18]:
if EXTRACT_CORPUS:
    corpus = []
    export_file = open(os.path.join(DIR, 'corpus_train.txt'), 'w')
    for bug_id in tqdm(baseline.bug_set):
        bug = baseline.bug_set[bug_id]
        title = bug['title']
        desc = bug['description']
        export_file.write("{}\n{}\n".format(title, desc))
    export_file.close()

# Generating tiple of batches

In [19]:
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

{'bug_severity': '2\n',
 'bug_status': '1\n',
 'component': '72\n',
 'creation_ts': '2002-09-27 06:23:00 +0000',
 'delta_ts': '2003-09-08 16:53:51 +0000',
 'description': '[CLS] open ##off ##ice 1 . 0 . 1 ( downloaded tar ##ball ) , red ##hat 7 . 3 intel works ##tation install , nothing special modified . i successfully did a network install as root , then successfully did works ##tation install ##s as both root and a regular user . root is able to run so ##ffi ##ce successfully ; the normal user is not . when the normal user runs so ##ffi ##ce the logo appears , disk grind ##s for a while , logo disappears , border of a window is drawn but never filled in , then the window disappears and the message " ab ##orted " is printed on the terminal window . there are a few twists to this story : the regular user is initially able to run the sw ##rite ##r program successfully immediately after the works ##tation install ; however , after he logs out of x and logs in again he is no longer able 

### Generating the batch test

In [20]:
"Train ", len(baseline.dup_sets_train)

('Train ', 14508)

In [21]:
import random

# data - path
# batch_size - 128
# n_neg - 1
def batch_iterator(self, retrieval, model, data, dup_sets, bug_ids, 
                   batch_size, n_neg, issues_by_buckets, TRIPLET_HARD=False, FLOATING_PADDING=False):
    # global train_data
    # global self.dup_sets
    # global self.bug_ids
    # global self.bug_set

    random.shuffle(data)

    batch_features = {'title' : [], 'desc' : [], 'info' : []}

    n_train = len(data)

    batch_triplets, batch_bugs_anchor, batch_bugs_pos, batch_bugs_neg, batch_bugs = [], [], [], [], []

    all_bugs = list(issues_by_buckets.keys())
    buckets = retrieval.buckets

    for offset in range(batch_size):
        anchor, pos = data[offset][0], data[offset][1]
        batch_bugs_anchor.append(anchor)
        batch_bugs_pos.append(pos)
        batch_bugs.append(anchor)
        batch_bugs.append(pos)
        #batch_bugs += dup_sets[anchor]

    for anchor, pos in zip(batch_bugs_anchor, batch_bugs_pos):
        while True:
            neg = self.get_neg_bug(anchor, buckets[issues_by_buckets[anchor]], issues_by_buckets, all_bugs)
            bug_anchor = self.bug_set[anchor]
            bug_pos = self.bug_set[pos]
            if neg not in self.bug_set:
                continue
            batch_bugs.append(neg)
            batch_bugs_neg.append(neg)
            bug_neg = self.bug_set[neg]
            break
        
        # triplet bug and master
        batch_triplets.append([anchor, pos, neg])
    
    random.shuffle(batch_bugs)
    
    for bug_id in batch_bugs:
        bug = self.bug_set[bug_id]
        self.read_batch_bugs(batch_features, bug)

    batch_features['title'] = np.array(batch_features['title'])
    batch_features['desc'] = np.array(batch_features['desc'])
    batch_features['info'] = np.array(batch_features['info'])
    
    sim = np.asarray([issues_by_buckets[bug_id] for bug_id in batch_bugs])

    input_sample = {}

    input_sample = { 'title' : batch_features['title'], 
                        'description' : batch_features['desc'], 
                            'info' : batch_features['info'] }

    return batch_triplets, input_sample, sim #sim

In [22]:
%%time

batch_size = 128
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_sim = batch_iterator(baseline, retrieval, None, 
                                                                                      baseline.train_data, 
                                                                                      baseline.dup_sets_train,
                                                                                      bug_train_ids,
                                                                                      batch_size_test, 1,
                                                                                      issues_by_buckets)

validation_sample = [valid_input_sample['title'], 
             valid_input_sample['description'],
            valid_input_sample['info'], valid_sim]

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]
# Max sequence title
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

CPU times: user 695 ms, sys: 0 ns, total: 695 ms
Wall time: 694 ms


In [23]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((384, 20), (384, 20), (384, 738), (384,))

### Validar entrada

In [24]:
# %%time 

#baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

In [25]:
"Test ", len(baseline.test_data)

('Test ', 8265)

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [26]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

vocabulary loaded


In [27]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 20031'

In [28]:
def generating_embed(baseline, GLOVE_DIR, EMBEDDING_DIM):
    embeddings_index = {}
    embed_path = os.path.join(GLOVE_DIR, 'glove.42B.300d.txt')
    f2 = open(embed_path, 'rb')
    num_lines = sum(1 for line in f2)
    f2.close()
    f = open(embed_path, 'rb')
    vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed.pkl'))
    vocab_size = len(vocab) 

    # Initialize uniform the vector considering the Tanh activation
    embedding_matrix = np.random.uniform(-1.0, 1.0, (num_lines + vocab_size, EMBEDDING_DIM))
    embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

    loop = tqdm(f)
    loop.set_description("Loading Glove")
    i = 0
    for line in loop:
        tokens = line.split()
        word = tokens[0]
        embeddings_index[word] = np.asarray(tokens[1:], dtype='float32')
        i+=1
        loop.update(1)
    f.close()
    loop.close()

    print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))
    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    for word, embed in vocab.items():
        if word not in embeddings_index:
            embeddings_index[i] = np.asarray(embed, dtype='float32')
        loop.update(1)
        i+=1
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [29]:
%%time

generating_embed(baseline, GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

vocabulary loaded


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.


HBox(children=(IntProgress(value=0, max=20031), HTML(value='')))


CPU times: user 1min 33s, sys: 4.74 s, total: 1min 37s
Wall time: 1min 35s


## Experiment

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [30]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    embedding_layer = Embedding(num_words,
                                  embedding_dim,
                                  name='embedding_layer',
                                  weights=[embeddings],
                                  embeddings_constraint=MaxNorm(max_value=1, axis=0),
                                  #input_length=max_sequence_length,
                                  input_length=None,
                                  trainable=trainable)
    return embedding_layer

### CNN with filter 3,4,5

In [31]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):

    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    #sequence_input = Input(shape=(None,), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    # best combination filter (3, 4, 5) e 128 e 256
    convs = []
    filter_sizes = [3, 4, 5]
    n_filters = 64

    for index, filter_size in enumerate(filter_sizes):
        l_conv = Conv1D(filters=n_filters, kernel_size=filter_size)(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=filter_size)(l_conv) # index+1
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    # add a 1D convnet with global maxpooling, instead of Yoon Kim model
    #conv = Conv1D(filters=n_filters * 3, kernel_size=3)(l_merge)
    layer = GlobalAveragePooling1D()(l_merge)
    #layer = Flatten()(l_merge)
    layer = Dense(300, activation='tanh')(layer)
    #layer = LeakyReLU()(layer)

    cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible

    return cnn_feature_model

### Bi-LSTM

In [32]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D, TimeDistributed

def lstm_model(embedding_layer, max_sequence_length):
    number_lstm_units = 75
    rate_drop_lstm = 0
    recurrent_dropout = 0

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    #sequence_input = Input(shape=(None, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    left_layer = LSTM(number_lstm_units, return_sequences=True)(embedded_sequences)
    right_layer = LSTM(number_lstm_units, return_sequences=True, go_backwards=True)(left_layer)
    
    lstm_layer = Concatenate()([left_layer, right_layer])
    
    #lstm_layer = TimeDistributed(Dense(50))(lstm_layer)
    #layer = Flatten()(lstm_layer)
    layer = GlobalAveragePooling1D()(lstm_layer)
    layer = Dense(300, activation='tanh')(layer)

    lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible

    return lstm_feature_model

### MLP

In [33]:
def mlp_model(input_size):
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    input_size = 300
    
    for units in [64, 32]:
        layer = Dense(units, activation='tanh', kernel_initializer='random_uniform')(info_input)
    
    layer = Dense(input_size, activation='tanh')(info_input)
    
    mlp_feature_model = Model(inputs=[info_input], outputs=[layer], name = 'FeatureMlpGenerationModel')
    
    return mlp_feature_model

### Siamese model

In [34]:
def pairwise_distance(feature, squared=False):
    """Computes the pairwise distance matrix with numerical stability.

    output[i, j] = || feature[i, :] - feature[j, :] ||_2

    Args:
      feature: 2-D Tensor of size [number of data, feature dimension].
      squared: Boolean, whether or not to square the pairwise distances.

    Returns:
      pairwise_distances: 2-D Tensor of size [number of data, number of data].
    """
    pairwise_distances_squared = math_ops.add(
        math_ops.reduce_sum(math_ops.square(feature), axis=[1], keepdims=True),
        math_ops.reduce_sum(
            math_ops.square(array_ops.transpose(feature)),
            axis=[0],
            keepdims=True)) - 2.0 * math_ops.matmul(feature,
                                                    array_ops.transpose(feature))

    # Deal with numerical inaccuracies. Set small negatives to zero.
    pairwise_distances_squared = math_ops.maximum(pairwise_distances_squared, 0.0)
    # Get the mask where the zero distances are at.
    error_mask = math_ops.less_equal(pairwise_distances_squared, 0.0)

    # Optionally take the sqrt.
    if squared:
        pairwise_distances = pairwise_distances_squared
    else:
        pairwise_distances = math_ops.sqrt(
            pairwise_distances_squared + math_ops.to_float(error_mask) * 1e-16)

    # Undo conditionally adding 1e-16.
    pairwise_distances = math_ops.multiply(
        pairwise_distances, math_ops.to_float(math_ops.logical_not(error_mask)))

    num_data = array_ops.shape(feature)[0]
    # Explicitly set diagonals to zero.
    mask_offdiagonals = array_ops.ones_like(pairwise_distances) - array_ops.diag(
        array_ops.ones([num_data]))
    pairwise_distances = math_ops.multiply(pairwise_distances, mask_offdiagonals)
    return pairwise_distances

def masked_maximum(data, mask, dim=1):
    """Computes the axis wise maximum over chosen elements.

    Args:
      data: 2-D float `Tensor` of size [n, m].
      mask: 2-D Boolean `Tensor` of size [n, m].
      dim: The dimension over which to compute the maximum.

    Returns:
      masked_maximums: N-D `Tensor`.
        The maximized dimension is of size 1 after the operation.
    """
    axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
    masked_maximums = math_ops.reduce_max(
        math_ops.multiply(data - axis_minimums, mask), dim,
        keepdims=True) + axis_minimums
    return masked_maximums

def masked_minimum(data, mask, dim=1):
    """Computes the axis wise minimum over chosen elements.

    Args:
      data: 2-D float `Tensor` of size [n, m].
      mask: 2-D Boolean `Tensor` of size [n, m].
      dim: The dimension over which to compute the minimum.

    Returns:
      masked_minimums: N-D `Tensor`.
        The minimized dimension is of size 1 after the operation.
    """
    axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
    masked_minimums = math_ops.reduce_min(
        math_ops.multiply(data - axis_maximums, mask), dim,
        keepdims=True) + axis_maximums
    return masked_minimums

In [35]:
## required for semi-hard triplet loss:
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import dtypes
import tensorflow as tf

def triplet_loss_adapted_from_tf(y_true, y_pred):
    del y_true
    margin = 1.
    labels = y_pred[:, :1]
 
    labels = tf.cast(labels, dtype='int32')

    embeddings = y_pred[:, 1:]

    ### Code from Tensorflow function [tf.contrib.losses.metric_learning.triplet_semihard_loss] starts here:
    
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    # lshape=array_ops.shape(labels)
    # assert lshape.shape == 1
    # labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    # global batch_size  
    batch_size = array_ops.size(labels) # was 'array_ops.size(labels)'

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
        math_ops.greater(
            pdist_matrix_tile, array_ops.reshape(
                array_ops.transpose(pdist_matrix), [-1, 1])))
    mask_final = array_ops.reshape(
        math_ops.greater(
            math_ops.reduce_sum(
                math_ops.cast(mask, dtype=dtypes.float32), 1, keepdims=True),
            0.0), [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    semi_hard_negatives = array_ops.where(
        mask_final, negatives_outside, negatives_inside)

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    mask_positives = math_ops.cast(
        adjacency, dtype=dtypes.float32) - array_ops.diag(
        array_ops.ones([batch_size]))

    # In lifted-struct, the authors multiply 0.5 for upper triangular
    #   in semihard, they take all positive pairs except the diagonal.
    num_positives = math_ops.reduce_sum(mask_positives)

    semi_hard_triplet_loss_distance = math_ops.truediv(
        math_ops.reduce_sum(
            math_ops.maximum(
                math_ops.multiply(loss_mat, mask_positives), 0.0)),
        num_positives,
        name='triplet_semihard_loss')
    
    ### Code from Tensorflow function semi-hard triplet loss ENDS here.
    return semi_hard_triplet_loss_distance

In [36]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
  
    bug_t = Input(shape = (sequence_length_t, ), name = 'title_{}'.format(name))
    bug_d = Input(shape = (sequence_length_d, ), name = 'desc_{}'.format(name))
    bug_i = Input(shape = (sequence_length_info, ), name = 'info_{}'.format(name))
    
    bug_t_feat = title_feature_model(bug_t)
    bug_d_feat = desc_feature_model(bug_d)
    bug_i_feat = categorical_feature_model(bug_i)
    
    #bug_feature_output = Add(name = 'merge_features_{}'.format(name))([bug_i_feat, bug_t_feat, bug_d_feat])
    bug_feature_output = concatenate([bug_i_feat, bug_t_feat, bug_d_feat], name = 'merge_features_{}'.format(name))
    
    #     bug_feature_output = Activation('tanh')(bug_feature_output)
    
    # Bug representation layer
    # bug_feature_output = Dense(300, activation='tanh')(bug_feature_output)
    
    bug_feature_model = Model(inputs=[bug_t, bug_d, bug_i], outputs=[bug_feature_output], name = 'merge_features_{}'.format(name))
    
    return bug_feature_model

In [37]:
def max_margin_objective(encoded_anchor, decay_lr=1):
    
    input_labels = Input(shape=(1,), name='input_label')    # input layer for labels
    inputs = np.concatenate([encoded_anchor.input, [input_labels]], -1).tolist()
    
    encoded_anchor = encoded_anchor.output
    
    output = concatenate([input_labels, encoded_anchor])  # concatenating the labels + embeddings
    
    similarity_model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

    #optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=K.epsilon(), schedule_decay=0.01)
    # optimizer = Adam(lr=1e-3 * decay_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    # setup the optimization process 
    similarity_model.compile(optimizer='adam', loss=triplet_loss_adapted_from_tf) 
    # metrics=[pos_distance, neg_distance, custom_margin_loss]

    return similarity_model

In [38]:
# Domain to use
limit_train = int(epochs * freeze_train) # 10% de 1000 , 100 epocas
METHOD = 'baseline_{}'.format(limit_train)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

In [39]:
import _pickle as pickle
def save_loss(result):
    with open(os.path.join(DIR,'{}_log.pkl'.format(METHOD)), 'wb') as f:
        pickle.dump(result, f)
    print("=> result saved!")

In [40]:
%%time
import keras

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings
desc_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(baseline.embedding_matrix), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=False)
title_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(baseline.embedding_matrix), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=False)

# Feature models
'''
    cnn_model
    lstm_model
    mlp_model
'''
desc_feature_model = cnn_model(desc_embedding_layer, MAX_SEQUENCE_LENGTH_D)
title_feature_model = lstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
categorical_feature_model = mlp_model(number_of_columns_info)

# Similarity model
encoded_anchor = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'in')

similarity_model = max_margin_objective(encoded_anchor, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()

'''
    Experiment
'''
result = { 'train' : [], 'test' : [] }
print("Total of ", limit_train)
for epoch in range(limit_train):
    batch_triplet_train, \
        train_input_sample, train_sim = batch_iterator(baseline, retrieval, encoded_anchor, baseline.train_data, 
                                                       baseline.dup_sets_train, bug_train_ids, 
                                                           batch_size, 1, issues_by_buckets, TRIPLET_HARD=False)
    train_batch = [train_input_sample['title'], train_input_sample['description'], train_input_sample['info'], train_sim]
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    h_validation = similarity_model.test_on_batch(x=validation_sample, y=valid_sim)
    
     # save results
    result['train'].append(h)
    result['test'].append(h_validation)
    
    if( (epoch+1) % 10 == 0 or (epoch+1 == limit_train) ):
        save_loss(result)
    
    if (epoch+1 == limit_train): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, bug_train_ids)
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, recall@25: {:.2f}".format(epoch+1, h, h_validation, recall))
    else:
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}".format(epoch+1, h, h_validation))
    loss = h
    
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

#experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
#experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))














Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 738)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)        

Epoch: 58 Loss: 0.06, Loss_test: 0.03
Epoch: 59 Loss: 0.09, Loss_test: 0.03
=> result saved!
Epoch: 60 Loss: 0.07, Loss_test: 0.03
Epoch: 61 Loss: 0.04, Loss_test: 0.03
Epoch: 62 Loss: 0.07, Loss_test: 0.03
Epoch: 63 Loss: 0.07, Loss_test: 0.03
Epoch: 64 Loss: 0.06, Loss_test: 0.03
Epoch: 65 Loss: 0.04, Loss_test: 0.03
Epoch: 66 Loss: 0.04, Loss_test: 0.03
Epoch: 67 Loss: 0.07, Loss_test: 0.03
Epoch: 68 Loss: 0.05, Loss_test: 0.02
Epoch: 69 Loss: 0.06, Loss_test: 0.02
=> result saved!
Epoch: 70 Loss: 0.04, Loss_test: 0.02
Epoch: 71 Loss: 0.06, Loss_test: 0.02
Epoch: 72 Loss: 0.07, Loss_test: 0.02
Epoch: 73 Loss: 0.04, Loss_test: 0.02
Epoch: 74 Loss: 0.06, Loss_test: 0.02
Epoch: 75 Loss: 0.05, Loss_test: 0.02
Epoch: 76 Loss: 0.03, Loss_test: 0.02
Epoch: 77 Loss: 0.05, Loss_test: 0.02
Epoch: 78 Loss: 0.03, Loss_test: 0.02
Epoch: 79 Loss: 0.05, Loss_test: 0.02
=> result saved!
Epoch: 80 Loss: 0.03, Loss_test: 0.02
Epoch: 81 Loss: 0.03, Loss_test: 0.02
Epoch: 82 Loss: 0.04, Loss_test: 0.02

In [41]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/openoffice/bert/exported_rank_baseline_100.txt'

In [42]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [None]:
experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(limit_train)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(limit_train)), verbose=1)

In [44]:
len(result['train']), len(result['test'])

(100, 100)

In [45]:
model = similarity_model.get_layer('concatenate_3')
output = model.output
inputs = similarity_model.inputs
model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

# setup the optimization process 
model.compile(optimizer='adam', loss=triplet_loss_adapted_from_tf)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 738)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          221700      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [46]:
# Domain to use
METHOD = 'baseline_{}'.format(epochs)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

In [47]:
end_train = epochs - limit_train
for epoch in range(limit_train, end_train):
    batch_triplet_train, \
        train_input_sample, train_sim = batch_iterator(baseline, retrieval, model, baseline.train_data, 
                                                       baseline.dup_sets_train, bug_train_ids, 
                                                           batch_size, 1, issues_by_buckets, TRIPLET_HARD=False)
    train_batch = [train_input_sample['title'], train_input_sample['description'], train_input_sample['info'], train_sim]
    

    h = model.train_on_batch(x=train_batch, y=train_sim)
    h_validation = model.test_on_batch(x=validation_sample, y=valid_sim)
    
    # save results
    result['train'].append(h)
    result['test'].append(h_validation)
    
    if( (epoch+1) % 10 == 0 or (epoch+1 == end_train )):
        save_loss(result)
    
    print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}".format(epoch+1, h, h_validation))

Epoch: 101 Loss: 0.05, Loss_test: 0.02, recall@25: 0.38
Epoch: 102 Loss: 0.03, Loss_test: 0.02, recall@25: 0.38
Epoch: 103 Loss: 0.02, Loss_test: 0.02, recall@25: 0.38
Epoch: 104 Loss: 0.05, Loss_test: 0.02, recall@25: 0.38
Epoch: 105 Loss: 0.04, Loss_test: 0.02, recall@25: 0.38
Epoch: 106 Loss: 0.07, Loss_test: 0.02, recall@25: 0.38
Epoch: 107 Loss: 0.08, Loss_test: 0.02, recall@25: 0.38
Epoch: 108 Loss: 0.03, Loss_test: 0.03, recall@25: 0.38
Epoch: 109 Loss: 0.03, Loss_test: 0.04, recall@25: 0.38
=> result saved!
Epoch: 110 Loss: 0.03, Loss_test: 0.04, recall@25: 0.38
Epoch: 111 Loss: 0.05, Loss_test: 0.04, recall@25: 0.38
Epoch: 112 Loss: 0.11, Loss_test: 0.03, recall@25: 0.38
Epoch: 113 Loss: 0.05, Loss_test: 0.02, recall@25: 0.38
Epoch: 114 Loss: 0.05, Loss_test: 0.02, recall@25: 0.38
Epoch: 115 Loss: 0.04, Loss_test: 0.02, recall@25: 0.38
Epoch: 116 Loss: 0.04, Loss_test: 0.03, recall@25: 0.38
Epoch: 117 Loss: 0.04, Loss_test: 0.02, recall@25: 0.38
Epoch: 118 Loss: 0.02, Loss_tes

Epoch: 244 Loss: 0.01, Loss_test: 0.01, recall@25: 0.38
Epoch: 245 Loss: 0.02, Loss_test: 0.01, recall@25: 0.38
Epoch: 246 Loss: 0.01, Loss_test: 0.01, recall@25: 0.38
Epoch: 247 Loss: 0.01, Loss_test: 0.01, recall@25: 0.38
Epoch: 248 Loss: 0.01, Loss_test: 0.01, recall@25: 0.38
Epoch: 249 Loss: 0.02, Loss_test: 0.01, recall@25: 0.38
=> result saved!
Epoch: 250 Loss: 0.01, Loss_test: 0.01, recall@25: 0.38
Epoch: 251 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 252 Loss: 0.02, Loss_test: 0.00, recall@25: 0.38
Epoch: 253 Loss: 0.02, Loss_test: 0.00, recall@25: 0.38
Epoch: 254 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 255 Loss: 0.02, Loss_test: 0.00, recall@25: 0.38
Epoch: 256 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 257 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 258 Loss: 0.02, Loss_test: 0.00, recall@25: 0.38
Epoch: 259 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 260 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 261 Los

Epoch: 387 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 388 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 389 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 390 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 391 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 392 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 393 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 394 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 395 Loss: 0.01, Loss_test: 0.01, recall@25: 0.38
Epoch: 396 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 397 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 398 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 399 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 400 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 402 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 403 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 404 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 405 Los

Epoch: 531 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 532 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 533 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 534 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 535 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 536 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 537 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 538 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 539 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 540 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 541 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 542 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 543 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 544 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 545 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 546 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 547 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 548 Loss: 0.01, Loss_tes

Epoch: 674 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 675 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 676 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 677 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 678 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 679 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 680 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 681 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
Epoch: 682 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 683 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 684 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 685 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 686 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 687 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 688 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 689 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 690 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 691 Los

Epoch: 817 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 818 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 819 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 820 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 821 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 822 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 823 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 824 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 825 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 826 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 827 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 828 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 829 Loss: 0.01, Loss_test: 0.00, recall@25: 0.38
=> result saved!
Epoch: 830 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 831 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 832 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 833 Loss: 0.00, Loss_test: 0.00, recall@25: 0.38
Epoch: 834 Los

In [48]:
len(result['train']), len(result['test'])

(900, 900)

In [49]:
encoded = model.get_layer('merge_features_in')
output = encoded.output
inputs = similarity_model.inputs[:-1]
encoded_anchor = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

In [50]:
SAVE_PATH.replace('@number_of_epochs@', str(epochs))

'bert_preprocessing_baseline_1000_feature1000epochs_64batch(openoffice)'

In [51]:
experiment.save_model(model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
"Model saved"

Saved model 'modelos/model_bert_preprocessing_baseline_1000_feature_1000epochs_64batch(openoffice).h5' to disk


'Model saved'

In [52]:
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 1, encoded_anchor, issues_by_buckets, bug_train_ids)
print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, recall@25: {:.2f}".format(epoch+1, h, h_validation, recall))

HBox(children=(IntProgress(value=0, max=8265), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11757), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12837), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11757), HTML(value='')))


Epoch: 900 Loss: 0.00, Loss_test: 0.00, recall@25: 0.58


In [53]:
recall

0.58

In [54]:
exported_rank[:20]

['98306:88871,50853,33630,90791|98409:0.4277384877204895,78806:0.3756054639816284,69738:0.36447882652282715,29075:0.3629556894302368,56191:0.34965288639068604,37146:0.3444681167602539,85440:0.34345531463623047,96073:0.34310072660446167,35249:0.33767491579055786,62787:0.3366132974624634,64092:0.331354558467865,37566:0.33022648096084595,108206:0.32732093334198,44022:0.3253530263900757,95954:0.3242754340171814,94451:0.3240339159965515,54736:0.3229805827140808,100934:0.32271599769592285,95851:0.3226770758628845,43048:0.3218843936920166,69134:0.32052868604660034,65402:0.3199648857116699,55268:0.31826138496398926,79720:0.3167358636856079,32494:0.31628602743148804,101179:0.3155577778816223,69519:0.31495386362075806,53885:0.3121868968009949,94168:0.3115445375442505',
 '32771:32490,33548,32560,33879,32699|26003:0.41570770740509033,32494:0.38182735443115234,6177:0.37766045331954956,20262:0.3675897717475891,17469:0.356759250164032,50601:0.3394836187362671,15634:0.3382298946380615,98930:0.33074897

### Retrieval evaluation

In [55]:
print("Total of queries:", len(retrieval.test))

Total of queries: 8265


#### Getting the model trained

In [56]:
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'bert_preprocessing_baseline_1000_feature_1000epochs_64batch(openoffice)'

In [57]:
encoded_anchor.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 738)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          221700      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [58]:
len(exported_rank)

11757

In [59]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/openoffice/bert/exported_rank_baseline_1000.txt'

In [60]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [61]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.46,
 '2 - recall_at_10': 0.51,
 '3 - recall_at_15': 0.54,
 '4 - recall_at_20': 0.56,
 '5 - recall_at_25': 0.58}

#### Some ideas to visualizate

- https://towardsdatascience.com/building-a-recommendation-system-using-neural-network-embeddings-1ef92e5c80c9