# Bug triage with Deep Learning

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 20 # 100
MAX_SEQUENCE_LENGTH_D = 20 # 500
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000
'''
    Configuration
'''
epochs = 1000
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'openoffice'
METHOD = 'baseline_{}'.format(epochs)
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Glove embeddings
GLOVE_DIR='data/embed'
# Save model
SAVE_PATH = '{}_feature@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [9]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=57667), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14567), HTML(value='')))




#### Loading bug ids in memory

In [10]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


72234

#### Dicionário de títulos e descrições

In [11]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=72234), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 5.39 s, sys: 661 ms, total: 6.05 s
Wall time: 6.04 s


#### Hashing bugs by buckets

In [12]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=58572), HTML(value='')))




#### Prepare the train and test

In [13]:
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

Reading train data
Reading bug ids


In [14]:
baseline.train_data[:10]

[[59, 27],
 [59, 92],
 [27, 92],
 [64, 43],
 [44, 45],
 [53, 54],
 [84, 63],
 [75, 699],
 [105, 121],
 [186, 199]]

#### Recovery bug ids from train

In [15]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Export the corpus train

In [16]:
if EXTRACT_CORPUS:
    corpus = []
    export_file = open(os.path.join(DIR, 'corpus_train.txt'), 'w')
    for bug_id in tqdm(baseline.bug_set):
        bug = baseline.bug_set[bug_id]
        title = bug['title']
        desc = bug['description']
        export_file.write("{}\n{}\n".format(title, desc))
    export_file.close()

# Generating tiple of batches

In [17]:
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

{'bug_severity': '4\n',
 'bug_status': '1\n',
 'component': '117\n',
 'creation_ts': '2006-07-28 15:20:00 +0000',
 'delta_ts': '2013-02-24 21:09:32 +0000',
 'description': '[CLS] i have got a book ##mark that en ##cl ##oses a text ##field . trying to access this text ##field using the create ##con ##ten ##ten ##ume ##ration ( ) - method of the book ##mark . anchor does not work though book ##mark . anchor . get ##ava ##ila ##bles ##er ##vic ##ena ##mes ( ) tells that the required " com . sun . star . text . text ##con ##ten ##t " is available . regarding the specified behaviour of get ##ava ##ila ##bles ##er ##vic ##ena ##mes ( ) , the create ##con ##ten ##ten ##ume ##ration - method is not allowed to return an empty content ##en ##ume ##ration ( see http : / / api . open ##off ##ice . org / doc ##s / common / ref / com / sun / star / container / x ##con ##ten ##ten ##ume ##ration ##ac ##ces ##s . html # get ##ava ##ila ##bles ##er ##vic ##ena ##mes ) the following code demonstrates th

### Generating the batch test

In [18]:
"Train ", len(baseline.dup_sets_train)

('Train ', 11043)

In [19]:
%%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = experiment.batch_iterator(None, 
                                                                                                      baseline.train_data, 
                                                                                                      baseline.dup_sets_train,
                                                                                                      bug_train_ids,
                                                                                                      batch_size_test, 1,
                                                                                                      issues_by_buckets)
test_gen = ([valid_input_sample['title'], valid_input_pos['title'], valid_input_neg['title'], 
             valid_input_sample['description'], valid_input_pos['description'], valid_input_neg['description'],
            valid_input_sample['info'], valid_input_pos['info'], valid_input_neg['info']], valid_sim)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]
# Max sequence title
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

CPU times: user 31.9 ms, sys: 0 ns, total: 31.9 ms
Wall time: 31.6 ms


In [20]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((128, 20), (128, 20), (128, 729), (128,))

### Validar entrada

In [21]:
# %%time 

#baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

In [22]:
"Test ", len(baseline.test_data)

('Test ', 2086)

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [23]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

vocabulary loaded


In [24]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 18562'

In [25]:
def generating_embed(baseline, GLOVE_DIR, EMBEDDING_DIM):
    embeddings_index = {}
    embed_path = os.path.join(GLOVE_DIR, 'glove.42B.300d.txt')
    f = open(embed_path, 'rb')
    #num_lines = sum(1 for line in open(embed_path, 'rb'))

    vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed.pkl'))
    vocab_size = len(vocab) 

    # Initialize uniform the vector considering the Tanh activation
    embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, EMBEDDING_DIM))
    embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

    loop = tqdm(f)
    loop.set_description("Loading Glove")
    for line in loop:
        tokens = line.split()
        word = tokens[0]
        embeddings_index[word] = np.asarray(tokens[1:], dtype='float32')
        loop.update(1)
    f.close()
    loop.close()

    print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))

    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    i = 0
    for word, embed in vocab.items():
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]
        else:
            embedding_matrix[i] = np.asarray(embed, dtype='float32')
        loop.update(1)
        i+=1
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [26]:
%%time

generating_embed(baseline, GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

vocabulary loaded


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.


HBox(children=(IntProgress(value=0, max=18562), HTML(value='')))


CPU times: user 1min 22s, sys: 3.65 s, total: 1min 25s
Wall time: 1min 23s


## Experiment

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [27]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    embedding_layer = Embedding(num_words,
                                  embedding_dim,
                                  name='embedding_layer',
                                  weights=[embeddings],
                                  embeddings_constraint=MaxNorm(max_value=1, axis=0),
                                  #input_length=max_sequence_length,
                                  input_length=None,
                                  trainable=trainable)
    return embedding_layer

### CNN with filter 3,4,5

In [28]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):

    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    #sequence_input = Input(shape=(None,), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    # best combination filter (3, 4, 5) e 128 e 256
    convs = []
    filter_sizes = [3, 4, 5]
    n_filters = 64

    for index, filter_size in enumerate(filter_sizes):
        l_conv = Conv1D(filters=n_filters, kernel_size=filter_size)(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=filter_size)(l_conv) # index+1
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    # add a 1D convnet with global maxpooling, instead of Yoon Kim model
    #conv = Conv1D(filters=n_filters * 3, kernel_size=3)(l_merge)
    layer = GlobalAveragePooling1D()(l_merge)
    #layer = Flatten()(l_merge)
    layer = Dense(300, activation='tanh')(layer)
    #layer = LeakyReLU()(layer)

    cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible

    return cnn_feature_model

### Bi-LSTM

In [29]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D, TimeDistributed

def lstm_model(embedding_layer, max_sequence_length):
    number_lstm_units = 75
    rate_drop_lstm = 0
    recurrent_dropout = 0

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    #sequence_input = Input(shape=(None, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    left_layer = LSTM(number_lstm_units, return_sequences=True)(embedded_sequences)
    right_layer = LSTM(number_lstm_units, return_sequences=True, go_backwards=True)(left_layer)
    
    lstm_layer = Concatenate()([left_layer, right_layer])
    
    #lstm_layer = TimeDistributed(Dense(50))(lstm_layer)
    #layer = Flatten()(lstm_layer)
    layer = GlobalAveragePooling1D()(lstm_layer)
    layer = Dense(300, activation='tanh')(layer)

    lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible

    return lstm_feature_model

### MLP

In [30]:
def mlp_model(input_size):
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    input_size = 300
    
    for units in [64, 32]:
        layer = Dense(units, activation='tanh', kernel_initializer='random_uniform')(info_input)
    
    layer = Dense(input_size, activation='tanh')(info_input)
    
    mlp_feature_model = Model(inputs=[info_input], outputs=[layer], name = 'FeatureMlpGenerationModel')
    
    return mlp_feature_model

### Siamese model

In [31]:
from keras import backend as K
import tensorflow as tf

def normalize(x, axis):
    norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=False))
    return x, K.maximum(norm, K.epsilon())
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    x, y = inputs
    x, x_norm = normalize(x, axis=-1)
    y, y_norm = normalize(y, axis=-1)
    distance = K.sum( x * y, axis=-1) / (x_norm * y_norm)
    distance = (distance + K.constant(1)) / K.constant(2)
    # Distance goes from 0 to 2 in theory, but from 0 to 1 if x and y are both
    # positive (which is the case after ReLU activation).
    return K.mean(distance, axis=-1, keepdims=False)

def custom_margin_loss(y_true, y_pred):
    margin = K.constant(1.0)
    pos = y_pred[0]
    neg = y_pred[1]
    return K.sum(K.maximum(0.0, margin - pos + neg))

def pos_distance(y_true, y_pred):
    return y_pred[0]

def neg_distance(y_true, y_pred):
    return y_pred[1]

def stack_tensors(vects):
    return K.stack(vects, axis=-1)

In [32]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
  
    bug_t = Input(shape = (sequence_length_t, ), name = 'title_{}'.format(name))
    bug_d = Input(shape = (sequence_length_d, ), name = 'desc_{}'.format(name))
    bug_i = Input(shape = (sequence_length_info, ), name = 'info_{}'.format(name))
    
    bug_t_feat = title_feature_model(bug_t)
    bug_d_feat = desc_feature_model(bug_d)
    bug_i_feat = categorical_feature_model(bug_i)
    
    #bug_feature_output = Add(name = 'merge_features_{}'.format(name))([bug_i_feat, bug_t_feat, bug_d_feat])
    bug_feature_output = concatenate([bug_i_feat, bug_t_feat, bug_d_feat], name = 'merge_features_{}'.format(name))
    
    #     bug_feature_output = Activation('tanh')(bug_feature_output)
    
    # Bug representation layer
    # bug_feature_output = Dense(300, activation='tanh')(bug_feature_output)
    
    bug_feature_model = Model(inputs=[bug_t, bug_d, bug_i], outputs=[bug_feature_output], name = 'merge_features_{}'.format(name))
    
    return bug_feature_model

In [33]:
def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=1):
    
    inputs = np.concatenate([encoded_anchor.input, encoded_positive.input, encoded_negative.input], -1).tolist()
    
    encoded_anchor = encoded_anchor.output
    encoded_positive = encoded_positive.output
    encoded_negative = encoded_negative.output
    
    # Cosine
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([encoded_anchor, encoded_positive])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([encoded_anchor, encoded_negative])

    # Loss function only works with a single output
    output = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances',
        output_shape=(2, 1)
    )([positive_d, negative_d])
    
    #loss = MarginLoss()(output)

    similarity_model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

    #optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=K.epsilon(), schedule_decay=0.01)
    optimizer = Adam(lr=1e-3 * decay_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    # setup the optimization process 
    similarity_model.compile(optimizer=optimizer, loss=custom_margin_loss, metrics=[pos_distance, neg_distance, custom_margin_loss])

    return similarity_model

In [34]:
%%time
import keras

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings
desc_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=False)
title_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=False)

# Feature models
'''
    cnn_model
    lstm_model
    mlp_model
'''
desc_feature_model = cnn_model(desc_embedding_layer, MAX_SEQUENCE_LENGTH_D)
title_feature_model = lstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
categorical_feature_model = mlp_model(number_of_columns_info)

# Similarity model
encoded_anchor = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'in')
encoded_positive = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'pos')

encoded_negative = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'neg')

similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()

'''
    Experiment
'''
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = experiment.batch_iterator(encoded_anchor, baseline.train_data, baseline.dup_sets_train, bug_train_ids, 
                                       batch_size, 1, issues_by_buckets)
    train_batch = [train_input_sample['title'], train_input_sample['description'], train_input_sample['info'],
                   train_input_pos['title'], train_input_pos['description'], train_input_pos['info'], 
                   train_input_neg['title'], train_input_neg['description'], train_input_neg['info']]
    
#     if epoch == 10:
#         similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=0.1)
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    
    if (epoch+1 == epochs): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, _, debug = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, bug_train_ids)
        print("Epoch: {} Loss: {:.2f}, MarginLoss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}, recall@25: {:.2f}".format(epoch+1,
                                                                                                         h[0],  h[3],
                                                                                                         h[1], h[2], recall))
    else:
        print("Epoch: {} Loss: {:.2f}, MarginLoss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}".format(epoch+1,
                                                                                                         h[0],  h[3],
                                                                                                         h[1],
                                                                                                         h[2]))
    loss = h[3]
    
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 729)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
info_pos (InputLayer)           (None, 729)          0                                            
__________________________________________________________________________________________________
title_pos 

Epoch: 37 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 38 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 39 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 40 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 41 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 42 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 43 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 44 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 45 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 46 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 47 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 48 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 49 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 50 Loss: 1.00, Mar

Epoch: 146 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 147 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 148 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 149 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 150 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 151 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 152 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 153 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 154 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 155 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 156 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 157 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 158 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 159 L

Epoch: 254 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 255 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 256 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 257 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 258 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 259 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 260 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 261 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 262 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 263 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 264 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 265 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 266 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 267 L

Epoch: 362 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 363 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 364 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 365 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 366 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 367 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 368 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 369 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 370 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 371 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 372 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 373 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 374 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 375 L

Epoch: 470 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 471 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 472 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 473 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 474 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 475 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 476 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 477 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 478 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 479 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 480 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 481 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 482 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 483 L

Epoch: 578 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 579 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 580 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 581 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 582 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 583 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 584 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 585 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 586 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 587 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 588 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 589 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 590 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 591 L

Epoch: 686 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 687 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 688 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 689 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 690 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 691 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 692 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 693 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 694 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 695 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 696 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 697 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 698 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 699 L

Epoch: 794 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 795 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 796 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 797 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 798 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 799 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 800 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 801 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 802 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 803 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 804 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 805 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 806 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 807 L

Epoch: 902 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 903 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 904 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 905 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 906 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 907 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 908 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 909 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 910 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 911 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 912 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 913 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 914 Loss: 1.00, MarginLoss: 1.00, pos_cosine: 1.00, neg_cosine: 1.00
Epoch: 915 L

In [35]:
recall

0.51

In [36]:
_[:20]

['108544:111059,109674,108379,109366|102470:0.963911809027195,14454:0.9630606733262539,109674:0.9616216197609901,111059:0.9608864150941372,111297:0.959563922137022,115100:0.9588290862739086,46957:0.9583318904042244,92117:0.9570300281047821,115569:0.9570199176669121,92348:0.9562196135520935,115421:0.9558651223778725,94421:0.9555420242249966,92690:0.9548383057117462,105671:0.9546678513288498,102495:0.9544434510171413,86960:0.954442922025919,95277:0.9543333686888218,104574:0.9542505517601967,104576:0.9542505517601967,93177:0.9542159736156464,116229:0.9540876485407352,93169:0.9540772624313831,98212:0.9539180845022202,108379:0.9538363441824913,89620:0.9537629596889019,105166:0.9537356644868851,107267:0.953667726367712,92613:0.9535845369100571,94909:0.9535478688776493',
 '109674:108544,111059,108379,109366|92117:0.9754188433289528,115100:0.9750226605683565,92348:0.9749806709587574,14454:0.9747119471430779,102470:0.9746984858065844,115569:0.9740808866918087,92690:0.9740739706903696,94421:0.97

In [37]:
# '''
#     Between 0-10 epochs recall@25 = 0.28
#     Between 0-20 epochs recall@25 = 0.32
#     Between 0-70 epochs recall@25 = ?
#     Between 0-100 epochs recall@25 = ?
# '''
# recall, exported_rank = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, bug_train_ids)

# "recall@25 last epoch:", recall

### Retrieval evaluation

In [38]:
print("Total of queries:", len(retrieval.test))

Total of queries: 2086


#### Getting the model trained

In [39]:
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'baseline_1000_feature_1000epochs_64batch(openoffice)'

In [40]:
model = encoded_anchor
# model = experiment.get_model_vectorizer(path=SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))

In [41]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 729)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          219000      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [42]:
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 0, model, issues_by_buckets, bug_train_ids)

In [43]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/openoffice/exported_rank_baseline_1000.txt'

In [44]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in _:
        file_out.write(row + "\n")

In [45]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.37,
 '2 - recall_at_10': 0.44,
 '3 - recall_at_15': 0.47,
 '4 - recall_at_20': 0.5,
 '5 - recall_at_25': 0.51}

#### Some ideas to visualizate

- https://towardsdatascience.com/building-a-recommendation-system-using-neural-network-embeddings-1ef92e5c80c9