# Bug triage with Deep Learning

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 20 # 40
MAX_SEQUENCE_LENGTH_D = 100 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

### Parse preprocessed bugs

In [7]:
# Which bug tracker to work on, and the label used for exported artefacts.
DOMAIN = 'netbeans'
METHOD = 'baseline'

# Dataset locations: processed bugs, normalized pairs, and the pairs CSV that
# lives inside the normalized directory.
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Directory holding the pretrained GloVe vectors.
GLOVE_DIR = 'data/embed'

# Names used when saving the models; '@number_of_epochs@' is substituted with
# the real epoch count at save time.
SAVE_PATH = 'baseline_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'baseline_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

# Set to True to dump the titles/descriptions corpus to disk.
EXTRACT_CORPUS = False

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [9]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=180483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=36232), HTML(value='')))




#### Loading bug ids in memory

In [10]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


216715

#### Dicionário de títulos e descrições

In [11]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=216715), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 1min 21s, sys: 2.39 s, total: 1min 24s
Wall time: 1min 22s


#### Hashing bugs by buckets

In [12]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=180483), HTML(value='')))




#### Prepare the train and test

In [13]:
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

Reading train data
Reading bug ids


In [14]:
baseline.train_data[:10]

[[90024, 1289],
 [1408, 6256],
 [1787, 14975],
 [166804, 2020],
 [2337, 31362],
 [2337, 46020],
 [2337, 15205],
 [2337, 32942],
 [2337, 35023],
 [2337, 57495]]

#### Recover bug ids from the training set

In [15]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Export the training corpus

In [16]:
# Optionally dump every bug in baseline.bug_set to DIR/corpus_train.txt, writing
# the title on one line and the description on the next.
if EXTRACT_CORPUS:
    # `with` guarantees the file is flushed and closed even if a write fails.
    with open(os.path.join(DIR, 'corpus_train.txt'), 'w') as export_file:
        for bug_id in tqdm(baseline.bug_set):
            bug = baseline.bug_set[bug_id]
            export_file.write("{}\n{}\n".format(bug['title'], bug['description']))

# Generating batches of triplets

In [17]:
if 2521 in baseline.bug_set:
    print(baseline.bug_set[2521])

{'dup_id': '[]', 'bug_status': '1\n', 'issue_id': 2521, 'priority': '3\n', 'title': 'when opended number edited file save is not available onlu save all when opened more then number file save is available but organization s save non actual file', 'component': '103\n', 'description': 'priority is changed to p normal', 'delta_ts': '2008-12-23 10:55:57 +0000', 'creation_ts': '1999-07-16 03:55:00 +0000', 'resolution': 'FIXED', 'product': '9\n', 'description_word': array([1351,   15,  292,   11,  421, 1348,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,

### Generating the batch test

In [18]:
"Train ", len(baseline.dup_sets_train)

('Train ', 30600)

In [19]:
%%time

batch_size = 64
batch_size_test = 128

# Draw one fixed batch of triplets to keep as a constant frame of reference for
# model performance.
# NOTE(review): this batch is sampled from the training pairs, not from
# held-out data -- confirm that is intended.
(batch_triplets_valid,
 valid_input_sample, valid_input_pos, valid_input_neg,
 valid_sim) = baseline.batch_iterator(baseline.train_data,
                                      baseline.dup_sets_train,
                                      bug_train_ids,
                                      batch_size_test, 1)

# Inputs are grouped per branch (anchor, positive, negative), each as
# title / description / info. This is the order in which the similarity model
# declares its inputs and in which the training batches are assembled; the
# previous field-major grouping could not be fed to that model.
test_gen = ([valid_input_sample['title'], valid_input_sample['description'], valid_input_sample['info'],
             valid_input_pos['title'], valid_input_pos['description'], valid_input_pos['info'],
             valid_input_neg['title'], valid_input_neg['description'], valid_input_neg['info']], valid_sim)

# Number of categorical ("info") columns per bug.
number_of_columns_info = valid_input_sample['info'].shape[1]
# The sequence lengths actually produced by the batch iterator override the
# values set in the global configuration cell.
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

CPU times: user 52 ms, sys: 0 ns, total: 52 ms
Wall time: 51.4 ms


In [20]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((128, 100), (128, 500), (128, 544), (128,))

### Validar entrada

In [21]:
%%time 

baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

***Title***: cat assertion error child organization reciete organization src utils moms java folder organization reciete organization src utils
***Title***: assertion error file name users ryanmauger projects i i local application modules slideshows models slideshows php be d fo organization users ryanmauger projects i i local application module
***Description***: this bug was originally marked as duplicate of bug that is already resolved this bug is still valid so this seems to be another bug but it might be related build net beans ide dev build vm java hot spot tm client vm b organization runtime environment b organization product carlo salinari checking issues from exception reporter person to rename a package stacktrace java lang assertion error child organization reciete person src utils moms java folder organization reciete person src utils at org netbeans modules masterfs filebasedfs children organization add child organization java at org netbeans modules masterfs filebasedfs c

In [22]:
"Test ", len(baseline.test_data)

('Test ', 3162)

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [23]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

vocabulary loaded


In [24]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 102875'

In [25]:
def generating_embed(baseline, GLOVE_DIR, EMBEDDING_DIM):
    """Build the embedding matrix for the dataset vocabulary and attach it to ``baseline``.

    Each vocabulary entry gets one row: the GloVe 42B/300d vector of the same
    word when GloVe has one, otherwise the vector stored with the word in the
    vocabulary file.

    Args:
        baseline: object exposing ``DIR`` and ``load_vocabulary(path)``; the
            result is stored in ``baseline.embedding_matrix``.
        GLOVE_DIR: directory containing ``glove.42B.300d.txt``.
        EMBEDDING_DIM: number of columns of the embedding matrix.

    Returns:
        None. The matrix is written to ``baseline.embedding_matrix``.
    """
    def _as_text(token):
        # The GloVe file is read as bytes while vocabulary keys are normally
        # text. Under Python 3 a bytes key never equals a str key, so without
        # this normalisation no pretrained vector would ever be found.
        return token.decode('utf-8', 'replace') if isinstance(token, bytes) else token

    embed_path = os.path.join(GLOVE_DIR, 'glove.42B.300d.txt')

    vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed.pkl'))
    vocab_size = len(vocab)

    # Uniform initialisation in [-1, 1], chosen with the tanh activations in mind.
    embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, EMBEDDING_DIM))
    # NOTE(review): row 0 is zeroed here (presumably the padding index), but the
    # fill loop below assigns every row including row 0 -- confirm the first
    # vocabulary entry is the padding token.
    embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

    embeddings_index = {}
    # Binary mode keeps the original whitespace splitting; the context manager
    # closes the file even if a line fails to parse.
    with open(embed_path, 'rb') as glove_file:
        loop = tqdm(glove_file)
        loop.set_description("Loading Glove")
        try:
            # Iterating the tqdm wrapper already advances the bar, so no
            # manual update() is needed here.
            for line in loop:
                tokens = line.split()
                if not tokens:
                    continue  # tolerate blank lines
                embeddings_index[_as_text(tokens[0])] = np.asarray(tokens[1:], dtype='float32')
        finally:
            loop.close()

    print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))

    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    # Rows follow the iteration order of the vocabulary mapping.
    # NOTE(review): assumes token ids used elsewhere match this order -- confirm.
    for i, (word, embed) in enumerate(vocab.items()):
        key = _as_text(word)
        if key in embeddings_index:
            embedding_matrix[i] = embeddings_index[key]
        else:
            embedding_matrix[i] = np.asarray(embed, dtype='float32')
        loop.update(1)
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [26]:
%%time

generating_embed(baseline, GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

vocabulary loaded


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.


HBox(children=(IntProgress(value=0, max=102875), HTML(value='')))


CPU times: user 1min 28s, sys: 3.62 s, total: 1min 32s
Wall time: 1min 30s


## Experiment

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [27]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    """Create the Keras ``Embedding`` layer initialised with pretrained weights.

    Args:
        embeddings: weight matrix used to initialise the layer.
        num_words: vocabulary size (number of embedding rows).
        embedding_dim: size of each embedding vector.
        max_sequence_length: accepted for API compatibility; currently unused
            because the layer is built with ``input_length=None``.
        trainable: whether the embedding weights are updated during training.

    Returns:
        The configured ``Embedding`` layer, named ``'embedding_layer'``.
    """
    layer_options = dict(
        name='embedding_layer',
        weights=[embeddings],
        embeddings_constraint=MaxNorm(max_value=1, axis=0),
        # Variable-length input; the fixed length is imposed by the Input layer
        # of the model that uses this embedding.
        input_length=None,
        trainable=trainable,
    )
    return Embedding(num_words, embedding_dim, **layer_options)

### CNN with filter 3,4,5

In [28]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):
    """Build the CNN text encoder (parallel convolutions of width 3, 4 and 5).

    Loosely follows the Yoon Kim sentence CNN (https://arxiv.org/abs/1408.5882):
    each branch convolves the embedded sequence and max-pools it, the branches
    are concatenated along the time axis, averaged over time, and projected to
    a 300-unit tanh layer.

    Args:
        embedding_layer: embedding layer applied to the token-id input.
        max_sequence_length: fixed length of the input sequence.

    Returns:
        A Keras ``Model`` named ``'FeatureCNNGenerationModel'`` mapping token
        ids to a 300-dimensional feature vector.
    """
    n_filters = 64
    kernel_widths = (3, 4, 5)

    token_ids = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    embedded = embedding_layer(token_ids)

    def conv_branch(width):
        # One branch: convolution of the given width, pooled with a matching window.
        convolved = Conv1D(filters=n_filters, kernel_size=width)(embedded)
        return MaxPooling1D(pool_size=width)(convolved)

    pooled_branches = [conv_branch(width) for width in kernel_widths]

    # Join the branches along the time axis, then average over time.
    merged = Concatenate(axis=1)(pooled_branches)
    averaged = GlobalAveragePooling1D()(merged)
    features = Dense(300, activation='tanh')(averaged)

    return Model(inputs=[token_ids], outputs=[features], name='FeatureCNNGenerationModel')

### Bi-LSTM

In [29]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D

def lstm_model(embedding_layer, max_sequence_length):
    """Build the recurrent text encoder: two stacked LSTMs plus a tanh projection.

    Despite the section title, the layers are unidirectional: the first LSTM
    returns the full sequence, the second one returns only its final state.

    Args:
        embedding_layer: embedding layer applied to the token-id input.
        max_sequence_length: fixed length of the input sequence.

    Returns:
        A Keras ``Model`` named ``'FeatureLstmGenerationModel'`` mapping token
        ids to a 300-dimensional feature vector.
    """
    hidden_units = 50

    token_ids = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded = embedding_layer(token_ids)

    # First layer emits one state per time step so the second can consume it.
    per_step_states = LSTM(hidden_units, return_sequences=True)(embedded)
    final_state = LSTM(hidden_units)(per_step_states)

    features = Dense(300, activation='tanh')(final_state)

    return Model(inputs=[token_ids], outputs=[features], name='FeatureLstmGenerationModel')

### MLP

In [30]:
def mlp_model(input_size):
    """Build the encoder for the categorical ("info") features.

    Args:
        input_size: number of input columns.

    Returns:
        A Keras ``Model`` named ``'FeatureMlpGenerationModel'`` that projects
        the input through a single 300-unit tanh ``Dense`` layer.
    """
    output_units = 300

    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    projected = Dense(output_units, activation='tanh')(info_input)

    return Model(inputs=[info_input], outputs=[projected], name='FeatureMlpGenerationModel')

### Siamese model

In [31]:
from keras import backend as K
import tensorflow as tf

def normalize(x, axis):
    """Return ``x`` unchanged together with its L2 norm along ``axis``.

    The norm is floored at ``K.epsilon()`` so callers can divide by it safely;
    the reduced axis is dropped (``keepdims=False``).
    """
    squared_sum = K.sum(K.square(x), axis=axis, keepdims=False)
    safe_norm = K.maximum(K.sqrt(squared_sum), K.epsilon())
    return x, safe_norm
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    """Rescaled cosine similarity between two tensors, averaged over the last remaining axis.

    Despite the name, larger values mean *more* similar: the cosine (in
    [-1, 1]) is mapped to [0, 1] via ``(cos + 1) / 2``.

    Args:
        inputs: pair ``(x, y)`` of tensors compared along their last axis.

    Returns:
        The mean of the rescaled cosine over the last axis left after the
        feature axis has been reduced.
    """
    x, y = inputs
    _, x_norm = normalize(x, axis=-1)
    _, y_norm = normalize(y, axis=-1)

    cosine = K.sum(x * y, axis=-1) / (x_norm * y_norm)
    rescaled = (cosine + K.constant(1)) / K.constant(2)

    # NOTE(review): for 2-D (batch, features) inputs the feature axis is already
    # gone at this point, so this mean runs over the batch -- confirm that a
    # batch-level average (rather than one score per sample) is intended.
    return K.mean(rescaled, axis=-1, keepdims=False)

def custom_margin_loss(y_true, y_pred):
    """Hinge loss ``sum(max(0, margin - pos + neg))`` with a margin of 1.

    ``y_pred[0]`` is taken as the anchor/positive score and ``y_pred[1]`` as
    the anchor/negative score; ``y_true`` is ignored.
    """
    margin = K.constant(1.0)
    positive_score, negative_score = y_pred[0], y_pred[1]
    hinge = K.maximum(0.0, margin - positive_score + negative_score)
    return K.sum(hinge)

def pos_distance(y_true, y_pred):
    """Metric: the first stacked model output (anchor/positive score); ``y_true`` is ignored."""
    positive_score = y_pred[0]
    return positive_score

def neg_distance(y_true, y_pred):
    """Metric: the second stacked model output (anchor/negative score); ``y_true`` is ignored."""
    negative_score = y_pred[1]
    return negative_score

def stack_tensors(vects):
    """Stack a list of tensors along a new last axis."""
    stacked = K.stack(vects, axis=-1)
    return stacked

In [32]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
    """Build one branch of the siamese network: the encoder for a single bug.

    The title, description and categorical inputs are each passed through
    their feature model and the three resulting vectors are concatenated
    (info, title, description) into the bug representation.

    Args:
        title_feature_model: model encoding the title token ids.
        desc_feature_model: model encoding the description token ids.
        categorical_feature_model: model encoding the categorical features.
        sequence_length_info: number of categorical input columns.
        sequence_length_t: title sequence length.
        sequence_length_d: description sequence length.
        name: branch suffix used in the input and model names.

    Returns:
        A Keras ``Model`` with inputs ``[title, description, info]`` and the
        concatenated feature vector as output.
    """
    title_in = Input(shape = (sequence_length_t, ), name = 'title_{}'.format(name))
    desc_in = Input(shape = (sequence_length_d, ), name = 'desc_{}'.format(name))
    info_in = Input(shape = (sequence_length_info, ), name = 'info_{}'.format(name))

    title_vec = title_feature_model(title_in)
    desc_vec = desc_feature_model(desc_in)
    info_vec = categorical_feature_model(info_in)

    # The merge layer and the branch model deliberately share the same name.
    merged_name = 'merge_features_{}'.format(name)
    merged = concatenate([info_vec, title_vec, desc_vec], name = merged_name)

    return Model(inputs=[title_in, desc_in, info_in], outputs=[merged], name = merged_name)

In [33]:
def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=1):
    """Assemble and compile the triplet similarity model.

    The model takes the inputs of the three branches (anchor, then positive,
    then negative) and outputs the anchor/positive and anchor/negative cosine
    scores stacked into a single tensor, trained with ``custom_margin_loss``.

    Args:
        encoded_anchor: branch model encoding the anchor bug.
        encoded_positive: branch model encoding the duplicate (positive) bug.
        encoded_negative: branch model encoding the non-duplicate (negative) bug.
        decay_lr: multiplier applied to the base learning rate of 1e-3.

    Returns:
        The compiled Keras ``Model`` named ``'Similarity_Model'``.
    """
    # Plain list concatenation keeps the symbolic tensors untouched. Routing
    # them through np.concatenate(...).tolist() relies on NumPy wrapping
    # symbolic tensors in an object array, which newer TF/Keras versions
    # reject. The resulting order (anchor, positive, negative) is unchanged.
    inputs = [tensor
              for branch in (encoded_anchor, encoded_positive, encoded_negative)
              for tensor in branch.inputs]

    anchor_vec = encoded_anchor.output
    positive_vec = encoded_positive.output
    negative_vec = encoded_negative.output

    # Cosine score of the anchor against the positive and the negative bug.
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([anchor_vec, positive_vec])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([anchor_vec, negative_vec])

    # A Keras loss function only sees a single output, so both scores are
    # stacked into one tensor and split again inside the loss and the metrics.
    output = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances',
        output_shape=(2, 1)
    )([positive_d, negative_d])

    similarity_model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

    optimizer = Adam(lr=1e-3 * decay_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    # The metrics expose both scores and the margin loss for per-batch logging.
    similarity_model.compile(optimizer=optimizer, loss=custom_margin_loss, metrics=[pos_distance, neg_distance, custom_margin_loss])

    return similarity_model

In [34]:
%%time
import keras

# Inspired by https://pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings: both layers start from the same pretrained matrix and are frozen;
# only the declared sequence length differs.
shared_embedding_args = dict(embeddings=baseline.embedding_matrix,
                             num_words=len(vocab),
                             embedding_dim=EMBEDDING_DIM,
                             trainable=False)
desc_embedding_layer = embedding_layer(max_sequence_length=MAX_SEQUENCE_LENGTH_D, **shared_embedding_args)
title_embedding_layer = embedding_layer(max_sequence_length=MAX_SEQUENCE_LENGTH_T, **shared_embedding_args)

# Feature models
'''
    cnn_model
    lstm_model
    mlp_model
'''
desc_feature_model = cnn_model(desc_embedding_layer, MAX_SEQUENCE_LENGTH_D)
title_feature_model = lstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
categorical_feature_model = mlp_model(number_of_columns_info)

# Similarity model: the anchor ('in'), positive and negative branches share the
# same three feature models and differ only in their name suffix.
encoded_anchor, encoded_positive, encoded_negative = [
    siamese_model(title_feature_model, desc_feature_model, categorical_feature_model,
                  number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, branch_name)
    for branch_name in ('in', 'pos', 'neg')
]

similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


# --- Configuration ---
epochs = 100
# Start from +inf so the first measured loss always becomes the current best.
# With a margin of 1 and scores in [0, 1] the margin loss can reach values >= 1,
# in which case an initial best of 1 would never be replaced.
best_loss = float('inf')
best_epoch = 0
verbose = 0
loss = 1
# Recall is only measured on the final epoch; stays None if the loop never runs.
recall = None

# --- Experiment ---
# Each "epoch" trains on a single freshly sampled batch of triplets.
epoch_report = "Epoch: {} Loss: {:.2f}, MarginLoss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}"
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = baseline.batch_iterator(baseline.train_data, baseline.dup_sets_train, bug_train_ids, batch_size, 1)
    # Same order as the similarity model inputs: anchor, positive, negative,
    # each as title / description / info.
    train_batch = [train_input_sample['title'], train_input_sample['description'], train_input_sample['info'],
                   train_input_pos['title'], train_input_pos['description'], train_input_pos['info'], 
                   train_input_neg['title'], train_input_neg['description'], train_input_neg['info']]

    # h = [loss, pos_distance, neg_distance, custom_margin_loss], following the
    # metric order passed to compile().
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    message = epoch_report.format(epoch+1, h[0], h[3], h[1], h[2])

    if epoch+1 == epochs:
        # Retrieval evaluation is expensive, so it only runs after the last epoch.
        # The ranking is bound to `_` because the following cell previews `_[:20]`.
        recall, _, debug = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, bug_train_ids)
        message += ", recall@25: {:.2f}".format(recall)
    print(message)

    loss = h[3]

    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
recall_text = 'n/a' if recall is None else '{:.2f}'.format(recall)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={}'.format(best_epoch, best_loss, recall_text))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 544)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
info_pos (InputLayer)           (None, 544)          0                                            
__________________________________________________________________________________________________
title_pos 

Epoch: 37 Loss: 0.62, MarginLoss: 0.62, pos_cosine: 0.86, neg_cosine: 0.48
Epoch: 38 Loss: 0.65, MarginLoss: 0.65, pos_cosine: 0.86, neg_cosine: 0.50
Epoch: 39 Loss: 0.68, MarginLoss: 0.68, pos_cosine: 0.87, neg_cosine: 0.55
Epoch: 40 Loss: 0.67, MarginLoss: 0.67, pos_cosine: 0.85, neg_cosine: 0.51
Epoch: 41 Loss: 0.63, MarginLoss: 0.63, pos_cosine: 0.87, neg_cosine: 0.50
Epoch: 42 Loss: 0.67, MarginLoss: 0.67, pos_cosine: 0.84, neg_cosine: 0.50
Epoch: 43 Loss: 0.59, MarginLoss: 0.59, pos_cosine: 0.88, neg_cosine: 0.47
Epoch: 44 Loss: 0.61, MarginLoss: 0.61, pos_cosine: 0.87, neg_cosine: 0.48
Epoch: 45 Loss: 0.65, MarginLoss: 0.65, pos_cosine: 0.86, neg_cosine: 0.52
Epoch: 46 Loss: 0.60, MarginLoss: 0.60, pos_cosine: 0.88, neg_cosine: 0.48
Epoch: 47 Loss: 0.60, MarginLoss: 0.60, pos_cosine: 0.88, neg_cosine: 0.49
Epoch: 48 Loss: 0.65, MarginLoss: 0.65, pos_cosine: 0.87, neg_cosine: 0.51
Epoch: 49 Loss: 0.60, MarginLoss: 0.60, pos_cosine: 0.88, neg_cosine: 0.48
Epoch: 50 Loss: 0.66, Mar

In [35]:
_[:20]

['233472:230872|234329:0.9974548863247037,230092:0.9969204270746559,234917:0.9967435547150671,234918:0.9967435547150671,230090:0.995780489873141,231856:0.9957241797819734,231804:0.9931402602232993,229428:0.9864214025437832,237668:0.5490624308586121,236741:0.5489483177661896,233727:0.5397214591503143,237097:0.5397157371044159,236879:0.5397125780582428,236839:0.5397065579891205,234948:0.5103350281715393,234962:0.5044711232185364,231238:0.5044639408588409,235127:0.5044516324996948,235129:0.5043337345123291,234216:0.5005126595497131,234214:0.5005110502243042,234218:0.4993858337402344,232621:0.49730223417282104,232615:0.4973018765449524,233709:0.49729180335998535,232720:0.49728816747665405,229528:0.4913507103919983,235293:0.4910625219345093,229581:0.49105578660964966',
 '230872:233472|235293:0.9948847237974405,229581:0.993970044888556,236775:0.9050537645816803,235670:0.9050142765045166,234120:0.9050001576542854,229419:0.9049997180700302,237324:0.9049843028187752,237294:0.9049826413393021,23

In [36]:
# '''
#     Between 0-10 epochs recall@25 = 0.28
#     Between 0-20 epochs recall@25 = 0.32
#     Between 0-70 epochs recall@25 = ?
#     Between 0-100 epochs recall@25 = ?
# '''
# recall, exported_rank = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, bug_train_ids)

# "recall@25 last epoch:", recall

### Retrieval evaluation

In [37]:
print("Total of queries:", len(retrieval.test))

Total of queries: 3162


#### Loading the trained model

In [38]:
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'baseline_feature_100epochs_64batch(netbeans)'

In [39]:
model = experiment.get_model_vectorizer(path=SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))



In [40]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 544)          0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          163500      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [None]:
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 0, model, issues_by_buckets, bug_train_ids)

In [45]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/netbeans/exported_rank_baseline.txt'

In [46]:
# Persist the exported ranking, one entry per line.
with open(EXPORT_RANK_PATH, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [47]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.6,
 '2 - recall_at_10': 0.66,
 '3 - recall_at_15': 0.69,
 '4 - recall_at_20': 0.72,
 '5 - recall_at_25': 0.74}

#### Some ideas for visualization

- https://towardsdatascience.com/building-a-recommendation-system-using-neural-network-embeddings-1ef92e5c80c9