# DWEN

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Auxiliary methods

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 20 # 100
MAX_SEQUENCE_LENGTH_D = 20 # 500
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000
'''
    Configuration
'''
epochs = 1000
freeze_train = .1 # 10% with freeze weights
best_loss = float('inf')
best_epoch = 0
verbose = 0
loss = 0

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'netbeans'
METHOD = 'baseline_dwen_{}'.format(epochs)
PREPROCESSING = 'bert'
TOKEN = 'bert'
# Dataset paths
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Path embeddings
GLOVE_DIR='data/embed'
# Save model
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

In [8]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [9]:
from keras_bert import load_vocabulary

token_dict = load_vocabulary(vocab_path)

In [10]:
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D,
                   token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [11]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

#### Loading bug ids in memory

In [12]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


216715

#### Dicionário de títulos e descrições

In [13]:
%%time

experiment.load_bugs(TOKEN)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=216715), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 37.7 s, sys: 11.4 s, total: 49.1 s
Wall time: 1min 34s


#### Hashing bugs by buckets

In [14]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=216715), HTML(value='')))




#### Prepare the train and test

In [15]:
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

In [16]:
baseline.train_data[:10]

[[23370, 26780],
 [103239, 105067],
 [61954, 73016],
 [204317, 202674],
 [220178, 220147],
 [195089, 195269],
 [221186, 219028],
 [102315, 105397],
 [196611, 193682],
 [50448, 50450]]

In [17]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [18]:
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

{'bug_severity': '2\n',
 'bug_status': '0\n',
 'component': '0\n',
 'creation_ts': '2009-11-11 05:56:00 +0000',
 'delta_ts': '2009-11-11 06:13:37 +0000',
 'description': "[CLS] net ##be ##ans id ##e 6 . 8 beta ( build 2009 ##10 ##21 ##200 ##1 ) when you search for a string in the project a window with results is shown , but when you double click on the found item , id ##e won ' t bring you to the editor on the found line of the item but just to the beginning of the document [SEP]",
 'description_segment': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


### Generating the batch test

In [19]:
"Train ", len(baseline.dup_sets_train)

('Train ', 34596)

In [20]:
import random
random.choice(list(issues_by_buckets))

171396

In [21]:
110647 in experiment.baseline.bug_set

True

In [22]:
%%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = experiment.batch_iterator(None, 
                                                                                          baseline.train_data, 
                                                                                          baseline.dup_sets_train,
                                                                                          bug_train_ids,
                                                                                          batch_size_test, 1,
                                                                                          issues_by_buckets)

pos = np.full((1, batch_size_test), 1)
neg = np.full((1, batch_size_test), 0)
valid_sim = np.concatenate([pos, neg], -1)[0]

valid_title_sample_a = np.concatenate([valid_input_sample['title'], valid_input_sample['title']], 0)
valid_title_sample_b = np.concatenate([valid_input_pos['title'], valid_input_neg['title']], 0)
valid_desc_sample_a = np.concatenate([valid_input_sample['description'], valid_input_sample['description']], 0)
valid_desc_sample_b = np.concatenate([valid_input_pos['description'], valid_input_neg['description']], 0)

validation_sample = [valid_title_sample_a, valid_title_sample_b, valid_desc_sample_a, valid_desc_sample_b]

# Max sequence title
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

CPU times: user 760 ms, sys: 2.99 ms, total: 763 ms
Wall time: 763 ms


In [23]:
len(valid_title_sample_a), len(valid_sim)

(256, 256)

In [24]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_sim.shape

((128, 20), (128, 20), (256,))

### Validar entrada

In [25]:
# %%time 

#baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

## Pre-trained embeddings

Loading pretrained word vectors

### Fasttext

In [26]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

vocabulary loaded


In [27]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 19061'

In [28]:
def generating_embed(baseline, GLOVE_DIR, EMBEDDING_DIM):
    embeddings_index = {}
    embed_path = os.path.join(GLOVE_DIR, 'glove.42B.300d.txt')
    
    f2 = open(embed_path, 'rb')
    num_lines = sum(1 for line in f2)
    f2.close()
    
    f = open(embed_path, 'rb')
    vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed.pkl'))
    vocab_size = len(vocab) 

    # Initialize uniform the vector considering the Tanh activation
    embedding_matrix = np.random.uniform(-1.0, 1.0, (num_lines + vocab_size, EMBEDDING_DIM))
    embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

    loop = tqdm(f)
    loop.set_description("Loading Glove")
    i = 0
    for line in loop:
        tokens = line.split()
        word = tokens[0]
        embed = np.asarray(tokens[1:], dtype='float32')
        embeddings_index[word] = embed
        embedding_matrix[i] = embed
        loop.update(1)
    f.close()
    loop.close()

    print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))

    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    
    for word, embed in vocab.items():
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]
        else:
            embedding_matrix[i] = np.asarray(embed, dtype='float32')
        loop.update(1)
        i+=1
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [29]:
%%time

generating_embed(baseline, GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

vocabulary loaded


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.


HBox(children=(IntProgress(value=0, max=19061), HTML(value='')))


CPU times: user 1min 34s, sys: 3.67 s, total: 1min 37s
Wall time: 1min 36s


## Experiment

## Propose

https://github.com/tqtg/DuplicateBugFinder

In [30]:
from keras.initializers import RandomUniform, RandomNormal, Ones

### Embedding layer

In [31]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable, name):
    embedding_layer = Embedding(num_words,
                                  embedding_dim,
                                  name='embedding_layer_{}'.format(name),
                                  weights=[embeddings],
                                  #input_length=max_sequence_length,
                                  input_length=None,
                                  trainable=trainable)
    return embedding_layer

### DWEN model

In [32]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum, Subtract, \
    Average, GlobalAveragePooling1D, BatchNormalization, Activation
from keras.optimizers import Adam, Nadam
import keras.backend as K

def dwen_feature(title_feature_model, desc_feature_model, \
                  sequence_length_t, sequence_length_d, name):
    bug_t = Input(shape = (sequence_length_t, ), name = 'title_{}'.format(name))
    bug_d = Input(shape = (sequence_length_d, ), name = 'desc_{}'.format(name))
    
    # Embedding feature
    bug_t_feat = title_feature_model(bug_t)
    bug_d_feat = desc_feature_model(bug_d)
    
    bug_t_feat = GlobalAveragePooling1D()(bug_t_feat)
    bug_d_feat = GlobalAveragePooling1D()(bug_d_feat)
    
    #bug_feature_output = Add(name = 'merge_features_{}'.format(name))([bug_i_feat, bug_t_feat, bug_d_feat])
    bug_feature_output = Average(name = 'merge_features_{}'.format(name))([bug_t_feat, bug_d_feat])
    
    bug_feature_model = Model(inputs=[bug_t, bug_d], outputs=[bug_feature_output], name = 'merge_features_{}'.format(name))
    
    return bug_feature_model

def dwen_model(bug_feature_output_a, bug_feature_output_b, name):
    
    inputs = np.concatenate([bug_feature_output_a.input, bug_feature_output_b.input], -1).tolist()
    
    bug_feature_output_a = bug_feature_output_a.output
    bug_feature_output_b = bug_feature_output_b.output
    
    # 2D concatenate feature
    bug_feature_output = concatenate([bug_feature_output_a, bug_feature_output_b])
    
    hidden_layers = 2
    
    # Deep Hidden MLPs
    for _ in range(hidden_layers):
        number_of_units = K.int_shape(bug_feature_output)[1]
        bug_feature_output = Dense(number_of_units // 2)(bug_feature_output)
#         bug_feature_output = BatchNormalization()(bug_feature_output)
        bug_feature_output = Activation('relu')(bug_feature_output)
        #bug_feature_output = Dropout(.5)(bug_feature_output)
    
     # Sigmoid
    output = Dense(1, activation='sigmoid')(bug_feature_output)

    similarity_model = Model(inputs=inputs, outputs=[output], name = 'dwen_output')

    #optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=K.epsilon(), schedule_decay=0.01)
    optimizer = Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    # setup the optimization process 
    similarity_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    
    return similarity_model

In [33]:
import _pickle as pickle
def save_loss(result):
    with open(os.path.join(DIR,'{}_log.pkl'.format(METHOD)), 'wb') as f:
        pickle.dump(result, f)
    print("=> result saved!")

In [34]:
# Domain to use
limit_train = int(epochs * freeze_train) # 10% de 1000 , 100 epocas
METHOD = 'baseline_dwen_{}'.format(limit_train)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

In [35]:
%%time

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings
desc_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(baseline.embedding_matrix), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=False, name='desc')
title_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(baseline.embedding_matrix), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=False, name='title')

# Similarity model
bug_feature_output_a = dwen_feature(title_embedding_layer, desc_embedding_layer, 
                                    MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'dwen_a')
bug_feature_output_b = dwen_feature(title_embedding_layer, desc_embedding_layer, 
                                    MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'dwen_b')
similarity_model = dwen_model(bug_feature_output_a, bug_feature_output_b, 'dwen')

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


'''
    Experiment
'''
result = { 'train' : [], 'test' : [] }
print("Total of ", limit_train)
for epoch in range(limit_train):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = experiment.batch_iterator(None, baseline.train_data, baseline.dup_sets_train, 
                                                  bug_train_ids, batch_size, 1, issues_by_buckets, TRIPLET_HARD=False)
    
    num_batch = train_input_sample['title'].shape[0]
    pos = np.full((1, num_batch), 1)
    neg = np.full((1, num_batch), 0)
    train_sim = np.concatenate([pos, neg], -1)[0]
    
    title_sample_a = np.concatenate([train_input_sample['title'], train_input_sample['title']], 0)
    title_sample_b = np.concatenate([train_input_pos['title'], train_input_neg['title']], 0)
    desc_sample_a = np.concatenate([train_input_sample['description'], train_input_sample['description']], 0)
    desc_sample_b = np.concatenate([train_input_pos['description'], train_input_neg['description']], 0)
    train_batch = [title_sample_a, desc_sample_a, title_sample_b, desc_sample_b]
    
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    h_validation = similarity_model.test_on_batch(x=validation_sample, y=valid_sim)
    
    # save results
    result['train'].append(h)
    result['test'].append(h_validation)
    
    if( (epoch+1) % 10 == 0 or (epoch+1 == limit_train) ):
        save_loss(result)
    
    if (epoch+1 == limit_train): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, verbose, bug_feature_output_a, issues_by_buckets, 
                                                        bug_train_ids, 'dwen')
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, acc: {:.2f}, acc_tets: {:.2f}, recall@25: {:.2f}".format(epoch+1, h[0], h_validation[0],  h[1], h_validation[1], recall))
    else:
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, acc: {:.2f}, acc_test: {:.2f}".format(epoch+1, h[0], h_validation[0], h[1], h_validation[1]))
    
    loss = h[0]
    
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

#experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
#experiment.save_model(bug_feature_output_a, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))














Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_dwen_a (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
desc_dwen_a (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
title_dwen_b (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
desc_dwen_b (InputLayer)        (None, 20)           0                                            
__

Epoch: 8 Loss: 0.65, Loss_test: 0.68, acc: 0.72, acc_test: 0.52
Epoch: 9 Loss: 0.66, Loss_test: 0.68, acc: 0.70, acc_test: 0.52
=> result saved!
Epoch: 10 Loss: 0.64, Loss_test: 0.68, acc: 0.73, acc_test: 0.53
Epoch: 11 Loss: 0.64, Loss_test: 0.68, acc: 0.66, acc_test: 0.52
Epoch: 12 Loss: 0.64, Loss_test: 0.68, acc: 0.72, acc_test: 0.52
Epoch: 13 Loss: 0.65, Loss_test: 0.68, acc: 0.65, acc_test: 0.53
Epoch: 14 Loss: 0.65, Loss_test: 0.68, acc: 0.66, acc_test: 0.53
Epoch: 15 Loss: 0.64, Loss_test: 0.68, acc: 0.59, acc_test: 0.53
Epoch: 16 Loss: 0.69, Loss_test: 0.68, acc: 0.55, acc_test: 0.54
Epoch: 17 Loss: 0.62, Loss_test: 0.68, acc: 0.67, acc_test: 0.57
Epoch: 18 Loss: 0.60, Loss_test: 0.68, acc: 0.74, acc_test: 0.54
Epoch: 19 Loss: 0.59, Loss_test: 0.67, acc: 0.71, acc_test: 0.56
=> result saved!
Epoch: 20 Loss: 0.60, Loss_test: 0.68, acc: 0.72, acc_test: 0.57
Epoch: 21 Loss: 0.61, Loss_test: 0.68, acc: 0.70, acc_test: 0.57
Epoch: 22 Loss: 0.65, Loss_test: 0.68, acc: 0.60, acc_test

In [36]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/netbeans/bert/exported_rank_baseline_dwen_100.txt'

In [37]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [38]:
experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(limit_train)))
experiment.save_model(bug_feature_output_a, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(limit_train)), verbose=1)

Saved model 'modelos/model_bert_preprocessing_baseline_dwen_100_feature_100epochs_64batch(netbeans).h5' to disk


In [39]:
len(result['train']), len(result['test'])

(100, 100)

In [40]:
model = similarity_model.get_layer('dense_3')
output = model.output
inputs = similarity_model.inputs
model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

# setup the optimization process 
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_dwen_a (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
desc_dwen_a (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
title_dwen_b (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
desc_dwen_b (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_

In [41]:
# Domain to use
METHOD = 'baseline_dwen_{}'.format(epochs)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

In [42]:
end_train = epochs - limit_train
for epoch in range(limit_train, end_train):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = experiment.batch_iterator(None, baseline.train_data, baseline.dup_sets_train, 
                                                  bug_train_ids, batch_size, 1, issues_by_buckets, TRIPLET_HARD=False)
    
    num_batch = train_input_sample['title'].shape[0]
    pos = np.full((1, num_batch), 1)
    neg = np.full((1, num_batch), 0)
    train_sim = np.concatenate([pos, neg], -1)[0]
    
    title_sample_a = np.concatenate([train_input_sample['title'], train_input_sample['title']], 0)
    title_sample_b = np.concatenate([train_input_pos['title'], train_input_neg['title']], 0)
    desc_sample_a = np.concatenate([train_input_sample['description'], train_input_sample['description']], 0)
    desc_sample_b = np.concatenate([train_input_pos['description'], train_input_neg['description']], 0)
    train_batch = [title_sample_a, desc_sample_a, title_sample_b, desc_sample_b]
    
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    h_validation = similarity_model.test_on_batch(x=validation_sample, y=valid_sim)
    
    # save results
    result['train'].append(h)
    result['test'].append(h_validation)
    
    if( (epoch+1) % 10 == 0 or (epoch+1 == limit_train) ):
        save_loss(result)
    
    if (epoch+1 == epochs): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, verbose, bug_feature_output_a, issues_by_buckets, bug_train_ids, 'dwen')
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, acc: {:.2f}, acc_tets: {:.2f}, recall@25: {:.2f}".format(epoch+1, h[0], h_validation[0],  h[1], h_validation[1], recall))
    else:
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, acc: {:.2f}, acc_test: {:.2f}".format(epoch+1, h[0], h_validation[0], h[1], h_validation[1]))

Epoch: 101 Loss: 0.54, Loss_test: 0.73, acc: 0.74, acc_test: 0.58
Epoch: 102 Loss: 0.47, Loss_test: 0.74, acc: 0.79, acc_test: 0.58
Epoch: 103 Loss: 0.55, Loss_test: 0.73, acc: 0.69, acc_test: 0.59
Epoch: 104 Loss: 0.52, Loss_test: 0.73, acc: 0.70, acc_test: 0.59
Epoch: 105 Loss: 0.50, Loss_test: 0.73, acc: 0.73, acc_test: 0.59
Epoch: 106 Loss: 0.61, Loss_test: 0.72, acc: 0.65, acc_test: 0.59
Epoch: 107 Loss: 0.55, Loss_test: 0.71, acc: 0.71, acc_test: 0.57
Epoch: 108 Loss: 0.51, Loss_test: 0.71, acc: 0.70, acc_test: 0.57
Epoch: 109 Loss: 0.48, Loss_test: 0.72, acc: 0.76, acc_test: 0.57
=> result saved!
Epoch: 110 Loss: 0.51, Loss_test: 0.73, acc: 0.68, acc_test: 0.59
Epoch: 111 Loss: 0.48, Loss_test: 0.74, acc: 0.73, acc_test: 0.59
Epoch: 112 Loss: 0.48, Loss_test: 0.73, acc: 0.72, acc_test: 0.57
Epoch: 113 Loss: 0.51, Loss_test: 0.72, acc: 0.70, acc_test: 0.57
Epoch: 114 Loss: 0.48, Loss_test: 0.72, acc: 0.76, acc_test: 0.57
Epoch: 115 Loss: 0.50, Loss_test: 0.72, acc: 0.74, acc_test

Epoch: 223 Loss: 0.39, Loss_test: 0.75, acc: 0.82, acc_test: 0.61
Epoch: 224 Loss: 0.39, Loss_test: 0.73, acc: 0.78, acc_test: 0.60
Epoch: 225 Loss: 0.44, Loss_test: 0.72, acc: 0.74, acc_test: 0.60
Epoch: 226 Loss: 0.50, Loss_test: 0.72, acc: 0.70, acc_test: 0.60
Epoch: 227 Loss: 0.47, Loss_test: 0.72, acc: 0.73, acc_test: 0.61
Epoch: 228 Loss: 0.50, Loss_test: 0.73, acc: 0.78, acc_test: 0.60
Epoch: 229 Loss: 0.47, Loss_test: 0.75, acc: 0.73, acc_test: 0.61
=> result saved!
Epoch: 230 Loss: 0.43, Loss_test: 0.75, acc: 0.81, acc_test: 0.61
Epoch: 231 Loss: 0.44, Loss_test: 0.75, acc: 0.77, acc_test: 0.61
Epoch: 232 Loss: 0.39, Loss_test: 0.75, acc: 0.86, acc_test: 0.60
Epoch: 233 Loss: 0.49, Loss_test: 0.76, acc: 0.77, acc_test: 0.59
Epoch: 234 Loss: 0.49, Loss_test: 0.76, acc: 0.70, acc_test: 0.58
Epoch: 235 Loss: 0.49, Loss_test: 0.77, acc: 0.72, acc_test: 0.59
Epoch: 236 Loss: 0.44, Loss_test: 0.76, acc: 0.78, acc_test: 0.60
Epoch: 237 Loss: 0.50, Loss_test: 0.76, acc: 0.70, acc_test

Epoch: 345 Loss: 0.50, Loss_test: 0.79, acc: 0.74, acc_test: 0.55
Epoch: 346 Loss: 0.43, Loss_test: 0.78, acc: 0.76, acc_test: 0.57
Epoch: 347 Loss: 0.41, Loss_test: 0.77, acc: 0.82, acc_test: 0.58
Epoch: 348 Loss: 0.45, Loss_test: 0.77, acc: 0.80, acc_test: 0.59
Epoch: 349 Loss: 0.46, Loss_test: 0.76, acc: 0.77, acc_test: 0.58
=> result saved!
Epoch: 350 Loss: 0.47, Loss_test: 0.75, acc: 0.76, acc_test: 0.57
Epoch: 351 Loss: 0.43, Loss_test: 0.75, acc: 0.80, acc_test: 0.58
Epoch: 352 Loss: 0.38, Loss_test: 0.75, acc: 0.84, acc_test: 0.59
Epoch: 353 Loss: 0.42, Loss_test: 0.74, acc: 0.78, acc_test: 0.59
Epoch: 354 Loss: 0.39, Loss_test: 0.75, acc: 0.82, acc_test: 0.61
Epoch: 355 Loss: 0.48, Loss_test: 0.76, acc: 0.73, acc_test: 0.61
Epoch: 356 Loss: 0.39, Loss_test: 0.76, acc: 0.83, acc_test: 0.60
Epoch: 357 Loss: 0.44, Loss_test: 0.77, acc: 0.76, acc_test: 0.61
Epoch: 358 Loss: 0.40, Loss_test: 0.78, acc: 0.84, acc_test: 0.61
Epoch: 359 Loss: 0.48, Loss_test: 0.78, acc: 0.73, acc_test

Epoch: 467 Loss: 0.53, Loss_test: 0.81, acc: 0.77, acc_test: 0.61
Epoch: 468 Loss: 0.41, Loss_test: 0.78, acc: 0.81, acc_test: 0.59
Epoch: 469 Loss: 0.49, Loss_test: 0.77, acc: 0.73, acc_test: 0.58
=> result saved!
Epoch: 470 Loss: 0.36, Loss_test: 0.77, acc: 0.83, acc_test: 0.59
Epoch: 471 Loss: 0.42, Loss_test: 0.77, acc: 0.78, acc_test: 0.58
Epoch: 472 Loss: 0.47, Loss_test: 0.77, acc: 0.75, acc_test: 0.56
Epoch: 473 Loss: 0.47, Loss_test: 0.76, acc: 0.73, acc_test: 0.57
Epoch: 474 Loss: 0.49, Loss_test: 0.76, acc: 0.75, acc_test: 0.57
Epoch: 475 Loss: 0.37, Loss_test: 0.76, acc: 0.81, acc_test: 0.56
Epoch: 476 Loss: 0.40, Loss_test: 0.76, acc: 0.79, acc_test: 0.56
Epoch: 477 Loss: 0.41, Loss_test: 0.75, acc: 0.80, acc_test: 0.56
Epoch: 478 Loss: 0.42, Loss_test: 0.74, acc: 0.80, acc_test: 0.56
Epoch: 479 Loss: 0.38, Loss_test: 0.73, acc: 0.83, acc_test: 0.57
=> result saved!
Epoch: 480 Loss: 0.34, Loss_test: 0.73, acc: 0.84, acc_test: 0.59
Epoch: 481 Loss: 0.47, Loss_test: 0.72, ac

Epoch: 589 Loss: 0.38, Loss_test: 0.83, acc: 0.84, acc_test: 0.64
=> result saved!
Epoch: 590 Loss: 0.36, Loss_test: 0.81, acc: 0.87, acc_test: 0.60
Epoch: 591 Loss: 0.41, Loss_test: 0.81, acc: 0.79, acc_test: 0.60
Epoch: 592 Loss: 0.38, Loss_test: 0.81, acc: 0.80, acc_test: 0.61
Epoch: 593 Loss: 0.35, Loss_test: 0.83, acc: 0.80, acc_test: 0.62
Epoch: 594 Loss: 0.34, Loss_test: 0.85, acc: 0.85, acc_test: 0.62
Epoch: 595 Loss: 0.37, Loss_test: 0.84, acc: 0.79, acc_test: 0.62
Epoch: 596 Loss: 0.31, Loss_test: 0.83, acc: 0.88, acc_test: 0.61
Epoch: 597 Loss: 0.36, Loss_test: 0.82, acc: 0.82, acc_test: 0.60
Epoch: 598 Loss: 0.49, Loss_test: 0.82, acc: 0.76, acc_test: 0.59
Epoch: 599 Loss: 0.37, Loss_test: 0.82, acc: 0.80, acc_test: 0.59
=> result saved!
Epoch: 600 Loss: 0.33, Loss_test: 0.82, acc: 0.86, acc_test: 0.59
Epoch: 601 Loss: 0.34, Loss_test: 0.83, acc: 0.83, acc_test: 0.60
Epoch: 602 Loss: 0.39, Loss_test: 0.82, acc: 0.80, acc_test: 0.60
Epoch: 603 Loss: 0.44, Loss_test: 0.81, ac

Epoch: 711 Loss: 0.32, Loss_test: 0.83, acc: 0.84, acc_test: 0.59
Epoch: 712 Loss: 0.38, Loss_test: 0.83, acc: 0.80, acc_test: 0.58
Epoch: 713 Loss: 0.42, Loss_test: 0.85, acc: 0.78, acc_test: 0.58
Epoch: 714 Loss: 0.39, Loss_test: 0.87, acc: 0.80, acc_test: 0.61
Epoch: 715 Loss: 0.39, Loss_test: 0.90, acc: 0.81, acc_test: 0.62
Epoch: 716 Loss: 0.42, Loss_test: 0.92, acc: 0.77, acc_test: 0.61
Epoch: 717 Loss: 0.37, Loss_test: 0.92, acc: 0.81, acc_test: 0.61
Epoch: 718 Loss: 0.35, Loss_test: 0.91, acc: 0.78, acc_test: 0.62
Epoch: 719 Loss: 0.38, Loss_test: 0.88, acc: 0.82, acc_test: 0.61
=> result saved!
Epoch: 720 Loss: 0.37, Loss_test: 0.88, acc: 0.83, acc_test: 0.60
Epoch: 721 Loss: 0.39, Loss_test: 0.87, acc: 0.83, acc_test: 0.62
Epoch: 722 Loss: 0.41, Loss_test: 0.88, acc: 0.80, acc_test: 0.60
Epoch: 723 Loss: 0.37, Loss_test: 0.90, acc: 0.80, acc_test: 0.59
Epoch: 724 Loss: 0.42, Loss_test: 0.89, acc: 0.81, acc_test: 0.60
Epoch: 725 Loss: 0.45, Loss_test: 0.88, acc: 0.81, acc_test

Epoch: 833 Loss: 0.36, Loss_test: 0.83, acc: 0.85, acc_test: 0.61
Epoch: 834 Loss: 0.35, Loss_test: 0.85, acc: 0.84, acc_test: 0.60
Epoch: 835 Loss: 0.32, Loss_test: 0.86, acc: 0.85, acc_test: 0.59
Epoch: 836 Loss: 0.30, Loss_test: 0.88, acc: 0.84, acc_test: 0.59
Epoch: 837 Loss: 0.37, Loss_test: 0.87, acc: 0.81, acc_test: 0.59
Epoch: 838 Loss: 0.27, Loss_test: 0.87, acc: 0.88, acc_test: 0.59
Epoch: 839 Loss: 0.39, Loss_test: 0.87, acc: 0.81, acc_test: 0.59
=> result saved!
Epoch: 840 Loss: 0.46, Loss_test: 0.87, acc: 0.78, acc_test: 0.59
Epoch: 841 Loss: 0.31, Loss_test: 0.87, acc: 0.87, acc_test: 0.59
Epoch: 842 Loss: 0.41, Loss_test: 0.87, acc: 0.76, acc_test: 0.58
Epoch: 843 Loss: 0.29, Loss_test: 0.87, acc: 0.86, acc_test: 0.60
Epoch: 844 Loss: 0.37, Loss_test: 0.86, acc: 0.83, acc_test: 0.61
Epoch: 845 Loss: 0.42, Loss_test: 0.84, acc: 0.77, acc_test: 0.59
Epoch: 846 Loss: 0.37, Loss_test: 0.84, acc: 0.80, acc_test: 0.61
Epoch: 847 Loss: 0.35, Loss_test: 0.87, acc: 0.84, acc_test

In [43]:
len(result['train']), len(result['test'])

(900, 900)

In [44]:
encoded = model.get_layer('merge_features_dwen_a')
output = encoded.output
inputs = similarity_model.inputs[:-2]
bug_feature_output_a = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

In [45]:
inputs

[<tf.Tensor 'title_dwen_a:0' shape=(?, 20) dtype=float32>,
 <tf.Tensor 'desc_dwen_a:0' shape=(?, 20) dtype=float32>]

In [46]:
SAVE_PATH.replace('@number_of_epochs@', str(epochs))

'bert_preprocessing_baseline_dwen_1000_feature1000epochs_64batch(netbeans)'

In [47]:
experiment.save_model(model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(bug_feature_output_a, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
"Model saved"

Saved model 'modelos/model_bert_preprocessing_baseline_dwen_1000_feature_1000epochs_64batch(netbeans).h5' to disk


'Model saved'

In [48]:
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, verbose, bug_feature_output_a, issues_by_buckets, bug_train_ids, 'dwen')
print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, acc: {:.2f}, acc_tets: {:.2f}, recall@25: {:.2f}".format(epoch+1, h[0], h_validation[0],  h[1], h_validation[1], recall))

Epoch: 900 Loss: 0.35, Loss_test: 0.82, acc: 0.84, acc_tets: 0.61, recall@25: 0.23


In [49]:
exported_rank[:20]

['131079:124863|113328:0.3987081050872803,162864:0.35667872428894043,105649:0.3552509546279907,125999:0.3379114866256714,179131:0.3343706727027893,159111:0.33260560035705566,186208:0.33242470026016235,139742:0.33136165142059326,182083:0.3301457166671753,168140:0.32184046506881714,180299:0.320984423160553,140573:0.30820679664611816,157027:0.30562371015548706,144646:0.29684633016586304,90835:0.29566681385040283,145301:0.29225850105285645,197950:0.2912602424621582,174869:0.2882149815559387,199934:0.28646767139434814,184178:0.28540879487991333,204613:0.2832145690917969,215675:0.28245043754577637,175044:0.2819741368293762,172772:0.28015780448913574,188204:0.2798312306404114,70108:0.2797725200653076,168331:0.2793806791305542,181956:0.27889150381088257,122651:0.27874821424484253',
 '131082:131085|131085:1.0,226664:0.47495633363723755,136948:0.4439436197280884,160038:0.43521732091903687,125532:0.41653430461883545,42231:0.4092305898666382,24343:0.4082835912704468,125435:0.3930480480194092,70217

### Retrieval evaluation

In [50]:
print("Total of queries:", len(retrieval.test))

Total of queries: 17002


#### Getting the model trained

In [51]:
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'bert_preprocessing_baseline_dwen_1000_feature_1000epochs_64batch(netbeans)'

In [52]:
bug_feature_output_a.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_dwen_a (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
desc_dwen_a (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_layer_title (Embeddin (None, 20, 300)      580966500   title_dwen_a[0][0]               
__________________________________________________________________________________________________
embedding_layer_desc (Embedding (None, 20, 300)      580966500   desc_dwen_a[0][0]                
__________________________________________________________________________________________________
global_ave

In [53]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/netbeans/bert/exported_rank_baseline_dwen_1000.txt'

In [54]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [55]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.19,
 '2 - recall_at_10': 0.21,
 '3 - recall_at_15': 0.22,
 '4 - recall_at_20': 0.23,
 '5 - recall_at_25': 0.23}