# deepCOREL_cosine

In [1]:
# Optimize the use of GPUs
# https://datascience.stackexchange.com/questions/23895/multi-gpu-in-keras
# https://keras.io/getting-started/faq/#how-can-i-run-a-keras-model-on-multiple-gpus
# https://stackoverflow.com/questions/56316451/how-to-use-specific-gpus-in-keras-for-multi-gpu-training

In [107]:
# import tensorflow as tf
import keras
# from __future__ import print_function, division
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras import optimizers

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

import os
from keras_bert import load_vocabulary
import random

from keras.initializers import RandomUniform, RandomNormal, Ones

from keras_bert import load_trained_model_from_checkpoint
from keras_bert import compile_model, get_model
from keras.layers import GlobalAveragePooling1D

## required for semi-hard triplet loss:
import tensorflow as tf
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import dtypes
from sklearn.utils.extmath import softmax

from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum, Dot
from keras.optimizers import Adam, Nadam
import _pickle as pickle

from keras.layers import Layer
from keras import backend as K
    
# %matplotlib inline

In [3]:
%env epochs 1000
%env base eclipse

env: epochs=1000
env: base=eclipse


## Auxiliary methods

## Configurações Globais

In [4]:
MAX_SEQUENCE_LENGTH_T = 20 # 100
MAX_SEQUENCE_LENGTH_D = 20 # 500
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000
'''
    Configuration
'''
epochs = int(os.environ['epochs'])
freeze_train = .1 # 10% with freeze weights
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

### Parse bugs preproprecessed

In [5]:
# Domain to use
DOMAIN = os.environ['base']
METHOD = 'deepCOREL_cosine_{}'.format(epochs)
PREPROCESSING = 'bert'
TOKEN = 'bert'
# Dataset paths
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Glove embeddings
GLOVE_DIR='data/embed'
# Save model
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

In [6]:
print("*********")
print("{} for {} epochs in {}".format(METHOD, epochs, DOMAIN))
print("*********")

*********
deepCOREL_cosine_1000 for 1000 epochs in eclipse
*********


In [7]:
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [8]:
token_dict = load_vocabulary(vocab_path)

In [9]:
"Total vocabulary: {}".format(len(token_dict))

'Total vocabulary: 30522'

In [10]:
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D,
                   token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [11]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

#### Loading bug ids in memory

In [12]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


361006

#### Dicionário de títulos e descrições

In [13]:
%%time

experiment.load_bugs(TOKEN)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 34.3 s, sys: 3.13 s, total: 37.4 s
Wall time: 37 s


#### Hashing bugs by buckets

In [14]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




#### Prepare the train and test

In [15]:
%%time

experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

CPU times: user 2.14 s, sys: 43.6 ms, total: 2.18 s
Wall time: 2.17 s


In [16]:
baseline.train_data[:10]

[[275492, 218812],
 [288296, 264093],
 [273286, 293887],
 [57162, 62059],
 [82146, 67997],
 [56777, 61857],
 [169445, 165179],
 [250521, 273893],
 [247266, 241461],
 [36781, 38338]]

#### Recovery bug ids from train

In [17]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

### Export the corpus train

In [18]:
if EXTRACT_CORPUS:
    corpus = []
    export_file = open(os.path.join(DIR, 'corpus_train.txt'), 'w')
    for bug_id in tqdm(baseline.bug_set):
        bug = baseline.bug_set[bug_id]
        title = bug['title']
        desc = bug['description']
        export_file.write("{}\n{}\n".format(title, desc))
    export_file.close()

In [19]:
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

{'bug_severity': '4\n',
 'bug_status': '2\n',
 'component': '337\n',
 'creation_ts': '2008-08-20 05:34:00 -0400',
 'delta_ts': '2008-09-16 04:49:45 -0400',
 'description': '[CLS] ep ##f 1 . 5 build 2008 ##0 ##8 ##18 - 1653 steps : 1 . publish a library such as open ##up , include options " publish gloss ##ary " and " publish index " 2 . brows ##e the published website and open gloss ##ary window 3 . click gloss ##ary index link problems : ( 1 ) no response [SEP]',
 'description_segment': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [20]:
"Train ", len(baseline.dup_sets_train)

('Train ', 39339)

### Train ids

In [21]:
# data - path
# batch_size - 128
# n_neg - 1
def batch_iterator(self, retrieval, model, data, dup_sets, bug_ids, 
                   batch_size, n_neg, issues_by_buckets, TRIPLET_HARD=False, FLOATING_PADDING=False):
    # global train_data
    # global self.dup_sets
    # global self.bug_ids
    # global self.bug_set

    random.shuffle(data)

    batch_features = {'title' : [], 'desc' : [], 'info' : []}
    n_train = len(data)

    batch_triplets, batch_bugs_anchor, batch_bugs_pos, batch_bugs_neg, batch_bugs = [], [], [], [], []

    all_bugs = list(issues_by_buckets.keys())
    buckets = retrieval.buckets

    for offset in range(batch_size):
        anchor, pos = data[offset][0], data[offset][1]
        batch_bugs_anchor.append(anchor)
        batch_bugs_pos.append(pos)
        batch_bugs.append(anchor)
        batch_bugs.append(pos)
        #batch_bugs += dup_sets[anchor]

    for anchor, pos in zip(batch_bugs_anchor, batch_bugs_pos):
        while True:
            neg = self.get_neg_bug(anchor, buckets[issues_by_buckets[anchor]], issues_by_buckets, all_bugs)
            bug_anchor = self.bug_set[anchor]
            bug_pos = self.bug_set[pos]
            if neg not in self.bug_set:
                continue
            batch_bugs.append(neg)
            batch_bugs_neg.append(neg)
            bug_neg = self.bug_set[neg]
            break
        
        # triplet bug and master
        batch_triplets.append([anchor, pos, neg])
    
    random.shuffle(batch_bugs)
    title_ids = np.full((len(batch_bugs), MAX_SEQUENCE_LENGTH_T), 0)
    description_ids = np.full((len(batch_bugs), MAX_SEQUENCE_LENGTH_D), 0)
    for i, bug_id in enumerate(batch_bugs):
        bug = self.bug_set[bug_id]
        self.read_batch_bugs(batch_features, bug, index=i, title_ids=title_ids, description_ids=description_ids)

    batch_features['title'] = { 'token' : np.array(batch_features['title']), 'segment' : title_ids }
    batch_features['desc'] = { 'token' : np.array(batch_features['desc']), 'segment' : description_ids }
    batch_features['info'] = np.array(batch_features['info'])
    
    sim = np.asarray([issues_by_buckets[bug_id] for bug_id in batch_bugs])

    input_sample = {}

    input_sample = { 'title' : batch_features['title'], 
                        'description' : batch_features['desc'], 
                            'info' : batch_features['info'] }

    return batch_triplets, input_sample, sim #sim

In [22]:
# %%time

batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
batch_triplets_valid, valid_input_sample, valid_sim = batch_iterator(baseline, retrieval, None, 
                                                                                      baseline.train_data, 
                                                                                      baseline.dup_sets_train,
                                                                                      bug_train_ids,
                                                                                      batch_size_test, 1,
                                                                                      issues_by_buckets)

validation_sample = [valid_input_sample['title']['token'], valid_input_sample['title']['segment'], 
                   valid_input_sample['description']['token'], valid_input_sample['description']['segment'],
                   valid_input_sample['info'], valid_sim]

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]
# Max sequence title
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title']['token'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description']['token'].shape[1]

In [23]:
valid_input_sample['title']['token'].shape, \
valid_input_sample['description']['token'].shape, \
valid_input_sample['title']['segment'].shape, \
valid_input_sample['description']['segment'].shape, \
valid_input_sample['info'].shape, valid_sim.shape

((384, 20), (384, 20), (384, 20), (384, 20), (384, 1682), (384,))

### Validar entrada

In [24]:
# %%time 

#baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5)

In [25]:
"Test ", len(baseline.test_data)

('Test ', 16995)

## Experiment

## Propose

https://github.com/tqtg/DuplicateBugFinder

### BERT

https://github.com/CyberZHG/keras-bert

In [265]:
def bert_model(MAX_SEQUENCE_LENGTH, name):
    layer_num = 8
    model = load_trained_model_from_checkpoint(
        config_path,
        model_path,
        training=True,
        use_adapter=True,
        seq_len=MAX_SEQUENCE_LENGTH,
        trainable=['Encoder-{}-MultiHeadSelfAttention-Adapter'.format(i + 1) for i in range(12-layer_num, 13)] +
        ['Encoder-{}-FeedForward-Adapter'.format(i + 1) for i in range(12-layer_num, 13)] +
        ['Encoder-{}-MultiHeadSelfAttention-Norm'.format(i + 1) for i in range(12-layer_num, 13)] +
        ['Encoder-{}-FeedForward-Norm'.format(i + 1) for i in range(layer_num)],
    )

    compile_model(model)
    inputs = model.inputs[:2]
    layers = ['Encoder-{}-MultiHeadSelfAttention-Adapter', 'Encoder-{}-FeedForward-Adapter', 
     'Encoder-{}-MultiHeadSelfAttention-Norm', 'Encoder-{}-FeedForward-Norm']
    outputs = []
    for i in range(1, 13):
        outputs += [ model.get_layer(layer.format(layer_num)).output for layer in layers ]
    outputs = Average()(outputs)
    #outputs = model.get_layer('Extract').output
    outputs = GlobalAveragePooling1D()(outputs)
    outputs = Dense(300, activation='tanh')(outputs)
    
    model = Model(inputs, outputs, name='FeatureBERTGenerationModel{}'.format(name))
    
    return model

### MLP

In [269]:
def mlp_model(input_size):
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    input_size = 300
    
    for units in [64, 32]:
        layer = Dense(units, activation='tanh', kernel_initializer='random_uniform')(info_input)
    
    layer = Dense(input_size, activation='tanh')(info_input)
    
    mlp_feature_model = Model(inputs=[info_input], outputs=[layer], name = 'FeatureMlpGenerationModel')
    
    return mlp_feature_model

### Loss function

In [None]:
# Helper functions for dealing with AR Loss
def get_armask(shape, labels):
    if labels.dtype != tf.int32:
        raise Exception("Labels must be a LongTensor with dtype=int32!")

    #mask = tf.zeros(shape)
    #arr = tf.range(0, shape[0], 1)
    mask = tf.identity(labels)
    mask *= -1
    # want to maximize similarity to the correct classes, so this is negative.
    #mask = mask + - 1.
    mask_arr = tf.cast(tf.math.equal(tf.reshape(arr, (-1, 1)), labels), "float32")
    mask = mask + (mask_arr * -1.)
    return mask

def arloss(attraction_tensor, repulsion_tensor, lam):
    # combine up everything to accumulate across the entire batch
    loss_attraction = tf.reduce_sum(attraction_tensor)
    loss_repulsion = tf.reduce_sum(repulsion_tensor)
    arloss = (lam * loss_attraction) + ((1. - lam) * loss_repulsion)
    return arloss / tf.cast(tf.shape(attraction_tensor)[0], 'float32')

def CosineARLoss(y_true, y_pred):
    
    inputs = y_pred
    labels = inputs[:, :1]
    labels = tf.cast(labels, dtype='int32')
    embeddings =  tf.cast(inputs[:, 1:], dtype='float32')
    lam=0.5
    
    mask = get_armask(tf.shape(embeddings), labels)

    # make the attractor and repulsor, mask them!
    attraction_tensor = mask * embeddings
    repulsion_tensor = (mask + 1.0) * embeddings

    # now, apply the special cosine-COREL rules, taking the argmax and squaring the repulsion
    repulsion_tensor = tf.reduce_max(repulsion_tensor, axis=1)
    repulsion_tensor = repulsion_tensor ** 2

    return arloss(attraction_tensor, repulsion_tensor, lam)

### Propose

In [266]:
def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
  
    # Title
    bug_t_token = Input(shape = (sequence_length_t, ), name = 'title_token_{}'.format(name))
    bug_t_segment = Input(shape = (sequence_length_t, ), name = 'title_segment_{}'.format(name))
    # Description
    bug_d_token = Input(shape = (sequence_length_d, ), name = 'desc_token_{}'.format(name))
    bug_d_segment = Input(shape = (sequence_length_d, ), name = 'desc_segment_{}'.format(name))
    # Categorical
    bug_i = Input(shape = (sequence_length_info, ), name = 'info_{}'.format(name))
    
    bug_t_feat = title_feature_model([bug_t_token, bug_t_segment])
    bug_d_feat = desc_feature_model([bug_d_token, bug_d_segment])
    bug_i_feat = categorical_feature_model(bug_i)
    
    #bug_feature_output = Add(name = 'merge_features_{}'.format(name))([bug_i_feat, bug_t_feat, bug_d_feat])
    bug_feature_output = concatenate([bug_i_feat, bug_t_feat, bug_d_feat], name = 'merge_features_{}'.format(name))
    
    bug_feature_model = Model(inputs=[bug_t_token, bug_t_segment, bug_d_token, bug_d_segment, bug_i], outputs=[bug_feature_output], name = 'merge_features_{}'.format(name))
    
    return bug_feature_model

In [255]:
def cosine_distance(vests):
    x, y = vests
    x_norm = tf.norm(x)
    y_norm = tf.norm(y)
    return K.dot(x, y) / ( x_norm * y_norm)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

def cosine_similarity(y_true, y_pred):
    return tf.reduce_mean(y_pred[::, 1])

class CosineLayer(Layer):

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(CosineLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='cosine_weight', 
                                      shape=(self.output_dim, input_shape[-1]),
                                      initializer="random_normal",
                                      trainable=True)
        super(CosineLayer, self).build(input_shape)  # Be sure to call this at the end
        
    def call(self, x):
        return Lambda(cosine_distance, output_shape=cos_dist_output_shape)([x, tf.transpose(self.W)])
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

def max_margin_objective(encoded_anchor, batch_size, n_classes, decay_lr=1):
    
    input_labels = Input(shape=(1,), name='input_label')    # input layer for labels
    inputs = np.concatenate([encoded_anchor.input, [input_labels]], -1).tolist()
    
    encoded_anchor = encoded_anchor.output
    encoded_anchor = CosineLayer(1)(encoded_anchor)
    
    output = concatenate([input_labels, encoded_anchor])  # concatenating the labels + embeddings
    
    similarity_model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

    # setup the optimization process 
    similarity_model.compile(optimizer='adam', loss=CosineARLoss, metrics=[cosine_similarity]) 
    # metrics=[pos_distance, neg_distance, custom_margin_loss]

    return similarity_model

In [31]:
def save_loss(result):
    with open(os.path.join(DIR,'{}_log.pkl'.format(METHOD)), 'wb') as f:
        pickle.dump(result, f)
    print("=> result saved!")

In [48]:
# Domain to use
result = { 'train' : [], 'test' : [] }
limit_train = int(epochs * freeze_train) # 10% de 1000 , 100 epocas
METHOD = 'deepCOREL_cosine_{}'.format(limit_train)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

In [135]:
n_classes = len([bug for bug in retrieval.buckets if len(retrieval.buckets[bug]) >= 2])
print("Batch size ", batch_size)
print("Number of clusters ", n_classes)

Batch size  64
Number of clusters  63936


In [267]:
### %%time

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Feature models
'''
    cnn_model
    lstm_model
    mlp_model
'''
title_feature_model = bert_model(MAX_SEQUENCE_LENGTH_T, 'Title')
desc_feature_model = bert_model(MAX_SEQUENCE_LENGTH_D, 'Description')
categorical_feature_model = mlp_model(number_of_columns_info)

# Similarity model
encoded_anchor = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'in')

similarity_model = max_margin_objective(encoded_anchor, batch_size, n_classes, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()

'''
    Experiment
'''
print("Total of ", limit_train)
for epoch in range(limit_train):
    batch_triplet_train, \
        train_input_sample, train_sim = batch_iterator(baseline, retrieval, encoded_anchor, baseline.train_data, 
                                                       baseline.dup_sets_train, bug_train_ids, 
                                                           batch_size, 1, issues_by_buckets, TRIPLET_HARD=False)
    train_batch = [train_input_sample['title']['token'], train_input_sample['title']['segment'], 
                   train_input_sample['description']['token'], train_input_sample['description']['segment'],
                   train_input_sample['info'], train_sim]
    
#     if epoch == 10:
#         similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=0.1)
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    h_validation = similarity_model.test_on_batch(x=validation_sample, y=valid_sim)
    
    # save results
    result['train'].append(h)
    result['test'].append(h_validation)
    
    if( (epoch+1) % 10 == 0 or (epoch+1 == limit_train) ):
        save_loss(result)
    
    if (epoch+1 == limit_train): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, 
                                                               bug_train_ids, method='bert')
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, cos: {:.2f}, cos_test: {:.2f}, recall@25: {:.2f}".format(epoch+1, h[0], h[1], h_validation[0], h_validation[1], recall))
    else:
        print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, cos: {:.2f}, cos_test: {:.2f}".format(epoch+1, h[0], h[1], h_validation[0], h_validation[1]))
    loss = h[0]
    
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

# experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
# experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_token_in (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
title_segment_in (InputLayer)   (None, 20)           0                                            
__________________________________________________________________________________________________
desc_token_in (InputLayer)      (None, 20)           0                                            
__________________________________________________________________________________________________
desc_segme

Epoch: 73 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: 0.00
Epoch: 74 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: 0.00
Epoch: 75 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: 0.00
Epoch: 76 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: -0.00
Epoch: 77 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: -0.00
Epoch: 78 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: -0.00
Epoch: 79 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: -0.00
=> result saved!
Epoch: 80 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: 0.00
Epoch: 81 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: 0.00
Epoch: 82 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: 0.00
Epoch: 83 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: 0.00
Epoch: 84 Loss: 0.00, Loss_test: 0.00, cos: 0.00, cos_test: -0.00
Epoch: 85 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: -0.00
Epoch: 86 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test: -0.00
Epoch: 87 Loss: 0.00, Loss_test: -0.00, cos: 0.00, cos_test

Epoch: 100 Loss: 7.45, Loss_test: 6.23, recall@25: 0.19
Best_epoch=100, Best_loss=0.00, Recall@25=0.19

In [34]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/eclipse/bert/exported_rank_deepCOREL_cosine_100.txt'

In [35]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [36]:
experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(limit_train)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(limit_train)), verbose=1)

Saved model 'modelos/model_bert_preprocessing_deepCOREL_cosine_100_feature_100epochs_64batch(eclipse).h5' to disk


In [37]:
len(result['train']), len(result['test'])

(100, 100)

In [38]:
model = similarity_model.get_layer('concatenate_1')
output = model.output
inputs = similarity_model.inputs
model = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

# setup the optimization process 
model.compile(optimizer='adam', loss=CosineARLoss)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_token_in (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
title_segment_in (InputLayer)   (None, 20)           0                                            
__________________________________________________________________________________________________
desc_token_in (InputLayer)      (None, 20)           0                                            
__________________________________________________________________________________________________
desc_segme

In [39]:
# Domain to use
METHOD = 'deepCOREL_cosine_{}'.format(epochs)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(PREPROCESSING, METHOD, DOMAIN)

In [40]:
end_train = epochs - limit_train
for epoch in range(limit_train, end_train):
    batch_triplet_train, \
        train_input_sample, train_sim = batch_iterator(baseline, retrieval, model, baseline.train_data, 
                                                       baseline.dup_sets_train, bug_train_ids, 
                                                           batch_size, 1, issues_by_buckets, TRIPLET_HARD=False)
    train_batch = [train_input_sample['title']['token'], train_input_sample['title']['segment'], 
                   train_input_sample['description']['token'], train_input_sample['description']['segment'],
                   train_input_sample['info'], train_sim]
    

    h = model.train_on_batch(x=train_batch, y=train_sim)
    h_validation = model.test_on_batch(x=validation_sample, y=valid_sim)
    
    # save results
    result['train'].append([h])
    result['test'].append([h_validation])
    
    if( (epoch+1) % 10 == 0 or (epoch+1 == end_train )):
        save_loss(result)
    
    print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}".format(epoch+1, h, h_validation))

Epoch: 101 Loss: 0.00, Loss_test: 0.00
Epoch: 102 Loss: 0.00, Loss_test: 0.00
Epoch: 103 Loss: 0.00, Loss_test: 0.00
Epoch: 104 Loss: 0.00, Loss_test: 0.00
Epoch: 105 Loss: 0.00, Loss_test: 0.00
Epoch: 106 Loss: 0.00, Loss_test: 0.00
Epoch: 107 Loss: 0.00, Loss_test: 0.00
Epoch: 108 Loss: 0.00, Loss_test: 0.00
Epoch: 109 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 110 Loss: 0.00, Loss_test: 0.00
Epoch: 111 Loss: 0.00, Loss_test: 0.00
Epoch: 112 Loss: 0.00, Loss_test: 0.00
Epoch: 113 Loss: 0.00, Loss_test: 0.00
Epoch: 114 Loss: 0.00, Loss_test: 0.00
Epoch: 115 Loss: 0.00, Loss_test: 0.00
Epoch: 116 Loss: 0.00, Loss_test: 0.00
Epoch: 117 Loss: 0.00, Loss_test: 0.00
Epoch: 118 Loss: 0.00, Loss_test: 0.00
Epoch: 119 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 120 Loss: 0.00, Loss_test: 0.00
Epoch: 121 Loss: 0.00, Loss_test: 0.00
Epoch: 122 Loss: 0.00, Loss_test: 0.00
Epoch: 123 Loss: 0.00, Loss_test: 0.00
Epoch: 124 Loss: 0.00, Loss_test: 0.00
Epoch: 125 Loss: 0.00, Loss_te

Epoch: 303 Loss: 0.00, Loss_test: 0.00
Epoch: 304 Loss: 0.00, Loss_test: 0.00
Epoch: 305 Loss: 0.00, Loss_test: 0.00
Epoch: 306 Loss: 0.00, Loss_test: 0.00
Epoch: 307 Loss: 0.00, Loss_test: 0.00
Epoch: 308 Loss: 0.00, Loss_test: 0.00
Epoch: 309 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 310 Loss: 0.00, Loss_test: 0.00
Epoch: 311 Loss: 0.00, Loss_test: 0.00
Epoch: 312 Loss: 0.00, Loss_test: 0.00
Epoch: 313 Loss: 0.00, Loss_test: 0.00
Epoch: 314 Loss: 0.00, Loss_test: 0.00
Epoch: 315 Loss: 0.00, Loss_test: 0.00
Epoch: 316 Loss: 0.00, Loss_test: 0.00
Epoch: 317 Loss: 0.00, Loss_test: 0.00
Epoch: 318 Loss: 0.00, Loss_test: 0.00
Epoch: 319 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 320 Loss: 0.00, Loss_test: 0.00
Epoch: 321 Loss: 0.00, Loss_test: 0.00
Epoch: 322 Loss: 0.00, Loss_test: 0.00
Epoch: 323 Loss: 0.00, Loss_test: 0.00
Epoch: 324 Loss: 0.00, Loss_test: 0.00
Epoch: 325 Loss: 0.00, Loss_test: 0.00
Epoch: 326 Loss: 0.00, Loss_test: 0.00
Epoch: 327 Loss: 0.00, Loss_te

Epoch: 505 Loss: 0.00, Loss_test: 0.00
Epoch: 506 Loss: 0.00, Loss_test: 0.00
Epoch: 507 Loss: 0.00, Loss_test: 0.00
Epoch: 508 Loss: 0.00, Loss_test: 0.00
Epoch: 509 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 510 Loss: 0.00, Loss_test: 0.00
Epoch: 511 Loss: 0.00, Loss_test: 0.00
Epoch: 512 Loss: 0.00, Loss_test: 0.00
Epoch: 513 Loss: 0.00, Loss_test: 0.00
Epoch: 514 Loss: 0.00, Loss_test: 0.00
Epoch: 515 Loss: 0.00, Loss_test: 0.00
Epoch: 516 Loss: 0.00, Loss_test: 0.00
Epoch: 517 Loss: 0.00, Loss_test: 0.00
Epoch: 518 Loss: 0.00, Loss_test: 0.00
Epoch: 519 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 520 Loss: 0.00, Loss_test: 0.00
Epoch: 521 Loss: 0.00, Loss_test: 0.00
Epoch: 522 Loss: 0.00, Loss_test: 0.00
Epoch: 523 Loss: 0.00, Loss_test: 0.00
Epoch: 524 Loss: 0.00, Loss_test: 0.00
Epoch: 525 Loss: 0.00, Loss_test: 0.00
Epoch: 526 Loss: 0.00, Loss_test: 0.00
Epoch: 527 Loss: 0.00, Loss_test: 0.00
Epoch: 528 Loss: 0.00, Loss_test: 0.00
Epoch: 529 Loss: 0.00, Loss_te

Epoch: 707 Loss: 0.00, Loss_test: 0.00
Epoch: 708 Loss: 0.00, Loss_test: 0.00
Epoch: 709 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 710 Loss: 0.00, Loss_test: 0.00
Epoch: 711 Loss: 0.00, Loss_test: 0.00
Epoch: 712 Loss: 0.00, Loss_test: 0.00
Epoch: 713 Loss: 0.00, Loss_test: 0.00
Epoch: 714 Loss: 0.00, Loss_test: 0.00
Epoch: 715 Loss: 0.00, Loss_test: 0.00
Epoch: 716 Loss: 0.00, Loss_test: 0.00
Epoch: 717 Loss: 0.00, Loss_test: 0.00
Epoch: 718 Loss: 0.00, Loss_test: 0.00
Epoch: 719 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 720 Loss: 0.00, Loss_test: 0.00
Epoch: 721 Loss: 0.00, Loss_test: 0.00
Epoch: 722 Loss: 0.00, Loss_test: 0.00
Epoch: 723 Loss: 0.00, Loss_test: 0.00
Epoch: 724 Loss: 0.00, Loss_test: 0.00
Epoch: 725 Loss: 0.00, Loss_test: 0.00
Epoch: 726 Loss: 0.00, Loss_test: 0.00
Epoch: 727 Loss: 0.00, Loss_test: 0.00
Epoch: 728 Loss: 0.00, Loss_test: 0.00
Epoch: 729 Loss: 0.00, Loss_test: 0.00
=> result saved!
Epoch: 730 Loss: 0.00, Loss_test: 0.00
Epoch: 731 Lo

In [41]:
len(result['train']), len(result['test'])

(900, 900)

In [42]:
encoded = model.get_layer('merge_features_in')
output = encoded.output
inputs = similarity_model.inputs[:-1]
encoded_anchor = Model(inputs = inputs, outputs = output, name = 'Similarity_Model')

In [43]:
SAVE_PATH.replace('@number_of_epochs@', str(epochs))

'bert_preprocessing_deepCOREL_cosine_1000_feature1000epochs_64batch(eclipse)'

In [44]:
experiment.save_model(model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
"Model saved"

Saved model 'modelos/model_bert_preprocessing_deepCOREL_cosine_1000_feature_1000epochs_64batch(eclipse).h5' to disk


'Model saved'

In [45]:
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 1, encoded_anchor, issues_by_buckets, 
                                                               bug_train_ids, method='bert')
print("Epoch: {} Loss: {:.2f}, Loss_test: {:.2f}, recall@25: {:.2f}".format(epoch+1, h, h_validation, recall))

HBox(children=(IntProgress(value=0, max=16995), HTML(value='')))




HBox(children=(IntProgress(value=0, max=27321), HTML(value='')))




KeyboardInterrupt: 

In [None]:
recall

In [None]:
exported_rank[:20]

In [None]:
print("Total of queries:", len(retrieval.test))

In [None]:
print(SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))

In [None]:
encoded_anchor.summary()

In [None]:
print(len(exported_rank))

In [None]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
print(EXPORT_RANK_PATH)

In [None]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [None]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
print(report)