# Propose BERT siamese

In [1]:
# import tensorflow as tf
import keras
# from tensorflow.python import keras
import os

Using TensorFlow backend.


In [2]:
# import tensorflow as tf
# import tensorflow.keras.backend as K_tf

# sess = K_tf.get_session()
# uninitialized_variables = set([i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
# init_op = tf.variables_initializer(
#     [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables]
# )
# sess.run(init_op)

In [3]:
from __future__ import print_function, division

In [4]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [5]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras import optimizers

In [6]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Auxiliary methods

## Configurações Globais

In [7]:
# Maximum number of BERT token ids kept per bug title (load_bugs slices titles to this length).
MAX_SEQUENCE_LENGTH_T = 20 # 20
# Maximum number of BERT token ids kept per bug description (load_bugs slices descriptions to this length).
# NOTE(review): the trailing "80" is presumably an earlier setting - confirm before changing.
MAX_SEQUENCE_LENGTH_D = 20 # 80
# NOTE(review): EMBEDDING_DIM and MAX_NB_WORDS are not read by any cell visible in this notebook section.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

### Parse preprocessed bugs

In [8]:
# Bug-tracker domain and the method tag used in saved-model names
DOMAIN = 'openoffice'
METHOD = 'propose_bert'

# Dataset locations: processed bugs, normalized pairs, and the normalized CSV inside the pairs directory
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Directory holding the embeddings
EMBED_DIR = 'data/embed'

# Names under which the trained models are saved
SAVE_PATH = '{}_feature@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)

# Whether the corpus should be extracted
EXTRACT_CORPUS = False

In [9]:
# Instantiate the project helpers imported from `methods.*`.
# `baseline` is the object later cells read bug ids, bug dicts and training pairs from;
# it receives the processed-bug directory, the dataset CSV path and both sequence lengths.
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# The experiment driver is built from the baseline and the evaluator.
experiment = Experiment(baseline, evaluation)

In [10]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=57667), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14567), HTML(value='')))




#### Loading bug ids in memory

In [11]:
# Read the bug ids (presumably into baseline.bug_ids - confirm in methods.experiments),
# then display how many ids are available.
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


72234

#### Dicionário de títulos e descrições

In [12]:
# !wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !unzip -o uncased_L-12_H-768_A-12.zip

In [13]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [14]:
from keras_bert import load_vocabulary

token_dict = load_vocabulary(vocab_path)

In [15]:
"Total vocabulary: {}".format(len(token_dict))

'Total vocabulary: 30522'

In [16]:
import _pickle as pickle
from keras_bert import Tokenizer
tokenizer = Tokenizer(token_dict)

def load_bugs(baseline):
    """Load every pickled bug listed in ``baseline.bug_ids`` and index it on ``baseline``.

    For each id, ``<baseline.DIR>/bugs/<bug_id>.pkl`` is unpickled and stored in
    ``baseline.bug_set``. The BERT token ids of the title and description are cut to
    ``MAX_SEQUENCE_LENGTH_T`` / ``MAX_SEQUENCE_LENGTH_D`` entries, written back into the
    bug dict, and concatenated into ``bug['textual_word_bert']``.
    ``baseline.sentence_dict`` maps the comma-joined token ids back to the raw title and
    description text. ``baseline.corpus`` is reset to an empty list.

    Parameters
    ----------
    baseline : object exposing ``DIR`` and ``bug_ids``; its ``corpus``, ``sentence_dict``
        and ``bug_set`` attributes are overwritten.

    Raises
    ------
    Whatever ``open`` / ``pickle.load`` raise for a missing or corrupt bug file, and
    ``KeyError`` when a bug lacks one of the expected fields. Nothing is skipped silently,
    which keeps ``baseline.bug_ids`` aligned with the loaded bugs.
    """
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    title_padding, desc_padding = [], []
    bugs_dir = os.path.join(baseline.DIR, 'bugs')

    for bug_id in tqdm(baseline.bug_ids):
        # NOTE(security): unpickling executes arbitrary code - only load trusted files.
        # The context manager closes each file handle instead of leaking one per bug.
        with open(os.path.join(bugs_dir, '{}.pkl'.format(bug_id)), 'rb') as bug_file:
            bug = pickle.load(bug_file)
        title_padding.append(bug['title_word_bert'][:MAX_SEQUENCE_LENGTH_T])
        desc_padding.append(bug['description_word_bert'][:MAX_SEQUENCE_LENGTH_D])
        baseline.bug_set[bug_id] = bug

    # Passing `total` lets the progress bar show real progress for the zipped iterator.
    truncated = zip(baseline.bug_ids, title_padding, desc_padding)
    for bug_id, bug_title, bug_desc in tqdm(truncated, total=len(title_padding)):
        bug = baseline.bug_set[bug_id]
        baseline.sentence_dict[",".join(np.array(bug_title, str))] = bug['title']
        baseline.sentence_dict[",".join(np.array(bug_desc, str))] = bug['description']
        bug['title_word_bert'] = bug_title
        bug['description_word_bert'] = bug_desc
        bug['textual_word_bert'] = np.concatenate([bug_title, bug_desc], -1)

In [17]:
%%time

load_bugs(baseline)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=72234), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 4.17 s, sys: 816 ms, total: 4.98 s
Wall time: 4.78 s


#### Hashing bugs by buckets

In [18]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=58572), HTML(value='')))




#### Prepare the train and test

In [19]:
%%time

# Build the train/test split from the files named 'train_chronological' and 'test_chronological'
# (path semantics are defined in methods.experiments - not visible here).
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the duplicate queries used for testing
retrieval.create_queries()

Reading train data
Reading bug ids
CPU times: user 19.9 s, sys: 323 µs, total: 19.9 s
Wall time: 19.9 s


#### Recover bug ids from the training set

In [20]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

#### Display a random bug

In [21]:
# Pick one bug id at random and display its stored dict.
# NOTE(review): no random seed is set in the visible cells, so a different bug is shown on every run.
idx = np.random.choice(baseline.bug_ids, 1)[0]
baseline.bug_set[idx]

{'bug_severity': '1\n',
 'bug_status': '1\n',
 'component': '37\n',
 'creation_ts': '2012-07-16 02:41:00 +0000',
 'delta_ts': '2012-07-25 09:27:32 +0000',
 'description': 'r1359641 build\n\nWindows:\non install panels the product version is still 3.4 but not 3.4.1.\n\nThe shortcuts on desktop and start menu remain 3.4\n\nDEB:\nversion in the desktop-integration package name is 3.4',
 'description_bert': '[CLS] r ##13 ##59 ##64 ##1 build windows : on install panels the product version is still 3 . 4 but not 3 . 4 . 1 . the short ##cut ##s on desktop and start menu remain 3 . 4 de ##b : version in the desktop - integration package name is 3 . 4 [SEP]',
 'description_word_bert': [101,
  1054,
  17134,
  28154,
  21084,
  2487,
  3857,
  3645,
  1024,
  2006,
  16500,
  9320,
  1996,
  4031,
  2544,
  2003,
  2145,
  1017,
  1012,
  1018],
 'dup_id': '[]',
 'issue_id': 120286,
 'priority': '3\n',
 'product': '8\n',
 'resolution': 'FIXED',
 'textual_word_bert': array([  101,  4031,  2544,  

### Generating the batch test

In [22]:
"Train ", len(baseline.dup_sets_train)

('Train ', 11043)

In [23]:
# Index the concatenated title+description BERT token ids of every training bug.
# The first training bug only supplies the vector dimensionality.
# NOTE(review): every bug's 'textual_word_bert' must have this same length for add_item
# to succeed - confirm the stored token lists are padded.
bug_idx = bug_train_ids[0]
vector = baseline.bug_set[bug_idx]['textual_word_bert']
# Pass the metric explicitly: relying on Annoy's implicit default is deprecated.
# 'angular' is the metric the implicit default selected, so results are unchanged.
annoy_train = AnnoyIndex(vector.shape[0], 'angular')
for bug_id in bug_train_ids:
    annoy_train.add_item(bug_id, baseline.bug_set[bug_id]['textual_word_bert'])
annoy_train.build(10) # 10 trees
"Indexed all train"

'Indexed all train'

In [24]:
 # data - path
# batch_size - 128
# n_neg - 1

import random

def read_batch_bugs(baseline, batch, bug):
    """Append one bug's features to the ``batch`` accumulator.

    The six categorical fields are one-hot encoded with ``baseline.to_one_hot`` against
    the matching entry of ``baseline.info_dict`` and concatenated, in a fixed order, into
    a single vector appended to ``batch['info']``. The BERT token ids of the title and
    description are appended to ``batch['title']`` and ``batch['desc']``.
    """
    categorical_fields = ('bug_severity', 'bug_status', 'component',
                          'priority', 'product', 'version')
    one_hot_parts = [
        baseline.to_one_hot(bug[field], baseline.info_dict[field])
        for field in categorical_fields
    ]
    batch['info'].append(np.concatenate(one_hot_parts))
    batch['title'].append(bug['title_word_bert'])
    batch['desc'].append(bug['description_word_bert'])

def get_neg_bug(invalid_bugs, bug_ids, issues_by_buckets):
    """Randomly pick a bug id usable as a negative example.

    The first candidate is drawn from the keys of ``issues_by_buckets``. While the
    candidate is excluded, or is not a key of ``issues_by_buckets``, a new one is
    drawn from ``bug_ids``.

    Parameters
    ----------
    invalid_bugs : a single bug id, or a collection of bug ids, that must not be returned.
    bug_ids : sequence the replacement candidates are drawn from.
    issues_by_buckets : mapping whose keys are the ids accepted as negatives.

    Returns
    -------
    The selected bug id.

    Notes
    -----
    The loop does not terminate if ``bug_ids`` contains no acceptable id.
    """
    # A scalar id (or a str/bytes id, which must not be substring-matched) is wrapped
    # explicitly. This replaces a bare `except:` that detected the scalar case through
    # the TypeError raised by `in`, and that also swallowed every unrelated error.
    if isinstance(invalid_bugs, (str, bytes)) or not hasattr(invalid_bugs, '__iter__'):
        invalid_bugs = [invalid_bugs]

    neg_bug = random.choice(list(issues_by_buckets.keys()))
    while neg_bug in invalid_bugs or neg_bug not in issues_by_buckets:
        neg_bug = random.choice(bug_ids)
    return neg_bug

def get_neg_bug_semihard(baseline, model, batch_bugs, anchor, invalid_bugs):
    """Pick the in-batch bug whose embedding is closest to the anchor's as a hard negative.

    The anchor and every candidate are embedded with ``model.predict``, with all-zero
    segment ids for both title and description. The candidates are indexed with Annoy
    and the nearest one to the anchor embedding is returned.

    Parameters
    ----------
    baseline : object whose ``bug_set`` maps bug id -> bug dict.
    model : feature model taking [title tokens, title segments, description tokens,
        description segments, categorical info] and returning one embedding per row.
    batch_bugs : ids forming the candidate pool.
    anchor : id of the anchor bug.
    invalid_bugs : ids that must not be returned (the anchor's duplicates).

    Returns
    -------
    The id of the nearest candidate.

    Raises
    ------
    ValueError
        If no candidate remains once the invalid ids and the anchor are removed.

    Notes
    -----
    Uses the notebook-level ``retrieval`` object to build the categorical vector.
    """
    anchor_bug = baseline.bug_set[anchor]
    anchor_title = [anchor_bug['title_word_bert']]
    anchor_desc = [anchor_bug['description_word_bert']]
    vector = model.predict([np.array(anchor_title),
                            np.zeros_like(anchor_title),
                            np.array(anchor_desc),
                            np.zeros_like(anchor_desc),
                            np.array([retrieval.get_info(anchor_bug)])])

    # The anchor itself must never be a candidate: it is its own nearest neighbour and
    # would be returned as the "negative" whenever it is not listed in invalid_bugs.
    batch_bugs_wo_positives = list(set(batch_bugs) - set(invalid_bugs) - {anchor})
    if not batch_bugs_wo_positives:
        raise ValueError('No negative candidate left in the batch for bug {}'.format(anchor))

    title_data, desc_data, info_data = [], [], []
    for bug_id in batch_bugs_wo_positives:
        bug = baseline.bug_set[bug_id]
        title_data.append(bug['title_word_bert'])
        desc_data.append(bug['description_word_bert'])
        info_data.append(retrieval.get_info(bug))
    embeds = model.predict([np.array(title_data), np.zeros_like(title_data),
                            np.array(desc_data), np.zeros_like(desc_data),
                            np.array(info_data)])

    # Metric passed explicitly; 'angular' is what the implicit default selected.
    annoy = AnnoyIndex(vector.shape[1], 'angular')
    for bug_id, embed in zip(batch_bugs_wo_positives, embeds):
        annoy.add_item(bug_id, embed)
    annoy.build(10) # 10 trees
    rank = annoy.get_nns_by_vector(vector[0], 20, include_distances=False)
    return rank[0]

def _empty_batch():
    """Return a fresh accumulator for the title / description / info columns of a batch."""
    return {'title': [], 'desc': [], 'info': []}


def _pack_batch(batch, title_segments, desc_segments):
    """Turn an accumulator filled by read_batch_bugs into the nested dict handed to callers."""
    return {
        'title': {'token': np.array(batch['title']), 'segment': title_segments},
        'description': {'token': np.array(batch['desc']), 'segment': desc_segments},
        'info': np.array(batch['info']),
    }


def _sample_negative(baseline, model, batch_bugs, anchor, duplicates, issues_by_buckets):
    """Draw negatives for ``anchor`` until one exists in ``baseline.bug_set``; return (id, bug)."""
    while True:
        if model is None:
            # NOTE(review): the anchor is passed as `invalid_bugs` and its duplicate set as
            # the resampling pool, exactly as before - confirm this argument order is intended.
            neg_id = get_neg_bug(anchor, duplicates, issues_by_buckets)
        else:
            neg_id = get_neg_bug_semihard(baseline, model, batch_bugs, anchor, duplicates)
        if neg_id in baseline.bug_set:
            return neg_id, baseline.bug_set[neg_id]


def batch_iterator(baseline, model, data, dup_sets, bug_train_ids, batch_size, n_neg, issues_by_buckets):
    """Build one batch of (anchor, positive, negative, master, master-negative) bugs.

    ``data`` is shuffled IN PLACE and its first ``batch_size`` pairs are used. Fewer
    pairs are used when ``data`` is shorter than ``batch_size``. For every pair a
    negative is sampled for the anchor and another one for the anchor's master bug
    (``issues_by_buckets[anchor]``). Sampling is random when ``model`` is None and
    nearest-neighbour based otherwise.

    Parameters
    ----------
    baseline : object exposing ``bug_set``, ``to_one_hot`` and ``info_dict``.
    model : feature model used for semi-hard negative mining, or None for random negatives.
    data : mutable sequence of (anchor id, positive id) pairs.
    dup_sets : mapping anchor id -> ids of its duplicates.
    bug_train_ids : unused, kept for interface compatibility.
    batch_size : maximum number of pairs in the batch.
    n_neg : unused, kept for interface compatibility.
    issues_by_buckets : mapping bug id -> master bug id.

    Returns
    -------
    tuple
        ``(batch_triplets, input_sample, input_pos, input_neg, master_input_sample,
        master_neg, sim)``. Each ``input_*`` / ``master_*`` value is a dict with keys
        ``'title'`` and ``'description'`` (each ``{'token': array, 'segment': zeros}``)
        and ``'info'``. ``batch_triplets`` holds
        ``[anchor id, pos id, neg id, master bug dict, master-negative bug dict]`` rows.
        ``sim`` is a label vector with one entry per triplet: ones for the first half,
        zeros for the rest (a single random label when there are fewer than two triplets).
    """
    random.shuffle(data)

    # Clamp to the available pairs instead of raising IndexError on a short dataset.
    n_batch = min(batch_size, len(data))

    batch_bugs_anchor, batch_bugs_pos, batch_bugs = [], [], []
    for offset in range(n_batch):
        anchor, pos = data[offset][0], data[offset][1]
        batch_bugs_anchor.append(anchor)
        batch_bugs_pos.append(pos)
        batch_bugs += dup_sets[anchor]

    batch_input, batch_pos, batch_neg = _empty_batch(), _empty_batch(), _empty_batch()
    master_batch_input, master_batch_neg = _empty_batch(), _empty_batch()
    batch_triplets = []

    for anchor, pos in zip(batch_bugs_anchor, batch_bugs_pos):
        duplicates = dup_sets[anchor]
        bug_anchor = baseline.bug_set[anchor]
        bug_pos = baseline.bug_set[pos]
        neg, bug_neg = _sample_negative(baseline, model, batch_bugs, anchor,
                                        duplicates, issues_by_buckets)

        # The master of the anchor's bucket gets its own negative.
        master_id = issues_by_buckets[anchor]
        master_anchor = baseline.bug_set[master_id]
        _, master_neg_bug = _sample_negative(baseline, model, batch_bugs, master_id,
                                             duplicates, issues_by_buckets)

        read_batch_bugs(baseline, batch_input, bug_anchor)
        read_batch_bugs(baseline, batch_pos, bug_pos)
        read_batch_bugs(baseline, batch_neg, bug_neg)
        read_batch_bugs(baseline, master_batch_input, master_anchor)
        read_batch_bugs(baseline, master_batch_neg, master_neg_bug)
        # The last two entries are bug dicts, not ids (kept as callers received them before).
        batch_triplets.append([anchor, pos, neg, master_anchor, master_neg_bug])

    n_triplets = len(batch_triplets)
    # Every row uses segment id 0; the same arrays are shared by all returned dicts.
    title_ids = np.full((n_triplets, MAX_SEQUENCE_LENGTH_T), 0)
    description_ids = np.full((n_triplets, MAX_SEQUENCE_LENGTH_D), 0)

    input_sample = _pack_batch(batch_input, title_ids, description_ids)
    input_pos = _pack_batch(batch_pos, title_ids, description_ids)
    input_neg = _pack_batch(batch_neg, title_ids, description_ids)
    master_input_sample = _pack_batch(master_batch_input, title_ids, description_ids)
    master_neg = _pack_batch(master_batch_neg, title_ids, description_ids)

    n_half = n_triplets // 2
    if n_half > 0:
        # One label per triplet: the zero half absorbs the extra row of an odd batch,
        # so `sim` always matches the batch length.
        sim = np.concatenate([np.full(n_half, 1), np.full(n_triplets - n_half, 0)])
    else:
        sim = np.array([np.random.choice([1, 0])])

    return batch_triplets, input_sample, input_pos, input_neg, master_input_sample, master_neg, sim

### Train ids

In [25]:
# NOTE(review): recomputes the same expression as the earlier "bug ids from train" cell; likely redundant.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [26]:
%%time

# Training batch size and the size of the fixed validation batch
batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
# Note: this batch is drawn from baseline.train_data with random negatives (model=None),
# and batch_iterator shuffles baseline.train_data in place.
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, \
                            valid_master_sample, valid_master_neg, valid_sim = batch_iterator(baseline, None, baseline.train_data, 
                                                                                          baseline.dup_sets_train,
                                                                                          bug_train_ids,
                                                                                          batch_size_test, 1, 
                                                                                              issues_by_buckets)

# Width of the concatenated one-hot categorical vector
number_of_columns_info = valid_input_sample['info'].shape[1]
# The global sequence lengths are rebound here to the widths actually observed in the batch;
# later cells (model construction) read these rebound values.
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title']['token'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description']['token'].shape[1]

CPU times: user 208 ms, sys: 0 ns, total: 208 ms
Wall time: 208 ms


In [27]:
valid_input_sample['title']['token'].shape, valid_input_sample['description']['token'].shape, \
    valid_input_sample['info'].shape, valid_sim.shape

((128, 20), (128, 20), (128, 729), (128,))

### Validar entrada

In [28]:
# %%time 

# baseline.display_batch(baseline.train_data, baseline.dup_sets_train, bug_train_ids, 5, batch_iterator, issues_by_buckets)

## Experiment

## Propose

https://github.com/tqtg/DuplicateBugFinder

In [29]:
from keras.initializers import RandomUniform, RandomNormal, Ones

### BERT

https://github.com/CyberZHG/keras-bert

In [30]:
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import compile_model, get_model
from keras.layers import GlobalAveragePooling1D

def bert_model(MAX_SEQUENCE_LENGTH, name):
    """Build a sentence-feature extractor on top of the pre-trained BERT checkpoint.

    The checkpoint at ``config_path`` / ``model_path`` is loaded with adapters enabled and
    ``training=False``. The output of the ``Encoder-8-FeedForward-Norm`` layer is averaged
    over the sequence axis, and a Keras model mapping the first two BERT inputs (token
    and segment ids) to that pooled vector is returned.

    Parameters
    ----------
    MAX_SEQUENCE_LENGTH : sequence length passed to the checkpoint loader as ``seq_len``.
    name : suffix appended to the model name ``FeatureBERTGenerationModel``.

    Notes
    -----
    Earlier variants tried in this cell were a fully trainable checkpoint
    (``training=True, trainable=True``) and a small BERT built from scratch with
    ``get_model``.
    """
    layer_num = 8

    # NOTE(review): the first three groups cover encoder blocks 5..13 while the last covers
    # blocks 1..8, and the features are read from block 8 - confirm which blocks were meant
    # to be trainable (the checkpoint directory name suggests only 12 blocks exist).
    upper_blocks = range(12 - layer_num, 13)
    lower_blocks = range(layer_num)
    trainable_layers = []
    for template, blocks in (
        ('Encoder-{}-MultiHeadSelfAttention-Adapter', upper_blocks),
        ('Encoder-{}-FeedForward-Adapter', upper_blocks),
        ('Encoder-{}-MultiHeadSelfAttention-Norm', upper_blocks),
        ('Encoder-{}-FeedForward-Norm', lower_blocks),
    ):
        trainable_layers.extend(template.format(i + 1) for i in blocks)

    bert = load_trained_model_from_checkpoint(
        config_path,
        model_path,
        training=False,
        use_adapter=True,
        seq_len=MAX_SEQUENCE_LENGTH,
        trainable=trainable_layers,
    )
    compile_model(bert)

    token_and_segment_inputs = bert.inputs[:2]
    encoder_output = bert.get_layer('Encoder-{}-FeedForward-Norm'.format(layer_num)).output
    pooled_output = GlobalAveragePooling1D()(encoder_output)

    return Model(token_and_segment_inputs, pooled_output,
                 name='FeatureBERTGenerationModel{}'.format(name))

### MLP

In [31]:
def mlp_model(input_size):
    """Build the categorical-feature encoder: one tanh dense layer with 300 units.

    Parameters
    ----------
    input_size : width of the categorical input vector.

    Returns
    -------
    Keras model named ``FeatureMlpGenerationModel`` mapping the ``Feature_BugInput``
    input to the dense layer output.
    """
    hidden_units = 300
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    projected = Dense(hidden_units, activation='tanh')(info_input)
    return Model(inputs=[info_input], outputs=[projected], name='FeatureMlpGenerationModel')

### Siamese model

In [32]:
from keras import backend as K

'''
    Some loss ideas
    hinge loss Kullback-Leibler
    https://stackoverflow.com/questions/53581298/custom-combined-hinge-kb-divergence-loss-function-in-siamese-net-fails-to-genera
'''

def normalize(x, axis):
    """Return ``x`` unchanged together with its L2 norm along ``axis``.

    The norm is floored at ``K.epsilon()`` so callers can divide by it safely.
    Note that ``x`` itself is NOT rescaled.
    """
    squared_sum = K.sum(K.square(x), axis=axis, keepdims=False)
    safe_norm = K.maximum(K.sqrt(squared_sum), K.epsilon())
    return x, safe_norm
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    """Return the mean rescaled cosine similarity between two batches of vectors.

    Despite the name, the value is a similarity: the cosine of each row pair is mapped
    from [-1, 1] to [0, 1] with ``(cos + 1) / 2`` (1 means same direction), and the
    per-row values are then averaged over the remaining axis.

    Parameters
    ----------
    inputs : pair ``[x, y]`` of tensors compared along their last axis.
    """
    left, right = inputs
    # Norms are floored at K.epsilon() to avoid dividing by zero.
    left_norm = K.maximum(K.sqrt(K.sum(K.square(left), axis=-1, keepdims=False)), K.epsilon())
    right_norm = K.maximum(K.sqrt(K.sum(K.square(right), axis=-1, keepdims=False)), K.epsilon())
    cosine = K.sum(left * right, axis=-1) / (left_norm * right_norm)
    rescaled = (cosine + K.constant(1)) / K.constant(2)
    return K.mean(rescaled, axis=-1, keepdims=False)

def euclidean_distance(vects):
    """Return the mean of ``1 / (1 + d)`` where ``d`` is the row-wise Euclidean distance.

    The squared distance is floored at ``K.epsilon()`` before the square root, and the
    mean is taken over all elements, so the result is a single similarity-like value in
    (0, 1].

    Parameters
    ----------
    vects : pair ``[x, y]`` of tensors compared along axis 1.
    """
    left, right = vects
    squared = K.sum(K.square(left - right), axis=1, keepdims=True)
    l2 = K.sqrt(K.maximum(squared, K.epsilon()))
    # Map the distance to a bounded score:
    # https://stats.stackexchange.com/questions/53068/euclidean-distance-score-and-similarity
    score = K.constant(1) / (K.constant(1) + l2)
    return K.mean(score, keepdims=False)

# https://jdhao.github.io/2017/03/13/some_loss_and_explanations/
def triplet_loss(y_true, y_pred):
    """Hinge on ``pos - neg + 1``: treats ``y_pred[0]`` / ``y_pred[1]`` as distances.

    ``y_true`` is ignored. The loss is zero once the positive entry is at least one
    margin smaller than the negative entry.
    """
    positive, negative = y_pred[0], y_pred[1]
    violation = positive - negative + K.constant(1.0)
    return K.mean(K.maximum(0.0, violation))

def custom_margin_loss(y_true, y_pred):
    """Hinge on ``1 - pos + neg``: treats ``y_pred[0]`` / ``y_pred[1]`` as similarities.

    ``y_true`` is ignored. The loss shrinks as the positive entry grows and the
    negative entry falls, reaching zero when ``pos - neg`` is at least the margin of 1.
    """
    positive, negative = y_pred[0], y_pred[1]
    shortfall = K.constant(1.0) - positive + negative
    return K.mean(K.maximum(0.0, shortfall), keepdims=False)

# https://www.kaggle.com/c/quora-question-pairs/discussion/33631
# https://www.researchgate.net/figure/Illustration-of-triplet-loss-contrastive-loss-for-negative-samples-and-binomial_fig2_322060548
def contrastive_loss(y_true, y_pred):
    '''Contrastive-style loss after Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    NOTE(review): ``y_true`` is ignored; ``y_pred[0]`` plays the role of the label and
    ``y_pred[1]`` the role of the distance - confirm this matches the intended formula.
    '''
    weight, distance = y_pred[0], y_pred[1]
    margin = 1
    attract_term = weight * K.square(distance)
    repel_term = (1 - weight) * K.square(K.maximum(margin - distance, 0))
    return K.mean(attract_term + repel_term)

def pos_distance(y_true, y_pred):
    """Metric: report the first entry of the stacked prediction (the positive-pair score)."""
    positive_score = y_pred[0]
    return positive_score

def neg_distance(y_true, y_pred):
    """Metric: report the second entry of the stacked prediction (the negative-pair score)."""
    negative_score = y_pred[1]
    return negative_score

def stack_tensors(vects):
    """Stack the given tensors along a new last axis so a loss can index them as one output."""
    stacked = K.stack(vects, axis=-1)
    return stacked

### Propose

In [33]:
from keras.layers import concatenate, Add, Lambda, Average, Maximum, Subtract, Average, AveragePooling1D, GlobalAveragePooling1D
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
    """Build one branch of the siamese network: a bug in, a merged feature vector out.

    Five inputs are created (title tokens and segments, description tokens and segments,
    categorical info), each suffixed with ``name``. They are fed through the shared
    title, description and categorical feature models, and the three feature vectors
    are concatenated in the order info, title, description.

    Returns
    -------
    Keras model named ``merge_features_<name>`` with inputs ordered
    [title token, title segment, desc token, desc segment, info].
    """
    input_specs = [
        (sequence_length_t, 'title_token_{}'),
        (sequence_length_t, 'title_segment_{}'),
        (sequence_length_d, 'desc_token_{}'),
        (sequence_length_d, 'desc_segment_{}'),
        (sequence_length_info, 'info_{}'),
    ]
    title_token, title_segment, desc_token, desc_segment, info = [
        Input(shape=(length, ), name=template.format(name))
        for length, template in input_specs
    ]

    title_features = title_feature_model([title_token, title_segment])
    desc_features = desc_feature_model([desc_token, desc_segment])
    info_features = categorical_feature_model(info)

    # The concatenation layer and the branch model share the same name.
    merged_name = 'merge_features_{}'.format(name)
    merged = concatenate([info_features, title_features, desc_features], name=merged_name)

    return Model(inputs=[title_token, title_segment, desc_token, desc_segment, info],
                 outputs=[merged], name=merged_name)

In [34]:
from keras.layers import Average
from keras_radam import RAdam
from keras_bert import AdamWarmup, calc_train_steps

def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, 
                             master_anchor, master_negative, master_positive, 
                         NUMBER_OF_INSTANCES, BATCH_SIZE, EPOCHS, decay_lr=1):
    """Assemble and compile the similarity model trained with ``custom_margin_loss``.

    Four (positive, negative) score pairs are computed with ``cosine_distance``:
    anchor vs. positive/negative bug, anchor vs. positive/negative master, positive bug
    vs. positive/negative master, and negative bug vs. negative/positive master. Each
    pair is stacked into one tensor, the four stacks are averaged, and the average is
    the model's single output.

    Parameters
    ----------
    encoded_anchor, encoded_positive, encoded_negative : branch models for the bug triplet.
    master_anchor, master_negative, master_positive : branch models for the master bugs.
        ``master_anchor`` only contributes its inputs; its output feeds no distance
        (NOTE(review): confirm this is intended).
    NUMBER_OF_INSTANCES, BATCH_SIZE, EPOCHS, decay_lr : unused, kept for interface
        compatibility.

    Returns
    -------
    Compiled Keras model named ``Similarity_Model``. Its inputs are the branch inputs in
    the order anchor, positive, negative, master anchor, master positive, master negative.
    """
    # Plain list concatenation of the branch inputs. The previous np.concatenate call
    # depended on numpy coercing symbolic tensors into an object array.
    inputs = []
    for branch in (encoded_anchor, encoded_positive, encoded_negative,
                   master_anchor, master_positive, master_negative):
        inputs.extend(branch.inputs)

    anchor_out = encoded_anchor.output
    positive_out = encoded_positive.output
    negative_out = encoded_negative.output
    master_negative_out = master_negative.output
    master_positive_out = master_positive.output

    def _score(layer_name, left, right):
        # One named Lambda layer computing the cosine-based score of a pair.
        return Lambda(cosine_distance, name=layer_name, output_shape=[1])([left, right])

    def _stack_pair(layer_name, pos_score, neg_score):
        # The loss only works with a single output, so each pair is stacked into one tensor.
        # Passing the named function keeps the layer free of an anonymous lambda.
        return Lambda(stack_tensors, name=layer_name, output_shape=(2, 1))([pos_score, neg_score])

    # Bug triplet
    positive_d = _score('pos_cosine_distance', anchor_out, positive_out)
    negative_d = _score('neg_cosine_distance', anchor_out, negative_out)
    # Anchor against the masters
    master_anchor_positive_d = _score('pos_master_cosine_distance', anchor_out, master_positive_out)
    master_anchor_negative_d = _score('neg_master_cosine_distance', anchor_out, master_negative_out)
    # Positive bug against the masters
    master_pos_positive_d = _score('pos_master_pos_cosine_distance', positive_out, master_positive_out)
    master_pos_negative_d = _score('neg_master_pos_cosine_distance', positive_out, master_negative_out)
    # Negative bug against the masters (its own master counts as the "positive" here)
    master_neg_positive_d = _score('pos_master_neg_cosine_distance', negative_out, master_negative_out)
    master_neg_negative_d = _score('neg_master_neg_cosine_distance', negative_out, master_positive_out)

    output_bug = _stack_pair('stack-distances-bug', positive_d, negative_d)
    output_master = _stack_pair('stack-distances-master-anchor',
                                master_anchor_positive_d, master_anchor_negative_d)
    output_master_pos = _stack_pair('stack-distances-master-pos',
                                    master_pos_positive_d, master_pos_negative_d)
    output_master_neg = _stack_pair('stack-distances-master-neg',
                                    master_neg_positive_d, master_neg_negative_d)

    output = Average()([output_bug, output_master, output_master_pos, output_master_neg])

    similarity_model = Model(inputs = inputs, outputs = [output], name = 'Similarity_Model')

    # setup the optimization process
    similarity_model.compile(optimizer='adam', loss=custom_margin_loss, 
                                 metrics=[pos_distance, neg_distance])

    return similarity_model

In [None]:
%%time
import os

'''
    Configuration
'''
# Training-loop settings and bookkeeping defaults
epochs = 1000
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

print("Batch size ", batch_size)

# Inspired by https://pastebin.com/TaGFdcBA
# Reset the Keras session so models built by a previous run of this cell are discarded.
keras.backend.clear_session()

# Feature models
'''
    cnn_dilated_model
    arcii_model
    cnn_model
    lstm_model
    bilstm_model
'''
# title_feature_model = bilstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
# One BERT feature extractor for titles, one for descriptions, one MLP for categorical fields.
title_feature_model = bert_model(MAX_SEQUENCE_LENGTH_T, 'Title')
desc_feature_model = bert_model(MAX_SEQUENCE_LENGTH_D, 'Description')
#desc_feature_model = cnn_model(desc_embedding_layer, MAX_SEQUENCE_LENGTH_D)
categorical_feature_model = mlp_model(number_of_columns_info)

# Similarity model: every branch below reuses the same three feature models,
# and differs only in the suffix given to its input and layer names.
encoded_anchor = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'in')
encoded_positive = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'pos')
encoded_negative = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'neg')
# Master model branches
master_anchor = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'master_in')
master_positive = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'master_pos')
master_negative = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'master_neg')

# These three values are passed to max_margin_objective, which does not read them.
NUMBER_OF_INSTANCES = len(baseline.dup_sets_train)
BATCH_SIZE = batch_size
EPOCHS = epochs

# Note the positional order: the negative master comes before the positive master.
similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, 
                                            master_anchor, master_negative, master_positive,
                                            NUMBER_OF_INSTANCES, BATCH_SIZE, EPOCHS, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()

'''
    Experiment
'''
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, train_master_input, train_master_neg, \
            train_sim = batch_iterator(baseline, encoded_anchor, baseline.train_data, baseline.dup_sets_train, bug_train_ids, 
                                       batch_size, 1, issues_by_buckets)
    
    train_batch = [train_input_sample['title']['token'], train_input_sample['title']['segment'], train_input_sample['description']['token'], train_input_sample['description']['segment'], train_input_sample['info'],
                   train_input_pos['title']['token'], train_input_pos['title']['segment'], train_input_pos['description']['token'], train_input_pos['description']['segment'], train_input_pos['info'], 
                   train_input_neg['title']['token'], train_input_neg['title']['segment'], train_input_neg['description']['token'], train_input_neg['description']['segment'], train_input_neg['info'],
                  train_master_input['title']['token'], train_master_input['title']['segment'], train_master_input['description']['token'], train_master_input['description']['segment'], train_master_input['info'],
                  train_master_input['title']['token'], train_master_input['title']['segment'], train_master_input['description']['token'], train_master_input['description']['segment'], train_master_input['info'],
                   train_master_neg['title']['token'], train_master_neg['title']['segment'], train_master_neg['description']['token'], train_master_neg['description']['segment'], train_master_neg['info']]
    
#     if epoch == 10:
#         similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=0.1)
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    
    if (epoch+1 == epochs): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, _, debug = experiment.evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets, 
                                                               bug_train_ids, method='bert')
        print("Epoch: {} Loss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}, recall@25: {:.2f}".format(epoch+1,
                                                                                                         h[0],
                                                                                                         h[1], h[2], recall))
    else:
        print("Epoch: {} Loss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}".format(epoch+1,
                                                                                                         h[0],
                                                                                                         h[1],
                                                                                                         h[2]))
    loss = h[0]
    
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

experiment.save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
experiment.save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}s, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))

Batch size  64
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 729)          0                                            
__________________________________________________________________________________________________
title_token_in (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
title_segment_in (InputLayer)   (None, 20)           0                                            
__________________________________________________________________________________________________
desc_token_in (InputLayer)      (None, 20)           0                                            
______________________________________________________________________________________________

Epoch: 1 Loss: 1.00, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 2 Loss: 1.00, pos_cosine: 0.89, neg_cosine: 0.90
Epoch: 3 Loss: 1.00, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 4 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 5 Loss: 1.00, pos_cosine: 0.90, neg_cosine: 0.91
Epoch: 7 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 8 Loss: 1.00, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 9 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 10 Loss: 0.99, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 11 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 12 Loss: 1.00, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 13 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 14 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 15 Loss: 1.00, pos_cosine: 0.91, neg_cosine: 0.91
Epoch: 16 Loss: 1.01, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 17 Loss: 0.99, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 18 Loss: 1.00, pos_cosine: 0.90, neg_cosine: 0.90
Epoch: 19 Loss: 1.00, pos_cosine: 0.90,

Epoch: 146 Loss: 0.96, pos_cosine: 0.65, neg_cosine: 0.61
Epoch: 147 Loss: 0.96, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 148 Loss: 0.96, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 149 Loss: 0.95, pos_cosine: 0.67, neg_cosine: 0.62
Epoch: 150 Loss: 0.98, pos_cosine: 0.65, neg_cosine: 0.63
Epoch: 151 Loss: 0.96, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 152 Loss: 0.97, pos_cosine: 0.65, neg_cosine: 0.62
Epoch: 153 Loss: 0.95, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 154 Loss: 0.97, pos_cosine: 0.65, neg_cosine: 0.62
Epoch: 155 Loss: 0.97, pos_cosine: 0.65, neg_cosine: 0.62
Epoch: 156 Loss: 0.96, pos_cosine: 0.65, neg_cosine: 0.61
Epoch: 157 Loss: 0.95, pos_cosine: 0.66, neg_cosine: 0.61
Epoch: 158 Loss: 0.95, pos_cosine: 0.66, neg_cosine: 0.61
Epoch: 159 Loss: 0.97, pos_cosine: 0.65, neg_cosine: 0.62
Epoch: 160 Loss: 0.97, pos_cosine: 0.65, neg_cosine: 0.62
Epoch: 161 Loss: 0.96, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 162 Loss: 0.94, pos_cosine: 0.66, neg_cosine: 0.60
Epoch: 163 Los

Epoch: 288 Loss: 0.96, pos_cosine: 0.67, neg_cosine: 0.63
Epoch: 289 Loss: 0.96, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 290 Loss: 0.96, pos_cosine: 0.66, neg_cosine: 0.62
Epoch: 291 Loss: 0.94, pos_cosine: 0.68, neg_cosine: 0.62
Epoch: 292 Loss: 0.97, pos_cosine: 0.67, neg_cosine: 0.63
Epoch: 293 Loss: 0.95, pos_cosine: 0.69, neg_cosine: 0.64
Epoch: 294 Loss: 0.94, pos_cosine: 0.68, neg_cosine: 0.62
Epoch: 295 Loss: 0.94, pos_cosine: 0.67, neg_cosine: 0.61
Epoch: 296 Loss: 0.97, pos_cosine: 0.66, neg_cosine: 0.63
Epoch: 297 Loss: 0.95, pos_cosine: 0.69, neg_cosine: 0.64
Epoch: 298 Loss: 0.95, pos_cosine: 0.68, neg_cosine: 0.63
Epoch: 299 Loss: 0.93, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 300 Loss: 0.94, pos_cosine: 0.69, neg_cosine: 0.63
Epoch: 301 Loss: 0.97, pos_cosine: 0.68, neg_cosine: 0.64
Epoch: 302 Loss: 0.95, pos_cosine: 0.68, neg_cosine: 0.63
Epoch: 303 Loss: 0.94, pos_cosine: 0.69, neg_cosine: 0.62
Epoch: 304 Loss: 0.96, pos_cosine: 0.67, neg_cosine: 0.62
Epoch: 305 Los

Epoch: 430 Loss: 0.95, pos_cosine: 0.73, neg_cosine: 0.68
Epoch: 431 Loss: 0.95, pos_cosine: 0.69, neg_cosine: 0.64
Epoch: 432 Loss: 0.93, pos_cosine: 0.70, neg_cosine: 0.63
Epoch: 433 Loss: 0.91, pos_cosine: 0.72, neg_cosine: 0.64
Epoch: 434 Loss: 0.94, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 435 Loss: 0.94, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 436 Loss: 0.92, pos_cosine: 0.71, neg_cosine: 0.64
Epoch: 437 Loss: 0.93, pos_cosine: 0.72, neg_cosine: 0.64
Epoch: 438 Loss: 0.93, pos_cosine: 0.69, neg_cosine: 0.63
Epoch: 439 Loss: 0.93, pos_cosine: 0.70, neg_cosine: 0.63
Epoch: 440 Loss: 0.94, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 441 Loss: 0.94, pos_cosine: 0.72, neg_cosine: 0.66
Epoch: 442 Loss: 0.93, pos_cosine: 0.71, neg_cosine: 0.65
Epoch: 443 Loss: 0.93, pos_cosine: 0.72, neg_cosine: 0.64
Epoch: 444 Loss: 0.93, pos_cosine: 0.71, neg_cosine: 0.64
Epoch: 445 Loss: 0.94, pos_cosine: 0.69, neg_cosine: 0.63
Epoch: 446 Loss: 0.93, pos_cosine: 0.71, neg_cosine: 0.65
Epoch: 447 Los

Epoch: 572 Loss: 0.93, pos_cosine: 0.74, neg_cosine: 0.67
Epoch: 573 Loss: 0.93, pos_cosine: 0.71, neg_cosine: 0.64
Epoch: 574 Loss: 0.94, pos_cosine: 0.74, neg_cosine: 0.68
Epoch: 575 Loss: 0.93, pos_cosine: 0.75, neg_cosine: 0.68
Epoch: 576 Loss: 0.93, pos_cosine: 0.73, neg_cosine: 0.66
Epoch: 577 Loss: 0.93, pos_cosine: 0.73, neg_cosine: 0.66
Epoch: 578 Loss: 0.94, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 579 Loss: 0.94, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 580 Loss: 0.93, pos_cosine: 0.73, neg_cosine: 0.66
Epoch: 581 Loss: 0.92, pos_cosine: 0.73, neg_cosine: 0.66
Epoch: 582 Loss: 0.92, pos_cosine: 0.74, neg_cosine: 0.66
Epoch: 583 Loss: 0.93, pos_cosine: 0.72, neg_cosine: 0.64
Epoch: 584 Loss: 0.94, pos_cosine: 0.71, neg_cosine: 0.65
Epoch: 585 Loss: 0.94, pos_cosine: 0.74, neg_cosine: 0.68
Epoch: 586 Loss: 0.92, pos_cosine: 0.75, neg_cosine: 0.66
Epoch: 587 Loss: 0.91, pos_cosine: 0.73, neg_cosine: 0.63
Epoch: 588 Loss: 0.93, pos_cosine: 0.70, neg_cosine: 0.64
Epoch: 589 Los

In [47]:
# Display the recall@25 value stored by the training cell's final-iteration evaluation.
recall

0.81

In [48]:
# Show the first 20 entries of `_`.
# NOTE(review): `_` here appears to be the value bound by `recall, _, debug = ...`
# in the training cell rather than IPython's last-output variable; a dedicated
# name would survive Restart & Run All more reliably - confirm before relying on it.
_[:20]

['108544:111059,109674,108379,109366|109366:0.15593445301055908,111059:0.12969756126403809,94421:0.10437977313995361,121250:0.08986496925354004,115569:0.08433961868286133,108379:0.07375228404998779,121232:0.07194459438323975,99306:0.06010138988494873,112520:0.05320429801940918,98821:0.046608924865722656,117276:0.039613962173461914,102082:0.03391468524932861,115072:0.03152573108673096,107794:0.023058533668518066,65064:0.017291665077209473,115888:0.014262080192565918,114770:0.014158010482788086,94907:0.01373898983001709,110764:0.011090755462646484,102398:0.010446667671203613,110555:0.007324576377868652,102409:0.006232261657714844,115068:0.0018624067306518555,105993:0.0018515586853027344,115895:0.0014536380767822266,96124:0.001212477684020996,105907:0.0009016990661621094,99982:0.00038933753967285156,97860:0.0',
 '109674:108544,111059,108379,109366|94815:0.45261454582214355,112693:0.27634429931640625,82495:0.17601275444030762,97860:0.15731406211853027,102497:0.1454828977584839,111059:0.118

In [None]:
# recall, exported_rank, debug = experiment.evaluate_validation_test(experiment, retrieval, verbose, 
#                                                         encoded_anchor, issues_by_buckets, evaluate_validation_test)
# test_vectorized, queries_test_vectorized, annoy, X_test, distance_test, indices_test = debug
# "recall@25 last epoch:", recall

### Retrieval evaluation

In [None]:
# Print the length of retrieval.test, labelled as the number of queries.
print("Total of queries:", len(retrieval.test))

#### Getting the trained model

In [None]:
# Display the feature-model save name with the '@number_of_epochs@' placeholder filled in.
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

In [None]:
# Evaluate with the anchor encoder that is already in memory.
# The commented alternative presumably reloads the saved feature model from its path - confirm.
model = encoded_anchor
# model = experiment.get_model_vectorizer(path=SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))

In [None]:
# Print the layer summary of the model selected for evaluation.
model.summary()

In [None]:
# Run the same evaluation call as the training cell, with 0 in the verbose position,
# this time keeping the second return value as `exported_rank` for the export below.
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 0, model, issues_by_buckets, 
                                                                   bug_train_ids, method='bert')

In [None]:
# The rank file lives under DIR (the processed-data directory for DOMAIN) and is named after METHOD.
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

In [None]:
# Write the exported ranking to disk, one row per line.
with open(EXPORT_RANK_PATH, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [49]:
# Evaluate the rank file at EXPORT_RANK_PATH and display the resulting report.
# NOTE(review): this cell needs EXPORT_RANK_PATH and the file written by the cells above; run them first.
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.67,
 '2 - recall_at_10': 0.74,
 '3 - recall_at_15': 0.77,
 '4 - recall_at_20': 0.79,
 '5 - recall_at_25': 0.81}