# Bug triage with Deep Learning - PROPOSE

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline

## Auxiliary methods

## Configurações Globais

In [6]:
# Global sizing constants. The trailing "# 40" / "# 200" notes are kept from the
# original; presumably they record earlier values (TODO confirm).
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
# Dimensionality of each word-embedding vector.
EMBEDDING_DIM = 300
# NOTE(review): only referenced in a trailing comment in the visible cells.
MAX_NB_WORDS = 20000

### Parse preprocessed bugs

In [7]:
# Dataset (bug tracker) to run the experiment on
DOMAIN = 'eclipse'

# Locations of the processed bugs and of the normalized pairs/CSV
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Directory holding the pre-trained embedding files
EMBED_DIR = 'data/embed'

# Name templates used when saving the trained models
SAVE_PATH_FEATURE = 'propose_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH = 'propose_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

# Whether the corpus should be extracted
EXTRACT_CORPUS = False

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

#### Loading bug ids in memory

In [9]:
baseline.load_ids(DIR)
len(baseline.bug_ids)

Reading bug ids


212512

### Dicionário de títulos e descrições

In [10]:
def data_padding(data, max_seq_length):
    """Right-pad (and truncate) token-id sequences into a dense integer matrix.

    The output width is ``min(max(longest sequence, 6), max_seq_length)``:
    a floor of 6 columns is applied before the ``max_seq_length`` cap.

    Args:
        data: iterable of sequences whose items are convertible with ``int()``.
        max_seq_length: upper bound on the number of columns.

    Returns:
        ``np.ndarray`` of shape ``(len(data), width)`` with an integer dtype;
        positions past the end of a sequence are 0.
    """
    seq_lengths = [len(seq) for seq in data]
    seq_lengths.append(6)  # guarantees a minimum width of 6 before capping
    width = min(max(seq_lengths), max_seq_length)
    # Allocate an integer buffer directly: ``np.int`` no longer exists in
    # NumPy >= 1.24, and a float buffer followed by a cast loses precision
    # for very large ids.
    padded_data = np.zeros(shape=[len(data), width], dtype=int)
    for i, seq in enumerate(data):
        for j, token in enumerate(seq[:width]):
            padded_data[i, j] = int(token)
    return padded_data

In [11]:
import _pickle as pickle

def load_bugs(baseline):
    """Load every pickled bug into ``baseline`` and pad its token sequences.

    For each id in ``baseline.bug_ids`` the file ``<baseline.DIR>/bugs/<id>.pkl``
    is unpickled. Successfully loaded bugs are stored in ``baseline.bug_set``
    with ``title_word`` / ``description_word`` replaced by their padded rows,
    and ``baseline.sentence_dict`` maps the comma-joined padded ids to the raw
    title / description text. Ids that fail to load are removed from
    ``baseline.bug_ids`` and recorded in ``baseline.removed``.

    Args:
        baseline: object exposing ``bug_ids`` and ``DIR``; ``corpus``,
            ``sentence_dict`` and ``bug_set`` are (re)initialised here.
    """
    removed = []
    loaded_ids = []  # ids whose rows exist in the padding lists, in order
    baseline.corpus = []
    baseline.sentence_dict = {}
    baseline.bug_set = {}
    title_padding, desc_padding = [], []
    for bug_id in tqdm(baseline.bug_ids):
        bug_path = os.path.join(baseline.DIR, 'bugs', '{}.pkl'.format(bug_id))
        try:
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            with open(bug_path, 'rb') as bug_file:
                bug = pickle.load(bug_file)
            title_words = bug['title_word']
            desc_words = bug['description_word']
        except Exception:
            # Best effort: unreadable or incomplete bugs are skipped and reported.
            removed.append(bug_id)
            continue
        title_padding.append(title_words)
        desc_padding.append(desc_words)
        baseline.bug_set[bug_id] = bug
        loaded_ids.append(bug_id)

    # Padding
    title_padding = data_padding(title_padding, 100)
    desc_padding = data_padding(desc_padding, 500)

    # Pair rows with the ids that were actually loaded; pairing with
    # baseline.bug_ids would shift every row after the first failed load.
    rows = zip(loaded_ids, title_padding, desc_padding)
    for bug_id, bug_title, bug_desc in tqdm(rows, total=len(loaded_ids)):
        bug = baseline.bug_set[bug_id]
        bug['title_word'] = bug_title
        bug['description_word'] = bug_desc
        baseline.sentence_dict[",".join(bug_title.astype(str))] = bug['title']
        baseline.sentence_dict[",".join(bug_desc.astype(str))] = bug['description']

    if len(removed) > 0:
        for x in removed:
            baseline.bug_ids.remove(x)
        baseline.removed = removed
        print("{} were removed. To see the list call self.removed".format(len(removed)))

In [12]:
%%time

load_bugs(baseline)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 1min 13s, sys: 2.51 s, total: 1min 16s
Wall time: 1min 15s


## Geração de batches

# Generating triplets of batches

In [13]:
%%time
baseline.prepare_dataset()

Reading train data
CPU times: user 363 ms, sys: 21 µs, total: 363 ms
Wall time: 360 ms


In [14]:
if 2521 in baseline.bug_set:
    print(baseline.bug_set[2521])

{'product': '125\n', 'creation_ts': '2001-10-10 22:38:00 -0400', 'bug_severity': '4\n', 'component': '566\n', 'resolution': 'FIXED', 'bug_status': '0\n', 'description_word': array([ 241, 3070,   86,  548,  297,  394,    9,  196,   95,   16,  131,
          8,   17,  213, 1931,  196,   95,    2,   27,   22,  130,  783,
        276,   16,   20, 3484,   23,  610,   11,   28,   44,  563,    2,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0

### Generating the batch test

In [15]:
%%time

# Batch sizes for the training generator and for the fixed validation sample.
batch_size = 64
batch_size_test = 128

# we want a constant validation group to have a frame of reference for model performance
# NOTE(review): the validation batch is drawn from baseline.train_data, i.e. the
# same split the training generator reads from.
train_gen = baseline.siam_gen(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = baseline.batch_iterator(baseline.train_data, 
                                                                                          baseline.dup_sets_train, 
                                                                                          batch_size_test, 1)
# Inputs are grouped by feature (title, description, info), each in
# anchor / positive / negative order, followed by the similarity labels.
test_gen = ([valid_input_sample['title'], valid_input_pos['title'], valid_input_neg['title'], 
             valid_input_sample['description'], valid_input_pos['description'], valid_input_neg['description'],
            valid_input_sample['info'], valid_input_pos['info'], valid_input_neg['info']], valid_sim)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]
# Max sequence title
# These overwrite the global maxima with the widths of the sampled batch.
MAX_SEQUENCE_LENGTH_T = valid_input_sample['title'].shape[1]
MAX_SEQUENCE_LENGTH_D = valid_input_sample['description'].shape[1]

CPU times: user 51.7 ms, sys: 0 ns, total: 51.7 ms
Wall time: 51.2 ms


In [16]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((128, 43), (128, 500), (128, 1682), (128,))

### Validar entrada

In [17]:
%%time 

baseline.display_batch(baseline.train_data, baseline.dup_sets_train, 5)

***Title***: many organization doesn have content
***Title***: organization editor window scrolling area doesn match real size after eclipse restarts
***Description***: created attachment empty debug dialog starting with organization many of the dialogs the run or debug dialog import or export dialog doesn have content after restart everything works fine see the attached file for an example debug dialog already delete the workspace but it didn help
***Description***: created attachment editing window before exiting organization steps to reproduce open large enough source file for scrolling close eclipse start eclipse try to scroll editor window contents note there is only small rectangle scrolling not entire editing area more information organization ultimate to make it work correctly have to double click source tab twice maximize restore
***similar = 1
########################
***Title***: organization organization filter as part of the import wizard which will be sent to the organiza

## Pre-trained embeddings

Loading pretrained word vectors

### Fasttext

In [18]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'vocab_embed_fasttext.pkl'))
#print(np.random.choice(vocab, 10))
# for token in vocab:
#     print(token)

vocabulary loaded


In [19]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 137745'

In [20]:
# def generating_embed(baseline, EMBED_DIR, EMBEDDING_DIM):
#     embeddings_index = {}
#     embed_path = os.path.join(EMBED_DIR, 'glove.42B.300d.txt')
#     f = open(embed_path, 'rb')
#     #num_lines = sum(1 for line in open(embed_path, 'rb'))

#     vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed.pkl'))
#     vocab_size = len(vocab) 

#     # Initialize uniform the vector considering the Tanh activation
#     embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, EMBEDDING_DIM))
#     embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

#     loop = tqdm(f)
#     loop.set_description("Loading Glove")
#     for line in loop:
#         tokens = line.split()
#         word = tokens[0]
#         embeddings_index[word] = np.asarray(tokens[1:], dtype='float32')
#         loop.update(1)
#     f.close()
#     loop.close()

#     print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))

#     loop = tqdm(total=vocab_size)
#     loop.set_description('Loading embedding from dataset pretrained')
#     i = 0
#     for word, embed in vocab.items():
#         if word in embeddings_index:
#             embedding_matrix[i] = embeddings_index[word]
#         else:
#             embedding_matrix[i] = np.asarray(embed, dtype='float32')
#         loop.update(1)
#         i+=1
#     loop.close()
#     baseline.embedding_matrix = embedding_matrix

In [21]:
import io

def generating_embed(baseline, EMBED_DIR, EMBEDDING_DIM):
    """Build ``baseline.embedding_matrix`` from FastText vectors plus the dataset vocabulary.

    Reads ``<EMBED_DIR>/crawl-300d-2M.vec`` (a ``<count> <dim>`` header line,
    then one ``word v1 v2 ...`` line per vector). For each vocabulary word, the
    matrix row is the FastText vector when the word is present in the file and
    the vector stored in the vocabulary otherwise.

    Args:
        baseline: object exposing ``load_vocabulary`` and ``DIR``; receives
            the resulting ``embedding_matrix`` attribute.
        EMBED_DIR: directory containing ``crawl-300d-2M.vec``.
        EMBEDDING_DIM: number of columns of the embedding matrix.
    """
    embed_path = os.path.join(EMBED_DIR, 'crawl-300d-2M.vec')

    vocab = baseline.load_vocabulary(os.path.join(baseline.DIR, 'vocab_embed_fasttext.pkl'))
    vocab_size = len(vocab)

    # Initialize uniform the vector considering the Tanh activation
    embedding_matrix = np.random.uniform(-1.0, 1.0, (vocab_size, EMBEDDING_DIM))
    if vocab_size > 0:
        # NOTE(review): row 0 is zeroed here but the vocabulary loop below also
        # starts at row 0 and overwrites it; confirm which is intended.
        embedding_matrix[0, :] = np.zeros(EMBEDDING_DIM)

    embeddings_index = {}
    # Single handle, closed even if parsing fails (the original opened the file
    # twice and never closed the first handle).
    with io.open(embed_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        n_vectors, _dim = map(int, f.readline().split())
        loop = tqdm(f, total=n_vectors)
        loop.set_description("Loading FastText")
        # Iterating the tqdm wrapper already advances the bar; no manual update.
        for line in loop:
            tokens = line.rstrip().split(' ')
            embed = list(map(float, tokens[1:]))
            embeddings_index[tokens[0]] = np.asarray(embed, dtype='float32')
        loop.close()

    print('Total %s word vectors in FastText 42B 300d.' % len(embeddings_index))

    loop = tqdm(total=vocab_size)
    loop.set_description('Loading embedding from dataset pretrained')
    for i, (word, embed) in enumerate(vocab.items()):
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]
        else:
            embedding_matrix[i] = np.asarray(embed, dtype='float32')
        loop.update(1)
    loop.close()
    baseline.embedding_matrix = embedding_matrix

In [22]:
%%time

generating_embed(baseline, EMBED_DIR=EMBED_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

vocabulary loaded


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1999995 word vectors in FastText 42B 300d.


HBox(children=(IntProgress(value=0, max=137745), HTML(value='')))


CPU times: user 2min, sys: 3.07 s, total: 2min 3s
Wall time: 2min


## Experiment

### Training and evaluating for each epoch at same time

#### Auxiliary methods train experiment siamese

In [23]:
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

In [24]:
def create_queries(retrieval, path_test):
    """Parse the test file into ``retrieval.test``.

    Each line holds whitespace-separated integer bug ids. The first id is the
    query and the remaining ids are stored as its list of related bugs, giving
    entries of the form ``[query_id, [id, ...]]``.
    """
    print("Creating the queries...")
    queries = []
    with open(path_test, 'r') as file_test:
        for line in tqdm(file_test):
            bug_ids = [int(token) for token in line.strip().split()]
            queries.append([bug_ids[0], bug_ids[1:]])
    retrieval.test = queries

In [25]:
# Retrieval helper that holds the buckets and the test queries.
retrieval = Retrieval()

# Input locations for the selected DOMAIN.
path = 'data/processed/{}'.format(DOMAIN)
path_buckets = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)
path_train = 'data/processed/{}/train.txt'.format(DOMAIN)
path_test = 'data/processed/{}/test.txt'.format(DOMAIN)

MAX_SEQUENCE_LENGTH_I = number_of_columns_info # Status, Severity, Version, Component, Module

# Create the instance from baseline
# The already-loaded baseline is reused instead of building a new one.
#retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
retrieval.baseline = baseline

df = pd.read_csv(path_buckets)

# Load bug ids
#retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))




In [26]:
# Read and create the test queries duplicate
create_queries(retrieval, path_test)

Creating the queries...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Hashing bugs by buckets

In [27]:
# Reverse index: maps every bug id (bucket keys included) to the key of the
# bucket that contains it.
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    # A bucket key maps to itself.
    issues_by_buckets[bucket] = bucket
    for issue in np.array(retrieval.buckets[bucket]).tolist():
        issues_by_buckets[issue] = bucket

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




#### Model to vectorize

In [28]:
from keras.models import Model
from keras.layers import Input
from keras.models import load_model

def get_model_vectorizer(path=None, loaded_model=None):
    """Return the model used to vectorize bugs.

    When ``path`` is truthy the model is loaded from
    ``modelos/model_<path>.h5`` and ``loaded_model`` is ignored; otherwise
    ``loaded_model`` is returned unchanged.
    """
    if not path:
        return loaded_model

    # Custom objects that were once listed for load_model (kept for reference):
    #   {'l2_normalize' : l2_normalize,
    #    'margin_loss' : margin_loss,
    #    'pos_distance' : pos_distance,
    #    'neg_distance' : neg_distance,
    #    'stack_tensors': stack_tensors}
    model_file = os.path.join("modelos", "model_{}.h5".format(path))
    return load_model(model_file)

#### Getting the list of candidates

In [29]:
def indexing_query(annoy, queries_test_vectorized, verbose=1, n_candidates=30):
    """Retrieve the nearest indexed items for every vectorized query.

    Args:
        annoy: index exposing ``get_nns_by_vector(vector, n, include_distances=True)``.
        queries_test_vectorized: sequence of dicts with a ``'vector'`` entry.
        verbose: when truthy, show a tqdm progress bar.
        n_candidates: number of neighbours requested per query (default 30,
            the value that used to be hard-coded).

    Returns:
        ``(queries_test_vectorized, distance_test, indices_test)`` where
        ``indices_test[i]`` lists the neighbour indices of query ``i`` and
        ``distance_test[i]`` is the array ``1 - distance`` for those neighbours.
    """
    X_test = queries_test_vectorized
    distance_test, indices_test = [], []
    loop = X_test
    if verbose:
        loop = tqdm(X_test)
        loop.set_description('Getting the list of candidates from queries')
    for row in loop:
        rank, dist = annoy.get_nns_by_vector(row['vector'], n_candidates, include_distances=True)
        indices_test.append(rank)
        # Turn distances into a "larger is more similar" score. This lies in
        # [0, 1] only when the index metric yields distances in [0, 1];
        # Annoy's angular distance spans [0, 2], giving scores in [-1, 1].
        distance_test.append(1 - np.array(dist))
    if verbose:
        loop.close()
    return X_test, distance_test, indices_test

#### Indexing bugs

In [30]:
# Indexing all train
def indexing_test(buckets_train_vectorized, verbose=1, n_trees=10, metric='angular'):
    """Build an Annoy index over the vectorized buckets.

    Args:
        buckets_train_vectorized: non-empty sequence of dicts with a
            ``'vector'`` entry; item ``i`` is indexed under id ``i``.
        verbose: when truthy, show a tqdm progress bar.
        n_trees: number of trees passed to ``annoy.build`` (default 10).
        metric: Annoy metric name. ``'angular'`` is Annoy's historical default
            and is passed explicitly because omitting it is deprecated.

    Returns:
        The built ``AnnoyIndex``.

    Raises:
        ValueError: if there is nothing to index.
    """
    if len(buckets_train_vectorized) == 0:
        raise ValueError("indexing_test needs at least one vectorized bucket")

    # Length of item vector that will be indexed
    vector_length = buckets_train_vectorized[0]['vector'].shape[0]
    annoy = AnnoyIndex(vector_length, metric=metric)

    loop = None
    if verbose:
        loop = tqdm(total=len(buckets_train_vectorized))
        loop.set_description("Indexing test in annoy")
    for index, row in enumerate(buckets_train_vectorized):
        annoy.add_item(index, row['vector'])
        if loop is not None:
            loop.update(1)
    if loop is not None:
        loop.close()
    annoy.build(n_trees)
    return annoy

#### Rank result

In [31]:
def rank_result(test_vectorized, indices_test, distance_test, verbose=1):
    """Format each query's candidate list as ``"bug_id:sim,bug_id:sim,..."``.

    Only the first 25 candidates of every query are kept. Candidate indices
    are translated to bug ids through ``test_vectorized[index]['bug_id']``.
    """
    pairs = zip(indices_test, distance_test)
    if verbose:
        pairs = tqdm(pairs)
        pairs.set_description('Generating the rank')

    formated_rank = []
    for neighbours, similarities in pairs:
        entries = []
        for index, sim in zip(neighbours[:25], similarities[:25]):
            entries.append("{}:{}".format(test_vectorized[index]['bug_id'], sim))
        formated_rank.append(",".join(entries))

    if verbose:
        pairs.close()
    return formated_rank

#### Vectorizer 

In [32]:
def vectorizer_test(bug_set, model, test, issues_by_buckets, verbose=1):
    """Embed one representative bug per bucket touched by the test set.

    Every query and every bug listed with it is mapped to its bucket id via
    ``issues_by_buckets``. The bug stored under each distinct bucket id is
    then fed to ``model.predict`` as (title, description, info).

    Note: the info features come from the module-level ``retrieval`` object.

    Returns:
        list of ``{'bug_id': bucket_id, 'vector': embedding}`` dicts.
    """
    loop = test
    if verbose:
        loop = tqdm(test)
        loop.set_description('Vectorizing buckets')

    # Collect the distinct bucket ids referenced by the test rows.
    buckets = set()
    for query, ground_truth in loop:  # retrieval.bugs_train
        buckets.add(issues_by_buckets[query])
        for bug_id in ground_truth:
            buckets.add(issues_by_buckets[bug_id])

    test_vectorized = []
    title_data, desc_data, info_data = [], [], []
    for bucket_id in buckets:
        master = bug_set[bucket_id]
        title_data.append(master['title_word'])
        desc_data.append(master['description_word'])
        info_data.append(retrieval.get_info(master))
        test_vectorized.append({'bug_id': bucket_id})

    if verbose:
        loop.close()

    # Get embedding of all buckets
    model_inputs = [np.array(title_data), np.array(desc_data), np.array(info_data)]
    embed_test = model.predict(model_inputs)

    # Attach each embedding to its bucket entry
    for position, vector in enumerate(embed_test):
        test_vectorized[position]['vector'] = vector

    return test_vectorized

In [33]:
def vectorize_queries(bug_set, model, test, issues_by_buckets, verbose=1):
    """Embed the query bugs derived from the test rows.

    For each ``(test_bug_id, ground_truth)`` row the query set contains the
    test bug (or, when it is its own bucket key, a randomly chosen entry of
    ``ground_truth``) plus every remaining ``ground_truth`` bug that is not
    its own bucket key. Each query is embedded with ``model.predict``.

    Note: the info features come from the module-level ``retrieval`` object,
    and the random replacement uses NumPy's global RNG.

    Returns:
        list of dicts with ``'bug_id'``, ``'ground_truth'`` (the bucket id of
        the query, taken from ``issues_by_buckets``) and ``'vector'``.
    """
    queries_test_vectorized = []
    title_data, desc_data, info_data = [], [], []
    loop = test
    if(verbose):
        # NOTE(review): unlike the sibling helpers, this bar is never closed.
        loop = tqdm(test)
    for row in loop:
        test_bug_id, ground_truth = row
        if issues_by_buckets[test_bug_id] == test_bug_id: # if the bug is the master
            # Replace a master query by a random bug from its ground truth.
            test_bug_id = np.random.choice(ground_truth, 1)[0]
        queries = set()
        queries.add(test_bug_id)
        if test_bug_id in ground_truth:
            ground_truth = list(set(ground_truth) - set([test_bug_id])) # Remove the same bug random choice to change the master
        if len(ground_truth) > 0:
            for bug in ground_truth:
                if issues_by_buckets[bug] != bug: # keep only bugs that are NOT the master of their bucket
                    queries.add(bug)
                
        for bug_id in queries:
            bug = bug_set[bug_id]
            title_data.append(bug['title_word'])
            desc_data.append(bug['description_word'])
            info_data.append(retrieval.get_info(bug))
            queries_test_vectorized.append({ 'bug_id' : bug_id, 'ground_truth': issues_by_buckets[bug_id] })

    # Get embedding of all queries
    embed_queries = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
    # Fill the queries array    
    for index, vector in enumerate(embed_queries):
        queries_test_vectorized[index]['vector'] = vector
    
    return queries_test_vectorized

#### Queries

In [34]:
# Generating the rank result
def formating_rank(X_test, verbose=1):
    """Render every query as ``"<bug_id>:<ground_truth>"``.

    ``X_test`` is an iterable of dicts carrying ``'bug_id'`` and
    ``'ground_truth'``; the returned list keeps the input order.
    """
    def as_query(row):
        return "{}:{}".format(row['bug_id'], row['ground_truth'])

    if not verbose:
        return [as_query(row) for row in X_test]

    progress = tqdm(enumerate(X_test))
    progress.set_description('Generating the queries from rank')
    rank_queries = [as_query(row) for _, row in progress]
    progress.close()
    return rank_queries

In [35]:
def export_rank(rank_queries, formated_rank, verbose=1):
    """Join each query with its ranked candidates as ``"<query>|<rank>"``.

    The two sequences are paired positionally; pairing stops at the shorter one.
    """
    n_queries = len(rank_queries)
    progress = None
    if verbose:
        progress = tqdm(total=n_queries)
        progress.set_description('Exporting the rank')

    exported_rank = []
    for query, rank in zip(rank_queries, formated_rank):
        exported_rank.append("{}|{}".format(query, rank))
        if verbose:
            progress.update(1)

    if verbose:
        progress.close()
    return exported_rank

#### Methods to evaluate each epoch

In [36]:
def evaluate_validation_test(retrieval, verbose, loaded_model, issues_by_buckets):
    """Run the full retrieval pipeline on the test queries and score it.

    Steps: vectorize buckets and queries with ``loaded_model``, index the
    buckets, fetch candidates for each query, format and export the ranking,
    then evaluate it.

    Returns:
        ``(recall_at_25, exported_rank)``.
    """
    # Test data and bug dictionary
    test = retrieval.test
    bug_set = retrieval.baseline.get_bug_set()

    # Model used to embed the bugs
    model = get_model_vectorizer(loaded_model=loaded_model)

    # Embed buckets and queries
    buckets_vectorized = vectorizer_test(bug_set, model, test, issues_by_buckets, verbose)
    queries_vectorized = vectorize_queries(bug_set, model, test, issues_by_buckets, verbose)

    # Index buckets, then look up candidates for every query
    index = indexing_test(buckets_vectorized, verbose)
    X_test, distance_test, indices_test = indexing_query(index, queries_vectorized, verbose)

    # Build the textual ranking consumed by Evaluation
    formated_rank = rank_result(buckets_vectorized, indices_test, distance_test, verbose)
    rank_queries = formating_rank(X_test, verbose)
    exported_rank = export_rank(rank_queries, formated_rank, verbose)

    report = Evaluation(verbose).evaluate(exported_rank)
    recall = report['5 - recall_at_25']

    return recall, exported_rank

#### Evaluation method

In [37]:
class Evaluation():
    """Compute recall@k over ranked retrieval results.

    Each row has the form ``"query_id:master_id|cand_id:sim,cand_id:sim,..."``.
    A row counts as a hit at ``k`` when ``master_id`` is among its first ``k``
    candidate ids (at most ``MAX_RANK`` candidates are considered).
    """

    # Cut-offs reported by evaluate(), in report order.
    RECALL_AT = (5, 10, 15, 20, 25)

    def __init__(self, verbose=1):
        self.verbose = verbose
        self.MAX_RANK = 25

    def top_k_recall(self, row, k):
        """Return ``(corrects, total)`` for one row at cut-off ``k``.

        ``corrects`` is 1.0 when the master id is within the top ``k``
        candidates and 0.0 otherwise; ``total`` is always 1 because every
        query has exactly one master.
        """
        query, rank = row.split('|')
        query_dup_id, ground_truth = query.split(":")
        # Empty items (e.g. a query with no candidates) are ignored.
        candidates = [int(item.split(':')[0])
                      for item in rank.split(",")[:self.MAX_RANK] if item.strip()]
        corrects = 1 if int(ground_truth) in candidates[:k] else 0
        total = 1  # only one master from query
        return float(corrects), total

    def evaluate(self, path):
        """Evaluate rows read from a file path (``str``) or from an iterable.

        Returns:
            dict with keys ``'1 - recall_at_5'`` ... ``'5 - recall_at_25'``
            mapped to recall rounded to two decimals. With no rows to score,
            every recall is 0.0 instead of raising ``ZeroDivisionError``.
        """
        for k in self.RECALL_AT:
            setattr(self, 'recall_at_{}_corrects_sum'.format(k), 0)
            setattr(self, 'recall_at_{}_total_sum'.format(k), 0)
        if self.verbose:
            print("Evaluating...")
        if isinstance(path, str):
            with open(path, 'r') as file_input:
                for row in file_input:
                    self.recall(row)
        else:
            for row in path:
                self.recall(row)

        report = {}
        for position, k in enumerate(self.RECALL_AT, start=1):
            corrects = getattr(self, 'recall_at_{}_corrects_sum'.format(k))
            total = getattr(self, 'recall_at_{}_total_sum'.format(k))
            rate = round(corrects / total, 2) if total else 0.0
            report['{} - recall_at_{}'.format(position, k)] = rate

        return report

    def recall(self, row):
        """Accumulate the per-cut-off hit counts of one row (blank rows are skipped)."""
        if not row.strip():
            return
        for k in self.RECALL_AT:
            corrects, total = self.top_k_recall(row, k=k)
            setattr(self, 'recall_at_{}_corrects'.format(k), corrects)
            setattr(self, 'recall_at_{}_total'.format(k), total)

            corrects_sum = 'recall_at_{}_corrects_sum'.format(k)
            total_sum = 'recall_at_{}_total_sum'.format(k)
            setattr(self, corrects_sum, getattr(self, corrects_sum) + corrects)
            setattr(self, total_sum, getattr(self, total_sum) + total)

#### Save the model

In [38]:
def save_model(model, name, verbose=0):
    """Save ``model`` to ``modelos/model_<name>.h5``, creating ``modelos`` if missing.

    When ``verbose`` is truthy the destination path is printed afterwards.
    """
    target_dir = os.path.join('modelos')
    export = os.path.join(target_dir, "model_{}.h5".format(name))
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    model.save(export)
    if verbose:
        print("Saved model '{}' to disk".format(export))

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [39]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    """Create the shared Keras ``Embedding`` layer initialised with ``embeddings``.

    The layer is named ``'embedding_layer'``, accepts variable-length input
    (``input_length=None``, so ``max_sequence_length`` is not used) and applies
    a ``MaxNorm(max_value=1, axis=0)`` constraint to the embedding weights.
    """
    layer_options = dict(
        name='embedding_layer',
        weights=[embeddings],
        embeddings_constraint=MaxNorm(max_value=1, axis=0),
        input_length=None,  # was: max_sequence_length
        trainable=trainable,
    )
    return Embedding(num_words, embedding_dim, **layer_options)

### ARCII

In [40]:
from keras.layers.core import Dense, Reshape, Flatten, Dropout
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate

def arcii_model(embedding_layer, max_sequence_length):
    """Build the ARC-II style feature extractor.

    Pipeline: embedding -> Conv1D (``max_sequence_length`` filters, kernel 3,
    ReLU) -> reshape to a square 2D map -> 2x2 max-pool -> two
    Conv2D/tanh/max-pool stages (32 then 16 filters) -> flatten -> dropout 0.5
    -> batch norm -> Dense(300) with tanh.

    Returns:
        Keras ``Model`` named ``'FeatureARCIIGenerationModel'``.
    """
    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # (filters, kernel_size, pool_size) for each 2D convolution stage
    conv2d_stages = [
        (32, [3, 3], (2, 2)),
        (16, [3, 3], (2, 2)),
    ]

    z = Conv1D(filters=max_sequence_length, kernel_size=3, padding='same')(embedded_sequences)
    z = Activation('relu')(z)
    z = Reshape((max_sequence_length, max_sequence_length, -1))(z)
    z = MaxPooling2D(pool_size=(2, 2))(z)

    for n_filters, kernel, pool in conv2d_stages:
        z = Conv2D(filters=n_filters, kernel_size=kernel, padding='same')(z)
        z = Activation('tanh')(z)
        z = MaxPooling2D(pool_size=pool)(z)

    z = Flatten()(z)
    z = Dropout(rate=0.5)(z)
    z = BatchNormalization()(z)
    z = Dense(300)(z)
    output = Activation('tanh')(z)

    return Model(inputs=[sequence_input], outputs=[output],
                 name='FeatureARCIIGenerationModel')

### CNN Dilated

In [53]:
from keras.constraints import max_norm

def DC_CNN_Block(nb_filter, filter_length, dilation, l2_layer_reg):
    """Return a callable that applies one dilated causal convolution block.

    The block runs a dilated causal Conv1D (linear, no bias, max-norm 1,
    L2-regularised) followed by tanh, then two independent 1x1 convolutions:
    one is returned as the skip output, the other is added to the block input.

    Returns:
        function mapping ``block_input`` to ``(block_out, skip_out)``.
    """
    def block(block_input):
        dilated = Conv1D(
            filters=nb_filter,
            kernel_size=filter_length,
            dilation_rate=dilation,
            activation='linear',
            padding='causal',
            use_bias=False,
            kernel_constraint=max_norm(1.),
            kernel_regularizer=l2(l2_layer_reg),
        )(block_input)
        activated = Activation('tanh')(dilated)

        # Two separate 1x1 projections of the activated signal
        skip_out = Conv1D(1, 1, activation='linear', use_bias=False,
                          kernel_constraint=max_norm(1.))(activated)
        projected = Conv1D(1, 1, activation='linear', use_bias=False,
                           kernel_constraint=max_norm(1.))(activated)

        # Residual connection around the block
        block_out = Add()([block_input, projected])
        return block_out, skip_out
    return block

def cnn_dilated_model(embedding_layer, max_sequence_length):
    """Build the dilated-CNN feature extractor.

    Seven ``DC_CNN_Block`` stages are chained (kernel 2 / dilation 1 first,
    then kernel 3 with dilations 2, 4, ..., 12). Their skip outputs are summed,
    passed through tanh and a 1x1 convolution, average-pooled over time and
    projected by Dense(300) with tanh.

    Note: ``GlobalAveragePooling1D`` must already be imported when this is called.

    Returns:
        Keras ``Model`` named ``'FeatureCNNDilatedGenerationModel'``.
    """
    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    n_filters = 128
    n_extra_blocks = 6
    l2_strength = 0.01

    # First block works directly on the embeddings
    residual, skip = DC_CNN_Block(n_filters, 2, 1, l2_strength)(embedded_sequences)
    skip_outputs = [skip]

    # Remaining blocks widen the dilation linearly
    for step in range(1, n_extra_blocks + 1):
        residual, skip = DC_CNN_Block(n_filters, 3, 2 * step, l2_strength)(residual)
        skip_outputs.append(skip)

    merged = Add()(skip_outputs)
    merged = Activation('tanh')(merged)

    x = Conv1D(1, 1, activation='linear', use_bias=False,
               kernel_constraint=max_norm(1.))(merged)
    x = GlobalAveragePooling1D()(x)
    features = Dense(300, activation='tanh')(x)

    return Model(inputs=[sequence_input], outputs=[features],
                 name='FeatureCNNDilatedGenerationModel')

### CNN with filter 3,4,5

In [42]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):
    """Build the multi-kernel CNN feature extractor.

    Inspired by the Yoon Kim model (https://arxiv.org/abs/1408.5882): the
    embedded sequence goes through parallel Conv1D branches with kernel sizes
    3, 4 and 5 (64 filters each), each max-pooled with a pool size equal to its
    kernel size. The branches are concatenated along the time axis,
    average-pooled and projected by Dense(300) with tanh.

    Returns:
        Keras ``Model`` named ``'FeatureCNNGenerationModel'``.
    """
    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    n_filters = 64

    def conv_branch(kernel_size):
        # One convolution + pooling branch for a given kernel size
        convolved = Conv1D(filters=n_filters, kernel_size=kernel_size)(embedded_sequences)
        return MaxPooling1D(pool_size=kernel_size)(convolved)

    branches = [conv_branch(kernel_size) for kernel_size in (3, 4, 5)]
    merged = Concatenate(axis=1)(branches)

    pooled = GlobalAveragePooling1D()(merged)
    features = Dense(300, activation='tanh')(pooled)

    return Model(inputs=[sequence_input], outputs=[features],
                 name='FeatureCNNGenerationModel')

### Bi-LSTM

In [43]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D

def lstm_model(embedding_layer, max_sequence_length):
    """Build the recurrent feature extractor.

    Two stacked (unidirectional) LSTM layers of 50 units each: the first
    returns the full sequence, the second only its last output. A Dense(300)
    layer with tanh produces the final features.

    Returns:
        Keras ``Model`` named ``'FeatureLstmGenerationModel'``.
    """
    lstm_units = 50

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Stacked LSTM encoder
    per_step_states = LSTM(lstm_units, return_sequences=True)(embedded_sequences)
    last_state = LSTM(lstm_units)(per_step_states)

    features = Dense(300, activation='tanh')(last_state)

    return Model(inputs=[sequence_input], outputs=[features],
                 name='FeatureLstmGenerationModel')

### MLP

In [44]:
def mlp_model(input_size):
    """Build the single-layer MLP feature extractor.

    Takes an input vector of ``input_size`` values and maps it to 300 features
    through one Dense layer with tanh activation.

    Returns:
        Keras ``Model`` named ``'FeatureMlpGenerationModel'``.
    """
    output_units = 300
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    features = Dense(output_units, activation='tanh')(info_input)
    return Model(inputs=[info_input], outputs=[features],
                 name='FeatureMlpGenerationModel')

### Siamese model

In [45]:
from keras import backend as K
import tensorflow as tf

def normalize(x, axis):
    """Return ``x`` unchanged together with its L2 norm along ``axis``.

    The norm is clamped from below by ``K.epsilon()`` so it can be used safely
    as a divisor.
    """
    squared_sum = K.sum(K.square(x), axis=axis, keepdims=False)
    safe_norm = K.maximum(K.sqrt(squared_sum), K.epsilon())
    return x, safe_norm
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    """Cosine similarity of two tensors, rescaled from [-1, 1] to [0, 1].

    ``inputs`` is a pair ``(x, y)``. The dot product along the last axis is
    divided by the product of the two norms, mapped through ``(v + 1) / 2`` and
    finally averaged over the last remaining axis.
    """
    left, right = inputs
    left, left_norm = normalize(left, axis=-1)
    right, right_norm = normalize(right, axis=-1)
    cosine = K.sum(left * right, axis=-1) / (left_norm * right_norm)
    rescaled = (cosine + K.constant(1)) / K.constant(2)
    return K.mean(rescaled, axis=-1, keepdims=False)
    
class MarginLoss(keras.layers.Layer):
    """Pass-through layer that registers a max-margin loss on its inputs.

    ``inputs[0]`` is treated as the positive score and ``inputs[1]`` as the
    negative score; the layer returns its inputs unchanged and attaches
    ``sum(max(0, 1 - pos + neg))`` to the model through ``add_loss``.
    """

    def margin_loss(self, pos, neg):
        """Hinge loss with a fixed margin of 1.0, summed over all elements."""
        hinge = K.maximum(0.0, K.constant(1.0) - pos + neg)
        return K.sum(hinge)

    def call(self, inputs):
        """Register the margin loss for ``inputs`` and forward them untouched."""
        pos, neg = inputs[0], inputs[1]
        self.add_loss(self.margin_loss(pos, neg), inputs=inputs)
        return inputs

def custom_margin_loss(y_true, y_pred):
    """Max-margin loss over the stacked (positive, negative) scores.

    Args:
        y_true: Ignored; present only to satisfy the Keras loss signature.
        y_pred: Tensor whose index 0 holds the positive score and index 1 the
            negative score.

    Returns:
        ``sum(max(0, 1 - pos + neg))`` with a fixed margin of 1.0.
    """
    pos, neg = y_pred[0], y_pred[1]
    hinge = K.maximum(0.0, K.constant(1.0) - pos + neg)
    return K.sum(hinge)

def pos_distance(y_true, y_pred):
    """Metric: expose the positive score stored at index 0 of ``y_pred``.

    ``y_true`` is ignored; it is only part of the Keras metric signature.
    """
    positive_score = y_pred[0]
    return positive_score

def neg_distance(y_true, y_pred):
    """Metric: expose the negative score stored at index 1 of ``y_pred``.

    ``y_true`` is ignored; it is only part of the Keras metric signature.
    """
    negative_score = y_pred[1]
    return negative_score

def stack_tensors(vects):
    """Stack a sequence of tensors along a new trailing axis (``axis=-1``)."""
    stacked = K.stack(vects, axis=-1)
    return stacked

#### Propose

In [57]:
from keras.initializers import TruncatedNormal
from keras.regularizers import l2

def residual_bug():
    """Create a residual bottleneck block for bug feature vectors.

    Returns:
        A function ``block(block_input)`` that builds the following sub-graph
        and returns ``(block_out, skip_out)``:

        * ``hidden``    = Dense(width // 2, tanh)(block_input)
        * ``skip_out``  = Dense(width, linear, no bias, max-norm 1)(hidden)
        * ``block_out`` = block_input + Dense(width, linear, no bias, max-norm 1)(hidden)

        ``width`` is the static size of axis 1 of ``block_input``, so the
        input must have a known second dimension.
    """
    # `max_norm` is not imported by any of the import cells visible in this
    # notebook, so it is imported here to keep the block usable on a fresh
    # kernel instead of failing with a NameError when the block is built.
    from keras.constraints import max_norm

    def block(block_input):
        shape_size = K.int_shape(block_input)[1]

        residual = block_input

        # Bottleneck: compress to half the width before projecting back.
        layer_out = Dense(shape_size // 2, activation='tanh')(block_input)

        # Two independent linear projections of the bottleneck: one exposed as
        # the skip output, one added back onto the block input.
        skip_out = Dense(shape_size, activation='linear', use_bias=False, kernel_constraint=max_norm(1.))(layer_out)

        dense_out = Dense(shape_size, activation='linear', use_bias=False, kernel_constraint=max_norm(1.))(layer_out)

        block_out = Add()([residual, dense_out])
        return block_out, skip_out
    return block

In [58]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, sequence_length_info,
                  sequence_length_t, sequence_length_d, name):
    """Build one branch of the siamese network: (title, description, info) -> bug vector.

    The title and description features are summed element-wise (so both
    feature models must emit vectors of the same width) and the result is
    concatenated after the categorical features.

    Earlier experiments that passed the features through ``residual_bug``
    blocks or a final ``Dense(300, tanh)`` layer are disabled and have been
    left out of this implementation.

    Args:
        title_feature_model: Model applied to the title input.
        desc_feature_model: Model applied to the description input.
        categorical_feature_model: Model applied to the info input.
        sequence_length_info: Width of the info input.
        sequence_length_t: Length of the title input.
        sequence_length_d: Length of the description input.
        name: Suffix appended to every input/layer name of this branch
            (e.g. ``'in'``, ``'pos'``, ``'neg'``).

    Returns:
        A Keras ``Model`` with inputs ``[title, desc, info]`` and the merged
        feature vector as its single output.
    """
    title_input = Input(shape=(sequence_length_t, ), name='title_{}'.format(name))
    desc_input = Input(shape=(sequence_length_d, ), name='desc_{}'.format(name))
    info_input = Input(shape=(sequence_length_info, ), name='info_{}'.format(name))

    title_features = title_feature_model(title_input)
    desc_features = desc_feature_model(desc_input)
    info_features = categorical_feature_model(info_input)

    # Element-wise sum of the two textual representations.
    textual_features = Add(name='textual_features_{}'.format(name))([title_features, desc_features])

    # The merge layer and the wrapping model share the same name.
    merged_name = 'merge_features_{}'.format(name)
    merged_features = concatenate([info_features, textual_features], name=merged_name)

    return Model(inputs=[title_input, desc_input, info_input], outputs=[merged_features], name=merged_name)

In [48]:
def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=1):
    """Assemble and compile the triplet similarity model.

    The three branch models are joined into one model whose single output
    stacks the anchor/positive and anchor/negative cosine scores; it is
    trained with ``custom_margin_loss``.

    Args:
        encoded_anchor: Branch model encoding the anchor bug.
        encoded_positive: Branch model encoding the duplicate (positive) bug.
        encoded_negative: Branch model encoding the non-duplicate (negative) bug.
        decay_lr: Multiplier applied to the base learning rate of 1e-3.

    Returns:
        The compiled Keras ``Model`` named ``Similarity_Model``. Its inputs are
        the anchor inputs, then the positive inputs, then the negative inputs.
        Its metrics are, in order, ``pos_distance``, ``neg_distance`` and
        ``custom_margin_loss``.
    """
    # `Model.inputs` is always a list, so the three input lists can simply be
    # concatenated; there is no need to route symbolic tensors through NumPy.
    inputs = list(encoded_anchor.inputs) + list(encoded_positive.inputs) + list(encoded_negative.inputs)

    encoded_anchor = encoded_anchor.output
    encoded_positive = encoded_positive.output
    encoded_negative = encoded_negative.output

    # Cosine
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([encoded_anchor, encoded_positive])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([encoded_anchor, encoded_negative])

    # Loss function only works with a single output, so both scores are
    # stacked into one tensor (index 0 = positive, index 1 = negative).
    output = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances',
        output_shape=(2, 1)
    )([positive_d, negative_d])

    similarity_model = Model(inputs=inputs, outputs=output, name='Similarity_Model')

    # Alternative tried previously:
    # optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=K.epsilon(), schedule_decay=0.01)
    optimizer = Adam(lr=1e-3 * decay_lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)

    # setup the optimization process
    similarity_model.compile(optimizer=optimizer, loss=custom_margin_loss,
                             metrics=[pos_distance, neg_distance, custom_margin_loss])

    return similarity_model

In [59]:
%%time
import keras

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Clear GPU memory
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

# Embeddings
desc_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(baseline.embedding_matrix), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=False)
title_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(baseline.embedding_matrix), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=False)

# Feature models
'''
    cnn_dilated_model
    arcii_model
    cnn_model
    lstm_model
'''
desc_feature_model = cnn_dilated_model(desc_embedding_layer, MAX_SEQUENCE_LENGTH_D)
title_feature_model = lstm_model(title_embedding_layer, MAX_SEQUENCE_LENGTH_T)
categorical_feature_model = mlp_model(number_of_columns_info)

# Similarity model
encoded_anchor = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'in')
encoded_positive = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'pos')
encoded_negative = siamese_model(title_feature_model, desc_feature_model, categorical_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'neg')

similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=1)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


'''
    Configuration
'''
epochs = 100
best_loss = 1
best_epoch = 0
verbose = 0
loss = 1

'''
    Experiment
'''
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = baseline.batch_iterator(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
    train_batch = [train_input_sample['title'], train_input_sample['description'], train_input_sample['info'],
                   train_input_pos['title'], train_input_pos['description'], train_input_pos['info'], 
                   train_input_neg['title'], train_input_neg['description'], train_input_neg['info']]
    
#     if epoch == 10:
#         similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative, decay_lr=0.1)
    
    h = similarity_model.train_on_batch(x=train_batch, y=train_sim)
    
    if (epoch+1 == epochs): #(epoch > 1 and epoch % 10 == 0) or (epoch+1 == epochs):
        recall, _ = evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets)
        print("Epoch: {} Loss: {:.2f}, MarginLoss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}, recall@25: {:.2f}".format(epoch+1,
                                                                                                         h[0],  h[3],
                                                                                                         h[1], h[2], recall))
    else:
        print("Epoch: {} Loss: {:.2f}, MarginLoss: {:.2f}, pos_cosine: {:.2f}, neg_cosine: {:.2f}".format(epoch+1,
                                                                                                         h[0],  h[3],
                                                                                                         h[1],
                                                                                                         h[2]))
    loss = h[3]
    
    if loss < best_loss:
        best_loss = loss
        best_epoch = epoch+1

save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
print('Best_epoch={}, Best_loss={:.2f}, Recall@25={:.2f}'.format(best_epoch, best_loss, recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_in (InputLayer)           (None, 43)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
title_pos (InputLayer)          (None, 43)           0                                            
__________________________________________________________________________________________________
desc_pos (InputLayer)           (None, 500)          0                                            
__________________________________________________________________________________________________
title_neg 

Epoch: 1 Loss: 13.48, MarginLoss: 0.93, pos_cosine: 0.80, neg_cosine: 0.73
Epoch: 2 Loss: 13.05, MarginLoss: 0.84, pos_cosine: 0.83, neg_cosine: 0.67
Epoch: 3 Loss: 12.51, MarginLoss: 0.71, pos_cosine: 0.86, neg_cosine: 0.57
Epoch: 4 Loss: 11.93, MarginLoss: 0.56, pos_cosine: 0.87, neg_cosine: 0.43
Epoch: 5 Loss: 11.35, MarginLoss: 0.42, pos_cosine: 0.89, neg_cosine: 0.31
Epoch: 6 Loss: 10.79, MarginLoss: 0.31, pos_cosine: 0.89, neg_cosine: 0.20
Epoch: 7 Loss: 10.26, MarginLoss: 0.21, pos_cosine: 0.91, neg_cosine: 0.12
Epoch: 8 Loss: 9.77, MarginLoss: 0.15, pos_cosine: 0.93, neg_cosine: 0.07
Epoch: 9 Loss: 9.31, MarginLoss: 0.10, pos_cosine: 0.95, neg_cosine: 0.05
Epoch: 10 Loss: 8.87, MarginLoss: 0.07, pos_cosine: 0.96, neg_cosine: 0.03
Epoch: 11 Loss: 8.45, MarginLoss: 0.05, pos_cosine: 0.98, neg_cosine: 0.02
Epoch: 12 Loss: 8.05, MarginLoss: 0.04, pos_cosine: 0.98, neg_cosine: 0.02
Epoch: 13 Loss: 7.67, MarginLoss: 0.03, pos_cosine: 0.99, neg_cosine: 0.02
Epoch: 14 Loss: 7.31, Margi

In [50]:
# Show the first 20 entries of `_`.
# NOTE(review): `_` is presumably the second return value bound by
# `recall, _ = evaluate_validation_test(...)` in the training cell (the ranked
# result strings); relying on `_` is fragile because IPython also uses it for
# the last displayed output - prefer a named variable.
_[:20]

['196609:241619|241619:0.38720422983169556,129538:0.2880561947822571,269792:0.2663504481315613,130357:0.2629655599594116,173392:0.22602033615112305,205810:0.20740270614624023,144811:0.18740785121917725,104033:0.1714646816253662,132905:0.1692110300064087,227934:0.16867035627365112,146707:0.16306132078170776,159458:0.15082502365112305,136922:0.1502733826637268,129239:0.14674526453018188,139830:0.1466323733329773,149425:0.14462929964065552,82748:0.13654518127441406,180804:0.13541895151138306,138743:0.13101840019226074,319714:0.11330252885818481,250560:0.11092448234558105,136420:0.10773563385009766,85779:0.10686999559402466,101189:0.10645955801010132,168988:0.10322725772857666',
 '35946:31941|31525:0.9709604922682047,34648:0.9593393839895725,35657:0.922690212726593,32045:0.9193853735923767,34612:0.9155542552471161,30073:0.914227768778801,40460:0.9136621206998825,34159:0.9128481894731522,34365:0.9124507457017899,28812:0.8901591822504997,34826:0.8886719197034836,34940:0.8872854188084602,3884

In [211]:
'''
    Between 0-10 epochs recall@25 = 0.28
    Between 0-20 epochs recall@25 = 0.32
    Between 0-70 epochs recall@25 = ?
    Between 0-100 epochs recall@25 = 0.43
'''
# Re-run the retrieval evaluation with the in-memory anchor branch and keep
# both the recall value and the exported ranking.
# NOTE(review): this call passes (retrieval, verbose, encoded_anchor,
# issues_by_buckets), while a later cell calls the same function as
# (0, model, retrieval.test, issues_by_buckets) - one of the two orders does
# not match the current definition; confirm which.
# NOTE(review): the recorded output of this cell (0.23) differs from the
# 0.43 noted above for 100 epochs - the output may be stale.
recall, exported_rank = evaluate_validation_test(retrieval, verbose, encoded_anchor, issues_by_buckets)

# The bare tuple is the cell's last expression, so the notebook displays it.
"recall@25 last epoch:", recall

('recall@25 last epoch:', 0.23)

In [43]:
# loss=h.history['loss']
# val_loss=h.history['val_loss']

# plt.plot(loss, label='loss')
# plt.plot(val_loss, label='val_loss')
# plt.title('Model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()

### Retrieval evaluation

In [77]:
# Report how many queries the retrieval test set contains.
print("Total of queries: {}".format(len(retrieval.test)))

Total of queries: 7253


#### Loading the trained model

In [75]:
# Display the path the feature (anchor) model was saved under.
# NOTE(review): the recorded output starts with 'baseline_feature_', but
# SAVE_PATH_FEATURE is configured with the 'propose_feature_' prefix - the
# stored output looks stale; re-run on a fresh kernel to confirm.
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'baseline_feature_100epochs_64batch(eclipse)'

In [76]:
# Obtain the feature model through `get_model_vectorizer`, using the same
# path expression that was used when saving `encoded_anchor`.
# NOTE(review): presumably this loads the saved model from disk - confirm
# against the helper's definition.
model = get_model_vectorizer(path=SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))



In [77]:
# Print the layer-by-layer summary of the model obtained above as a sanity check.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 43)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          504900      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [78]:
# Evaluate `model` on the retrieval test queries; keeps the recall value and
# the exported ranking rows.
# NOTE(review): the arguments here are (0, model, retrieval.test,
# issues_by_buckets), whereas the training cell calls the same function as
# (retrieval, verbose, encoded_anchor, issues_by_buckets). The two orders
# cannot both match one signature - confirm which is current.
recall, exported_rank = evaluate_validation_test(0, model, retrieval.test, issues_by_buckets)

In [None]:
# File that will receive the exported ranking, one row per line.
# NOTE(review): `path` is not defined in the visible part of the notebook and
# this cell has no execution count - verify it is set (e.g. to a results
# directory) before running, otherwise this raises NameError.
EXPORT_RANK_PATH = os.path.join(path, 'exported_rank_propose.txt')

In [80]:
# Write the exported ranking to disk, one row per line.
with open(EXPORT_RANK_PATH, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [81]:
# Compute the evaluation report from the exported ranking file and display it
# as the cell output.
# NOTE(review): `Evaluation` is not imported in the visible part of the
# notebook - confirm where it comes from.
evaluation = Evaluation()
report = evaluation.evaluate(EXPORT_RANK_PATH)
report

Evaluating...


{'1 - recall_at_5': 0.33,
 '2 - recall_at_10': 0.39,
 '3 - recall_at_15': 0.43,
 '4 - recall_at_20': 0.47,
 '5 - recall_at_25': 0.5}