# Bug triage with Deep Learning

In [1]:
import keras
from tensorflow.contrib.tensorboard.plugins import projector

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path
# (presumably so project packages such as `methods` resolve — verify layout).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline

## Auxiliary methods

## Dataset bugs

In [6]:
#from google.colab import drive
#drive.mount('/content/drive')

## Global configuration

In [7]:
# Sequence lengths handed to Baseline(...) and to the feature models below:
# _T is used for bug titles, _D for bug descriptions.
# NOTE(review): the trailing "# 40" / "# 200" look like previously used values — confirm.
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 100 # 200
# Width of the word vectors; passed to generating_embed and to both Embedding layers.
EMBEDDING_DIM = 300
# Passed to generating_embed as MAX_NB_WORDS (presumably a vocabulary cap — verify in Baseline).
MAX_NB_WORDS = 2000

### Parse preprocessed bugs

In [8]:
# Directory handed to Baseline (constructor, load_ids, prepare_dataset, generators).
DIR = 'data/processed/eclipse'
# Directory that holds eclipse_pairs.csv, read in the next cell.
DIR_PAIRS = 'data/normalized/eclipse'

In [9]:
# Pair table read from DIR_PAIRS.
# NOTE(review): df_train_pair is not referenced again in the visible notebook — confirm it is still needed.
df_train_pair = pd.read_csv(os.path.join(DIR_PAIRS, 'eclipse_pairs.csv'))
# Project helper that drives the loading, batching and embedding steps below.
baseline = Baseline(DIR, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

In [10]:
# Presumably populates baseline.bug_ids (inspected in the next cell) — verify in Baseline.load_ids.
baseline.load_ids(DIR)

In [11]:
# Sanity check: number of bug ids available after load_ids.
len(baseline.bug_ids)

212512

### Dictionary of titles and descriptions

In [None]:
%%time

# Long-running step (the recorded progress bar shows >20 minutes for ~212k items).
# Presumably fills baseline.sentence_dict, checked in the next cell — verify in Baseline.
baseline.load_preprocess()

 94%|██████████████████████████████▉  | 199213/212512 [22:08<01:58, 112.34it/s]

In [None]:
# Sanity check: number of entries in the sentence dictionary.
len(baseline.sentence_dict)

## Batch generation

### Generating triplet batches

In [None]:
%%time

# Directory passed to every Baseline batching call below. os.path.join with a
# single argument returns that argument unchanged, so DIR is used directly.
bug_dir = DIR
baseline.prepare_dataset(bug_dir)

In [None]:
# Load the bug records into the baseline object (details live in Baseline.load_bugs).
baseline.load_bugs()

In [None]:
%%time

batch_size = 1
batch_size_test = 512

# Training batches come from a generator; the validation triplets are drawn once
# so that model performance is always measured against the same reference batch.
train_gen = baseline.siam_gen(bug_dir, batch_size, 1)
valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = baseline.batch_iterator(
    bug_dir, batch_size_test, 1)

# Input order: titles (anchor, positive, negative), then descriptions in the same order.
valid_triplet = (valid_input_sample, valid_input_pos, valid_input_neg)
test_gen = (
    [bug['title'] for bug in valid_triplet] + [bug['description'] for bug in valid_triplet],
    valid_sim,
)

In [None]:
# Shapes of the fixed validation batch: anchor titles, anchor descriptions, labels.
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_sim.shape

### Validate input

In [None]:
%%time 

# Eyeball a few generated examples (second argument 5 is presumably the number shown — verify).
baseline.display_batch(bug_dir, 5)

## Pre-trained embeddings

Loading pretrained word vectors

### GloVe

In [None]:
%%time

# Build the embedding data from the vectors under data/embed; the experiment cell
# later reads baseline.embedding_matrix and baseline.word_index.
# NOTE(review): presumably this call is what populates them — verify in Baseline.
baseline.generating_embed(GLOVE_DIR='data/embed', EMBEDDING_DIM=EMBEDDING_DIM, MAX_NB_WORDS=MAX_NB_WORDS)

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### CNN with filter 3,4,5

In [None]:
import keras
from keras.layers import GlobalMaxPooling1D 

# Reset the Keras backend session before defining the model builders.
# The experiment cell calls clear_session() again right before building the models.
keras.backend.clear_session()

def cnn_model(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
  """Build the CNN feature extractor used for bug descriptions.

  The input sequence is embedded, passed through three parallel Conv1D(32)
  branches with kernel sizes 3, 4 and 5 (each followed by MaxPooling1D(3)),
  the branches are concatenated along axis 1, and a final Conv1D(64) +
  global max pooling + ReLU produces the feature vector. The parallel-filter
  idea follows Yoon Kim (https://arxiv.org/abs/1408.5882); the trailing
  conv/global-pool stage replaces the Flatten/Dropout head of that model.

  Args:
    embeddings: initial weight matrix for the Embedding layer.
    num_words: Embedding input dimension.
    embedding_dim: Embedding output dimension.
    max_sequence_length: length of the input sequences.
    trainable: whether the embedding weights are updated during training.

  Returns:
    A Keras Model named 'FeatureCNNGenerationModel' with a single input
    ('Feature_BugInput') and a single feature output.
  """
  embedder = Embedding(input_dim=num_words,
                       output_dim=embedding_dim,
                       weights=[embeddings],
                       input_length=max_sequence_length,
                       trainable=trainable)

  tokens = Input(shape=(max_sequence_length,), name='Feature_BugInput')
  embedded = embedder(tokens)

  def conv_pool_branch(kernel_size):
    # One n-gram detector: convolution followed by local max pooling.
    convolved = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
    return MaxPooling1D(pool_size=3)(convolved)

  branches = [conv_pool_branch(width) for width in (3, 4, 5)]
  merged = Concatenate(axis=1)(branches)

  # Extra 1D convnet with global max pooling instead of the original Flatten head.
  features = Conv1D(filters=64, kernel_size=3, activation='relu')(merged)
  features = GlobalMaxPooling1D()(features)
  features = Activation('relu')(features)

  return Model(inputs=[tokens], outputs=[features], name='FeatureCNNGenerationModel')

### Bi-LSTM

In [None]:
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional

def lstm_model(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
  """Build the bidirectional-LSTM feature extractor used for bug titles.

  The input sequence is embedded and encoded by a Bidirectional LSTM whose
  forward and backward outputs are summed (merge_mode='sum'), followed by a
  ReLU activation.

  Args:
    embeddings: initial weight matrix for the Embedding layer.
    num_words: Embedding input dimension.
    embedding_dim: Embedding output dimension.
    max_sequence_length: length of the input sequences.
    trainable: whether the embedding weights are updated during training.

  Returns:
    A Keras Model named 'FeatureLstmGenerationModel' with a single input
    ('Feature_BugInput') and a single feature output.
  """
  number_lstm_units = 100
  rate_drop_lstm = 0
  recurrent_dropout = 0

  embedding_layer = Embedding(num_words,
                              embedding_dim,
                              weights=[embeddings],
                              input_length=max_sequence_length,
                              trainable=trainable)

  sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
  embedded_sequences = embedding_layer(sequence_input)

  # LSTM encoder. The recurrent dropout now comes from its own setting; the
  # original passed rate_drop_lstm twice and left recurrent_dropout unused.
  # Both are 0 today, so the resulting network is unchanged.
  lstm_layer = Bidirectional(
      LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=recurrent_dropout),
      merge_mode='sum')

  x = lstm_layer(embedded_sequences)

  layer = Activation('relu')(x)

  lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name='FeatureLstmGenerationModel')

  return lstm_feature_model

### Siamese model

In [None]:
from keras import backend as K
import tensorflow as tf

# https://stackoverflow.com/questions/50673196/keras-triplet-loss-crashes-when-training
class MarginLoss(keras.layers.Layer):
  """Layer that attaches a cosine-distance triplet (margin) loss to the model.

  It is called on a list of three tensors `[anchor, positive, negative]`.
  The loss is registered with `add_loss`, so the enclosing model can be
  compiled with `loss=None`. The layer output is the three inputs stacked
  along a new leading axis.
  """

  def call(self, inputs):
    bug_in, bug_pos, bug_neg = inputs
    loss, dis_pos, dis_neg = self.distance(bug_in, bug_pos, bug_neg)
    self.add_loss(loss, inputs=inputs)
    return K.stack(inputs)

  def compute_output_shape(self, input_shape):
    # K.stack puts a new leading axis of size len(inputs) in front of the
    # common input shape; report that instead of a fixed (None, 1).
    return (len(input_shape),) + tuple(input_shape[0])

  def distance(self, bug_in, bug_pos, bug_neg):
    """Return (triplet loss, mean anchor-positive distance, mean anchor-negative distance)."""
    dis_pos = self._cos_distance_per_sample(bug_in, bug_pos)
    dis_neg = self._cos_distance_per_sample(bug_in, bug_neg)
    ep = 1
    # Standard triplet hinge on distances: the positive must be closer to the
    # anchor than the negative by at least the margin. The previous form
    # (ep - dis_pos + dis_neg) is the similarity version of the formula and,
    # applied to distances, rewarded pushing duplicates apart.
    # The hinge is taken per sample and then averaged.
    d1 = K.maximum(0.0, ep + dis_pos - dis_neg)
    return K.mean(d1), K.mean(dis_pos), K.mean(dis_neg)

  def l2_normalize(self, x, axis):
    """Scale `x` to unit L2 norm along `axis`, guarding against a zero norm."""
    norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
    # Only the denominator is clamped; clamping the numerator would replace
    # every negative component with epsilon.
    return x / K.maximum(norm, K.epsilon())

  def _cos_distance_per_sample(self, y_true, y_pred):
    """Cosine distance (1 - cosine similarity) along the last axis, not averaged."""
    y_true = self.l2_normalize(y_true, axis=-1)
    y_pred = self.l2_normalize(y_pred, axis=-1)
    return 1 - K.sum((y_true * y_pred), axis=-1)

  # https://github.com/keras-team/keras/issues/3031
  def cos_distance(self, y_true, y_pred):
    """Mean cosine distance between `y_true` and `y_pred` (scalar tensor)."""
    return K.mean(self._cos_distance_per_sample(y_true, y_pred))
    
def identity_loss(y_true, y_pred):
    """Return the mean of `y_pred`; `y_true` is multiplied by zero and so only
    takes part in the expression, not in the value."""
    zeroed_targets = 0 * y_true
    return K.mean(y_pred - zeroed_targets)

# define the margin loss like hinge loss
def margin_loss(y_true, y_pred):
    """Hinge-style margin loss over `y_pred[:, 0, 0]` and `y_pred[:, 1, 0]`.

    Computes mean(max(0, 1 - first**2 + second**2)); `y_true` is ignored.
    """
    margin = K.constant(1.0)
    first_squared = K.square(y_pred[:, 0, 0])
    second_squared = K.square(y_pred[:, 1, 0])
    hinge = K.maximum(0, margin - first_squared + second_squared)
    return K.mean(hinge)

In [None]:
class myCallback(keras.callbacks.Callback):
    """Training callback that owns a TensorFlow summary writer.

    Args:
        log_dir: base directory; event files go to `<log_dir>/training`.
        margin_loss: stored on the instance as `self.margin_loss`; it is not
            used by the callback itself.
    """

    def __init__(self, log_dir = './logs/', margin_loss=None):
        super(myCallback, self).__init__()
        training_log_dir = os.path.join(log_dir, 'training')
        self.summary_writer = tf.summary.FileWriter(training_log_dir)
        self.margin_loss = margin_loss

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # The previous version looked up the layer 'margin_loss_1' here without
        # using it, which raised whenever the layer had a different name.
        # TODO: write per-epoch distance summaries via self.summary_writer once
        # the loss layer exposes them.
        super(myCallback, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        # Release the event file handle opened in __init__.
        self.summary_writer.flush()
        self.summary_writer.close()
        super(myCallback, self).on_train_end(logs)

In [None]:
from keras.layers import concatenate, Add, Lambda
from keras.optimizers import Adam

def siamese_model(lstm_feature_model, cnn_feature_model, max_sequence_length_t, max_sequence_length_d):
  """Assemble and compile the triplet (anchor / positive / negative) model.

  Titles go through `lstm_feature_model`, descriptions through
  `cnn_feature_model`; the two feature vectors of each bug are concatenated
  and the three encodings are handed to `MarginLoss`, which registers the
  training loss. The model is therefore compiled with `loss=None`.

  Args:
    lstm_feature_model: feature model applied to the three title inputs.
    cnn_feature_model: feature model applied to the three description inputs.
    max_sequence_length_t: length of the title input sequences.
    max_sequence_length_d: length of the description input sequences.

  Returns:
    The compiled Keras Model 'Similarity_Model' whose inputs are, in order,
    title_in, title_pos, title_neg, desc_in, desc_pos, desc_neg.
  """
  title_names = ('title_in', 'title_pos', 'title_neg')
  desc_names = ('desc_in', 'desc_pos', 'desc_neg')
  merge_names = ('merge_features_in', 'merge_features_pos', 'merge_features_neg')

  title_inputs = [Input(shape=(max_sequence_length_t, ), name=label) for label in title_names]
  desc_inputs = [Input(shape=(max_sequence_length_d, ), name=label) for label in desc_names]

  # The same feature model instance is applied to all three bugs (shared weights).
  title_features = [lstm_feature_model(tensor) for tensor in title_inputs]
  desc_features = [cnn_feature_model(tensor) for tensor in desc_inputs]

  # One encoding per role: anchor, positive, negative.
  encodings = [
      concatenate([title_vec, desc_vec], name=label)
      for title_vec, desc_vec, label in zip(title_features, desc_features, merge_names)
  ]

  stacked = MarginLoss()(encodings)

  similarity_model = Model(inputs=title_inputs + desc_inputs,
                           outputs=stacked,
                           name='Similarity_Model')

  # The loss is supplied by the MarginLoss layer, hence loss=None.
  similarity_model.compile(optimizer='adam', loss=None)

  return similarity_model

## Experiment

In [None]:
%%time
import keras

# Inspired on https://pastebin.com/TaGFdcBA

# Reset the Keras backend session before building the models.
keras.backend.clear_session()

# Both feature extractors get the same embedding matrix, with frozen weights.
embedding_kwargs = dict(
    embeddings=baseline.embedding_matrix,
    num_words=len(baseline.word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    trainable=False,
)

cnn_feature_model = cnn_model(max_sequence_length=MAX_SEQUENCE_LENGTH_D, **embedding_kwargs)
lstm_feature_model = lstm_model(max_sequence_length=MAX_SEQUENCE_LENGTH_T, **embedding_kwargs)

similarity_model = siamese_model(
    lstm_feature_model,
    cnn_feature_model,
    MAX_SEQUENCE_LENGTH_T,
    MAX_SEQUENCE_LENGTH_D,
)

similarity_model.summary()

# Train on the triplet generator; no validation data or callbacks are attached.
h = similarity_model.fit_generator(
    train_gen,
    steps_per_epoch=16,
    epochs=100,
    verbose=True,
)

In [None]:
# Training-loss curve of the similarity model, one point per epoch.
loss = h.history['loss']

fig, ax = plt.subplots()
ax.plot(loss, label='loss')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train'], loc='upper left')
plt.show()

In [None]:
# Persist the trained similarity model and its training history under one name.
# NOTE(review): the helpers are called on the class with '' in the first
# (presumably `self`) position — confirm they do not need a real instance.
name = 'baseline_100epoch_16steps_(eclipse)'
Baseline.save_model('', similarity_model, name)
Baseline.save_result('', h, name)

In [None]:
# List the layer names; the next cell looks up 'merge_features_in' / 'merge_features_pos' by name.
for layer in similarity_model.layers:
    print(layer.name)

In [None]:
## Freeze weights
# Every layer of the trained similarity model is frozen, so only the Dense
# layers added below are trained by model_clf.
for layer in similarity_model.layers:
    layer.trainable = False

# Reuse the anchor and positive encodings of the similarity model as features.
model_in = similarity_model.get_layer('merge_features_in')
model_pos = similarity_model.get_layer('merge_features_pos')
x_in = model_in.output
x_pos = model_pos.output
# Classification head on the concatenated pair encoding: 64 -> 32 -> 2-way softmax.
x = Concatenate()([x_in, x_pos])
x = Dense(64, activation = 'relu')(x)
x = Dense(32, activation = 'relu')(x)
output = Dense(2, activation = 'softmax', name = 'output')(x)
# The classifier keeps all six inputs of the similarity model, although the
# head only reads the anchor and positive encodings.
model_clf = Model(inputs=similarity_model.input, outputs=output)
model_clf.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
model_clf.summary()

In [None]:
%%time

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Integer-encode the labels of the fixed validation batch, then one-hot them
# to match the 2-way softmax output of model_clf.
valid_labels = to_categorical(encoder.fit_transform(test_gen[1]))
test_validation = (test_gen[0], valid_labels)

clf_train_gen = baseline.siam_gen_classification(bug_dir, 512, 1)
h_clf = model_clf.fit_generator(
    clf_train_gen,
    steps_per_epoch=16,
    validation_data=test_validation,
    epochs=100,
    verbose=True,
)

In [None]:
# Show the classifier's training history.
# NOTE(review): unlike the Baseline.save_model('', ...) calls, no placeholder is
# passed in the first position here — confirm the helper's signature.
Baseline.validation_accuracy_loss(h_clf)

In [None]:
# Save through the Baseline helpers, as the earlier save cell does; the bare
# names `save_model` / `save_result` are not defined in this notebook and
# raised NameError.
# NOTE(review): this still saves `similarity_model` / `h`. Since the cell follows
# the classifier training, `model_clf` / `h_clf` may be what was intended, and
# the name says "10epoch" although training runs 100 epochs — confirm.
name = 'baseline_10epoch_16steps_512batch(test)'
Baseline.save_model('', similarity_model, name)
Baseline.save_result('', h, name)