# Bug triage with Deep Learning

In [1]:
from __future__ import print_function, division

In [2]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Dataset bugs

In [4]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
%%time

DIR = 'drive/My Drive/Colab Notebooks/dataset/'

x_train, y_train = [], []

with open(DIR + 'train_mozilla.txt', 'r') as f:
    for row in f:
      cols = re.split('\|', row)
      title_a = np.array(re.split(',', cols[0])).astype(int)
      title_b = np.array(re.split(',', cols[1])).astype(int)
      desc_a = np.array(re.split(',', cols[2])).astype(int)
      desc_b = np.array(re.split(',', cols[3])).astype(int)
      x_train.append([title_a, title_b, desc_a, desc_b])
      y_train.append(int(cols[4]))

In [None]:
%%time

DIR = 'drive/My Drive/Colab Notebooks/dataset/'

x_test, y_test = [], []

with open(DIR + 'test_mozilla.txt', 'r') as f:
    for row in f:
      cols = re.split('\|', row)
      title_a = np.array(re.split(',', cols[0])).astype(int)
      title_b = np.array(re.split(',', cols[1])).astype(int)
      desc_a = np.array(re.split(',', cols[2])).astype(int)
      desc_b = np.array(re.split(',', cols[3])).astype(int)
      x_test.append([title_a, title_b, desc_a, desc_b])
      y_test.append(int(cols[4]))

In [0]:
DIR = 'drive/My Drive/Colab Notebooks/dataset/'

df_x_train = pd.read_csv(DIR + 'df_train.csv')
df_x_test = pd.read_csv(DIR + 'df_test.csv')

In [8]:
df_x_train.head()
#x_test[0]

Unnamed: 0,title_a,title_b,description_a,description_b
0,winstripe theme incorrectly positions single tab,first tab content moving opening closing secon...,user agent mozilla 5 0 windows u windows nt 5 ...,user agent mozilla 5 0 windows u windows nt 5 ...
1,ctrl invoke prefered mailer,mailto link improperly launches outlook expres...,user agent mozilla 5 0 windows u windows nt 5 ...,user agent mozilla 5 0 windows u windows nt 5 ...
2,clicking icon opens new tab switches,implement revised new tab experience,clicking icon tab group opens new tab immediat...,https wiki mozilla org firefox projects tabcan...
3,windows installer creates directory startmenu ...,start menu folder created despite selecting do...,user agent mozilla 5 0 windows u windows nt 5 ...,user agent mozilla 5 0 x11 u linux i686 en us ...
4,firefox doesnt load images size doesnt match d...,images appearing webpages,user agent mozilla 5 0 x11 u linux i686 fr rv ...,user agent mozilla 5 0 x11 u linux i686 en us ...


### Dividir train e test (título e descrição)

In [0]:
x_train_title_a = np.array([ row[0] for row in x_train])
x_train_title_b = np.array([ row[1] for row in x_train])
x_test_title_a = np.array([ row[0] for row in x_test])
x_test_title_b = np.array([ row[1] for row in x_test])

x_train_desc_a = np.array([ row[2] for row in x_train])
x_train_desc_b = np.array([ row[3] for row in x_train])
x_test_desc_a = np.array([ row[2] for row in x_test])
x_test_desc_b = np.array([ row[3] for row in x_test])

### Dicionário de títulos e descrições

In [10]:
%%time 

sentence_dict = {}

def creating_dict(vec_a, vec_b, txt_a, txt_b):
  for v1, v2, s1, s2 in zip(vec_a, vec_b, txt_a, txt_b):
    #print(v1, v2)
    name_1 = ','.join(v1.astype(str))
    name_2 = ','.join(v2.astype(str))
    sentence_dict[name_1] = s1
    sentence_dict[name_2] = s2

################# TITLE #############################
creating_dict(x_train_title_a, x_train_title_b, df_x_train['title_a'].values, df_x_train['title_b'].values)
creating_dict(x_test_title_a, x_test_title_b, df_x_test['title_a'].values, df_x_test['title_b'].values)

################ DESCRIPTION #############################
creating_dict(x_train_desc_a, x_train_desc_b, df_x_train['description_a'].values, df_x_train['description_b'].values)
creating_dict(x_test_desc_a, x_test_desc_b, df_x_test['description_a'].values, df_x_test['description_b'].values)

CPU times: user 2min 18s, sys: 614 ms, total: 2min 18s
Wall time: 2min 19s


In [11]:
len(sentence_dict)

374004

### Reogarnizar grupos

In [12]:
# reorganize by groups
train_groups = {}
test_groups = {}

train_groups['title_a'] = [x_train_title_a[np.where(y_train==i)[0]] for i in np.unique(y_train)] 
train_groups['title_b'] = [x_train_title_b[np.where(y_train==i)[0]] for i in np.unique(y_train)] 
train_groups['desc_a'] = [x_train_desc_a[np.where(y_train==i)[0]] for i in np.unique(y_train)] 
train_groups['desc_b'] = [x_train_desc_b[np.where(y_train==i)[0]] for i in np.unique(y_train)] 

test_groups['title_a'] = [x_test_title_a[np.where(y_test==i)[0]] for i in np.unique(y_test)]
test_groups['title_b'] = [x_test_title_b[np.where(y_test==i)[0]] for i in np.unique(y_test)]
test_groups['desc_a'] = [x_test_desc_a[np.where(y_test==i)[0]] for i in np.unique(y_test)]
test_groups['desc_b'] = [x_test_desc_b[np.where(y_test==i)[0]] for i in np.unique(y_test)]

print("Title")
print('train groups:', [x.shape[0] for x in train_groups['title_a']])
print('test groups:', [x.shape[0] for x in test_groups['title_b']])
print("Description")
print('train groups:', [x.shape[0] for x in train_groups['desc_a']])
print('test groups:', [x.shape[0] for x in test_groups['desc_b']])

Title
train groups: [49981, 248421]
test groups: [12644, 61957]
Description
train groups: [49981, 248421]
test groups: [12644, 61957]


### Validando o treino

In [13]:
train_groups['title_a'][0].shape

(49981, 40)

In [0]:
def display_bug(idx, x, y):
  # idx = np.random.choice(range(len(x_train)))
  print("Bugs are '{}''".format('duplicated' if y[idx] == 1 else 'no duplicated' ))
  TITLE_A = 0
  TITLE_B = 1
  DESC_A = 2
  DESC_B = 3
  key_t_a = ','.join(x[idx][TITLE_A].astype(str))
  key_t_b = ','.join(x[idx][TITLE_B].astype(str))
  key_d_a = ','.join(x[idx][DESC_A].astype(str))
  key_d_b = ','.join(x[idx][DESC_B].astype(str))
  print("Title A:", sentence_dict[key_t_a])
  print("Title B:", sentence_dict[key_t_b])
  print("Desciption A:", sentence_dict[key_d_a])
  print("Description B:", sentence_dict[key_d_b])

In [15]:
# idx = np.random.choice(range(len(x_train)))
idx = 12 # 1-9 duplicates, 12 no duplicate
display_bug(idx, x_train, y_train)

Bugs are 'no duplicated''
Title A: new firefox update causes browsing files crash firefox
Title B: en us jar different among platforms
Desciption A: user agent mozilla 4 0 compatible msie 7 0 windows nt 5 1 net clr 1 1 4322 net clr 2 0 50727 net clr 3 0 04506 30 net clr 3 0 04506 648 build identifier 2 0 0 14 want attach document e mail upload site using bowse function thus browse dialoge box opens whole firefox windowx freeze therefore force killing yesterday didnt problem firefox automatically updated new version regards reproducible always steps reproduce 1 press browse button 2 3
Description B: user agent mozilla 5 0 x11 u linux i686 en us rv 1 5 gecko 20031007 firebird 0 7 build identifier mozilla 5 0 x11 u linux i686 en us rv 1 5 gecko 20031007 firebird 0 7 windows specific strings en us jar file bundled windows version firebird 0 7 least locale en us communicator platform pref platformprefoverlay dtd maybe others dont know mentioning default browser settings linux version missin

### Validando o teste

In [16]:
# idx = np.randomvalidation_steps.choice(range(len(x_train)))
idx = 1 # 1 duplicates, 0 no duplicate
display_bug(idx, x_test, y_test)

Bugs are 'duplicated''
Title A: restore session multiple tabs requiring passwords results multiple master pw dialogs
Title B: master password dialog multiple times upon different tabs
Desciption A: user agent mozilla 5 0 x11 u linux i686 en us rv 1 9 2 3 gecko 20100401 firefox 3 6 3 build identifier mozilla 5 0 x11 u linux i686 en us rv 1 9 2 3 gecko 20100401 firefox 3 6 3 one master pw dialog displayed restoring session one tab saved password form reproducible always steps reproduce create session n 1 tabs saved password form close browser reopen workaround fill n max n dialog close rest actual results n master password dialogs stacked sometimes order requires finding filling closing order expected results one master password dialog displayed restoring session multiple tabs saved password forms
Description B: user agent mozilla 5 0 windows u windows nt 6 0 de rv 1 9 2 4 gecko 20100611 firefox 3 6 4 net clr 3 5 30729 build identifier mozilla 5 0 windows u windows nt 6 0 de rv 1 9 2 4 g

## Configurações Globais

In [0]:
MAX_SEQUENCE_LENGTH_T = 40
MAX_SEQUENCE_LENGTH_D = 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

## Geração de batches

#### Balanced 50%

In [0]:
def shuffle_in_unison_scary(a, b, c):
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(rng_state)
    np.random.shuffle(b)
    np.random.set_state(rng_state)
    np.random.shuffle(c)
    
def gen_random_batch(in_groups, batch_size = 128):
    titles_a, titles_b, descs_a, descs_b, out_score = [], [], [], [], []
    all_groups = list(range(2)) # duplicated and non-duplicated
    keys = list(in_groups)
    for group in all_groups:
        
        group_idx = [group for i in range(batch_size)]
        
        item_selected = [np.random.choice(range(in_groups[keys[0]][c_idx][0].shape[0])) for c_idx in group_idx]
        
        title_a = [in_groups['title_a'][c_idx][row] for c_idx, row in zip(group_idx, item_selected)]
        title_b = [in_groups['title_b'][c_idx][row] for c_idx, row in zip(group_idx, item_selected)]
        desc_a = [in_groups['desc_a'][c_idx][row] for c_idx, row in zip(group_idx, item_selected)]
        desc_b = [in_groups['desc_b'][c_idx][row] for c_idx, row in zip(group_idx, item_selected)]
        out_score += [group] * batch_size
        
        titles_a += title_a
        titles_b += title_b
        descs_a += desc_a
        descs_b += desc_b
        
    # shuffle_in_unison_scary(sentence1, sentence2, out_score)
        
    return np.stack(titles_a,0), np.stack(titles_b,0), np.stack(descs_a,0), np.stack(descs_b,0), np.stack(out_score,0)

In [0]:
%%time

batch_size = 256

# make a generator out of the data
def siam_gen(in_groups, batch_size = 128):
    while True:
        t_a, t_b, d_a, d_b, sim = gen_random_batch(in_groups, batch_size//2)
        yield { 'title_a' : t_a, 'title_b': t_b, 'desc_a' : d_a, 'desc_b' : d_b }, sim
        
# we want a constant validation group to have a frame of reference for model performance
valid_t_a, valid_t_b, valid_d_a, valid_d_b, valid_sim = gen_random_batch(test_groups, batch_size)
train_gen = siam_gen(train_groups, batch_size=512)
test_gen = ([valid_t_a, valid_t_b, valid_d_a, valid_d_b], valid_sim)

In [0]:
def get_neg_bug(invalid_bugs, bug_ids):
  neg_bug = random.choice(bug_ids)
  while neg_bug in invalid_bugs:
    neg_bug = random.choice(bug_ids)
  return neg_bug

def read_batch_triplets(batch_triplets, data):
  batch_input_bugs = []
  batch_pos_bugs = []
  batch_neg_bugs = []
  for triplet in batch_triplets:
    batch_input_bugs.append(triplet[0])
    batch_pos_bugs.append(triplet[1])
    batch_neg_bugs.append(triplet[2])
  return read_batch_bugs(batch_input_bugs, data), \
         read_batch_bugs(batch_pos_bugs, data), \
         read_batch_bugs(batch_neg_bugs, data)

def read_batch_bugs(batch_bugs, data, test=False):
  desc_word = []
  short_desc_word = []
  info = []
  for bug_id in batch_bugs:
    bug = pickle.load(open(os.path.join(data, 'bugs', '{}.pkl'.format(bug_id)), 'rb'))
    desc_word.append(bug['description_word'])
    short_desc_word.append(bug['title_word'])
    
  desc_word = Variable(torch.from_numpy(data_padding(desc_word, 500)), volatile=test).cuda()
  short_desc_word = Variable(torch.from_numpy(data_padding(short_desc_word, 100)), volatile=test).cuda()
  batch_bugs = dict()
  batch_bugs['desc'] = (desc_word)
  batch_bugs['title'] = (short_desc_word)

  return batch_bugs

# data - path
# batch_size - 128
# n_neg - 1
def batch_iterator(train_data, data, batch_size, n_neg):
  random.shuffle(train_data)
  bug_ids = read_bug_ids(data)
  num_batches = int(len(train_data) / batch_size)
  if len(data) % batch_size > 0:
    num_batches += 1
  # loop = tqdm(range(num_batches))
  # loop.set_description('Training')
  for i in range(num_batches):
    batch_triplets = []
    for j in range(batch_size):
      offset = batch_size * i + j
      if offset >= len(train_data):
        break
      for i in range(n_neg):
        neg_bug = get_neg_bug(dup_sets[train_data[offset][0]], bug_ids)
        batch_triplets.append([train_data[offset][0], train_data[offset][1], neg_bug])
    yield loop, read_batch_triplets(batch_triplets, data)

In [0]:
from sklearn.metrics import roc_auc_score
import sklearn.metrics

def curve_roc_auc(model, x, y_valid):
  y_hat = model.predict(x)
  pct_auc = roc_auc_score(y_valid, y_hat) * 100
  #print('ROC/AUC: {:0.2f}'.format(pct_auc))

  fpr, tpr, _ = sklearn.metrics.roc_curve(y_valid, y_hat)
  roc_auc = sklearn.metrics.auc(fpr, tpr)
  plt.figure()
  lw = 2
  plt.plot(fpr, tpr, color='darkorange',
           lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
  plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('Taxa de Falsos Positivos')
  plt.ylabel('Taxa de Verdadeiros Positivos')
  plt.title('Receiver Operating Characteristic (ROC)')
  plt.legend(loc="lower right")
  plt.show()

### Validar entrada

In [0]:
def display_batch(groups, nb):
  t_a, t_b, d_a, d_b, v_sim = gen_random_batch(groups, nb)

  for ta, tb, da, db, sim in zip(t_a, t_b, d_a, d_b, v_sim):
    #print(ba.astype(str))
    key_t_a = ','.join(ta.astype(str))
    key_t_b = ','.join(tb.astype(str))
    key_d_a = ','.join(da.astype(str))
    key_d_b = ','.join(db.astype(str))
    print("Title =", sentence_dict[key_t_a])
    print("Title =", sentence_dict[key_t_b])
    print("Description =", sentence_dict[key_d_a])
    print("Description =", sentence_dict[key_d_b])
    print("similar =", str(sim))
    print("########################")

#### Train

In [21]:
display_batch(train_groups, 5)

Title = unable open preferences undeer options
Title = downloaded updated file old version shown
Description = user agent mozilla 4 0 compatible msie 6 0 windows 98 win 9x 4 90 tucows build identifier firefox 1 5 trying access preferences tools nothing appears screen except bottom shows ok cancel help reproducible always steps reproduce 1 firefox 2 tools 3 options 4 preferences actual results opening tools options ok go preferences nothing shows screen except bottom shows ok cancel help expected results nothing opened preferences
Description = user agent mozilla 5 0 x11 u linux i686 pl pl rv 1 9 2 3 gecko 20100423 ubuntu 10 04 lucid firefox 3 6 3 build identifier mozilla 5 0 x11 u linux i686 pl pl rv 1 9 2 3 gecko 20100423 ubuntu 10 04 lucid firefox 3 6 3 download file updated old version disk name download manager changes name file try open new file using download manager old version shown happend pdf files reproducible always steps reproduce 1 donwload pdf file 2 change content file 

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [0]:
import os

from keras.preprocessing.text import Tokenizer

def word_index_count(df):
  rows = []

  for d in [df['title_a'], df['title_b'], df['description_a'], df['description_b']]:
    added = [{'text' : r} for r in d.values]
    rows += added
  
  df = pd.DataFrame(data=rows, columns=['text'])['text']
  
  # Fill missing description or title
  df.fillna('', inplace=True)
  
  tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
  tokenizer.fit_on_texts(df)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  
  return word_index

In [23]:
df_x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298402 entries, 0 to 298401
Data columns (total 4 columns):
title_a          298373 non-null object
title_b          298378 non-null object
description_a    297934 non-null object
description_b    297963 non-null object
dtypes: object(4)
memory usage: 9.1+ MB


In [24]:
%%time

GLOVE_DIR = "drive/My Drive/Colab Notebooks/dataset/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.42B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))

word_index = word_index_count(df_x_train)

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Total 1917494 word vectors in Glove 42B 300d.
Found 211132 unique tokens.
CPU times: user 4min 7s, sys: 5.87 s, total: 4min 13s
Wall time: 10min 46s


## Auxiliary methods

### Plot ROC/AUC curve

### Plot validation accuracy and loss

In [0]:
def validation_accuracy_loss(history):
  acc=history.history['acc']
  val_acc=history.history['val_acc']
  loss=history.history['loss']
  val_loss=history.history['val_loss']

  plt.plot(acc, label='acc')
  plt.plot(val_acc, label='val_acc')
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

  plt.plot(loss, label='acc')
  plt.plot(val_loss, label='val_acc')
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

In [0]:
def show_model_output(valid_a, valid_b, valid_sim, model, nb_examples = 3):
    #pv_a, pv_b, pv_sim = gen_random_batch(test_groups, nb_examples)
    pred_sim = model.predict([valid_a, valid_b])
#     pred_sim = [1,1,1,1,1,1]
    for b_a, b_b, sim, pred in zip(valid_a, valid_b, valid_sim, pred_sim):
        key_a = ','.join(b_a.astype(str))
        key_b = ','.join(b_b.astype(str))
        print(sentence_dict[key_a])
        print(sentence_dict[key_b])
        print("similar=" + str(sim))
        print("prediction=" + str(pred[0]))
        print("########################")
    return valid_a, valid_b, valid_sim

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### CNN with filter 3,4,5

In [25]:
import keras
from keras.layers import GlobalMaxPooling1D 

keras.backend.clear_session()

def cnn_model(embeddings, num_words, embedding_dim, max_sequence_length, trainable):

  embedding_layer = Embedding(num_words,
                              embedding_dim,
                              weights=[embeddings],
                              input_length=max_sequence_length,
                              trainable=trainable)

  sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
  embedded_sequences = embedding_layer(sequence_input)

  # Yoon Kim model (https://arxiv.org/abs/1408.5882)
  convs = []
  filter_sizes = [3,5]

  for filter_size in filter_sizes:
      l_conv = Conv1D(filters=32, kernel_size=filter_size, activation='relu')(embedded_sequences)
      l_pool = MaxPooling1D(pool_size=3)(l_conv)
      convs.append(l_pool)

  # l_merge = Merge(mode='concat', concat_axis=1)(convs)

  l_merge = Concatenate(axis=1)(convs)

  # add a 1D convnet with global maxpooling, instead of Yoon Kim model
  conv = Conv1D(filters=64, kernel_size=3, activation='relu')(l_merge)
  pool = GlobalMaxPooling1D()(conv) # pool_size=3
  # Original Yoon Kim model
  #x = Flatten()(pool)
  #x = Dropout(0.5)(x)
  layer = Activation('relu')(pool)

  cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible
  cnn_feature_model.summary()
  
  return cnn_feature_model


cnn_feature_model = cnn_model(embeddings=embedding_matrix, 
                              num_words=len(word_index) + 1, 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Feature_BugInput (InputLayer)   (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     63339900    Feature_BugInput[0][0]           
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 198, 32)      28832       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 196, 32)      48032       embedding_1[0][0]                
__________________________________________________________________________________________________
max_poolin

### Bi-LSTM

In [26]:
%%time

from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional

def lstm_model(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
  number_lstm_units = 32
  rate_drop_lstm = 0.75
  recurrent_dropout = 0.25

  embedding_layer = Embedding(num_words,
                          embedding_dim,
                          weights=[embeddings],
                          input_length=max_sequence_length,
                          trainable=trainable)

  sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
  embedded_sequences = embedding_layer(sequence_input)

   # Creating LSTM Encoder
  lstm_layer = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))

  x = lstm_layer(embedded_sequences)

  layer = Activation('relu')(x)

  lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible
  lstm_feature_model.summary()
  
  return lstm_feature_model

lstm_feature_model = lstm_model(embeddings=embedding_matrix, 
                              num_words=len(word_index) + 1, 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Feature_BugInput (InputLayer (None, 40)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 40, 300)           63339900  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
Total params: 63,425,148
Trainable params: 85,248
Non-trainable params: 63,339,900
_________________________________________________________________
CPU times: user 1.18 s, sys: 56 ms, total: 1.23 s
Wall time: 1.31 s


### Siamese model

In [0]:
class CustomLayer(keras.layers.Layer):
  def call(self, inputs):
    bug_a = inputs[0]
    bug_b = inputs[1]
    loss = self.distance(bug_a, bug_b)

In [28]:
from keras.layers import concatenate, Add

def siamese_model(max_sequence_length_t, max_sequence_length_d):
  bug_t_a_in = Input(shape = (max_sequence_length_t, ), name = 'title_a')
  bug_t_b_in = Input(shape = (max_sequence_length_t, ), name = 'title_b')
  bug_d_a_in = Input(shape = (max_sequence_length_d, ), name = 'desc_a')
  bug_d_b_in = Input(shape = (max_sequence_length_d, ), name = 'desc_b')

  bug_d_a_feat_cnn = cnn_feature_model(bug_d_a_in)
  bug_d_b_feat_cnn = cnn_feature_model(bug_d_b_in)

  bug_t_a_feat_lstm = lstm_feature_model(bug_t_a_in)
  bug_t_b_feat_lstm = lstm_feature_model(bug_t_b_in)

  combined_features_a = concatenate([bug_t_a_feat_lstm, bug_d_a_feat_cnn], name = 'merge_features_a')
  combined_features_b = concatenate([bug_t_b_feat_lstm, bug_d_b_feat_cnn], name = 'merge_features_b')

  #combined = concatenate([combined_features_a, combined_features_b])
  combined = Add()([combined_features_a, combined_features_b])
  # combined_features = Dense(100, activation = 'linear')(combined)
  # combined_features = BatchNormalization()(combined_features)
  # combined_features = Activation('relu')(combined_features)
  # combined_features = Dense(4, activation = 'linear')(combined_features)
  # combined_features = BatchNormalization()(combined_features)
  # combined_features = Activation('relu')(combined_features)
  combined_features = Dense(1, activation = 'sigmoid')(combined)
  similarity_model = Model(inputs = [bug_t_a_in, bug_t_b_in, bug_d_a_in, bug_d_b_in], outputs = [combined_features], name = 'Similarity_Model')
  # setup the optimization process
  similarity_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy']) # 'binary_crossentropy'
  similarity_model.summary()
  
  return similarity_model

similarity_model = siamese_model(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
desc_a (InputLayer)             (None, 200)          0                                            
__________________________________________________________________________________________________
title_a (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
desc_b (InputLayer)             (None, 200)          0                                            
__________________________________________________________________________________________________
title_b (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
FeatureCNN

### Fit

In [29]:
%%time

# early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = similarity_model.fit_generator(train_gen, 
                               steps_per_epoch = 256,
                               validation_data=test_gen,
                                             epochs = 10,
                                             verbose = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 24min 31s, sys: 3min 18s, total: 27min 50s
Wall time: 20min 28s


In [0]:
pred_t_a, pred_t_b, pred_t_sim = gen_random_batch(test_groups, 10)
validation_accuracy_loss(h)
curve_roc_auc(similarity_model, x=[pred_t_a, pred_t_b], y_valid=pred_t_sim)
_ = show_model_output(pred_t_a, pred_t_b, pred_t_sim, similarity_model)

## MLP

In [0]:
# Configuration to experiments CNN dilated
embeddings = embedding_matrix
num_words = len(word_index)+1
embedding_dim = EMBEDDING_DIM
max_sequence_length = MAX_SEQUENCE_LENGTH
trainable = False
drop_rate = 0.7

In [0]:
%%time

keras.backend.clear_session()

embedding_layer = Embedding(num_words,
                        embedding_dim,
                        weights=[embeddings],
                        input_length=max_sequence_length,
                        trainable=trainable)

sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
embedded = embedding_layer(sequence_input)


x = Dense(1, activation='linear')(embedded)
#x = Reshape((EMBEDDING_DIM, 1))(embedded)
#x = Dense(EMBEDDING_DIM)(x)
# x = Dropout(drop_rate)(x)
# x = BatchNormalization()(x)
# layer = Activation('relu')(x)
#x = Dense(2)(x)
#x = BatchNormalization()(x)
layer = Flatten()(x)
#layer = Activation('relu')(x)
#yhat = Dense(1, activation='sigmoid')(x)

feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureGenerationModel') # inputs=visible
feature_model.summary()

In [0]:
from keras.layers import concatenate

bug_a_in = Input(shape = (max_sequence_length, ), name = 'title1')
bug_b_in = Input(shape = (max_sequence_length, ), name = 'title2')
bug_a_feat = feature_model(bug_a_in)
bug_b_feat = feature_model(bug_b_in)
combined_features = concatenate([bug_a_feat, bug_b_feat], name = 'merge_features')
combined_features = Dense(max_sequence_length, activation = 'linear')(combined_features)
combined_features = BatchNormalization()(combined_features)
combined_features = Dropout(drop_rate)(combined_features)
combined_features = Activation('relu')(combined_features)
combined_features = Dense(max_sequence_length//2, activation = 'linear')(combined_features)
combined_features = BatchNormalization()(combined_features)
combined_features = Dropout(drop_rate)(combined_features)
combined_features = Activation('relu')(combined_features)
combined_features = Dense(1, activation = 'sigmoid')(combined_features)
similarity_model = Model(inputs = [bug_a_in, bug_b_in], outputs = [combined_features], name = 'Similarity_Model')
# setup the optimization process
#adam = keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.1, amsgrad=False)
similarity_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
similarity_model.summary()

In [0]:
%%time

# early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = similarity_model.fit_generator(train_gen, 
                               steps_per_epoch = 512,
                               validation_data=test_gen,
                                             epochs = 10,
                                             verbose = True)

In [0]:
pred_t_a, pred_t_b, pred_t_sim = gen_random_batch(test_groups, 10)
validation_accuracy_loss(h)
curve_roc_auc(similarity_model, x=[pred_t_a, pred_t_b], y_valid=pred_t_sim)
_ = show_model_output(pred_t_a, pred_t_b, pred_t_sim, similarity_model)

## CNN dilatada

Arquitetura de https://github.com/kristpapadopoulos/seriesnet/blob/master/seriesnet-Krist-Papadopoulos-v1.pdf

In [0]:
import keras
keras.backend.clear_session()

In [0]:
from keras.callbacks import EarlyStopping

# estratégia de treino com parada antecipada consiste em parar o treino
# quando nenhuma melhoria no erro de validação (val_loss) é observada após
# "patience" épocas
earlystop = EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=10, verbose=1, mode='auto')
callback_list = [earlystop]

### Dilataçã*o* 2 a 32 (7 camadas com kernel de 2 e 3 cada)

Seriesnet Krist Papadopoulos

In [0]:
def DC_CNN_Block(nb_filter, filter_length, dilation, l2_layer_reg):
    def block(block_input):        
        residual =    block_input
        
        layer_out =   Conv1D(filters=nb_filter, kernel_size=filter_length, 
                      dilation_rate=dilation, 
                      activation='linear', padding='causal', use_bias=False,
                      kernel_initializer=TruncatedNormal(mean=0.0, stddev=0.05, 
                      seed=42), kernel_regularizer=l2(l2_layer_reg))(block_input)                    
        selu_out =    Activation('selu')(layer_out)
        
        skip_out =    Conv1D(1,1, activation='linear', use_bias=False, 
                      kernel_initializer=TruncatedNormal(mean=0.0, stddev=0.05, 
                      seed=42), kernel_regularizer=l2(l2_layer_reg))(selu_out)
        
        c1x1_out =    Conv1D(1,1, activation='linear', use_bias=False, 
                      kernel_initializer=TruncatedNormal(mean=0.0, stddev=0.05, 
                      seed=42), kernel_regularizer=l2(l2_layer_reg))(selu_out)
                      
        block_out =   Add()([residual, c1x1_out])
        
        return block_out, skip_out
    return block

### Generate features

In [0]:
# Configuration to experiments CNN dilated
embeddings = embedding_matrix
num_words = len(word_index)+1
embedding_dim = EMBEDDING_DIM
max_sequence_length = MAX_SEQUENCE_LENGTH
trainable = False

In [0]:
%%time

keras.backend.clear_session()

embedding_layer = Embedding(num_words,
                        embedding_dim,
                        weights=[embeddings],
                        input_length=max_sequence_length,
                        trainable=trainable)

sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
embedded_sequences = embedding_layer(sequence_input)

units = 32

l1a, l1b = DC_CNN_Block(units,2,1,0.01)(embedded_sequences)    
l2a, l2b = DC_CNN_Block(units,2,2,0.01)(l1a) 
l3a, l3b = DC_CNN_Block(units,2,4,0.01)(l2a)
l4a, l4b = DC_CNN_Block(units,2,8,0.01)(l3a)
l5a, l5b = DC_CNN_Block(units,2,16,0.01)(l4a)
l6a, l6b = DC_CNN_Block(units,2,32,0.01)(l5a)
l7a, l7b = DC_CNN_Block(units,2,64,0.01)(l6a)

l8 =   Add()([l1b, l2b, l3b, l4b, l5b, l6b, l7b])

l9 =   Activation('relu')(l8)

x =  Conv1D(1,1, activation='linear', use_bias=False, 
       kernel_initializer=TruncatedNormal(mean=0.0, stddev=0.05, seed=42),
       kernel_regularizer=l2(0.001))(l9)

x = Flatten()(x)
x = Dropout(0.25)(x)
x = Dense(MAX_SEQUENCE_LENGTH)(x)
#x = Dropout(0.45)(x)
x = BatchNormalization()(x)
layer = Activation('relu')(x)
#x = Dense(2)(x)
#x = BatchNormalization()(x)
#layer = Activation('relu')(x)
#yhat = Dense(1, activation='sigmoid')(x)

feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureGenerationModel') # inputs=visible
feature_model.summary()

### Modelo siamês

In [0]:
from keras.layers import concatenate

bug_a_in = Input(shape = (max_sequence_length, ), name = 'title1')
bug_b_in = Input(shape = (max_sequence_length, ), name = 'title2')
bug_a_feat = feature_model(bug_a_in)
bug_b_feat = feature_model(bug_b_in)
combined_features = concatenate([bug_a_feat, bug_b_feat], name = 'merge_features')
combined_features = Dense(MAX_SEQUENCE_LENGTH, activation = 'linear')(combined_features)
combined_features = BatchNormalization()(combined_features)
combined_features = Activation('relu')(combined_features)
combined_features = Dense(MAX_SEQUENCE_LENGTH//2, activation = 'linear')(combined_features)
combined_features = BatchNormalization()(combined_features)
combined_features = Activation('relu')(combined_features)
combined_features = Dense(1, activation = 'sigmoid')(combined_features)
similarity_model = Model(inputs = [bug_a_in, bug_b_in], outputs = [combined_features], name = 'Similarity_Model')
# setup the optimization process
#adam = keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.1, amsgrad=False)
similarity_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
similarity_model.summary()

### Fit

In [0]:
%%time

# early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = similarity_model.fit_generator(train_gen, 
                               steps_per_epoch = 512,
                               validation_data=test_gen,
                                             epochs = 10,
                                             verbose = True)

In [0]:
pred_t_a, pred_t_b, pred_t_sim = gen_random_batch(test_groups, 10)
validation_accuracy_loss(h)
curve_roc_auc(similarity_model, x=[pred_t_a, pred_t_b], y_valid=pred_t_sim)
_ = show_model_output(pred_t_a, pred_t_b, pred_t_sim, similarity_model)

## ARCII-for-Matching-Natural-Language-Sentences

https://github.com/ddddwy/ARCII-for-Matching-Natural-Language-Sentences

In [0]:
embed_size=EMBEDDING_DIM
text1_maxlen=MAX_SEQUENCE_LENGTH
text2_maxlen=MAX_SEQUENCE_LENGTH
filters_1d=text2_maxlen
num_words = len(word_index)+1
kernel_size_1d=3
num_conv2d_layers=2
filters_2d=[MAX_SEQUENCE_LENGTH,MAX_SEQUENCE_LENGTH//2]
kernel_size_2d=[[3,3], [3,3]]
mpool_size_2d=[[2,2], [2,2]]
dropout_rate=.5

In [0]:
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers.core import Dense, Reshape, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers import concatenate
from keras import backend as K

keras.backend.clear_session()

query=Input(shape=(text1_maxlen,), name = 'title1')
doc=Input(shape=(text2_maxlen,), name = 'title2')

embedding = Embedding(num_words, embed_size, weights=[embedding_matrix], trainable=True)
q_embed = embedding(query)
d_embed = embedding(doc)

layer1_input=concatenate([q_embed, d_embed])

layer1_conv=Conv1D(filters=filters_1d, kernel_size=kernel_size_1d, padding='same')(layer1_input)
layer1_activation=Activation('relu')(layer1_conv)
layer1_reshaped=Reshape((text1_maxlen, text2_maxlen, -1))(layer1_activation)
z=MaxPooling2D(pool_size=(2,2))(layer1_reshaped)
residual = Flatten()(z)

for i in range(num_conv2d_layers):
    z=Conv2D(filters=filters_2d[i], kernel_size=kernel_size_2d[i], padding='same')(z)
    z=Activation('relu')(z)
    z=MaxPooling2D(pool_size=(mpool_size_2d[i][0], mpool_size_2d[i][1]))(z)

# pool1_flat=Flatten()(z)
# residual = Reshape((1, K.int_shape(residual)[1]))(residual)
# pool1_flat = Reshape((K.int_shape(pool1_flat)[1], 1))(pool1_flat)
# residual=Add()([residual, pool1_flat]) # residual
# residual=Flatten()(residual)
# shape_after_flatten = K.int_shape(residual)

pool1_flat=Flatten()(z)

pool1_flat_drop=Dropout(rate=dropout_rate)(pool1_flat)
pool1_norm=BatchNormalization()(pool1_flat_drop)
mlp1=Dense(MAX_SEQUENCE_LENGTH)(pool1_norm)
mlp1=Activation('relu')(mlp1)
out=Dense(1, activation='sigmoid')(mlp1)

model=Model(inputs=[query, doc], outputs=out)
#adam = keras.optimizers.Adam(lr=0.1, beta_1=0.7, beta_2=0.7, epsilon=None, decay=0.1, amsgrad=False)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

#### Fit

In [0]:
%%time

h = model.fit_generator(train_gen, 
                               steps_per_epoch = 1000,
                               validation_data=test_gen,
                                             epochs = 10,
                                             verbose = 1)

In [0]:
pred_t_a, pred_t_b, pred_t_sim = gen_random_batch(test_groups, 10)
validation_accuracy_loss(h)
curve_roc_auc(model, x=[pred_t_a, pred_t_b], y_valid=pred_t_sim)
_ = show_model_output(pred_t_a, pred_t_b, pred_t_sim, model)

## Bi-LSTM siamese

https://github.com/amansrivastava17/lstm-siamese-text-similarity

In [0]:
%%time

from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional

keras.backend.clear_session()

number_lstm_units = 2
rate_drop_lstm = 0
recurrent_dropout = 0

embedding_layer = Embedding(num_words,
                        embedding_dim,
                        weights=[embeddings],
                        input_length=max_sequence_length,
                        trainable=trainable)

sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
embedded_sequences = embedding_layer(sequence_input)

 # Creating LSTM Encoder
lstm_layer = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))

x = lstm_layer(embedded_sequences)

layer = Activation('tanh')(x)

feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureGenerationModel') # inputs=visible
feature_model.summary()

In [0]:
from keras.layers import concatenate

number_dense_units = 32
rate_drop_dense = 0.2

bug_a_in = Input(shape = (max_sequence_length, ), name = 'title1')
bug_b_in = Input(shape = (max_sequence_length, ), name = 'title2')
bug_a_feat = feature_model(bug_a_in)
bug_b_feat = feature_model(bug_b_in)
combined_features = concatenate([bug_a_feat, bug_b_feat], name = 'merge_features')
merged = BatchNormalization()(combined_features)
merged = Dropout(rate_drop_dense)(merged)
merged = Dense(number_dense_units, activation='relu')(merged)
merged = Dense(1, activation = 'sigmoid')(merged)
similarity_model = Model(inputs = [bug_a_in, bug_b_in], outputs = [merged], name = 'Similarity_Model')
# setup the optimization process
similarity_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
similarity_model.summary()

### Fit

In [0]:
%%time

# early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = similarity_model.fit_generator(train_gen, 
                               steps_per_epoch = 512,
                               validation_data=test_gen,
                                             epochs = 10,
                                             verbose = True)

In [0]:
pred_t_a, pred_t_b, pred_t_sim = gen_random_batch(test_groups, 10)
validation_accuracy_loss(h)
curve_roc_auc(similarity_model, x=[pred_t_a, pred_t_b], y_valid=pred_t_sim)
_ = show_model_output(pred_t_a, pred_t_b, pred_t_sim, similarity_model)

## Filtros de 3, 4, 5

https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/

Yoon Kim model (https://arxiv.org/abs/1408.5882)


In [0]:
embeddings = embedding_matrix
num_words = len(word_index) + 1
embedding_dim = EMBEDDING_DIM
max_sequence_length = MAX_SEQUENCE_LENGTH
trainable = False

embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=trainable)

sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Yoon Kim model (https://arxiv.org/abs/1408.5882)
convs = []
filter_sizes = [3,4,5]

for filter_size in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(pool_size=3)(l_conv)
    convs.append(l_pool)

# l_merge = Merge(mode='concat', concat_axis=1)(convs)

l_merge = Concatenate(axis=1)(convs)

# add a 1D convnet with global maxpooling, instead of Yoon Kim model
conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
pool = MaxPooling1D(pool_size=3)(conv)

# Original Yoon Kim model
x = Dropout(0.5)(pool)
   
x = Flatten()(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(2, activation='relu')(x)
# Finally, we feed the output into a Sigmoid layer.
# The reason why sigmoid is used is because we are trying to achieve a binary classification(1,0) 
# for each of the 6 labels, and the sigmoid function will squash the output between the bounds of 0 and 1.
preds = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [0]:
model.summary()

In [0]:
%%time

early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = model.fit(X, y, epochs=15, verbose=1, validation_split=0.30, batch_size=256, callbacks=[early])

In [0]:
plt.plot(h.history['loss'], label='train')
plt.plot(h.history['val_loss'], label='val')
plt.legend()

## Dilatação com 32, 64 e 128 neurônios com 3 camadas de dilatação 1, 2, 3 e kernel 3

https://www.kaggle.com/kmader/text-classification-with-atrous-convolutions

In [0]:
%%time

# Configuration to CNN dilated
embeddings = embedding_matrix
num_words = len(word_index)+1
embedding_dim = EMBEDDING_DIM
max_sequence_length = MAX_SEQUENCE_LENGTH
trainable = True

max_dilation_rate = 3
        
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=trainable)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Dropout(0.25)(embedded_sequences)
x = Conv1D(2 * 128, 
               kernel_size = 3)(x)
prefilt_x = Conv1D(128, 
               kernel_size = 3)(x)
out_conv = []
# dilation rate lets us use ngrams and skip grams to process 
for dilation_rate in range(max_dilation_rate):
    x = prefilt_x
    for i in range(3):
        x = Conv1D(32*2**(i), 
                   kernel_size = 3, 
                   dilation_rate = dilation_rate+1)(x)    
    out_conv += [Dropout(0.8)(GlobalMaxPool1D()(x))] # 
# x = concatenate(out_conv, axis = -1)  
x = Concatenate(axis=-1)(out_conv)
x = Dense(32, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(2, activation="sigmoid")(x)

preds = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [0]:
model.summary()

In [0]:
%%time

early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = model.fit(X, y, epochs=15, verbose=1, validation_split=0.30, batch_size=256, callbacks=[early])

In [0]:
plt.plot(h.history['loss'], label='train')
plt.plot(h.history['val_loss'], label='val')
plt.legend()