# Bug triage with Deep Learning

In [1]:
from __future__ import print_function, division

In [2]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [3]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Dataset bugs

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

### Parse bugs preproprecessed

In [5]:
DIR = ''

In [6]:
df_train_pair = pd.read_csv('train_mozilla_firefox.csv')

In [7]:
bug_ids = []
with open(os.path.join(DIR, 'bug_ids.txt'), 'r') as f:
    for row in f:
        bug_ids.append(int(row))

### Dicionário de títulos e descrições

In [8]:
%%time

import pickle as pickle

def padding_embed(max_char, field, bug):
    n = len(bug[field])
    if (max_char - n) > 0: # desc or title
        embed = np.empty(max_char - n)
        embed.fill(0)
        embed = np.concatenate([embed, bug[field]], axis=-1)
        embed = embed.astype(int)
    else:
        embed = np.array(bug[field][:max_char])
    return embed

sentence_dict = {}
corpus = []

for bug_id in bug_ids:
    bug = pickle.load(open(os.path.join('bugs', '{}.pkl'.format(bug_id)), 'rb'))
#     print(str(bug['title_word']))
    title = padding_embed(40, 'title_word', bug)
    desc = padding_embed(200, 'description_word', bug)
    #print(len(title), len(desc))
    #print(",".join(title.astype(str)))
    sentence_dict[",".join(title.astype(str))] = bug['title']
    sentence_dict[",".join(desc.astype(str))] = bug['description']
    corpus.append(bug['title'])
    corpus.append(bug['description'])
#     break

Wall time: 6min 28s


In [9]:
len(sentence_dict)

182244

## Configurações Globais

In [10]:
MAX_SEQUENCE_LENGTH_T = 40
MAX_SEQUENCE_LENGTH_D = 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

## Geração de batches

### Generating tiple of batches

In [41]:
import random
import _pickle as pickle
import numpy as np
from tqdm import tqdm
import os

train_data, bug_ids, dup_sets = None, None, None

def get_neg_bug(invalid_bugs, bug_ids):
  neg_bug = random.choice(bug_ids)
  while neg_bug in invalid_bugs:
    neg_bug = random.choice(bug_ids)
  return neg_bug

def read_train_data(data):
  data_pairs = []
  data_dup_sets = {}
  print('Reading train data')
  with open(os.path.join(data, 'train.txt'), 'r') as f:
    for line in f:
      bug1, bug2 = line.strip().split()
      data_pairs.append([int(bug1), int(bug2)])
      if int(bug1) not in data_dup_sets.keys():
        data_dup_sets[int(bug1)] = set()
      data_dup_sets[int(bug1)].add(int(bug2))
  return data_pairs, data_dup_sets

def read_bug_ids(data):
  bug_ids = []
  print('Reading bug ids')
  with open(os.path.join(data, 'bug_ids.txt'), 'r') as f:
    for line in f:
      bug_ids.append(int(line.strip()))
  return bug_ids

# data - path
def prepare_dataset(data):
  global train_data
  global dup_sets
  global bug_ids
  if not train_data:
    train_data, dup_sets = read_train_data(data)
    #print(len(train_data))
  if not bug_ids:
    bug_ids = read_bug_ids(data)

def siam_gen(data, batch_size, n_neg):
   input_sample, input_pos, input_neg, sim = batch_iterator(data, batch_size, n_neg)
   yield { 'title_in' : input_sample['title'], 'title_pos': input_pos['title'], 'title_neg' : input_neg['title'],
               'desc_in' : input_sample['description'], 'desc_pos' : input_pos['description'], 'desc_neg' : input_neg['description'] }, sim

In [70]:
bug_set = {}

for bug_id in bug_ids:
    bug_set[bug_id] = pickle.load(open(os.path.join(bug_dir, 'bugs', '{}.pkl'.format(bug_id)), 'rb'))
    
len(bug_set)

92651

In [71]:
def read_batch_triplets(batch_triplets, data):
  batch_input_bugs = []
  batch_pos_bugs = []
  batch_neg_bugs = []
  for triplet in batch_triplets:
    batch_input_bugs.append(triplet[0])
    batch_pos_bugs.append(triplet[1])
    batch_neg_bugs.append(triplet[2])
  return read_batch_bugs(batch_input_bugs, data), \
         read_batch_bugs(batch_pos_bugs, data), \
         read_batch_bugs(batch_neg_bugs, data)

def read_batch_bugs(batch_bugs, data, test=False):
  global bug_set
  desc_word = []
  short_desc_word = []
  for bug_id in batch_bugs:
    #bug = pickle.load(open(os.path.join(data, 'bugs', '{}.pkl'.format(bug_id)), 'rb'))
    bug = bug_set[bug_id]
    desc_word.append(bug['description_word'])
    short_desc_word.append(bug['title_word'])
    
  desc_word = data_padding(desc_word, 200)
  short_desc_word = data_padding(short_desc_word, 40)
  batch_bugs = dict()
  batch_bugs['desc'] = (desc_word)
  batch_bugs['title'] = (short_desc_word)

  return batch_bugs

def data_padding(data, max_seq_length):
  seq_lengths = [len(seq) for seq in data]
  seq_lengths.append(6)
  #max_seq_length = max_seq_length
  #print(seq_lengths)
  #max_seq_length = min(max(seq_lengths), max_seq_length)
  padded_data = np.zeros(shape=[len(data), max_seq_length])
  for i, seq in enumerate(data):
    seq = seq[:max_seq_length]
    padding_end = max_seq_length - len(seq)
    #print(seq)
    for j, token in enumerate(seq):
      # padding_end = padding_end + j
#       print(padding_end + j)
      padded_data[i, padding_end + j] = int(token)
  return padded_data.astype(np.int)

In [72]:
# data - path
# batch_size - 128
# n_neg - 1
def batch_iterator(data, batch_size, n_neg):
  global train_data
  global dup_sets
  global bug_ids
  random.shuffle(train_data)
  num_batches = int(len(train_data) / batch_size)
  if len(data) % batch_size > 0:
    num_batches += 1
  # print(num_batches)
  # loop = tqdm(range(num_batches))
  # loop.set_description('Training')
  for i in range(num_batches):
    batch_triplets = []
    for j in range(batch_size):
      offset = batch_size * i + j
      if offset >= len(train_data):
        break
      for i in range(n_neg):
        neg_bug = get_neg_bug(dup_sets[train_data[offset][0]], bug_ids)
        batch_triplets.append([train_data[offset][0], train_data[offset][1], neg_bug])
    #yield loop, read_batch_triplets(batch_triplets, data)
    batch_input, batch_pos, batch_neg = read_batch_triplets(batch_triplets, data)
    n_half = batch_size // 2
    pos = np.full((1, n_half), 1)
    neg = np.full((1, n_half), 0)
    sim = np.concatenate([pos, neg], -1)[0]

#     title_a, title_b, desc_a, desc_b = [], [], [], []
    
#     print(dir(batch_input['title'][:n_half]))
    
#     title_a += [row for row in batch_input['title'][:n_half]]
#     title_b += [row for row in batch_pos['title'][:n_half]]
#     title_a += [row for row in batch_input['title'][:n_half]]
#     title_b += [row for row in batch_neg['title'][:n_half]]

#     desc_a += [row for row in batch_input['desc'][:n_half]]
#     desc_b += [row for row in batch_pos['desc'][:n_half]]
#     desc_a += [row for row in batch_input['desc'][:n_half]]
#     desc_b += [row for row in batch_neg['desc'][:n_half]]
    
#     print("Title a", np.array(title_a))
    
    
#     return np.stack(title_a, 0), np.stack(title_b, 0), np.stack(desc_a, 0), np.stack(desc_b, 0), sim.reshape(-1)

    input_sample, input_pos, input_neg = {}, {}, {}
    
    input_sample = { 'title' : batch_input['title'], 'description' : batch_input['desc'] }
    input_pos = { 'title' : batch_pos['title'], 'description' : batch_pos['desc'] }
    input_neg = { 'title' : batch_neg['title'], 'description' : batch_neg['desc'] }
    
    return input_sample, input_pos, input_neg, sim 

In [44]:
%%time

bug_dir = os.path.join(DIR)

prepare_dataset(bug_dir)

Reading train data
Reading bug ids
Wall time: 2.32 s


In [74]:
%%time

batch_size = 128

# we want a constant validation group to have a frame of reference for model performance
train_gen = siam_gen(bug_dir, batch_size, 1)
valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = batch_iterator(bug_dir, 64, 1)
test_gen = ([valid_input_sample['title'], valid_input_pos['title'], valid_input_neg['title'], 
             valid_input_sample['description'], valid_input_pos['description'], valid_input_neg['description']], valid_sim)

Wall time: 1.05 s


In [75]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_sim.shape

((64, 40), (64, 200), (64,))

### Validar entrada

In [76]:
def display_batch(groups, nb):
  input_sample, input_pos, input_neg, v_sim = batch_iterator(groups, nb, 1)

  t_a, t_b, d_a, d_b = [], [], [], []
  
  t_a = input_sample['title']
  t_b = input_pos['title']
  d_a = input_sample['description']
  d_b = input_pos['description']
#   v_sim = v_sim[0]
  
  for ta, tb, da, db, sim in zip(t_a, t_b, d_a, d_b, v_sim):
    #print(ta.astype(str))
    key_t_a = ','.join(ta.astype(str))
    key_t_b = ','.join(tb.astype(str))
    key_d_a = ','.join(da.astype(str))
    key_d_b = ','.join(db.astype(str))
    print("Title =", sentence_dict[key_t_a])
    print("Title =", sentence_dict[key_t_b])
    print("Description =", sentence_dict[key_d_a])
    print("Description =", sentence_dict[key_d_b])
    print("similar =", str(sim))
    print("########################")

In [77]:
%%time 

display_batch(bug_dir, 3)

Title = installer ui badly needs replacement
Title = unable to create new folder with custom install winnumber ntnumber
Description = user agent mozilla number windows u windows nt number en us rv number gecko number firefox number build identifier mozilla number windows u windows nt number en us rv number gecko number firefox number it is very difficult to use firefoxs installer to install firefox to a custom location it is so bad that it uncovered the latent dataloss bug that had lain hidden for years in netscapes and mozillas installer specific problems number no editable location line an editable location line would for example allow a user to simply replace c program files mozilla firefox with d program files mozilla firefox number the creation of an installation directory is not automatic number a most installers allow you to navigate to a master directory and create a default installation subdirectory for example one should be able to navigate to d program files mozilla select t

#### Train

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [34]:
import os

from keras.preprocessing.text import Tokenizer

def word_index_count(corpus):
  
  tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
  tokenizer.fit_on_texts(corpus)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  
  return word_index

In [35]:
%%time

GLOVE_DIR = ""
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.42B.300d.txt'), 'rb')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 42B 300d.' % len(embeddings_index))

word_index = word_index_count(corpus)

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Total 1917494 word vectors in Glove 42B 300d.
Found 138251 unique tokens.
Wall time: 11min 35s


## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### CNN with filter 3,4,5

In [78]:
import keras
from keras.layers import GlobalMaxPooling1D 

keras.backend.clear_session()

def cnn_model(embeddings, num_words, embedding_dim, max_sequence_length, trainable):

  embedding_layer = Embedding(num_words,
                              embedding_dim,
                              weights=[embeddings],
                              input_length=max_sequence_length,
                              trainable=trainable)

  sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
  embedded_sequences = embedding_layer(sequence_input)

  # Yoon Kim model (https://arxiv.org/abs/1408.5882)
  convs = []
  filter_sizes = [3,5]

  for filter_size in filter_sizes:
      l_conv = Conv1D(filters=32, kernel_size=filter_size, activation='relu')(embedded_sequences)
      l_pool = MaxPooling1D(pool_size=3)(l_conv)
      convs.append(l_pool)

  # l_merge = Merge(mode='concat', concat_axis=1)(convs)

  l_merge = Concatenate(axis=1)(convs)

  # add a 1D convnet with global maxpooling, instead of Yoon Kim model
  conv = Conv1D(filters=64, kernel_size=3, activation='relu')(l_merge)
  pool = GlobalMaxPooling1D()(conv) # pool_size=3
  # Original Yoon Kim model
  #x = Flatten()(pool)
  #x = Dropout(0.5)(x)
  layer = Activation('relu')(pool)

  cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible
  cnn_feature_model.summary()
  
  return cnn_feature_model


cnn_feature_model = cnn_model(embeddings=embedding_matrix, 
                              num_words=len(word_index) + 1, 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Feature_BugInput (InputLayer)   (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     41475600    Feature_BugInput[0][0]           
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 198, 32)      28832       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 196, 32)      48032       embedding_1[0][0]                
__________________________________________________________________________________________________
max_poolin

### Bi-LSTM

In [79]:
%%time

from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional

def lstm_model(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
  number_lstm_units = 32
  rate_drop_lstm = 0.75
  recurrent_dropout = 0.25

  embedding_layer = Embedding(num_words,
                          embedding_dim,
                          weights=[embeddings],
                          input_length=max_sequence_length,
                          trainable=trainable)

  sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
  embedded_sequences = embedding_layer(sequence_input)

   # Creating LSTM Encoder
  lstm_layer = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))

  x = lstm_layer(embedded_sequences)

  layer = Activation('relu')(x)

  lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible
  lstm_feature_model.summary()
  
  return lstm_feature_model

lstm_feature_model = lstm_model(embeddings=embedding_matrix, 
                              num_words=len(word_index) + 1, 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Feature_BugInput (InputLayer (None, 40)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 40, 300)           41475600  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
Total params: 41,560,848
Trainable params: 85,248
Non-trainable params: 41,475,600
_________________________________________________________________
Wall time: 3.38 s


### Siamese model

In [80]:
from keras import backend as K

class MarginLoss(keras.layers.Layer):
  def call(self, inputs):
    bug_in, bug_pos, bug_neg  = inputs
    loss = self.distance(bug_in, bug_pos, bug_neg)
    self.add_loss(loss, inputs=inputs)
    return bug_in
  def distance(self, bug_in, bug_pos, bug_neg):
    dis_pos, dis_neg = self.cos_distance(bug_in, bug_pos), self.cos_distance(bug_in, bug_neg)
    ep = 1
    d1 = K.maximum(0.0, dis_pos - dis_neg + ep)
    return K.mean(d1)
  
  def euclidean_sim(bug_in, bug_pos, bug_neg):
    dis_pos = K.sum(K.square(bug_in - bug_pos), axis=1, keepdims=True)
    dis_neg = K.sum(K.square(bug_in - bug_neg), axis=1, keepdims=True)
    dis_pos = K.sqrt(dis_pos)
    dis_neg = K.sqrt(dis_neg)
    return dis_pos, dis_neg
  
  def l2_normalize(self, x, axis):
        norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        return K.maximum(x, K.epsilon()) / K.maximum(norm, K.epsilon())
  # https://github.com/keras-team/keras/issues/3031
  def cos_distance(self, y_true, y_pred):
    y_true = self.l2_normalize(y_true, axis=-1)
    y_pred = self.l2_normalize(y_pred, axis=-1)
    return K.mean(1 - K.sum((y_true * y_pred), axis=-1))
  
def identity_loss(y_true, y_pred):
    return K.mean(y_pred - 0 * y_true)
  
# define the margin loss like hinge loss
def margin_loss(y_true, y_pred):
    lamb, margin = 0.5, 0.1
    return K.sum(y_true * K.square(K.relu(1 - margin - y_pred)) + lamb * (
        1 - y_true) * K.square(K.relu(y_pred - margin)), axis=-1)

In [81]:
from keras.layers import concatenate, Add, Lambda
from keras.optimizers import Adam

def siamese_model(max_sequence_length_t, max_sequence_length_d):
  
  bug_t_in = Input(shape = (max_sequence_length_t, ), name = 'title_in')
  bug_t_pos = Input(shape = (max_sequence_length_t, ), name = 'title_pos')
  bug_t_neg = Input(shape = (max_sequence_length_t, ), name = 'title_neg')
  
  bug_d_in = Input(shape = (max_sequence_length_d, ), name = 'desc_in')
  bug_d_pos = Input(shape = (max_sequence_length_d, ), name = 'desc_pos')
  bug_d_neg = Input(shape = (max_sequence_length_d, ), name = 'desc_neg')

  bug_t_in_feat_lstm = lstm_feature_model(bug_t_in)
  bug_t_pos_feat_lstm = lstm_feature_model(bug_t_pos)
  bug_t_neg_feat_lstm = lstm_feature_model(bug_t_neg)
  
  bug_d_in_feat_cnn = cnn_feature_model(bug_d_in)
  bug_d_pos_feat_cnn = cnn_feature_model(bug_d_pos)
  bug_d_neg_feat_cnn = cnn_feature_model(bug_d_neg)

  encoded_anchor = concatenate([bug_t_in_feat_lstm, bug_d_in_feat_cnn], name = 'merge_features_in')
  encoded_positive = concatenate([bug_t_pos_feat_lstm, bug_d_pos_feat_cnn], name = 'merge_features_pos')
  encoded_negative = concatenate([bug_t_neg_feat_lstm, bug_d_neg_feat_cnn], name = 'merge_features_neg')

  #combined = concatenate([combined_features_a, combined_features_b])
#   combined_in_pos = Add()([encoded_anchor, encoded_positive])
#   combined_in_neg = Add()([encoded_anchor, encoded_negative])
#   combined = Add()([combined_in_pos, combined_in_neg])

  
  loss = MarginLoss()([encoded_anchor, encoded_positive, encoded_negative])
  
  # combined_features = Dense(100, activation = 'linear')(combined)
  # combined_features = BatchNormalization()(combined_features)
  # combined_features = Activation('relu')(combined_features)
  # combined_features = Dense(4, activation = 'linear')(combined_features)
  # combined_features = BatchNormalization()(combined_features)
#   combined_features = Activation('relu')(combined_features)
  loss = Dense(2, activation = 'softmax')(loss)
  
  
  # Implementation https://stackoverflow.com/questions/52306282/implementing-triplet-loss-inside-keras-layers
  # https://github.com/maciejkula/triplet_recommendations_keras
#   margin = 1
  
#   DAP = Lambda(lambda tensors:K.sum(K.square(tensors[0] - tensors[1]),axis=1,keepdims=True),name='DAP_loss') #Distance for Anchor-Positive pair
#   DAN = Lambda(lambda tensors:K.sum(K.square(tensors[0] - tensors[1]),axis=1,keepdims=True),name='DAN_loss') #Distance for Anchor-Negative pair
#   Triplet_loss = Lambda(lambda loss: K.maximum(loss[0] - loss[1] + margin, 0.0), name='Triplet_loss') #Distance for Anchor-Negative pair

#   DAP_loss = DAP([encoded_anchor,encoded_positive])
#   DAN_loss = DAN([encoded_anchor,encoded_negative])

#   #call this layer on list of two input tensors.
#   Final_loss = Triplet_loss([DAP_loss,DAN_loss])
  
  adam = Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
  
  similarity_model = Model(inputs = [bug_t_in, bug_t_pos, bug_t_neg, bug_d_in, bug_d_pos, bug_d_neg], outputs = loss, name = 'Similarity_Model')
  # setup the optimization process 
  similarity_model.compile(optimizer=adam, loss = margin_loss, metrics = ['accuracy']) # 'binary_crossentropy'
  similarity_model.summary()
  
  return similarity_model

similarity_model = siamese_model(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_in (InputLayer)           (None, 40)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
title_pos (InputLayer)          (None, 40)           0                                            
__________________________________________________________________________________________________
desc_pos (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
title_neg 

### Fit

In [82]:
%%time

# early = EarlyStopping(monitor='loss', patience = 5, min_delta=0, verbose=0)

h = similarity_model.fit_generator(train_gen, 
                               steps_per_epoch = 10,
                               validation_data=test_gen,
                                             epochs = 10,
                                             verbose = True) # validation_steps=10

Epoch 1/10
 1/10 [==>...........................] - ETA: 3:52 - loss: 1.2675 - acc: 0.9531

StopIteration: 

In [25]:
# pred_t_a, pred_t_b, pred_t_sim = gen_random_batch(test_groups, 10)
# validation_accuracy_loss(h)
# curve_roc_auc(similarity_model, x=[pred_t_a, pred_t_b], y_valid=pred_t_sim)
# _ = show_model_output(pred_t_a, pred_t_b, pred_t_sim, similarity_model)

## Auxiliary methods

### Plot ROC/AUC curve

### Plot validation accuracy and loss

In [26]:
def validation_accuracy_loss(history):
  acc=history.history['acc']
  val_acc=history.history['val_acc']
  loss=history.history['loss']
  val_loss=history.history['val_loss']

  plt.plot(acc, label='acc')
  plt.plot(val_acc, label='val_acc')
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

  plt.plot(loss, label='acc')
  plt.plot(val_loss, label='val_acc')
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

In [27]:
from sklearn.metrics import roc_auc_score
import sklearn.metrics

def curve_roc_auc(model, x, y_valid):
  y_hat = model.predict(x)
  pct_auc = roc_auc_score(y_valid, y_hat) * 100
  #print('ROC/AUC: {:0.2f}'.format(pct_auc))

  fpr, tpr, _ = sklearn.metrics.roc_curve(y_valid, y_hat)
  roc_auc = sklearn.metrics.auc(fpr, tpr)
  plt.figure()
  lw = 2
  plt.plot(fpr, tpr, color='darkorange',
           lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
  plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('Taxa de Falsos Positivos')
  plt.ylabel('Taxa de Verdadeiros Positivos')
  plt.title('Receiver Operating Characteristic (ROC)')
  plt.legend(loc="lower right")
  plt.show()

In [28]:
def show_model_output(valid_a, valid_b, valid_sim, model, nb_examples = 3):
    #pv_a, pv_b, pv_sim = gen_random_batch(test_groups, nb_examples)
    pred_sim = model.predict([valid_a, valid_b])
#     pred_sim = [1,1,1,1,1,1]
    for b_a, b_b, sim, pred in zip(valid_a, valid_b, valid_sim, pred_sim):
        key_a = ','.join(b_a.astype(str))
        key_b = ','.join(b_b.astype(str))
        print(sentence_dict[key_a])
        print(sentence_dict[key_b])
        print("similar=" + str(sim))
        print("prediction=" + str(pred[0]))
        print("########################")
    return valid_a, valid_b, valid_sim