# Bug triage with Deep Learning

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline

## Auxiliary methods

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Glove embeddings
GLOVE_DIR='data/embed'
# Log keras
LOG_DIR='logs/training'
# Checkpoint keras
FILE_PATH = "checkpoint_baseline_1000epoch_10steps_1024batch({})".format(DOMAIN)
# Save model
SAVE_PATH = 'baseline_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

#### Loading bug ids in memory

In [9]:
baseline.load_ids(DIR)
len(baseline.bug_ids)

212512

### Dicionário de títulos e descrições

In [10]:
%%time

baseline.load_preprocess()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


CPU times: user 1min 19s, sys: 1.99 s, total: 1min 21s
Wall time: 1min 21s


## Geração de batches

### Generating tiple of batches

In [11]:
%%time
baseline.prepare_dataset()

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data
CPU times: user 397 ms, sys: 8.14 ms, total: 405 ms
Wall time: 398 ms


In [12]:
baseline.load_bugs()

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




In [13]:
baseline.bug_set[2521]

{'bug_severity': '3\n',
 'bug_status': '2\n',
 'component': '449\n',
 'creation_ts': '2001-10-10 22:38:00 -0400',
 'delta_ts': '2005-05-10 14:55:51 -0400',
 'description': 'steps number person all your windows number go to any window and select the location menu number person any window organization that it only gets selected and not maximized this happens in country as well organization',
 'description_word': array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  

In [14]:
%%time

batch_size = 64
batch_size_test = 1024

# we want a constant validation group to have a frame of reference for model performance
train_gen = baseline.siam_gen(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = baseline.batch_iterator(baseline.train_data, 
                                                                                          baseline.dup_sets_train, 
                                                                                          batch_size_test, 1)
test_gen = ([valid_input_sample['title'], valid_input_pos['title'], valid_input_neg['title'], 
             valid_input_sample['description'], valid_input_pos['description'], valid_input_neg['description'],
            valid_input_sample['info'], valid_input_pos['info'], valid_input_neg['info']], valid_sim)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]

CPU times: user 81.7 ms, sys: 28.1 ms, total: 110 ms
Wall time: 109 ms


In [15]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((1024, 100), (1024, 500), (1024, 1682), (1024,))

### Validar entrada

In [16]:
%%time 

baseline.display_batch(baseline.train_data, baseline.dup_sets_train, 5)

ValueError: too many values to unpack (expected 4)

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [17]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'word_vocab.pkl'))
for token in vocab:
    print(token)

vocabulary loaded
shall
spamming
ftpclient
graphdef
nojdk
occurs
abcd
inability
correspondence
fsd
scala
upload
means
testplugin
eplnxnumber
distinction
sentence
programing
modisco
afnumberbnumber
tdjava
reverts
rearranging
rpps
patches
subscribe
numberfnumbercnumber
truly
dndutil
restore
narrator
noop
numberxnumberdnumbera
confirm
unpacking
numberaaacnumberaaacnumber
sweep
parking
paren
se
integrated
personx
vte
xpd
numberxeaenumber
iresource
thought
bution
simultaneous
bnumberddnumber
jaxrs
numberxnumberdcnumberxnumberdfaaa
surprises
wsappdev
svnteam
cuoperation
too
documenting
sample
checkstyle
architetture
demonumber
fk
eglxml
codebase
workbench
instantaneously
encode
numberdcnumberdcnumber
determination
gmane
jarfiles
cim
mentors
buildmodel
managedagent
days
cheet
coolitem
rerunning
overkill
through
cpp
sourceeditor
chooser
circuits
nio
numbereenumberfnumber
libfontconfig
portlet
acknowledgements
headed
swapped
personconnector
easymock
wavy
libapr
programatic
unread
uniformly
bnum

names
disappointing
fault
propert
assisting
numberaabnumber
concatenated
ajc
cngb
stddef
bodies
launchpad
apiresults
dispose
neonumberj
rqghnd
mtdnumber
granularity
belive
libz
users
sake
suppliers
attractive
recreation
infrequently
misspelling
indications
tsym
evolutionary
canvases
bellow
incompatibility
dummynumber
intf
retag
starterkit
thus
displaying
ation
organizationlayout
cdtproject
whatsoever
organizationfacade
gdbcountryger
pd
discrimination
dvl
gf
concrete
organizationement
inplace
rdb
ids
conversation
contextual
locked
menuitem
bnumbercnumber
numberenumbercnumberenumbercnumber
includepath
ultra
hints
numberxnumberfe
automatically
duplicates
pgnumberkcqk
numberxnumberanumberxbnumber
julian
relengtool
unbind
rse
dot
ajmodel
msaa
closer
ireporter
pgm
mavenarchiver
oem
drew
merger
rup
orphaned
proce
personor
epsilon
iterators
codasyl
linenumber
heavily
numberxnumberanumberc
saaj
receives
preserve
pafnumber
raserver
snumberclient
draws
succesful
scott
wp
organizationbundle
umlref

indent
crawler
outputs
xmlns
returns
clients
bloat
literally
dejavu
serializer
ob
nds
jasperreports
proceeded
jpi
toggled
unumber
bhunt
week
qname
remartwork
sar
dwarf
bulleted
subscriptions
selecte
ron
reattaching
treewalk
spits
underlining
lperson
topics
loose
kte
reconci
cleaner
cfunction
eall
simulated
jsr
slight
deploys
thod
numberxnumberbenumbercnumber
translatable
histroy
ltw
itree
invariant
excluding
â³
adsldpc
infinity
lpg
adequate
sysvnumber
uml
underscores
reproducable
watchers
organizationcmdocument
svnconnection
inconsistently
tbody
twisty
hence
emnumber
whose
didnt
importer
dnowak
advised
tomer
persontor
ifne
ischeduling
sufficiently
gang
ifacility
jruby
routed
attendees
htmlrendering
expectation
personsnumber
mtejinumberoi
preprocessor
numberxcnumber
widely
in
organizationibute
numberaanumber
bnumberf
apparent
rnumberxnumbercnumber
vetest
appengine
lenght
webserviceutils
postmortem
hitoolbox
happend
unconditional
lg
numberxnumberfbnumberb
viewers
numberxaanumber
verbosit

ink
rdt
tagx
given
patched
blanked
builtin
organizationre
productline
sourcelookup
analexee
higgins
db
httpcore
runtimenumber
assume
omg
webapps
testapp
decrypted
postscript
inferior
interpolation
mango
numberdnumberanumberdnumbercnumber
tptpcompnumbernt
reads
traversals
attributed
mwe
modeltype
persistant
seperator
sits
canned
globus
imposes
versinumberd
finish
dtui
grounp
below
persona
anumberfe
libcrypto
generalize
subfolder
imo
synchronise
declarator
numberdnumberbnumberdnumberbnumber
junitnumber
lineno
prg
invoking
assigns
elcipse
ract
duplications
resid
tester
maintainable
checkmark
pathnames
midlet
adb
solely
bpel
chained
presets
global
sign
shadows
libesd
crashs
exc
nb
low
organizationuct
breite
anumberffnumber
ghost
subroutines
answers
numberbits
xxxnumber
took
editorlist
netapinumber
crap
patient
swingui
resumes
libclientnumber
typename
torolab
mpnumber
negative
drv
rsync
iload
jamesb
frameset
javaclass
numberxnumbercnumberxnumberfff
libatk
hides
updaters
hovered
wouldnt
inte

dated
rejects
quickdiff
restrict
emo
countryger
traversed
osradapter
numberxnumbercnumberxnumbercnumbercnumber
servertype
idx
cluttering
steadily
icon
suse
workgroups
previews
engineer
assembler
emptied
risks
xmlfile
ml
enumberx
ieq
texo
dit
iface
numberconcat
vminstall
revalidate
personver
ifeature
allocate
tldtag
numberanumberaanumber
walkbacks
picks
ag
informe
jnumberc
ajde
sion
openup
bidirectional
connectorservice
numberdnumberdanumber
numberenumberbnumberenumbercnumber
åº
antlib
pad
lf
xman
amnumber
hvl
personelection
entity
tpl
revisited
numberafnumberafdnumberc
uide
hcg
nitin
enumbermnumberx
organizationsave
chunking
ormcountry
organizationst
brees
organizationact
lifelines
dat
casual
bear
netware
containers
vnnif
commonbaseeventnumber
toogle
compnumbernt
jivesoftware
perforce
mangling
mapper
oprofiledb
ets
automatic
timestamps
android
numberxnumberdnumberfnumber
lpex
rpcelement
indentations
propertypages
whiteboard
vz
numbercnumberd
episode
fnumbershnfftnumbercw
dosen
manipula

initiate
idto
numberxnumberdanumber
acceleo
antview
cross
updatecore
intentional
suspicion
managed
numberdnumberbnumberdnumberdnumber
zdenek
screenshots
reply
ajbuilder
organizationconnection
linuxtools
libgailutil
hesitate
stp
character
invokes
ignv
mutual
organizationserver
relevant
nder
dependant
terminator
strange
anumbera
organizationoject
xblockexpression
ats
nationalityanged
mutex
matt
handle
translating
expanded
devel
libzip
ideworkbench
delay
iservice
dita
organizationor
dice
spaces
decides
locationtech
vmarg
jnumberscnumber
vnumberynumberbfsvepow
iorganizationapplication
automaticly
infrastructure
counterintuitive
ning
mytag
involving
mm
soft
vsh
simulates
unmodified
tinumber
forwarded
numberxnumberacnumber
stmt
dexia
forall
cascade
confined
inconsitent
lrg
annotating
perl
disapear
instanceeditor
soas
ffbfenumber
graduate
timeouts
argument
emacs
insqsy
worker
bundledata
gorganizationutil
pertaining
easiest
existant
softwares
typeid
int
marked
dash
documents
pagedesigner
haac


numberlkdeadlockthr
feat
disapears
scoped
detach
bnumberfbnumber
organizationservices
rename
unlike
recordings
collection
patching
numberanumberfnumberanumberfdnumberc
numberxnumberdnumberxnumberdccnumber
so
bld
localstore
suitable
organizationcountryin
anchored
bugtraq
resolution
disconcerting
renew
datatime
ok
oidnumbervnumber
buildhomes
organizationform
nationalityuration
firm
becuase
xdt
jetty
browserreversi
numberxanumber
reportnumber
lookup
problemnumber
invocations
clicked
clnumberd
corrected
innernumber
visable
mysql
ascii
autrun
tzdata
organizationsecurity
hostnumber
numberfnumberfnumberfnumber
callgraph
thinks
javadocexport
vnumberznumberecb
taglibs
share
cvsoperation
restricting
shopping
organizationregistrator
isub
opensymphony
junitnumbertest
relnotes
posting
localize
producing
emfoperation
rfcnumber
canceled
odaconsumer
guess
kb
malloc
beeps
combos
marshalled
serveral
computername
weblogicnumber
retriever
sounds
provisions
jsdi
target
availabe
uimessages
ipart
ss
crt
prom

overline
spmdb
ebnumberc
numbercnumbercnumbercnumber
changing
generate
visitors
jdicountry
jexl
anumberí
avaloq
resubmit
eboolean
principle
numberxnumberefcnumberanumber
explosively
formats
numerical
mismatching
died
zips
organizationnature
colgroup
appreciated
footer
marco
une
rmsystem
jubula
organizationname
capacity
numberc
shorter
attacker
semi
suborganization
qnx
pathelement
reading
synching
degradation
retry
preload
much
crimson
dependend
bookmarked
ein
ãª
organizationcompare
words
eclipsenumberrcnumber
numberanumberd
zzz
hardcoding
separates
assessment
numberdnumberdnumberdnumberenumber
le
newnumber
vbs
box
banner
initialiser
mistyped
tq
soa
stepstone
gcj
artefact
realizing
wraps
mmx
eate
glass
djoerup
neighbours
recovers
subdir
stepnumber
xbase
hava
qe
bnumberfnumberfnumber
trail
osname
sequencer
spanish
libdl
famous
detector
mediator
xsdorganization
emailaddress
compare
mnemonic
drawer
recognition
engineering
overrode
rochade
evangelism
permit
created
overlapping
cmof
adequate

antialias
graphicsnumberd
organizationorganizations
showsplash
numberxbnumberdnumbereenumber
myproduct
tibco
arrow
ler
selects
reaper
numberxnumberffcnumber
identitymaps
indexdiff
instant
earproperties
letter
gre
removal
arounds
adaptors
combine
moderator
meth
rsp
examination
unwrap
handy
gdavison
fullmoon
producta
organizationservice
numberdfnumber
spinner
mistakenly
testserver
considers
disallows
try
scrolling
numberxnumberdnumberxnumberdbnumber
jni
stealing
landmark
process
alog
ceautcom
gui
deprecate
numberefnumber
backbone
oops
vjet
inputhandler
cã
abnumberdnumber
com
svnerror
describer
mgh
numberxnumberfdnumberfnumber
und
spalten
ibmajdknumber
noticable
later
pop
norm
hands
relaunching
vnumbere
ensuring
measuring
featurenumber
usersession
numberanumberfnumber
misbehaving
ireturn
unumberf
rendered
bnumberaenumber
derivative
dft
junitsupport
statusbar
rwxr
apitooling
organizationgenerator
sirius
floats
inaktiv
externaltools
mit
impaired
wipes
cin
lblnumber
press
oddity
windowbuilde

executionenvironment
shoule
distributor
reliable
libaio
useless
hardcoded
routine
subsub
sdk
numberxnumbercnumberxnumbercnumberxnumbercnumber
jobnumber
destroyed
numberxnumberdnumberfnumberxnumberdnumberfnumber
specifing
desired
xmlproperty
switcher
describes
newcomers
results
rwb
jnumbernnumber
likely
sums
httpd
colspan
zzs
recommends
embeded
elegant
erro
iclsnumber
trapped
pen
numberxnumberxnumbercnumberc
urlclass
nfo
shellnumber
tarball
drwxrwsr
fabio
exclusion
numberdnumberfnumber
organizationmap
icm
consider
sslsocket
travel
ejborganization
uriperson
organizationpackage
retire
also
buildnumber
bnumberfnumber
pdomindexer
informs
andy
topicnumber
inputted
flaws
ndency
overwhelming
numberanumberfnumberanumber
ahead
numberxnumberfenumber
confluence
personcpersonznkvtfjvnumber
figurehandler
batchmigration
approved
blogs
libgmodule
latter
closely
ffffffffffnumber
deadlocked
aforementioned
reorganize
traceback
telecom
otequinox
persontation
todays
repeated
caches
been
dbstore
flashing
co

implied
advantages
impose
initialisation
unmarshaller
gvt
libauto
isaveable
backup
jbpmnumber
tus
a
aenumberbnumber
reclaim
dpkg
originate
vec
kspace
changegenerator
domcoordinator
to
hardware
botton
insure
epfnumber
certpath
carefully
progressing
birt
tests
misaligned
investigation
deletable
languitar
growing
challenge
registry
von
sight
rdx
correcly
mourad
tunnel
cre
personring
numberfnumberfnumberdnumber
xorg
btm
altogether
subview
orionhub
cancelled
met
locales
numberb
img
though
sarsenau
x
numberdate
uacontent
jvmwinumbersrnumber
icproject
turns
phpdoc
satisfying
eventually
smoke
perpective
kernelnumber
bundle
jnumbereeflex
cbes
wnumberc
mousedown
mate
monospaced
valgrind
addressing
line
tipo
receiving
remedied
lion
ugsplm
ig
supplied
libcairo
itpui
xxxxxxx
selections
private
third
phtml
ncp
doubleclicking
semicolons
fname
dropped
emof
lvalue
stateless
numberanumberbnumberanumber
started
recheck
ery
multiplexing
ambiguity
unmaximize
uris
taginfo
dyld
astif
reconnect
multiples
simp

In [18]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 19998'

In [19]:
%%time

baseline.generating_embed(GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.
vocabulary loaded
Number of OOV words: 19998
CPU times: user 1min 21s, sys: 2.52 s, total: 1min 24s
Wall time: 1min 23s


## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [20]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    embedding_layer = Embedding(num_words,
                                  embedding_dim,
                                  name='embedding_layer',
                                  weights=[embeddings],
                                  embeddings_constraint=MaxNorm(max_value=1, axis=0),
                                  input_length=max_sequence_length,
                                  trainable=trainable)
    return embedding_layer

### CNN with filter 3,4,5

In [21]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):

    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    # best combination filter (3, 4, 5) e 128 e 256
    convs = []
    filter_sizes = [3, 4, 5]
    n_filters = 64

    for index, filter_size in enumerate(filter_sizes):
        l_conv = Conv1D(filters=n_filters, kernel_size=filter_size)(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=filter_size)(l_conv) # index+1
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    # add a 1D convnet with global maxpooling, instead of Yoon Kim model
    #conv = Conv1D(filters=n_filters * 3, kernel_size=3)(l_merge)
    #layer = MaxPooling1D()(conv)
    layer = Flatten()(l_merge)
    layer = Dense(100, activation='tanh')(layer)
    #layer = LeakyReLU()(layer)

    cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible

    return cnn_feature_model

### Bi-LSTM

In [22]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D

def lstm_model(embedding_layer, max_sequence_length):
    number_lstm_units = 50
    rate_drop_lstm = 0
    recurrent_dropout = 0

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Creating LSTM Encoder
    lstm_layer = Bidirectional(LSTM(number_lstm_units, return_sequences=False), # dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm 
                               merge_mode='ave')

    layer = lstm_layer(embedded_sequences)
#     layer = GlobalAveragePooling1D()(layer)
    layer = Dense(100, activation='tanh')(layer)

    lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible

    return lstm_feature_model

### MLP

In [23]:
def mlp_model(input_size):
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    input_size = 100
    
    layer = Dense(input_size, activation='tanh')(info_input)
    
    mlp_feature_model = Model(inputs=[info_input], outputs=[layer], name = 'FeatureMlpGenerationModel')
    
    return mlp_feature_model

### Siamese model

In [24]:
from keras import backend as K
import tensorflow as tf

def l2_normalize(x, axis):
    norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=False))
    return K.maximum(x, K.epsilon()), K.maximum(norm, K.epsilon())

def normalize(x):
    return l2_normalize(x, axis=-1)
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    x, y = inputs
    x, x_norm = l2_normalize(x, axis=-1)
    y, y_norm = l2_normalize(y, axis=-1)
    distance = K.sum( x * y, axis=-1) / (x_norm * y_norm)
    distance = (distance + K.constant(1)) / K.constant(2)
    # Distance goes from 0 to 2 in theory, but from 0 to 1 if x and y are both
    # positive (which is the case after ReLU activation).
    return distance

def margin_loss(y_true, y_pred):
    margin = K.constant(1.0)
    loss = K.maximum(0.0, margin - y_pred[0] +  y_pred[1])
    return K.mean(loss)

def pos_distance(y_true, y_pred):
    return K.mean(y_pred[0])

def neg_distance(y_true, y_pred):
    return K.mean(y_pred[1])

def stack_tensors(vects):
    return K.stack(vects)

In [25]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d):
  
    bug_t_in = Input(shape = (sequence_length_t, ), name = 'title_in')
    bug_t_pos = Input(shape = (sequence_length_t, ), name = 'title_pos')
    bug_t_neg = Input(shape = (sequence_length_t, ), name = 'title_neg')

    bug_d_in = Input(shape = (sequence_length_d, ), name = 'desc_in')
    bug_d_pos = Input(shape = (sequence_length_d, ), name = 'desc_pos')
    bug_d_neg = Input(shape = (sequence_length_d, ), name = 'desc_neg')
    
    bug_i_in = Input(shape = (sequence_length_info, ), name = 'info_in')
    bug_i_pos = Input(shape = (sequence_length_info, ), name = 'info_pos')
    bug_i_neg = Input(shape = (sequence_length_info, ), name = 'info_neg')

    bug_t_in_feat_lstm = lstm_feature_model(bug_t_in)
    bug_t_pos_feat_lstm = lstm_feature_model(bug_t_pos)
    bug_t_neg_feat_lstm = lstm_feature_model(bug_t_neg)

    bug_d_in_feat_cnn = cnn_feature_model(bug_d_in)
    bug_d_pos_feat_cnn = cnn_feature_model(bug_d_pos)
    bug_d_neg_feat_cnn = cnn_feature_model(bug_d_neg)
    
    bug_i_in_feat_mlp = mlp_feature_model(bug_i_in)
    bug_i_pos_feat_mlp = mlp_feature_model(bug_i_pos)
    bug_i_neg_feat_mlp = mlp_feature_model(bug_i_neg)

    encoded_anchor = Add(name = 'merge_features_in')([bug_i_in_feat_mlp, bug_t_in_feat_lstm, bug_d_in_feat_cnn])
    encoded_positive = Add(name = 'merge_features_pos')([bug_i_pos_feat_mlp, bug_t_pos_feat_lstm, bug_d_pos_feat_cnn])
    encoded_negative = Add(name = 'merge_features_neg')([bug_i_neg_feat_mlp, bug_t_neg_feat_lstm, bug_d_neg_feat_cnn])
    
#     encoded_anchor = concatenate([bug_i_in_feat_mlp, bug_t_in_feat_lstm, bug_d_in_feat_cnn], name = 'merge_features_in')
#     encoded_positive = concatenate([bug_i_pos_feat_mlp, bug_t_pos_feat_lstm, bug_d_pos_feat_cnn], name = 'merge_features_pos')
#     encoded_negative = concatenate([bug_i_neg_feat_mlp, bug_t_neg_feat_lstm, bug_d_neg_feat_cnn], name = 'merge_features_neg')
    
    # Bug representation layer  
#     encoded_anchor = Dense(100)(encoded_anchor)
#     encoded_positive = Dense(100)(encoded_positive)
#     encoded_negative = Dense(100)(encoded_negative)
    
    # Cosine
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([encoded_anchor, encoded_positive])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([encoded_anchor, encoded_negative])
    
    # Loss function only works with a single output
    output = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances',
        output_shape=(2, 1)
    )([positive_d, negative_d])
  
    similarity_model = Model(inputs = [bug_t_in, bug_t_pos, bug_t_neg, 
                                       bug_d_in, bug_d_pos, bug_d_neg, 
                                       bug_i_in, bug_i_pos, bug_i_neg], 
                           outputs = output, name = 'Similarity_Model')
    
    optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.001)
    
    # setup the optimization process 
    similarity_model.compile(optimizer=optimizer, loss=margin_loss, metrics=[pos_distance, neg_distance])

    return similarity_model

## Experiment

##### Logs

In [26]:
tbCallBack = keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=True, write_images=True)

##### Checkpoint

In [27]:
from keras.callbacks import ModelCheckpoint

def checkpoint_model(name):
    m_dir = os.path.join('checkpoint')
    if not os.path.exists(m_dir):
        os.mkdir(m_dir)
    return ModelCheckpoint(os.path.join(m_dir, "{}.hdf5".format(name)), monitor='loss', \
                                        verbose=1, save_best_only=False, mode='min', period=1)

# checkpoint
checkpoint = checkpoint_model(FILE_PATH)

### Train with steps for each epoch

In [28]:
# %%time
# import keras

# # Inspired on https://'pastebin.com/TaGFdcBA
# # TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
# keras.backend.clear_session()

# # Embeddings
# cnn_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
#                               num_words=len(vocab), 
#                               embedding_dim=EMBEDDING_DIM, 
#                               max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
#                               trainable=True)
# lstm_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
#                               num_words=len(vocab), 
#                               embedding_dim=EMBEDDING_DIM, 
#                               max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
#                               trainable=True)

# # Feature models
# cnn_feature_model = cnn_model(cnn_embedding_layer, MAX_SEQUENCE_LENGTH_D)
# lstm_feature_model = lstm_model(lstm_embedding_layer, MAX_SEQUENCE_LENGTH_T)
# mlp_feature_model = mlp_model(number_of_columns_info)

# # Similarity model
# similarity_model = siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, 
#                                      number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# # cnn_feature_model.summary()
# # lstm_feature_model.summary()
# similarity_model.summary()

# '''
#     Experiments log
# '''
# h = similarity_model.fit_generator(train_gen, 
#                                steps_per_epoch = 10,
#                                              epochs = 100,
#                                              verbose = 1,
#                                              validation_data=test_gen,
#                                                # callbacks=[tbCallBack, checkpoint]
#                                               )  # 

In [29]:
# loss=h.history['loss']
# val_loss=h.history['val_loss']

# plt.plot(loss, label='loss')
# plt.plot(val_loss, label='val_loss')
# plt.title('Model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()

### Training and evaluating for each epoch at same time

#### Auxiliary methods train experiment siamese

In [30]:
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

In [31]:
retrieval = Retrieval()

path = 'data/processed/{}'.format(DOMAIN)
path_buckets = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)
path_train = 'data/processed/{}/train.txt'.format(DOMAIN)
path_test = 'data/processed/{}/test.txt'.format(DOMAIN)

MAX_SEQUENCE_LENGTH_I = number_of_columns_info # Status, Severity, Version, Component, Module

# Create the instance from baseline
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

NameError: name 'number_of_columns_info' is not defined

#### Hashing bugs by buckets

In [None]:
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    issues_by_buckets[bucket] = bucket
    for issue in np.array(retrieval.buckets[bucket]).tolist():
        issues_by_buckets[issue] = bucket

#### Model to vectorize

In [None]:
import keras
from keras.models import Model
from keras.layers import Input
from keras.models import load_model

def get_model_vectorizer():
    bug_title =  similarity_model.get_layer('title_in').input 
    bug_desc =  similarity_model.get_layer('desc_in').input 
    bug_info = similarity_model.get_layer('info_in').input 

    title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
    desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')
    info_encoder = similarity_model.get_layer('FeatureMlpGenerationModel')

    bug_t = title_encoder(bug_title)
    bug_d = desc_encoder(bug_desc)
    bug_i = info_encoder(bug_info)
    # Representation layer
    model = similarity_model.get_layer('merge_features_in')
    output = model([bug_i, bug_t, bug_d])
    # Bug Representation
    # bug_model = similarity_model.get_layer('dense_4')
    # output = bug_model(output)

    model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
    
    return model

#### Methods to evaluate each epoch

In [None]:
def vectorizer_buckets(verbose, model, buckets, buckets_data):
    embed_buckets = model.predict(buckets_data)
    loop = enumerate(embed_buckets)
    if(verbose):
        loop = tqdm(enumerate(embed_buckets))
        loop.set_description("Vectorizing buckets batch")
    buckets_vectorized = []
    for index, vector in loop:
        buckets_vectorized.append({ 'bug_id': buckets[index],  'vector': vector })
    if(verbose): loop.close()
    return buckets_vectorized
    

def vectozing_validation_batch(verbose, model, queries_test_vectorized, queries_data):
    embed_validation = model.predict(queries_data)
    loop = enumerate(embed_validation)
    if(verbose):
        loop = tqdm(enumerate(embed_validation))
        loop.set_description("Vectorizing validation batch")
    for index, vector in loop:
        if issues_by_buckets[bug_id] == test_bug_id: continue # if the bug is the master
        queries_test_vectorized[index]['vector'] = vector
    if(verbose): loop.close()
    return queries_test_vectorized

def get_validation_batch(verbose, model, queries_validation):
    bug_set = baseline.get_bug_set()
    queries_test_vectorized, queries_data = [], []
    title_data, desc_data, info_data = [], [], [] 
    loop = queries_validation
    if(verbose):
        loop = tqdm(queries_validation)
        loop.set_description("Getting validation batch")
    for bug_id in loop:
        if issues_by_buckets[bug_id] == test_bug_id: continue # if the bug is the master
        bug = bug_set[bug_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
        queries_test_vectorized.append({ 'bug_id' : bug_id, 'ground_truth': issues_by_buckets[bug_id] })
    if(verbose): loop.close()
    queries_data = [ np.array(title_data), np.array(desc_data), np.array(info_data) ]
    return queries_test_vectorized, queries_data

def get_buckets_from_validation(verbose, validation_data):
    bug_set = baseline.get_bug_set()
    buckets = set()
    title_data, desc_data, info_data = [], [], []
    loop = validation_data
    if(verbose):
        loop = tqdm(validation_data)
        loop.set_description("Reading buckets from validation batch")
    for row in loop:
        bug_anchor, bug_pos, bug_neg = row
        vectorizer = [bug_anchor, bug_pos, bug_neg]
        for test_bug_id in vectorizer:
            buckets.add(issues_by_buckets[test_bug_id])
    for bucket_id in buckets:
        bug = bug_set[bucket_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
    buckets_data = [ np.array(title_data), np.array(desc_data), np.array(info_data) ]
    if(verbose): loop.close()
    return list(buckets), buckets_data

def get_validation_ids(verbose, validation_data):
    validation_bugs = []
    loop = validation_data
    if(verbose):
        loop = tqdm(validation_data)
        loop.set_description('Reading the bug ids from duplicates in validation')
    for row in loop:
        bug_anchor, bug_pos, bug_neg = row
        validation_bugs.append(bug_anchor)
        validation_bugs.append(bug_pos)
    if(verbose): loop.close()
    return validation_bugs

def get_loss_validation(model, queries_data):
    return model.test_on_batch(queries_data)

def evaluate_validation_test(verbose, test_gen, batch_triplets_valid, buckets, buckets_data, queries_test_vectorized, queries_data):
    test_batch, test_sim = test_gen
    model = get_model_vectorizer()
    evaluation_test_batch = get_loss_validation(queries_data)
    print(evaluation_test_batch)
    buckets_vectorized = vectorizer_buckets(verbose, model, buckets, buckets_data)
    queries_vectorized = vectozing_validation_batch(verbose, model, queries_test_vectorized, queries_data)
    annoy = indexing_train(buckets_vectorized, verbose)
    X_test, distance_test, indices_test = indexing_test(queries_vectorized, verbose)
    formated_rank = rank_result(indices_test, distance_test, verbose)
    rank_queries = formating_rank(X_test, verbose)
    exported_rank = export_rank(rank_queries, formated_rank, verbose)
    evaluation = Evaluation(verbose)
    report = evaluation.evaluate(exported_rank)
    
    # recall@25, loss, cosine_positive, cosine_negative
    return report['5 - recall_at_25'], evaluation_test_batch[0], evaluation_test_batch[1], evaluation_test_batch[2] 

In [None]:
class Evaluation():
    def __init__(self, verbose=1):
        self.verbose = verbose
    
    """
        Rank recall_rate_@k
        rank = "query:master|master:id:sim,master:id:sim"
    """
    def top_k_recall(self, rank, k):
        query, rank = rank.split('|')
        query_dup_id, query_master = query.split(":")
        query_master = int(query_master)
        rank_masters = [int(item.split(':')[0]) for pos, item in enumerate(rank.split(",")[:20])]
        corrects = len(set([query_master]) & set(rank_masters[:k]))
        #total = len(retrieval.buckets[issues_by_buckets[query_master]])
        total = 1 if corrects <= 0 else corrects
        return float(corrects), total

    def evaluate(self, path):
        self.recall_at_5_corrects_sum, self.recall_at_10_corrects_sum, \
        self.recall_at_15_corrects_sum, self.recall_at_20_corrects_sum, self.recall_at_25_corrects_sum = 0, 0, 0, 0, 0
        self.recall_at_5_total_sum, self.recall_at_10_total_sum, self.recall_at_15_total_sum, \
        self.recall_at_20_total_sum, self.recall_at_25_total_sum = 0, 0, 0, 0, 0 
        if(self.verbose):
            print("Evaluating...")
        if type(path) == str:
            with open(path, 'r') as file_input:
                for row in file_input:
                    self.recall(row)
        else:
            for row in path:
                self.recall(row)
        
        report = {
            '1 - recall_at_5' : round(self.recall_at_5_corrects_sum / self.recall_at_5_total_sum, 2),
            '2 - recall_at_10' : round(self.recall_at_10_corrects_sum / self.recall_at_10_total_sum, 2),
            '3 - recall_at_15' : round(self.recall_at_15_corrects_sum / self.recall_at_15_total_sum, 2),
            '4 - recall_at_20' : round(self.recall_at_20_corrects_sum / self.recall_at_20_total_sum, 2),
            '5 - recall_at_25' : round(self.recall_at_25_corrects_sum / self.recall_at_25_total_sum, 2)
        }

        return report
    def recall(self, row):
        #if row == '': continue
        self.recall_at_5_corrects, self.recall_at_5_total = self.top_k_recall(row, k=5)
        self.recall_at_10_corrects, self.recall_at_10_total = self.top_k_recall(row, k=10)
        self.recall_at_15_corrects, self.recall_at_15_total = self.top_k_recall(row, k=15)
        self.recall_at_20_corrects, self.recall_at_20_total = self.top_k_recall(row, k=20)
        self.recall_at_25_corrects, self.recall_at_25_total = self.top_k_recall(row, k=25)

        self.recall_at_5_corrects_sum += self.recall_at_5_corrects
        self.recall_at_10_corrects_sum += self.recall_at_10_corrects
        self.recall_at_15_corrects_sum += self.recall_at_15_corrects
        self.recall_at_20_corrects_sum += self.recall_at_20_corrects
        self.recall_at_25_corrects_sum += self.recall_at_25_corrects

        self.recall_at_5_total_sum += self.recall_at_5_total
        self.recall_at_10_total_sum += self.recall_at_10_total
        self.recall_at_15_total_sum += self.recall_at_15_total
        self.recall_at_20_total_sum += self.recall_at_20_total
        self.recall_at_25_total_sum += self.recall_at_25_total

#### Save the model

In [None]:
def save_model(model, name):
    m_dir = os.path.join('modelos')
    if not os.path.exists(m_dir):
        os.mkdir(m_dir)
    model.save(os.path.join(m_dir, "model_{}.h5".format(name)))
    print("Saved model to disk")

#### Train siamese model

In [None]:
%%time
import keras

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Embeddings
cnn_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=True)
lstm_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=True)

# Feature models
cnn_feature_model = cnn_model(cnn_embedding_layer, MAX_SEQUENCE_LENGTH_D)
lstm_feature_model = lstm_model(lstm_embedding_layer, MAX_SEQUENCE_LENGTH_T)
mlp_feature_model = mlp_model(number_of_columns_info)

# Similarity model
similarity_model = siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


'''
    Configuration
'''
epochs = 10
best_recall = 0
best_epoch = 0
verbose = 0

# Pre load validation
buckets, buckets_data = get_buckets_from_validation(verbose, batch_triplets_valid)
queries_validation = get_validation_ids(verbose, batch_triplets_valid)
queries_test_vectorized, queries_data = get_validation_batch(verbose, model, queries_validation)

'''
    Experiment
'''
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = baseline.batch_iterator(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
    train_batch = [train_input_sample['title'], train_input_pos['title'], train_input_neg['title'], 
                 train_input_sample['description'], train_input_pos['description'], train_input_neg['description'],
                train_input_sample['info'], train_input_pos['info'], train_input_neg['info']]
    
    h = similarity_model.train_on_batch(train_batch, train_sim)
    recall, valid_loss, valid_cos_pos, valid_cos_neg = evaluate_validation_test(verbose, test_gen, batch_triplets_valid, 
                                buckets, buckets_data, queries_test_vectorized, queries_data)
    print("Epoch: {} - Loss: {:.2f}, positive_cosine: {:.2f}, negative_cosine: {:.2f}, validation_recall@25: {:.2f}".format(
        epoch+1, h[0], h[1], h[2], recall))
    if recall > best_recall:
        save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
        best_recall = recall
        best_epoch = epoch
    # Without step decay for each 10 epochs because the Adam optimizer already do this
    # https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1
print('Best_epoch={}, Best_recall={:.2f}'.format(best_epoch, best_recall))

### Using the feature layers

#### Similarity cosine 

In [33]:
def cosine_normalized(a, b):
    a = K.variable(a)
    b = K.variable(b)
    # normalization
    #a, a_norm = normalize(a)
    #b, b_norm = normalize(b)
    a_norm = K.sqrt(K.sum(K.square(a), axis=-1, keepdims=False))
    b_norm = K.sqrt(K.sum(K.square(b), axis=-1, keepdims=False))
    a_norm = K.maximum(a_norm, K.epsilon())
    b_norm = K.maximum(b_norm, K.epsilon())
    # dot
    #print("Var from mut", K.eval(a), K.eval(b))
    cos_sim = K.sum( a * b ) / (a_norm * b_norm)  
    #print("Mut", K.eval(cos_sim))
    return K.eval( (cos_sim + 1 ) / 2 )
    #return K.eval(cos_sim)[0], K.eval(K.sum(a * b)), K.eval(a_norm), K.eval(b_norm)
    
def cos_distance_keras(y_true, y_pred):
    y_true = K.l2_normalize(y_true, axis=-1)
    y_pred = K.l2_normalize(y_pred, axis=-1)
    return K.eval(K.mean(K.sum(y_true * y_pred, axis=-1)))

def cos_distance(y_true, y_pred):
    def l2_normalize(x, axis):
        norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        return K.maximum(x, K.epsilon()) / K.maximum(norm, K.epsilon())
    y_true = l2_normalize(y_true, axis=-1)
    y_pred = l2_normalize(y_pred, axis=-1)
    return K.eval(K.mean(y_true * y_pred))

def cos_custom(a, b): # Cosine used in the siamese model
    a = K.variable(a)
    b = K.variable(b)
    return K.eval(cosine_distance([a, b]))

bug_vector_a_t = np.random.rand(2)
bug_vector_b_t = -1.0 * np.random.rand(2)
# bug_vector_a_t = np.array([1.0, 1.0, 2.0])
# bug_vector_b_t = np.array([1.0, 2.0, 1.0])
bug_vector_a_t = np.array([0.0, 0.1, 0.0])
bug_vector_b_t = np.array([0.0, 0.1, 0.1])

print(bug_vector_a_t, bug_vector_b_t)

result = cos_distance(bug_vector_a_t, bug_vector_b_t)
result2 = cosine_normalized(bug_vector_a_t, bug_vector_b_t)
result3 = cos_distance_keras(bug_vector_a_t, bug_vector_b_t)
result4 = cos_custom(bug_vector_a_t, bug_vector_b_t)
result, result2, result3, result4

[0.  0.1 0. ] [0.  0.1 0.1]


(0.23570249609801194, 0.8535534, 0.7071067811865475, 0.8535538)

In [476]:
bug_vector_a_t = np.array([1, 1, 2])
bug_vector_b_t = np.array([1, 2, 1])
result = cos_custom(bug_vector_a_t, bug_vector_b_t)
result

0.9166666

#### Loading bugs of test

In [32]:
from scipy import spatial
bug_set = baseline.get_bug_set()
bug_id = [96204, np.random.choice(list(bug_set))] # non-duplicate {15196, 2}
# bug_id = [96204, 85581] # duplicate {85581, 96204, 106979}
dup_a, dup_b = bug_id
bug_a = bug_set[dup_a]
bug_b = bug_set[dup_b]

dup_a, dup_b

(96204, 402530)

#### LSTM feature

In [87]:
bug_a['title'], bug_b['title']

('preferences filter text cut off using default fonts on organization',
 'organization should be able to parse swt color constants')

In [88]:
bug_vector_a_t = lstm_feature_model.predict(np.array([bug_a['title_word']]))[0]
bug_vector_b_t = lstm_feature_model.predict(np.array([bug_b['title_word']]))[0]
result = cosine_normalized(bug_vector_a_t, bug_vector_b_t)
result

0.9587623

In [89]:
bug_vector_a_t, bug_vector_b_t

(array([ 2.49019265e-03, -1.60485297e-03, -8.00511334e-03, -4.36931569e-03,
         3.80078261e-03, -7.72654591e-03,  1.18791999e-03, -1.23189471e-03,
        -6.61424035e-03, -5.28087979e-03, -8.17856472e-03, -1.36691201e-02,
        -3.50416638e-03,  9.99897439e-03, -7.14393845e-03,  1.43039436e-03,
        -5.39363921e-03, -1.05063571e-02, -1.47022083e-02,  1.06756678e-02,
        -1.14908516e-02, -1.28844017e-02,  1.17386179e-02, -7.49616744e-03,
         6.19291374e-03, -1.54948235e-02,  1.27496589e-02,  2.06830795e-03,
        -2.79657356e-03, -4.05305345e-03,  5.60892466e-03,  1.46355713e-02,
        -2.48667272e-03, -3.11291520e-03,  5.17379260e-03,  1.89953148e-02,
        -8.90053064e-03,  7.44841527e-05,  1.56687282e-03,  1.10621415e-02,
        -2.15342222e-03,  3.85728665e-03,  1.66874588e-03, -1.00547038e-02,
        -1.35402819e-02, -7.29402620e-03, -4.51371633e-03,  7.58049265e-03,
        -2.44058203e-04,  4.99692652e-03,  3.65793053e-03,  1.92573480e-03,
         7.9

#### CNN feature

In [90]:
bug_a['description'], bug_b['description']

('nnumber the standard default font size for organization desktops is number points at number organization at this size the message type filter text in the preferences dialog is being cut off',
 'swt provides access to specific organization theme colors and these are currently used in the number x organization we should provide a way for them to be used directly in stylesheets')

In [91]:
bug_vector_a_d = cnn_feature_model.predict(np.array([bug_a['description_word']]))[0]
bug_vector_b_d = cnn_feature_model.predict(np.array([bug_b['description_word']]))[0]
result = cosine_normalized(bug_vector_a_d, bug_vector_b_d)
result

0.98870444

In [92]:
bug_vector_a_d, bug_vector_b_d

(array([ 8.8830860e-03, -1.0803261e-02,  1.0162550e-02,  1.2007233e-02,
         1.0880478e-02, -1.7862413e-02, -5.3712893e-02, -2.0619776e-02,
        -8.7249121e-03, -1.2073410e-02, -3.8517809e-03, -1.6929395e-02,
         2.4875256e-03, -4.1323770e-02, -6.6935318e-05, -9.0786377e-03,
        -2.1728144e-03, -2.1961698e-02, -1.7755976e-02,  4.0341867e-05,
        -1.2962662e-02,  3.4561583e-03,  7.6260841e-03, -2.4003433e-03,
        -2.3997778e-03, -3.5582546e-02, -2.8721876e-02, -2.5514863e-05,
         1.3654120e-03, -4.2066039e-03, -3.9551809e-02,  4.5694960e-03,
         5.3639938e-03,  2.3343449e-04,  1.7832408e-02,  4.8351940e-02,
        -3.7632692e-03, -3.5166917e-03, -2.1306466e-02,  2.6301065e-02,
        -1.5773321e-02, -2.3028695e-03, -5.1696677e-02, -3.0652801e-02,
        -1.7624872e-02, -6.0685039e-02,  1.0800240e-02,  1.1033656e-02,
         1.2291396e-02,  1.7734807e-02,  1.2195990e-02,  2.3077307e-03,
        -1.3846916e-03, -4.4645134e-02,  2.3815729e-02,  2.13890

#### MLP feature

In [93]:
bug_vector_a_i = mlp_feature_model.predict(np.array([retrieval.get_info(bug_a)]))[0]
bug_vector_b_i = mlp_feature_model.predict(np.array([retrieval.get_info(bug_b)]))[0]
result = cosine_normalized(bug_vector_a_i, bug_vector_b_i)
result

0.91240877

In [94]:
bug_vector_a_i, bug_vector_b_i

(array([ 0.00294508,  0.12485351, -0.01198962,  0.00761335,  0.10318834,
         0.10463256,  0.15832482,  0.08514886,  0.13201833,  0.10030671,
         0.00346754,  0.06291472,  0.00116868, -0.00692416,  0.02047804,
        -0.0098251 ,  0.12770486,  0.0309453 ,  0.1156408 ,  0.07484929,
         0.06361708,  0.11392093,  0.05528144,  0.0607774 , -0.01772368,
         0.06012239,  0.00829012, -0.01182164, -0.01215539,  0.03081871,
         0.03360711, -0.00719916,  0.07813127,  0.01906503, -0.03079052,
        -0.12193003, -0.0299939 ,  0.08874536,  0.15156765, -0.04985955,
         0.00266152, -0.00970931, -0.02372491,  0.15421453,  0.09760781,
         0.05890281,  0.13539729,  0.06730033,  0.0239473 ,  0.06715932,
        -0.05586645, -0.02251041,  0.1096842 ,  0.08411299, -0.01812845,
         0.00847009,  0.1409047 , -0.00940015, -0.05961862,  0.01844124,
         0.04384093,  0.00596237,  0.01865939, -0.02397243,  0.06010606,
         0.00817457, -0.03596248,  0.00515889, -0.0

#### Merge features

In [95]:
bug_vector_a = np.concatenate([ bug_vector_a_i, bug_vector_a_t, bug_vector_a_d ], -1)
bug_vector_b = np.concatenate([ bug_vector_b_i, bug_vector_b_t, bug_vector_b_d ], -1)
result = cosine_normalized(bug_vector_a, bug_vector_b)
result

0.9192383

In [96]:
bug_vector_a, bug_vector_b

(array([ 2.94508343e-03,  1.24853514e-01, -1.19896187e-02,  7.61334971e-03,
         1.03188336e-01,  1.04632564e-01,  1.58324823e-01,  8.51488635e-02,
         1.32018328e-01,  1.00306712e-01,  3.46754305e-03,  6.29147217e-02,
         1.16868038e-03, -6.92415517e-03,  2.04780437e-02, -9.82510205e-03,
         1.27704859e-01,  3.09452973e-02,  1.15640804e-01,  7.48492852e-02,
         6.36170805e-02,  1.13920934e-01,  5.52814379e-02,  6.07774034e-02,
        -1.77236833e-02,  6.01223893e-02,  8.29012319e-03, -1.18216397e-02,
        -1.21553941e-02,  3.08187138e-02,  3.36071067e-02, -7.19915982e-03,
         7.81312659e-02,  1.90650318e-02, -3.07905171e-02, -1.21930033e-01,
        -2.99938954e-02,  8.87453556e-02,  1.51567653e-01, -4.98595499e-02,
         2.66152481e-03, -9.70931165e-03, -2.37249099e-02,  1.54214531e-01,
         9.76078063e-02,  5.89028075e-02,  1.35397285e-01,  6.73003271e-02,
         2.39473041e-02,  6.71593249e-02, -5.58664463e-02, -2.25104094e-02,
         1.0

### Retrieval evaluation

In [48]:
retrieval.train_vectorized, retrieval.test_result = [], []
# Infer vector to all train
retrieval.read_train(path_train)

In [49]:
print("Total of queries:", len(retrieval.test))

Total of queries: 12864


#### Selecting bugs from test

In [50]:
buckets_train = set()
for row in tqdm(retrieval.test):
    bug_id, ground_truth = row
    vectorizer = [bug_id] 
    vectorizer += ground_truth
    for test_bug_id in vectorizer:
        buckets_train.add(issues_by_buckets[test_bug_id])

HBox(children=(IntProgress(value=0, max=12864), HTML(value='')))




#### Getting the model trained

In [203]:
model = get_model_vectorizer()

In [204]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 100)          168300      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

#### Vectorizing bugs from train

In [205]:
def vectorizer_buckets_train(model, buckets_train):
    bug_set = retrieval.baseline.get_bug_set()
    buckets_train_vectorized = []
    title_data, desc_data, info_data = [], [], []
    loop = tqdm(buckets_train)
    loop.set_description('Vectorizing buckets')
    for bug_id in loop: # retrieval.bugs_train
        bug = bug_set[bug_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
        buckets_train_vectorized.append({ 'bug_id' : bug_id })
    loop.close()
    # Get embedding of all buckets
    embed_buckets = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
    # Fill the buckets array
    for index, vector in enumerate(embed_buckets):
        buckets_train_vectorized[index]['vector'] = vector
    
    return buckets_train_vectorized

In [206]:
buckets_train_vectorized = vectorizer_buckets_train(model, buckets_train)

HBox(children=(IntProgress(value=0, max=8086), HTML(value='')))

#### Vectorizing bugs from test

In [207]:
bug_set = retrieval.baseline.get_bug_set()
queries_test_vectorized = []
title_data, desc_data, info_data = [], [], []
for row in tqdm(retrieval.test):
    bug_id, ground_truth = row
    vectorizer = [bug_id] 
    vectorizer += ground_truth
    for test_bug_id in vectorizer:
        if issues_by_buckets[test_bug_id] == test_bug_id: continue # if the bug is the master
        bug = bug_set[test_bug_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
        #bug_vector = model.predict([ [bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)] ])[0]
        queries_test_vectorized.append({ 'bug_id' : test_bug_id, 'ground_truth': issues_by_buckets[test_bug_id] })

# Get embedding of all buckets
embed_queries = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
# Fill the queries array    
for index, vector in enumerate(embed_queries):
    queries_test_vectorized[index]['vector'] = vector

HBox(children=(IntProgress(value=0, max=12864), HTML(value='')))

#### Indexing bugs

In [246]:
# Indexing all train
def indexing_train(buckets_train_vectorized, verbose=1):
    X = np.array(buckets_train_vectorized)
    annoy = AnnoyIndex(X[0]['vector'].shape[0])  # Length of item vector that will be indexed

    loop = total=len(X)
    if(verbose):
        loop = tqdm(total=len(X))
        loop.set_description("Indexing buckets in annoy")
    for index, row in enumerate(X):
        vector = row['vector']
        annoy.add_item(index, vector)
        if(verbose): loop.update(1)
    if(verbose): loop.close()
    annoy.build(10) # 10 trees
    return annoy

In [210]:
annoy = indexing_train(buckets_train_vectorized)

HBox(children=(IntProgress(value=0, max=8086), HTML(value='')))

#### Getting the list of candidates

In [247]:
def indexing_test(queries_test_vectorized, verbose=1):
    X_test = queries_test_vectorized
    distance_test, indices_test = [], []
    loop = enumerate(X_test)
    if(verbose):
        loop = tqdm(enumerate(X_test))
        loop.set_description('Getting the list of candidates from queries')
    for index, row in loop:
        vector = row['vector']
        rank, dist = annoy.get_nns_by_vector(vector, 30, include_distances=True)
        indices_test.append(rank)
        distance_test.append(1 - np.array(dist)) # normalize the similarity between 0 and 1
    if(verbose): loop.close()
    return X_test, distance_test, indices_test

In [212]:
X_test, distance_test, indices_test = indexing_test(queries_test_vectorized)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [213]:
print("Total buckets train vectorized: {}".format(len(buckets_train_vectorized)))
print("Total queries vectorized: {}".format(len(queries_test_vectorized)))

Total buckets train vectorized: 8086
Total queries vectorized: 22758


#### Rank result

In [239]:
def rank_result(indices_test, distance_test, verbose=1):
    formated_rank = []
    loop = zip(indices_test, distance_test)
    if(verbose):
        loop = tqdm(zip(indices_test, distance_test))
        loop.set_description('Generating the rank')
    for row_index, row_sim in loop:
        row_index, row_sim = row_index[:25], row_sim[:25]
        formated_rank.append(",".join(["{}:{}".format(buckets_train_vectorized[index]['bug_id'], sim) 
                                       for index, sim in zip(row_index, row_sim)]))
    if(verbose): loop.close()
    return formated_rank

In [215]:
formated_rank = rank_result(indices_test, distance_test)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

#### Queries

In [240]:
# Generating the rank result
def formating_rank(X_test, verbose=1):
    rank_queries = []
    loop = enumerate(X_test)
    if(verbose):
        loop = tqdm(enumerate(X_test))
        loop.set_description('Generating the queries from rank')
    for index, row in loop:
        dup_a, ground_truth = row['bug_id'], row['ground_truth']
        rank_queries.append("{}:{}".format(dup_a, ground_truth))
    if(verbose): loop.close()
    return rank_queries

In [217]:
rank_queries = formating_rank(X_test)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [248]:
def export_rank(rank_queries, formated_rank, verbose=1):
    exported_rank = []
    loop = len(rank_queries)
    if(verbose):
        loop = tqdm(total=len(rank_queries))
        loop.set_description('Exporting the rank')
    for query, rank in zip(rank_queries, formated_rank):
        exported_rank.append("{}|{}".format(query, rank))
        if(verbose): loop.update(1)
    if(verbose): loop.close()
    return exported_rank

In [219]:
exported_rank = export_rank(rank_queries, formated_rank)

HBox(children=(IntProgress(value=0, max=22758), HTML(value='')))

In [220]:
exported_rank[:20]

['229466:229377|213305:0.8917736858129501,230940:0.8482553958892822,256493:0.8428337126970291,211796:0.8367926627397537,239109:0.8214627057313919,207927:0.8084089756011963,239477:0.7742815613746643,216357:0.7498266994953156,295482:0.7485641539096832,207347:0.7459672093391418,226363:0.7429795563220978,236524:0.7426169812679291,245667:0.7340384721755981,165468:0.7261808216571808,155390:0.7190908193588257,177589:0.7147897183895111,236855:0.7087783813476562,166483:0.7067970633506775,178044:0.7061905562877655,236513:0.7040550112724304,164437:0.7010302543640137,259013:0.7009277641773224,164131:0.6993570923805237,167656:0.696305125951767,188381:0.6945757865905762',
 '98307:99831|99831:0.8204733431339264,110715:0.6560676395893097,90100:0.6312716007232666,121593:0.6059416830539703,87319:0.5337146818637848,206810:0.5054284930229187,93363:0.466127872467041,127820:0.4575258493423462,120715:0.442516565322876,124023:0.4391249418258667,82836:0.4380982518196106,125364:0.43417614698410034,81997:0.37885

In [221]:
with open(os.path.join(path, 'exported_rank.txt'), 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [222]:
'''
    With CNN print all embeddings zero and 2 epochs
    {'1 - recall_at_5': 0.13,
     '2 - recall_at_10': 0.18,
     '3 - recall_at_15': 0.22,
     '4 - recall_at_20': 0.24}
     Without relu activation for each feature siamese in 100 epochs
     {'1 - recall_at_5': 0.16,
     '2 - recall_at_10': 0.23,
     '3 - recall_at_15': 0.27,
     '4 - recall_at_20': 0.31}
     Without dense in the last layer with 100 epochs with embed trainable
     {'1 - recall_at_5': 0.16,
     '2 - recall_at_10': 0.22,
     '3 - recall_at_15': 0.26,
     '4 - recall_at_20': 0.3}
      
      {'1 - recall_at_5': 0.16,
         '2 - recall_at_10': 0.22,
         '3 - recall_at_15': 0.26,
         '4 - recall_at_20': 0.29,
         '5 - recall_at_25': 0.29}
    With title (100 padding) and desc (500 padding) and batch refactored
        {'1 - recall_at_5': 0.2,
         '2 - recall_at_10': 0.26,
         '3 - recall_at_15': 0.3,
         '4 - recall_at_20': 0.33,
         '5 - recall_at_25': 0.33}
         
         {'1 - recall_at_5': 0.2,
         '2 - recall_at_10': 0.27,
         '3 - recall_at_15': 0.31,
         '4 - recall_at_20': 0.34,
         '5 - recall_at_25': 0.34}
'''
evaluation = Evaluation()
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report

Evaluating...


{'1 - recall_at_5': 0.2,
 '2 - recall_at_10': 0.27,
 '3 - recall_at_15': 0.31,
 '4 - recall_at_20': 0.34,
 '5 - recall_at_25': 0.34}