# Bug triage with Deep Learning

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline

## Auxiliary methods

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Glove embeddings
GLOVE_DIR='data/embed'
# Save model
SAVE_PATH = 'baseline_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

#### Loading bug ids in memory

In [9]:
baseline.load_ids(DIR)
len(baseline.bug_ids)

212512

### Dicionário de títulos e descrições

In [10]:
%%time

baseline.load_preprocess()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


CPU times: user 1min 16s, sys: 2 s, total: 1min 18s
Wall time: 1min 18s


## Geração de batches

### Generating tiple of batches

In [11]:
%%time
baseline.prepare_dataset()

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data
CPU times: user 404 ms, sys: 16 ms, total: 420 ms
Wall time: 410 ms


In [12]:
baseline.load_bugs()

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




In [13]:
if 2521 in baseline.bug_set:
    baseline.bug_set[2521]

In [14]:
%%time

batch_size = 64
batch_size_test = 512

# we want a constant validation group to have a frame of reference for model performance
train_gen = baseline.siam_gen(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = baseline.batch_iterator(baseline.train_data, 
                                                                                          baseline.dup_sets_train, 
                                                                                          batch_size_test, 1)
test_gen = ([valid_input_sample['title'], valid_input_pos['title'], valid_input_neg['title'], 
             valid_input_sample['description'], valid_input_pos['description'], valid_input_neg['description'],
            valid_input_sample['info'], valid_input_pos['info'], valid_input_neg['info']], valid_sim)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]

CPU times: user 65.1 ms, sys: 8.07 ms, total: 73.2 ms
Wall time: 73.9 ms


In [15]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((512, 100), (512, 500), (512, 1682), (512,))

### Validar entrada

In [16]:
%%time 

baseline.display_batch(baseline.train_data, baseline.dup_sets_train, 5)

***Title***: organization actions in the global debug toolbar are enabled prematurely
***Title***: compatibility in organization the build and manage config buttons are disabled when editor has focus
***Description***: the actions in the global debug toolbar are enabled on startup if you click on them they become disabled they start off as being disabled in
***Description***: using person id juno create project hello world with the focus on the editor and no text selected the organization configurations button looks like compass and the person active configuration button hammer are disabled in the window toolbar if text is selected then the buttons are enabled it works correctly with organizationrc
***similar = 1
########################
***Title***: organization at at org eclipse ui keys person hash code person java
***Title***: unable to launch organization jdt
***Description***: attempting to install upon intial launch received this error in the log file is running fine session entr

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [17]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'word_vocab.pkl'))
for token in vocab:
    print(token)

vocabulary loaded
compressing
xcc
after
unzipped
cached
producturce
personhcg
tributi
capabilitypatterns
undesirable
idas
deserialization
zend
mnt
wk
patches
presenters
imodule
changesets
artworks
opm
amis
allready
impacting
dcec
countryboard
datacube
operational
run
personren
poped
modifie
wtpmodules
commenthandler
extent
ek
organizationbirt
dds
buildd
performs
importexport
showlocation
cloneable
bendpoints
packages
xhbsd
winstone
artworkr
essentialplatform
ejbreferences
silently
xbf
reworded
username
perfs
personorg
abd
panes
ish
tunneling
spare
plugincustomization
lastly
suffix
zz
defer
dbms
dell
effort
generates
gitroot
linenumber
fashion
elt
djacobs
positinumberd
shold
intercepted
mainly
organizationn
organizationright
lorganization
lambda
anoncvs
foldable
clarify
title
composer
relying
zlib
arnumberrk
became
three
personcontrol
touchpoint
enhydra
tracepoint
performer
organizationnode
swadhwa
planty
october
obstacles
ables
respect
organizationshell
gzip
sake
dispatch
ter
subclasse

reorganized
oshut
jabber
dren
depends
accumulated
unregister
originally
thoroughly
jb
containing
ord
ericsson
cã
janb
handle
caches
intending
reuse
uide
kiosk
planning
misplaced
process
parserbridge
chair
andrew
doubled
tjx
xwork
soapenc
organizationwizard
cone
acronyms
stereotyped
locationbuild
xlink
libcom
rejected
targ
jsflibrary
evidently
reference
subtask
libxpconnect
joining
gates
facilityr
overcome
libqtengine
wipes
strategies
gcda
precedence
crawling
retargettable
arraylength
idcode
organizationmake
prs
int
productand
msi
qvtr
rails
krb
failures
ie
restarting
nationalityable
reynolds
accollector
uncommenting
personpnl
ways
greatest
worldwind
translates
emsweb
impression
coexist
aaaab
ad
rocks
shortcomings
svnconnector
libicucore
basic
stepping
dubious
organizationradapter
pag
holding
augmented
getting
intertype
operand
quintron
facilitygurations
organizationadd
jtable
checkout
antcontrib
dimension
images
xtxt
simplifying
completely
slice
icontent
oci
gccscanner
sites
sqldev
sub

essentially
malformed
degradation
usagedata
tiled
soapclient
eike
contrary
beehive
interning
lenient
keycode
autosave
imo
prepopulated
rochade
vmnode
bec
faces
forbidden
libpipboot
aaf
ye
cienaoc
frees
visualizer
insignificant
personsr
advise
swtexception
eclipserun
disk
ejb
deps
arguably
retention
advance
marte
completions
comodification
fade
logging
furthermore
xi
dave
registers
chevron
webbrowser
duplicates
signify
organizationview
kindly
probescript
helpdata
megs
ugly
decrement
runnable
bffb
broader
cropped
between
editions
column
dailybuild
segoe
lint
vtp
blow
aligns
payment
eclpise
unreasonable
collected
techniques
maker
ion
personversion
cu
libgalaxy
tableviewer
lsof
snp
personlabel
jee
omry
realizations
expire
grave
dialogue
personifx
doubling
astjob
hierachy
zgvte
businessobjects
ncpwin
xffffffff
hotbug
quick
facilityj
eng
geographic
challenging
starterkit
xffffe
loggable
dennis
walking
libgnome
msafd
adapters
emfcompare
noreference
seeking
inconvenience
carried
intercept
bean

malloc
unedit
libra
excluding
wolips
scheduling
administration
libdispatch
hh
javax
roots
styling
infor
gap
autosar
slicer
persingleton
assign
adapting
hbx
pascal
avalidate
sqos
endpoints
officially
tremendously
rms
chfieldto
hibernatedoclet
corrects
surname
personhtml
teamswt
productecore
eclise
absence
initialized
agentcontroller
predefine
magic
anyway
swith
tuning
stated
strikethrough
faults
personclosed
lagno
libuconv
cope
retrieves
itself
organizationgetstatic
reviewer
jbo
optimal
redefined
thoughts
organizationock
zipfile
multipage
abcde
antrun
somepackage
nail
pointed
audio
libgklayout
illegible
cq
aaacc
behave
behaves
visits
unpublish
unfamiliar
dups
camera
benchmarking
log
relative
soap
gkz
pp
dictates
injection
gfx
reformat
perceived
exceeding
tascpt
spurious
cyclic
refrences
consistant
xmx
previous
brute
personrch
xestorganizationktrorganizatione
darins
einstellungen
collide
mad
ind
dougie
pipelined
prova
organizationcharacter
dial
redirected
organizationon
pack
dyn
recommen

wsspi
ffbfe
incorporating
wpe
eeimport
explains
contained
runtimes
somebody
webdefault
recommending
spy
compaction
jnienv
immediatly
assignment
oard
spend
umlnode
oddly
organizationemitter
booting
stores
execu
forcibly
tvf
isnâ
nsobject
scsi
solving
ak
viewtopic
jn
slowing
devices
organizationfactory
emfatic
undocked
dph
safepoint
econ
wpf
libmd
gcountryibc
cvsnt
ordering
lengthy
partitinumberr
organizationhead
parametric
one
qualifiers
gz
segmentation
suspend
consistency
javaâ
law
unkown
xsdorganizationa
bob
milestnumbers
irc
istructured
waste
exchange
cdocommit
persistently
personare
characters
ufacekit
whichever
unencrypted
formalize
sea
reclaimed
msimtf
midp
basebuilder
animations
rtpdesign
shrunk
analyzed
appsrv
alas
nworganization
selector
libudev
deprecating
twistie
exponent
restricted
unassigned
rip
lifting
dispose
continued
program
befor
clarity
rainbow
tahoma
cursor
forest
fail
valgrind
elsewhere
raleigh
ioexceptions
wsil
classcast
personlog
personmissing
dislike
fg
fullpath


deferring
xfrqvfa
mockito
tracked
enumerate
pulled
art
tech
histogram
presto
param
module
board
offers
grouped
redefines
evaluations
passphrase
qr
affinity
libz
facilityer
eproduct
unstable
iuproperty
replaceregexp
trafo
countryection
vsf
sol
prefixes
listboxes
late
moo
incquery
injecting
idl
teams
cg
extended
parenting
appending
improving
splitter
honest
nop
jsessionid
tcl
actually
android
jdev
flattened
organizationstructure
ihe
vx
xmlload
management
organizationreader
appendable
productzz
toobar
pqr
overridable
fwsl
organizationline
dmcontext
elaboration
jawin
ocument
closing
obey
capitalize
netrap
coretestapplication
mikhail
organizationmessages
further
serviceregistry
tip
stereotype
multibyte
eve
allocator
basename
findrefs
jst
stager
removed
granite
moments
uids
group
idfactory
jl
declaratively
worse
webtier
mylar
caa
ack
crtl
pae
classes
ute
scrubbing
mt
aaa
commiter
wid
sorganizationt
ac
xmlfile
echoed
dir
comparisons
exclusion
cancels
oprofile
user
happy
libraries
measurement


sdks
organizationdecorator
fu
funky
libmx
happily
testframework
ddlgenerator
reproducible
brackets
unneccessary
mem
localization
older
mergable
personcation
selectionactions
ffffff
poc
samlib
ue
eventhough
realized
homepath
complexity
persondll
spec
kmoir
completeclipse
organizationbug
suspect
lifetime
room
importantly
rpmstubby
denis
comming
nput
personegl
controlmode
personannotation
giopmsgheaders
grimeton
buggy
ago
dit
lowest
supplies
ñžð½
acce
captured
quit
facilityor
myplugin
canned
cosmos
normalized
parts
linewrap
attempt
tmatesoft
ernal
ottvm
wonderful
gard
personsh
never
pdomcppclass
contributions
val
cbe
teamcvsui
commit
remoteservices
xmlroot
simul
bstract
eucjp
destroyed
cdoclient
frequency
ticked
differentiating
unsaved
qs
intermittently
jcp
href
certificate
organizationfolder
breadcrumb
xl
libxul
personentity
curves
clears
round
igor
personif
comparision
numbers
ecma
hw
implication
mytag
dnow
edi
xmldocimport
symlinked
fiddling
bugs
rtag
hotmail
issuing
threading
regex
wa

potential
installers
spikes
stapler
orbutil
dx
oome
datas
draws
productger
extracts
bleeding
incompletely
bundle
platforms
ebfc
rerunning
indention
personhad
migrator
execution
jaxpperson
organizationgen
epi
indexer
highlighters
gyrex
iter
contentassist
employ
xft
domadapter
gronback
entail
asterisks
isc
doctype
gmfgen
turkish
unter
sampledb
maturity
sqllib
buildfile
easie
happens
ros
organizationrun
sprint
sigbcountry
ejp
personad
objective
organizationbuilder
xfdd
intensive
displaced
generic
bitwise
estring
inits
units
agilemore
percents
adaptable
recipient
term
ireporter
clnumberd
organizationfeature
flusher
verbose
hotspot
antialias
xmlrpc
reroot
recognise
paragraphs
oauth
ron
modeled
loops
legal
disclosure
tefilter
awful
volume
syslog
ioutil
jspmodel
zfl
actived
coordinator
uilder
dup
drwxr
moc
emfvm
encryption
public
following
safely
balloon
sx
dict
ijob
reparent
arbitrary
zclipse
fingerprint
sizing
shortened
chkpii
neighbours
drag
fanfare
creation
submodel
fzk
sans
personstyle
l

mass
organizationfetch
regiondept
nq
bjal
emp
shortcoming
embeddable
devstudio
dbutil
weakness
indexes
traverses
buttom
shading
instances
preinitialization
hearing
lenght
recalculate
virus
libart
adversely
compilerarg
output
ideworkbench
msltransactional
seeing
toolinggtk
organizationdo
she
commands
personbinary
xtend
throwing
advising
castortech
fsm
strips
ithread
testapp
actionhandler
mcall
cdtbuild
upgrades
aaacb
doing
projekte
telecom
feedburner
behaving
something
absolutely
locationrel
jdtcoresrc
ijjjljava
ctxhelp
gr
arises
wtpbuilder
foliofn
emitted
screencapture
pdftable
personen
multicast
anyedit
combination
dag
benign
amazon
blooper
personkbench
provoke
ttt
memmove
xxxxxxxxxx
behaviours
uvm
rasapi
ebfd
checkmarks
strict
undeploy
managedbuilder
high
glite
allocating
switcher
reposition
overwrites
targetted
localized
classloader
gets
nlv
withaxes
reroute
organizationhas
remotely
deprecate
assp
daniel
organizationlinenumber
whose
alex
power
prime
multiselect
kd
corganization
pare

clarified
liberace
question
programm
exemple
sitraka
jsps
flflmj
key
applets
admittedly
hierarchial
seeded
unfilter
trc
procedures
mdrs
uiimpl
dex
backing
pq
ocl
personmain
yf
manage
wli
gone
propertly
insets
lowercase
emfvisitable
continually
preload
stems
repack
noise
invokevirtual
successful
voice
refui
messenger
explanations
jstojava
unlike
subpack
onclick
aaaaaf
tcp
neglected
cyclical
inject
blade
reselect
devs
landmarks
tps
xaf
fvt
itype
french
commiters
bss
sel
swg
hacky
eresource
traversing
earcomponent
ownable
epl
construct
buildresults
directed
nsautorelease
smarter
organizationapp
curently
mounting
ampersand
normaliz
wireadmin
setconfig
dominator
resutls
meantime
torolab
clumsy
tj
olivier
organizationthere
hybrid
mneumonic
ras
fallback
crashs
succeeded
xplanner
organizationc
showcase
modelqueryimpl
lexer
tao
cart
dereference
dependency
evil
fdd
xfb
subscribe
xmp
benefits
renaming
august
id
facilityis
inadvertently
svnkit
lib
transactions
cocoon
tooling
introduces
wstx
arbitr

bsiag
qos
imagine
stays
jpl
initmenupopup
eck
personhaving
ccb
libjar
sould
identitymaps
persontrol
honor
showed
unfolded
absorbed
starting
whereby
distinguishing
servletbridge
barrier
randy
casted
customized
urlfactory
completition
tu
rth
persontrying
iterate
urn
regenerated
jacoco
personsor
futher
integrates
visualizing
sqljet
hoehe
liblayout
reportdesign
glossary
connected
organizationout
trunc
feat
fetching
personsource
truncate
ows
organizationcontext
backwards
volatile
znumber
guides
daze
jdbcdriver
vanish
smoothly
receiving
rapid
toto
decrease
bpmn
servant
poorly
opposite
xbd
spinner
countrying
grcad
organizationsp
extensoes
judge
angle
immodules
personsince
inspired
pyramid
multithreaded
mtez
sorters
expection
gnucppsource
instruments
mango
destinations
genkey
informix
contr
coping
ltr
exceeds
inverted
mtime
suppliers
adaptors
uu
personfor
adamb
ning
za
constellation
personing
continue
fefc
egl
personplugins
favor
comprised
iostream
thinking
jdbctable
dea
beleive
noinstantiate


celadon
wsock
parallel
argument
ethernet
organizations
tutorial
externalization
galaxy
classnames
comparemerge
opt
discarding
dlcl
ielement
wcag
delegate
implement
systemtype
profiles
casts
maxdb
net
sfilter
misspelling
forcing
personof
submit
bbc
privileges
popping
poapackage
traditionally
personcompiler
exisiting
clibmod
consequence
axiom
llvm
tricky
emfoperation
speed
breadcrumbs
icorganization
targetvm
cristina
cstnode
smarty
measures
stress
ahead
ffbeb
propert
cppastparameter
squiggly
attrs
contextlaunching
contains
mqisdp
greenpages
persisting
mycom
leave
smalltalk
finfox
wldap
jpqlorganization
arising
unknow
designtime
embeded
paramater
osgiliath
year
aan
gestures
samba
personpix
admit
supertype
aims
organizationlist
baked
xmls
reorg
ename
unable
libj
formally
trackpad
xmlany
tsccos
requestor
javapp
epub
forge
fop
productor
accumulates
nationalityed
compose
hoped
confirms
can
ebx
synonym
papyrus
designator
agentpath
datasets
studies
invariably
exemplarysetup
automation
itask
gfo

apiscanner
say
plattform
brought
launcher
adteractive
rpmbuild
ghost
patching
firewall
metadata
bject
symmetric
locators
constants
portal
std
geis
ecli
quote
seconds
sub
buil
safari
srcs
generalizations
accounted
proceed
mercury
ifeq
cyan
buid
bundlor
textbox
mountainminds
vista
mbcs
cpclass
eearchive
document
orangevolt
midi
imagehlp
overlook
personabout
registry
apidocs
attachedbinary
branched
mst
jaxb
somenumber
agree
stability
ase
ew
zzzz
once
misunderstood
eannotations
organizationabstract
build
memorybrowser
conversely
cglib
phwetzler
tml
pixel
iousers
mtax
opposites
questionnaire
organizationscanner
nobody
orders
xtext
jslibrary
removing
targetting
versioncheck
tracwiki
mirroring
navi
iworkbench
gnwoc
slhau
months
timely
inte
wenn
thing
zipped
nominate
reformatting
geclipse
resultant
space
swallowing
crash
generated
organizationcore
role
vh
jrockit
taskdefs
gear
aopalliance
fileupload
thoughtworks
birt
irq
pdfpage
eclipsebuilder
compilance
dcterms
fil
places
dist
selinux
asked
e

facilityguration
personhandle
flexibility
sting
provi
edatatype
cff
validations
selectively
intent
matlab
stubbed
responsibility
become
wms
chrisaniszczyk
mailto
negotiate
mosaic
cppclass
mistake
cview
elias
organizationtranslation
interaction
disengage
work
libexpat
inavigator
domkit
px
perforce
saxon
cvsssh
ximian
dita
ctc
ear
astprovider
adaptive
schwerwiegend
castdeclarator
edithelper
xtab
includepath
specific
mouseup
anf
soarclient
consistantly
browse
jsdebug
personinstalled
junghan
researching
joerg
erro
xdoclet
displaykit
organizationmessage
organizationner
aniefer
semi
globus
jsch
environments
cluttering
libbonoboui
fusion
catched
che
kamil
codefarm
odaconsumer
xmlcomposite
exceptional
gresham
hsqldb
personstart
reimporting
iclasspath
pipelet
unfortunatly
realizes
downloadable
control
persondc
personef
organizationsocket
bull
deliverable
persontext
organizationworks
fcf
aspektor
sef
quickaccess
bution
fj
bindings
clientserver
troublesome
bucky
rationalsdp
emftest
deutil
refused

In [18]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 19998'

In [19]:
%%time

baseline.generating_embed(GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.
vocabulary loaded
Number of OOV words: 19998
CPU times: user 1min 21s, sys: 3.08 s, total: 1min 25s
Wall time: 1min 23s


## Experiment

### Training and evaluating for each epoch at same time

#### Auxiliary methods train experiment siamese

In [20]:
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

In [21]:
retrieval = Retrieval()

path = 'data/processed/{}'.format(DOMAIN)
path_buckets = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)
path_train = 'data/processed/{}/train.txt'.format(DOMAIN)
path_test = 'data/processed/{}/test.txt'.format(DOMAIN)

MAX_SEQUENCE_LENGTH_I = number_of_columns_info # Status, Severity, Version, Component, Module

# Create the instance from baseline
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data


HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))


Creating the queries...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Hashing bugs by buckets

In [22]:
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    issues_by_buckets[bucket] = bucket
    for issue in np.array(retrieval.buckets[bucket]).tolist():
        issues_by_buckets[issue] = bucket

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




#### Model to vectorize

In [23]:
from keras.models import Model
from keras.layers import Input
from keras.models import load_model

def get_model_vectorizer(path=None, loaded_model=None):
    if(path):
        loaded_model = load_model(os.path.join("modelos", "model_{}.h5".format(path)), 
                                    {'l2_normalize' : l2_normalize, 
                                     'margin_loss' : margin_loss,
                                     'pos_distance' : pos_distance,
                                     'neg_distance' : neg_distance,
                                     'stack_tensors': stack_tensors})
    
    
    bug_title =  loaded_model.get_layer('title_in').input 
    bug_desc =  loaded_model.get_layer('desc_in').input 
    bug_info = loaded_model.get_layer('info_in').input 

    title_encoder = loaded_model.get_layer('FeatureLstmGenerationModel')
    desc_encoder = loaded_model.get_layer('FeatureCNNGenerationModel')
    info_encoder = loaded_model.get_layer('FeatureMlpGenerationModel')

    bug_t = title_encoder(bug_title)
    bug_d = desc_encoder(bug_desc)
    bug_i = info_encoder(bug_info)
    # Representation layer
    merge_layer = loaded_model.get_layer('merge_features_in')
    output = merge_layer([bug_i, bug_t, bug_d])
    # Bug Representation
    # bug_model = loaded_model.get_layer('dense_4')
    # output = bug_model(output)

    model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
    
    return model

#### Getting the list of candidates

In [24]:
def indexing_test(annoy, queries_test_vectorized, verbose=1):
    X_test = queries_test_vectorized
    distance_test, indices_test = [], []
    loop = enumerate(X_test)
    if(verbose):
        loop = tqdm(enumerate(X_test))
        loop.set_description('Getting the list of candidates from queries')
    for index, row in loop:
        vector = row['vector']
        rank, dist = annoy.get_nns_by_vector(vector, 30, include_distances=True)
        indices_test.append(rank)
        distance_test.append(1 - np.array(dist)) # normalize the similarity between 0 and 1
    if(verbose): loop.close()
    return X_test, distance_test, indices_test

#### Indexing bugs

In [25]:
# Indexing all train
def indexing_train(buckets_train_vectorized, verbose=1):
    X = np.array(buckets_train_vectorized)
    annoy = AnnoyIndex(X[0]['vector'].shape[0])  # Length of item vector that will be indexed

    loop = total=len(X)
    if(verbose):
        loop = tqdm(total=len(X))
        loop.set_description("Indexing buckets in annoy")
    for index, row in enumerate(X):
        vector = row['vector']
        annoy.add_item(index, vector)
        if(verbose): loop.update(1)
    if(verbose): loop.close()
    annoy.build(10) # 10 trees
    return annoy

#### Rank result

In [26]:
def rank_result(buckets_train_vectorized, indices_test, distance_test, verbose=1):
    formated_rank = []
    loop = zip(indices_test, distance_test)
    if(verbose):
        loop = tqdm(zip(indices_test, distance_test))
        loop.set_description('Generating the rank')
    for row_index, row_sim in loop:
        row_index, row_sim = row_index[:25], row_sim[:25]
        formated_rank.append(",".join(["{}:{}".format(buckets_train_vectorized[index]['bug_id'], sim) 
                                       for index, sim in zip(row_index, row_sim)]))
    if(verbose): loop.close()
    return formated_rank

#### Queries

In [27]:
# Generating the rank result
def formating_rank(X_test, verbose=1):
    rank_queries = []
    loop = enumerate(X_test)
    if(verbose):
        loop = tqdm(enumerate(X_test))
        loop.set_description('Generating the queries from rank')
    for index, row in loop:
        dup_a, ground_truth = row['bug_id'], row['ground_truth']
        rank_queries.append("{}:{}".format(dup_a, ground_truth))
    if(verbose): loop.close()
    return rank_queries

In [28]:
def export_rank(rank_queries, formated_rank, verbose=1):
    exported_rank = []
    loop = len(rank_queries)
    if(verbose):
        loop = tqdm(total=len(rank_queries))
        loop.set_description('Exporting the rank')
    for query, rank in zip(rank_queries, formated_rank):
        exported_rank.append("{}|{}".format(query, rank))
        if(verbose): loop.update(1)
    if(verbose): loop.close()
    return exported_rank

#### Methods to evaluate each epoch

In [29]:
def vectorizer_buckets(verbose, model, buckets, buckets_data):
    embed_buckets = model.predict(buckets_data)
    loop = enumerate(embed_buckets)
    if(verbose):
        loop = tqdm(enumerate(embed_buckets))
        loop.set_description("Vectorizing buckets batch")
    buckets_vectorized = []
    for index, vector in loop:
        buckets_vectorized.append({ 'bug_id': buckets[index],  'vector': vector })
    if(verbose): loop.close()
    return buckets_vectorized
    

def vectozing_validation_batch(verbose, model, queries_test_vectorized, queries_data):
    embed_validation = model.predict(queries_data)
    loop = enumerate(embed_validation)
    if(verbose):
        loop = tqdm(enumerate(embed_validation))
        loop.set_description("Vectorizing validation batch")
    for index, vector in loop:
        bug_id = queries_test_vectorized[index]['bug_id']
        if issues_by_buckets[bug_id] == bug_id: continue # if the bug is the master
        queries_test_vectorized[index]['vector'] = vector
    if(verbose): loop.close()
    return queries_test_vectorized

def get_validation_batch(verbose, queries_validation):
    bug_set = baseline.get_bug_set()
    queries_test_vectorized, queries_data = [], []
    title_data, desc_data, info_data = [], [], [] 
    loop = queries_validation
    if(verbose):
        loop = tqdm(queries_validation)
        loop.set_description("Getting validation batch")
    for bug_id in loop:
        if issues_by_buckets[bug_id] == bug_id: continue # if the bug is the master
        bug = bug_set[bug_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
        queries_test_vectorized.append({ 'bug_id' : bug_id, 'ground_truth': issues_by_buckets[bug_id] })
    if(verbose): loop.close()
    queries_data = [ np.array(title_data), np.array(desc_data), np.array(info_data) ]
    return queries_test_vectorized, queries_data

def get_buckets_from_validation(verbose, validation_data):
    bug_set = baseline.get_bug_set()
    buckets = set()
    title_data, desc_data, info_data = [], [], []
    loop = validation_data
    if(verbose):
        loop = tqdm(validation_data)
        loop.set_description("Reading buckets from validation batch")
    for row in loop:
        bug_anchor, bug_pos, bug_neg = row
        vectorizer = [bug_anchor, bug_pos, bug_neg]
        for test_bug_id in vectorizer:
            buckets.add(issues_by_buckets[test_bug_id])
    for bucket_id in buckets:
        bug = bug_set[bucket_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
    buckets_data = [ np.array(title_data), np.array(desc_data), np.array(info_data) ]
    if(verbose): loop.close()
    return list(buckets), buckets_data

def get_validation_ids(verbose, validation_data):
    validation_bugs = []
    loop = validation_data
    if(verbose):
        loop = tqdm(validation_data)
        loop.set_description('Reading the bug ids from duplicates in validation')
    for row in loop:
        bug_anchor, bug_pos, bug_neg = row
        validation_bugs.append(bug_anchor)
        validation_bugs.append(bug_pos)
    if(verbose): loop.close()
    return validation_bugs

def get_loss_validation(model, queries_data, test_sim):
    # print(len(queries_data), queries_data[0].shape)
    return model.test_on_batch([ queries_data[0].tolist(), queries_data[1].tolist(), queries_data[2].tolist() ], test_sim)

def evaluate_validation_test(verbose, loaded_model, test_gen, 
                             batch_triplets_valid, buckets, buckets_data, 
                             queries_test_vectorized, queries_data, test, issues_by_buckets):
    test_batch, test_sim = test_gen
    model = get_model_vectorizer(loaded_model=loaded_model)
    #evaluation_test_batch = get_loss_validation(model, queries_data, test_sim)
    #print(evaluation_test_batch)
    
    # Validation
    buckets_vectorized = vectorizer_buckets(verbose, model, buckets, buckets_data)
    queries_vectorized = vectozing_validation_batch(verbose, model, queries_test_vectorized, queries_data)
    annoy = indexing_train(buckets_vectorized, verbose)
    X_test, distance_test, indices_test = indexing_test(annoy, queries_vectorized, verbose)
    formated_rank = rank_result(buckets_vectorized, indices_test, distance_test, verbose)
    rank_queries = formating_rank(X_test, verbose)
    exported_rank = export_rank(rank_queries, formated_rank, verbose)
    evaluation = Evaluation(verbose)
    validation_recall = evaluation.evaluate(exported_rank)['5 - recall_at_25']
    # Test 
#     buckets_test_vectorized = vectorizer_buckets_train(model, buckets_train, verbose)
#     queries_test_vectorized = vectorize_queries(model, test, issues_by_buckets)
#     annoy = indexing_train(buckets_test_vectorized, verbose)
#     X_test, distance_test, indices_test = indexing_test(annoy, queries_test_vectorized, verbose)
#     formated_rank = rank_result(buckets_test_vectorized, indices_test, distance_test, verbose)
#     rank_queries = formating_rank(X_test, verbose)
#     exported_rank = export_rank(rank_queries, formated_rank, verbose)
#     evaluation = Evaluation(verbose)
#     test_recall = evaluation.evaluate(exported_rank)['5 - recall_at_25']

    test_recall = 0
    
    # validation_recall@25, test_recall@25, loss, cosine_positive, cosine_negative
    return validation_recall, test_recall
    #return report['5 - recall_at_25'], evaluation_test_batch[0], evaluation_test_batch[1], evaluation_test_batch[2] 

In [30]:
class Evaluation():
    def __init__(self, verbose=1):
        self.verbose = verbose
    
    """
        Rank recall_rate_@k
        rank = "query:master|master:id:sim,master:id:sim"
    """
    def top_k_recall(self, rank, k):
        query, rank = rank.split('|')
        query_dup_id, query_master = query.split(":")
        query_master = int(query_master)
        rank_masters = [int(item.split(':')[0]) for pos, item in enumerate(rank.split(",")[:25])]
        corrects = len(set([query_master]) & set(rank_masters[:k]))
        #total = len(retrieval.buckets[issues_by_buckets[query_master]])
        total = 1
        return float(corrects), total

    def evaluate(self, path):
        self.recall_at_5_corrects_sum, self.recall_at_10_corrects_sum, \
        self.recall_at_15_corrects_sum, self.recall_at_20_corrects_sum, self.recall_at_25_corrects_sum = 0, 0, 0, 0, 0
        self.recall_at_5_total_sum, self.recall_at_10_total_sum, self.recall_at_15_total_sum, \
        self.recall_at_20_total_sum, self.recall_at_25_total_sum = 0, 0, 0, 0, 0 
        if(self.verbose):
            print("Evaluating...")
        if type(path) == str:
            with open(path, 'r') as file_input:
                for row in file_input:
                    self.recall(row)
        else:
            for row in path:
                self.recall(row)
        
        report = {
            '1 - recall_at_5' : round(self.recall_at_5_corrects_sum / self.recall_at_5_total_sum, 2),
            '2 - recall_at_10' : round(self.recall_at_10_corrects_sum / self.recall_at_10_total_sum, 2),
            '3 - recall_at_15' : round(self.recall_at_15_corrects_sum / self.recall_at_15_total_sum, 2),
            '4 - recall_at_20' : round(self.recall_at_20_corrects_sum / self.recall_at_20_total_sum, 2),
            '5 - recall_at_25' : round(self.recall_at_25_corrects_sum / self.recall_at_25_total_sum, 2)
        }

        return report
    def recall(self, row):
        #if row == '': continue
        self.recall_at_5_corrects, self.recall_at_5_total = self.top_k_recall(row, k=5)
        self.recall_at_10_corrects, self.recall_at_10_total = self.top_k_recall(row, k=10)
        self.recall_at_15_corrects, self.recall_at_15_total = self.top_k_recall(row, k=15)
        self.recall_at_20_corrects, self.recall_at_20_total = self.top_k_recall(row, k=20)
        self.recall_at_25_corrects, self.recall_at_25_total = self.top_k_recall(row, k=25)

        self.recall_at_5_corrects_sum += self.recall_at_5_corrects
        self.recall_at_10_corrects_sum += self.recall_at_10_corrects
        self.recall_at_15_corrects_sum += self.recall_at_15_corrects
        self.recall_at_20_corrects_sum += self.recall_at_20_corrects
        self.recall_at_25_corrects_sum += self.recall_at_25_corrects

        self.recall_at_5_total_sum += self.recall_at_5_total
        self.recall_at_10_total_sum += self.recall_at_10_total
        self.recall_at_15_total_sum += self.recall_at_15_total
        self.recall_at_20_total_sum += self.recall_at_20_total
        self.recall_at_25_total_sum += self.recall_at_25_total

#### Save the model

In [31]:
def save_model(model, name):
    m_dir = os.path.join('modelos')
    if not os.path.exists(m_dir):
        os.mkdir(m_dir)
    model.save(os.path.join(m_dir, "model_{}.h5".format(name)))
    print("Saved model to disk")

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [32]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    embedding_layer = Embedding(num_words,
                                  embedding_dim,
                                  name='embedding_layer',
                                  weights=[embeddings],
                                  embeddings_constraint=MaxNorm(max_value=1, axis=0),
                                  input_length=max_sequence_length,
                                  trainable=trainable)
    return embedding_layer

### CNN with filter 3,4,5

In [33]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):

    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    # best combination filter (3, 4, 5) e 128 e 256
    convs = []
    filter_sizes = [3, 4, 5]
    n_filters = 64

    for index, filter_size in enumerate(filter_sizes):
        l_conv = Conv1D(filters=n_filters, kernel_size=filter_size)(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=filter_size)(l_conv) # index+1
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    # add a 1D convnet with global maxpooling, instead of Yoon Kim model
    #conv = Conv1D(filters=n_filters * 3, kernel_size=3)(l_merge)
    layer = GlobalAveragePooling1D()(l_merge)
    #layer = Flatten()(l_merge)
    layer = Dense(100, activation='tanh')(layer)
    #layer = LeakyReLU()(layer)

    cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible

    return cnn_feature_model

### Bi-LSTM

In [34]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D

def lstm_model(embedding_layer, max_sequence_length):
    number_lstm_units = 50
    rate_drop_lstm = 0
    recurrent_dropout = 0

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Creating LSTM Encoder
#     lstm_layer = Bidirectional(LSTM(number_lstm_units, return_sequences=True), # dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm 
#                                merge_mode='ave')

    lstm_layer = LSTM(number_lstm_units, return_sequences=True)(embedded_sequences)
    layer = LSTM(number_lstm_units)(lstm_layer)

    #layer = lstm_layer(embedded_sequences)
    #layer = GlobalAveragePooling1D()(layer)
    layer = Dense(100, activation='tanh')(layer)

    lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible

    return lstm_feature_model

### MLP

In [35]:
def mlp_model(input_size):
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    input_size = 100
    
    layer = Dense(input_size, activation='tanh')(info_input)
    
    mlp_feature_model = Model(inputs=[info_input], outputs=[layer], name = 'FeatureMlpGenerationModel')
    
    return mlp_feature_model

### Siamese model

In [36]:
from keras import backend as K
import tensorflow as tf

def l2_normalize(x, axis):
    norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=False))
    return x, K.maximum(norm, K.epsilon())

def normalize(x):
    return l2_normalize(x, axis=-1)
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    x, y = inputs
    x, x_norm = l2_normalize(x, axis=-1)
    y, y_norm = l2_normalize(y, axis=-1)
    distance = K.sum( x * y, axis=-1) / (x_norm * y_norm)
    distance = (distance + K.constant(1)) / K.constant(2)
    # Distance goes from 0 to 2 in theory, but from 0 to 1 if x and y are both
    # positive (which is the case after ReLU activation).
    return distance

def margin_loss(y_true, y_pred):
    margin = K.constant(1.0)
    loss = K.maximum(0.0, margin - y_pred[0] +  y_pred[1])
    return K.mean(loss)

def pos_distance(y_true, y_pred):
    return K.mean(y_pred[0])

def neg_distance(y_true, y_pred):
    return K.mean(y_pred[1])

def stack_tensors(vects):
    return K.stack(vects)

In [37]:
from keras.initializers import TruncatedNormal
from keras.regularizers import l2

def residual_bug():
    def block(block_input):
        shape_size = K.int_shape(block_input)[1]
        
        residual =  block_input
        
        layer_out = Dense(shape_size // 2, activation='tanh')(block_input)
        
        skip_out =  Dense(shape_size, activation='linear', use_bias=False)(layer_out)
        # kernel_initializer=TruncatedNormal(mean=0.0, stddev=0.05, 
         #             seed=42), kernel_regularizer=l2(0.01)
        
        dense_out =  Dense(shape_size, activation='linear', use_bias=False)(layer_out)
        
        block_out =   Add()([residual, dense_out])
        return block_out, skip_out
    return block

In [38]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d):
  
    bug_t_in = Input(shape = (sequence_length_t, ), name = 'title_in')
    bug_t_pos = Input(shape = (sequence_length_t, ), name = 'title_pos')
    bug_t_neg = Input(shape = (sequence_length_t, ), name = 'title_neg')

    bug_d_in = Input(shape = (sequence_length_d, ), name = 'desc_in')
    bug_d_pos = Input(shape = (sequence_length_d, ), name = 'desc_pos')
    bug_d_neg = Input(shape = (sequence_length_d, ), name = 'desc_neg')
    
    bug_i_in = Input(shape = (sequence_length_info, ), name = 'info_in')
    bug_i_pos = Input(shape = (sequence_length_info, ), name = 'info_pos')
    bug_i_neg = Input(shape = (sequence_length_info, ), name = 'info_neg')

    bug_t_in_feat_lstm = lstm_feature_model(bug_t_in)
    bug_t_pos_feat_lstm = lstm_feature_model(bug_t_pos)
    bug_t_neg_feat_lstm = lstm_feature_model(bug_t_neg)

    bug_d_in_feat_cnn = cnn_feature_model(bug_d_in)
    bug_d_pos_feat_cnn = cnn_feature_model(bug_d_pos)
    bug_d_neg_feat_cnn = cnn_feature_model(bug_d_neg)
    
    bug_i_in_feat_mlp = mlp_feature_model(bug_i_in)
    bug_i_pos_feat_mlp = mlp_feature_model(bug_i_pos)
    bug_i_neg_feat_mlp = mlp_feature_model(bug_i_neg)

#     encoded_anchor = Add(name = 'merge_features_in')([bug_i_in_feat_mlp, bug_t_in_feat_lstm, bug_d_in_feat_cnn])
#     encoded_positive = Add(name = 'merge_features_pos')([bug_i_pos_feat_mlp, bug_t_pos_feat_lstm, bug_d_pos_feat_cnn])
#     encoded_negative = Add(name = 'merge_features_neg')([bug_i_neg_feat_mlp, bug_t_neg_feat_lstm, bug_d_neg_feat_cnn])
    
    encoded_anchor = concatenate([bug_i_in_feat_mlp, bug_t_in_feat_lstm, bug_d_in_feat_cnn], name = 'merge_features_in')
    encoded_positive = concatenate([bug_i_pos_feat_mlp, bug_t_pos_feat_lstm, bug_d_pos_feat_cnn], name = 'merge_features_pos')
    encoded_negative = concatenate([bug_i_neg_feat_mlp, bug_t_neg_feat_lstm, bug_d_neg_feat_cnn], name = 'merge_features_neg')
    
#     encoded_anchor_1a, encoded_anchor_1b  = residual_bug()(encoded_anchor)
#     encoded_anchor_2a, encoded_anchor_2b  = residual_bug()(encoded_anchor_1a)
    
#     encoded_positive_1a, encoded_positive_1b = residual_bug()(encoded_positive)
#     encoded_positive_2a, encoded_positive_2b = residual_bug()(encoded_positive_1a)
    
#     encoded_negative_1a, encoded_negative_1b = residual_bug()(encoded_negative)
#     encoded_negative_2a, encoded_negative_2b = residual_bug()(encoded_negative_1a)
    
#     encoded_anchor = Add()([encoded_anchor_1b, encoded_anchor_2b])
#     encoded_positive = Add()([encoded_positive_1b, encoded_positive_2b])
#     encoded_negative = Add()([encoded_negative_1b, encoded_negative_2b])
    
#     encoded_anchor = Activation('tanh')(encoded_anchor)
#     encoded_positive = Activation('tanh')(encoded_positive)
#     encoded_negative = Activation('tanh')(encoded_negative)
    # Bug representation layer
#     encoded_anchor = Dense(100, activation='tanh')(encoded_anchor)
#     encoded_positive = Dense(100, activation='tanh')(encoded_positive)
#     encoded_negative = Dense(100, activation='tanh')(encoded_negative)
    
    # Cosine
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([encoded_anchor, encoded_positive])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([encoded_anchor, encoded_negative])
    
    # Loss function only works with a single output
    output = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances',
        output_shape=(2, 1)
    )([positive_d, negative_d])
  
    similarity_model = Model(inputs = [bug_t_in, bug_t_pos, bug_t_neg, 
                                       bug_d_in, bug_d_pos, bug_d_neg, 
                                       bug_i_in, bug_i_pos, bug_i_neg], 
                           outputs = output, name = 'Similarity_Model')
    
    optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=K.epsilon(), schedule_decay=0.01)
    #optimizer = 'adam'
    
    # setup the optimization process 
    similarity_model.compile(optimizer=optimizer, loss=margin_loss, metrics=[pos_distance, neg_distance])

    return similarity_model

In [39]:
%%time
import keras

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Embeddings
cnn_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=True)
lstm_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=True)

# Feature models
cnn_feature_model = cnn_model(cnn_embedding_layer, MAX_SEQUENCE_LENGTH_D)
lstm_feature_model = lstm_model(lstm_embedding_layer, MAX_SEQUENCE_LENGTH_T)
mlp_feature_model = mlp_model(number_of_columns_info)

# Similarity model
similarity_model = siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


'''
    Configuration
'''
epochs = 1000
best_recall = 0
best_epoch = 0
verbose = 0

# Pre load validation
buckets, buckets_data = get_buckets_from_validation(verbose, batch_triplets_valid)
queries_validation = get_validation_ids(verbose, batch_triplets_valid)
queries_test_vectorized, queries_data = get_validation_batch(verbose, queries_validation)

'''
    Experiment
'''
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = baseline.batch_iterator(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
    train_batch = [train_input_sample['title'], train_input_pos['title'], train_input_neg['title'], 
                 train_input_sample['description'], train_input_pos['description'], train_input_neg['description'],
                train_input_sample['info'], train_input_pos['info'], train_input_neg['info']]
    
    h = similarity_model.train_on_batch(train_batch, train_sim)
    validation_recall, test_recall = evaluate_validation_test(verbose, similarity_model,
                                                                                test_gen, batch_triplets_valid, 
                                                                                    buckets, buckets_data, 
                                                                                        queries_test_vectorized, queries_data,
                                                                                         retrieval.test, issues_by_buckets)
    print("Epoch: {} - Loss: {:.2f}, positive_cosine: {:.2f}, negative_cosine: {:.2f}, validation_recall@25: {:.2f}, test_recall@25: {:.2f}".format(
        epoch+1, h[0], h[1], h[2], validation_recall, test_recall))
    if validation_recall > best_recall:
        save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
        best_recall = validation_recall
        best_epoch = epoch+1
    # Without step decay for each 10 epochs because the Adam optimizer already do this
    # https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1
print('Best_epoch={}, Best_recall={:.2f}'.format(best_epoch, best_recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
info_pos (InputLayer)           (None, 1682)         0                                            
__________________________________________________________________________________________________
title_pos 

Epoch: 22 - Loss: 0.68, positive_cosine: 0.83, negative_cosine: 0.51, validation_recall@25: 0.55, test_recall@25: 0.00
Epoch: 23 - Loss: 0.69, positive_cosine: 0.83, negative_cosine: 0.52, validation_recall@25: 0.56, test_recall@25: 0.00
Epoch: 24 - Loss: 0.66, positive_cosine: 0.84, negative_cosine: 0.50, validation_recall@25: 0.57, test_recall@25: 0.00
Saved model to disk
Epoch: 25 - Loss: 0.65, positive_cosine: 0.86, negative_cosine: 0.51, validation_recall@25: 0.56, test_recall@25: 0.00
Epoch: 26 - Loss: 0.68, positive_cosine: 0.83, negative_cosine: 0.51, validation_recall@25: 0.57, test_recall@25: 0.00
Epoch: 27 - Loss: 0.68, positive_cosine: 0.85, negative_cosine: 0.53, validation_recall@25: 0.59, test_recall@25: 0.00
Saved model to disk
Epoch: 28 - Loss: 0.66, positive_cosine: 0.83, negative_cosine: 0.50, validation_recall@25: 0.60, test_recall@25: 0.00
Saved model to disk
Epoch: 29 - Loss: 0.66, positive_cosine: 0.85, negative_cosine: 0.51, validation_recall@25: 0.60, test_reca

Epoch: 90 - Loss: 0.55, positive_cosine: 0.93, negative_cosine: 0.49, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 91 - Loss: 0.59, positive_cosine: 0.91, negative_cosine: 0.50, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 92 - Loss: 0.61, positive_cosine: 0.89, negative_cosine: 0.50, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 93 - Loss: 0.56, positive_cosine: 0.93, negative_cosine: 0.49, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 94 - Loss: 0.58, positive_cosine: 0.89, negative_cosine: 0.47, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 95 - Loss: 0.61, positive_cosine: 0.91, negative_cosine: 0.52, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 96 - Loss: 0.52, positive_cosine: 0.93, negative_cosine: 0.45, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 97 - Loss: 0.63, positive_cosine: 0.90, negative_cosine: 0.53, validation_recall@25: 0.71, test_recall@25: 0.00
Saved model to disk
Epoch: 98 - Loss: 0.56, posi

Epoch: 158 - Loss: 0.52, positive_cosine: 0.93, negative_cosine: 0.45, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 184 - Loss: 0.53, positive_cosine: 0.96, negative_cosine: 0.49, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 185 - Loss: 0.52, positive_cosine: 0.92, negative_cosine: 0.44, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 186 - Loss: 0.53, positive_cosine: 0.93, negative_cosine: 0.46, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 187 - Loss: 0.52, positive_cosine: 0.96, negative_cosine: 0.48, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 188 - Loss: 0.49, positive_cosine: 0.94, negative_cosine: 0.44, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 189 - Loss: 0.64, positive_cosine: 0.92, negative_cosine: 0.56, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 190 - Loss: 0.51, positive_cosine: 0.96, negative_cosine: 0.47, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 191 - Loss: 0.54, positive_cosine

Epoch: 252 - Loss: 0.48, positive_cosine: 0.98, negative_cosine: 0.46, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 253 - Loss: 0.53, positive_cosine: 0.93, negative_cosine: 0.46, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 254 - Loss: 0.56, positive_cosine: 0.97, negative_cosine: 0.52, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 255 - Loss: 0.48, positive_cosine: 0.94, negative_cosine: 0.42, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 256 - Loss: 0.51, positive_cosine: 0.97, negative_cosine: 0.48, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 257 - Loss: 0.49, positive_cosine: 0.96, negative_cosine: 0.46, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 258 - Loss: 0.54, positive_cosine: 0.97, negative_cosine: 0.51, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 259 - Loss: 0.52, positive_cosine: 0.97, negative_cosine: 0.49, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 260 - Loss: 0.37, positive_cosine

Epoch: 321 - Loss: 0.49, positive_cosine: 0.95, negative_cosine: 0.44, validation_recall@25: 0.75, test_recall@25: 0.00
Epoch: 322 - Loss: 0.49, positive_cosine: 0.95, negative_cosine: 0.44, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 323 - Loss: 0.47, positive_cosine: 0.96, negative_cosine: 0.43, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 324 - Loss: 0.59, positive_cosine: 0.92, negative_cosine: 0.51, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 325 - Loss: 0.55, positive_cosine: 0.94, negative_cosine: 0.48, validation_recall@25: 0.75, test_recall@25: 0.00
Epoch: 326 - Loss: 0.61, positive_cosine: 0.94, negative_cosine: 0.55, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 327 - Loss: 0.52, positive_cosine: 0.99, negative_cosine: 0.51, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 328 - Loss: 0.52, positive_cosine: 0.94, negative_cosine: 0.45, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 329 - Loss: 0.54, positive_cosine

Epoch: 390 - Loss: 0.50, positive_cosine: 0.98, negative_cosine: 0.48, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 391 - Loss: 0.59, positive_cosine: 0.96, negative_cosine: 0.55, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 392 - Loss: 0.48, positive_cosine: 0.96, negative_cosine: 0.44, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 393 - Loss: 0.52, positive_cosine: 0.96, negative_cosine: 0.48, validation_recall@25: 0.74, test_recall@25: 0.00
Epoch: 394 - Loss: 0.52, positive_cosine: 0.97, negative_cosine: 0.48, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 395 - Loss: 0.44, positive_cosine: 0.96, negative_cosine: 0.40, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 396 - Loss: 0.45, positive_cosine: 0.94, negative_cosine: 0.39, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 397 - Loss: 0.45, positive_cosine: 0.95, negative_cosine: 0.40, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 398 - Loss: 0.44, positive_cosine

Epoch: 459 - Loss: 0.51, positive_cosine: 0.98, negative_cosine: 0.49, validation_recall@25: 0.73, test_recall@25: 0.00
Epoch: 460 - Loss: 0.54, positive_cosine: 0.97, negative_cosine: 0.51, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 461 - Loss: 0.44, positive_cosine: 0.95, negative_cosine: 0.39, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 462 - Loss: 0.59, positive_cosine: 0.95, negative_cosine: 0.54, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 463 - Loss: 0.48, positive_cosine: 0.98, negative_cosine: 0.46, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 464 - Loss: 0.57, positive_cosine: 0.99, negative_cosine: 0.56, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 465 - Loss: 0.50, positive_cosine: 0.96, negative_cosine: 0.46, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 466 - Loss: 0.42, positive_cosine: 0.97, negative_cosine: 0.38, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 467 - Loss: 0.61, positive_cosine

Epoch: 528 - Loss: 0.47, positive_cosine: 0.97, negative_cosine: 0.44, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 529 - Loss: 0.48, positive_cosine: 0.98, negative_cosine: 0.45, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 530 - Loss: 0.53, positive_cosine: 0.92, negative_cosine: 0.45, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 531 - Loss: 0.50, positive_cosine: 0.95, negative_cosine: 0.45, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 532 - Loss: 0.48, positive_cosine: 0.94, negative_cosine: 0.42, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 533 - Loss: 0.60, positive_cosine: 0.92, negative_cosine: 0.52, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 534 - Loss: 0.45, positive_cosine: 0.96, negative_cosine: 0.41, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 535 - Loss: 0.54, positive_cosine: 0.93, negative_cosine: 0.47, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 536 - Loss: 0.47, positive_cosine

Epoch: 597 - Loss: 0.54, positive_cosine: 0.92, negative_cosine: 0.47, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 598 - Loss: 0.44, positive_cosine: 0.96, negative_cosine: 0.41, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 599 - Loss: 0.42, positive_cosine: 0.96, negative_cosine: 0.38, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 600 - Loss: 0.54, positive_cosine: 0.93, negative_cosine: 0.47, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 601 - Loss: 0.49, positive_cosine: 0.97, negative_cosine: 0.46, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 602 - Loss: 0.43, positive_cosine: 0.98, negative_cosine: 0.41, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 603 - Loss: 0.56, positive_cosine: 0.97, negative_cosine: 0.54, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 604 - Loss: 0.54, positive_cosine: 0.94, negative_cosine: 0.47, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 605 - Loss: 0.50, positive_cosine

Epoch: 666 - Loss: 0.55, positive_cosine: 0.96, negative_cosine: 0.51, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 667 - Loss: 0.54, positive_cosine: 0.97, negative_cosine: 0.50, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 668 - Loss: 0.56, positive_cosine: 0.98, negative_cosine: 0.54, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 669 - Loss: 0.56, positive_cosine: 0.97, negative_cosine: 0.53, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 670 - Loss: 0.54, positive_cosine: 0.98, negative_cosine: 0.52, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 671 - Loss: 0.49, positive_cosine: 0.94, negative_cosine: 0.43, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 672 - Loss: 0.49, positive_cosine: 0.94, negative_cosine: 0.43, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 673 - Loss: 0.50, positive_cosine: 0.99, negative_cosine: 0.49, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 674 - Loss: 0.54, positive_cosine

Epoch: 735 - Loss: 0.45, positive_cosine: 0.97, negative_cosine: 0.43, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 736 - Loss: 0.46, positive_cosine: 0.95, negative_cosine: 0.41, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 737 - Loss: 0.60, positive_cosine: 0.96, negative_cosine: 0.56, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 738 - Loss: 0.60, positive_cosine: 0.96, negative_cosine: 0.56, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 739 - Loss: 0.49, positive_cosine: 0.96, negative_cosine: 0.45, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 740 - Loss: 0.62, positive_cosine: 0.94, negative_cosine: 0.55, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 741 - Loss: 0.52, positive_cosine: 0.95, negative_cosine: 0.47, validation_recall@25: 0.67, test_recall@25: 0.00
Epoch: 742 - Loss: 0.54, positive_cosine: 0.94, negative_cosine: 0.48, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 743 - Loss: 0.50, positive_cosine

Epoch: 804 - Loss: 0.44, positive_cosine: 0.98, negative_cosine: 0.42, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 805 - Loss: 0.48, positive_cosine: 0.95, negative_cosine: 0.43, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 806 - Loss: 0.62, positive_cosine: 0.98, negative_cosine: 0.60, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 807 - Loss: 0.50, positive_cosine: 0.98, negative_cosine: 0.48, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 808 - Loss: 0.51, positive_cosine: 0.96, negative_cosine: 0.47, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 809 - Loss: 0.48, positive_cosine: 0.95, negative_cosine: 0.44, validation_recall@25: 0.67, test_recall@25: 0.00
Epoch: 810 - Loss: 0.49, positive_cosine: 0.97, negative_cosine: 0.46, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 811 - Loss: 0.55, positive_cosine: 0.98, negative_cosine: 0.53, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 812 - Loss: 0.47, positive_cosine

Epoch: 873 - Loss: 0.53, positive_cosine: 0.98, negative_cosine: 0.51, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 874 - Loss: 0.57, positive_cosine: 0.98, negative_cosine: 0.55, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 875 - Loss: 0.52, positive_cosine: 0.97, negative_cosine: 0.49, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 876 - Loss: 0.50, positive_cosine: 0.96, negative_cosine: 0.46, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 877 - Loss: 0.54, positive_cosine: 0.98, negative_cosine: 0.53, validation_recall@25: 0.72, test_recall@25: 0.00
Epoch: 878 - Loss: 0.44, positive_cosine: 0.98, negative_cosine: 0.42, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 879 - Loss: 0.62, positive_cosine: 0.96, negative_cosine: 0.57, validation_recall@25: 0.70, test_recall@25: 0.00
Epoch: 880 - Loss: 0.56, positive_cosine: 0.98, negative_cosine: 0.54, validation_recall@25: 0.71, test_recall@25: 0.00
Epoch: 881 - Loss: 0.49, positive_cosine

Epoch: 942 - Loss: 0.49, positive_cosine: 0.94, negative_cosine: 0.43, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 943 - Loss: 0.51, positive_cosine: 0.93, negative_cosine: 0.45, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 944 - Loss: 0.46, positive_cosine: 0.96, negative_cosine: 0.41, validation_recall@25: 0.68, test_recall@25: 0.00
Epoch: 945 - Loss: 0.51, positive_cosine: 0.99, negative_cosine: 0.50, validation_recall@25: 0.67, test_recall@25: 0.00
Epoch: 946 - Loss: 0.42, positive_cosine: 0.99, negative_cosine: 0.41, validation_recall@25: 0.67, test_recall@25: 0.00
Epoch: 947 - Loss: 0.49, positive_cosine: 0.97, negative_cosine: 0.46, validation_recall@25: 0.66, test_recall@25: 0.00
Epoch: 948 - Loss: 0.49, positive_cosine: 0.95, negative_cosine: 0.43, validation_recall@25: 0.67, test_recall@25: 0.00
Epoch: 949 - Loss: 0.45, positive_cosine: 0.97, negative_cosine: 0.42, validation_recall@25: 0.69, test_recall@25: 0.00
Epoch: 950 - Loss: 0.54, positive_cosine

In [40]:
# loss=h.history['loss']
# val_loss=h.history['val_loss']

# plt.plot(loss, label='loss')
# plt.plot(val_loss, label='val_loss')
# plt.title('Model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()

### Using the feature layers

#### Similarity cosine 

In [41]:
def cosine_normalized(a, b):
    a = K.variable(a)
    b = K.variable(b)
    # normalization
    #a, a_norm = normalize(a)
    #b, b_norm = normalize(b)
    a_norm = K.sqrt(K.sum(K.square(a), axis=-1, keepdims=False))
    b_norm = K.sqrt(K.sum(K.square(b), axis=-1, keepdims=False))
    a_norm = K.maximum(a_norm, K.epsilon())
    b_norm = K.maximum(b_norm, K.epsilon())
    # dot
    #print("Var from mut", K.eval(a), K.eval(b))
    cos_sim = K.sum( a * b ) / (a_norm * b_norm)  
    #print("Mut", K.eval(cos_sim))
    return K.eval( (cos_sim + 1 ) / 2 )
    #return K.eval(cos_sim)[0], K.eval(K.sum(a * b)), K.eval(a_norm), K.eval(b_norm)
    
def cos_distance_keras(y_true, y_pred):
    y_true = K.l2_normalize(y_true, axis=-1)
    y_pred = K.l2_normalize(y_pred, axis=-1)
    return K.eval(K.mean(K.sum(y_true * y_pred, axis=-1)))

def cos_distance(y_true, y_pred):
    def l2_normalize(x, axis):
        norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=True))
        return K.maximum(x, K.epsilon()) / K.maximum(norm, K.epsilon())
    y_true = l2_normalize(y_true, axis=-1)
    y_pred = l2_normalize(y_pred, axis=-1)
    return K.eval(K.mean(y_true * y_pred))

def cos_custom(a, b): # Cosine used in the siamese model
    a = K.variable(a)
    b = K.variable(b)
    return K.eval(cosine_distance([a, b]))

bug_vector_a_t = np.random.rand(2)
bug_vector_b_t = -1.0 * np.random.rand(2)
# bug_vector_a_t = np.array([1.0, 1.0, 2.0])
# bug_vector_b_t = np.array([1.0, 2.0, 1.0])
bug_vector_a_t = np.array([0.0, 0.1, 0.0])
bug_vector_b_t = np.array([0.0, 0.1, 0.1])

print(bug_vector_a_t, bug_vector_b_t)

result = cos_distance(bug_vector_a_t, bug_vector_b_t)
result2 = cosine_normalized(bug_vector_a_t, bug_vector_b_t)
result3 = cos_distance_keras(bug_vector_a_t, bug_vector_b_t)
result4 = cos_custom(bug_vector_a_t, bug_vector_b_t)
result, result2, result3, result4

[0.  0.1 0. ] [0.  0.1 0.1]


(0.23570249609801194, 0.8535534, 0.7071067811865475, 0.8535534)

In [42]:
bug_vector_a_t = np.array([1, 1, 2])
bug_vector_b_t = np.array([1, 2, 1])
result = cos_custom(bug_vector_a_t, bug_vector_b_t)
result

0.9166666

#### Loading bugs of test

In [43]:
test = 'eclipse'

In [44]:
from scipy import spatial
if (DOMAIN == test):
    bug_set = baseline.get_bug_set()
    # Eclipse test
    bug_id = [96204, np.random.choice(list(bug_set))] # non-duplicate {15196, 2}
    # bug_id = [96204, 85581] # duplicate {85581, 96204, 106979}
    dup_a, dup_b = bug_id
    bug_a = bug_set[dup_a]
    bug_b = bug_set[dup_b]

    dup_a, dup_b

#### LSTM feature

In [45]:
if (DOMAIN == test):
    print(bug_a['title'], bug_b['title'])

preferences filter text cut off using default fonts on organization transparent composite background color in default theme


In [46]:
if (DOMAIN == test):
    bug_vector_a_t = lstm_feature_model.predict(np.array([bug_a['title_word']]))[0]
    bug_vector_b_t = lstm_feature_model.predict(np.array([bug_b['title_word']]))[0]
    result = cosine_normalized(bug_vector_a_t, bug_vector_b_t)
    print(result)

0.45808825


In [47]:
if (DOMAIN == test):
    bug_vector_a_t, bug_vector_b_t

#### CNN feature

In [48]:
if (DOMAIN == test):
    bug_a['description'], bug_b['description']

In [49]:
if (DOMAIN == test):
    bug_vector_a_d = cnn_feature_model.predict(np.array([bug_a['description_word']]))[0]
    bug_vector_b_d = cnn_feature_model.predict(np.array([bug_b['description_word']]))[0]
    result = cosine_normalized(bug_vector_a_d, bug_vector_b_d)
    result

In [50]:
if (DOMAIN == test):
    bug_vector_a_d, bug_vector_b_d

#### MLP feature

In [51]:
if (DOMAIN == test):
    bug_vector_a_i = mlp_feature_model.predict(np.array([retrieval.get_info(bug_a)]))[0]
    bug_vector_b_i = mlp_feature_model.predict(np.array([retrieval.get_info(bug_b)]))[0]
    result = cosine_normalized(bug_vector_a_i, bug_vector_b_i)
    result

In [52]:
if (DOMAIN == test):
    bug_vector_a_i, bug_vector_b_i

#### Merge features

In [53]:
if (DOMAIN == test):
    bug_vector_a = np.concatenate([ bug_vector_a_i, bug_vector_a_t, bug_vector_a_d ], -1)
    bug_vector_b = np.concatenate([ bug_vector_b_i, bug_vector_b_t, bug_vector_b_d ], -1)
    result = cosine_normalized(bug_vector_a, bug_vector_b)
    result

In [54]:
if (DOMAIN == test):
    bug_vector_a, bug_vector_b

### Retrieval evaluation

In [55]:
retrieval.train_vectorized, retrieval.test_result = [], []
# Infer vector to all train
retrieval.read_train(path_train)

In [56]:
print("Total of queries:", len(retrieval.test))

Total of queries: 7253


#### Selecting bugs from test

In [57]:
buckets_train = set()
for row in tqdm(retrieval.test):
    bug_id, ground_truth = row
    vectorizer = [bug_id] 
    vectorizer += ground_truth
    for test_bug_id in vectorizer:
        buckets_train.add(issues_by_buckets[test_bug_id])

HBox(children=(IntProgress(value=0, max=7253), HTML(value='')))




#### Getting the model trained

In [58]:
#SAVE_PATH.replace('@number_of_epochs@', str(epochs))

In [59]:
def get_model_vectorizer_clf(path):
    if(path):
        loaded_model = load_model(os.path.join("modelos", "model_{}.h5".format(path)), 
                                    {'l2_normalize' : l2_normalize, 
                                     'margin_loss' : margin_loss,
                                     'pos_distance' : pos_distance,
                                     'neg_distance' : neg_distance,
                                     'stack_tensors': stack_tensors})
    
    
    bug_title =  loaded_model.get_layer('title_a').input 
    bug_desc =  loaded_model.get_layer('desc_a').input 
    bug_info = loaded_model.get_layer('info_a').input 

    title_encoder = loaded_model.get_layer('FeatureLstmGenerationModel')
    desc_encoder = loaded_model.get_layer('FeatureCNNGenerationModel')
    info_encoder = loaded_model.get_layer('FeatureMlpGenerationModel')

    bug_t = title_encoder(bug_title)
    bug_d = desc_encoder(bug_desc)
    bug_i = info_encoder(bug_info)
    # Representation layer
    model = loaded_model.get_layer('merge_features_in')
    output = model([bug_i, bug_t, bug_d])
    # Bug Representation
    # bug_model = similarity_model.get_layer('dense_4')
    # output = bug_model(output)

    model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])
    
    return model

In [60]:
#epochs = 100
model = get_model_vectorizer(path=SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
#model = get_model_vectorizer_clf(path='baseline_classification_100epoch_10steps({})'.format(DOMAIN))

In [61]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 100)          168300      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

#### Vectorizing bugs from train

In [62]:
def vectorizer_buckets_train(model, buckets_train, verbose=1):
    bug_set = retrieval.baseline.get_bug_set()
    buckets_train_vectorized = []
    title_data, desc_data, info_data = [], [], []
    loop = buckets_train
    if(verbose):
        loop = tqdm(buckets_train)
        loop.set_description('Vectorizing buckets')
    for bug_id in loop: # retrieval.bugs_train
        bug = bug_set[bug_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
        buckets_train_vectorized.append({ 'bug_id' : bug_id })
    if(verbose):
        loop.close()
    # Get embedding of all buckets
    embed_buckets = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
    # Fill the buckets array
    for index, vector in enumerate(embed_buckets):
        buckets_train_vectorized[index]['vector'] = vector
    
    return buckets_train_vectorized

In [63]:
buckets_train_vectorized = vectorizer_buckets_train(model, buckets_train)

HBox(children=(IntProgress(value=0, max=4739), HTML(value='')))




#### Vectorizing bugs from test

In [64]:
def vectorize_queries(model, test, issues_by_buckets, verbose=1):
    bug_set = retrieval.baseline.get_bug_set()
    queries_test_vectorized = []
    title_data, desc_data, info_data = [], [], []
    loop = test
    if(verbose):
        loop = tqdm(test)
    for row in loop:
        bug_id, ground_truth = row
        vectorizer = [bug_id] 
        vectorizer += ground_truth
        for test_bug_id in vectorizer:
            #if issues_by_buckets[test_bug_id] == test_bug_id: continue # if the bug is the master
            bug = bug_set[test_bug_id]
            title_data.append(bug['title_word'])
            desc_data.append(bug['description_word'])
            info_data.append(retrieval.get_info(bug))
            queries_test_vectorized.append({ 'bug_id' : test_bug_id, 'ground_truth': issues_by_buckets[test_bug_id] })

    # Get embedding of all buckets
    embed_queries = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
    # Fill the queries array    
    for index, vector in enumerate(embed_queries):
        queries_test_vectorized[index]['vector'] = vector
    
    return queries_test_vectorized

In [65]:
queries_test_vectorized = vectorize_queries(model, retrieval.test, issues_by_buckets)

HBox(children=(IntProgress(value=0, max=7253), HTML(value='')))




In [66]:
annoy = indexing_train(buckets_train_vectorized)

HBox(children=(IntProgress(value=0, max=4739), HTML(value='')))




In [67]:
X_test, distance_test, indices_test = indexing_test(annoy, queries_test_vectorized)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [68]:
print("Total buckets train vectorized: {}".format(len(buckets_train_vectorized)))
print("Total queries vectorized: {}".format(len(queries_test_vectorized)))

Total buckets train vectorized: 4739
Total queries vectorized: 12161


In [69]:
formated_rank = rank_result(buckets_train_vectorized, indices_test, distance_test)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [70]:
rank_queries = formating_rank(X_test)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [71]:
exported_rank = export_rank(rank_queries, formated_rank)

HBox(children=(IntProgress(value=0, max=12161), HTML(value='')))




In [72]:
exported_rank[:20]

['196609:241619|88306:0.7098919153213501,159458:0.7057186365127563,135541:0.7009275257587433,126843:0.7009234130382538,140778:0.7008328437805176,125338:0.6992507874965668,154045:0.6976706385612488,105182:0.697040468454361,144023:0.6956570446491241,205445:0.6928595900535583,104033:0.6906803250312805,86052:0.6896497905254364,80621:0.6896286606788635,128446:0.6894574165344238,144811:0.6885432600975037,229919:0.6878058314323425,364179:0.687795490026474,112805:0.6856027841567993,173664:0.6848636865615845,106295:0.6806111335754395,92570:0.6806050539016724,302491:0.6799167990684509,101323:0.6792609095573425,104965:0.6792502999305725,111092:0.6781795620918274',
 '38230:31941|34940:0.9879214176908135,36386:0.9852909073233604,31525:0.9757866952568293,34648:0.9748554434627295,53544:0.9593136198818684,42202:0.9590845108032227,80561:0.9590722993016243,67219:0.9589557573199272,42465:0.958843607455492,32204:0.9587622731924057,42225:0.9586462303996086,26563:0.9583579786121845,40693:0.9582473561167717,

In [73]:
with open(os.path.join(path, 'exported_rank.txt'), 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [74]:
'''
# Eclipse
    With CNN print all embeddings zero and 2 epochs
    {'1 - recall_at_5': 0.13,
     '2 - recall_at_10': 0.18,
     '3 - recall_at_15': 0.22,
     '4 - recall_at_20': 0.24}
     Without relu activation for each feature siamese in 100 epochs
     {'1 - recall_at_5': 0.16,
     '2 - recall_at_10': 0.23,
     '3 - recall_at_15': 0.27,
     '4 - recall_at_20': 0.31}
     Without dense in the last layer with 100 epochs with embed trainable
     {'1 - recall_at_5': 0.16,
     '2 - recall_at_10': 0.22,
     '3 - recall_at_15': 0.26,
     '4 - recall_at_20': 0.3}
      
      {'1 - recall_at_5': 0.16,
         '2 - recall_at_10': 0.22,
         '3 - recall_at_15': 0.26,
         '4 - recall_at_20': 0.29,
         '5 - recall_at_25': 0.29}
    With title (100 padding) and desc (500 padding) and batch refactored
        {'1 - recall_at_5': 0.2,
         '2 - recall_at_10': 0.26,
         '3 - recall_at_15': 0.3,
         '4 - recall_at_20': 0.33,
         '5 - recall_at_25': 0.33}
         
         {'1 - recall_at_5': 0.2,
         '2 - recall_at_10': 0.27,
         '3 - recall_at_15': 0.31,
         '4 - recall_at_20': 0.34,
         '5 - recall_at_25': 0.34}
         With recall in validation step and split 90 train 10 to test
         {'1 - recall_at_5': 0.25,
         '2 - recall_at_10': 0.32,
         '3 - recall_at_15': 0.37,
         '4 - recall_at_20': 0.4,
         '5 - recall_at_25': 0.4}
         With 200 epochs validation_recall@25 = 58, optimizer=Nadam
         {'1 - recall_at_5': 0.26,
         '2 - recall_at_10': 0.34,
         '3 - recall_at_15': 0.39,
         '4 - recall_at_20': 0.42,
         '5 - recall_at_25': 0.42}
         With 100 epochs validation_recall@25 = 52, optimizer=Adam
         {'1 - recall_at_5': 0.23,
         '2 - recall_at_10': 0.3,
         '3 - recall_at_15': 0.34,
         '4 - recall_at_20': 0.37,
         '5 - recall_at_25': 0.37}
        With 1000 epochs validation_recall@25=60, optimizer=Nadam
        {'1 - recall_at_5': 0.24,
         '2 - recall_at_10': 0.32,
         '3 - recall_at_15': 0.37,
         '4 - recall_at_20': 0.41,
         '5 - recall_at_25': 0.41}
         With 1000 epochs validation_recall@25=64, optimizer=Nadam
         {'1 - recall_at_5': 0.28,
         '2 - recall_at_10': 0.36,
         '3 - recall_at_15': 0.41,
         '4 - recall_at_20': 0.45,
         '5 - recall_at_25': 0.45}
         Withou change the distance x when calculate the cosine
         {'1 - recall_at_5': 0.18,
         '2 - recall_at_10': 0.24,
         '3 - recall_at_15': 0.28,
         '4 - recall_at_20': 0.31,
         '5 - recall_at_25': 0.31}
         With concatenation
         {'1 - recall_at_5': 0.23,
         '2 - recall_at_10': 0.31,
         '3 - recall_at_15': 0.36,
         '4 - recall_at_20': 0.4,
         '5 - recall_at_25': 0.43}
             
    # Open Office
    {'1 - recall_at_5': 0.2,
     '2 - recall_at_10': 0.27,
     '3 - recall_at_15': 0.31,
     '4 - recall_at_20': 0.34,
     '5 - recall_at_25': 0.34}
'''
evaluation = Evaluation()
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report

Evaluating...


{'1 - recall_at_5': 0.23,
 '2 - recall_at_10': 0.31,
 '3 - recall_at_15': 0.36,
 '4 - recall_at_20': 0.41,
 '5 - recall_at_25': 0.44}