# Bug triage with Deep Learning

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from __future__ import print_function, division

In [3]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, \
    GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [5]:
from methods.baseline import Baseline

## Auxiliary methods

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Glove embeddings
GLOVE_DIR='data/embed'
# Save model
SAVE_PATH = 'baseline_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'baseline_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

#### Loading bug ids in memory

In [9]:
baseline.load_ids(DIR)
len(baseline.bug_ids)

212512

### Dicionário de títulos e descrições

In [10]:
%%time

baseline.load_preprocess()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


CPU times: user 1min 19s, sys: 1.88 s, total: 1min 20s
Wall time: 1min 20s


## Geração de batches

### Generating tiple of batches

In [11]:
%%time
baseline.prepare_dataset()

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data
CPU times: user 390 ms, sys: 4.07 ms, total: 394 ms
Wall time: 388 ms


In [12]:
baseline.load_bugs()

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




In [13]:
if 2521 in baseline.bug_set:
    baseline.bug_set[2521]

In [14]:
%%time

batch_size = 64
batch_size_test = 512

# we want a constant validation group to have a frame of reference for model performance
train_gen = baseline.siam_gen(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
batch_triplets_valid, valid_input_sample, valid_input_pos, valid_input_neg, valid_sim = baseline.batch_iterator(baseline.train_data, 
                                                                                          baseline.dup_sets_train, 
                                                                                          batch_size_test, 1)
test_gen = ([valid_input_sample['title'], valid_input_pos['title'], valid_input_neg['title'], 
             valid_input_sample['description'], valid_input_pos['description'], valid_input_neg['description'],
            valid_input_sample['info'], valid_input_pos['info'], valid_input_neg['info']], valid_sim)

# Categorical columns
number_of_columns_info = valid_input_sample['info'].shape[1]

CPU times: user 60.5 ms, sys: 12 ms, total: 72.5 ms
Wall time: 72 ms


In [15]:
valid_input_sample['title'].shape, valid_input_sample['description'].shape, valid_input_sample['info'].shape, valid_sim.shape

((512, 100), (512, 500), (512, 1682), (512,))

### Validar entrada

In [16]:
%%time 

baseline.display_batch(baseline.train_data, baseline.dup_sets_train, 5)

***Title***: quick diff iaes on ruler context menu person
***Title***: index out of bounds in ruler
***Description***: check out org eclipse core expressions open organization java context menu organization click yes click ok ruler context menu person id try to scroll down to the end of file iaes are thrown when touch the scroll bar or move the mouse over the change ruler dragging the scrollbar does not repaint the editor and rulers correctly java lang law index out of bounds at org eclipse swt swt error swt java at org eclipse swt swt error swt java at org eclipse swt swt error swt java at org eclipse swt custom person get organization at line person java at org eclipse jface internal text revisions person get baseline bias person java at org eclipse jface internal text revisions person paint change region person java at org eclipse jface internal text revisions person paint person java at org eclipse jface text source organization do paint organization java at org eclipse jface text 

## Pre-trained embeddings

Loading pretrained word vectors

### Glove

In [17]:
vocab = baseline.load_vocabulary(os.path.join(DIR, 'word_vocab.pkl'))
for token in vocab:
    print(token)

vocabulary loaded
multiplicative
newfile
interfering
organizationapp
swat
mkdir
opinions
stud
udp
it
provokes
locationis
graphdef
organizationupdater
unfocused
den
joerg
kinds
hunks
processor
vwc
gfx
dorg
yr
personcompilelogs
slowed
nn
surrounded
disappears
asap
meantime
intents
martin
mneumonics
mtl
regressions
ramp
znumber
vert
pinning
dependecies
resources
suffice
jdiorganization
menuitems
laworganization
metal
exit
nationalityanged
jfn
cmdline
flaw
mangling
recreating
personsor
poped
haven
indices
unfocus
personxms
inconvenient
ccb
eager
irunnable
smartness
later
personnt
personruntime
offs
deselection
birttests
crosscutting
relayout
citizen
shipped
cht
mswin
walked
waspdeveloper
ãª
clutter
cash
producers
ejbperson
lemmy
include
succeed
producta
personunder
arial
ioperation
unmatched
organizationshape
appcontext
standing
extracts
textfile
teste
risks
traverser
organizationpersistence
xmlorganization
rightmost
rences
personincubation
extras
organizationcontext
richfaces
screencast
i

ract
nmr
supplement
verification
rq
openarchitectureware
reacting
loader
screw
discoverers
meant
appender
ptr
themeable
asks
untranslated
macosx
aj
sparcv
ecs
prepared
relengtool
mongo
thead
vb
jn
ataspectj
awesome
cairo
numberdialogimpl
personpm
programy
improper
fk
diagnostician
transfo
lgc
dependent
resouces
classify
asset
counterintuitive
hte
accumulates
enable
ext
reversing
rl
involves
rom
availble
cucorrection
etyped
jorganizationpi
tty
concise
jt
committed
subclasses
pci
perference
bffc
personshow
ride
qorganization
merks
remembers
msli
jrockit
themes
categories
moffeature
productlinux
saving
dric
correlating
hoover
aconst
bendpoints
tldtag
samba
amend
dataname
productopen
autrun
aaae
referencer
linking
neiner
discrimination
idoc
neighbours
recompiled
productle
pleiades
elif
dren
jburns
possibly
updates
organizationor
hsqldb
rcbeadapt
verdana
workshop
aggregate
merant
regex
slf
bp
zperson
assemble
organizationfetch
bigint
wbem
microsoft
accesscontrol
ressources
efactoring
nbr
pa

writestdout
dbb
gadi
dbdefinition
projectname
tickets
vtk
slick
runned
dependecy
classdesc
analytics
shaded
protectable
cdiexception
charset
transparently
icommand
implementors
personplugins
installable
competing
dependencies
react
incubating
attemp
carry
poi
resourceco
bca
personand
nonsense
uo
guest
tsi
stepped
roster
indexer
tier
eswt
nsautorelease
lve
moved
iterator
oejs
pred
personvm
violate
dismissed
wasteful
wsw
execution
ffe
tlds
icproject
workpsace
mylib
approximate
leve
proportional
rad
ifndef
seu
testresults
eclipsercp
unstage
side
accessbean
generators
weme
abled
guidetolegaldoc
sizeof
fdb
trafo
jreichert
sporadically
pkix
grids
personnumber
potentially
lenght
fragment
htmlstacking
glasses
switched
libgcj
unidentified
laid
indexing
ipv
dom
myproj
carries
ecosystem
adapting
destroy
as
jvm
garbled
locale
personab
reworked
persong
mk
interested
submodule
reproduction
organizationkey
aurorabuilder
pl
general
loskutov
encountered
exportviewergenerator
selection
xbfffd
svninfo
ex

coff
personunable
cycle
gdb
wsadie
locationr
related
aaaabe
organizationer
extentions
unrecognized
understands
junk
pastebin
ace
mepersonpersonage
valid
with
spacing
recompiling
bf
limited
nce
measurement
extend
installs
existant
grand
verbose
cccf
stame
personcpu
personthe
organizationyou
personpnl
donated
latency
gccperson
sigusr
now
datastore
allocates
quit
losing
super
instr
corresponding
polishing
parameter
fx
iti
astprovider
unaware
beneath
eventhandler
dear
filemenu
protected
efficiency
planet
sles
aggregates
bid
associate
compounded
personv
pradeep
pausible
hh
dialogfields
suppressed
prs
techniques
odds
party
mathcs
openmp
tw
jmdns
becuase
kz
descent
cwd
theirs
complete
dropin
wn
idcode
libgcc
ilazy
retrieving
habit
materialization
invalidations
xms
uncommitted
boundary
collecting
destruct
wp
jmock
pii
tackle
strike
exportdata
newer
asc
xestnationalityktrnationalitye
personration
inln
skipped
personk
update
personf
filing
rseterminal
forwarding
saxorganization
flashes
appearing

persontext
saying
between
virus
archiving
complicates
sys
hql
attributed
misunderstood
glue
etrice
characters
efc
deduced
ijob
prompting
debuggers
code
catchup
vtune
ojdbc
â³
syslib
libpthread
miners
workflows
cspex
displayed
ajp
viewpart
kohsuke
rac
shuffle
misleading
center
psn
uppercase
mveli
paris
nearly
personagent
ppt
testeclipse
quiz
iterated
indentation
onclick
precede
normalize
hash
adn
gitapi
anon
libmozjs
wairole
wed
rwt
blanks
crude
editted
disappearing
websphere
mlang
deluxe
jav
geo
tray
architect
earorganization
noupdate
httpcomponents
aspectivity
char
specialize
iviewer
foreground
img
recognition
cdate
cndev
dorganization
xmldocument
systemtap
parametrized
delimiter
svh
countryion
identification
replication
owners
identified
devote
personcompiler
gator
titi
organizationnonblocking
and
memento
diffrent
debugtest
encode
getstarted
unnamed
vim
crowded
question
flatfile
wssecurity
cps
enter
eventhough
bracket
gmfgen
refined
needed
norm
ldauthor
glass
cperson
ole
causing
wind

eserver
astand
meets
queue
jnp
invoked
leaner
filelist
landscape
phpstructured
dms
factorial
efficiently
xlink
oaw
crashed
linenumber
cots
numberorganization
earperson
paints
organizationof
cms
jet
auto
determining
august
dropins
transfer
flagging
reviewers
ffd
eqxd
qvto
pave
workspace
warranty
tuple
sysdeps
considerations
dextensible
hidden
reintegrate
intrepid
od
osservices
styled
insight
projection
accross
dateth
personnapshot
those
alygilani
cec
htmlabstract
inspector
reserved
srap
affect
docu
fuss
specifics
relating
revising
libsvnjavahl
classe
productact
lines
cachepath
preprocess
inconvertible
dog
hp
gbg
operations
lic
hints
mismatch
pool
guesses
fin
windowposchanged
taskdef
stroke
batches
bsf
phpide
readmes
turn
hacking
selectiontests
iperspective
icontext
entwicklung
ships
problems
organizationlock
dev
oard
neglects
shorter
attribs
preempt
alex
build
personshell
intends
otdt
deployment
scrubbed
boilerplate
dups
rewire
bdoclass
xjjz
refreshable
arrange
entity
miktex
mica
rootdi

organizationare
xmpp
pkey
spellcheck
suppress
attahced
sts
sleeping
dfd
designing
jjj
fear
xfw
indicates
explanatory
like
cbinopom
clipboards
lapack
beans
intelligence
awtui
calibri
wanted
facts
pathname
mxml
userui
intershop
interacting
cache
shortname
hrefs
branch
maximized
zzperson
sveditor
condition
infamous
rsecomm
bmuskalla
ensure
changegenerator
avd
reclaim
chair
fullmoon
uow
painfully
adopt
hyperlinking
lastname
junction
consists
define
fdf
ntext
fffd
helsinki
orthogonal
mmx
glance
libswt
organizationment
ejb
settable
reconfigure
myvar
ghost
beggining
jnationality
ternal
organizationnot
xmlencoding
faf
interval
cvsworkspace
pst
cvsnt
myprofiler
communities
syncinfo
myorg
documents
multiples
simplification
dvt
tomasz
crud
xnu
jp
xstart
libnecko
customizing
direct
attached
tigerstripe
mozillperson
cpplinker
answers
inputted
osgilauncher
repackaging
xac
fy
progressbar
nenumberrk
thinking
jcqk
textsize
childs
dfl
jfacetext
cdo
dead
failures
organizationger
toplevel
zzzz
anager
xa
b

syst
athlon
cppastunary
gdip
inferred
lsloan
dltkcore
mmip
enhancments
tension
quotation
ai
curiously
svg
artworkmessage
door
filling
processors
decrypted
onitor
personn
abf
mutiple
incr
mind
tring
canâ
edges
vers
unlike
tracer
helpers
libhccls
jimc
mediawiki
organizationline
somepackage
constituent
logos
moc
tunneling
configurators
repaints
organizationchange
talked
ao
organizationtion
statcon
packrat
content
codeassist
lockup
jens
personide
imanaged
cess
ambiguities
innerclass
devsosgi
tzdata
quintron
whether
libpi
egg
reclaimed
ttc
intl
just
aeac
presentations
organizationperson
decl
infomation
virgo
watched
afef
uicontrol
switch
fd
sel
occurence
var
pausing
meow
quitting
clarifications
covered
xxxxx
often
dismisses
sso
cvcontinue
organizationinfo
facilitates
ject
openejb
substitution
productj
got
topic
qh
usually
lmckhou
invokeinterface
rwxs
baselines
originates
projectâ
extradir
daniel
wit
colleagues
ffc
cw
dddd
trial
nx
formed
objcopy
metric
ages
recreation
autorefresh
webinf
opp

highlighted
fool
enumerator
bolding
pathmap
rptdesign
pervasive
rcx
personcst
alternating
views
gh
injar
gx
ttl
macrodef
personter
attempting
libdb
minified
appserver
eclipselabs
println
manual
jdo
regards
remoting
typos
summarized
fiel
rdbms
technique
insert
colons
textedit
rv
recover
myfunc
fffb
jjt
personwill
personce
extensionpoints
hardcoded
disk
afe
discourage
personhaving
radio
rms
cvsdecoration
fills
manipulate
oagis
idecorator
bhvgfy
allinone
incorporating
splits
merges
vendors
recommended
cosmos
atna
ressource
replicate
rst
libdispatch
initiating
says
remade
exposure
parsing
gudeg
suffering
exported
forum
pb
variation
referenceable
partner
dynamically
rte
handlehhh
unlock
makes
cr
comprise
personfast
macd
points
rolling
florou
taking
libjvm
symlink
cdoresource
dsdp
describe
inconsistant
formatter
saves
country
unpublish
ioorganization
vfbuuc
composed
personmissing
mtool
cproperty
should
jd
flexide
hava
ä½
consequence
dtdeditor
citizens
mteji
ass
copyright
feb
continuously
cre

servant
libaprutil
whitespace
reduced
asynch
stepstone
funcs
refesh
personusr
extent
taskdefs
location
jpainitializer
hexr
swallowing
fheidric
postmortem
xbfffa
japan
onmenu
misbehavior
weight
bit
syncviewpage
ec
thanx
metrics
unsupported
mil
ñƒ
automatic
rdz
honoured
experienced
uijob
viz
persistance
nationalityion
textarea
artificial
doit
ffffffec
getfield
basis
particle
advertisements
outputter
closures
edi
uisynchronizer
strangely
vos
vi
highest
detects
coordination
korganizationg
consisting
organizationcannot
usernames
personcontrol
eoperation
programatic
ct
fragements
touched
xyperson
optimize
unlocking
perf
dismiss
parallel
ultra
defaults
processed
bordered
plk
personent
yg
packaged
likes
withing
ple
presence
cvsperson
reduction
parsed
allocating
experiences
ioexceptions
organizationchoose
todir
retrieved
interleaved
sf
replies
cemonitr
adjustment
catalogs
lexer
cda
matthias
cl
ears
lazily
programmatically
wms
derivative
trn
operands
iweb
nfo
supplying
akurtakov
situated
navigat

counters
sqlsyntax
personxcdec
fragmented
hdwb
trademark
kazinfotel
clabel
sei
cleaner
env
searchcvs
reconnecting
chmod
um
manuals
job
idemulti
tworkwac
setting
iapi
adaptive
paramters
traverse
scaled
instruments
deregistration
libnsl
cstdio
uninstalled
eip
predeploying
countryes
agentctrl
acserver
tobject
yt
nyg
armanta
chunky
pragma
unconditional
irow
thorough
reworded
equivalents
roject
statements
cimom
jse
slave
personcore
mimics
wininet
monitoring
defining
discussions
foundation
binds
reflected
conforming
spanning
organizationnable
oejw
orderdetails
jve
unlink
ivalidator
parm
dimensions
considers
reselecting
ofcourse
male
managed
terminating
clik
doclet
abbot
comobject
efff
jrig
testutils
organizationus
mysql
hewitt
weblogic
templates
childinfo
lately
maximo
boss
tsukakoshi
zoo
lame
agents
jwt
updated
atlassian
summon
ecountry
wow
taglibprocessing
jasperreports
simulator
interceptors
cluster
personpublic
summary
fw
solicit
personreproducible
organizationvalue
preallocation
tcp
eth

releasenotes
htmlgroup
outputs
vie
jbyte
instances
view
distinct
disallow
contants
udf
ihe
decrypt
sasl
rapdemo
recomputed
multiplier
unconfigure
pkgs
eability
approach
synchronize
nf
beach
personcurrently
uis
validates
reproducer
serviceconfig
incomming
none
membership
layers
cyclic
improve
yum
distribution
albeit
special
trcadb
putstatic
dialog
kw
schoollibrary
platfrom
utils
pcvi
persontestsuite
wrong
atria
svnteam
wsrr
participation
forking
cap
polled
mcountry
testng
shorten
simrel
suspecting
disable
symantec
centered
iii
visualisation
sourceforge
who
incorrect
personee
loggers
unzipped
nearest
jie
firstly
lsi
hcframe
synchronizes
grizzly
odaconsumer
organizationschema
mtx
bears
activators
ons
employment
emailed
greek
ann
interaction
sprites
cvslightweight
dashboard
curl
personcf
fffffff
permissions
cppastid
masterdetails
thingy
compilerarg
sleeps
unrecoverable
ur
jesse
normalization
khorne
eate
jarsigner
refrences
eastern
approx
contribute
breadcrumbs
assumed
ify
imo
iwab
organiza

vex
eax
eclipsec
interpreted
your
notation
personmessage
conventional
jb
visibility
preloading
queueing
hasn
iiii
organizationrun
ãÿã
perpective
employer
memcpy
relevent
displaced
autogen
destinations
domsserenderer
managedagent
filename
istatus
jdtmodified
tooltips
evolves
hs
undocumented
booting
personner
streamline
bhv
frhm
cheetah
autoproxy
statment
eclipes
floppy
personnage
ownable
personplatform
visitor
rts
xls
ncpwin
nicolas
phpproject
genuitec
digital
compiler
factoring
brian
resolved
unusually
rtrim
cout
fragments
organizationconfig
throws
led
bat
spelling
classwizard
newsportal
downgrade
thr
expression
extracting
drop
tsmodel
ð¾ð
contex
writes
hope
prio
andreas
unselected
sdotype
flavor
msacm
leftover
organized
xl
organizationformatter
kicks
syb
pdeui
baseclass
coping
winmm
ou
sj
starts
blah
includes
modelquery
jdi
increase
squire
maximus
deploys
decorator
mypc
luckily
zz
trax
pdelaunch
ajbuilder
sit
predictable
redundantly
oi
sin
grained
caniszczyk
correlator
revised
princip

consoles
realized
expand
spelled
countryger
mixed
jdom
glaperson
dirtied
opera
domnode
asynchronous
floor
eec
yf
nationality
xg
dcc
evaluation
photran
telelogic
counting
dtfjindex
hmc
kd
seed
itpdui
sdbeditor
smallfloat
role
libk
running
organizationd
sdch
configuration
semi
avg
qui
tnt
ucy
beginning
unassigned
pkgsrc
productthis
spaces
countrymanager
zorganization
deltas
contentoutline
tracepoint
modify
irregular
fashion
resultant
usersession
recursion
httpd
frog
javacccore
javatest
walkbacks
activation
china
breakages
modularity
progs
googling
thus
notepad
lixto
hex
nc
infocenter
strategy
dependency
bundling
xt
devzuz
lose
jamesb
naziv
unkown
iapu
tem
staging
hierarchial
zgvte
facilityguration
least
fwd
predefine
ource
licence
recreated
stephan
personac
idto
qvtbase
uninteresting
taxonomy
organizationjava
abandon
screenshot
kperson
oject
dpi
damage
unhelpful
oleacc
personboth
xdb
lbl
adopted
rectangular
tweaked
sgi
filemode
recovers
relevance
alive
add
background
se
hyaorganizations


ecos
collapsing
xbb
tory
unpacked
detection
technical
nationalityable
passphrase
wspaces
preverifier
graphviz
torolab
nec
flight
testplayer
escape
postgresql
unmaximize
controversial
enhancement
facilityr
restarted
keep
minute
cfef
bv
iç
interference
members
wildly
rtutils
organizationorganizationplugin
jdidebug
unsaved
exchanged
ed
zero
decreases
gdr
iterations
umlreflection
europe
eve
life
resize
premature
sunrsasign
ddlgeneration
eba
stability
firepath
chaining
sp
bringing
outbound
quickfixes
xec
convince
conform
productg
fbc
rootfiles
architecture
organizationinit
backported
massage
unfriendly
director
xcd
saxperson
organizationx
javabean
plexobject
lipse
gaskiewicz
scalability
aaf
bbawt
helper
versioned
dirty
perspectiveswitcher
thier
cdopackage
ez
umd
abb
impractical
freezing
guy
jaxpnumber
monospaced
exp
subdirectories
xdrive
expense
swtorganization
adeya
personildren
icountry
panel
megs
osg
analyzer
worldwind
faceted
sprite
personged
xdsref
disconnects
pse
root
papyrus
checkout

models
dhcpcsvc
eclipselink
bodies
option
everything
esource
sua
outof
populated
personis
behaviors
quickassist
chkpii
artworkorganization
inlining
personwould
unencrypted
etab
killed
rmdir
rely
heavy
believes
rjvm
school
classification
mode
idl
tint
itemcreation
cuperson
organizationform
blocked
only
principle
fqn
latter
off
oauth
organizationsyntax
metaphor
propery
drawn
harness
delayed
agrees
organizationtext
grow
nullpointer
sadly
šå
infoviews
dictate
varied
buglist
intermittently
codemanipulation
compromise
repo
mprv
quickaccess
mantis
fsa
uhox
at
filter
os
video
uuid
index
personvar
libcaps
seamlessly
zipfileset
hss
psold
shrinking
personm
yjp
refid
kingdom
bothered
variability
ption
precise
intervention
locates
acceptable
drillthrough
confidentiality
cglib
sensitivity
itype
cbecommon
enhancment
eike
personring


In [18]:
"Total vocabulary: {}".format(len(vocab))

'Total vocabulary: 19998'

In [19]:
%%time

baseline.generating_embed(GLOVE_DIR=GLOVE_DIR, EMBEDDING_DIM=EMBEDDING_DIM) # MAX_NB_WORDS=MAX_NB_WORDS

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Total 1917494 word vectors in Glove 42B 300d.
vocabulary loaded
Number of OOV words: 19998
CPU times: user 1min 22s, sys: 3.07 s, total: 1min 25s
Wall time: 1min 24s


## Experiment

### Training and evaluating for each epoch at same time

#### Auxiliary methods train experiment siamese

In [20]:
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

In [21]:
retrieval = Retrieval()

path = 'data/processed/{}'.format(DOMAIN)
path_buckets = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)
path_train = 'data/processed/{}/train.txt'.format(DOMAIN)
path_test = 'data/processed/{}/test.txt'.format(DOMAIN)

MAX_SEQUENCE_LENGTH_I = number_of_columns_info # Status, Severity, Version, Component, Module

# Create the instance from baseline
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data


HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))


Creating the queries...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Hashing bugs by buckets

In [22]:
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    issues_by_buckets[bucket] = bucket
    for issue in np.array(retrieval.buckets[bucket]).tolist():
        issues_by_buckets[issue] = bucket

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




#### Model to vectorize

In [23]:
from keras.models import Model
from keras.layers import Input
from keras.models import load_model

def get_model_vectorizer(path=None, loaded_model=None):
    if(path):
        loaded_model = load_model(os.path.join("modelos", "model_{}.h5".format(path)))
        
        '''
            {'l2_normalize' : l2_normalize, 
                                     'margin_loss' : margin_loss,
                                     'pos_distance' : pos_distance,
                                     'neg_distance' : neg_distance,
                                     'stack_tensors': stack_tensors}
        '''
    
    return loaded_model

#### Getting the list of candidates

In [24]:
def indexing_query(annoy, queries_test_vectorized, verbose=1):
    X_test = queries_test_vectorized
    distance_test, indices_test = [], []
    loop = enumerate(X_test)
    if(verbose):
        loop = tqdm(enumerate(X_test))
        loop.set_description('Getting the list of candidates from queries')
    for index, row in loop:
        vector = row['vector']
        rank, dist = annoy.get_nns_by_vector(vector, 30, include_distances=True)
        indices_test.append(rank)
        distance_test.append(1 - np.array(dist)) # normalize the similarity between 0 and 1
    if(verbose): loop.close()
    return X_test, distance_test, indices_test

#### Indexing bugs

In [25]:
# Indexing all train
def indexing_test(buckets_train_vectorized, verbose=1):
    X = np.array(buckets_train_vectorized)
    annoy = AnnoyIndex(X[0]['vector'].shape[0])  # Length of item vector that will be indexed

    loop = total=len(X)
    if(verbose):
        loop = tqdm(total=len(X))
        loop.set_description("Indexing test in annoy")
    for index, row in enumerate(X):
        vector = row['vector']
        annoy.add_item(index, vector)
        if(verbose): loop.update(1)
    if(verbose): loop.close()
    annoy.build(10) # 10 trees
    return annoy

#### Rank result

In [26]:
def rank_result(test_vectorized, indices_test, distance_test, verbose=1):
    formated_rank = []
    loop = zip(indices_test, distance_test)
    if(verbose):
        loop = tqdm(zip(indices_test, distance_test))
        loop.set_description('Generating the rank')
    for row_index, row_sim in loop:
        row_index, row_sim = row_index[:25], row_sim[:25]
        formated_rank.append(",".join(["{}:{}".format(test_vectorized[index]['bug_id'], sim) 
                                       for index, sim in zip(row_index, row_sim)]))
    if(verbose): loop.close()
    return formated_rank

#### Vectorizer 

In [27]:
def vectorizer_test(model, test, verbose=1):
    bug_set = retrieval.baseline.get_bug_set()
    test_vectorized = []
    title_data, desc_data, info_data = [], [], []
    loop = test
    if(verbose):
        loop = tqdm(test)
        loop.set_description('Vectorizing buckets')
    for row in loop: # retrieval.bugs_train
        query, ground_truth = row
        for bug_id in ground_truth:
            bug = bug_set[bug_id]
            title_data.append(bug['title_word'])
            desc_data.append(bug['description_word'])
            info_data.append(retrieval.get_info(bug))
            test_vectorized.append({ 'bug_id' : bug_id })
    if(verbose):
        loop.close()
    # Get embedding of all buckets
    embed_test = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
    # Fill the buckets array
    for index, vector in enumerate(embed_test):
        test_vectorized[index]['vector'] = vector
    
    return test_vectorized

In [28]:
def vectorize_queries(model, test, issues_by_buckets, verbose=1):
    bug_set = retrieval.baseline.get_bug_set()
    queries_test_vectorized = []
    title_data, desc_data, info_data = [], [], []
    loop = test
    if(verbose):
        loop = tqdm(test)
    for row in loop:
        test_bug_id, ground_truth = row
        #if issues_by_buckets[test_bug_id] == test_bug_id: continue # if the bug is the master
        bug = bug_set[test_bug_id]
        title_data.append(bug['title_word'])
        desc_data.append(bug['description_word'])
        info_data.append(retrieval.get_info(bug))
        queries_test_vectorized.append({ 'bug_id' : test_bug_id, 'ground_truth': ground_truth })

    # Get embedding of all buckets
    embed_queries = model.predict([ np.array(title_data), np.array(desc_data), np.array(info_data) ])
    # Fill the queries array    
    for index, vector in enumerate(embed_queries):
        queries_test_vectorized[index]['vector'] = vector
    
    return queries_test_vectorized

#### Queries

In [29]:
# Generating the rank result
def formating_rank(X_test, verbose=1):
    rank_queries = []
    loop = enumerate(X_test)
    if(verbose):
        loop = tqdm(enumerate(X_test))
        loop.set_description('Generating the queries from rank')
    for index, row in loop:
        dup_a, ground_truth = row['bug_id'], row['ground_truth']
        rank_queries.append("{}:{}".format(dup_a, ','.join(np.array(ground_truth, str))))
    if(verbose): loop.close()
    return rank_queries

In [30]:
def export_rank(rank_queries, formated_rank, verbose=1):
    exported_rank = []
    loop = len(rank_queries)
    if(verbose):
        loop = tqdm(total=len(rank_queries))
        loop.set_description('Exporting the rank')
    for query, rank in zip(rank_queries, formated_rank):
        exported_rank.append("{}|{}".format(query, rank))
        if(verbose): loop.update(1)
    if(verbose): loop.close()
    return exported_rank

#### Methods to evaluate each epoch

In [31]:
def evaluate_validation_test(verbose, loaded_model, test, issues_by_buckets):
    # Get model
    model = get_model_vectorizer(loaded_model=loaded_model)
    
    # Test 
    test_vectorized = vectorizer_test(model, test, verbose)
    queries_test_vectorized = vectorize_queries(model, test, issues_by_buckets, verbose)
    annoy = indexing_test(test_vectorized, verbose)
    X_test, distance_test, indices_test = indexing_query(annoy, queries_test_vectorized, verbose)
    formated_rank = rank_result(test_vectorized, indices_test, distance_test, verbose)
    rank_queries = formating_rank(X_test, verbose)
    exported_rank = export_rank(rank_queries, formated_rank, verbose)
    evaluation = Evaluation(verbose)
    recall = evaluation.evaluate(exported_rank)['5 - recall_at_25']
    
    # recall@25, loss, cosine_positive, cosine_negative
    return recall
    #return report['5 - recall_at_25'], evaluation_test_batch[0], evaluation_test_batch[1], evaluation_test_batch[2] 

#### Evaluation method

In [32]:
class Evaluation():
    def __init__(self, verbose=1):
        self.verbose = verbose
        self.MAX_RANK = 25
    
    """
        Rank recall_rate_@k
        rank = "query:master|master:id:sim,master:id:sim"
    """
    def top_k_recall(self, rank, k):
        query, rank = rank.split('|')
        query_dup_id, ground_truth = query.split(":")
        ground_truth = np.array(ground_truth.split(','), int)
        candidates = [int(item.split(':')[0]) for pos, item in enumerate(rank.split(",")[:self.MAX_RANK])]
        corrects = len(set(ground_truth) & set(candidates[:k]))
        total = len(ground_truth)
        return float(corrects), total

    def evaluate(self, path):
        self.recall_at_5_corrects_sum, self.recall_at_10_corrects_sum, \
        self.recall_at_15_corrects_sum, self.recall_at_20_corrects_sum, self.recall_at_25_corrects_sum = 0, 0, 0, 0, 0
        self.recall_at_5_total_sum, self.recall_at_10_total_sum, self.recall_at_15_total_sum, \
        self.recall_at_20_total_sum, self.recall_at_25_total_sum = 0, 0, 0, 0, 0 
        if(self.verbose):
            print("Evaluating...")
        if type(path) == str:
            with open(path, 'r') as file_input:
                for row in file_input:
                    self.recall(row)
        else:
            for row in path:
                self.recall(row)
        
        report = {
            '1 - recall_at_5' : round(self.recall_at_5_corrects_sum / self.recall_at_5_total_sum, 2),
            '2 - recall_at_10' : round(self.recall_at_10_corrects_sum / self.recall_at_10_total_sum, 2),
            '3 - recall_at_15' : round(self.recall_at_15_corrects_sum / self.recall_at_15_total_sum, 2),
            '4 - recall_at_20' : round(self.recall_at_20_corrects_sum / self.recall_at_20_total_sum, 2),
            '5 - recall_at_25' : round(self.recall_at_25_corrects_sum / self.recall_at_25_total_sum, 2)
        }

        return report
    def recall(self, row):
        #if row == '': continue
        self.recall_at_5_corrects, self.recall_at_5_total = self.top_k_recall(row, k=5)
        self.recall_at_10_corrects, self.recall_at_10_total = self.top_k_recall(row, k=10)
        self.recall_at_15_corrects, self.recall_at_15_total = self.top_k_recall(row, k=15)
        self.recall_at_20_corrects, self.recall_at_20_total = self.top_k_recall(row, k=20)
        self.recall_at_25_corrects, self.recall_at_25_total = self.top_k_recall(row, k=25)

        self.recall_at_5_corrects_sum += self.recall_at_5_corrects
        self.recall_at_10_corrects_sum += self.recall_at_10_corrects
        self.recall_at_15_corrects_sum += self.recall_at_15_corrects
        self.recall_at_20_corrects_sum += self.recall_at_20_corrects
        self.recall_at_25_corrects_sum += self.recall_at_25_corrects

        self.recall_at_5_total_sum += self.recall_at_5_total
        self.recall_at_10_total_sum += self.recall_at_10_total
        self.recall_at_15_total_sum += self.recall_at_15_total
        self.recall_at_20_total_sum += self.recall_at_20_total
        self.recall_at_25_total_sum += self.recall_at_25_total

#### Save the model

In [33]:
def save_model(model, name, verbose=0):
    m_dir = os.path.join('modelos')
    if not os.path.exists(m_dir):
        os.mkdir(m_dir)
    export = os.path.join(m_dir, "model_{}.h5".format(name))
    model.save(export)
    if(verbose):
        print("Saved model '{}' to disk".format(export))

## Towards Accurate Duplicate Bug Retrieval Using Deep Learning Techniques

https://github.com/tqtg/DuplicateBugFinder

### Embedding layer

In [34]:
from keras.constraints import MaxNorm
from keras.initializers import TruncatedNormal, RandomUniform

# Is missing the padding_idx used in pytorch
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html
# https://stackoverflow.com/questions/54824768/rnn-model-gru-of-word2vec-to-regression-not-learning
def embedding_layer(embeddings, num_words, embedding_dim, max_sequence_length, trainable):
    embedding_layer = Embedding(num_words,
                                  embedding_dim,
                                  name='embedding_layer',
                                  weights=[embeddings],
                                  embeddings_constraint=MaxNorm(max_value=1, axis=0),
                                  input_length=max_sequence_length,
                                  trainable=trainable)
    return embedding_layer

### CNN with filter 3,4,5

In [35]:
import keras
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D

def cnn_model(embedding_layer, max_sequence_length):

    sequence_input = Input(shape=(max_sequence_length,), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    # best combination filter (3, 4, 5) e 128 e 256
    convs = []
    filter_sizes = [3, 4, 5]
    n_filters = 64

    for index, filter_size in enumerate(filter_sizes):
        l_conv = Conv1D(filters=n_filters, kernel_size=filter_size)(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=filter_size)(l_conv) # index+1
        convs.append(l_pool)

    l_merge = Concatenate(axis=1)(convs)

    # add a 1D convnet with global maxpooling, instead of Yoon Kim model
    #conv = Conv1D(filters=n_filters * 3, kernel_size=3)(l_merge)
    layer = GlobalAveragePooling1D()(l_merge)
    #layer = Flatten()(l_merge)
    layer = Dense(100, activation='tanh')(layer)
    #layer = LeakyReLU()(layer)

    cnn_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureCNNGenerationModel') # inputs=visible

    return cnn_feature_model

### Bi-LSTM

In [36]:
from keras.layers import Dense, Input, LSTM, GRU, Dropout, Bidirectional, GlobalAveragePooling1D

def lstm_model(embedding_layer, max_sequence_length):
    number_lstm_units = 50
    rate_drop_lstm = 0
    recurrent_dropout = 0

    sequence_input = Input(shape=(max_sequence_length, ), name='Feature_BugInput')
    embedded_sequences = embedding_layer(sequence_input)

    # Creating LSTM Encoder
#     lstm_layer = Bidirectional(LSTM(number_lstm_units, return_sequences=True), # dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm 
#                                merge_mode='ave')

    lstm_layer = LSTM(number_lstm_units, return_sequences=True)(embedded_sequences)
    layer = LSTM(number_lstm_units)(lstm_layer)

    #layer = lstm_layer(embedded_sequences)
    #layer = GlobalAveragePooling1D()(layer)
    layer = Dense(100, activation='tanh')(layer)

    lstm_feature_model = Model(inputs=[sequence_input], outputs=[layer], name = 'FeatureLstmGenerationModel') # inputs=visible

    return lstm_feature_model

### MLP

In [37]:
def mlp_model(input_size):
    info_input = Input(shape=(input_size, ), name='Feature_BugInput')
    input_size = 100
    
    layer = Dense(input_size, activation='tanh')(info_input)
    
    mlp_feature_model = Model(inputs=[info_input], outputs=[layer], name = 'FeatureMlpGenerationModel')
    
    return mlp_feature_model

### Siamese model

In [38]:
from keras import backend as K
import tensorflow as tf

def l2_normalize(x, axis):
    norm = K.sqrt(K.sum(K.square(x), axis=axis, keepdims=False))
    return x, K.maximum(norm, K.epsilon())

def normalize(x):
    return l2_normalize(x, axis=-1)
    
# https://github.com/keras-team/keras/issues/3031
# https://github.com/keras-team/keras/issues/8335
def cosine_distance(inputs):
    x, y = inputs
    x, x_norm = l2_normalize(x, axis=-1)
    y, y_norm = l2_normalize(y, axis=-1)
    distance = K.sum( x * y, axis=-1) / (x_norm * y_norm)
    distance = (distance + K.constant(1)) / K.constant(2)
    # Distance goes from 0 to 2 in theory, but from 0 to 1 if x and y are both
    # positive (which is the case after ReLU activation).
    return distance

def margin_loss(y_true, y_pred):
    margin = K.constant(1.0)
    loss = K.maximum(0.0, margin - y_pred[0] +  y_pred[1])
    return K.mean(loss)

def pos_distance(y_true, y_pred):
    return K.mean(y_pred[0])

def neg_distance(y_true, y_pred):
    return K.mean(y_pred[1])

def stack_tensors(vects):
    return K.stack(vects)

#### Propose

In [39]:
from keras.initializers import TruncatedNormal
from keras.regularizers import l2

def residual_bug():
    def block(block_input):
        shape_size = K.int_shape(block_input)[1]
        
        residual =  block_input
        
        layer_out = Dense(shape_size // 2, activation='tanh')(block_input)
        
        skip_out =  Dense(shape_size, activation='linear', use_bias=False)(layer_out)
        # kernel_initializer=TruncatedNormal(mean=0.0, stddev=0.05, 
         #             seed=42), kernel_regularizer=l2(0.01)
        
        dense_out =  Dense(shape_size, activation='linear', use_bias=False)(layer_out)
        
        block_out =   Add()([residual, dense_out])
        return block_out, skip_out
    return block

In [40]:
from keras.layers import concatenate, Add, Lambda, merge, Average, Maximum
from keras.optimizers import Adam, Nadam

def siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, sequence_length_info, 
                  sequence_length_t, sequence_length_d, name):
  
    bug_t = Input(shape = (sequence_length_t, ), name = 'title_{}'.format(name))
    bug_d = Input(shape = (sequence_length_d, ), name = 'desc_{}'.format(name))
    bug_i = Input(shape = (sequence_length_info, ), name = 'info_{}'.format(name))
    
    bug_t_feat_lstm = lstm_feature_model(bug_t)
    bug_d_feat_cnn = cnn_feature_model(bug_d)
    bug_i_feat_mlp = mlp_feature_model(bug_i)
    
    #     bug_feature_output = Add(name = 'merge_features_{}'.format(name))([bug_i_feat_mlp, bug_t_feat_lstm, bug_d_feat_cnn])
    bug_feature_output = concatenate([bug_i_feat_mlp, bug_t_feat_lstm, bug_d_feat_cnn], name = 'merge_features_{}'.format(name))
    
    #     encoded_1a, encoded_1b  = residual_bug()(bug_feature_output)
    #     encoded_2a, encoded_2b  = residual_bug()(encoded_1a)
    
    #     bug_feature_output = Add()([encoded_1b, encoded_2b])
    #     bug_feature_output = Activation('tanh')(bug_feature_output)
    
    # Bug representation layer
    #     bug_feature_output = Dense(100, activation='tanh')(bug_feature_output)
    
    bug_feature_model = Model(inputs=[bug_t, bug_d, bug_i], outputs=[bug_feature_output], name = 'merge_features_{}'.format(name))
    
    return bug_feature_model

In [41]:
def max_margin_objective(encoded_anchor, encoded_positive, encoded_negative):
    
    inputs = np.concatenate([encoded_anchor.input, encoded_positive.input, encoded_negative.input], -1).tolist()
    
    encoded_anchor = encoded_anchor.output
    encoded_positive = encoded_positive.output
    encoded_negative = encoded_negative.output
    
    # Cosine
    positive_d = Lambda(cosine_distance, name='pos_cosine_distance', output_shape=[1])([encoded_anchor, encoded_positive])
    negative_d = Lambda(cosine_distance, name='neg_cosine_distance', output_shape=[1])([encoded_anchor, encoded_negative])

    # Loss function only works with a single output
    output = Lambda(
        lambda vects: stack_tensors(vects),
        name='stack-distances',
        output_shape=(2, 1)
    )([positive_d, negative_d])

    similarity_model = Model(inputs = inputs, 
                           outputs = output, name = 'Similarity_Model')

    optimizer = Nadam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=K.epsilon(), schedule_decay=0.01)
    #optimizer = 'adam'

    # setup the optimization process 
    similarity_model.compile(optimizer=optimizer, loss=margin_loss, metrics=[pos_distance, neg_distance])

    return similarity_model

In [42]:
%%time
import keras

# Inspired on https://'pastebin.com/TaGFdcBA
# TODO: https://stackoverflow.com/questions/49941903/keras-compute-cosine-distance-between-two-flattened-outputs
keras.backend.clear_session()

# Embeddings
cnn_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_D, 
                              trainable=True)
lstm_embedding_layer = embedding_layer(embeddings=baseline.embedding_matrix, 
                              num_words=len(vocab), 
                              embedding_dim=EMBEDDING_DIM, 
                              max_sequence_length=MAX_SEQUENCE_LENGTH_T, 
                              trainable=True)

# Feature models
cnn_feature_model = cnn_model(cnn_embedding_layer, MAX_SEQUENCE_LENGTH_D)
lstm_feature_model = lstm_model(lstm_embedding_layer, MAX_SEQUENCE_LENGTH_T)
mlp_feature_model = mlp_model(number_of_columns_info)

# Similarity model
encoded_anchor = siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'in')
encoded_positive = siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'pos')

encoded_negative = siamese_model(lstm_feature_model, cnn_feature_model, mlp_feature_model, 
                                     number_of_columns_info, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 'neg')

similarity_model = max_margin_objective(encoded_anchor, encoded_positive, encoded_negative)

# cnn_feature_model.summary()
# lstm_feature_model.summary()
similarity_model.summary()


'''
    Configuration
'''
epochs = 1000
best_recall = 0
best_epoch = 0
verbose = 0

'''
    Experiment
'''
for epoch in range(epochs):
    batch_triplet_train, \
        train_input_sample, train_input_pos, train_input_neg, \
            train_sim = baseline.batch_iterator(baseline.train_data, baseline.dup_sets_train, batch_size, 1)
    train_batch = [train_input_sample['title'], train_input_sample['description'], train_input_sample['info'],
                   train_input_pos['title'], train_input_pos['description'], train_input_pos['info'], 
                   train_input_neg['title'], train_input_neg['description'], train_input_neg['info']]
    
    h = similarity_model.train_on_batch(train_batch, train_sim)
    recall = evaluate_validation_test(verbose, encoded_anchor, retrieval.test, issues_by_buckets)
    print("Epoch: {} - Loss: {:.2f}, positive_cosine: {:.2f}, negative_cosine: {:.2f}, recall@25: {:.2f}".format(
        epoch+1, h[0], h[1], h[2], recall))
    if recall > best_recall:
        save_model(similarity_model, SAVE_PATH.replace('@number_of_epochs@', str(epochs)))
        save_model(encoded_anchor, SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)), verbose=1)
        best_recall = recall
        best_epoch = epoch+1
    # Without step decay for each 10 epochs because the Adam optimizer already do this
    # https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1
print('Best_epoch={}, Best_recall={:.2f}'.format(best_epoch, best_recall))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
info_pos (InputLayer)           (None, 1682)         0                                            
__________________________________________________________________________________________________
title_pos 

NameError: name 'get_buckets_from_validation' is not defined

In [43]:
# loss=h.history['loss']
# val_loss=h.history['val_loss']

# plt.plot(loss, label='loss')
# plt.plot(val_loss, label='val_loss')
# plt.title('Model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()

### Using the feature layers

In [44]:
def cosine_normalized(a, b): # Cosine used in the siamese model
    a = K.variable(a)
    b = K.variable(b)
    return K.eval(cosine_distance([a, b]))

#### Loading bugs of test

In [45]:
test = 'eclipse'

In [46]:
from scipy import spatial
if (DOMAIN == test):
    bug_set = baseline.get_bug_set()
    # Eclipse test
    bug_id = [96204, np.random.choice(list(bug_set))] # non-duplicate {15196, 2}
    # bug_id = [96204, 85581] # duplicate {85581, 96204, 106979}
    dup_a, dup_b = bug_id
    bug_a = bug_set[dup_a]
    bug_b = bug_set[dup_b]

    print(dup_a, dup_b)

96204 69255


#### LSTM feature

In [47]:
if (DOMAIN == test):
    print(bug_a['title'])
    print(bug_b['title'])

preferences filter text cut off using default fonts on organization
preferences preferencelook changed


In [48]:
if (DOMAIN == test):
    bug_vector_a_t = lstm_feature_model.predict(np.array([bug_a['title_word']]))[0]
    bug_vector_b_t = lstm_feature_model.predict(np.array([bug_b['title_word']]))[0]
    result = cosine_normalized(bug_vector_a_t, bug_vector_b_t)
    print(result)

0.5033863


In [49]:
if (DOMAIN == test):
    bug_vector_a_t, bug_vector_b_t

#### CNN feature

In [50]:
if (DOMAIN == test):
    print(bug_a['description'])
    print(bug_b['description'])

the standard default font size for organization desktops is points at dpi at this size the message type filter text in the preferences dialog is being cut off
it seems that the motif tk have organization with the height and width of organization if the user choose preference window with windows who needs more place on the screen to show the effects just do the following location open organization choose person close organization product reopen preferences product then new created organization is much taller than thar from step and shows more details it not possible to see the whole product without the sequence open chose close reopen


In [51]:
if (DOMAIN == test):
    bug_vector_a_d = cnn_feature_model.predict(np.array([bug_a['description_word']]))[0]
    bug_vector_b_d = cnn_feature_model.predict(np.array([bug_b['description_word']]))[0]
    result = cosine_normalized(bug_vector_a_d, bug_vector_b_d)
    print(result)

0.98731875


In [52]:
if (DOMAIN == test):
    bug_vector_a_d, bug_vector_b_d

#### MLP feature

In [53]:
if (DOMAIN == test):
    bug_vector_a_i = mlp_feature_model.predict(np.array([retrieval.get_info(bug_a)]))[0]
    bug_vector_b_i = mlp_feature_model.predict(np.array([retrieval.get_info(bug_b)]))[0]
    result = cosine_normalized(bug_vector_a_i, bug_vector_b_i)
    print(result)

0.91721916


In [54]:
if (DOMAIN == test):
    bug_vector_a_i, bug_vector_b_i

#### Merge features

In [55]:
if (DOMAIN == test):
    bug_vector_a = np.concatenate([ bug_vector_a_i, bug_vector_a_t, bug_vector_a_d ], -1)
    bug_vector_b = np.concatenate([ bug_vector_b_i, bug_vector_b_t, bug_vector_b_d ], -1)
    result = cosine_normalized(bug_vector_a, bug_vector_b)
    print(result)

0.8390911


In [56]:
if (DOMAIN == test):
    bug_vector_a, bug_vector_b

### Retrieval evaluation

In [57]:
retrieval.train_vectorized, retrieval.test_result = [], []
# Infer vector to all train
retrieval.read_train(path_train)

In [58]:
print("Total of queries:", len(retrieval.test))

Total of queries: 7253


In [59]:
retrieval.test[9]

[245784, [242965, 229882, 281586, 238557]]

#### Getting the model trained

In [60]:
SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))

'baseline_feature_1000epochs_64batch(eclipse)'

In [61]:
model = get_model_vectorizer(path=SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))

OSError: Unable to open file (unable to open file: name = 'modelos/model_baseline_feature_1000epochs_64batch(eclipse).h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
model.summary()

#### Vectorizing bugs from train

In [None]:
test_vectorized = vectorizer_test(model, retrieval.test)

#### Vectorizing bugs from test

In [None]:
queries_test_vectorized = vectorize_queries(model, retrieval.test, issues_by_buckets)

In [None]:
annoy = indexing_test(test_vectorized)

In [None]:
X_test, distance_test, indices_test = indexing_query(annoy, queries_test_vectorized)

In [None]:
print("Total test vectorized: {}".format(len(test_vectorized)))
print("Total queries vectorized: {}".format(len(queries_test_vectorized)))

In [None]:
formated_rank = rank_result(test_vectorized, indices_test, distance_test)

In [None]:
rank_queries = formating_rank(X_test)

In [None]:
exported_rank = export_rank(rank_queries, formated_rank)

In [None]:
exported_rank[:20]

In [None]:
with open(os.path.join(path, 'exported_rank.txt'), 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [None]:
'''
# Eclipse
    With CNN print all embeddings zero and 2 epochs
    {'1 - recall_at_5': 0.13,
     '2 - recall_at_10': 0.18,
     '3 - recall_at_15': 0.22,
     '4 - recall_at_20': 0.24}
     Without relu activation for each feature siamese in 100 epochs
     {'1 - recall_at_5': 0.16,
     '2 - recall_at_10': 0.23,
     '3 - recall_at_15': 0.27,
     '4 - recall_at_20': 0.31}
     Without dense in the last layer with 100 epochs with embed trainable
     {'1 - recall_at_5': 0.16,
     '2 - recall_at_10': 0.22,
     '3 - recall_at_15': 0.26,
     '4 - recall_at_20': 0.3}
      
      {'1 - recall_at_5': 0.16,
         '2 - recall_at_10': 0.22,
         '3 - recall_at_15': 0.26,
         '4 - recall_at_20': 0.29,
         '5 - recall_at_25': 0.29}
    With title (100 padding) and desc (500 padding) and batch refactored
        {'1 - recall_at_5': 0.2,
         '2 - recall_at_10': 0.26,
         '3 - recall_at_15': 0.3,
         '4 - recall_at_20': 0.33,
         '5 - recall_at_25': 0.33}
         
         {'1 - recall_at_5': 0.2,
         '2 - recall_at_10': 0.27,
         '3 - recall_at_15': 0.31,
         '4 - recall_at_20': 0.34,
         '5 - recall_at_25': 0.34}
         With recall in validation step and split 90 train 10 to test
         {'1 - recall_at_5': 0.25,
         '2 - recall_at_10': 0.32,
         '3 - recall_at_15': 0.37,
         '4 - recall_at_20': 0.4,
         '5 - recall_at_25': 0.4}
         With 200 epochs validation_recall@25 = 58, optimizer=Nadam
         {'1 - recall_at_5': 0.26,
         '2 - recall_at_10': 0.34,
         '3 - recall_at_15': 0.39,
         '4 - recall_at_20': 0.42,
         '5 - recall_at_25': 0.42}
         With 100 epochs validation_recall@25 = 52, optimizer=Adam
         {'1 - recall_at_5': 0.23,
         '2 - recall_at_10': 0.3,
         '3 - recall_at_15': 0.34,
         '4 - recall_at_20': 0.37,
         '5 - recall_at_25': 0.37}
        With 1000 epochs validation_recall@25=60, optimizer=Nadam
        {'1 - recall_at_5': 0.24,
         '2 - recall_at_10': 0.32,
         '3 - recall_at_15': 0.37,
         '4 - recall_at_20': 0.41,
         '5 - recall_at_25': 0.41}
         With 1000 epochs validation_recall@25=64, optimizer=Nadam
         {'1 - recall_at_5': 0.28,
         '2 - recall_at_10': 0.36,
         '3 - recall_at_15': 0.41,
         '4 - recall_at_20': 0.45,
         '5 - recall_at_25': 0.45}
         Withou change the distance x when calculate the cosine
         {'1 - recall_at_5': 0.18,
         '2 - recall_at_10': 0.24,
         '3 - recall_at_15': 0.28,
         '4 - recall_at_20': 0.31,
         '5 - recall_at_25': 0.31}
         With concatenation
         {'1 - recall_at_5': 0.23,
         '2 - recall_at_10': 0.31,
         '3 - recall_at_15': 0.36,
         '4 - recall_at_20': 0.4,
         '5 - recall_at_25': 0.43}
             
    # Open Office
    {'1 - recall_at_5': 0.2,
     '2 - recall_at_10': 0.27,
     '3 - recall_at_15': 0.31,
     '4 - recall_at_20': 0.34,
     '5 - recall_at_25': 0.34}
'''
evaluation = Evaluation()
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report