In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
MAX_SEQUENCE_LENGTH_T = 20 # 40
MAX_SEQUENCE_LENGTH_D = 200 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [4]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))

In [5]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [6]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=295707), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39545), HTML(value='')))




In [7]:
experiment.load_ids()

Reading bug ids


In [8]:
len(baseline.bug_ids)

61700

In [9]:
experiment.load_bugs()

HBox(children=(IntProgress(value=0, max=61700), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=295766), HTML(value='')))




In [11]:
%%time

experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

Reading train data
Reading bug ids
CPU times: user 1min 43s, sys: 45.3 ms, total: 1min 43s
Wall time: 1min 43s


In [12]:
df = pd.read_csv(DATASET)

In [13]:
df[df['bug_id'] == 8]

Unnamed: 0,bug_id,bug_severity,bug_status,component,creation_ts,delta_ts,description,dup_id,priority,product,resolution,short_desc,version
10,8,normal,RESOLVED,Team,2001-10-10 21:34:00 -0400,2011-05-17 11:40:32 -0400,"With the current VCM API, a repository adapter...",[],P3,Platform,WONTFIX,how can we support,2.0


In [14]:
'''
    Good examples
    Bugs = 60, 214092, 260566
''' 

df[df['bug_id'] == 260566][['short_desc', 'description']].values

array([['cant run new "Eclipse Application" run configuration with plugins from workspace',
        'Build ID: I20081211-1908\n\nSteps To Reproduce:\n1. Create plugin projects\n2. Create a new "Eclipse Application" with workspaces plugins\n3. Run it\n\n\nMore information:\nWith cocoa x86-64 version configured to use mac os x jre 1.6 (in Info.plist).\nI have some plugin projects in my workspace.\nHere is the crash report :\nProcess:         eclipse [1741]\nPath:            /eclipses/versions/eclipse-3.5M4/Eclipse.app/Contents/MacOS/eclipse\nIdentifier:      org.eclipse.eclipse\nVersion:         3.4 (3.4)\nCode Type:       X86-64 (Native)\nParent Process:  launchd [139]\n\nDate/Time:       2009-01-09 18:33:43.735 +0100\nOS Version:      Mac OS X 10.5.6 (9G55)\nReport Version:  6\n\nException Type:  EXC_BAD_ACCESS (SIGSEGV)\nException Codes: KERN_INVALID_ADDRESS at 0x0000000000000000\nCrashed Thread:  7\n\nApplication Specific Information:\n\nJava information:\n Exception type: Bus Error 

In [15]:
if 260566 in baseline.bug_set:
    print(baseline.bug_set[260566])

{'priority': '3\n', 'description': 'build id inumbernumbernumbernumbernumbernumbernumbernumber date steps to reproduce number create plugin projects number create a new product with workspaces plugins number run it more information with cocoa xnumbernumber number version configured to use mac os x jre number number in info plist i have some plugin projects in my workspace here is the crash report process eclipse numbernumbernumbernumber path eclipses versions eclipse number number mnumber eclipse app contents macorganization eclipse identifier org eclipse eclipse version number number number number code type xnumbernumber number native parent process launchd numbernumbernumber date time numbernumbernumbernumber numbernumber numbernumber numbernumber numbernumber numbernumber numbernumbernumber numbernumbernumbernumber organization version mac organization x numbernumber number number number gnumbernumber report version number exception type organization organization exception codes cou

In [13]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
if 260566 in baseline.bug_set:
    #text = "Test cant't test23@123 #ffff" 214092, 260566
    text = df[df['bug_id'] == 260566]['description'].values[0]
    def func_name_tokenize(text):
        s = []
        for i, c in enumerate(text):
          if c.isupper() and i > 0 and text[i-1].islower():
            s.append(' ')
          s.append(c)
        return ''.join(s).strip()

    tokens = re.compile(r'[\W_]+', re.UNICODE).split(str(text))
    text = ' '.join([func_name_tokenize(token) for token in tokens])
    text = re.sub(r'\d+((\s\d+)+)?', ' ', text)
    text = [word.lower() for word in nltk.word_tokenize(text)]
    text = ' '.join([word for word in text if len(word) > 1])
    print(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## NER 

Reference: http://minerandodados.com.br/index.php/2018/07/03/named-entity-recognition-como-isso-funciona/

In [17]:
!python -m spacy download en_core_web_lg

[38;5;2m[+] Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [208]:
import re
import nltk

ENTITY_ENUM = {
    '': 'unknown',
    'PERSON': 'person',
    'NORP': 'nationality',
    'FAC': 'facility',
    'ORG': 'organization',
    'GPE': 'country',
    'LOC': 'location',
    'PRODUCT': 'product',
    'EVENT': 'event',
    'WORK_OF_ART': 'artwork',
    'LANGUAGE': 'language',
    'DATE': 'date',
    'TIME': 'time',
    # 'PERCENT': 'percent',
    # 'MONEY': 'money',
    # 'QUANTITY': 'quantity',
    # 'ORDINAL': 'ordinal',
    # 'CARDINAL': 'cardinal',
    'PERCENT': 'number',
    'MONEY': 'number',
    'QUANTITY': 'number',
    'ORDINAL': 'number',
    'CARDINAL': 'number',
    'LAW': 'law'
}

def func_name_tokenize(text):
    s = []
    for i, c in enumerate(text):
      if c.isupper() and i > 0 and text[i-1].islower():
        s.append(' ')
      s.append(c)
    return ''.join(s).strip()

def ner_replace(ner, text):
    corpus = ner(text)
    for row in corpus.ents:
      text = text.replace(row.text, ENTITY_ENUM[row.label_] if row.label_ in ENTITY_ENUM else row.label_)
    return text

def normalize_text(ner, text):
    text = re.sub(r'(bug|Bug) (#|)[0-9]{1,}', 'bug id', str(text)) # bug id
    text = re.sub(r'\w{2,}(.java)', 'java class', str(text)) # .java class files
    text = ner_replace(ner, str(text))
    tokens = re.compile(r'[\W_]+', re.UNICODE).split(text)
    text = ' '.join([func_name_tokenize(token) for token in tokens])
    text = re.sub(r'\d+((\s\d+)+)?', ' ', text)
    text = text[:100000] # limit of spacy lib
#     text = ner_replace(ner, text)
    text = [word.lower() for word in nltk.word_tokenize(text)]
    text = ' '.join([word for word in text]).encode('utf-8')
    return text

In [109]:
ner = spacy.load('en_core_web_lg')

In [70]:
###### import spacy
import spacy
import numpy as np
from spacy import displacy

#idx = 3 # 37
'''
    ID with stack stacktrace
    94184, 331595
'''
idx = np.random.choice(list(baseline.bug_set), 1)[0]
#idx = 281806

print("ID=", idx)

"""
Some issues with bug in the description: 40, 41, 42, 43, 44
"""

sentence = df[df['bug_id'] == idx]['description'].values[0]
#sentence2 = baseline.bug_set[baseline.bug_ids[idx]]['description']

texto = ner(sentence)
texto2 = normalize_text(ner, sentence)

displacy.render(texto, style='ent', jupyter=True)
print("###########################################################################################")
print(texto2)
print("###########################################################################################")
print(baseline.bug_set[idx]['description'])

ID= 281806


###########################################################################################
b'build id inumber numbernumber number steps to reproduce number create an enum inside a class like this class foo enum bar single line a multiline comment b number format shift ctrl f number the number lines of the comment get wrong indentation class foo enum bar single line a multiline comment b more information this happens only with enums that inside something not with top level enums'
###########################################################################################
build id inumber numbernumber number steps to reproduce number create an enum inside a class like this class foo enum bar single line a multiline comment b number format shift ctrl f number the number lines of the comment get wrong indentation class foo enum bar single line a multiline comment b more information this happens only with enums that inside something not with top level enums


## Optimizing NER

https://pypi.org/project/excelcy/

In [106]:
nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
#nlp.to_disk('model')

## Some bug ids example

Eclipse

- numerical 
    - 61364
- http link
    - 267713
- stack trace
    - 368109, 35801, 367500, 253193, 281806, 352726, 317126
- steps
    - 327983, 49639, 197215, 187705
- bug id citted
    - 404690, 348357, 26121

In [None]:
# Dates
dates = []
for year in range(2000, 2012):
    for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Oct', 'Nov', 'Dec']:
        for day in range(32):
            dates.append( u'{} {}, {}'.format(day, month, year))

In [190]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Solution from https://github.com/explosion/spaCy/issues/3608
class EntityMatcher(object):
    #name = "entity_matcher"

    def __init__(self, name, nlp, terms, label):
        self.name = name
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        seen_tokens = set()
        new_entities = []
        entities = doc.ents
        for match_id, start, end in matches:
        #    span = Span(doc, start, end, label=match_id)
        #    doc.ents = list(doc.ents) + [span]
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))

        doc.ents = tuple(entities) + tuple(new_entities)
        return doc

nlp = spacy.load('en_core_web_lg')

list_terms = [dates,
              (u'Oracle Corporation', u'Oracle', u'Mozilla', u'Google'),
              (u"the Java", u"Java", u"java", u"the java", u"Javadoc", u'API', 
               u"The Javadoc", u"the Javadoc", u"C++", u'c++', u'C/C++', u'XML', u'xml', u'SQL', u'sql',
               u'HTML5', u'HTTP', u'html', u'http', u'html5' u'html 5', u'HTML 5'), 
              (u"Dennis", u"Bob", u"Kamil", u'Kamil Ignacak'), 
              (u'WAR', u'zip'),
              (u'MacOS', u'MacOS X', u'MacOS x', u'Mac OS X', u'Redhat Linux', u'RedHat Enterprise', u'Linux', 
               u'Eclipse', u'eclipse', u'The Eclipse', u'WindowsXP', u'Windows XP', u'Java Virtual Machine', 
               u'VM', u'BIRT', u'Birt Web project', u'Birt', u'Birt Charting', u'JIRA', u'linux',
               u'CDT', u'JREs', u'JRE', u'jre', u'Windows NT', u'SWT', u'CVS', u'Fedora Core',
              u'Tomcat', u'Axis', u'Red Hat', u'GTK'),
              (u'JDK', u'JDT', u'AJNature', u'JavaBuilder', u'AJBuilder', u'OclInvalid', u'Aerogear', 
               u'JSP', u'JGit', u'SDK', u'JEE', u'EPP', u'JEE EPP', u'Widget'),
              (u'1.', u'1)', u'2.', u'2)', u'3.', u'3)',
               u'4.', u'4)', u'5.', u'5)', u'6.', u'6)', u'7.', u'7)',
                u'8.', u'8)', u'9.', u'9)', u'10.', u'10)'),
              (u'ctrl', u'CTRL', u'F1', u'f1', u'F2', u'f2', u'F3', u'f3',
               u'f4', u'F4', u'f5', u'F5', u'f6', u'F6', u'f7', u'F7', u'f8', u'F8',
               u'f9', u'F9', u'f10', u'F10', u'f11', u'F11', u'f12', u'F12', 
               u'CTRL+F1', u'CTRL+F2', u'CTRL+F3', u'CTRL+F4', u'CTRL+F5', u'CTRL+F6',
              u'CTRL+F7', u'CTRL+F8', u'CTRL+F9', u'CTRL+F10', u'CTRL+F11', u'CTRL+F12',
               u'CTRL+TAB', u'ctrl+tab', u'ESC', u'Esc', u'esc', u'CTRL+1', u'CTRL+2', u'CTRL+3', u'CTRL+4',
              u'CTRL+5', u'CTRL+6', u'CTRL+7', u'CTRL+8', u'CTRL+9', u'CTRL+0', u'ctrl+1', u'ctrl+2',
              u'ctrl+3', u'ctrl+4', u'ctrl+5', u'ctrl+6', u'ctrl+7', u'ctrl+8', u'ctrl+9', u'ctrl+0',
              u'crtl + space', u'CTRL + SPACE', u'CTRL + Space', u'CTRL-C', u'CTRL-V', u'ctrl-c', u'ctrl-v')
             ]
list_labels = ['DATE', 'ORG', "LANGUAGE", "PERSON", "FILE", "PRODUCT", "COMPONENT", "STEP NUMBER", "KEYBOARD"]

for terms, label in zip(list_terms, list_labels):
    entity_matcher = EntityMatcher(label, nlp, terms, label)
    nlp.add_pipe(entity_matcher, after='ner')

In [212]:
import numpy as np
from spacy import displacy

#idx = 3 # 37
'''
    ID with stack stacktrace
    94184, 331595
'''
idx = np.random.choice(list(baseline.bug_set), 1)[0]
#idx = 194802

print("ID=", idx)

"""
Some issues with bug in the description: 40, 41, 42, 43, 44
"""

sentence = df[df['bug_id'] == idx]['description'].values[0]
#sentence2 = baseline.bug_set[baseline.bug_ids[idx]]['description']

texto = nlp(sentence)
texto2 = normalize_text(nlp, sentence)

displacy.render(texto, style='ent', jupyter=True)
print("###########################################################################################")
print(texto2)
print("###########################################################################################")
print(baseline.bug_set[idx]['description'])

ID= 318782


###########################################################################################
b'build identifier number when i bind the text value of a organization swt search date with a search bean when i click on the organization cancel icon the verify listener of the swtvetoable value decorator produce a string index out of bounds exception exception in thread main language lang string index out of bounds exception string index out of range language class lang string substring language class language class lang string substring language class at org eclipse jface internal databinding swt swtvetoable value decorator step numberhandle event language class at org eclipse swt widgets event table send event language class at org eclipse swt widgets widget send event language class at org eclipse swt widgets widget send event language class at org eclipse swt widgets widget send event language class at org eclipse swt widgets organization verifyorganization language class at org eclipse sw