In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
MAX_SEQUENCE_LENGTH_T = 20 # 40
MAX_SEQUENCE_LENGTH_D = 200 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [4]:
# Domain to use
DOMAIN = 'eclipse'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))

In [5]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [6]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=322339), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39545), HTML(value='')))




In [7]:
experiment.load_ids()

Reading bug ids


In [8]:
len(baseline.bug_ids)

361006

In [9]:
experiment.load_bugs()

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=321536), HTML(value='')))




In [11]:
%%time

experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

Reading train data
Reading bug ids
CPU times: user 5min 15s, sys: 92.6 ms, total: 5min 15s
Wall time: 5min 22s


In [12]:
df = pd.read_csv(DATASET)

In [13]:
'''
    Good examples
    Bugs = 60, 214092, 260566
''' 

df[df['bug_id'] == 260566][['short_desc', 'description']].values

array([['cant run new "Eclipse Application" run configuration with plugins from workspace',
        'Build ID: I20081211-1908\n\nSteps To Reproduce:\n1. Create plugin projects\n2. Create a new "Eclipse Application" with workspaces plugins\n3. Run it\n\n\nMore information:\nWith cocoa x86-64 version configured to use mac os x jre 1.6 (in Info.plist).\nI have some plugin projects in my workspace.\nHere is the crash report :\nProcess:         eclipse [1741]\nPath:            /eclipses/versions/eclipse-3.5M4/Eclipse.app/Contents/MacOS/eclipse\nIdentifier:      org.eclipse.eclipse\nVersion:         3.4 (3.4)\nCode Type:       X86-64 (Native)\nParent Process:  launchd [139]\n\nDate/Time:       2009-01-09 18:33:43.735 +0100\nOS Version:      Mac OS X 10.5.6 (9G55)\nReport Version:  6\n\nException Type:  EXC_BAD_ACCESS (SIGSEGV)\nException Codes: KERN_INVALID_ADDRESS at 0x0000000000000000\nCrashed Thread:  7\n\nApplication Specific Information:\n\nJava information:\n Exception type: Bus Error 

In [14]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## NER 

Reference: http://minerandodados.com.br/index.php/2018/07/03/named-entity-recognition-como-isso-funciona/

In [15]:
!python -m spacy download en_core_web_lg

[38;5;2m[+] Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [16]:
###### import spacy
import spacy
import numpy as np
from spacy import displacy

In [76]:
import re
import nltk
from contractions import contractions_dict

# Example
# https://medium.com/@pemagrg/pre-processing-text-in-python-ad13ea544dae

ENTITY_ENUM = {
    '': 'unknown',
    'PERSON': 'person',
    'NORP': 'nationality',
    'FAC': 'facility',
    'ORG': 'organization',
    'GPE': 'country',
    'LOC': 'location',
    'PRODUCT': 'product',
    'EVENT': 'event',
    'WORK_OF_ART': 'artwork',
    'LANGUAGE': 'language',
    'DATE': 'date',
    'TIME': 'time',
    # 'PERCENT': 'percent',
    # 'MONEY': 'money',
    # 'QUANTITY': 'quantity',
    # 'ORDINAL': 'ordinal',
    # 'CARDINAL': 'cardinal',
    'PERCENT': 'number',
    'MONEY': 'number',
    'QUANTITY': 'number',
    'ORDINAL': 'number',
    'CARDINAL': 'number',
    'LAW': 'law'
}

# Keyboards
keyboards = [u'ctrl', u'CTRL', u'CTRL\+TAB', u'ctrl\+tab', u'ESC', u'Esc', u'esc', u'crtl \+ space', 
             u'CTRL \+ SPACE', u'CTRL + Space', u'CTRL\-C', u'CTRL\-V', u'ctrl\-c', u'ctrl\-v', u'Ctrl-z', u'Ctrl - z',
            u'CTRL-z', u'Ctrl+z', u'ctrl-z', u'ctrl+z', u'CTRL - z', u'Ctrl + z', u'CTRL+z', u'CTRL+Z', u'CTRL + Z',
            u'CTRL- Z']
for i in range(0, 13):
    # Ctrl+number
    keyboards.append(u'CTRL\+{}'.format(i))
    keyboards.append(u'Ctrl\+{}'.format(i))
    keyboards.append(u'ctrl\+{}'.format(i))
    keyboards.append(u'CTRL \+ {}'.format(i))
    keyboards.append(u'Ctrl \+ {}'.format(i))
    keyboards.append(u'ctrl \+ {}'.format(i))
    keyboards.append(u'CTRL\-{}'.format(i))
    keyboards.append(u'Ctrl\-{}'.format(i))
    keyboards.append(u'ctrl\-{}'.format(i))
    # F+number
    keyboards.append(u'F{}'.format(i))
    keyboards.append(u'f{}'.format(i))
    
def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = contractions_dict.get(match) \
                if contractions_dict.get(match) \
                else contractions_dict.get(match.lower())
            expanded_contraction = expanded_contraction
            return expanded_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
    
def func_name_tokenize(text):
    s = []
    for i, c in enumerate(text):
      if c.isupper() and i > 0 and text[i-1].islower():
        s.append(' ')
      s.append(c)
    return ''.join(s).strip()

def ner_replace(corpus, text, allow_ner):
    ents, start_char, end_char = [], [], []
    
    ents = [ENTITY_ENUM[row.label_] if row.label_ in ENTITY_ENUM else row.label_ for row in corpus.ents]
    starts_char = np.array([row.start_char for row in corpus.ents])
    ends_char = np.array([row.end_char for row in corpus.ents])
    
    for index, ent, start_pos, end_pos in zip(range(len(ents)), ents, starts_char, ends_char):
        if ent.lower() in allow_ner:
            replaced = " {} ".format(ent.lower())
            text = text[:start_pos] + replaced + text[end_pos:]
            diff_replaced = len(replaced) - len(text[start_pos:end_pos])
            if diff_replaced > 0: # push
                starts_char[index+1:] += diff_replaced
                ends_char[index+1:] += diff_replaced
            elif diff_replaced < 0: # pull
                starts_char[index+1:] -= (diff_replaced * -1)
                ends_char[index+1:] -= (diff_replaced * -1)
    return text

def normalize_text(corpus, text, allow_ner=None, ner=None):
    # Bug links
    text = re.sub(r'(https://bugs.eclipse.org/bugs/show_bug\.cgi\?id\=)[0-9]{1,}', 'bug id', str(text)) # extension file
    text = re.sub(r'(bug|Bug) (#|)[0-9]{1,}', 'bug id', text) # bug id
    
    text = re.sub(r'(build|Build|Build Identifier|build identifier)( #|#| | |: |:| :| I| i)[0-9]{1,}', 'build id ', text) # build id
    text = re.sub(r'(npe|NPE)', 'null pointer exception', text) # npe to null pointer exception
    text = re.sub(r'(vm|VM)', 'virtual machine', text) # VM to Virtual Machine
    text = re.sub(r'[0-9]{1,} (min|minutes|minute|m)', 'x time', text) # [0-9] min
    # Extension files
    text = re.sub(r'(WAR|zip|ZIP)', 'extension file', text) # extension file
    text = re.sub(r'.(zip|txt|java|js|html|php|pdf|exe|doc|jar|xml)', ' extension file', text) # extension file
    # Keyboards
    text = re.sub(r'('+('|'.join(keyboards))+')', 'keyboard', text) # key board
    # Contraction
    text=expand_contractions(text,contractions_dict)
    #text = re.sub(r'(doesn\'t)', 'does not', text) # does not 
    #text = re.sub(r'\w{2,}(.java)', 'code class', str(text)) # .java class files

    # NER processing
    text = text[:100000] # limit of spacy lib
    if ner != None:
        corpus = ner(text)
    text = ner_replace(corpus, text, allow_ner)

    tokens = re.compile(r'[\W_]+', re.UNICODE).split(text)
    text = ' '.join([func_name_tokenize(token) for token in tokens])
    #     text = ' '.join(tokens)
    
    text = re.sub(r'\d+((\s\d+)+)?', 'number', text)
    text = [word.lower() for word in nltk.word_tokenize(text)]
    text = ' '.join([word for word in text]).encode('utf-8')
    return text, corpus

In [18]:
ner = spacy.load('en_core_web_lg')

In [19]:
#idx = 3 # 37
'''
    ID with stack stacktrace
    94184, 331595
'''
#idx = np.random.choice(list(baseline.bug_set), 1)[0]
idx = 82448

print("ID=", idx)

"""
Some issues with bug in the description: 40, 41, 42, 43, 44
"""

sentence = df[df['bug_id'] == idx]['description'].values[0]
#sentence2 = baseline.bug_set[baseline.bug_ids[idx]]['description']

texto = ner(sentence)
texto2 = normalize_text(texto, sentence, list(ENTITY_ENUM))

displacy.render(texto, style='ent', jupyter=True)
print("###########################################################################################")
print(texto2)
print("###########################################################################################")
print(baseline.bug_set[idx]['description'])

ID= 82448


###########################################################################################
(b'internal compiler error java lang null pointer exception at org eclipse jdt internal compiler ast annotation resolve type annotation extension file at org eclipse jdt internal compiler ast astnode resolve annotations astnode extension file at org eclipse jdt internal compiler ast abstract methodaration resolve abstract methodaration extension file at org eclipse jdt internal compiler ast typearation resolve typearation extension file at org eclipse jdt internal compiler ast typearation resolve typearation extension file at org eclipse jdt internal compiler ast compilation unitaration resolve compilation unitaration extension file at org eclipse jdt internal compiler compiler process compiler extension file at org eclipse jdt internal compiler compiler compile compiler extension file at org eclipse jdt internal core builder abstract image builder compile abstract image builder extension file a

## Optimizing NER

https://pypi.org/project/excelcy/

## Some bug ids example

Eclipse

- numerical 
    - 61364
- http link
    - 267713
- stack trace
    - 368109, 35801, 367500, 253193, 281806, 352726, 317126, 277559
- steps
    - 327983, 49639, 197215, 187705, 30351, 6747
- bug id
    - 404690, 348357, 26121, 293159
- build id
    - 5516, 301002
- Entidades que dizem a mesma coisa (NPE = Null Pointer Exception)
    - 120679
- Keyboard examples
    - 203041, 5081
- Extension files
    - 380627

In [21]:
# Dates
dates = []
for year in range(2000, 2012):
    for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Oct', 'Nov', 'Dec']:
        for day in range(32):
            dates.append( u'{} {}, {}'.format(day, month, year))

In [37]:
steps = []
for i in range(15):
    steps.append(u'{}. '.format(i))
    steps.append(u'({}) '.format(i))
    steps.append(u'{}) '.format(i))

In [67]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Solution from https://github.com/explosion/spaCy/issues/3608
class EntityMatcher(object):
    #name = "entity_matcher"

    def __init__(self, name, nlp, terms, label):
        self.name = name
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        seen_tokens = set()
        new_entities = []
        entities = doc.ents
        for match_id, start, end in matches:
        #    span = Span(doc, start, end, label=match_id)
        #    doc.ents = list(doc.ents) + [span]
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))

        doc.ents = tuple(entities) + tuple(new_entities)
        return doc

nlp = spacy.load('en_core_web_lg')

'''
Organization
(u'Oracle Corporation', u'Oracle', u'Mozilla', u'Google', u'IBM', u'An IBM')
Product
(u'MacOS', u'MacOS X', u'MacOS x', u'Mac OS X', u'Redhat Linux', u'RedHat Enterprise',
               u'Linux', u'Windows XP', u'Java Virtual Machine', 
               u'VM', u'BIRT', u'Birt Web project', u'Birt', u'Birt Charting', u'JIRA', u'linux',
               u'CDT', u'JREs', u'JRE', u'jre', u'Windows NT', u'SWT', u'CVS', u'Fedora Core',
              u'Tomcat', u'Axis', u'Red Hat', u'GTK'),
Component
(u'JDK', u'JDT', u'AJNature', u'JavaBuilder', u'AJBuilder', u'OclInvalid', u'Aerogear', 
u'JSP', u'JGit', u'SDK', u'JEE', u'EPP', u'JEE EPP', u'Widget'),
'''

list_terms = [dates,
              (u'API', u"The Javadoc", u"the Javadoc", u"C++", u'c++', u'C/C++', u'XML', u'xml',
               u'CSS', u'css', u'SQL', u'sql',
               u'HTML5', u'HTTP', u'html', u'http', u'html5' u'html 5', 'HTML', u'HTML 5'),
              (u'MacOS', u'MacOS X', u'MacOS x', u'Mac OS X', u'Redhat Linux', u'RedHat Enterprise',
               u'Linux', u'Windows XP', u'WindowsXP', u'Windows NT', u'Fedora Core', u'Red Hat'),
              steps
             ]
list_labels = ['DATE', "LANGUAGE", "PRODUCT", "STEP INDEX"]

allow_ner = ['person', 'time', 'number']

allow_ner += [ent.lower() for ent in list_labels]

for terms, label in zip(list_terms, list_labels):
    entity_matcher = EntityMatcher(label, nlp, terms, label)
    nlp.add_pipe(entity_matcher, after='ner')

In [80]:
import numpy as np
from spacy import displacy

#idx = 3 # 37
'''
    ID with stack stacktrace
    94184, 331595
'''
idx = np.random.choice(list(baseline.bug_set), 1)[0]
# idx = 208547 # 327983, 49639, 197215, 187705, 30351

print("ID=", idx)

"""
Some issues with bug in the description: 40, 41, 42, 43, 44
"""

sentence = df[df['bug_id'] == idx]['description'].values[0]
#sentence2 = baseline.bug_set[baseline.bug_ids[idx]]['description']

#texto = nlp(sentence)
texto2, texto2_ner = normalize_text(None, sentence, allow_ner=allow_ner, ner=nlp)
#print(sentence)
print("###########################################################################################")
displacy.render(texto2_ner, style='ent', jupyter=True)
print("###########################################################################################")
print(texto2)
print("###########################################################################################")
print(baseline.bug_set[idx]['description'])

ID= 192280
###########################################################################################


###########################################################################################
b'driver wtp number rcnumber create an axisnumber client scenario with this url http www xmethods net sd number currency exchange service wsdl resulted in the following error exception occurred while code generation for wsdl null extension file lang reflect invocation target exception at sun reflect native method accessor impl invokenumber native method at sun reflect native method accessor impl invoke native method accessor impl extension file number at sun reflect delegating method accessor impl invoke delegating method accessor impl extension file number at extension file lang reflect method invoke method extension file number at org eclipse extension filet ws axisnumber consumption core utils wsdlnumberjava generator get axis service wsdlnumberjava generator extension file number at org eclipse extension filet ws axisnumber consumption core command axisnumberclient codegen command extension 