# Bug triage with Deep Learning - Fasttext

In [1]:
from __future__ import print_function, division

In [2]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [3]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


## Configurações Globais

In [4]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

### Parse bugs preproprecessed

In [5]:
# Domain to use
DOMAIN = 'openoffice'
METHOD = 'fasttext'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Path embeddings
EMBED_DIR='data/embed'
# Save model
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs({})'.format(METHOD, DOMAIN)

# Test bugs
PATH_TEST = 'data/processed/{}/test.txt'.format(DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

## Auxiliary methods

In [6]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

#### Creating buckets and initialize the retrieval method

In [7]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=57667), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14567), HTML(value='')))




#### Loading bug ids in memory

In [8]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


72234

#### Dicionário de títulos e descrições

In [9]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=72234), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 26.5 s, sys: 999 ms, total: 27.5 s
Wall time: 26.9 s


#### Get buckets for each issue

In [10]:
verbose = 0
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=58572), HTML(value='')))




#### Prepare dataset

In [11]:
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

Reading train data
Reading bug ids


#### Creating the queries to test

In [12]:
# Read and create the test queries duplicate
experiment.create_queries()

Reading queries from baseline.


#### Bugs train

In [13]:
bugs_train_ids = []
for bugs in experiment.baseline.train_data:
    bugs_train_ids.append(bugs[0])
    bugs_train_ids.append(bugs[1])

#### Corpus

In [14]:
lines_title = [baseline.bug_set[bug_id]['title'] for bug_id in bugs_train_ids]
lines_desc = [baseline.bug_set[bug_id]['description'] for bug_id in bugs_train_ids]

lines = []
for title, desc in zip(lines_title, lines_desc):
    lines.append(title)
    lines.append(desc)

In [15]:
len(lines)

194300

#### Exporting corpus to .txt

In [16]:
EXPORT_PATH = os.path.join(DIR, 'corpus_fasttext_train.txt')
print(EXPORT_PATH)
with open(EXPORT_PATH, 'w') as f:
    for row in lines:
        f.write("{}\n".format(str(row).encode("utf-8")))

data/processed/openoffice/corpus_fasttext_train.txt


#### Train the method

In [17]:
%%time

import os

cpu = os.cpu_count() - 1

max_epochs = 15

# Skipgram model :
model = fasttext.train_unsupervised(EXPORT_PATH, 
                                    model='cbow', 
                                    minCount=1,
                                    neg=5,
                                    wordNgrams=3,
                                    epoch=max_epochs, 
                                    dim=300, 
                                    loss='hs', 
                                    thread=cpu, 
                                    verbose=2)

CPU times: user 1h 10min 59s, sys: 5.32 s, total: 1h 11min 5s
Wall time: 10min 15s


#### Rank@25

In [18]:
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [19]:
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, verbose, model, issues_by_buckets, 
                                                            bug_train_ids, method=METHOD)
"recall@25 last epoch:", recall

('recall@25 last epoch:', 0.49)

### Retrieval evaluation

In [20]:
print("Total of queries:", len(retrieval.test))

Total of queries: 2086


In [21]:
exported_rank[:20]

['108544:111059,109674,108379,109366|102518:0.446735143661499,109550:0.3897196054458618,102603:0.38123422861099243,107914:0.37953507900238037,113149:0.3665321469306946,113630:0.36473530530929565,95327:0.3622644543647766,93805:0.3604477047920227,109748:0.3575509190559387,114182:0.35405272245407104,118253:0.3482530117034912,115560:0.3472682237625122,97253:0.3455939292907715,101718:0.34379833936691284,95268:0.34087926149368286,104038:0.3389917016029358,110623:0.33712756633758545,97216:0.33601391315460205,107778:0.3346678614616394,12248:0.3341072201728821,107209:0.3338015675544739,96362:0.3334920406341553,100273:0.33040815591812134,120075:0.33018040657043457,120087:0.3300668001174927,95489:0.32895058393478394,98005:0.3283868432044983,107372:0.32837188243865967,89314:0.32808566093444824',
 '109674:108544,111059,108379,109366|111059:0.37429213523864746,98033:0.3424628973007202,114077:0.3157866597175598,108840:0.3140132427215576,98569:0.29881536960601807,98564:0.2932578921318054,108784:0.2912

#### Exporting the result

In [22]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))

In [23]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [24]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.33,
 '2 - recall_at_10': 0.38,
 '3 - recall_at_15': 0.43,
 '4 - recall_at_20': 0.46,
 '5 - recall_at_25': 0.49}

#### Saving model

In [25]:
model.save_model(os.path.join('modelos', SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(max_epochs)) + '.bin'))