# Bug triage with Deep Learning - Fasttext

In [1]:
from __future__ import print_function, division

In [2]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [3]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


## Configurações Globais

In [4]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

### Parse bugs preproprecessed

In [5]:
# Domain to use
DOMAIN = 'openoffice'
METHOD = 'fasttext'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Path embeddings
EMBED_DIR='data/embed'
# Save model
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)

# Test bugs
PATH_TEST = 'data/processed/{}/test.txt'.format(DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

## Auxiliary methods

In [6]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

#### Creating buckets and initialize the retrieval method

In [7]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=83503), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14567), HTML(value='')))




#### Loading bug ids in memory

In [8]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


98070

#### Dicionário de títulos e descrições

In [9]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 40.6 s, sys: 1.13 s, total: 41.7 s
Wall time: 41.3 s


#### Get buckets for each issue

In [10]:
verbose = 0
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=83503), HTML(value='')))




#### Prepare dataset

In [11]:
experiment.prepare_dataset(issues_by_buckets)

Reading train data
Reading bug ids


#### Creating the queries to test

In [12]:
# Read and create the test queries duplicate
experiment.create_queries()

Reading queries from baseline.


#### Bugs train

In [13]:
bugs_train_ids = []
for bugs in experiment.baseline.train_data:
    bugs_train_ids.append(bugs[0])
    bugs_train_ids.append(bugs[1])

#### Corpus

In [14]:
lines_title = [baseline.bug_set[bug_id]['title'] for bug_id in bugs_train_ids]
lines_desc = [baseline.bug_set[bug_id]['description'] for bug_id in bugs_train_ids]

lines = []
for title, desc in zip(lines_title, lines_desc):
    lines.append(title)
    lines.append(desc)

In [15]:
len(lines)

214960

#### Exporting corpus to .txt

In [16]:
EXPORT_PATH = os.path.join(DIR, 'corpus_fasttext_train.txt')
print(EXPORT_PATH)
with open(EXPORT_PATH, 'w') as f:
    for row in lines:
        f.write("{}\n".format(str(row).encode("utf-8")))

data/processed/openoffice/corpus_fasttext_train.txt


#### Train the method

In [17]:
%%time

import os

cpu = os.cpu_count() - 1

max_epochs = 60

# Skipgram model :
model = fasttext.train_unsupervised(EXPORT_PATH, 
                                    model='cbow', 
                                    minCount=1,
                                    neg=12,
                                    wordNgrams=1,
                                    epoch=max_epochs, 
                                    dim=300, 
                                    loss='hs', 
                                    thread=cpu, 
                                    verbose=2)

CPU times: user 4h 52min 30s, sys: 8.78 s, total: 4h 52min 39s
Wall time: 42min 5s


#### Rank@25

In [18]:
recall, exported_rank = experiment.evaluate_validation_test(retrieval, verbose, model, issues_by_buckets, method=METHOD)
"recall@25 last epoch:", recall

('recall@25 last epoch:', 0.23)

### Retrieval evaluation

In [19]:
print("Total of queries:", len(retrieval.test))

Total of queries: 3861


In [20]:
exported_rank[:20]

['16384:15363|4537:0.5025347173213959,48071:0.49102187156677246,2880:0.4774143695831299,6464:0.47304999828338623,5560:0.4571218490600586,8967:0.4526106119155884,32175:0.4516299366950989,92817:0.4481218457221985,10613:0.4479129910469055,107914:0.4441019296646118,25140:0.44124460220336914,87901:0.4400675892829895,101725:0.4375605583190918,53845:0.435147225856781,5149:0.4260609745979309,1034:0.42392146587371826,12710:0.42252296209335327,58672:0.418052077293396,28625:0.4163869023323059,22519:0.4100836515426636,70692:0.4096079468727112,85426:0.4085348844528198,93583:0.40640389919281006,91015:0.40582120418548584,122149:0.4024706482887268',
 '14604:15363|12510:0.5464937090873718,28625:0.48765021562576294,92817:0.46548765897750854,74609:0.4652732014656067,76405:0.45791661739349365,122149:0.4288351535797119,106833:0.4237746000289917,91015:0.42301076650619507,76067:0.42263466119766235,109284:0.41643524169921875,11491:0.41592323780059814,53845:0.4156246781349182,58672:0.4143673777580261,4366:0.41

#### Exporting the result

In [21]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))

In [22]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [25]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.15,
 '2 - recall_at_10': 0.19,
 '3 - recall_at_15': 0.2,
 '4 - recall_at_20': 0.22,
 '5 - recall_at_25': 0.23}

#### Saving model

In [24]:
model.save_model(os.path.join('modelos', SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(max_epochs)) + '.bin'))