# Bug triage with Deep Learning - Fasttext

In [2]:
from __future__ import print_function, division

In [3]:
import fasttext
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [5]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

## Configurações Globais

In [6]:
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

### Parse bugs preproprecessed

In [7]:
# Domain to use
DOMAIN = 'eclipse'
METHOD = 'fasttext'
# Dataset paths
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
# Path embeddings
EMBED_DIR='data/embed'
# Save model
SAVE_PATH = 'propose_feature@number_of_epochs@epochs_64batch({})'.format(DOMAIN)
SAVE_PATH_FEATURE = 'propose_feature_@number_of_epochs@epochs_64batch({})'.format(DOMAIN)

# Test bugs
PATH_TEST = 'data/processed/{}/test.txt'.format(DOMAIN)

# Extract CORPUs
EXTRACT_CORPUS = False

## Auxiliary methods

In [8]:
baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
evaluation = Evaluation(verbose=0)
experiment = Experiment(baseline, evaluation)

#### Loading bug ids in memory

In [9]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


212512

### Dicionário de títulos e descrições

In [10]:
%%time

experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 1min 13s, sys: 2.34 s, total: 1min 15s
Wall time: 1min 14s


### Prepare dataset

In [11]:
experiment.prepare_dataset()

Reading train data


#### Bugs train

In [12]:
bugs_train_ids = []
for bugs in experiment.baseline.train_data:
    bugs_train_ids.append(bugs[0])
    bugs_train_ids.append(bugs[1])

#### Corpus

In [13]:
lines_title = [baseline.bug_set[bug_id]['title'] for bug_id in bugs_train_ids]
lines_desc = [baseline.bug_set[bug_id]['description'] for bug_id in bugs_train_ids]

lines = []
for title, desc in zip(lines_title, lines_desc):
    lines.append(title)
    lines.append(desc)

In [14]:
len(lines)

316292

#### Exporting corpus to .txt

In [15]:
EXPORT_PATH = os.path.join(DIR, 'corpus_fasttext_train.txt')
print(EXPORT_PATH)
with open(EXPORT_PATH, 'w') as f:
    for row in lines:
        f.write("{}\n".format(str(row).encode("utf-8")))

data/processed/eclipse/corpus_fasttext_train.txt


#### Train the method

In [16]:
%%time

import os

cpu = os.cpu_count() - 1

# Skipgram model :
model = fasttext.train_unsupervised(EXPORT_PATH, 
                                    model='cbow', 
                                    minCount=1,
                                    neg=10,
                                    wordNgrams=1,
                                    epoch=15, 
                                    dim=300, 
                                    loss='hs', 
                                    thread=cpu, 
                                    verbose=2)

CPU times: user 4h 16min 15s, sys: 7.63 s, total: 4h 16min 23s
Wall time: 36min 53s


#### Creating buckets and initialize the retrieval method

In [17]:
retrieval = Retrieval()
experiment.retrieval(retrieval, baseline, -1, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))




#### Get buckets for each issue

In [18]:
verbose = 0
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




#### Creating the queries to test

In [19]:
# Read and create the test queries duplicate
experiment.create_queries(PATH_TEST)

Creating the queries...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Rank@25

In [20]:
recall, exported_rank = experiment.evaluate_validation_test(retrieval, verbose, model, issues_by_buckets, method=METHOD)
"recall@25 last epoch:", recall

('recall@25 last epoch:', 0.28)

### Retrieval evaluation

In [21]:
print("Total of queries:", len(retrieval.test))

Total of queries: 7253


In [34]:
exported_rank[:20]

['196609:241619|46704:0.6103690266609192,161628:0.5948980748653412,124182:0.5945041477680206,170385:0.5719431936740875,9235:0.5715988576412201,241619:0.5710369944572449,293551:0.5665361881256104,116688:0.5571001768112183,102526:0.5542369484901428,59371:0.5539950430393219,22487:0.5517208874225616,46315:0.5475929081439972,105738:0.5464617311954498,63185:0.5436904430389404,230056:0.54119473695755,40258:0.5400201678276062,129066:0.5385123789310455,364025:0.5381790399551392,13094:0.5356847643852234,70722:0.5355421006679535,47194:0.532665491104126,4134:0.532654345035553,196383:0.532253086566925,45399:0.5319032669067383,296807:0.5296939015388489',
 '35946:31941|63524:0.6995856463909149,77928:0.6925112307071686,43027:0.6884232759475708,47549:0.6710789799690247,35826:0.6434138715267181,60626:0.5491378605365753,51645:0.5280846655368805,78295:0.5104531645774841,213487:0.3974123001098633,314853:0.37035542726516724,378180:0.35246336460113525,152527:0.3522219657897949,259045:0.3493598699569702,37274

#### Exporting the result

In [29]:
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))

In [32]:
with open(EXPORT_RANK_PATH, 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [33]:
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.22,
 '2 - recall_at_10': 0.24,
 '3 - recall_at_15': 0.26,
 '4 - recall_at_20': 0.27,
 '5 - recall_at_25': 0.28}