In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
# Put the parent of the current working directory on the import path (only
# once), so that packages located one level above the notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
# --- Input sizes ---
# Both sequence limits are passed to Baseline(...) further down.
MAX_SEQUENCE_LENGTH_T = 20  # author's note: 20
MAX_SEQUENCE_LENGTH_D = 20  # author's note: 80
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

# --- Configuration ---
epochs = 1000   # embedded in METHOD and in the saved-model file name
best_loss = 1   # not read by any cell visible in this notebook
best_epoch = 0  # not read by any cell visible in this notebook
verbose = 0     # not read by any cell visible in this notebook
loss = 1        # not read by any cell visible in this notebook

In [4]:
# Dataset domain selected for this run.
DOMAIN = 'firefox'

# Method name = prefix + number of epochs. Prefixes listed by the author:
#     propose_centroid_bert_
#     propose_bert_triplet_
#     propose_bert_
#     baseline_dwen_
#     baseline_
METHOD = 'baseline_{}'.format(epochs)
EMBED_METHOD = 'keras'
ONLY_BUCKETS = False

# Dataset paths (relative to the working directory).
DIR = 'data/processed/{}'.format(DOMAIN)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Embeddings directory.
EMBED_DIR = 'data/embed'

# Saved-model name templates. The literal '@number_of_epochs@' marker is
# substituted with the epoch count when the model file name is built.
SAVE_PATH = '{}_feature@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)
SAVE_PATH_FEATURE = '{}_feature_@number_of_epochs@epochs_64batch({})'.format(METHOD, DOMAIN)

# Whether to extract the corpus.
EXTRACT_CORPUS = False

In [5]:
import os

# Directory of the pre-trained BERT files and the three paths derived from it.
# Only vocab_path is consumed by the cells visible in this notebook.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [6]:
from keras_bert import load_vocabulary

# Load the BERT vocabulary file into token_dict; the '[CLS]' and '[SEP]'
# entries are looked up from it when the Baseline object is created.
token_dict = load_vocabulary(vocab_path)

In [7]:
# Wire up the experiment objects. Baseline receives the domain, the dataset
# locations, the two sequence-length limits and the vocabulary entries for the
# '[CLS]' and '[SEP]' tokens.
baseline = Baseline(
    DOMAIN,
    DIR,
    DATASET,
    MAX_SEQUENCE_LENGTH_T,
    MAX_SEQUENCE_LENGTH_D,
    token_dict['[CLS]'],
    token_dict['[SEP]'],
)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [8]:
# Attach the retrieval helper to the experiment for this domain. The recorded
# output shows this step creating the buckets (two progress bars).
experiment.set_retrieval(retrieval, baseline, DOMAIN)

Creating the buckets...


HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=35814), HTML(value='')))




In [9]:
# Read the bug ids (prints "Reading bug ids"), then display how many ids the
# baseline holds — presumably load_ids() is what fills baseline.bug_ids; confirm.
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


115814

In [10]:
%%time

# Load the bugs, then display the size of baseline.sentence_dict
# (presumably populated by load_bugs(); confirm). Timed by the cell's %%time.
experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=115814), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 9.96 s, sys: 955 ms, total: 10.9 s
Wall time: 10.8 s


In [11]:
# Bucket lookup for the bugs; the result is handed to prepare_dataset() and
# evaluate_validation_test() in later cells.
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=89061), HTML(value='')))




In [12]:
%%time

# Prepare the dataset from the 'train_chronological' / 'test_chronological'
# inputs (presumably the names of the chronological split files; confirm).
experiment.prepare_dataset(
    issues_by_buckets,
    path_train='train_chronological',
    path_test='test_chronological',
)
# Read the test duplicates and create the retrieval queries from them.
retrieval.create_queries()

Reading train data
Reading bug ids
CPU times: user 3min 24s, sys: 9.51 ms, total: 3min 24s
Wall time: 3min 24s


In [13]:
# Ids derived from the baseline's training data; passed to
# evaluate_validation_test() in a later cell.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [14]:
from keras.models import load_model
from keras_bert import get_custom_objects

# Path of the trained model: the '@number_of_epochs@' marker inside
# SAVE_PATH_FEATURE is replaced by the configured epoch count, and the file is
# looked up in the "modelos" directory.
file_model = os.path.join(
    "modelos",
    "model_{}.h5".format(SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs))),
)

# keras_bert's custom objects are supplied so Keras can deserialize the model.
model = load_model(file_model, custom_objects=get_custom_objects())



In [15]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 94)           0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          28500       info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [16]:
# Bug set exposed by the baseline object that retrieval holds
# (presumably attached by experiment.set_retrieval(); confirm).
bug_set = retrieval.baseline.get_bug_set()

In [17]:
# Sanity check: size of the bug set (in the recorded run it equals the number
# of bug ids loaded earlier).
len(bug_set)

115814

In [18]:
# Number of entries in retrieval.test — presumably the test queries built by
# retrieval.create_queries(); confirm.
len(retrieval.test)

4719

In [19]:
# Run the retrieval evaluation with the loaded model. exported_rank is written
# to disk and scored in the following cells.
recall, exported_rank, debug = experiment.evaluate_validation_test(
    retrieval,
    0,  # NOTE(review): meaning of this positional argument is not visible here — confirm
    model,
    issues_by_buckets,
    bug_train_ids,
    method=EMBED_METHOD,
    only_buckets=ONLY_BUCKETS,
)

In [20]:
# Number of ranking rows returned by the evaluation.
len(exported_rank)

7524

In [21]:
# Peek at the first 20 rows. Judging by the recorded output, each row is a
# string shaped like '<id>:<id>,<id>,...|<id>:<score>,<id>:<score>,...'
# (presumably query:duplicates|ranked candidates; confirm).
exported_rank[:20]

['688129:684856|680668:0.9758388847112656,699688:0.9381522350013256,681368:0.6630667746067047,712276:0.6580033600330353,334410:0.645418256521225,690254:0.645319014787674,334409:0.6385816633701324,705164:0.6249485313892365,867906:0.6219176054000854,676225:0.6140698194503784,674560:0.6133743822574615,789694:0.6116399765014648,670579:0.6112821698188782,483895:0.6032395958900452,233992:0.6029252409934998,706000:0.5998854041099548,325294:0.596563309431076,173569:0.5927304923534393,777782:0.5897091031074524,669629:0.5874333381652832,673150:0.5816705524921417,673151:0.5816705524921417,735971:0.5813469886779785,464486:0.5796188712120056,685492:0.5761077105998993,690312:0.5759598910808563,318817:0.5755541920661926,399866:0.5744841694831848,574688:0.5740429759025574',
 '557059:358113,446179,653508,829925,320522,429548,782031,326416,347928,888442|519162:0.9562445022165775,488156:0.9550047554075718,497152:0.9546291269361973,540536:0.9540099464356899,500535:0.9536978788673878,536655:0.9499521851539

In [22]:
# File the ranking rows are exported to: inside DIR, named after METHOD.
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
# Bare expression so the notebook displays the resulting path.
EXPORT_RANK_PATH

'data/processed/firefox/exported_rank_baseline_1000.txt'

In [23]:
# Persist the ranking rows, one per line; the file is overwritten if it exists.
with open(EXPORT_RANK_PATH, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [24]:
# Score the exported ranking file. In the recorded run the report is a dict of
# recall@k values for k = 5, 10, 15, 20 and 25.
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
# Bare expression so the notebook displays the report.
report

{'1 - recall_at_5': 0.23,
 '2 - recall_at_10': 0.28,
 '3 - recall_at_15': 0.31,
 '4 - recall_at_20': 0.34,
 '5 - recall_at_25': 0.36}