In [1]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from annoy import AnnoyIndex
# Put the parent of the current working directory on the import path
# (presumably where the project's `methods` package lives — confirm layout).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

Using TensorFlow backend.


In [3]:
# Input size limits. The trailing comments are the author's notes of other
# values (presumably earlier/alternative settings — confirm before changing).
MAX_SEQUENCE_LENGTH_T = 20  # 20
MAX_SEQUENCE_LENGTH_D = 20  # 80
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000

# Configuration
epochs = 100
best_loss = loss = 1
best_epoch = verbose = 0

In [4]:
# Domain to use
DOMAIN = 'eclipse'

# Method-name prefixes the author lists as options for METHOD:
#     propose_centroid_bert_
#     propose_bert_triplet_
#     propose_bert_
#     baseline_dwen_
#     baseline_
METHOD = 'baseline_{}'.format(epochs)
PREPROCESSING = 'baseline'
EMBED_METHOD = 'keras'
ONLY_BUCKETS = False

# Dataset paths
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# Path embeddings
EMBED_DIR = 'data/embed'

# Saved-model name templates. '@number_of_epochs@' is a literal placeholder
# that is replaced with the epoch count when the model file name is built.
# NOTE(review): the two templates differ only by the underscore after
# "feature" — confirm that is intentional.
_save_fields = (PREPROCESSING, METHOD, DOMAIN)
SAVE_PATH = '{}_preprocessing_{}_feature@number_of_epochs@epochs_64batch({})'.format(*_save_fields)
SAVE_PATH_FEATURE = '{}_preprocessing_{}_feature_@number_of_epochs@epochs_64batch({})'.format(*_save_fields)
del _save_fields

# Extract corpus
EXTRACT_CORPUS = False

In [5]:
import os

# Directory of the pre-trained checkpoint and the three files read from it.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [6]:
from keras_bert import load_vocabulary

# Load the vocabulary file; the '[CLS]' and '[SEP]' entries of the result
# are passed to Baseline further down.
token_dict = load_vocabulary(vocab_path)

In [7]:
# Wire up the experiment objects. Baseline receives the domain, the processed
# data directory, the dataset CSV path, the two MAX_SEQUENCE_LENGTH_* limits
# and the vocabulary entries for '[CLS]' and '[SEP]'.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                   token_dict['[CLS]'], token_dict['[SEP]'])
# NOTE(review): verbose is hard-coded to 0 here rather than taken from the
# `verbose` config value — confirm whether they are meant to be linked.
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# The experiment shares the baseline and evaluation instances created above.
experiment = Experiment(baseline, evaluation)

In [8]:
# Register the retrieval object with the experiment, together with the
# baseline and the domain name.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [9]:
# Load the bug ids, then display the size of baseline.bug_ids.
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


361006

In [10]:
%%time

# Load the bugs, then display the size of baseline.sentence_dict.
experiment.load_bugs()
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 28 s, sys: 2.5 s, total: 30.5 s
Wall time: 30.5 s


In [11]:
# Bucket lookup for the bugs; reused below by prepare_dataset and by the
# evaluation call.
issues_by_buckets = experiment.get_buckets_for_bugs()

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




In [12]:
%%time

# Prepare the dataset from the buckets using the 'train_chronological' and
# 'test_chronological' paths.
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')
# Read and create the test queries duplicates
retrieval.create_queries()

CPU times: user 1.5 s, sys: 7.57 ms, total: 1.5 s
Wall time: 1.5 s


In [13]:
# Ids derived from the baseline's training data; passed to the evaluation below.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

In [14]:
from keras.models import load_model
from keras_bert import get_custom_objects

# Build the saved-model file name by substituting the epoch count into the
# SAVE_PATH_FEATURE template, then locate it inside the "modelos" directory.
model_name = "model_{}.h5".format(SAVE_PATH_FEATURE.replace('@number_of_epochs@', str(epochs)))
file_model = os.path.join("modelos", model_name)

# keras_bert's custom objects are supplied so the saved model can be
# deserialized by Keras.
model = load_model(file_model, custom_objects=get_custom_objects())



In [15]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 300)          504900      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

In [16]:
# Fetch the bug set from the baseline attached to the retrieval object.
bug_set = retrieval.baseline.get_bug_set()

In [17]:
# Display the number of entries in the bug set.
len(bug_set)

361006

In [18]:
# Display the size of retrieval.test (presumably the test queries built by
# create_queries — confirm).
len(retrieval.test)

16945

In [19]:
%%time 

# Run the retrieval evaluation with the loaded model. The call yields three
# values: a recall figure, the exported ranking rows and a debug object.
# NOTE(review): the literal 0 is the second positional argument; its meaning
# is not visible here (possibly an epoch or verbosity value) — confirm.
recall, exported_rank, debug = experiment.evaluate_validation_test(retrieval, 0, model, issues_by_buckets, 
                                                                   bug_train_ids, method=EMBED_METHOD, 
                                                                   only_buckets=ONLY_BUCKETS)

CPU times: user 1min 16s, sys: 3.89 s, total: 1min 19s
Wall time: 56 s


In [20]:
# Display the number of exported ranking rows.
len(exported_rank)

27336

In [21]:
# Preview the first 20 rows. Judging by the output, each row is one string of
# the form 'a:b|id:score,id:score,...'.
exported_rank[:20]

['1:183|11858:0.9382210895419121,18947:0.9345714524388313,14024:0.9342795684933662,1557:0.9342563077807426,352470:0.9342006221413612,362534:0.934171512722969,350902:0.933889202773571,356122:0.9338624477386475,358624:0.933822825551033,14095:0.9334587082266808,368893:0.9334156587719917,66335:0.9332822784781456,5316:0.9331230595707893,351436:0.9329810068011284,81228:0.9323837533593178,85:0.9322439879179001,8879:0.9319668188691139,19808:0.9318229109048843,35515:0.9317464977502823,69842:0.9311506003141403,382459:0.930655412375927,8830:0.9304843470454216,24348:0.9303978681564331,11301:0.9301785156130791,16809:0.9297537356615067,14003:0.9292965605854988,61200:0.9292355179786682,6453:0.929199293255806,68162:0.9290460050106049,42031:0.9290097057819366',
 '262155:261466|261485:0.7406244277954102,108976:0.5259395837783813,51706:0.5229259431362152,84666:0.5201786160469055,48033:0.518547385931015,50476:0.5137682855129242,28144:0.5121950805187225,46846:0.5114652514457703,93161:0.5111066102981567,799

In [22]:
# Target text file for the ranking rows: placed in DIR and named after METHOD.
EXPORT_RANK_PATH = os.path.join(DIR, 'exported_rank_{}.txt'.format(METHOD))
EXPORT_RANK_PATH

'data/processed/eclipse/baseline/exported_rank_baseline_100.txt'

In [23]:
# Persist the ranking, one row per line (overwrites any existing file).
with open(EXPORT_RANK_PATH, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [24]:
# Evaluate the exported ranking file and display the resulting report
# (the output shows recall_at_5 through recall_at_25 entries).
report = experiment.evaluation.evaluate(EXPORT_RANK_PATH)
report

{'1 - recall_at_5': 0.09,
 '2 - recall_at_10': 0.1,
 '3 - recall_at_15': 0.11,
 '4 - recall_at_20': 0.11,
 '5 - recall_at_25': 0.12}