In [1]:
import keras
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path
# (presumably so the project-local `methods.*` imports below resolve - confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
# %matplotlib inline

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Model

from keras.models import Model
from keras.layers import Input
from keras.models import load_model
import keras.backend as K
from keras_bert import get_custom_objects

import keras.backend as K
from keras_bert import load_vocabulary
from keras.layers import concatenate, Dropout, Add, Flatten
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import _pickle as pickle
from sklearn.metrics import silhouette_score

Using TensorFlow backend.


In [2]:
# Run parameters, exported as environment variables; later cells read them back via os.environ.
%env epochs 1000
%env base eclipse
%env method deepQL_no_trainable

env: epochs=1000
env: base=eclipse
env: method=deepQL_no_trainable


In [3]:
# NOTE(review): EMBEDDING_DIM and MAX_NB_WORDS are not referenced by the other cells
# visible here; presumably carried over from the training notebook - confirm.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000
# Epoch count comes from the `epochs` environment variable; a missing variable raises
# KeyError and a non-integer value raises ValueError.
epochs = int(os.environ['epochs'])

In [4]:
# --- Experiment identity (from the environment variables set with %env) ---
DOMAIN = os.environ['base']
METHOD = '{}_{}'.format(os.environ['method'], epochs)

# --- Tokenization / preprocessing flavour used to locate the processed data ---
TOKEN = 'bert'
PREPROCESSING = 'bert'

# --- Input locations ---
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))
TEST_PATH = os.path.join(DIR, 'test_classification.txt')

# --- Output name and the checkpoint to load ---
# METHOD already ends with the epoch count, so the number appears twice in the file name.
SAVE_PATH = '{}_classification({})'.format(METHOD, DOMAIN)
PRETRAINED_MODEL = 'modelos/model_{}_preprocessing_{}_feature_{}epochs_64batch({}).h5'.format(
    PREPROCESSING, METHOD, epochs, DOMAIN)

In [5]:
# Banner identifying the run: method, number of epochs and dataset.
print("*********",
      "{} for {} epochs in {}".format(METHOD, epochs, DOMAIN),
      "*********",
      sep="\n")

*********
deepQL_no_trainable_1000 for 1000 epochs in eclipse
*********


In [6]:
# Show the resolved checkpoint path before it is loaded.
print(PRETRAINED_MODEL)

modelos/model_bert_preprocessing_deepQL_no_trainable_1000_feature_1000epochs_64batch(eclipse).h5


In [7]:
# Reset the Keras backend session before loading the model.
K.clear_session()

# Trained models are expected in the `modelos` directory.
# NOTE(review): the note originally written here (translated from Portuguese) described
# files named
#     model_baseline_{X}epoch_{Y}steps_({dataset}).h5
#     model_baseline_{X}epoch_{Y}steps_({dataset}).json
# where
#     {X}       - total number of epochs trained, e.g. 100, 1000
#     {Y}       - total number of steps validated during training, e.g. 10, 16
#     {dataset} - name of the dataset tested, e.g. Eclipse, Netbeans, OpenOffice
# The file actually loaded below is PRETRAINED_MODEL, whose name is built in the
# configuration cell and follows a different pattern.
# get_custom_objects() supplies the keras_bert custom objects to load_model.
similarity_model = load_model(PRETRAINED_MODEL, custom_objects=get_custom_objects())






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.











In [12]:
# Recover the sequence lengths from the loaded model's input tensors (axis 1 of each
# input shape); T and D are passed to Baseline further down.
# NOTE(review): T/D/I presumably stand for title / description / info features - confirm.
if 'deepQL' in METHOD:
    MAX_SEQUENCE_LENGTH_T = K.int_shape(similarity_model.input[0])[1]
    MAX_SEQUENCE_LENGTH_D = K.int_shape(similarity_model.input[1])[1]
    MAX_SEQUENCE_LENGTH_I = K.int_shape(similarity_model.input[4])[1]
    print(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, MAX_SEQUENCE_LENGTH_I)
elif 'DWEN' in METHOD:
    MAX_SEQUENCE_LENGTH_T = K.int_shape(similarity_model.input[0])[1]
    MAX_SEQUENCE_LENGTH_D = K.int_shape(similarity_model.input[1])[1]
    print(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
else:
    # Without this branch the lengths stay undefined and the notebook only fails later,
    # with a NameError, when Baseline(...) is constructed.
    raise ValueError(
        "Cannot infer the input sequence lengths for METHOD={!r}: "
        "expected it to contain 'deepQL' or 'DWEN'".format(METHOD))

20 20 1682


In [13]:
# Directory holding the pretrained BERT files, and the three files inside it:
# the config, the checkpoint and the vocabulary.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [14]:
# Load the BERT vocabulary file; the result is indexed by token string below
# (e.g. token_dict['[CLS]']).
token_dict = load_vocabulary(vocab_path)

In [15]:
# Wire up the project helpers. Baseline receives the dataset name, the data locations,
# the two sequence lengths read from the model (_T, _D) and the vocabulary entries
# for the '[CLS]' and '[SEP]' tokens.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval  = Retrieval()
# The experiment object drives the loading/vectorization steps in the cells below.
experiment = Experiment(baseline, evaluation)

In [16]:
# Register the retrieval helper with the experiment for this dataset.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [17]:
# Load the bug ids, then print how many `baseline.bug_ids` holds
# (presumably filled by load_ids - confirm).
experiment.load_ids()
print(len(baseline.bug_ids))

Reading bug ids
361006


In [18]:
%%time
# Load the bug reports; TOKEN ('bert') is passed through to the project helper.
experiment.load_bugs(TOKEN)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 35.8 s, sys: 4.91 s, total: 40.7 s
Wall time: 39.6 s


In [19]:
%%time
# Build the bug -> bucket lookup (indexed by bug id further down to obtain cluster labels),
# then prepare the dataset from the chronological train/test files.
issues_by_buckets = experiment.get_buckets_for_bugs()
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))


CPU times: user 1.69 s, sys: 31.2 ms, total: 1.72 s
Wall time: 1.7 s


In [27]:
# Read and create the duplicate test queries (`retrieval.test` is consumed by the
# vectorization cell below; presumably populated here - confirm).
# NOTE(review): this cell's execution count (27) is higher than that of the cells that
# follow it (20, 21), i.e. it was run out of order; verify with Restart & Run All.
retrieval.create_queries()

In [20]:
%%time
# Collect the ids derived from the training data and report how many there are.
bug_train_ids = experiment.get_train_ids(baseline.train_data)
print("Train", len(bug_train_ids))

Train 1546362
CPU times: user 141 ms, sys: 3.6 ms, total: 145 ms
Wall time: 144 ms


In [21]:
# Collect the ids derived from the test data and report how many there are.
bug_test_ids = experiment.get_test_ids(baseline.test_data)
print("Test", len(bug_test_ids))

Test 33990


In [30]:
# Choose the vectorizer name matching the model family encoded in METHOD;
# anything that is neither deepQL nor DWEN falls back to 'keras'.
if 'deepQL' in METHOD:
    vectorizer = 'bert'
elif 'DWEN' in METHOD:
    vectorizer = 'dwen'
else:
    vectorizer = 'keras'

# Inputs for the vectorization step.
test = retrieval.test
bug_set = baseline.get_bug_set()
verbose = 1

# Vectorize the test queries with the loaded model; each resulting entry is later
# read through its 'vector' and 'bug_id' keys.
test_vectorized = experiment.vectorizer_test(
    bug_set, similarity_model, test, issues_by_buckets,
    vectorizer, verbose, only_buckets=False)
print("Test vectorized", len(test_vectorized))

HBox(children=(IntProgress(value=0, max=16995), HTML(value='')))


Test vectorized 30481


In [33]:
# Split the vectorized test entries into two parallel lists: the embedding of each
# bug and the bucket it belongs to (used as the cluster label).
embed = [entry['vector'] for entry in test_vectorized]
cluster_labels = [
    issues_by_buckets[entry['bug_id']]
    for entry in test_vectorized
]
print("Total cluster labels", len(cluster_labels))

Total cluster labels 30481


### Silhouette score

In [34]:
%%time

# Mean silhouette coefficient of the test embeddings, using each bug's bucket as its cluster label.
# NOTE(review): no `metric` is passed, so scikit-learn's default distance applies - confirm it
# matches the similarity the model was trained with.
silhouette_avg = silhouette_score(embed, cluster_labels)
print("The average silhouette_score is :", silhouette_avg)

The average silhouette_score is : 0.0062527996
