In [1]:
import keras
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
# %matplotlib inline

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Model

from keras.models import Model
from keras.layers import Input
from keras.models import load_model
import keras.backend as K
from keras_bert import get_custom_objects

import keras.backend as K
from keras_bert import load_vocabulary
from keras.layers import concatenate, Dropout, Add, Flatten
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import _pickle as pickle
from sklearn.metrics import silhouette_score

Using TensorFlow backend.


In [2]:
%env epochs 1000
%env base eclipse
%env method deepQL_topics

env: epochs=1000
env: base=eclipse
env: method=deepQL_topics


In [3]:
# Embedding/vocabulary constants. Neither name is referenced by the other cells
# visible in this notebook.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000
# Epoch count of the trained model, read from the `epochs` environment variable
# (set with %env in an earlier cell). A missing variable raises KeyError and a
# non-integer value raises ValueError.
epochs = int(os.environ['epochs'])

In [4]:
# --- Run identity, driven by the `base` and `method` environment variables ---
DOMAIN = os.environ['base']
METHOD = '{}_{}'.format(os.environ['method'], epochs)

# Tokenisation and preprocessing flavour share the same value.
TOKEN = PREPROCESSING = 'bert'

# --- Data locations (relative to the notebook's working directory) ---
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))
TEST_PATH = os.path.join(DIR, 'test_classification.txt')

# --- Artefact names ---
SAVE_PATH = '{}_classification({})'.format(METHOD, DOMAIN)
# METHOD already ends with the epoch count, so the count appears twice in the
# checkpoint file name; this mirrors how the files are named on disk.
PRETRAINED_MODEL = (
    'modelos/model_{}_preprocessing_{}_feature_{}epochs_64batch({}).h5'
    .format(PREPROCESSING, METHOD, epochs, DOMAIN)
)

In [5]:
# Announce which method / epoch count / dataset this run evaluates.
banner = "*********"
print(banner, "{} for {} epochs in {}".format(METHOD, epochs, DOMAIN), banner, sep="\n")

*********
deepQL_topics_1000 for 1000 epochs in eclipse
*********


In [6]:
# Show the resolved checkpoint path so a wrong file name is noticed before loading.
print(PRETRAINED_MODEL)

modelos/model_bert_preprocessing_deepQL_topics_1000_feature_1000epochs_64batch(eclipse).h5


In [7]:
# Start from a clean Keras backend session so re-running this cell does not
# accumulate state from a previously loaded model.
K.clear_session()

# Trained models are available in the `modelos/` directory.
#
# Naming scheme documented for the baseline models:
#   - model_baseline_{X}epoch_{Y}steps_({dataset}).h5
#   - model_baseline_{X}epoch_{Y}steps_({dataset}).json
# where
#   {X}       - total number of epochs trained, e.g. 100, 1000
#   {Y}       - total number of validation steps during training, e.g. 10, 16
#   {dataset} - name of the dataset, e.g. Eclipse, Netbeans, OpenOffice
#
# NOTE: the checkpoint loaded here does not follow that scheme; its name is
# whatever PRETRAINED_MODEL resolves to (printed in the previous cell).

# Fail early with a readable message instead of a low-level HDF5 error.
if not os.path.isfile(PRETRAINED_MODEL):
    raise FileNotFoundError(
        "Pretrained model not found: {!r} (working directory: {!r})".format(
            PRETRAINED_MODEL, os.getcwd()))

# keras-bert's custom layers must be registered for deserialisation to succeed.
similarity_model = load_model(PRETRAINED_MODEL, custom_objects=get_custom_objects())






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.











In [8]:
def _input_length(index):
    """Return the size of axis 1 of the loaded model's `index`-th input tensor."""
    return K.int_shape(similarity_model.input[index])[1]


# Stop here for an unsupported method: otherwise the MAX_SEQUENCE_LENGTH_* names
# would stay undefined (or keep stale values from an earlier run of this kernel).
if 'deepQL' not in METHOD and 'DWEN' not in METHOD:
    raise ValueError(
        "Unsupported METHOD {!r}: expected it to contain "
        "'deepQL_topics', 'deepQL' or 'DWEN'".format(METHOD))

# Inputs 0 and 1 are read for every supported method
# (presumably title and description -- TODO confirm against the model definition).
MAX_SEQUENCE_LENGTH_T = _input_length(0)
MAX_SEQUENCE_LENGTH_D = _input_length(1)
sequence_lengths = [MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D]

# 'deepQL' also matches 'deepQL_topics', so both deepQL variants read input 4.
if 'deepQL' in METHOD:
    MAX_SEQUENCE_LENGTH_I = _input_length(4)
    sequence_lengths.append(MAX_SEQUENCE_LENGTH_I)

# Only the topic-aware variant has the additional input at position 5.
if 'deepQL_topics' in METHOD:
    MAX_SEQUENCE_LENGTH_TOPICS = _input_length(5)
    sequence_lengths.append(MAX_SEQUENCE_LENGTH_TOPICS)

print(*sequence_lengths)

20 20 1682 30


In [9]:
# Directory of the pretrained BERT checkpoint, relative to the working directory.
pretrained_path = 'uncased_L-12_H-768_A-12'

# Config, weights and vocabulary all sit directly inside that directory.
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, filename)
    for filename in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [10]:
# Load the BERT vocabulary; the result is looked up by token string
# (e.g. '[CLS]', '[SEP]') when the Baseline helper is built.
token_dict = load_vocabulary(vocab_path)

In [11]:
# Vocabulary entries of BERT's special tokens, handed to the Baseline helper.
cls_token_id = token_dict['[CLS]']
sep_token_id = token_dict['[SEP]']

# Project helpers wired together for this evaluation run.
baseline = Baseline(
    DOMAIN, DIR, DATASET,
    MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D,
    cls_token_id, sep_token_id,
)
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [12]:
# Register the retrieval helper (with the baseline and dataset name) on the experiment.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [13]:
# Load the bug ids, then report how many ended up in baseline.bug_ids
# (presumably populated by load_ids -- verify in methods/experiments.py).
experiment.load_ids()
print(len(baseline.bug_ids))

Reading bug ids
361006


In [14]:
%%time
# Load the bug data for the configured TOKEN setting ('bert').
experiment.load_bugs(TOKEN)

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 1min 12s, sys: 5.58 s, total: 1min 18s
Wall time: 1min 30s


In [15]:
%%time
# issues_by_buckets is later indexed by bug id to obtain each bug's bucket,
# which serves as the cluster label for the silhouette score.
issues_by_buckets = experiment.get_buckets_for_bugs()
# Prepare the dataset from the chronological train/test files
# (presumably resolved relative to DIR -- TODO confirm in Experiment.prepare_dataset).
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

HBox(children=(IntProgress(value=0, max=361006), HTML(value='')))


CPU times: user 3.89 s, sys: 10.6 ms, total: 3.9 s
Wall time: 4.33 s


In [16]:
# Read and create the test queries duplicates
retrieval.create_queries()

In [17]:
%%time
# Ids returned by the experiment helper for the training data; only the count is shown.
bug_train_ids = experiment.get_train_ids(baseline.train_data)
print("Train", len(bug_train_ids))

Train 1546362
CPU times: user 282 ms, sys: 0 ns, total: 282 ms
Wall time: 299 ms


In [18]:
# Ids returned by the experiment helper for the test data; only the count is shown.
bug_test_ids = experiment.get_test_ids(baseline.test_data)
print("Test", len(bug_test_ids))

Test 33990


In [19]:
%%time

# Vectorizer name for each method family. Order matters: 'deepQL' is a substring
# of 'deepQL_topics', so the more specific key has to be tried first.
vectorizer = next(
    (name
     for key, name in (('deepQL_topics', 'bert-topic'),
                       ('deepQL', 'bert'),
                       ('DWEN', 'dwen'))
     if key in METHOD),
    'keras',  # fallback when METHOD matches none of the families above
)

# Inputs for the vectorisation pass over the test queries.
test = retrieval.test
bug_set = baseline.get_bug_set()
verbose = 1

test_vectorized = experiment.vectorizer_test(
    bug_set, similarity_model, test, issues_by_buckets,
    vectorizer, verbose, only_buckets=False,
)
print("Test vectorized", len(test_vectorized))

HBox(children=(IntProgress(value=0, max=16995), HTML(value='')))


Test vectorized 30481
CPU times: user 1h 7min 36s, sys: 28.8 s, total: 1h 8min 5s
Wall time: 18min 6s


In [20]:
# Keep a handle on the retrieval buckets; no later cell visible here reads it.
buckets = retrieval.buckets

### Assigning cluster labels to bugs

In [21]:
# Walk the vectorised test bugs once, collecting each embedding together with
# the bucket of its bug id (used as the cluster label).
embed = []
cluster_labels = []
for bug in tqdm(test_vectorized):
    embed.append(bug['vector'])
    cluster_labels.append(issues_by_buckets[bug['bug_id']])

print("Total cluster labels", len(cluster_labels))

HBox(children=(IntProgress(value=0, max=30481), HTML(value='')))


Total cluster labels 30481


### Silhouette score

In [None]:
%%time

# Mean silhouette coefficient over all embedded test bugs, with each bug's
# bucket as its cluster label. No metric is passed, so scikit-learn's default
# distance is used.
silhouette_avg = silhouette_score(embed, cluster_labels)
print("The average silhouette_score is :", silhouette_avg)

In [None]:
# File name (not a full path) of the pickled clustering summary; it is joined with DIR on use.
CLUSTER_PATH = 'cluster_{}.pkl'.format(METHOD)

def save_result(silhouette=None, labels=None, method=None, path=None):
    """Pickle a summary of the clustering evaluation and return it.

    Every argument is optional. An omitted argument falls back to the notebook
    global that the zero-argument version read, so a plain `save_result()`
    call behaves as before.

    Args:
        silhouette: Silhouette score to store. Defaults to the global
            `silhouette_avg`.
        labels: Iterable of hashable cluster labels; only the number of
            distinct values is stored. Defaults to the global `cluster_labels`.
        method: Method name to store. Defaults to the global `METHOD`.
        path: Destination file. Defaults to `os.path.join(DIR, CLUSTER_PATH)`.

    Returns:
        The dict that was written, with keys 'silhouette', 'clusters' and
        'method'.

    Raises:
        NameError: If an argument is omitted and its fallback global has not
            been defined yet (the corresponding cell was not run).
        OSError: If `path` cannot be opened for writing.
    """
    # Resolve fallbacks lazily so explicit arguments never touch notebook state.
    if silhouette is None:
        silhouette = silhouette_avg
    if labels is None:
        labels = cluster_labels
    if method is None:
        method = METHOD
    if path is None:
        path = os.path.join(DIR, CLUSTER_PATH)

    result = {
        'silhouette' : silhouette,
        'clusters' : len(set(labels)),
        'method' : method
    }

    with open(path, 'wb') as f:
        pickle.dump(result, f)

    return result
        
# Echo the file name so the output location is visible in the notebook.
print(CLUSTER_PATH)

In [None]:
# Write the summary pickle to os.path.join(DIR, CLUSTER_PATH).
save_result()
    
print("All saved.")

In [None]:
# Read the summary back as a sanity check. The file was written by this
# notebook; never point pickle.load at a file from an untrusted source.
with open(os.path.join(DIR, CLUSTER_PATH), 'rb') as f:
    print(pickle.load(f))