## Export classification test

In [1]:
import keras

Using TensorFlow backend.


In [2]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on the import path
# (presumably so the project-local `methods` package imported below resolves).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [3]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

In [4]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Model

## Global settings

In [5]:
# Model/vocabulary sizes (not referenced again in the visible part of this notebook).
EMBEDDING_DIM, MAX_NB_WORDS = 300, 2000
# Maximum sequence lengths handed to Baseline (presumably T = title, D = description).
MAX_SEQUENCE_LENGTH_T = MAX_SEQUENCE_LENGTH_D = 20
# Only used here to build the METHOD and PRETRAINED_MODEL names.
epochs = 1000

In [6]:
# --- Experiment identity -------------------------------------------------
DOMAIN = 'openoffice'
TOKEN = 'bert'
PREPROCESSING = 'bert'
METHOD = 'deepQL_weights_{}'.format(epochs)

# --- Input locations (relative to the notebook's working directory) -------
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

# --- Output / model artefact names ----------------------------------------
SAVE_PATH = '{}_classification({})'.format(METHOD, DOMAIN)
# METHOD already embeds the epoch count, so it appears twice in this file name.
PRETRAINED_MODEL = 'modelos/model_{}_preprocessing_{}_feature_{}epochs_64batch({}).h5'.format(
    PREPROCESSING, METHOD, str(epochs), DOMAIN)
# File written by the last cell: one "<bug_a> <bug_b> <label>" line per pair.
TEST_PATH = os.path.join(DIR, 'test_classification.txt')

In [7]:
PRETRAINED_MODEL

'modelos/model_bert_preprocessing_deepQL_weights_1000_feature_1000epochs_64batch(openoffice).h5'

In [8]:
import os

# Directory holding the pretrained BERT checkpoint and the three files read from it.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [9]:
from keras_bert import load_vocabulary

# Vocabulary loaded from the BERT vocab file; it is indexed by token string below
# to obtain the '[CLS]' and '[SEP]' entries (presumably integer ids - confirm).
token_dict = load_vocabulary(vocab_path)

In [10]:
# Wire up the experiment objects. Baseline receives the domain, the data locations,
# the two maximum sequence lengths and the vocabulary entries for '[CLS]' and '[SEP]'.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# The Experiment ties the baseline and the evaluation helper together; every
# loading/preparation step below goes through it.
experiment = Experiment(baseline, evaluation)

In [11]:
baseline.info_dict

{'bug_severity': 6,
 'bug_status': 3,
 'component': 144,
 'priority': 5,
 'product': 41,
 'version': 539}

In [12]:
# Attach the Retrieval instance (together with the baseline and domain) to the experiment.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

#### Loading bug ids in memory

In [13]:
# Read the bug ids (presumably populating baseline.bug_ids - confirm) and show how
# many were loaded; the recorded run reports 98070.
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


98070

#### Loading train bugs

In [14]:
%%time

# Load the bugs using the TOKEN ('bert') setting; ~44 s wall time in the recorded run.
experiment.load_bugs(TOKEN)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 17.7 s, sys: 5.51 s, total: 23.2 s
Wall time: 43.9 s


In [15]:
%%time
# Compute the bucket assignment for the loaded bugs; `issues_by_buckets` is also read
# as a global by batch_classification further down.
issues_by_buckets = experiment.get_buckets_for_bugs()
# Prepare the dataset from the chronological train/test inputs
# (presumably file stems under DIR - TODO confirm against Experiment.prepare_dataset).
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))


CPU times: user 2.02 s, sys: 40 ms, total: 2.06 s
Wall time: 2.14 s


In [16]:
%%time

# Ids taken from the training data. NOTE(review): the recorded run yields 2,452,118
# entries versus 98,070 bugs loaded, so this list presumably contains repeats - confirm
# whether downstream consumers expect unique ids.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

CPU times: user 169 ms, sys: 3.83 ms, total: 173 ms
Wall time: 172 ms


In [17]:
print(len(bug_train_ids))

2452118


In [18]:
%%time

# Read the classification test pairs and their duplicate sets from the chronological
# test input and store them on the baseline. The training ids are passed along as well
# (presumably to filter or relate test bugs to training ones - confirm in Experiment).
baseline.test_data, baseline.dup_sets_test = experiment.read_test_data_classification(DIR, baseline.bug_set, 
                                                                                      bug_train_ids,
                                                                                     path='test_chronological')

Reading test data for classification
CPU times: user 110 ms, sys: 3.89 ms, total: 114 ms
Wall time: 130 ms


In [19]:
# Ids taken from the test pairs. In the recorded output this is the pairs flattened in
# order (ids repeat), and its length (81222) is later used as the batch size.
bug_test_ids = experiment.get_test_ids(baseline.test_data)
len(bug_test_ids)

81222

In [20]:
baseline.test_data[:10]

[[16384, 16385],
 [16384, 15363],
 [16384, 16387],
 [16384, 16388],
 [16384, 14054],
 [16384, 16391],
 [16384, 16392],
 [16384, 16393],
 [16384, 16394],
 [16384, 16395]]

In [21]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

def batch_classification(train, dup_sets, bug_ids, batch_size, n_neg):
    """Build one labelled batch of bug pairs for duplicate / non-duplicate classification.

    Triplets (anchor, positive, negative) are drawn through
    ``experiment.batch_iterator_bert`` and unrolled into pairs: each anchor is
    paired once with its positive and once with its negative. Rows are ordered
    with all positive pairs first, followed by all negative pairs.

    Relies on the notebook globals ``experiment`` and ``issues_by_buckets``.

    Parameters
    ----------
    train, dup_sets, bug_ids, n_neg
        Forwarded unchanged to ``experiment.batch_iterator_bert``; their exact
        semantics are defined there.
    batch_size : int
        Requested number of output pairs. ``batch_size // 2`` triplets are
        requested from the iterator, so an odd value is rounded down.

    Returns
    -------
    tuple
        ``(batch_triplet, title_a, title_a_seg, title_b, title_b_seg, desc_a,
        desc_a_seg, desc_b, desc_b_seg, info_a, info_b, sim)``. The ``*_a``
        arrays hold the anchor features repeated twice, the ``*_b`` arrays hold
        the positive features followed by the negative features, and ``sim`` is
        the one-hot label matrix with two columns (column 1 set for positive
        pairs, column 0 set for negative pairs).
    """
    # Every triplet produces two output rows, hence half the requested size.
    batch_size_normalized = batch_size // 2
    # The similarity values returned by the iterator are not used: labels are
    # rebuilt below to match the positive-then-negative row layout.
    batch_triplet, input_sample, input_pos, input_neg, _ = experiment.batch_iterator_bert(
        None, train, dup_sets, bug_ids, batch_size_normalized, n_neg, issues_by_buckets)

    # Size the labels from the rows that were actually returned instead of the
    # requested size, so they stay aligned with the ``*_b`` arrays even if the
    # iterator yields fewer samples than asked for.
    n_pos_rows = len(input_pos['title']['token'])
    n_neg_rows = len(input_neg['title']['token'])
    labels = np.concatenate([np.ones(n_pos_rows, dtype=int),
                             np.zeros(n_neg_rows, dtype=int)])
    # Labels are already 0/1, so no label encoding is needed; pinning the class
    # count keeps the output two columns wide regardless of the batch content.
    sim = to_categorical(labels, num_classes=2)

    def _anchor_twice(values):
        # Anchor features are repeated: once for the positive half, once for the negative half.
        return np.concatenate([values, values])

    def _pos_then_neg(pos_values, neg_values):
        # Candidate features: positives first, negatives second (same order as the labels).
        return np.concatenate([pos_values, neg_values])

    title_a = _anchor_twice(input_sample['title']['token'])
    title_a_seg = _anchor_twice(input_sample['title']['segment'])
    title_b = _pos_then_neg(input_pos['title']['token'], input_neg['title']['token'])
    title_b_seg = _pos_then_neg(input_pos['title']['segment'], input_neg['title']['segment'])
    desc_a = _anchor_twice(input_sample['description']['token'])
    desc_a_seg = _anchor_twice(input_sample['description']['segment'])
    desc_b = _pos_then_neg(input_pos['description']['token'], input_neg['description']['token'])
    desc_b_seg = _pos_then_neg(input_pos['description']['segment'], input_neg['description']['segment'])
    info_a = _anchor_twice(input_sample['info'])
    info_b = _pos_then_neg(input_pos['info'], input_neg['info'])

    return batch_triplet, title_a, title_a_seg, title_b, title_b_seg, desc_a, desc_a_seg, desc_b, desc_b_seg, info_a, info_b, sim

In [22]:
bug_test_ids[:10]

[16384, 16385, 16384, 15363, 16384, 16387, 16384, 16388, 16384, 14054]

In [23]:
len(bug_test_ids)

81222

In [24]:
# Build the whole classification test set as a single batch.
# NOTE(review): the batch size is len(bug_test_ids), i.e. the flattened id count
# (81222 in the recorded run), and the id pool passed as `bug_ids` is the *training*
# id list - confirm both are intended for test-set generation.
(batch_triplet,
 title_a, title_a_seg, title_b, title_b_seg,
 desc_a, desc_a_seg, desc_b, desc_b_seg,
 info_a, info_b, sim) = batch_classification(
    baseline.test_data,
    baseline.dup_sets_test,
    bug_train_ids,
    len(bug_test_ids),
    1,
)

In [25]:
# Unroll each (anchor, positive, negative) triplet into two labelled pairs:
# all positive pairs (label 1) first, then all negative pairs (label 0).
bug_test_classification = (
    [[anchor_id, positive_id, 1] for anchor_id, positive_id, _ in batch_triplet]
    + [[anchor_id, negative_id, 0] for anchor_id, _, negative_id in batch_triplet]
)

len(bug_test_classification)

81222

In [26]:
# Persist the labelled pairs, one "<bug_a> <bug_b> <label>" line per pair.
with open(TEST_PATH, 'w') as out_file:
    out_file.writelines(
        "{} {} {}\n".format(bug_a, bug_b, is_duplicate)
        for bug_a, bug_b, is_duplicate in bug_test_classification
    )
TEST_PATH

'data/processed/openoffice/bert/test_classification.txt'