In [1]:
import keras
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
# Put the parent of the current working directory on sys.path (only once),
# presumably so the project-local `methods` package imported below resolves
# when the notebook runs from a sub-directory — verify against the repo layout.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
# %matplotlib inline

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Model

from keras.models import Model
from keras.layers import Input
from keras.models import load_model
import keras.backend as K
from keras_bert import get_custom_objects

import keras.backend as K
from keras_bert import load_vocabulary
from keras.layers import concatenate, Dropout, Add, Flatten
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import _pickle as pickle

Using TensorFlow backend.


In [2]:
# %env epochs 1000
# %env base openoffice
# %env method deepTL

env: epochs=1000
env: base=openoffice
env: method=deepTL


In [3]:
# Experiment configuration, driven by the `base`, `method` and `epochs`
# environment variables; a missing variable raises KeyError right here.
DOMAIN = os.environ['base']
# Method tag joined with the epoch count, e.g. 'deepTL_1000'.
METHOD = '{}_{}'.format(os.environ['method'], os.environ['epochs'])
TOKEN = 'bert'
PREPROCESSING = 'bert'
# Per-domain processed-data directory; results are also pickled here by save_result.
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DATASET = os.path.join('data/normalized/{}'.format(DOMAIN), '{}.csv'.format(DOMAIN))
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# Base name shared by the saved model file and the result pickle,
# e.g. 'deepTL_1000_classification(openoffice)'.
SAVE_PATH = '{}_classification({})'.format(METHOD, DOMAIN)
# NOTE(review): METHOD already ends with the epoch count, so the epochs show up
# twice in this file name; the constant is not referenced in the visible cells —
# confirm it is still needed and that the name matches the file on disk.
PRETRAINED_MODEL = 'modelos/model_{}_preprocessing_{}_feature_{}epochs_64batch({}).h5'.format(PREPROCESSING, METHOD, str(os.environ['epochs']), DOMAIN)
# Passed to experiment.batch_classification_test to load the evaluation pairs.
TEST_PATH = os.path.join(DIR, 'test_classification.txt')

In [None]:
# Banner stating which method / epoch count / dataset this run evaluates.
print(
    "*********",
    "{} for {} epochs in {}".format(METHOD, os.environ['epochs'], DOMAIN),
    "*********",
    sep="\n"
)

In [4]:
# Show the resolved base name used for the model file and the result pickle.
print(SAVE_PATH)

'deepTL_1000_classification(openoffice)'

In [5]:
# Load the trained classifier from modelos/model_<SAVE_PATH>.h5. The keras_bert
# custom objects are passed so Keras can rebuild the keras_bert layers stored in the file.
model = load_model(os.path.join('modelos', "model_" + SAVE_PATH + '.h5'), custom_objects=get_custom_objects())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.









Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where






In [6]:
# Print the layer table; the input layer names/shapes listed here are what the
# sequence-length lookup and the predict() call further down rely on.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_a (InputLayer)             (None, 738)          0                                            
__________________________________________________________________________________________________
title_a (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
title_a_token (InputLayer)      (None, 20)           0                                            
__________________________________________________________________________________________________
desc_a (InputLayer)             (None, 20)           0                                            
__________________________________________________________________________________________________
desc_a_tok

In [7]:
# File locations inside the pre-trained BERT directory (relative to the working
# directory). Only vocab_path is used in the visible cells; config_path and
# model_path are defined but not referenced here.
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [8]:
# Recover the padded sequence lengths from the loaded model's input tensors.
# Every method reads the title / description lengths from inputs 0 and 1.
MAX_SEQUENCE_LENGTH_T = K.int_shape(model.input[0])[1]
MAX_SEQUENCE_LENGTH_D = K.int_shape(model.input[1])[1]

if 'baseline_dwen' in METHOD:
    # Checked before 'baseline' because 'baseline' is a substring of this tag.
    print(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)
elif 'baseline' in METHOD:
    MAX_SEQUENCE_LENGTH_I = K.int_shape(model.input[2])[1]
    print(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, MAX_SEQUENCE_LENGTH_I)
else:
    # NOTE(review): the predict() call for these methods feeds ten inputs with
    # info_a / info_b last, and model.summary() reports info_a as (None, 738),
    # yet this cell printed "20 20 20 20". Indices 4 and 5 may therefore point
    # at description inputs rather than info/topic inputs — confirm against
    # model.inputs before relying on these two values.
    MAX_SEQUENCE_LENGTH_I = K.int_shape(model.input[4])[1]
    MAX_SEQUENCE_LENGTH_TOPICS = K.int_shape(model.input[5])[1]
    print(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, MAX_SEQUENCE_LENGTH_I, MAX_SEQUENCE_LENGTH_TOPICS)

20 20 20 20


In [9]:
# Load the BERT vocabulary from vocab.txt; it is indexed by token string below
# to look up the '[CLS]' and '[SEP]' entries.
token_dict = load_vocabulary(vocab_path)

In [10]:
# Wire up the project helpers. Baseline receives the domain, the data locations,
# the title/description lengths read from the model, and the vocabulary entries
# for '[CLS]' and '[SEP]'. Experiment ties the baseline and evaluation together.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, 
                    token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [11]:
# Attach the retrieval helper to the experiment for this domain.
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [12]:
# Load the bug ids (populates baseline.bug_ids) and report how many were read.
experiment.load_ids()
print(len(baseline.bug_ids))

Reading bug ids
98070


In [13]:
%%time

# Load the bug reports using the 'bert' tokenisation selected by TOKEN.
experiment.load_bugs(TOKEN)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 10.1 s, sys: 1.18 s, total: 11.3 s
Wall time: 10.8 s


In [14]:
%%time
# Group the bugs into buckets, then build the train/test data from the
# chronological split files. issues_by_buckets is also read as a global by
# batch_classification further down.
issues_by_buckets = experiment.get_buckets_for_bugs()
experiment.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))


CPU times: user 1.93 s, sys: 44 ms, total: 1.97 s
Wall time: 1.95 s


In [15]:
%%time

# Ids derived from the training data prepared above.
bug_train_ids = experiment.get_train_ids(baseline.train_data)

CPU times: user 155 ms, sys: 0 ns, total: 155 ms
Wall time: 154 ms


In [16]:
# Sanity check: number of training ids.
print(len(bug_train_ids))

2452118


In [17]:
# Ids derived from the test data, plus a size sanity check.
bug_test_ids = experiment.get_test_ids(baseline.test_data)
print(len(bug_test_ids))

16530


In [18]:
def batch_classification(train, dup_sets, bug_ids, batch_size, n_neg):
    """Build one balanced classification batch of (anchor, candidate) pairs.

    The first half of every returned array pairs each anchor with its positive
    example (label 1); the second half pairs the same anchors with a negative
    example (label 0).

    Args:
        train, dup_sets, bug_ids, n_neg: forwarded unchanged to
            ``experiment.batch_iterator_bert``.
        batch_size: total batch size; ``batch_size // 2`` anchors are requested.

    Returns:
        A 13-tuple ``(title_a, title_a_seg, title_b, title_b_seg, desc_a,
        desc_a_seg, desc_b, desc_b_seg, info_a, info_b, topic_a, topic_b, sim)``
        where ``sim`` holds the one-hot encoded labels.
    """
    half = batch_size // 2

    # Only the anchor / positive / negative inputs are used; the first and last
    # values returned by the iterator are discarded and the labels rebuilt below.
    _, anchor, positive, negative, _ = experiment.batch_iterator_bert(
        None, train, dup_sets, bug_ids, half, n_neg, issues_by_buckets)

    # `half` ones followed by `half` zeros, then one-hot encoded.
    # Assumes the iterator yields `half` rows per input so labels line up — TODO confirm.
    labels = np.concatenate([np.full(half, 1), np.full(half, 0)])
    sim = to_categorical(LabelEncoder().fit_transform(labels))

    def paired(first_half, second_half):
        # Stack the two halves of the batch along the first axis.
        return np.concatenate([first_half, second_half])

    # "a" side: the anchor repeated for both halves.
    # "b" side: positives for the first half, negatives for the second.
    title_a = paired(anchor['title']['token'], anchor['title']['token'])
    title_a_seg = paired(anchor['title']['segment'], anchor['title']['segment'])
    title_b = paired(positive['title']['token'], negative['title']['token'])
    title_b_seg = paired(positive['title']['segment'], negative['title']['segment'])
    desc_a = paired(anchor['description']['token'], anchor['description']['token'])
    desc_a_seg = paired(anchor['description']['segment'], anchor['description']['segment'])
    desc_b = paired(positive['description']['token'], negative['description']['token'])
    desc_b_seg = paired(positive['description']['segment'], negative['description']['segment'])
    info_a = paired(anchor['info'], anchor['info'])
    info_b = paired(positive['info'], negative['info'])
    topic_a = paired(anchor['topics'], anchor['topics'])
    topic_b = paired(positive['topics'], negative['topics'])

    return (title_a, title_a_seg, title_b, title_b_seg,
            desc_a, desc_a_seg, desc_b, desc_b_seg,
            info_a, info_b, topic_a, topic_b, sim)

def batch_classification_siamese(train, dup_sets, bug_ids, batch_size, n_neg):
    """Endlessly yield ``(inputs, labels)`` batches keyed by input name.

    Each iteration calls ``batch_classification`` with the given arguments and
    repackages its arrays into a dict keyed by the names below, paired with the
    one-hot label array.
    """
    # Same order as the arrays returned by batch_classification (labels excluded).
    input_names = (
        'title_a', 'title_a_token',
        'title_b', 'title_b_token',
        'desc_a', 'desc_a_token',
        'desc_b', 'desc_b_token',
        'info_a', 'info_b',
        'topic_a', 'topic_b',
    )
    while True:
        *arrays, sim = batch_classification(train, dup_sets, bug_ids, batch_size, n_neg)
        yield (dict(zip(input_names, arrays)), sim)

In [19]:
# Show the method tag that drives the feature switches and the predict() dispatch below.
print(METHOD)

'deepTL_1000'

In [20]:
# Feature switches for loading the test batch: BERT inputs are always enabled,
# topic inputs only when the method tag contains 'topics'.
BERT = True
TOPIC = 'topics' in METHOD

In [21]:
%%time

number_of_pairs = len(baseline.test_data)

# The loader returns two extra arrays (topic_a, topic_b) when TOPIC is set, so
# the unpacking arity has to follow the flag.
if not TOPIC:
    (title_a, title_a_seg, title_b, title_b_seg,
     desc_a, desc_a_seg, desc_b, desc_b_seg,
     info_a, info_b, sim) = experiment.batch_classification_test(
        TEST_PATH, BERT=BERT, TOPIC=TOPIC)
else:
    (title_a, title_a_seg, title_b, title_b_seg,
     desc_a, desc_a_seg, desc_b, desc_b_seg,
     info_a, info_b, topic_a, topic_b, sim) = experiment.batch_classification_test(
        TEST_PATH, BERT=BERT, TOPIC=TOPIC)

# Ground-truth labels for the evaluation below.
y_true = sim

CPU times: user 6.72 s, sys: 171 ms, total: 6.89 s
Wall time: 6.88 s


In [22]:
%%time

# Run the classifier on the test pairs. The input list is positional, so each
# branch must list its arrays in the order the corresponding model expects.
# NOTE(review): the BERT branches pass the *_seg array before its token array —
# confirm this matches the order of model.inputs.
if 'topics' in METHOD:
    # Same condition that set TOPIC, so topic_a / topic_b exist in this branch.
    y_scores = model.predict([title_a_seg, title_a, 
                                  title_b_seg, title_b, 
                                  desc_a_seg, desc_a, 
                                  desc_b_seg, desc_b, 
                                  info_a, info_b,
                                  topic_a, topic_b])
# 'baseline_dwen' must be tested before 'baseline', which is a substring of it.
elif 'baseline_dwen' in METHOD:
    y_scores = model.predict([title_a, title_b, 
                                  desc_a, desc_b])
elif 'baseline' in METHOD:
    y_scores = model.predict([title_a, title_b, 
                                  desc_a, desc_b,
                                     info_a, info_b])
else:
    # Every other method: BERT inputs plus info features, without topics.
    y_scores = model.predict([title_a_seg, title_a, 
                                  title_b_seg, title_b, 
                                  desc_a_seg, desc_a, 
                                  desc_b_seg, desc_b, 
                                  info_a, info_b])

def normalize(row):
    """Threshold every score in ``row`` at 0.5.

    Returns a new list with 1 where the score is strictly greater than 0.5 and
    0 otherwise (a score of exactly 0.5 maps to 0).
    """
    flags = []
    for score in row:
        flags.append(1 if score > 0.5 else 0)
    return flags

# Turn every row of predicted scores into hard 0/1 flags.
y_scores = [normalize(score_row) for score_row in y_scores]
def decode_ohe(row):
    """Collapse a one-hot row into a class index.

    Returns 0 when the first element equals 1, otherwise 1 — later elements of
    the row are never inspected.
    """
    if row[0] == 1:
        return 0
    return 1
# Collapse predictions and ground truth from one-hot rows to class indices.
# Both names are rebound in place, so re-running this cell without reloading
# them fails: decode_ohe indexes row[0], which the already-decoded ints lack.
y_scores = list(map(decode_ohe, y_scores))
y_true = list(map(decode_ohe, y_true))

KeyboardInterrupt: 

In [53]:
# Sanity check: predictions and labels must have the same length.
print(len(y_scores), len(y_true))

(81222, 81222)

In [54]:
# Preview the directory and file stem (SAVE_PATH without the '(<DOMAIN>)' suffix)
# that save_result will write to.
print(DIR, SAVE_PATH.replace('({})'.format(DOMAIN), ''))

('data/processed/openoffice/bert', 'baseline_dwen_1000_classification')

In [55]:
def save_result(result, name=None, directory=None):
    """Pickle ``result`` to ``<directory>/<name>.pkl``.

    Args:
        result: any picklable object (here a dict of predictions and labels).
        name: file stem without extension. Defaults to ``SAVE_PATH`` with the
            trailing ``'(<DOMAIN>)'`` part removed, as before.
        directory: destination directory. Defaults to ``DIR``, as before.

    Raises:
        OSError: if the target file cannot be opened for writing (for example
            when ``directory`` does not exist).
    """
    # The notebook-level defaults are resolved only when no explicit value is
    # given, so the function can be reused and tested with another destination.
    if name is None:
        name = SAVE_PATH.replace('({})'.format(DOMAIN), '')
    if directory is None:
        directory = DIR
    with open(os.path.join(directory, '{}.pkl'.format(name)), 'wb') as f:
        pickle.dump(result, f)
    print("=> result saved!")

In [56]:
# Per-sample correctness flags (1 = prediction matches the label); this is a
# list of hits, not an aggregated accuracy value.
acc = [int(predicted == expected) for predicted, expected in zip(y_scores, y_true)]

In [57]:
# Persist the decoded predictions, the ground truth and the per-sample hit flags.
save_result({ 'y_pred' : y_scores, 'y_true' : y_true, 'acc' : acc })

=> result saved!
