In [3]:
import keras

In [4]:
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%matplotlib inline

In [5]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers

In [6]:
from methods.retrieval import Retrieval
from methods.baseline import Baseline

In [7]:
# Fixed input lengths; each constant is defined exactly once.
MAX_SEQUENCE_LENGTH_T = 20  # Title
MAX_SEQUENCE_LENGTH_D = 200  # Description
MAX_SEQUENCE_LENGTH_I = 1682  # Status, Severity, Version, Component, Module
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

retrieval = Retrieval()

# Dataset locations, all parameterised by the bug-tracker domain.
DOMAIN = 'eclipse'
path = 'data/processed/{}'.format(DOMAIN)
path_buckets = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)
path_train = 'data/processed/{}/train.txt'.format(DOMAIN)
path_test = 'data/processed/{}/test.txt'.format(DOMAIN)

In [10]:
# Build the Baseline helper from the processed-data directory, the buckets
# CSV and the title/description sequence lengths, and attach it to `retrieval`.
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# Raw buckets table.
# NOTE(review): `df` is not referenced again in the visible cells - confirm it is still needed.
df = pd.read_csv(path_buckets)

# Load the bugs for the training split via `retrieval`.
# presumably this is what populates retrieval.baseline.bug_ids / bug_set - TODO confirm
retrieval.load_bugs(path, path_train)

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data


HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))




In [8]:
# Sanity check: how many bug ids the baseline currently holds.
len(retrieval.baseline.bug_ids)

212512

### Generating batches

In [42]:
import random

def batch_iterator(baseline, data, dup_sets, batch_size, n_neg):
    """Build one random batch of (anchor, positive, negative) bug triplets.

    Parameters
    ----------
    baseline : object
        Must expose ``bug_ids``, ``bug_set`` (indexable by bug id) and
        ``read_batch_bugs(batch, bug)``; the latter is called with a dict of
        'title' / 'desc' / 'info' lists and one entry of ``bug_set``.
    data : list
        Training pairs. ``pair[0]`` is used as the anchor bug id and
        ``pair[1]`` as its positive counterpart. The list is shuffled
        IN PLACE on every call.
    dup_sets : mapping
        Indexed by anchor id; the entry is handed to ``Baseline.get_neg_bug``
        together with ``baseline.bug_ids`` to draw the negative bug id.
    batch_size : int
        Requested number of triplets (e.g. 128). Capped at ``len(data)`` so
        that a request larger than the data no longer raises IndexError.
    n_neg : int
        Currently unused (exactly one negative is drawn per anchor); kept so
        existing callers keep working.

    Returns
    -------
    tuple
        ``(batch_triplets, input_sample, input_pos, input_neg, sim)`` where
        ``batch_triplets`` is a list of ``[anchor, pos, neg]`` ids, the three
        ``input_*`` dicts map 'title' / 'description' / 'info' to numpy arrays,
        and ``sim`` is a 1-D label array.
    """
    random.shuffle(data)

    # Never index past the end of `data`.
    n_batch = min(batch_size, len(data))

    fields = ('title', 'desc', 'info')
    batch_input = {field: [] for field in fields}
    batch_pos = {field: [] for field in fields}
    batch_neg = {field: [] for field in fields}
    batch_triplets = []

    for offset in range(n_batch):
        anchor, pos = data[offset][0], data[offset][1]
        neg = Baseline.get_neg_bug(dup_sets[anchor], baseline.bug_ids)
        bug_anchor = baseline.bug_set[anchor]
        bug_pos = baseline.bug_set[pos]
        bug_neg = baseline.bug_set[neg]
        baseline.read_batch_bugs(batch_input, bug_anchor)
        baseline.read_batch_bugs(batch_pos, bug_pos)
        baseline.read_batch_bugs(batch_neg, bug_neg)
        batch_triplets.append([anchor, pos, neg])

    def _as_model_input(batch):
        # The collected 'desc' lists are exposed under the key 'description'.
        return {
            'title': np.array(batch['title']),
            'description': np.array(batch['desc']),
            'info': np.array(batch['info']),
        }

    # NOTE(review): labels are positional (first half 1, second half 0) and are
    # not derived from the triplets; for an odd batch there is one label fewer
    # than triplets. Kept as in the original - confirm this is intended.
    n_half = n_batch // 2
    if n_half > 0:
        sim = np.concatenate([np.full(n_half, 1), np.full(n_half, 0)])
    else:
        sim = np.array([np.random.choice([1, 0])])

    return (batch_triplets,
            _as_model_input(batch_input),
            _as_model_input(batch_pos),
            _as_model_input(batch_neg),
            sim)

In [43]:
# Draw one 512-triplet batch from the training pairs with the local batch_iterator.
batch_size_test = 512
(batch_triplets,
 valid_input_sample,
 valid_input_pos,
 valid_input_neg,
 valid_sim) = batch_iterator(
    retrieval.baseline,
    retrieval.baseline.train_data,
    retrieval.baseline.dup_sets_train,
    batch_size_test,
    1,
)

In [44]:
# Peek at the first ten [anchor, positive, negative] bug-id triplets.
batch_triplets[:10]

[[96204, 85581, 15196],
 [32475, 32465, 371487],
 [14019, 13487, 17526],
 [179690, 179684, 383873],
 [197526, 122833, 417066],
 [387428, 390154, 111264],
 [361419, 367263, 225269],
 [202500, 197620, 334133],
 [248836, 256052, 75393],
 [226666, 228481, 235827]]

### Siamese model

https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07

https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

In [15]:
from keras.models import model_from_json

def load_model_disk(DIR, name, dependences):
    """Rebuild a Keras model from its JSON architecture and HDF5 weights.

    Reads ``<DIR>/modelos/model_<name>.json`` for the architecture and
    ``<DIR>/modelos/model_<name>.h5`` for the weights.

    Parameters
    ----------
    DIR : str
        Directory that contains the ``modelos`` folder.
    name : str
        Suffix used in both file names.
    dependences
        Passed as the second positional argument of ``model_from_json``
        (its custom-objects mapping).

    Returns
    -------
    The model returned by ``model_from_json`` with the weights loaded by name.
    """
    m_dir = os.path.join(DIR, 'modelos')
    # `with` closes the handle even when reading the JSON file fails.
    with open(os.path.join(m_dir, "model_{}.json".format(name)), 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json, dependences)
    # by_name=True matches weights to layers by layer name.
    loaded_model.load_weights(os.path.join(m_dir, "model_{}.h5".format(name)), by_name=True)
    print("Loaded model from disk")
    return loaded_model

In [11]:
def get_info(baseline, bug):
    """Concatenate the one-hot encodings of a bug's categorical fields.

    Each field is encoded with ``baseline.to_one_hot(bug[field],
    baseline.info_dict[field])`` and the results are joined in the order
    severity, status, component, priority, product, version.
    """
    categorical_fields = (
        'bug_severity',
        'bug_status',
        'component',
        'priority',
        'product',
        'version',
    )
    encoded = [
        baseline.to_one_hot(bug[field], baseline.info_dict[field])
        for field in categorical_fields
    ]
    return np.concatenate(encoded)

In [30]:
from scipy import spatial
# Pair of bug ids to compare by hand.
bug_id = [96204, 2]  # alternative pair tried earlier: [269536, 2]
# Look both bugs up in the baseline's bug set.
bug_set = retrieval.baseline.get_bug_set()
dup_a, dup_b = bug_id
bug_a = bug_set[dup_a]
bug_b = bug_set[dup_b]

In [16]:
from keras.models import load_model

# Checkpoint file of the similarity model for the current DOMAIN.
name = 'checkpoint/checkpoint_baseline_1000epoch_10steps_1024batch({}).hdf5'.format(DOMAIN)

# The second argument maps the custom function names stored in the checkpoint
# to their implementations on Baseline so load_model can resolve them.
similarity_model = load_model(name, {'l2_normalize' : Baseline.l2_normalize, 
                                     'margin_loss' : Baseline.margin_loss,
                                    'pos_distance' : Baseline.pos_distance,
                                    'neg_distance' : Baseline.neg_distance})

# Pull the three feature sub-models out of the loaded model by layer name.
lstm_feature_model = similarity_model.get_layer('FeatureLstmGenerationModel')
cnn_feature_model = similarity_model.get_layer('FeatureCNNGenerationModel')
mlp_feature_model = similarity_model.get_layer('FeatureMlpGenerationModel')

In [31]:
# Embed each title with the LSTM encoder; predict() receives a batch of one,
# so each result holds a single row.
bug_vector_a_t = lstm_feature_model.predict(np.array([bug_a['title_word']]))
bug_vector_b_t = lstm_feature_model.predict(np.array([bug_b['title_word']]))
# scipy's cosine() is documented for 1-D vectors, so compare the single rows
# rather than the 2-D batch outputs.
result = 1 - spatial.distance.cosine(bug_vector_a_t[0], bug_vector_b_t[0])
result

0.9941076040267944

In [32]:
# Embed each description with the CNN encoder; predict() receives a batch of
# one, so each result holds a single row.
bug_vector_a_d = cnn_feature_model.predict(np.array([bug_a['description_word']]))
bug_vector_b_d = cnn_feature_model.predict(np.array([bug_b['description_word']]))
# scipy's cosine() is documented for 1-D vectors, so compare the single rows
# rather than the 2-D batch outputs.
result = 1 - spatial.distance.cosine(bug_vector_a_d[0], bug_vector_b_d[0])
result

0.9998556971549988

In [45]:
baseline = retrieval.baseline
# Embed the one-hot categorical info of each bug with the MLP encoder;
# predict() receives a batch of one, so each result holds a single row.
bug_vector_a_i = mlp_feature_model.predict(np.array([get_info(baseline, bug_a)]))
bug_vector_b_i = mlp_feature_model.predict(np.array([get_info(baseline, bug_b)]))
# scipy's cosine() is documented for 1-D vectors, so compare the single rows
# rather than the 2-D batch outputs.
result = 1 - spatial.distance.cosine(bug_vector_a_i[0], bug_vector_b_i[0])
result

0.18021604418754578

In [34]:
# Join the title, description and info embeddings along the last axis so each
# bug is represented by one combined feature row.
bug_vector_a = np.concatenate([ bug_vector_a_t, bug_vector_a_d, bug_vector_a_i ], -1)
bug_vector_b = np.concatenate([ bug_vector_b_t, bug_vector_b_d, bug_vector_b_i ], -1)
# scipy's cosine() is documented for 1-D vectors, so compare the single rows
# rather than the 2-D batch outputs.
result = 1 - spatial.distance.cosine(bug_vector_a[0], bug_vector_b[0])
result

0.9728338122367859

#### Using the siamese model

In [40]:
import keras
from keras.models import Model
from keras.layers import Input
from keras.models import load_model

# Saved similarity model for the current DOMAIN.
# NOTE(review): `name` and `similarity_model` are re-bound here; an earlier cell
# loads a different file ('checkpoint/...hdf5') under the same names.
name = 'modelos/model_baseline_1000epoch_10steps_1024batch({}).h5'.format(DOMAIN)
# Earlier attempt, kept for reference:
#similarity_model = load_model('', name, {'l2_normalize' : Baseline.l2_normalize})

# The second argument maps the custom function names stored in the file to
# their implementations on Baseline so load_model can resolve them.
similarity_model = load_model(name, {'l2_normalize' : Baseline.l2_normalize, 
                                     'margin_loss' : Baseline.margin_loss,
                                    'pos_distance' : Baseline.pos_distance,
                                    'neg_distance' : Baseline.neg_distance})

# Reuse the input tensors of the loaded model's named input layers.
bug_title =  similarity_model.get_layer('title_in').input # presumably Input(shape = (MAX_SEQUENCE_LENGTH_T, ), name = 'title') - TODO confirm
bug_desc =  similarity_model.get_layer('desc_in').input # presumably Input(shape = (MAX_SEQUENCE_LENGTH_D, ), name = 'desc') - TODO confirm
bug_info = similarity_model.get_layer('info_in').input # presumably Input(shape = (MAX_SEQUENCE_LENGTH_I, ), name = 'info') - TODO confirm

# Feature sub-models, fetched by layer name.
title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')
info_encoder = similarity_model.get_layer('FeatureMlpGenerationModel')

# Apply each encoder to its input tensor.
bug_t = title_encoder(bug_title)
bug_d = desc_encoder(bug_desc)
bug_i = info_encoder(bug_info)
# Representation layer: merge the three encodings.
# NOTE(review): the merge layer is called with (info, title, desc) while the
# Model below takes (title, desc, info) - confirm this matches training.
# `model` is bound to the merge layer here and re-bound to the Model below.
model = similarity_model.get_layer('merge_features_in')
output = model([bug_i, bug_t, bug_d])

# Single-bug model: three inputs in, merged representation out.
model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

# NOTE(review): the visible cells only call model.predict; confirm whether this
# compile step (and its loss/metric choice) is needed at all.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

In [41]:
# Embed each bug with the assembled model (inputs: title, description, info);
# [0] takes the single row of the batch-of-one prediction.
# NOTE(review): this uses retrieval.get_info(bug) while another cell defines a
# local get_info(baseline, bug) - confirm both produce the same encoding.
bug_vector_a = model.predict([ [bug_a['title_word']], [bug_a['description_word']], [retrieval.get_info(bug_a)] ])[0]
bug_vector_b = model.predict([ [bug_b['title_word']], [bug_b['description_word']], [retrieval.get_info(bug_b)] ])[0]
# Cosine similarity = 1 - cosine distance.
result = 1 - spatial.distance.cosine(bug_vector_a, bug_vector_b)
result

0.9727153778076172

In [44]:
# Show the info vector of each bug next to its id.
print(dup_a, retrieval.get_info(bug_a))
print(dup_b, retrieval.get_info(bug_b))
# Cosine similarity of the raw 'description_word' vectors, with no model involved.
# presumably these are padded token-id sequences - TODO confirm
1 - spatial.distance.cosine(bug_a['description_word'], bug_b['description_word'])

96204 [0. 0. 0. ... 0. 0. 0.]
2 [0. 0. 0. ... 0. 0. 0.]


0.36704135776492697

### Validate cosine similarity between the anchor and its positive and negative examples

Problem https://stackoverflow.com/questions/40510703/implement-siamese-network-in-keras-issue

In [36]:
# Draw one 512-sample validation batch with the Baseline's own batch_iterator,
# which yields four values here.
batch_size_test = 512
(valid_input_sample,
 valid_input_pos,
 valid_input_neg,
 valid_sim) = retrieval.baseline.batch_iterator(
    retrieval.baseline.train_data,
    retrieval.baseline.dup_sets_train,
    batch_size_test,
    1,
)

In [37]:
from numpy import dot
from numpy.linalg import norm
from scipy import spatial
import math

# https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
def cosine(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as dot(a, b) divided by the product of the two norms.
    """
    norm_product = norm(a) * norm(b)
    return dot(a, b) / norm_product

In [42]:
bug_set = retrieval.baseline.get_bug_set()


def _embed_batch(inputs):
    """Embed every bug in `inputs` with one batched forward pass of `model`.

    `inputs` is a dict with 'title', 'description' and 'info' entries holding
    one row per bug; the result has one embedding row per bug.
    """
    return model.predict([np.asarray(inputs['title']),
                          np.asarray(inputs['description']),
                          np.asarray(inputs['info'])])


# One predict call per role instead of three calls per triplet.
anchor_vectors = _embed_batch(valid_input_sample)
pos_vectors = _embed_batch(valid_input_pos)
neg_vectors = _embed_batch(valid_input_neg)

# Per-triplet cosine similarities (anchor vs. positive, anchor vs. negative),
# computed with the notebook's cosine() helper.
# The names say "median", but the lists hold raw per-triplet values.
# valid_sim was only zipped along and never read, so it is not needed here.
cos_pos_median = [cosine(anchor, pos) for anchor, pos in zip(anchor_vectors, pos_vectors)]
cos_neg_median = [cosine(anchor, neg) for anchor, neg in zip(anchor_vectors, neg_vectors)]

In [43]:
import numpy as np
# Mean anchor-positive vs. anchor-negative cosine similarity over the batch.
# NOTE(review): the lists are named *_median but the mean is reported here.
np.mean(cos_pos_median), np.mean(cos_neg_median)

(0.9696703, 0.9609164)