In [1]:
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from operator import itemgetter
# Put the parent of the current working directory on the import path
# (presumably so the project-local `methods` package resolves — TODO confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [2]:
from methods.baseline import Baseline
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

Using TensorFlow backend.


In [3]:
# Project helper that will hold the bugs, buckets and test queries used by
# every later cell of this notebook.
retrieval = Retrieval()

# Dataset selection and the (relative) file locations derived from it.
DOMAIN = 'eclipse'
path = 'data/processed/{}'.format(DOMAIN)
path_buckets = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)
path_train = 'data/processed/{}/train.txt'.format(DOMAIN)
path_test = 'data/processed/{}/test.txt'.format(DOMAIN)

# Input widths; they match the InputLayer shapes printed by model.summary().
MAX_SEQUENCE_LENGTH_T = 20 # Title
MAX_SEQUENCE_LENGTH_D = 200 # Description
MAX_SEQUENCE_LENGTH_I = 1682 # Status, Severity, Version, Component, Module

# Create the instance from baseline
# (MAX_SEQUENCE_LENGTH_I is not passed here; it is only defined for reference.)
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# Normalized bug table; reused later to look up rows by `bug_id`.
df = pd.read_csv(path_buckets)

# The three calls below are order-dependent: bugs first, then buckets, then queries.
# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

Reading train data
Reading the test...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Reading test data


HBox(children=(IntProgress(value=0, max=212512), HTML(value='')))


Creating the buckets...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




HBox(children=(IntProgress(value=0, max=39523), HTML(value='')))


Creating the queries...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
# Reverse lookup: every issue id (the bucket key itself and each of its
# members) maps to the key of the bucket it belongs to.
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    issues_by_buckets[bucket] = bucket
    # Iterate the members directly; the former np.array(...).tolist() round-trip
    # added nothing (for a set it just wrapped and returned the same object).
    for issue in retrieval.buckets[bucket]:
        issues_by_buckets[issue] = bucket

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




In [5]:
# Start from empty containers so re-running this cell does not accumulate results.
retrieval.train_vectorized = []
retrieval.test_result = []
# Load the training split (original note: "infer vector to all train").
retrieval.read_train(path_train)

In [6]:
print("Total of queries:", len(retrieval.test))

Total of queries: 12659


## Search

#### Selecting buckets from train

In [8]:
# Keys of all buckets holding more than one issue.
buckets_duplicates = {
    key
    for key in tqdm(retrieval.buckets)
    if len(retrieval.buckets[key]) > 1
}

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




##### Selecting buckets from test

In [9]:
# Collect the bucket key of every bug referenced by a test query: the query
# bug itself plus each bug listed in its ground truth.
buckets_train = set()
for bug_id, ground_truth in tqdm(retrieval.test):
    for test_bug_id in [bug_id, *ground_truth]:
        buckets_train.add(issues_by_buckets[test_bug_id])

HBox(children=(IntProgress(value=0, max=12659), HTML(value='')))




#### Buckets with at least 2 duplicates

In [10]:
# Summary of the bucket selection. `buckets_train` was built from the buckets
# referenced by `retrieval.test`; `buckets_duplicates` holds every bucket with
# more than one issue.
# NOTE(review): "Buckets test" is a plain difference of set sizes. It only
# equals the number of duplicate buckets not referenced by the queries if
# every element of `buckets_train` is also in `buckets_duplicates` — TODO confirm.
print("Buckets train:", len(buckets_train))
print("Buckets test:", len(buckets_duplicates) - len(buckets_train))
print("All Buckets:", len(buckets_duplicates))

Buckets train: 7875
Buckets test: 16538
All Buckets: 24413


#### Model to vectorize all buckets

In [11]:
import keras
from keras.models import Model
from keras.layers import Input
from keras.models import load_model

# Load the trained similarity network. The custom objects let Keras resolve
# the Baseline-specific functions referenced by the saved model.
name = 'modelos/model_baseline_1000epoch_10steps_1024batch({}).h5'.format(DOMAIN)
custom_objects = {
    'l2_normalize': Baseline.l2_normalize,
    'margin_loss': Baseline.margin_loss,
    'pos_distance': Baseline.pos_distance,
    'neg_distance': Baseline.neg_distance,
    'stack_tensors': Baseline.stack_tensors,
}
similarity_model = load_model(name, custom_objects)

# Reuse the input tensors of the saved model (title / description / info).
bug_title = similarity_model.get_layer('title_in').input
bug_desc = similarity_model.get_layer('desc_in').input
bug_info = similarity_model.get_layer('info_in').input

# Per-field feature extractors taken from the saved model.
title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')
info_encoder = similarity_model.get_layer('FeatureMlpGenerationModel')

bug_t = title_encoder(bug_title)
bug_d = desc_encoder(bug_desc)
bug_i = info_encoder(bug_info)

# Representation layer: merges the three feature vectors (info, title, desc order).
merge_layer = similarity_model.get_layer('merge_features_in')
output = merge_layer([bug_i, bug_t, bug_d])
# Optional normalization step, currently disabled:
# model_normalized = similarity_model.get_layer('normalize_encoded_anchor')
# output = model_normalized(output)

# Stand-alone encoder: one bug in, one embedding out.
model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

# In this notebook the model is only used through predict(); the loss and
# metric given here are never used for training in the visible cells.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

In [12]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_in (InputLayer)            (None, 1682)         0                                            
__________________________________________________________________________________________________
title_in (InputLayer)           (None, 20)           0                                            
__________________________________________________________________________________________________
desc_in (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 128)          232768      info_in[0][0]                    
__________________________________________________________________________________________________
FeatureLst

#### Vectorizing all buckets from train

In [13]:
def vectorizer_buckets_train(buckets_train):
    """Embed every bug id in `buckets_train` with the global `model`.

    Args:
        buckets_train: iterable of bug ids; each must be a key of the bug set
            returned by `retrieval.baseline.get_bug_set()`.

    Returns:
        A list with one dict per id, in iteration order:
        {'bug_id': <id>, 'vector': <first row of model.predict(...)>}.
    """
    bugs_by_id = retrieval.baseline.get_bug_set()

    def _embed(bug_id):
        # One predict() call per bug, with each field wrapped as a batch of one.
        bug = bugs_by_id[bug_id]
        model_inputs = [[bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)]]
        return {'bug_id': bug_id, 'vector': model.predict(model_inputs)[0]}

    return [_embed(bug_id) for bug_id in tqdm(buckets_train)]  # retrieval.bugs_train

In [14]:
buckets_train_vectorized = vectorizer_buckets_train(buckets_train)

HBox(children=(IntProgress(value=0, max=7875), HTML(value='')))




#### Vectorizing all test

In [15]:
# Embed every non-master bug referenced by the test queries. Each entry keeps
# the bug id, its embedding and the key of its bucket as ground truth.
# NOTE(review): a bug that occurs in several rows (as a query and/or inside
# other ground-truth lists) is appended once per occurrence — confirm that
# repeated queries are intended before trusting the recall figures.
bug_set = retrieval.baseline.get_bug_set()
queries_test_vectorized = []
for bug_id, ground_truth in tqdm(retrieval.test):
    for test_bug_id in [bug_id, *ground_truth]:
        master_id = issues_by_buckets[test_bug_id]
        if master_id == test_bug_id:
            # The bucket master is never used as a query.
            continue
        bug = bug_set[test_bug_id]
        model_inputs = [[bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)]]
        bug_vector = model.predict(model_inputs)[0]
        queries_test_vectorized.append({
            'bug_id': test_bug_id,
            'vector': bug_vector,
            'ground_truth': master_id,
        })

HBox(children=(IntProgress(value=0, max=12659), HTML(value='')))




### Indexing all vectors

In [16]:
# Indexing all train
def indexing_train(buckets_train_vectorized, n_trees=10, metric='angular'):
    """Build an Annoy index over the vectors of the train buckets.

    Args:
        buckets_train_vectorized: sequence of dicts carrying a 'vector' entry.
            Item i of the index is the vector at position i, so positions
            returned by the index map back into this sequence.
        n_trees: number of trees built by Annoy (default 10, as before).
        metric: Annoy metric name. 'angular' is what Annoy used when no metric
            was given, so the default preserves the previous behaviour while
            avoiding the implicit-metric deprecation of newer Annoy releases.

    Returns:
        The built AnnoyIndex.

    Raises:
        ValueError: if `buckets_train_vectorized` is empty, since the vector
            dimension cannot be inferred.
    """
    if len(buckets_train_vectorized) == 0:
        raise ValueError("Cannot build an index from an empty list of vectors")

    # Length of item vector that will be indexed
    dimension = len(buckets_train_vectorized[0]['vector'])
    annoy = AnnoyIndex(dimension, metric)

    for index, row in enumerate(tqdm(buckets_train_vectorized)):
        annoy.add_item(index, row['vector'])
    annoy.build(n_trees)
    return annoy

In [17]:
annoy = indexing_train(buckets_train_vectorized)

HBox(children=(IntProgress(value=0, max=7875), HTML(value='')))




## Retrieval using classification model

In [18]:
def indexing_test(queries_test_vectorized, n_neighbors=30):
    """Query the global `annoy` index with every test vector.

    Args:
        queries_test_vectorized: sequence of dicts carrying a 'vector' entry.
        n_neighbors: number of nearest neighbours requested per query
            (default 30, the value previously hard-coded).

    Returns:
        (X_test, distance_test, indices_test) where X_test is the input,
        indices_test[i] holds the index positions returned for query i and
        distance_test[i] the matching scores, computed as 1 - distance.
    """
    X_test = queries_test_vectorized
    distance_test, indices_test = [], []
    # Wrap the sequence itself so the progress bar knows the total.
    for row in tqdm(X_test):
        rank, dist = annoy.get_nns_by_vector(row['vector'], n_neighbors, include_distances=True)
        indices_test.append(rank)
        # Score = 1 - distance. NOTE(review): with Annoy's angular metric the
        # distance is sqrt(2 * (1 - cos)) in [0, 2], so this score lies in
        # [-1, 1] and is not a cosine similarity; it only preserves the order.
        distance_test.append(1 - np.array(dist))
    return X_test, distance_test, indices_test

In [19]:
X_test, distance_test, indices_test = indexing_test(queries_test_vectorized)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [20]:
print("Total buckets train vectorized: {}".format(len(buckets_train_vectorized)))
print("Total queries vectorized: {}".format(len(queries_test_vectorized)))

Total buckets train vectorized: 7875
Total queries vectorized: 22740


## Rank result

In [21]:
# Render the 20 best neighbours of every query as "bug_id:score,bug_id:score,...".
# Index positions are translated back to bug ids through buckets_train_vectorized.
formated_rank = []
# zip() has no length, so pass the total explicitly to get a real progress bar.
for neighbour_ids, neighbour_sims in tqdm(zip(indices_test, distance_test), total=len(indices_test)):
    top_pairs = zip(neighbour_ids[:20], neighbour_sims[:20])
    formated_rank.append(",".join(
        "{}:{}".format(buckets_train_vectorized[index]['bug_id'], sim)
        for index, sim in top_pairs
    ))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Queries 

In [22]:
# Generating the rank result
rank_queries = []

for index, row in tqdm(enumerate(X_test)):
    dup_a, ground_truth = row['bug_id'], row['ground_truth']
    rank_queries.append("{}:{}".format(dup_a, ground_truth))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [23]:
exported_rank = []
loop = tqdm(total=len(rank_queries))

for query, rank in zip(rank_queries, formated_rank):
    exported_rank.append("{}|{}".format(query, rank))
    loop.update(1)
loop.close()

HBox(children=(IntProgress(value=0, max=22740), HTML(value='')))




In [24]:
retrieval.buckets[140026]

{131097, 135876, 140026}

In [25]:
exported_rank[:20]

['324658:327681|110979:1.0,192526:1.0,111302:1.0,222921:1.0,213804:1.0,155214:1.0,146427:1.0,238361:1.0,196922:1.0,69892:1.0,124762:1.0,105668:1.0,85005:1.0,57777:1.0,120336:1.0,269269:1.0,77717:1.0,3385:1.0,174762:1.0,16114:1.0',
 '327682:307170|110979:1.0,222921:1.0,213804:1.0,192526:1.0,155214:1.0,111302:1.0,146427:1.0,328982:1.0,85005:1.0,14937:1.0,120336:1.0,42514:1.0,178749:1.0,179031:1.0,43547:1.0,353535:1.0,269269:1.0,23239:1.0,227213:1.0,31529:1.0',
 '345437:360073|146427:1.0,222921:1.0,213397:1.0,213804:1.0,182486:1.0,365936:0.9996946740429848,165056:0.9996165417251177,238361:0.999563439458143,229829:0.9995552628242876,213128:0.9995355346472934,35037:0.9994747602031566,49265:0.9994725179276429,181718:0.9994725179276429,24538:0.9994725179276429,107744:0.9994723624549806,276562:0.9994706338620745,41641:0.9994685830897652,225018:0.9994685830897652,93528:0.9994684139965102,29524:0.9994665404665284',
 '294924:239825|110979:1.0,213804:1.0,192526:1.0,222921:1.0,155214:1.0,146427:1.0

In [26]:
# Persist the ranks, one query per line, next to the processed data.
exported_path = os.path.join(path, 'exported_rank.txt')
with open(exported_path, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [27]:
class Evaluation():
    """Computes recall-rate@k from an exported rank file (one query per line)."""

    def __init__(self):
        pass

    def top_k_recall(self, rank, k):
        """Check whether the query's master appears in the first k candidates.

        Args:
            rank: one line of the rank file, "query_id:master_id|cand,cand,...".
                Each candidate is colon-separated and only its first field is
                compared with the master id, so both "id:sim" and
                "master:id:sim" items are accepted. A trailing newline is ignored.
            k: cut-off; at most the first 20 candidates are ever considered.

        Returns:
            (corrects, total): corrects is 1.0 on a hit and 0.0 otherwise;
            total is always 1 because a query has exactly one master.

        Raises:
            ValueError: if the line does not follow the format above.
        """
        query, rank = rank.strip().split('|')
        query_dup_id, query_master = query.split(":")
        query_master = int(query_master)
        # Skip empty items so a query with an empty rank counts as a miss
        # instead of failing on int('').
        rank_masters = [int(item.split(':')[0]) for item in rank.split(",")[:20] if item]
        corrects = len({query_master} & set(rank_masters[:k]))
        #total = len(retrieval.buckets[issues_by_buckets[query_master]])
        total = 1 if corrects <= 0 else corrects
        return float(corrects), total

    def evaluate(self, path):
        """Aggregate recall@5/10/15/20 over every non-blank line of `path`.

        Args:
            path: text file with one rank line per query (see top_k_recall).

        Returns:
            Dict with the keys '1 - recall_at_5', '2 - recall_at_10',
            '3 - recall_at_15' and '4 - recall_at_20', each rounded to two
            decimals. All values are 0.0 when the file contains no queries.
        """
        cutoffs = (5, 10, 15, 20)
        corrects_sum = dict.fromkeys(cutoffs, 0.0)
        total_sum = dict.fromkeys(cutoffs, 0)
        print("Evaluating...")
        with open(path, 'r') as file_input:
            for row in file_input:
                # Blank lines (e.g. a trailing newline) carry no query.
                if not row.strip():
                    continue
                for k in cutoffs:
                    corrects, total = self.top_k_recall(row, k=k)
                    corrects_sum[k] += corrects
                    total_sum[k] += total

        report = {}
        for position, k in enumerate(cutoffs, start=1):
            # Guard against an empty file instead of dividing by zero.
            recall = corrects_sum[k] / total_sum[k] if total_sum[k] else 0.0
            report['{} - recall_at_{}'.format(position, k)] = round(recall, 2)

        return report

In [28]:
# The Evaluation class defined in this notebook is used; the project version
# of the import is kept below for reference only.
#from methods.evaluation import Evaluation

# The bare string below is a no-op expression that records results noted from
# earlier runs, for comparison with the report produced by this cell.
"""
    Siamese model
    {'1 - recall_at_5': 0.08,
     '2 - recall_at_10': 0.11,
     '3 - recall_at_15': 0.12,
     '4 - recall_at_20': 0.13}
    Classification model
    {'1 - recall_at_5': 0.1,
     '2 - recall_at_10': 0.15,
     '3 - recall_at_15': 0.18,
     '4 - recall_at_20': 0.21}
"""

# Score the rank file written by the export cell and display the report.
# NOTE(review): the exported ranks shown above contain many candidates with an
# identical score of 1.0 — worth checking the embeddings if recall looks low.
evaluation = Evaluation()
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report

Evaluating...


{'1 - recall_at_5': 0.01,
 '2 - recall_at_10': 0.01,
 '3 - recall_at_15': 0.01,
 '4 - recall_at_20': 0.01}

### Visualizing the rank

In [29]:
def get_similar_bugs_rank(index):
    """Return the data-frame rows of a query and of the bugs ranked for it.

    Args:
        index: position of the query in the global `exported_rank` list.

    Returns:
        (df_query, df_similar): the rows of the global `df` whose `bug_id` is
        the query id, and the rows whose `bug_id` is one of the ranked ids.
    """
    query, rank = exported_rank[index].split('|')
    similar_ids = []
    for row in rank.split(','):
        if not row:
            continue  # empty rank
        # Items are exported as "bug_id:sim"; the older "master:bug_id:sim"
        # layout is still accepted. In both, the id is the second-to-last field.
        fields = row.split(':')
        # `bug_id` is compared with an int below, so ids must be ints here too;
        # string ids would never match in isin().
        similar_ids.append(int(fields[-2]))
    df_query = df[df['bug_id'] == int(query.split(':')[0])]
    df_similar = df[df['bug_id'].isin(similar_ids)]
    return df_query, df_similar

In [30]:
def plot_rank(test_labels, tsne_features):
    """Scatter-plot 2-D features, one colour per label group.

    Args:
        test_labels: array-like of integer labels; 0, 1 and 2 are drawn as
            'anchor', 'positive' and 'negative' respectively.
        tsne_features: array-like whose first two columns are used as the
            x and y coordinates; rows are aligned with `test_labels`.
    """
    obj_categories = ['anchor', 'positive', 'negative']
    colors = plt.cm.rainbow(np.linspace(0, 1, len(obj_categories)))
    labels = np.asarray(test_labels).ravel()
    features = np.asarray(tsne_features)
    plt.figure(figsize=(10, 10))

    for c_group, (c_color, c_label) in enumerate(zip(colors, obj_categories)):
        # A boolean mask selects the rows of this group directly.
        in_group = labels == c_group
        plt.scatter(features[in_group, 0],
                    features[in_group, 1],
                    marker='o',
                    color=c_color,
                    linewidth=1,  # numeric width; the former '1' was a string
                    alpha=0.8,
                    label=c_label)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('t-SNE on Testing Samples')
    plt.legend(loc='best')
    #plt.savefig('clothes-dist.png')
    plt.show(block=False)

def display_rank_at_position(index):
    """Unfinished helper; appears intended to plot the rank at `exported_rank[index]`.

    NOTE(review): this function cannot run as written (see the inline notes).
    Finish or remove it before relying on it.
    """
    query, rank = exported_rank[index].split('|')
    # Parsed but never used below.
    query_bug_id = int(query.split(':')[0])
    x_test_features = []
    # NOTE(review): list.append() needs exactly one argument, so this raises
    # TypeError; the feature to append is missing.
    x_test_features.append()
    tsne_features = Baseline.create_features(x_test_features)
    # NOTE(review): `valid_sim` is not defined anywhere in the visible notebook.
    Baseline.plot_2d(valid_sim, tsne_features)

In [31]:
#df_query, df_similar = get_similar_bugs_rank(19)

In [32]:
#df_query

In [33]:
#df_similar