In [1]:
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from operator import itemgetter
# Put the parent of the current working directory on the import path
# (presumably so the project-local `methods` package imported in the next
# cell resolves — confirm against the repository layout).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

In [2]:
from methods.baseline import Baseline
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

Using TensorFlow backend.


In [3]:
# Retrieval helper; later cells read its `buckets`, `train` and `test` attributes.
retrieval = Retrieval()

# Dataset locations (relative paths).
path = 'data/processed/eclipse'
path_buckets = 'data/normalized/eclipse/eclipse.csv'
path_train = 'data/processed/eclipse/train.txt'
path_test = 'data/processed/eclipse/test.txt'

# Input widths; the same constants size the Input layers when the encoder model is rebuilt.
MAX_SEQUENCE_LENGTH_T = 20 # Title
MAX_SEQUENCE_LENGTH_D = 200 # Description
MAX_SEQUENCE_LENGTH_I = 1682 # Status, Severity, Version, Component, Module

# Create the instance from baseline
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# Full bug table; `get_similar_bugs_rank` filters it by its 'bug_id' column.
df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

0it [00:00, ?it/s]

Reading train data
Reading the test...


12798it [00:00, 67128.42it/s]
  1%|          | 2019/212512 [00:00<00:10, 20182.80it/s]

Reading test data


100%|██████████| 212512/212512 [00:11<00:00, 19019.06it/s]
  0%|          | 585/321483 [00:00<00:54, 5848.23it/s]

Creating the buckets...


100%|██████████| 321483/321483 [00:16<00:00, 19736.82it/s]
100%|██████████| 39523/39523 [00:01<00:00, 35146.78it/s]
12798it [00:00, 125703.89it/s]

Creating the queries...





In [4]:
# Reverse lookup: every bucket key and every issue stored in a bucket is
# mapped to the key of that bucket.
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    members = np.array(retrieval.buckets[bucket]).tolist()
    issues_by_buckets[bucket] = bucket
    issues_by_buckets.update(dict.fromkeys(members, bucket))

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




In [5]:
# Reset the two result containers kept on the retrieval object.
retrieval.train_vectorized, retrieval.test_result = [], []
# Infer vector to all train
# NOTE(review): nothing visible in this cell vectorizes anything; presumably
# read_train populates `retrieval.train`, which later cells iterate as
# (dup_a_id, dup_b_id) rows — confirm against methods.retrieval.
retrieval.read_train(path_train)

In [6]:
# Sanity check: number of entries held in `retrieval.test`.
print("Total of queries:", len(retrieval.test))

Total of queries: 12798


## Search

#### Filtering the bugs from train

In [7]:
# Gather every bug id that occurs on either side of a training pair and
# store the resulting set on the retrieval object.
retrieval.train_vectorized = []
bug_set = retrieval.baseline.get_bug_set()
bug_unique = set()
for dup_a_id, dup_b_id in tqdm(retrieval.train):
    bug_unique.update((dup_a_id, dup_b_id))
retrieval.bugs_train = bug_unique

HBox(children=(IntProgress(value=0, max=70287), HTML(value='')))




#### Selecting buckets from train

In [8]:
# Keys of the buckets that hold more than one issue.
buckets_duplicates = {
    key
    for key in tqdm(retrieval.buckets)
    if len(retrieval.buckets[key]) > 1
}
print("Selecting only buckets from train...")
# Buckets reached by at least one side of a training pair.
buckets_train = set()
for dup_a_id, dup_b_id in tqdm(retrieval.train):
    for bug_id in (dup_a_id, dup_b_id):
        buckets_train.add(issues_by_buckets[bug_id])

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))


Selecting only buckets from train...


HBox(children=(IntProgress(value=0, max=70287), HTML(value='')))




#### Buckets with at least 2 duplicates

In [9]:
# "Buckets test" counts the duplicate buckets that no training pair touches.
# A set difference is used rather than subtracting the two sizes: the
# subtraction is only correct when every bucket in `buckets_train` is also in
# `buckets_duplicates`, which nothing in this notebook guarantees.
# NOTE: a later cell adds ground-truth buckets to `buckets_train` in place, so
# re-running this cell afterwards reports different numbers.
print("Buckets train:", len(buckets_train))
print("Buckets test:", len(buckets_duplicates - buckets_train))
print("All Buckets:", len(buckets_duplicates))

Buckets train: 20938
Buckets test: 3475
All Buckets: 24413


#### Model to vectorize all buckets

In [10]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the first import cell.
import keras
from keras.models import Model
from keras.layers import Input

# Load the saved similarity model by name, registering the custom `l2_normalize` object it needs.
name = 'baseline_classification_100epoch_10steps(eclipse)'
similarity_model = Baseline.load_model('', name, {'l2_normalize' : Baseline.l2_normalize})

# Fresh input tensors, one per bug field, sized by the MAX_SEQUENCE_LENGTH_* constants.
bug_title = Input(shape = (MAX_SEQUENCE_LENGTH_T, ), name = 'title') #similarity_model.get_layer('title_in').input
bug_desc = Input(shape = (MAX_SEQUENCE_LENGTH_D, ), name = 'desc') # similarity_model.get_layer('desc_in').input
bug_info = Input(shape = (MAX_SEQUENCE_LENGTH_I, ), name = 'info') # similarity_model.get_layer('info_in').input

# Reuse the per-field encoder sub-models of the loaded model.
title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')
info_encoder = similarity_model.get_layer('FeatureMlpGenerationModel')

# Wire each encoder to its new input.
bug_t = title_encoder(bug_title)
bug_d = desc_encoder(bug_desc)
bug_i = info_encoder(bug_info)
# Representation layer
# `model` temporarily holds the merge layer here; it is rebound to the final Model below.
# The merge layer is fed in the order (info, title, desc).
model = similarity_model.get_layer('merge_features_in')
output = model([bug_i, bug_t, bug_d])
# Classification layer
# Chain the three named layers of the loaded model on top of the merged representation, in this order.
for layer in ['dense_2', 'activation_2', 'batch_normalization_2']:
    clf_layer = similarity_model.get_layer(layer)
    output = clf_layer(output)

# Single-bug model; its inputs are ordered (title, desc, info), which is the order used by the predict calls below.
model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

# Only `model.predict` is called in the visible cells of this notebook.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

Loaded model from disk


In [11]:
# Print the layer-by-layer summary of the rebuilt single-bug model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info (InputLayer)               (None, 1682)         0                                            
__________________________________________________________________________________________________
title (InputLayer)              (None, 20)           0                                            
__________________________________________________________________________________________________
desc (InputLayer)               (None, 200)          0                                            
__________________________________________________________________________________________________
FeatureMlpGenerationModel (Mode (None, 256)          4471853     info[0][0]                       
__________________________________________________________________________________________________
FeatureLst

#### Vectorizing all buckets from train

In [12]:
def vectorizer_buckets_train(buckets_train):
    """Encode every bug id in ``buckets_train`` with the notebook-global ``model``.

    Each id is looked up in the baseline bug set and its title, description
    and info features are sent to ``model.predict`` one bug at a time.

    Returns a list of ``{'bug_id': ..., 'vector': ...}`` dicts, in the
    iteration order of ``buckets_train``.
    """
    bug_set = retrieval.baseline.get_bug_set()

    def encode(bug_id):
        bug = bug_set[bug_id]
        model_inputs = [[bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)]]
        # The inputs form a batch of one bug; keep the first row of the prediction.
        return {'bug_id': bug_id, 'vector': model.predict(model_inputs)[0]}

    return [encode(bug_id) for bug_id in tqdm(buckets_train)]  # retrieval.bugs_train

In [13]:
# Encode the buckets selected from the training pairs (one predict call per bucket id).
buckets_train_vectorized = vectorizer_buckets_train(buckets_train)

HBox(children=(IntProgress(value=0, max=20938), HTML(value='')))




#### Vectorizing all test

In [40]:
# Encode every test query that is not itself the key of its bucket
# (a bug that maps to itself in `issues_by_buckets` is skipped).
bug_set = retrieval.baseline.get_bug_set()
queries_test_vectorized = []
for bug_id, ground_truth in tqdm(retrieval.test):
    if issues_by_buckets[bug_id] != bug_id:
        bug = bug_set[bug_id]
        model_inputs = [ [bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)] ]
        bug_vector = model.predict(model_inputs)[0]
        queries_test_vectorized.append({ 'bug_id' : bug_id, 'vector' : bug_vector, 'ground_truth': ground_truth })

HBox(children=(IntProgress(value=0, max=12798), HTML(value='')))




### Indexing all vectors

In [41]:
# Indexing all train
def indexing_train(buckets_train_vectorized, n_trees=10):
    """Build an Annoy index over the vectors of ``buckets_train_vectorized``.

    Item ``i`` of the returned index is ``buckets_train_vectorized[i]['vector']``,
    so neighbour positions returned by the index can be mapped back to
    ``buckets_train_vectorized[i]['bug_id']``.

    Parameters
    ----------
    buckets_train_vectorized : sequence of dict
        Entries carrying a ``'vector'`` key; all vectors must share one length.
    n_trees : int, optional
        Number of trees passed to ``AnnoyIndex.build`` (default 10, the value
        previously hard-coded).

    Returns
    -------
    AnnoyIndex
        The built index.

    Raises
    ------
    ValueError
        If ``buckets_train_vectorized`` is empty, because the vector length
        cannot be determined.
    """
    if len(buckets_train_vectorized) == 0:
        raise ValueError("indexing_train needs at least one vectorized bucket")

    # Length of item vector that will be indexed
    dimension = len(buckets_train_vectorized[0]['vector'])
    # 'angular' is the metric Annoy used when none was given; newer releases
    # warn when it is left implicit, so pass it explicitly.
    annoy = AnnoyIndex(dimension, metric='angular')

    # Wrapping the sequence lets tqdm pick up the total on its own.
    for index, row in enumerate(tqdm(buckets_train_vectorized)):
        annoy.add_item(index, row['vector'])
    annoy.build(n_trees)
    return annoy

In [42]:
# Build the Annoy index; item i of the index is buckets_train_vectorized[i].
annoy = indexing_train(buckets_train_vectorized)

HBox(children=(IntProgress(value=0, max=24413), HTML(value='')))




## Retrieval using classification model

In [43]:
def indexing_test(queries_test_vectorized, index=None, n_neighbors=30):
    """Look up the nearest indexed items for every vectorized query.

    Parameters
    ----------
    queries_test_vectorized : iterable of dict
        Entries carrying a ``'vector'`` key.
    index : object with ``get_nns_by_vector``, optional
        Index to query. When omitted, the notebook-global ``annoy`` is used,
        which keeps existing one-argument calls working.
    n_neighbors : int, optional
        Number of neighbours requested per query (default 30, the value
        previously hard-coded).

    Returns
    -------
    tuple
        ``(X_test, distance_test, indices_test)``: the queries as passed in,
        and, per query, the distances and item positions returned by the index.
    """
    if index is None:
        index = annoy  # global index built by indexing_train

    X_test = queries_test_vectorized
    distance_test, indices_test = [], []
    # Iterate the queries directly so tqdm can show a total for sized inputs.
    for row in tqdm(X_test):
        rank, dist = index.get_nns_by_vector(row['vector'], n_neighbors, include_distances=True)
        indices_test.append(rank)
        distance_test.append(dist)
    return X_test, distance_test, indices_test

In [44]:
# Query the index once per vectorized test query (30 neighbours each).
X_test, distance_test, indices_test = indexing_test(queries_test_vectorized)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [45]:
"Total buckets train vectorized: {}".format(len(buckets_train_vectorized))

'Total buckets train vectorized: 24413'

#### Removing queries that appear in the train

In [46]:
"Total queries vectorized: {}".format(len(queries_test_vectorized))

'Total queries vectorized: 0'

In [21]:
def removing_queries_appear_train_or_buckets_querie_test(queries_test_vectorized, indices_test):
    """Drop the queries whose first neighbour is the query itself.

    ``indices_test[i]`` holds the neighbour positions of query ``i``; each
    position refers to the notebook-global ``buckets_train_vectorized``. A
    query is kept unless the ``bug_id`` at its first neighbour position equals
    its own ``bug_id``.

    Returns a new list containing the kept queries, in their original order.
    """
    return [
        query
        for query, neighbours in zip(queries_test_vectorized, indices_test)
        if query['bug_id'] != buckets_train_vectorized[neighbours[0]]['bug_id']
    ]

In [22]:
# Keep only the queries whose first neighbour is a different bug.
# NOTE: this rebinds `queries_test_vectorized`, so the cell is not idempotent
# unless `indices_test` is recomputed first.
queries_test_vectorized = removing_queries_appear_train_or_buckets_querie_test(queries_test_vectorized, indices_test)

In [23]:
"Total queries vectorized: {}".format(len(queries_test_vectorized))

'Total queries vectorized: 11057'

#### Adding the ground_truth in the train

In [24]:
# Add the bucket of every ground-truth bug to `buckets_train`, then encode
# the enlarged set again.
# NOTE: `buckets_train` is extended in place, so cells that print its size
# show different numbers once this cell has run.
bug_set = retrieval.baseline.get_bug_set()
ground_truth_ids = []
for query in tqdm(queries_test_vectorized):
    ground_truth_ids.extend(query['ground_truth'])
buckets_train.update(issues_by_buckets[bug_id] for bug_id in tqdm(ground_truth_ids))
buckets_train_vectorized = vectorizer_buckets_train(buckets_train)

HBox(children=(IntProgress(value=0, max=11057), HTML(value='')))




HBox(children=(IntProgress(value=0, max=15414), HTML(value='')))




HBox(children=(IntProgress(value=0, max=24413), HTML(value='')))




#### Rebuilding train and test after modifications

In [25]:
# Rebuild the index so its item positions match the re-encoded buckets_train_vectorized.
annoy = indexing_train(buckets_train_vectorized)

HBox(children=(IntProgress(value=0, max=24413), HTML(value='')))




In [26]:
# Query the rebuilt index with the filtered queries.
X_test, distance_test, indices_test = indexing_test(queries_test_vectorized)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [27]:
# Sizes of the indexed set and of the query set after the rebuild.
print("Total buckets train vectorized: {}".format(len(buckets_train_vectorized)))
print("Total queries vectorized: {}".format(len(queries_test_vectorized)))

Total buckets train vectorized: 24413
Total queries vectorized: 11057


## Rank result

In [28]:
# For every query, render its first 20 neighbours as comma-separated
# "bug_id:score" entries; positions are mapped back to bug ids through
# buckets_train_vectorized, and the score is the value returned by the index.
formated_rank = []
for neighbours, scores in tqdm(zip(indices_test, distance_test)):
    top_hits = zip(neighbours[:20], scores[:20])
    formated_rank.append(",".join(
        "{}:{}".format(buckets_train_vectorized[position]['bug_id'], score)
        for position, score in top_hits
    ))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Queries 

In [29]:
# Generating the rank result
# One header per query: "<query bug id>:<key of the bucket the query belongs to>".
# The previous version also joined the ground-truth ids into a string that was
# never used; that dead computation and the unused enumerate index are gone.
# Iterating X_test directly lets tqdm show a total for sized inputs.
rank_queries = [
    "{}:{}".format(row['bug_id'], issues_by_buckets[row['bug_id']])
    for row in tqdm(X_test)
]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [30]:
# Glue every query header to its formatted rank as "<query>|<rank>".
exported_rank = []
with tqdm(total=len(rank_queries)) as loop:
    for query_header, ranked in zip(rank_queries, formated_rank):
        exported_rank.append("{}|{}".format(query_header, ranked))
        loop.update(1)

HBox(children=(IntProgress(value=0, max=11057), HTML(value='')))




In [31]:
# Preview the first 20 exported lines.
exported_rank[:20]

['98309:128463|139517:0.043819282203912735,147492:0.049943216145038605,41111:0.0519556999206543,135926:0.05506506189703941,88362:0.055887047201395035,82662:0.060155875980854034,331517:0.062186349183321,85971:0.062262751162052155,214041:0.06308712810277939,145060:0.06330885738134384,314489:0.0633108988404274,81997:0.06562544405460358,241440:0.06670304387807846,155442:0.06953224539756775,339214:0.07030508667230606,339607:0.07107843458652496,352463:0.07228773832321167,281709:0.07645438611507416,141220:0.07749004662036896,150097:0.07779340445995331',
 '10:10|10:0.0,4931:0.052450090646743774,4882:0.05384274944663048,7519:0.061163727194070816,10509:0.061429060995578766,11279:0.06479943543672562,7564:0.06737362593412399,11208:0.07220620661973953,12369:0.07266508042812347,71:0.07679548114538193,76326:0.07857028394937515,5568:0.0802750363945961,4904:0.08480124920606613,15406:0.08745533972978592,35609:0.09133847057819366,41376:0.09153779596090317,111250:0.09198561310768127,151861:0.0922753885388

In [32]:
# Write one "<query>|<rank>" line per query into <path>/exported_rank.txt.
with open(os.path.join(path, 'exported_rank.txt'), 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [33]:
class Evaluation():
    """Recall-rate@k evaluation of an exported rank file.

    Every line of the file has the form
    ``"query_id:query_master|candidate,candidate,..."`` where each candidate
    is colon-separated and its first field is the id compared against
    ``query_master``.
    """

    # (k, report key) pairs, in the order they appear in the report.
    _REPORT_CUTOFFS = (
        (5, '1 - recall_at_5'),
        (10, '2 - recall_at_10'),
        (15, '3 - recall_at_15'),
        (20, '4 - recall_at_20'),
    )

    def __init__(self):
        pass

    def top_k_recall(self, rank, k):
        """Tell whether the query's master id is among the first ``k`` candidates.

        Parameters
        ----------
        rank : str
            One line of the rank file; surrounding whitespace (such as the
            trailing newline) is ignored.
        k : int
            Number of leading candidates to inspect. Only these are parsed.

        Returns
        -------
        tuple
            ``(corrects, total)`` where ``corrects`` is ``1.0`` on a hit and
            ``0.0`` otherwise, and ``total`` is always ``1`` (one query).

        Raises
        ------
        ValueError
            If the line does not follow the expected format.
        """
        query, rank = rank.strip().split('|')
        query_dup_id, query_master = query.split(":")
        query_master = int(query_master)
        # Empty entries are skipped so that a query without candidates counts
        # as a miss instead of failing on int('').
        top_ids = {int(item.split(':')[0]) for item in rank.split(",")[:k] if item}
        corrects = 1 if query_master in top_ids else 0
        return float(corrects), 1

    def evaluate(self, path):
        """Compute recall@5/10/15/20 over the rank file at ``path``.

        Blank lines are ignored. When the file holds no query at all, every
        recall is reported as ``0.0`` instead of dividing by zero.

        Returns a dict keyed ``'1 - recall_at_5'`` ... ``'4 - recall_at_20'``
        whose values are rounded to two decimals.
        """
        corrects_sum = {k: 0.0 for k, _ in self._REPORT_CUTOFFS}
        total_sum = {k: 0 for k, _ in self._REPORT_CUTOFFS}
        print("Evaluating...")
        with open(path, 'r') as file_input:
            for row in file_input:
                if not row.strip():
                    continue
                for k, _ in self._REPORT_CUTOFFS:
                    corrects, total = self.top_k_recall(row, k=k)
                    corrects_sum[k] += corrects
                    total_sum[k] += total

        report = {
            key: round(corrects_sum[k] / total_sum[k], 2) if total_sum[k] else 0.0
            for k, key in self._REPORT_CUTOFFS
        }

        return report

In [34]:
#from methods.evaluation import Evaluation
# Score the exported rank file with the Evaluation class defined in this
# notebook; the trailing bare `report` displays the resulting dict.
evaluation = Evaluation()
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report

Evaluating...


{'1 - recall_at_5': 0.21,
 '2 - recall_at_10': 0.22,
 '3 - recall_at_15': 0.24,
 '4 - recall_at_20': 0.25}

### Visualizing the rank

In [35]:
def get_similar_bugs_rank(index):
    """Return the bug-table rows of a query and of its ranked candidates.

    Parameters
    ----------
    index : int
        Position in the notebook-global ``exported_rank`` list.

    Returns
    -------
    tuple of DataFrame
        ``(df_query, df_similar)``: the rows of the global ``df`` whose
        ``bug_id`` equals the query id, and the rows whose ``bug_id`` is one
        of the ranked candidate ids.
    """
    query, rank = exported_rank[index].strip().split('|')
    similar_ids = []
    for row in rank.split(','):
        if not row:
            continue  # query without candidates
        fields = row.split(':')
        # The bug id is the field right before the trailing score. This reads
        # the exported "bug_id:score" entries (unpacking three fields from
        # them raised ValueError) as well as "master_id:bug_id:score".
        # Ids are converted to int because `df['bug_id']` is compared with an
        # int just below; string ids would never match in `isin`.
        similar_ids.append(int(fields[-2]))
    df_query = df[df['bug_id'] == int(query.split(':')[0])]
    df_similar = df[df['bug_id'].isin(similar_ids)]
    return df_query, df_similar

In [36]:
def plot_rank(test_labels, tsne_features):
    """Scatter-plot 2-D features, one colour per label group.

    Parameters
    ----------
    test_labels : array-like of int
        One label per row of ``tsne_features``: 0 is drawn as 'anchor',
        1 as 'positive' and 2 as 'negative'. Lists are accepted (they are
        converted to arrays; comparing a plain list with an int used to match
        nothing).
    tsne_features : array-like
        Two-column data; column 0 is plotted on x and column 1 on y.
    """
    obj_categories = ['anchor', 'positive', 'negative']
    labels = np.asarray(test_labels).ravel()
    features = np.asarray(tsne_features)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(obj_categories)))
    plt.figure(figsize=(10, 10))

    for c_group, (c_color, c_label) in enumerate(zip(colors, obj_categories)):
        in_group = labels == c_group
        plt.scatter(features[in_group, 0],
                    features[in_group, 1],
                    marker='o',
                    color=c_color,
                    linewidth=1,  # numeric width; the string '1' is not a valid line width
                    alpha=0.8,
                    label=c_label)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('t-SNE on Testing Samples')
    plt.legend(loc='best')
    #plt.savefig('clothes-dist.png')
    plt.show(block=False)

def display_rank_at_position(index):
    """Intended to plot the rank stored at ``exported_rank[index]``.

    NOTE(review): this function is unfinished. As written it raises
    ``TypeError`` at the argument-less ``append()`` call, and ``valid_sim`` is
    not defined in any visible cell. Decide which feature vectors should be
    collected and which labels should be plotted before calling it.
    """
    query, rank = exported_rank[index].split('|')
    query_bug_id = int(query.split(':')[0])  # parsed but never used below
    x_test_features = []
    x_test_features.append()  # FIXME: list.append() needs exactly one argument
    tsne_features = Baseline.create_features(x_test_features)
    Baseline.plot_2d(valid_sim, tsne_features)  # FIXME: `valid_sim` is undefined in the visible notebook

In [37]:
#df_query, df_similar = get_similar_bugs_rank(19)

In [38]:
#df_query

In [39]:
#df_similar