In [1]:
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from operator import itemgetter
# Put the parent of the working directory on the import path
# (presumably so the project's `methods` package resolves — confirm the layout).
nb_dir = os.path.dirname(os.getcwd())
already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)

In [2]:
from methods.baseline import Baseline
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

Using TensorFlow backend.


In [3]:
# Project helper object; the cells below attach the baseline, bugs, buckets and queries to it.
retrieval = Retrieval()

# Dataset locations (relative paths, resolved against the current working directory).
path = 'data/processed/eclipse'
path_buckets = 'data/normalized/eclipse/eclipse.csv'
path_train = 'data/processed/eclipse/train.txt'
path_test = 'data/processed/eclipse/test.txt'

# Fixed input widths; they are reused further down as the shapes of the model inputs.
MAX_SEQUENCE_LENGTH_T = 20 # Title
MAX_SEQUENCE_LENGTH_D = 200 # Description
MAX_SEQUENCE_LENGTH_I = 1682 # Status, Severity, Version, Component, Module

# Create the instance from baseline
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# Raw bug table; it is passed to create_bucket() below and filtered by `bug_id` in later cells.
df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read the test file and create the duplicate queries
# (presumably this fills `retrieval.test`, which later cells iterate — TODO confirm)
retrieval.create_queries(path_test)

0it [00:00, ?it/s]

Reading train data
Reading the test...


12798it [00:00, 63665.16it/s]
  1%|          | 2114/212512 [00:00<00:09, 21133.27it/s]

Reading test data


100%|██████████| 212512/212512 [00:10<00:00, 20094.02it/s]
  0%|          | 583/321483 [00:00<00:55, 5827.23it/s]

Creating the buckets...


100%|██████████| 321483/321483 [00:16<00:00, 20043.46it/s]
100%|██████████| 39523/39523 [00:01<00:00, 34699.07it/s]
12798it [00:00, 121196.06it/s]

Creating the queries...





In [4]:
# Reverse lookup: every bucket key and every issue stored under it maps to that bucket key.
issues_by_buckets = {}
for bucket_id in tqdm(retrieval.buckets):
    members = np.array(retrieval.buckets[bucket_id]).tolist()
    issues_by_buckets[bucket_id] = bucket_id
    issues_by_buckets.update((issue_id, bucket_id) for issue_id in members)

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




In [5]:
# Read the siamese model
# Name of the saved classification model to load.
# NOTE(review): a different checkpoint ('baseline_1000epoch_10steps_1024batch(eclipse)')
# is loaded further down for vectorization — confirm that both are intended.
name = 'baseline_classification_100epoch_10steps(eclipse)'
retrieval.read_model(name, MAX_SEQUENCE_LENGTH_I, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

# Print the layer-by-layer summary of the model now held by `retrieval`.
retrieval.model.summary()

Loaded model from disk
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_a (InputLayer)             (None, 1682)         0                                            
__________________________________________________________________________________________________
title_a (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
desc_a (InputLayer)             (None, 200)          0                                            
__________________________________________________________________________________________________
info_b (InputLayer)             (None, 1682)         0                                            
______________________________________________________________________________________

In [6]:
# Start from empty containers for the vectorized train bugs and the test results.
retrieval.train_vectorized, retrieval.test_result = [], []
# Read the train file
# (presumably this fills `retrieval.train` with the duplicate pairs iterated below — TODO confirm).
retrieval.read_train(path_train)

In [14]:
print("Total of queries:", len(retrieval.test))

Total of queries: 12798


## Search

#### Vectorizing train pairs 

In [58]:
# Reset the vectorized container and gather every distinct bug id that occurs in a train pair.
retrieval.train_vectorized = []
bug_set = retrieval.baseline.get_bug_set()
bug_unique = set()
for pair in tqdm(retrieval.train):
    first_id, second_id = pair
    bug_unique |= {first_id, second_id}
retrieval.bugs_train = bug_unique

HBox(children=(IntProgress(value=0, max=70287), HTML(value='')))




In [16]:
# Buckets that hold more than one entry.
buckets_duplicates = [
    bucket_id
    for bucket_id in tqdm(retrieval.buckets)
    if len(retrieval.buckets[bucket_id]) > 1
]
print("Selecting only buckets from train...")
# Buckets reached by at least one bug of a train pair.
buckets_train = set()
for pair in tqdm(retrieval.train):
    first_id, second_id = pair
    for bug_id in (first_id, second_id):
        buckets_train.add(issues_by_buckets[bug_id])

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))


Selecting only buckets from train...


HBox(children=(IntProgress(value=0, max=70287), HTML(value='')))




#### Buckets with at least 2 duplicates

In [17]:
# Number of distinct buckets touched by the train pairs.
print("Buckets train:", len(buckets_train))
# NOTE(review): this difference is only a bucket count for the test split if every
# train bucket is also in `buckets_duplicates` — confirm that subset relation.
print("Buckets test:", len(buckets_duplicates) - len(buckets_train))
# Number of buckets holding more than one entry.
print("All Buckets:", len(buckets_duplicates))

Buckets train: 20938
Buckets test: 3475
All Buckets: 24413


#### Model to vectorize all buckets

In [96]:
import keras
from keras.models import Model
from keras.layers import Input

# Saved similarity network whose sub-models are reused to embed a single bug.
name = 'baseline_1000epoch_10steps_1024batch(eclipse)'
similarity_model = Baseline.load_model('', name, {'l2_normalize' : Baseline.l2_normalize})

# One fresh input per bug field, sized with the notebook-wide sequence lengths.
bug_title = Input(shape=(MAX_SEQUENCE_LENGTH_T,), name='title')
bug_desc = Input(shape=(MAX_SEQUENCE_LENGTH_D,), name='desc')
bug_info = Input(shape=(MAX_SEQUENCE_LENGTH_I,), name='info')

# Sub-models of the loaded network, fetched by their layer names.
title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')
info_encoder = similarity_model.get_layer('FeatureMlpGenerationModel')

# Encode each field with its own sub-model.
bug_t = title_encoder(bug_title)
bug_d = desc_encoder(bug_desc)
bug_i = info_encoder(bug_info)

# Combine the three encodings through the network's merge layer (info, title, description order).
merge_features = similarity_model.get_layer('merge_features_in')
output = merge_features([bug_i, bug_t, bug_d])

# Stand-alone model: (title, description, info) -> merged feature vector.
model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

Loaded model from disk


#### Vectorizing all buckets from train

In [97]:
# Embed every train bug with `model`, one predict call per bug.
bug_set = retrieval.baseline.get_bug_set()
buckets_train_vectorized = []
for train_bug_id in tqdm(retrieval.bugs_train):
    record = bug_set[train_bug_id]
    # Each field is wrapped in a list so the model receives a batch of one.
    model_inputs = [
        [record['title_word']],
        [record['description_word']],
        [retrieval.get_info(record)],
    ]
    embedding = model.predict(model_inputs)[0]
    buckets_train_vectorized.append(dict(bug_id=train_bug_id, vector=embedding))

HBox(children=(IntProgress(value=0, max=56578), HTML(value='')))




#### Vectorizing all test

In [98]:
# Embed every test query with `model`, keeping its ground truth next to the vector.
bug_set = retrieval.baseline.get_bug_set()
queries_test_vectorized = []
for query_id, expected in tqdm(retrieval.test):
    record = bug_set[query_id]
    # Each field is wrapped in a list so the model receives a batch of one.
    model_inputs = [
        [record['title_word']],
        [record['description_word']],
        [retrieval.get_info(record)],
    ]
    embedding = model.predict(model_inputs)[0]
    queries_test_vectorized.append(dict(bug_id=query_id, vector=embedding, ground_truth=expected))

HBox(children=(IntProgress(value=0, max=12798), HTML(value='')))




### Indexing all vectors

In [92]:
buckets_train_vectorized[0]

{'bug_id': 1,
 'vector': array([ 5.4095023e+04,  4.4579441e+04,  5.8343961e+04,  3.7577090e+04,
         6.7166492e+04,  6.1266828e+04,  4.2294426e+04,  6.5968047e+04,
         5.5339406e+04,  3.2031213e+04,  5.5323461e+04,  2.9532828e+04,
         5.9494738e+04,  5.6765109e+04,  6.2023844e+04,  6.6623695e+04,
         5.7167195e+04,  6.6088469e+04,  5.1742469e+04,  7.2098039e+04,
         4.9233602e+04,  7.4086609e+04,  6.9188484e+04,  5.2158016e+04,
         7.0691734e+04,  4.2038598e+04,  4.9232613e+04,  6.2573297e+04,
         7.3054094e+04,  4.7811434e+04,  5.0259371e+04,  3.3514844e+04,
         6.4403703e+04,  6.3444719e+04,  5.2334391e+04,  5.6470512e+04,
         6.8522977e+04,  3.9256770e+04,  3.5129508e+04,  6.0702562e+04,
         6.2393559e+04,  2.8148764e+04,  4.0720609e+04,  6.2482688e+04,
         6.4565262e+04,  6.9386641e+04,  5.5471656e+04,  5.4845094e+04,
         6.0922195e+04,  5.1108098e+04,  6.6823500e+04,  3.0119068e+04,
         4.0378051e+04,  6.2787422e+04, 

In [93]:
queries_test_vectorized[0]

{'bug_id': 98309,
 'ground_truth': [128463],
 'vector': array([ 5.40950234e+04,  4.45794414e+04,  5.83439609e+04,  3.75770898e+04,
         6.71664922e+04,  6.12668281e+04,  4.22944258e+04,  6.59680469e+04,
         5.53394062e+04,  3.20312129e+04,  5.53234609e+04,  2.95328281e+04,
         5.94947383e+04,  5.67651094e+04,  6.20238438e+04,  6.66236953e+04,
         5.71671953e+04,  6.60884688e+04,  5.17424688e+04,  7.20980391e+04,
         4.92336016e+04,  7.40866094e+04,  6.91884844e+04,  5.21580156e+04,
         7.06917344e+04,  4.20385977e+04,  4.92326133e+04,  6.25732969e+04,
         7.30540938e+04,  4.78114336e+04,  5.02593711e+04,  3.35148438e+04,
         6.44037031e+04,  6.34447188e+04,  5.23343906e+04,  5.64705117e+04,
         6.85229766e+04,  3.92567695e+04,  3.51295078e+04,  6.07025625e+04,
         6.23935586e+04,  2.81487637e+04,  4.07206094e+04,  6.24826875e+04,
         6.45652617e+04,  6.93866406e+04,  5.54716562e+04,  5.48450938e+04,
         6.09221953e+04,  5.11080

In [99]:
from annoy import AnnoyIndex
import numpy as np
# Indexing all train
# Object array of the {'bug_id', 'vector'} records; the next cell iterates over it.
X = np.array(buckets_train_vectorized)
# Length of item vector that will be indexed.
embedding_dim = X[0]['vector'].shape[0]
# 'angular' is the metric Annoy used when none was given; naming it explicitly keeps
# the behaviour and avoids relying on the deprecated implicit default.
annoy = AnnoyIndex(embedding_dim, metric='angular')

In [100]:
# Add every train vector under its position in X, then build the forest.
for item_id, entry in tqdm(enumerate(X), total=len(X)):
    annoy.add_item(item_id, entry['vector'])
annoy.build(10) # 10 trees

HBox(children=(IntProgress(value=0, max=56578), HTML(value='')))




True

## Retrieval using classification model

In [101]:
%%time

X_test = queries_test_vectorized
# For each query keep the 30 nearest index positions and their distances.
indices_test, distance_test = [], []
for query_entry in X_test:
    neighbours, distances = annoy.get_nns_by_vector(query_entry['vector'], 30, include_distances=True)
    indices_test.append(neighbours)
    distance_test.append(distances)

CPU times: user 4.53 s, sys: 0 ns, total: 4.53 s
Wall time: 4.53 s


In [102]:
"Total bucekets train vectorized: {}".format(len(buckets_train_vectorized))

'Total bucekets train vectorized: 56578'

## Rank result

In [103]:
# Serialise the 20 best neighbours of each query as "bug_id:value,bug_id:value,...".
# The values are the distances returned by Annoy, so smaller means closer.
formated_rank = []
# zip() has no length, so the total is passed explicitly to get a real progress bar.
for row_index, row_sim in tqdm(zip(indices_test, distance_test), total=len(indices_test)):
    row_index, row_sim = row_index[:20], row_sim[:20]
    formated_rank.append(",".join(
        "{}:{}".format(buckets_train_vectorized[index]['bug_id'], sim)
        for index, sim in zip(row_index, row_sim)
    ))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




#### Queries 

In [104]:
# Generating the rank result
rank_queries = []
for index, row in tqdm(enumerate(X_test)):
    dup_a, ground_truth = row['bug_id'], row['ground_truth']
    ground_truth = ",".join(np.array(ground_truth, str))
    rank_queries.append("{}:{}".format(dup_a, ground_truth))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [105]:
# Glue each query prefix to its formatted rank: "query:truth|bug_id:value,...".
exported_rank = [
    "{}|{}".format(query, rank)
    for query, rank in tqdm(zip(rank_queries, formated_rank), total=len(X_test))
]

HBox(children=(IntProgress(value=0, max=12798), HTML(value='')))




In [106]:
exported_rank[:20]

['98309:128463|2:0.0,262461:0.0,131661:0.0,131852:0.0,132137:0.0,394648:0.0,132699:0.0,1635:0.0,263791:0.0,132760:0.0,263879:0.0,133895:0.0,2857:0.0,265122:0.0,3100:0.0,3106:0.0,134220:0.0,134223:0.0,134271:0.0,396454:0.0',
 '10:121067|2:0.0,262461:0.0,131661:0.0,131852:0.0,132137:0.0,394648:0.0,132699:0.0,1635:0.0,263791:0.0,132760:0.0,263879:0.0,133895:0.0,2857:0.0,265122:0.0,3100:0.0,3106:0.0,134220:0.0,134223:0.0,134271:0.0,396454:0.0',
 '294924:239825|2:0.0,262461:0.0,131661:0.0,131852:0.0,132137:0.0,394648:0.0,132699:0.0,1635:0.0,263791:0.0,132760:0.0,263879:0.0,133895:0.0,2857:0.0,265122:0.0,3100:0.0,3106:0.0,134220:0.0,134223:0.0,134271:0.0,396454:0.0',
 '163858:184976|2:0.0,262461:0.0,131661:0.0,131852:0.0,132137:0.0,394648:0.0,132699:0.0,1635:0.0,263791:0.0,132760:0.0,263879:0.0,133895:0.0,2857:0.0,265122:0.0,3100:0.0,3106:0.0,134220:0.0,134223:0.0,134271:0.0,396454:0.0',
 '25:31234|2:0.0,262461:0.0,131661:0.0,131852:0.0,132137:0.0,394648:0.0,132699:0.0,1635:0.0,263791:0.0,13

In [123]:
# Persist the ranks, one per line, next to the processed dataset.
rank_file = os.path.join(path, 'exported_rank.txt')
with open(rank_file, 'w') as file_out:
    file_out.writelines(row + "\n" for row in exported_rank)

In [124]:
class Evaluation():
    """Recall@k evaluation over an exported rank file.

    Every rank line has the form
    ``"query_id:truth[,truth...]|candidate:score,candidate:score,..."``
    where the candidates are ordered from best to worst.
    """

    def __init__(self):
        pass

    def top_k_recall(self, rank, k):
        """Tell whether a ground-truth id appears in the first ``k`` candidates.

        Parameters
        ----------
        rank : str
            One rank line (a trailing newline is tolerated).
        k : int
            Number of leading candidates to inspect.

        Returns
        -------
        list
            ``[1]`` on a hit, ``[0]`` otherwise (a list so callers can extend with it).

        Raises
        ------
        ValueError
            If the line does not contain exactly one ``|`` and one ``query:truth``
            prefix, or an id is not an integer.
        """
        query, candidates = rank.strip().split('|')
        query_dup_id, query_master = query.split(":")
        # A query may list several ground truths, joined with commas.
        expected = {int(item) for item in query_master.split(",") if item.strip() != ''}
        for item in candidates.split(",")[:k]:
            if item.strip() == '': continue
            # The candidate id is the first field; extra fields (scores) are ignored.
            candidate = int(item.split(':')[0])
            if candidate in expected:
                return [1]
        return [0]

    def evaluate(self, path):
        """Compute recall@5/10/15/20 over every non-blank line of the file at ``path``.

        Returns a dict with keys ``recall_at_5``, ``recall_at_10``, ``recall_at_15``
        and ``recall_at_20``, each rounded to 4 decimals. A file without any rank
        line yields 0.0 for every key.
        """
        cutoffs = (5, 10, 15, 20)
        hits = {k: 0 for k in cutoffs}
        total = 0
        print("Evaluating...")
        with open(path, 'r') as file_input:
            for row in file_input:
                # Rows keep their newline, so blank lines must be detected after stripping.
                if row.strip() == '': continue
                for k in cutoffs:
                    hits[k] += self.top_k_recall(row, k)[0]
                total += 1

        report = {}
        for k in cutoffs:
            # Guard against an empty file instead of dividing by zero.
            report['recall_at_{}'.format(k)] = round(hits[k] / total, 4) if total else 0.0

        return report

In [125]:
# The `Evaluation` class defined in this notebook is used; the packaged import stays disabled.
#from methods.evaluation import Evaluation
evaluation = Evaluation()
# Score the rank file stored under `path` as 'exported_rank.txt'.
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report

Evaluating...


{'recall_at_10': 0.0,
 'recall_at_15': 0.0,
 'recall_at_20': 0.0,
 'recall_at_5': 0.0}

### Visualizing the rank

In [None]:
def get_similar_bugs_rank(index):
    """Look up the query bug and its ranked candidates in the bug table.

    Parameters
    ----------
    index : int
        Position of the rank line in the notebook-level ``exported_rank`` list.

    Returns
    -------
    tuple
        ``(df_query, df_similar)``: the rows of the notebook-level ``df`` whose
        ``bug_id`` equals the query id, and the rows whose ``bug_id`` is among
        the ranked candidates.
    """
    query, rank = exported_rank[index].split('|')
    similar_ids = []
    for row in rank.split(','):
        if row.strip() == '':
            continue
        fields = row.split(':')
        # This notebook exports "bug_id:sim" entries; a "master_id:bug_id:sim" layout is
        # accepted too. In both, the bug id is the field right before the score.
        # Ids are converted to int because `bug_id` is compared against an int below.
        similar_ids.append(int(fields[-2]))
    df_query = df[df['bug_id'] == int(query.split(':')[0])]
    df_similar = df[df['bug_id'].isin(similar_ids)]
    return df_query, df_similar

In [None]:
def plot_rank(test_labels, tsne_features):
    """Scatter-plot 2-D features, one colour per label group.

    Parameters
    ----------
    test_labels : array-like
        Group of each sample: 0 = anchor, 1 = positive, 2 = negative.
    tsne_features : array-like
        Per-sample coordinates; column 0 is drawn on x and column 1 on y.
    """
    obj_categories = ['anchor', 'positive', 'negative']
    colors = plt.cm.rainbow(np.linspace(0, 1, len(obj_categories)))
    # Coerce to arrays so that list inputs are compared element-wise and can be masked.
    labels = np.asarray(test_labels)
    features = np.asarray(tsne_features)
    plt.figure(figsize=(10, 10))

    for c_group, (c_color, c_label) in enumerate(zip(colors, obj_categories)):
        in_group = labels == c_group
        plt.scatter(features[in_group, 0],
                    features[in_group, 1],
                    marker='o',
                    color=c_color,
                    linewidth=1,  # numeric width; the original passed the string '1'
                    alpha=0.8,
                    label=c_label)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('t-SNE on Testing Samples')
    plt.legend(loc='best')
    #plt.savefig('clothes-dist.png')
    plt.show(block=False)

def display_rank_at_position(index):
    """Plot the rank stored at ``exported_rank[index]`` (unfinished).

    NOTE(review): this function cannot run as written; see the inline notes.
    """
    query, rank = exported_rank[index].split('|')
    # The query bug id is the part before ':' in the prefix of the rank line.
    query_bug_id = int(query.split(':')[0])
    x_test_features = []
    # NOTE(review): list.append() needs an argument, so this call raises TypeError;
    # the feature that should be appended is missing.
    x_test_features.append()
    tsne_features = Baseline.create_features(x_test_features)
    # NOTE(review): `valid_sim` is not defined in any visible cell — confirm its source.
    Baseline.plot_2d(valid_sim, tsne_features)

In [None]:
retrieval.buckets[128463]

In [None]:
df_query, df_similar = get_similar_bugs_rank(19)

In [None]:
df_query

In [None]:
df_similar