In [2]:
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
from operator import itemgetter
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [3]:
from methods.baseline import Baseline
from methods.retrieval import Retrieval
from annoy import AnnoyIndex
import numpy as np

Using TensorFlow backend.


In [4]:
retrieval = Retrieval()

path = 'data/processed/eclipse'
path_buckets = 'data/normalized/eclipse/eclipse.csv'
path_train = 'data/processed/eclipse/train.txt'
path_test = 'data/processed/eclipse/test.txt'

MAX_SEQUENCE_LENGTH_T = 20 # Title
MAX_SEQUENCE_LENGTH_D = 200 # Description
MAX_SEQUENCE_LENGTH_I = 1682 # Status, Severity, Version, Component, Module

# Create the instance from baseline
retrieval.baseline = Baseline(path, path_buckets, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

  0%|          | 0/212512 [00:00<?, ?it/s]

Reading train data


100%|██████████| 212512/212512 [00:56<00:00, 3781.70it/s]
  0%|          | 510/321483 [00:00<01:02, 5099.19it/s]

Creating the buckets...


100%|██████████| 321483/321483 [00:15<00:00, 20220.10it/s]
100%|██████████| 39523/39523 [00:01<00:00, 34866.78it/s]
12798it [00:00, 64864.37it/s]

Creating the queries...





In [5]:
issues_by_buckets = {}
for bucket in tqdm(retrieval.buckets):
    issues_by_buckets[bucket] = bucket
    for issue in np.array(retrieval.buckets[bucket]).tolist():
        issues_by_buckets[issue] = bucket

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))




In [6]:
import keras
# Read the siamese model
name = 'baseline_classification_100epoch_10steps(eclipse)'
retrieval.read_model(name, MAX_SEQUENCE_LENGTH_I, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

retrieval.model.summary()

Loaded model from disk
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
info_a (InputLayer)             (None, 1682)         0                                            
__________________________________________________________________________________________________
title_a (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
desc_a (InputLayer)             (None, 200)          0                                            
__________________________________________________________________________________________________
info_b (InputLayer)             (None, 1682)         0                                            
______________________________________________________________________________________

In [7]:
retrieval.train_vectorized, retrieval.test_result = [], []
# Infer vector to all train
retrieval.read_train(path_train)

## Search

#### Vectorizing train pairs 

In [8]:
retrieval.train_vectorized = []
retrieval.infer_vector_train(retrieval.train)

100%|██████████| 70287/70287 [00:00<00:00, 1396022.47it/s]


In [9]:
buckets_duplicates = [key for key in tqdm(retrieval.buckets) if len(retrieval.buckets[key]) > 1]
print("Selecting only buckets from train...")
buckets_train = set()
for row in tqdm(retrieval.train):
    dup_a_id, dup_b_id = row
    buckets_train.add(issues_by_buckets[dup_a_id])
    buckets_train.add(issues_by_buckets[dup_b_id])

HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))


Selecting only buckets from train...


HBox(children=(IntProgress(value=0, max=70287), HTML(value='')))




#### Buckets with at least 2 duplicates

In [10]:
print("Buckets train:", len(buckets_train))
print("Buckets test:", len(buckets_duplicates) - len(buckets_train))
print("All Buckets:", len(buckets_duplicates))

Buckets train: 20938
Buckets test: 3475
All Buckets: 24413


#### Model to vectorizer all buckets

In [11]:
from keras.models import Model

name = 'baseline_1000epoch_10steps_1024batch(eclipse)'
similarity_model = Baseline.load_model('', name, {'l2_normalize' : Baseline.l2_normalize})

bug_title = similarity_model.get_layer('title_in').input
bug_desc = similarity_model.get_layer('desc_in').input
bug_info = similarity_model.get_layer('info_in').input

title_encoder = similarity_model.get_layer('FeatureLstmGenerationModel')
desc_encoder = similarity_model.get_layer('FeatureCNNGenerationModel')
info_encoder = similarity_model.get_layer('FeatureMlpGenerationModel')

bug_t = title_encoder(bug_title)
bug_d = desc_encoder(bug_desc)
bug_i = info_encoder(bug_info)

model = similarity_model.get_layer('merge_features_in')
output = model([bug_i, bug_t, bug_d])

model = Model(inputs=[bug_title, bug_desc, bug_info], outputs=[output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])

Loaded model from disk


#### Vectorizing all buckets from train

In [12]:
bug_set = retrieval.baseline.get_bug_set()
buckets_train_vectorized = []
for bug_id in tqdm(buckets_train):
    bug = bug_set[bug_id]
    bug_vector = model.predict([ [bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)] ])[0]
    buckets_train_vectorized.append({ 'bug_id' : bug_id, 'vector' : bug_vector })

HBox(children=(IntProgress(value=0, max=20938), HTML(value='')))




#### Bulding the test

In [13]:
bug_set = retrieval.baseline.get_bug_set()
queries_test = []
print("Selecting buckets duplicates...")
buckets_duplicates = [key for key in tqdm(retrieval.buckets) if len(retrieval.buckets[key]) > 1]
test_no_present_in_trained = []
print("Selecting only bugs did not used in the train...")
for row in tqdm(retrieval.test):
    dup_a_id, dup_b_id = row
    diff = list(set(row) - retrieval.bugs_train)
    test_no_present_in_trained += diff
queries_test = test_no_present_in_trained
print("Removing buckets that did not appear in the train")
queries_test = [bug_id for bug_id in test_no_present_in_trained if issues_by_buckets[bug_id] != bug_id]
print("Adding inside the train buckets that were not used ")
buckets_added_train = [bug_id for bug_id in test_no_present_in_trained if issues_by_buckets[bug_id] == bug_id]
for bug_id in buckets_added_train:
    bug = bug_set[bug_id]
    bug_vector = model.predict([ [bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)] ])[0]
    buckets_train_vectorized.append({ 'bug_id' : bug_id, 'vector' : bug_vector })

Selecting buckets duplicates...


HBox(children=(IntProgress(value=0, max=321483), HTML(value='')))


Selecting only bugs did not used in the train...


HBox(children=(IntProgress(value=0, max=17572), HTML(value='')))


Removing buckets that did not appear in the train
Adding inside the train buckets that were not used 


In [14]:
print("Total of test:", len(queries_test))

Total of test: 4081


#### Vectorizing all test

In [15]:
bug_set = retrieval.baseline.get_bug_set()
queries_test_vectorized = []
for bug_id in tqdm(queries_test):
    bug = bug_set[bug_id]
    bug_vector = model.predict([ [bug['title_word']], [bug['description_word']], [retrieval.get_info(bug)] ])[0]
    queries_test_vectorized.append({ 'bug_id' : bug_id, 'vector' : bug_vector })

HBox(children=(IntProgress(value=0, max=4081), HTML(value='')))




### Indexing all vectors

In [16]:
from annoy import AnnoyIndex
import numpy as np
# Indexing all train
X = np.array(buckets_train_vectorized)
annoy = AnnoyIndex(X[0]['vector'].shape[0])  # Length of item vector that will be indexed

In [17]:
loop = tqdm(total=len(X))
for index, row in enumerate(X):
    vector = row['vector']
    annoy.add_item(index, vector)
    loop.update(1)
loop.close()
annoy.build(10) # 10 trees

HBox(children=(IntProgress(value=0, max=24722), HTML(value='')))




True

## Retrieval using classication model

In [18]:
%%time

X_test = queries_test_vectorized
distance_test, indices_test = [], []
for index, row in enumerate(X_test):
    vector = row['vector']
    rank, dist = annoy.get_nns_by_vector(vector, 30, include_distances=True)
    indices_test.append(rank)
    distance_test.append(dist)

CPU times: user 1.25 s, sys: 1.45 ms, total: 1.26 s
Wall time: 1.25 s


In [19]:
"Total bucekets train vectorized: {}".format(len(buckets_train_vectorized))

'Total bucekets train vectorized: 24722'

#### Using classification model to predict similarity

In [20]:
test = []
loop = tqdm(total=len(indices_test) * len(X_test)) # tqdm(total=len(indices_test))
for rank in indices_test:
    for train_bug_id in X_test:
        test += [(buckets_train_vectorized[index]['bug_id'], train_bug_id['bug_id']) for index in rank]
        loop.update(1)
loop.close()

HBox(children=(IntProgress(value=0, max=16654561), HTML(value='')))




In [21]:
"Total queries test: {}".format(len(test))

'Total queries test: 499636830'

In [25]:
def get_info(retrieval, bug, info_cache):
        if bug['issue_id'] in info_cache:
            return info_cache[bug['issue_id']]
        info = np.concatenate((
            retrieval.baseline.to_one_hot(bug['bug_severity'], retrieval.baseline.info_dict['bug_severity']),
            retrieval.baseline.to_one_hot(bug['bug_status'], retrieval.baseline.info_dict['bug_status']),
            retrieval.baseline.to_one_hot(bug['component'], retrieval.baseline.info_dict['component']),
            retrieval.baseline.to_one_hot(bug['priority'], retrieval.baseline.info_dict['priority']),
            retrieval.baseline.to_one_hot(bug['product'], retrieval.baseline.info_dict['product']),
            retrieval.baseline.to_one_hot(bug['version'], retrieval.baseline.info_dict['version']))
        )
        info_cache[bug['issue_id']] = info
        return info

In [24]:
rank_test_sorted = {}
info_cache = {}

for row in tqdm(test):
    dup_a, dup_b = row
    bug_a = bug_set[dup_a]
    bug_b = bug_set[dup_b]
    sim = retrieval.model.predict([ [bug_a['title_word']], [bug_b['title_word']], 
                                        [bug_a['description_word']], [bug_b['description_word']],
                                        [get_info(retrieval, bug_a, info_cache)], [get_info(retrieval, bug_b, info_cache)] ])[0][1]
    if dup_b not in rank_test_sorted:
        rank_test_sorted[dup_b] = []

    rank_test_sorted[dup_b].append((dup_a, sim))

HBox(children=(IntProgress(value=0, max=499636830), HTML(value='')))




ValueError: Error when checking input: expected info_b to have shape (1682,) but got array with shape (1,)

In [None]:
"Total of rank tested: {}".format(len(rank_test_sorted))

## Rank result

#### Ordering the rank 

In [94]:
for key in tqdm(rank_test_sorted):
    rank = rank_test_sorted[key]
    rank_test_sorted[key] = sorted(rank, key = itemgetter(1), reverse = True)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

In [110]:
formated_rank = []
for row in tqdm(rank_test_sorted):
    rank = rank_test_sorted[row][:20]
    formated_rank.append(",".join(["{}:{}".format(bug, sim) for bug, sim in rank]))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

#### Queries 

In [117]:
# Generating the rank result
rank_queries = []
for index, row in tqdm(enumerate(X_test)):
    dup_a, dup_b = row['bug_id'], issues_by_buckets[row['bug_id']]
    rank_queries.append("{}:{}".format(dup_a, dup_b))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [121]:
exported_rank = []
loop = tqdm(total=len(X_test))

for query, rank in zip(rank_queries, formated_rank):
    search, dup = query.split(":")
    exported_rank.append("{}|{}".format(query, rank))
    loop.update(1)
loop.close()

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

In [122]:
exported_rank[:20]

['98309:128463|132787:0.26083698868751526,131246:0.2434113770723343,197865:0.22080236673355103,295382:0.047042302787303925,101094:0.037583835422992706,262561:0.03204227611422539,393332:0.026789871975779533,232063:0.026379309594631195,131180:0.02292243205010891,393277:0.012278357520699501,199241:0.011731269769370556,232304:0.010547742247581482,394517:0.009860608726739883,67031:0.002433638321235776,34454:0.0023417582269757986,198571:0.0022025585640221834,229377:0.0021864583250135183,101023:0.0013441552873700857,166737:0.0011927509913221002,66688:0.001113341422751546']

In [123]:
with open(os.path.join(path, 'exported_rank.txt'), 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [124]:
class Evaluation():
    def __init__(self):
        pass
    
    """
        Rank recall_rate_@k
        rank = "query:master|master:id:sim,master:id:sim"
    """
    def top_k_recall(self, rank, k):
        query, rank = rank.split('|')
        query_dup_id, query_master = query.split(":")
        query_master = int(query_master)
        hit = 0
        for pos, item in enumerate(rank.split(",")[:20]):
            if item.strip() == '': continue
            master, sim = item.split(':')
            master = int(master)
            if master == query_master and (pos+1) <= k:
                hit=1
                return [hit]
        return [hit]

    def evaluate(self, path):
        recall_at_5, recall_at_10, recall_at_15, recall_at_20 = [], [], [], []
        total = 0
        print("Evaluating...")
        with open(path, 'r') as file_input:
            for row in file_input:
                if row == '': continue
                recall_at_5 += self.top_k_recall(row, k=5)
                recall_at_10 += self.top_k_recall(row, k=10)
                recall_at_15 += self.top_k_recall(row, k=15)
                recall_at_20 += self.top_k_recall(row, k=20)
                total+=1
        
        report = {
            'recall_at_5' : round(sum(recall_at_5) / total, 4),
            'recall_at_10' : round(sum(recall_at_10) / total, 4),
            'recall_at_15' : round(sum(recall_at_15) / total, 4),
            'recall_at_20' : round(sum(recall_at_20) / total, 4)
        }

        return report

In [125]:
#from methods.evaluation import Evaluation
evaluation = Evaluation()
report = evaluation.evaluate(os.path.join(path, 'exported_rank.txt'))
report

Evaluating...


{'recall_at_10': 0.0,
 'recall_at_15': 0.0,
 'recall_at_20': 0.0,
 'recall_at_5': 0.0}

### Visualizing the rank

In [None]:
def get_similar_bugs_rank(index):
    query, rank = exported_rank[index].split('|')
    similar_ids = []
    for row in rank.split(','):
        master_id, bug_id, sim = row.split(':')
        similar_ids.append(bug_id)
    df_query = df[df['bug_id'] == int(query.split(':')[0])]
    df_similar = df[df['bug_id'].isin(similar_ids)]
    return df_query, df_similar

In [None]:
def plot_rank(test_labels, tsne_features):
    obj_categories = ['anchor', 'positive', 'negative']
    groups = [0, 1, 2]
    colors = plt.cm.rainbow(np.linspace(0, 1, 3))
    plt.figure(figsize=(10, 10))

    for c_group, (c_color, c_label) in enumerate(zip(colors, obj_categories)):
        plt.scatter(tsne_features[np.where(test_labels == c_group), 0],
                    tsne_features[np.where(test_labels == c_group), 1],
                    marker='o',
                    color=c_color,
                    linewidth='1',
                    alpha=0.8,
                    label=c_label)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('t-SNE on Testing Samples')
    plt.legend(loc='best')
    #plt.savefig('clothes-dist.png')
    plt.show(block=False)

def display_rank_at_position(index):
    query, rank = exported_rank[index].split('|')
    query_bug_id = int(query.split(':')[0])
    x_test_features = []
    x_test_features.append()
    tsne_features = Baseline.create_features(x_test_features)
    Baseline.plot_2d(valid_sim, tsne_features)

In [None]:
retrieval.buckets[128463]

In [None]:
df_query, df_similar = get_similar_bugs_rank(19)

In [None]:
df_query

In [None]:
df_similar