In [1]:
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [190]:
from methods.baseline import Baseline
from methods.retrieval import Retrieval

In [3]:
retrieval = Retrieval()

path = 'data/processed/eclipse'
path_buckets = 'data/normalized/eclipse/eclipse.csv'
path_train = 'data/processed/eclipse/train.txt'
path_test = 'data/processed/eclipse/test.txt'

MAX_SEQUENCE_LENGTH_T = 100 # Title
MAX_SEQUENCE_LENGTH_D = 100 # Description

# Create the instance from baseline
retrieval.baseline = Baseline(path, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

df = pd.read_csv(path_buckets)

# Load bug ids
retrieval.load_bugs(path, path_train)
# Create the buckets
retrieval.create_bucket(df)
# Read and create the test queries duplicate
retrieval.create_queries(path_test)

Reading train data


100%|█████████████████████████████████| 212512/212512 [17:05<00:00, 207.27it/s]


Creating the buckets...


100%|████████████████████████████████| 321483/321483 [01:50<00:00, 2909.26it/s]
100%|█████████████████████████████████| 39523/39523 [00:03<00:00, 11414.71it/s]


Creating the queries...


12859it [00:00, 13936.90it/s]


In [4]:
import keras
# Read the siamese model
retrieval.read_model(MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

Loaded model from disk


In [6]:
retrieval.train_vectorized, retrieval.test_vectorized = [], []
# Infer vector to all train
retrieval.read_train(path_train)
retrieval.infer_vector(retrieval.train, retrieval.train_vectorized)
# Infer vector to all test
retrieval.infer_vector(retrieval.test, retrieval.test_vectorized)

100%|██████████████████████████████████| 70287/70287 [4:00:23<00:00,  4.87it/s]
100%|████████████████████████████████████| 17572/17572 [08:14<00:00, 35.20it/s]


In [7]:
from annoy import AnnoyIndex
import numpy as np
# Indexing all train
X = np.array(retrieval.train_vectorized)
annoy = AnnoyIndex(164)  # Length of item vector that will be indexed

In [8]:
loop = tqdm(total=len(X))
for index, vector in enumerate(X):
    annoy.add_item(index, vector)
    loop.update(1)
loop.close()
annoy.build(10) # 10 trees
annoy.save('eclipse_annoy.ann')

100%|████████████████████████████████| 140574/140574 [00:15<00:00, 9290.60it/s]


True

In [None]:
annoy = AnnoyIndex(164)
annoy.load('eclipse_annoy.ann') # super fast, will just mmap the file

In [9]:
# Creating a hash from the train to index the clusters indices
clusters_by_issue = { 'train' : {} } # { index_cluster : issue_id } 

In [45]:
len(retrieval.train), len(retrieval.train_vectorized)

(70287, 140574)

In [10]:
index = 0
for row in tqdm(retrieval.train):
    dup_a_id, dup_b_id = row
    # if dup_a_id not in bug_set or dup_b_id not in bug_set: continue
    clusters_by_issue['train'][index] = dup_a_id
    clusters_by_issue['train'][index+1] = dup_b_id
    index += 2

100%|████████████████████████████████| 70287/70287 [00:00<00:00, 234120.16it/s]


In [86]:
# Recommend neighborhood instances from test sample
X_test = retrieval.test_vectorized
loop = tqdm(total=len(X_test))
distance_test, indices_test = [], []
for index, vector in enumerate(X_test):
    rank, dist = annoy.get_nns_by_vector(vector, 30, include_distances=True)
    indices_test.append(rank)
    distance_test.append(dist)
    loop.update(1)
loop.close()


  0%|                                                | 0/35144 [00:00<?, ?it/s]
  0%|                                      | 1/35144 [00:00<4:50:44,  2.01it/s]
  0%|                                    | 103/35144 [00:00<3:23:05,  2.88it/s]
  1%|▏                                   | 203/35144 [00:00<2:21:56,  4.10it/s]
  1%|▎                                   | 320/35144 [00:00<1:39:10,  5.85it/s]
  1%|▍                                   | 448/35144 [00:00<1:09:18,  8.34it/s]
  2%|▌                                     | 567/35144 [00:01<48:29, 11.88it/s]
  2%|▋                                     | 688/35144 [00:01<33:58, 16.91it/s]
  2%|▊                                     | 793/35144 [00:01<23:53, 23.97it/s]
  3%|▉                                     | 894/35144 [00:01<16:51, 33.87it/s]
  3%|█                                     | 994/35144 [00:01<11:57, 47.60it/s]
  3%|█▏                                   | 1099/35144 [00:01<08:30, 66.70it/s]
  3%|█▎                                

 71%|█████████████████████████▌          | 24943/35144 [01:23<09:12, 18.47it/s]
 71%|█████████████████████████▋          | 25033/35144 [01:23<06:26, 26.16it/s]
 72%|█████████████████████████▊          | 25139/35144 [01:23<04:30, 36.98it/s]
 72%|█████████████████████████▊          | 25240/35144 [01:23<03:10, 52.01it/s]
 72%|█████████████████████████▉          | 25315/35144 [01:23<02:16, 72.15it/s]
 72%|██████████████████████████          | 25387/35144 [01:23<01:38, 98.83it/s]
 72%|█████████████████████████▎         | 25457/35144 [01:23<01:13, 131.32it/s]
 73%|█████████████████████████▍         | 25527/35144 [01:23<00:55, 173.63it/s]
 73%|█████████████████████████▍         | 25594/35144 [01:23<00:42, 222.12it/s]
 73%|█████████████████████████▌         | 25660/35144 [01:24<00:34, 271.47it/s]
 73%|█████████████████████████▋         | 25732/35144 [01:24<00:28, 330.61it/s]
 73%|█████████████████████████▋         | 25813/35144 [01:24<00:23, 401.94it/s]
 74%|█████████████████████████▊         

In [13]:
issues_by_buckets = {}
for key in tqdm(retrieval.buckets):
    issues_by_buckets[key] = key
    for issue in np.array(retrieval.buckets[key]).tolist():
        issues_by_buckets[issue] = key

100%|██████████████████████████████| 321483/321483 [00:02<00:00, 134247.48it/s]


In [200]:
# Generating the rank result
rank_queries = []
set_queries = {}
for row in tqdm(retrieval.test):
    dup_a, dup_b = row
#     rank_queries.append("{}:{}".format(dup_a, dup_b))
    if dup_a not in set_queries and dup_a != issues_by_buckets[dup_a]:
        rank_queries.append("{}:{}".format(dup_a, issues_by_buckets[dup_a]))
    if dup_b not in set_queries and dup_b != issues_by_buckets[dup_b]:
        rank_queries.append("{}:{}".format(dup_b, issues_by_buckets[dup_b]))
    set_queries[dup_a] = True
    set_queries[dup_b] = True


  0%|                                                | 0/17572 [00:00<?, ?it/s]
 10%|███▎                              | 1688/17572 [00:00<00:00, 16867.31it/s]
 47%|████████████████                  | 8280/17572 [00:00<00:00, 21713.41it/s]
 84%|███████████████████████████▌     | 14691/17572 [00:00<00:00, 27085.06it/s]
100%|█████████████████████████████████| 17572/17572 [00:00<00:00, 47846.18it/s]

In [201]:
exported_rank = []
loop = tqdm(total=len(indices_test))
for query, rank, sim in zip(rank_queries, indices_test, distance_test):
    search, dup = query.split(":")
    rank = ["{}:{}:{}".format(
        issues_by_buckets[clusters_by_issue['train'][item]], clusters_by_issue['train'][item], similarity) 
            for item, similarity in zip(rank, sim) if  
                #issues_by_buckets[clusters_by_issue['train'][item]] != int(dup) 
               clusters_by_issue['train'][item] != int(search) ] # 
    exported_rank.append("{}|{}".format(query, ",".join(rank)))
    loop.update(1)
loop.close()


  0%|                                                | 0/35144 [00:00<?, ?it/s]
  1%|▏                                   | 192/35144 [00:00<00:18, 1918.68it/s]
  1%|▍                                   | 400/35144 [00:00<00:17, 1963.97it/s]
  2%|▌                                   | 588/35144 [00:00<00:17, 1937.57it/s]
  2%|▊                                   | 767/35144 [00:00<00:18, 1890.34it/s]
  3%|▉                                   | 970/35144 [00:00<00:17, 1929.82it/s]
  3%|█▏                                 | 1170/35144 [00:00<00:17, 1949.98it/s]
  4%|█▎                                 | 1343/35144 [00:00<00:18, 1859.67it/s]
  4%|█▌                                 | 1529/35144 [00:00<00:18, 1859.39it/s]
  5%|█▋                                 | 1705/35144 [00:00<00:21, 1564.55it/s]
  5%|█▉                                 | 1883/35144 [00:01<00:20, 1623.19it/s]
  6%|██                                 | 2047/35144 [00:01<00:20, 1623.01it/s]
  6%|██▏                               

In [202]:
exported_rank[:10]

['120648:109290|138528:138528:0.0,37661:46075:0.0,59151:59151:0.0,398509:398509:0.01695529744029045,26404:24492:0.024413608014583588,64319:270231:0.026690147817134857,71987:90797:0.027687272056937218,71987:90797:0.027687272056937218,32204:34201:0.027690207585692406,134608:172458:0.027690207585692406,69374:101778:0.027700912207365036,69374:101778:0.027700912207365036,316509:316509:0.032124560326337814,30186:30186:0.036246635019779205,229919:198067:0.03674674406647682,74949:74949:0.03944720700383186,74949:74949:0.03944720700383186,74949:74949:0.03944720700383186,78295:80640:0.04040580987930298,78295:80640:0.04040580987930298,104203:104203:0.0413176566362381,65385:260093:0.042068127542734146,22536:156359:0.04543006420135498,22536:156359:0.04543006420135498,24793:27263:0.04723434895277023,206169:87479:0.04723434895277023,309049:309049:0.04723434895277023,236524:236524:0.04723434895277023,67327:70636:0.04723434895277023,129034:129034:0.04723434895277023',
 '162234:109290|101224:101443:0.0,2

In [203]:
with open('data/processed/eclipse/exported_rank.txt', 'w') as file_out:
    for row in exported_rank:
        file_out.write(row + "\n")

In [204]:
from methods.evaluation import Evaluation
evaluation = Evaluation()
report = evaluation.evaluate('data/processed/eclipse/exported_rank.txt')
report

{'recall_at_5': 0.0015,
 'recall_at_10': 0.0021,
 'recall_at_15': 0.0028,
 'recall_at_20': 0.0038}