In [14]:
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

import pynndescent
import numpy as np
import sys
import os
from tqdm import tqdm
import sqlite3
sys.path.insert(0, '..')
sys.path.insert(0, '../data_processing')
sys.path.insert(0, '../datasets')
import my_utils
from ComponentAutoExtractor import ComponentAutoExtractor


  from .autonotebook import tqdm as notebook_tqdm


# This experiment assesses how well SBERT performs when given only the short description of each bug report (BR)

## Preparation

In [11]:
# Root of the repository checkout. The original absolute path stays as the default so
# existing runs are unchanged; set PLM_BUGREPORT_HOME to run from another location.
home_path = os.environ.get(
    "PLM_BUGREPORT_HOME",
    "/home/grads/t/tiendat.ng.cs/github_repos/PLM_and_BugReport_datasets",
)
data_path = os.path.join(home_path, "datasets", "hand-gen-datasets")

# connect to db
database_path = os.path.join(home_path, "dbrd_processed.db")
# sqlite3.connect() creates a new, empty database when the file is missing, which would
# only surface later as confusing "no such table" errors -- fail fast instead.
if not os.path.isfile(database_path):
    raise FileNotFoundError(f"SQLite database not found: {database_path}")
conn = sqlite3.connect(database_path)
cursor = conn.cursor()

In [12]:
# process db table, create save folder
table = "spark"
save_path = os.path.join(data_path, table)
my_utils.create_folder(save_path)

union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_duplicates = union_find.get_all_children()

print("Number of bug_ids before filter: ", len(bug_ids))

# Keep only the bug reports that
#   1. appear in `bug_ids_w_duplicates`,
#   2. have a description of at least 50 characters, and
#   3. contain a log according to ComponentAutoExtractor.has_log().
# The membership test runs first so that the description lookup and the extractor are
# skipped for reports that would be discarded anyway.
# NOTE(review): assumes bug ids are hashable (they are used as dict keys further down).
duplicate_id_lookup = set(bug_ids_w_duplicates)
kept_bug_ids = []
for bug_id in bug_ids:
    if bug_id not in duplicate_id_lookup:
        continue
    # Treat a missing description as empty so it is filtered out instead of crashing len().
    desc = my_utils.get_description(conn, table, bug_id) or ""
    if len(desc) < 50:
        continue
    if not ComponentAutoExtractor(desc).has_log():
        continue
    kept_bug_ids.append(bug_id)

# Build the filtered list in one pass rather than calling list.remove() per rejected id,
# which rescans the list every time.
bug_ids = kept_bug_ids

print("Number of bug_ids after filter: ", len(bug_ids))

Folder '/home/grads/t/tiendat.ng.cs/github_repos/PLM_and_BugReport_datasets/datasets/hand-gen-datasets/spark' already exists.
Processing spark


100%|██████████| 9579/9579 [00:00<00:00, 83428.14it/s]


Number of bug_ids before filter:  9577
Number of bug_ids after filter:  124


## Load model

In [15]:
# Sentence-embedding model used to encode the short descriptions.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [18]:
# Finalize the search space: start from the filtered query bugs and append every
# duplicate reported by union_find.get_children() that is not already present.
search_space = bug_ids.copy()
# Mirror of search_space for constant-time membership checks; the list keeps the order.
ids_in_search_space = set(search_space)
for bug_id in tqdm(bug_ids):
    for dup in union_find.get_children(bug_id):
        # bug_id itself is always in the set already, so no separate `dup != bug_id` test.
        if dup not in ids_in_search_space:
            ids_in_search_space.add(dup)
            search_space.append(dup)

100%|██████████| 124/124 [00:00<00:00, 48196.99it/s]


In [22]:
# convert to vectors

# Fetch the short description of every bug in the search space.
short_descs = [my_utils.get_short_desc(conn, table, bug_id) for bug_id in tqdm(search_space)]

# Encode all descriptions in one batched call and ask for NumPy output directly.
# Calling `.numpy()` on a `convert_to_tensor=True` result fails when the model runs on a
# CUDA device; `convert_to_numpy=True` returns host arrays regardless of the device.
search_space_matrix = model.encode(short_descs, convert_to_numpy=True, show_progress_bar=False)

# bug_id -> embedding vector (one row of the encoded matrix per bug).
search_space_vects = dict(zip(search_space, search_space_matrix))

100%|██████████| 175/175 [00:01<00:00, 87.83it/s]


In [23]:
# Stack the embeddings in `search_space` order so that row i of the index is always
# search_space[i]; later cells translate bug ids to rows with search_space.index(...).
search_space_matrix_for_index = np.array([search_space_vects[bug_id] for bug_id in search_space])

# NNDescent is an approximate, randomized index: fix the seed so the retrieved
# neighbours (and the reported scores) are reproducible across runs.
index = pynndescent.NNDescent(
    search_space_matrix_for_index,
    n_neighbors=50,
    metric="cosine",
    random_state=42,
)
index.prepare()

In [25]:

# Row of each query bug inside the search space.
Q_indices = list(map(search_space.index, bug_ids))
# Queries reuse the embeddings that were already computed for the search space.
Q_vects = [search_space_vects[bug_id] for bug_id in tqdm(bug_ids)]

100%|██████████| 124/124 [00:00<00:00, 382985.05it/s]


In [26]:
# Ask for 11 neighbours per query; the scoring cell below skips the first column.
query_matrix = np.array(Q_vects)
neighbors = index.query(query_matrix, k=11)

In [30]:
# found_in_top_k_wo_stacktrace[k] counts the queries for which at least one duplicate is
# retrieved within the first k neighbours (k = 1..10). Entry 0 is never incremented.
found_in_top_k_wo_stacktrace = np.zeros(11, dtype=int)
max_k = len(found_in_top_k_wo_stacktrace) - 1
neighbor_indices = neighbors[0]  # first element of the query result: neighbour row indices

# bug_id -> first row in search_space (same answer as list.index, without rescanning).
row_of = {}
for row, bug_id in enumerate(search_space):
    row_of.setdefault(bug_id, row)

for i in tqdm(range(len(bug_ids))):
    q = bug_ids[i]
    query_row = row_of[q]
    # Rows of the duplicates of q. The query's own row is removed explicitly
    # (get_children(q) presumably contains q itself -- TODO confirm) so that a query can
    # never be counted as its own duplicate.
    duplicate_rows = {row_of[dup] for dup in union_find.get_children(q)} - {query_row}
    # Drop the query itself wherever it appears instead of assuming it is always in
    # column 0: with tied distances or approximate search it can land elsewhere or be
    # missing, in which case column 0 holds a genuine neighbour that must be ranked.
    ranked = [int(n) for n in neighbor_indices[i] if int(n) != query_row][:max_k]
    for rank, neighbor_row in enumerate(ranked, start=1):
        if neighbor_row in duplicate_rows:
            # A hit at this rank counts for every cutoff k >= rank.
            found_in_top_k_wo_stacktrace[rank:] += 1
            break
        

100%|██████████| 124/124 [00:00<00:00, 33112.22it/s]


In [31]:
# Share of queries with a duplicate found within the top k (entry k of the array).
np.divide(found_in_top_k_wo_stacktrace, len(bug_ids))

array([0.        , 0.60483871, 0.67741935, 0.7016129 , 0.72580645,
       0.73387097, 0.75      , 0.76612903, 0.77419355, 0.79032258,
       0.7983871 ])