# Import

In [1]:
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

import sqlite3
import argparse
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
import os
from numpy.linalg import norm
import sys
from itertools import combinations
import random
import my_utils

  from .autonotebook import tqdm as notebook_tqdm


# Stacktrace

We suspect that stack traces are what drives the similarity between bug reports (BRs).

# Utils

In [2]:
class UnionFind:
    """Disjoint-set forest that groups bug reports with their duplicates.

    Nodes are bug ids. Two ids end up in the same set when a chain of
    ``dup_id`` links connects them. Sets are merged by size, and ``find``
    compresses paths lazily, so ``self.parent`` may hold non-root parents
    until a node is looked up again. Every query method therefore resolves
    roots through ``find`` instead of reading ``self.parent`` directly.
    """

    def __init__(self):
        self.parent = {}          # node -> parent node (a root is its own parent)
        self.ranks = {}           # root -> number of nodes in its set (union by size)
        self.processed = False    # becomes True once process_project() has finished
        self.project_name = None  # table name given to process_project()

    def find(self, x):
        """Return the root of ``x``'s set, registering ``x`` as a singleton if unseen."""
        if x not in self.parent:
            self.parent[x] = x
            self.ranks[x] = 1
            return x

        # Path compression: re-point x straight at its root.
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (the smaller set joins the larger)."""
        root_x = self.find(x)
        root_y = self.find(y)

        if root_x != root_y:
            if self.ranks[root_x] < self.ranks[root_y]:
                self.parent[root_x] = root_y
                self.ranks[root_y] += self.ranks[root_x]
            else:
                self.parent[root_y] = root_x
                self.ranks[root_x] += self.ranks[root_y]

    def process_project(self, conn, project_name):
        """Load every (bug_id, dup_id) link of table ``project_name`` into the forest.

        Rows whose ``dup_id`` is -1 are skipped. ``project_name`` is
        interpolated into the SQL text, so it must be a trusted table name.

        Raises:
            ValueError: if the table has no ``bug_id`` or ``dup_id`` column.
        """
        cursor = conn.cursor()
        self.project_name = project_name
        print("Processing", project_name)
        cursor.execute(f"SELECT * FROM {project_name}")
        # Take column positions from the result set itself instead of relying on
        # a notebook-level ``column_names`` global that may not exist.
        columns = [description[0] for description in cursor.description]
        bug_col = columns.index("bug_id")
        dup_col = columns.index("dup_id")
        for row in cursor.fetchall():
            dup_id = int(row[dup_col])
            if dup_id == -1:
                continue
            bug_id = int(row[bug_col])
            assert dup_id != bug_id
            self.union(bug_id, dup_id)
        self.processed = True

    def get_roots(self):
        """Return one representative id per set."""
        assert self.processed
        # find() is required: a raw parent entry can be a stale, non-root node.
        return list({self.find(node) for node in list(self.parent)})

    def get_children(self, parent):
        """Return every id in the same set as ``parent`` (``parent`` included)."""
        assert self.processed
        root = self.find(parent)
        return [node for node in list(self.parent) if self.find(node) == root]

    def get_all_children(self):
        """Return every id known to the structure."""
        return list(self.parent)

    def are_dups(self, bug_id1, bug_id2):
        """Return True when both ids are known and belong to the same set."""
        if bug_id1 not in self.parent or bug_id2 not in self.parent:
            return False
        return self.find(bug_id1) == self.find(bug_id2)
            

In [3]:
def get_bug_ids(conn, table_name):
    """Return the distinct ``bug_id`` values of ``table_name`` in ascending order.

    ``table_name`` is interpolated into the SQL text, so it must be a
    trusted identifier.
    """
    column_name = "bug_id"
    query = f"SELECT DISTINCT {column_name} FROM {table_name} ORDER BY {column_name};"

    db_cursor = conn.cursor()
    db_cursor.execute(query)

    # Every fetched row holds a single column; unwrap it.
    return [bug_id for (bug_id,) in db_cursor.fetchall()]

In [4]:
def get_column_names(conn, table_name):
    """Return the column names of ``table_name`` in declaration order.

    ``table_name`` is interpolated into the PRAGMA statement, so it must be
    a trusted identifier.
    """
    db_cursor = conn.cursor()
    db_cursor.execute(f"PRAGMA table_info({table_name});")

    # The name is the second field of each row returned by the PRAGMA.
    names = []
    for column_info in db_cursor.fetchall():
        names.append(column_info[1])
    return names

In [5]:
def get_code_feature(conn, project_name, bug_id):
    """Return the ``code_feature`` column of the report ``bug_id`` in ``project_name``.

    ``project_name`` and ``bug_id`` are interpolated into the SQL text, so
    both must come from trusted sources.

    Raises:
        IndexError: if no row has the requested ``bug_id``.
        ValueError: if the table has no ``code_feature`` column.
    """
    cursor = conn.cursor()
    query = f"SELECT * FROM {project_name} WHERE bug_id = {bug_id};"
    cursor.execute(query)

    # Read column positions from the result set rather than from a
    # notebook-level ``column_names`` global that may be stale or undefined.
    columns = [description[0] for description in cursor.description]

    row = cursor.fetchone()
    if row is None:
        raise IndexError(f"no row with bug_id {bug_id} in table {project_name}")
    return row[columns.index("code_feature")]

In [6]:
def get_descriptions(conn, project_name, bug_id):
    """Return the text of report ``bug_id``: description, then short description.

    The two fields are joined with ``" \\n "`` and every backslash-escaped
    single quote (``\\'``) is replaced by a plain ``'``.

    ``project_name`` and ``bug_id`` are interpolated into the SQL text, so
    both must come from trusted sources.

    Raises:
        IndexError: if no row has the requested ``bug_id``.
        ValueError: if the table lacks a ``description`` or ``short_desc`` column.
    """
    cursor = conn.cursor()
    query = f"SELECT * FROM {project_name} WHERE bug_id = {bug_id};"
    cursor.execute(query)

    # Read column positions from the result set rather than from a
    # notebook-level ``column_names`` global that may be stale or undefined.
    columns = [description[0] for description in cursor.description]

    row = cursor.fetchone()
    if row is None:
        raise IndexError(f"no row with bug_id {bug_id} in table {project_name}")

    desc = row[columns.index("description")]
    short_desc = row[columns.index("short_desc")]
    return (desc + " \n " + short_desc).replace("\\'", "'")

In [7]:
def vectorize(description, stride_len, chunk_size):
    """Embed ``description`` as a batch of fixed-size, overlapping token windows.

    Uses the notebook-level ``tokenizer`` and ``model``. Returns the first
    element of the model output as a NumPy array, or ``None`` when the text
    is too short or yields no full window.
    """
    tokens = tokenizer.tokenize(description)
    # Fewer tokens than half a window: treated as too little information.
    if len(tokens) < chunk_size // 2:
        return None

    # CLS/SEP wrap the whole sequence once, before it is cut into windows.
    token_ids = tokenizer.convert_tokens_to_ids(
        [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    )

    # Slide a window of chunk_size ids in steps of stride_len and keep only
    # the windows that are completely filled (no padding is applied).
    total = len(token_ids)
    chunk_list = [
        token_ids[start:start + chunk_size]
        for start in range(0, total, stride_len)
        if start + chunk_size <= total
    ]
    if not chunk_list:
        return None

    chunk_arr = np.array(chunk_list)
    context_embedding = model(torch.tensor(chunk_arr)[:, :])[0]
    return context_embedding.detach().numpy()

In [8]:
def get_duplicated_pairs(union_find):
    """Return every unordered pair of ids that share a duplicate group.

    Groups are visited in the order of ``union_find.get_roots()``; within a
    group, pairs follow ``itertools.combinations`` order.
    """
    return [
        pair
        for root in union_find.get_roots()
        for pair in combinations(union_find.get_children(root), 2)
    ]

In [9]:
def get_non_duplicated_pairs(union_find, conn, size):
    """Randomly draw ``size`` pairs of reports that are not marked as duplicates.

    A pool of as many ids as the union-find knows is sampled from the whole
    project table, and pairs are drawn from that pool. The same pair may be
    drawn more than once.

    NOTE(review): the loop never ends if the pool contains no valid pair.
    """
    from_dup = union_find.get_all_children()
    assert union_find.processed
    # Pool of candidates taken from all reports of the project.
    candidates = random.sample(get_bug_ids(conn, union_find.project_name), len(from_dup))

    pairs = []
    while len(pairs) < size:
        first, second = random.sample(candidates, 2)
        if first != second and not union_find.are_dups(first, second):
            pairs.append((first, second))
    return pairs

In [10]:
def get_mislabels(union_find, bug_ids, anchor_bug_id, threshold):
    """Return the ids that look like unlabelled duplicates of ``anchor_bug_id``.

    An id is returned when it is not already a known duplicate of the
    anchor and its similarity to the anchor is strictly above ``threshold``
    (which must lie in [0, 1]).

    NOTE(review): ``get_similarity_of_pair`` is not defined in the cells
    visible here - confirm it exists before running this.
    """
    assert 0 <= threshold <= 1
    return [
        bug_id
        for bug_id in tqdm(bug_ids)
        if not union_find.are_dups(anchor_bug_id, bug_id)
        and get_similarity_of_pair((anchor_bug_id, bug_id)) > threshold
    ]

In [11]:

def _cosine_similarity_of_texts(text0, text1):
    """Embed two texts with the notebook-level ``model`` and return their cosine similarity."""
    embedding0 = model.encode(text0, convert_to_tensor=True)
    embedding1 = model.encode(text1, convert_to_tensor=True)
    # .cpu() changes nothing for CPU tensors but is required before .numpy()
    # when the embeddings live on a GPU.
    return util.pytorch_cos_sim(embedding0, embedding1).cpu().numpy()[0, 0]


def get_similarity_of_pair_with_code_feature(conn, project_name, pair):
    """Cosine similarity between the code features of the two bug ids in ``pair``."""
    return _cosine_similarity_of_texts(
        my_utils.get_code_feature(conn, project_name, pair[0]),
        my_utils.get_code_feature(conn, project_name, pair[1]),
    )


def get_similarity_of_pair_with_desc(conn, project_name, pair):
    """Cosine similarity between the descriptions of the two bug ids in ``pair``."""
    return _cosine_similarity_of_texts(
        my_utils.get_descriptions(conn, project_name, pair[0]),
        my_utils.get_descriptions(conn, project_name, pair[1]),
    )

In [12]:
def top_closest_values_indeces(k, vectors, q):
    """Rank ``vectors`` by similarity to ``q``.

    Returns ``(top_k_indices, sim_scores)``: the positions of the ``k`` most
    similar vectors, and the full list of ``(score, position)`` tuples
    sorted from most to least similar. Ties on the score are ordered by
    descending position because whole tuples are compared.
    """
    sim_scores = sorted(
        ((my_utils.similarity_score_1d(v, q), i) for i, v in enumerate(vectors)),
        reverse=True,
    )
    top_k_indices = [position for _, position in sim_scores[:k]]
    return top_k_indices, sim_scores

# Connect to the database

In [16]:
# Path of the SQLite database, relative to the notebook's working directory.
database_path = "./dbrd_processed.db"

# Connection passed as `conn` to the helpers in this notebook.
conn = sqlite3.connect(database_path)
# Notebook-level cursor on that connection.
cursor = conn.cursor()

# Model

In [21]:
# Sentence-embedding model used by every `model.encode(...)` call in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Eclipse

In [91]:
# Select the project table and build its duplicate groups.
table = "eclipse"
union_find = my_utils.UnionFind()
# NOTE(review): min_desc_length presumably filters out reports with very short descriptions - confirm in my_utils.
union_find.process_project(conn, table, min_desc_length=10)

Processing eclipse


100%|██████████| 27583/27583 [00:00<00:00, 60547.81it/s]


In [92]:
# All report ids of the table, and those whose stack trace is non-empty.
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [
    candidate_id
    for candidate_id in bug_ids
    if len(my_utils.get_stacktrace(conn, table, candidate_id)) != 0
]

# Report ids known to the union-find, i.e. reports that have duplicates.
bug_ids_w_duplicates = union_find.get_all_children()

# Reports that have duplicates and also carry a stack trace.
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates) & set(bug_ids_w_stacktrace))

In [93]:
# Number of reports that have both duplicates and a stack trace.
len(bug_ids_w_duplicates_and_stacktrace)

351

In [94]:
# Number of reports that have duplicates.
len(bug_ids_w_duplicates)

1222

In [95]:
# Number of reports with a non-empty stack trace.
len(bug_ids_w_stacktrace)

2883

## with stacktrace

In [96]:


# positive_set = get_duplicated_pairs(union_find)
# negetive_set = get_non_duplicated_pairs(union_find, conn, len(positive_set)*5)


# bug_ids = list(set([bug_id for t in positive_set for bug_id in t] + [bug_id for t in negetive_set for bug_id in t]))

# sample_bug_ids = list(set(bug_ids_w_duplicates).union(set(bug_ids_w_stacktrace)))
# Embed the description of every report in the table
# (presumably including the stack trace, per the section heading).
sample_bug_ids = bug_ids

vectors = []
for bug_id in tqdm(sample_bug_ids):
    desc = my_utils.get_descriptions(conn, table, bug_id)
    # encode() returns a NumPy array by default, so the tensor -> numpy
    # round-trip (which fails for tensors on a GPU) is not needed.
    vectors.append(model.encode(desc).tolist())

100%|██████████| 27583/27583 [07:34<00:00, 60.64it/s]


In [97]:
# For each query report: the fraction of its known duplicates that appear
# among its TOP_K nearest neighbours by embedding similarity.
TOP_K = 10

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

accuracies = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Ask for one extra hit: the query is part of `vectors` and is expected
    # among its own neighbours, from which it is then removed.
    topk_indeces, _ = top_closest_values_indeces(TOP_K + 1, vector_matrix, vector_matrix[row_of[bug_id]])
    topk = [sample_bug_ids[i] for i in topk_indeces]
    topk.remove(bug_id)

    children_in_top_k = set(children) & set(topk)
    accuracies.append(len(children_in_top_k) / len(children))

100%|██████████| 351/351 [03:56<00:00,  1.48it/s]


In [98]:
# Average, over all query reports, of the fraction of duplicates found among the top neighbours.
np.mean(accuracies)

0.4296318116830937

Calculate the average ranking of each report's children (its duplicates).

In [99]:
# For each query report: the mean 0-based rank of its duplicates when all
# other reports are sorted by similarity to it (rank 0 = most similar).

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

ranking_averages = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Rank every report, then drop the query itself from the ranking.
    ranked_indeces, _ = top_closest_values_indeces(len(vectors), vector_matrix, vector_matrix[row_of[bug_id]])
    ranked = [sample_bug_ids[i] for i in ranked_indeces]
    ranked.remove(bug_id)

    rankings = [ranked.index(child) for child in children]
    ranking_averages.append(np.mean(rankings))

100%|██████████| 351/351 [03:59<00:00,  1.47it/s]


In [100]:
# Average, over all query reports, of the mean 0-based rank of their duplicates.
np.mean(ranking_averages)

536.3220765345765

## Without stacktrace

In [101]:


# positive_set = get_duplicated_pairs(union_find)
# negetive_set = get_non_duplicated_pairs(union_find, conn, len(positive_set)*5)


# bug_ids = list(set([bug_id for t in positive_set for bug_id in t] + [bug_id for t in negetive_set for bug_id in t]))

# sample_bug_ids = list(set(bug_ids_w_duplicates).union(set(bug_ids_w_stacktrace)))
# Embed every report's description as returned by
# my_utils.get_desc_wo_stacktrace (presumably with the stack trace removed).
sample_bug_ids = bug_ids

vectors = []
for bug_id in tqdm(sample_bug_ids):
    desc = my_utils.get_desc_wo_stacktrace(conn, table, bug_id)
    # encode() returns a NumPy array by default, so the tensor -> numpy
    # round-trip (which fails for tensors on a GPU) is not needed.
    vectors.append(model.encode(desc).tolist())

100%|██████████| 27583/27583 [07:36<00:00, 60.41it/s]


In [102]:
# For each query report: the fraction of its known duplicates that appear
# among its TOP_K nearest neighbours by embedding similarity.
TOP_K = 10

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

accuracies = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Ask for one extra hit: the query is part of `vectors` and is expected
    # among its own neighbours, from which it is then removed.
    topk_indeces, _ = top_closest_values_indeces(TOP_K + 1, vector_matrix, vector_matrix[row_of[bug_id]])
    topk = [sample_bug_ids[i] for i in topk_indeces]
    topk.remove(bug_id)

    children_in_top_k = set(children) & set(topk)
    accuracies.append(len(children_in_top_k) / len(children))

100%|██████████| 351/351 [03:54<00:00,  1.50it/s]


In [103]:
# Average, over all query reports, of the fraction of duplicates found among the top neighbours.
np.mean(accuracies)

0.4377031918698585

Calculate the average ranking of each report's children (its duplicates).

In [104]:
# For each query report: the mean 0-based rank of its duplicates when all
# other reports are sorted by similarity to it (rank 0 = most similar).

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

ranking_averages = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Rank every report, then drop the query itself from the ranking.
    ranked_indeces, _ = top_closest_values_indeces(len(vectors), vector_matrix, vector_matrix[row_of[bug_id]])
    ranked = [sample_bug_ids[i] for i in ranked_indeces]
    ranked.remove(bug_id)

    rankings = [ranked.index(child) for child in children]
    ranking_averages.append(np.mean(rankings))

100%|██████████| 351/351 [03:58<00:00,  1.47it/s]


In [105]:
# Average, over all query reports, of the mean 0-based rank of their duplicates.
np.mean(ranking_averages)

505.9727692535385

# Hadoop_old

In [106]:
# Select the project table and build its duplicate groups.
table = "hadoop_old"
union_find = my_utils.UnionFind()
# NOTE(review): min_desc_length presumably filters out reports with very short descriptions - confirm in my_utils.
union_find.process_project(conn, table, min_desc_length=10)

Processing hadoop_old


100%|██████████| 24083/24083 [00:00<00:00, 69930.33it/s]


In [107]:
# All report ids of the table, and those whose stack trace is non-empty.
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [
    candidate_id
    for candidate_id in bug_ids
    if len(my_utils.get_stacktrace(conn, table, candidate_id)) != 0
]

# Report ids known to the union-find, i.e. reports that have duplicates.
bug_ids_w_duplicates = union_find.get_all_children()

# Reports that have duplicates and also carry a stack trace.
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates) & set(bug_ids_w_stacktrace))

In [108]:
# Number of reports that have both duplicates and a stack trace.
len(bug_ids_w_duplicates_and_stacktrace)

238

In [109]:
# Number of reports that have duplicates.
len(bug_ids_w_duplicates)

757

In [110]:
# Number of reports with a non-empty stack trace.
len(bug_ids_w_stacktrace)

3771

## with stacktrace

In [111]:


# positive_set = get_duplicated_pairs(union_find)
# negetive_set = get_non_duplicated_pairs(union_find, conn, len(positive_set)*5)


# bug_ids = list(set([bug_id for t in positive_set for bug_id in t] + [bug_id for t in negetive_set for bug_id in t]))

# sample_bug_ids = list(set(bug_ids_w_duplicates).union(set(bug_ids_w_stacktrace)))
# Embed the description of every report in the table
# (presumably including the stack trace, per the section heading).
sample_bug_ids = bug_ids

vectors = []
for bug_id in tqdm(sample_bug_ids):
    desc = my_utils.get_descriptions(conn, table, bug_id)
    # encode() returns a NumPy array by default, so the tensor -> numpy
    # round-trip (which fails for tensors on a GPU) is not needed.
    vectors.append(model.encode(desc).tolist())

100%|██████████| 24083/24083 [06:35<00:00, 60.89it/s]


In [112]:
# For each query report: the fraction of its known duplicates that appear
# among its TOP_K nearest neighbours by embedding similarity.
TOP_K = 10

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

accuracies = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Ask for one extra hit: the query is part of `vectors` and is expected
    # among its own neighbours, from which it is then removed.
    topk_indeces, _ = top_closest_values_indeces(TOP_K + 1, vector_matrix, vector_matrix[row_of[bug_id]])
    topk = [sample_bug_ids[i] for i in topk_indeces]
    topk.remove(bug_id)

    children_in_top_k = set(children) & set(topk)
    accuracies.append(len(children_in_top_k) / len(children))

100%|██████████| 238/238 [02:18<00:00,  1.71it/s]


In [113]:
# Average, over all query reports, of the fraction of duplicates found among the top neighbours.
np.mean(accuracies)

0.5224089635854342

Calculate the average ranking of each report's children (its duplicates).

In [114]:
# For each query report: the mean 0-based rank of its duplicates when all
# other reports are sorted by similarity to it (rank 0 = most similar).

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

ranking_averages = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Rank every report, then drop the query itself from the ranking.
    ranked_indeces, _ = top_closest_values_indeces(len(vectors), vector_matrix, vector_matrix[row_of[bug_id]])
    ranked = [sample_bug_ids[i] for i in ranked_indeces]
    ranked.remove(bug_id)

    rankings = [ranked.index(child) for child in children]
    ranking_averages.append(np.mean(rankings))

100%|██████████| 238/238 [02:21<00:00,  1.69it/s]


In [115]:
# Average, over all query reports, of the mean 0-based rank of their duplicates.
np.mean(ranking_averages)

1056.4771241830065

## Without stacktrace

In [116]:


# positive_set = get_duplicated_pairs(union_find)
# negetive_set = get_non_duplicated_pairs(union_find, conn, len(positive_set)*5)


# bug_ids = list(set([bug_id for t in positive_set for bug_id in t] + [bug_id for t in negetive_set for bug_id in t]))

# sample_bug_ids = list(set(bug_ids_w_duplicates).union(set(bug_ids_w_stacktrace)))
# Embed every report's description as returned by
# my_utils.get_desc_wo_stacktrace (presumably with the stack trace removed).
sample_bug_ids = bug_ids

vectors = []
for bug_id in tqdm(sample_bug_ids):
    desc = my_utils.get_desc_wo_stacktrace(conn, table, bug_id)
    # encode() returns a NumPy array by default, so the tensor -> numpy
    # round-trip (which fails for tensors on a GPU) is not needed.
    vectors.append(model.encode(desc).tolist())

100%|██████████| 24083/24083 [06:34<00:00, 61.10it/s]


In [117]:
# For each query report: the fraction of its known duplicates that appear
# among its TOP_K nearest neighbours by embedding similarity.
TOP_K = 10

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

accuracies = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Ask for one extra hit: the query is part of `vectors` and is expected
    # among its own neighbours, from which it is then removed.
    topk_indeces, _ = top_closest_values_indeces(TOP_K + 1, vector_matrix, vector_matrix[row_of[bug_id]])
    topk = [sample_bug_ids[i] for i in topk_indeces]
    topk.remove(bug_id)

    children_in_top_k = set(children) & set(topk)
    accuracies.append(len(children_in_top_k) / len(children))

100%|██████████| 238/238 [02:19<00:00,  1.71it/s]


In [118]:
# Average, over all query reports, of the fraction of duplicates found among the top neighbours.
np.mean(accuracies)

0.5469187675070029

Calculate the average ranking of each report's children (its duplicates).

In [119]:
# For each query report: the mean 0-based rank of its duplicates when all
# other reports are sorted by similarity to it (rank 0 = most similar).

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

ranking_averages = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Rank every report, then drop the query itself from the ranking.
    ranked_indeces, _ = top_closest_values_indeces(len(vectors), vector_matrix, vector_matrix[row_of[bug_id]])
    ranked = [sample_bug_ids[i] for i in ranked_indeces]
    ranked.remove(bug_id)

    rankings = [ranked.index(child) for child in children]
    ranking_averages.append(np.mean(rankings))

100%|██████████| 238/238 [02:20<00:00,  1.69it/s]


In [120]:
# Average, over all query reports, of the mean 0-based rank of their duplicates.
np.mean(ranking_averages)

581.1426237161531

# Spark

In [121]:
# Select the project table and build its duplicate groups.
table = "spark"
union_find = my_utils.UnionFind()
# NOTE(review): min_desc_length presumably filters out reports with very short descriptions - confirm in my_utils.
union_find.process_project(conn, table, min_desc_length=10)

Processing spark


  0%|          | 0/9579 [00:00<?, ?it/s]

100%|██████████| 9579/9579 [00:00<00:00, 80689.50it/s]


In [122]:
# All report ids of the table, and those whose stack trace is non-empty.
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [
    candidate_id
    for candidate_id in bug_ids
    if len(my_utils.get_stacktrace(conn, table, candidate_id)) != 0
]

# Report ids known to the union-find, i.e. reports that have duplicates.
bug_ids_w_duplicates = union_find.get_all_children()

# Reports that have duplicates and also carry a stack trace.
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates) & set(bug_ids_w_stacktrace))

In [123]:
# Number of reports that have both duplicates and a stack trace.
len(bug_ids_w_duplicates_and_stacktrace)

109

In [124]:
# Number of reports that have duplicates.
len(bug_ids_w_duplicates)

409

In [125]:
# Number of reports with a non-empty stack trace.
len(bug_ids_w_stacktrace)

1659

## with stacktrace

In [126]:


# positive_set = get_duplicated_pairs(union_find)
# negetive_set = get_non_duplicated_pairs(union_find, conn, len(positive_set)*5)


# bug_ids = list(set([bug_id for t in positive_set for bug_id in t] + [bug_id for t in negetive_set for bug_id in t]))

# sample_bug_ids = list(set(bug_ids_w_duplicates).union(set(bug_ids_w_stacktrace)))
# Embed the description of every report in the table
# (presumably including the stack trace, per the section heading).
sample_bug_ids = bug_ids

vectors = []
for bug_id in tqdm(sample_bug_ids):
    desc = my_utils.get_descriptions(conn, table, bug_id)
    # encode() returns a NumPy array by default, so the tensor -> numpy
    # round-trip (which fails for tensors on a GPU) is not needed.
    vectors.append(model.encode(desc).tolist())

100%|██████████| 9579/9579 [03:01<00:00, 52.69it/s]


In [127]:
# For each query report: the fraction of its known duplicates that appear
# among its TOP_K nearest neighbours by embedding similarity.
TOP_K = 10

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

accuracies = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Ask for one extra hit: the query is part of `vectors` and is expected
    # among its own neighbours, from which it is then removed.
    topk_indeces, _ = top_closest_values_indeces(TOP_K + 1, vector_matrix, vector_matrix[row_of[bug_id]])
    topk = [sample_bug_ids[i] for i in topk_indeces]
    topk.remove(bug_id)

    children_in_top_k = set(children) & set(topk)
    accuracies.append(len(children_in_top_k) / len(children))

100%|██████████| 109/109 [00:28<00:00,  3.89it/s]


In [128]:
# Average, over all query reports, of the fraction of duplicates found among the top neighbours.
np.mean(accuracies)

0.555045871559633

Calculate the average ranking of each report's children (its duplicates).

In [129]:
# For each query report: the mean 0-based rank of its duplicates when all
# other reports are sorted by similarity to it (rank 0 = most similar).

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

ranking_averages = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Rank every report, then drop the query itself from the ranking.
    ranked_indeces, _ = top_closest_values_indeces(len(vectors), vector_matrix, vector_matrix[row_of[bug_id]])
    ranked = [sample_bug_ids[i] for i in ranked_indeces]
    ranked.remove(bug_id)

    rankings = [ranked.index(child) for child in children]
    ranking_averages.append(np.mean(rankings))

  0%|          | 0/109 [00:00<?, ?it/s]

100%|██████████| 109/109 [00:27<00:00,  3.92it/s]


In [130]:
# Average, over all query reports, of the mean 0-based rank of their duplicates.
np.mean(ranking_averages)

435.5519877675841

## Without stacktrace

In [131]:


# positive_set = get_duplicated_pairs(union_find)
# negetive_set = get_non_duplicated_pairs(union_find, conn, len(positive_set)*5)


# bug_ids = list(set([bug_id for t in positive_set for bug_id in t] + [bug_id for t in negetive_set for bug_id in t]))

# sample_bug_ids = list(set(bug_ids_w_duplicates).union(set(bug_ids_w_stacktrace)))
# Embed every report's description as returned by
# my_utils.get_desc_wo_stacktrace (presumably with the stack trace removed).
sample_bug_ids = bug_ids

vectors = []
for bug_id in tqdm(sample_bug_ids):
    desc = my_utils.get_desc_wo_stacktrace(conn, table, bug_id)
    # encode() returns a NumPy array by default, so the tensor -> numpy
    # round-trip (which fails for tensors on a GPU) is not needed.
    vectors.append(model.encode(desc).tolist())

100%|██████████| 9579/9579 [02:56<00:00, 54.15it/s]


In [132]:
# For each query report: the fraction of its known duplicates that appear
# among its TOP_K nearest neighbours by embedding similarity.
TOP_K = 10

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

accuracies = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Ask for one extra hit: the query is part of `vectors` and is expected
    # among its own neighbours, from which it is then removed.
    topk_indeces, _ = top_closest_values_indeces(TOP_K + 1, vector_matrix, vector_matrix[row_of[bug_id]])
    topk = [sample_bug_ids[i] for i in topk_indeces]
    topk.remove(bug_id)

    children_in_top_k = set(children) & set(topk)
    accuracies.append(len(children_in_top_k) / len(children))

100%|██████████| 109/109 [00:27<00:00,  3.96it/s]


In [133]:
# Average, over all query reports, of the fraction of duplicates found among the top neighbours.
np.mean(accuracies)

0.6391437308868502

Calculate the average ranking of each report's children (its duplicates).

In [134]:
# For each query report: the mean 0-based rank of its duplicates when all
# other reports are sorted by similarity to it (rank 0 = most similar).

# Loop-invariant work hoisted out of the per-query loop: previously
# np.array(vectors) was rebuilt and sample_bug_ids scanned for every query.
vector_matrix = np.array(vectors)
row_of = {}
for row, known_id in enumerate(sample_bug_ids):
    row_of.setdefault(known_id, row)  # first occurrence wins, like list.index

ranking_averages = []
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # Reports in the same duplicate group, minus the query itself.
    children = union_find.get_children(bug_id)
    assert bug_id in children
    children.remove(bug_id)
    assert all(child in row_of for child in children)

    # Rank every report, then drop the query itself from the ranking.
    ranked_indeces, _ = top_closest_values_indeces(len(vectors), vector_matrix, vector_matrix[row_of[bug_id]])
    ranked = [sample_bug_ids[i] for i in ranked_indeces]
    ranked.remove(bug_id)

    rankings = [ranked.index(child) for child in children]
    ranking_averages.append(np.mean(rankings))

100%|██████████| 109/109 [00:27<00:00,  3.93it/s]


In [135]:
# Mean, over all query bugs, of the per-bug average (0-based) rank of its children.
np.mean(ranking_averages)

235.86391437308868

# Do duplicates that both have a stacktrace have the same stacktrace?

## Hadoop

In [17]:
# Select the Hadoop table and build its duplicate groups; the following cells in this
# section read both `table` and `union_find`.
table = "hadoop_old"
union_find = my_utils.UnionFind()
# NOTE(review): min_desc_length=10 presumably skips reports with very short descriptions — confirm in my_utils.
union_find.process_project(conn, table, min_desc_length=10)

Processing hadoop_old


100%|██████████| 24083/24083 [00:00<00:00, 69254.78it/s]


In [137]:
# Bug ids of the current table whose extracted stacktrace is non-empty.
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [
    bug_id
    for bug_id in bug_ids
    if len(my_utils.get_stacktrace(conn, table, bug_id)) > 0
]

# Bug ids that have duplicates.
bug_ids_w_duplicates = union_find.get_all_children()

# Bug ids that have both duplicates and a stacktrace.
bug_ids_w_duplicates_and_stacktrace = list(
    set(bug_ids_w_duplicates) & set(bug_ids_w_stacktrace)
)

In [138]:
# Number of bug ids that have both duplicates and a stacktrace.
len(bug_ids_w_duplicates_and_stacktrace)

238

In [142]:
# Collect every pair of bugs from the same duplicate group in which BOTH bugs have a stacktrace.
# Sets replace the original lists for membership tests, which were linear scans inside nested loops.
ids_with_duplicates_and_stacktrace = set(bug_ids_w_duplicates_and_stacktrace)
positive_pairs_with_stacktrace = []
seen = set()
for bug_id in bug_ids_w_duplicates_and_stacktrace:
    # All pairs of a group are emitted the first time any of its members is visited.
    if bug_id in seen: continue
    children = union_find.get_children(bug_id)
    assert(bug_id in children)
    for bug_id1, bug_id2 in combinations(children, 2):
        if bug_id1 in ids_with_duplicates_and_stacktrace and bug_id2 in ids_with_duplicates_and_stacktrace:
            positive_pairs_with_stacktrace.append((bug_id1, bug_id2))
    seen.update(children)
    
    

In [143]:
# Number of duplicate pairs in which both bugs have a stacktrace.
len(positive_pairs_with_stacktrace)

81

In [150]:
# Dump the stacktraces of each duplicate pair one after the other for manual comparison.
pair_divider = "-" * 50   # separates the two bugs of a pair
pair_footer = "*" * 50    # printed twice to close a pair
for bug_id1, bug_id2 in positive_pairs_with_stacktrace:
    for current_id, closing_line in ((bug_id1, pair_divider), (bug_id2, pair_footer)):
        print(current_id)
        print(my_utils.get_stacktrace(conn, table, current_id))
        print(closing_line)
    print(pair_footer)

12660224
2013-07-27 14:17:23,703 ERROR [main] org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: Exception while registeringorg.apache.hadoop.security.AccessControlException: SIMPLE authentication is not enabled.  Available:[TOKEN]	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:39)	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27)	at java.lang.reflect.Constructor.newInstance(Constructor.java:513)	at org.apache.hadoop.yarn.ipc.RPCUtil.instantiateException(RPCUtil.java:53)	at org.apache.hadoop.yarn.ipc.RPCUtil.unwrapAndThrowException(RPCUtil.java:104)	at org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.registerApplicationMaster(ApplicationMasterProtocolPBClientImpl.java:109)	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)	at sun.reflect.NativeMethodAccessorImpl.inv

## Eclipse

In [14]:
# Select the Eclipse table and build its duplicate groups; the following cells in this
# section read both `table` and `union_find`.
table = "eclipse"
union_find = my_utils.UnionFind()
# NOTE(review): min_desc_length=10 presumably skips reports with very short descriptions — confirm in my_utils.
union_find.process_project(conn, table, min_desc_length=10)

Processing eclipse


100%|██████████| 27583/27583 [00:00<00:00, 59699.81it/s]


In [152]:
# Bug ids of the current table whose extracted stacktrace is non-empty.
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [
    bug_id
    for bug_id in bug_ids
    if len(my_utils.get_stacktrace(conn, table, bug_id)) > 0
]

# Bug ids that have duplicates.
bug_ids_w_duplicates = union_find.get_all_children()

# Bug ids that have both duplicates and a stacktrace.
bug_ids_w_duplicates_and_stacktrace = list(
    set(bug_ids_w_duplicates) & set(bug_ids_w_stacktrace)
)

In [153]:
# Number of bug ids that have both duplicates and a stacktrace.
len(bug_ids_w_duplicates_and_stacktrace)

351

In [154]:
# Collect every pair of bugs from the same duplicate group in which BOTH bugs have a stacktrace.
# Sets replace the original lists for membership tests, which were linear scans inside nested loops.
ids_with_duplicates_and_stacktrace = set(bug_ids_w_duplicates_and_stacktrace)
positive_pairs_with_stacktrace = []
seen = set()
for bug_id in bug_ids_w_duplicates_and_stacktrace:
    # All pairs of a group are emitted the first time any of its members is visited.
    if bug_id in seen: continue
    children = union_find.get_children(bug_id)
    assert(bug_id in children)
    for bug_id1, bug_id2 in combinations(children, 2):
        if bug_id1 in ids_with_duplicates_and_stacktrace and bug_id2 in ids_with_duplicates_and_stacktrace:
            positive_pairs_with_stacktrace.append((bug_id1, bug_id2))
    seen.update(children)
    
    

In [155]:
# Number of duplicate pairs in which both bugs have a stacktrace.
len(positive_pairs_with_stacktrace)

421

In [156]:
# Dump the stacktraces of each duplicate pair one after the other for manual comparison.
pair_divider = "-" * 50   # separates the two bugs of a pair
pair_footer = "*" * 50    # printed twice to close a pair
for bug_id1, bug_id2 in positive_pairs_with_stacktrace:
    for current_id, closing_line in ((bug_id1, pair_divider), (bug_id2, pair_footer)):
        print(current_id)
        print(my_utils.get_stacktrace(conn, table, current_id))
        print(closing_line)
    print(pair_footer)

548865
In our Java 11, JavaEE 8 project that uses InitialContext and NamingException throughout the code base, the ECJ often (but not always) reports a compilation error in classes with these two Java types, but that are actually not a problem.NamingException cannot be resolved to a type	DBUpdateHelper.java	/banaan-data-update/src/main/java/nl/banaan/update/util/helper	line 28	Java ProblemNot all places where InitalContext and NamingException are referenced are affected by this bug, which has been plaguing us since 2018.12.
--------------------------------------------------
550263
The type javax.naming.InvalidNameException is not accessible
**************************************************
**************************************************
548865
In our Java 11, JavaEE 8 project that uses InitialContext and NamingException throughout the code base, the ECJ often (but not always) reports a compilation error in classes with these two Java types, but that are actually not a problem.Namin

In [158]:
# Inspect the stored description of bug 562610 in the currently selected `table`.
print(my_utils.get_descriptions(conn, table, 562610))

Created attachment 282619
class file

Java method body:

new StringBuilder((arg0 ? "a" : "b") + "c");

Stack trace:

java.lang.VerifyError: Inconsistent stackmap frames at branch target 16
Exception Details:
  Location:
    com/example/CompilerCheck.method(Z)V @16: ldc
  Reason:
    Type uninitialized 3 (current frame, stack[1]) is not assignable to uninitialized 0 (stack map, stack[1])
  Current Frame:
    bci: @8
    flags: { }
    locals: { integer }
    stack: { uninitialized 0, uninitialized 3, uninitialized 3, integer }
  Stackmap Frame:
    bci: @16
    flags: { }
    locals: { integer }
    stack: { uninitialized 0, uninitialized 0, uninitialized 0 }
  Bytecode:
    0x0000000: bb00 0fbb 000f 591a 9900 0812 11a7 0005
    0x0000010: 1213 b800 19b7 001c 121e b600 22b6 0026
    0x0000020: b700 1cb1                              
  Stackmap Table:
    full_frame(@16,{Integer},{Uninitialized[#0],Uninitialized[#0],Uninitialized[#0]})
    full_frame(@18,{Integer},{Uninitialized[#0],Unin

In [159]:
# Inspect the stored description of bug 559094 in the currently selected `table`.
print(my_utils.get_descriptions(conn, table, 559094))

The following code fails to run when compiled by Eclipse and run as Java application:

public class WELD_000119 {

    public static void main(String[] args) {
        System.out.println(new WELD_000119().test1());
        System.out.println(new WELD_000119().test2());
    }

    // FIXME: Broken in Eclipse 2019-12...
    private String test1() {
        String label = "label";
        String value = "value";
        return this.findSaveMessage("string", (label != null ? label : value) + ": ") + this.getDetailsToError();
    }

    // NOTE: Works as intended...
    private String test2() {
        String label = "label";
        String value = "value";
        return this.findSaveMessage("string", (label != null ? label : value)) + ": " + this.getDetailsToError();
    }

    private String findSaveMessage(String string, Object... params) {
        return "nothing";
    }

    private String getDetailsToError() {
        return "details";
    }
}

In my real world case it leads to WELD-

In [160]:
# Inspect the stored description of bug 568724 in the currently selected `table`.
print(my_utils.get_descriptions(conn, table, 568724))

Created attachment 284735
Screenshot showing missing markers and broken editor

The markers stop being added when it reaches this a particular Java file.
When I double click on the file to open it, I get the pop up saying "offset x, count -x, length x" and a broken editor.

This stacktrace is in the logs.


!ENTRY org.eclipse.jface 4 2 2020-11-11 15:23:40.750
!MESSAGE Problems occurred when invoking code from plug-in: "org.eclipse.jface".
!STACK 0
java.lang.StringIndexOutOfBoundsException: offset 17308, count -9048, length 24227
	at java.base/java.lang.String.checkBoundsOffCount(String.java:3304)
	at java.base/java.lang.String.rangeCheck(String.java:280)
	at java.base/java.lang.String.<init>(String.java:276)
	at org.eclipse.jdt.internal.compiler.parser.Scanner.disambiguatedToken(Scanner.java:5491)
	at org.eclipse.jdt.internal.compiler.parser.Scanner.getNextToken(Scanner.java:1461)
	at org.eclipse.jdt.internal.compiler.parser.diagnose.LexStream.readTokenFromScanner(LexStream.java:84)
	a

In [19]:
# Bug 12602373 is a Hadoop report, but the cell above it in document order sets `table = "eclipse"`.
# Name the table explicitly so this cell does not depend on which section was executed last.
print(my_utils.get_descriptions(conn, "hadoop_old", 12602373))

as we know in hbase 0.94.0 we have a configuration below
  <property>
    <name>hbase.regionserver.wal.enablecompression</name>
         <value>true</value>
  </property>
if we enable it in master cluster and disable it in slave cluster . Then replication will not work. It will throw unwrapRemoteException again and again in master cluster.
2012-08-09 12:49:55,892 WARN org.apache.hadoop.hbase.replication.regionserver.ReplicationSource: Can't replicate because of an error
 on the remote cluster: 
java.io.IOException: IPC server unable to read call parameters: Error in readFields
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:39)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:513)
        at org.apache.hadoop.ipc.RemoteExcep

In [20]:
# Bug 12602373 is a Hadoop report, but the cell above it in document order sets `table = "eclipse"`.
# Name the table explicitly so this cell does not depend on which section was executed last.
print(my_utils.get_stacktrace(conn, "hadoop_old", 12602373))

if we enable it in master cluster and disable it in slave cluster . Then replication will not work. It will throw unwrapRemoteException again and again in master cluster.java.io.IOException: IPC server unable to read call parameters: Error in readFields        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:39)        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27)        at java.lang.reflect.Constructor.newInstance(Constructor.java:513)        at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:95)        at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:79)        at org.apache.hadoop.hbase.replication.regionserver.ReplicationSource.shipEdits(ReplicationSource.java:635)        at org.apache.hadoop.hbase.replication.regionserver.ReplicationSour

Handpick bug reports with a simple stacktrace, then compare the similarity score before and after the stacktraces are extracted