Created Dec 4th 2023

# imports

In [1]:
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

import pynndescent

import sqlite3
import argparse
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
import os
from numpy.linalg import norm
import sys; sys.path.insert(0, '..')
from itertools import combinations, chain
import random
import my_utils
import re

  from .autonotebook import tqdm as notebook_tqdm


# Utils

In [2]:
class UnionFind:
    def __init__(self):
        self.parent = {}  # Dictionary to store parent nodes
        self.ranks = {}    # Dictionary to store rank (or size) of each set
        self.processed = False
        self.project_name = None

    def find(self, x):
        if x not in self.parent:
            self.parent[x] = x
            self.ranks[x] = 1
            return x

        # Path compression
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        root_x = self.find(x)
        root_y = self.find(y)

        if root_x != root_y:
            if self.ranks[root_x] < self.ranks[root_y]:
                self.parent[root_x] = root_y
                self.ranks[root_y] += self.ranks[root_x]
            else:
                self.parent[root_y] = root_x
                self.ranks[root_x] += self.ranks[root_y]
            
    def process_project(self, conn, project_name):
        cursor = conn.cursor()
        self.project_name = project_name
        print("Processing", project_name)
        cursor.execute(f"SELECT * FROM {project_name}")
        for row in cursor.fetchall():
            dup_id = int(row[column_names.index("dup_id")])
            if dup_id == -1: continue
            bug_id = int(row[column_names.index("bug_id")])
            assert(dup_id != bug_id)
            self.union(bug_id, dup_id)
        self.processed = True
            
    def get_roots(self,):
        assert(self.processed)
        return list(set(self.parent.values()))
    
    def get_children(self, parent):
        assert(self.processed)
        parent = self.find(parent)
        children = [key for key, value in self.parent.items() if value == parent]
        return children
    
    def get_all_children(self, ):
        return [key for key, value in self.parent.items()]
    
    def are_dups(this, bug_id1, bug_id2):
        if (bug_id1 not in this.parent.keys() or bug_id2 not in this.parent.keys()):
            return False
        return this.parent[bug_id1] == this.parent[bug_id2]
            

In [3]:
def get_bug_ids(conn, table_name):
    cursor = conn.cursor()
    column_name = "bug_id"

    # Fetch table names using SQL query
    cursor.execute(f"SELECT DISTINCT {column_name} FROM {table_name} ORDER BY {column_name};")
    distinct_values_sorted = cursor.fetchall()

    # Extract table names from the result
    return [value[0] for value in distinct_values_sorted]

In [4]:
def get_column_names(conn, table_name):
    cursor = conn.cursor()

    # Execute a query to get information about the columns in the specified table
    cursor.execute(f"PRAGMA table_info({table_name});")
    columns_info = cursor.fetchall()

    # Extract and return the column names
    column_names = [column[1] for column in columns_info]
    return column_names

In [5]:
def get_code_feature(conn, project_name, bug_id):
    cursor = conn.cursor()

    # Fetch table names using SQL query
    query = f"SELECT * FROM {project_name} WHERE bug_id = {bug_id};"
    # print(query)
    cursor.execute(query)
    result = cursor.fetchall()[0]
    return result[column_names.index("code_feature")]

In [6]:
def get_descriptions(conn, project_name, bug_id):
    cursor = conn.cursor()

    # Fetch table names using SQL query
    query = f"SELECT * FROM {project_name} WHERE bug_id = {bug_id};"
    # print(query)
    cursor.execute(query)
    result = cursor.fetchall()[0]
    desc = result[column_names.index("description")]
    short_desc = result[column_names.index("short_desc")]

    # Extract table names from the result
    return (desc + " \n " + short_desc).replace("\\'", "'")

In [7]:
def vectorize(description, stride_len, chunk_size):
    tokens = tokenizer.tokenize(description)
    # if len og token array is < 32, we do nothing as there is not enough information
    if (len(tokens) < chunk_size // 2): return None

    # remember to add cls and sep token at each chunk
    token_ids = tokenizer.convert_tokens_to_ids([tokenizer.cls_token]+tokens+[tokenizer.sep_token])

    # divide token ids into batche of chunks
    chunk_list=[]
    for i in range(0, len(token_ids), stride_len):
        chunk = token_ids[i:min(i+chunk_size, len(token_ids))]
        assert(len(chunk) <= chunk_size)
        if len(chunk) < chunk_size:
            # keep going
            continue
            # if (len(chunk) < chunk_size // 2): continue
            # pad_length = chunk_size - len(chunk)
            # chunk += [tokenizer.pad_token_id]*pad_length
        assert(len(chunk) == chunk_size)
        # print(chunk)
        chunk_list.append(chunk)

    if(len(chunk_list) == 0): return None
    chunk_arr = np.array(chunk_list)
    # print("Chunk arr size{}".format(chunk_arr.shape))
    # context_embedding = model(torch.tensor(token_ids[:512])[None, :])[0]
    context_embedding = model(torch.tensor(chunk_arr)[:, :])[0]
    return context_embedding.detach().numpy()

In [8]:
def get_duplicated_pairs(union_find):
    roots = union_find.get_roots()
    pairs = []
    for root in roots:
        group = union_find.get_children(root)
        pairs += list(combinations(group, 2))
    return pairs

In [9]:
def get_non_duplicated_pairs(union_find, conn, size):
    from_dup = union_find.get_all_children()
    #sample in some other single reports
    assert(union_find.processed)
    samples = random.sample(get_bug_ids(conn, union_find.project_name), len(from_dup))
    
    pairs = []
    count = 0
    while (count < size):
        pair = random.sample(samples, 2)
        if pair[0] == pair[1] or union_find.are_dups(pair[0], pair[1]):
            continue
        pairs += [(pair[0], pair[1]),]
        count += 1
    return pairs

In [10]:
def get_mislabels(union_find, bug_ids, anchor_bug_id, threshold):
    assert(threshold >= 0 and threshold <= 1)
    ret = []
    for bug_id in tqdm(bug_ids):
        if not union_find.are_dups(anchor_bug_id, bug_id):
            sim_score = get_similarity_of_pair((anchor_bug_id, bug_id),)
            if sim_score > threshold:
                ret += [bug_id]
    return ret

In [11]:

def get_similarity_of_pair_with_code_feature(conn, project_name, pair):
    sent0 = my_utils.get_code_feature(conn, project_name, pair[0])
    sent1 = my_utils.get_code_feature(conn, project_name, pair[1])
    sent_embedding0 = model.encode(sent0,convert_to_tensor=True)
    sent_embedding1 = model.encode(sent1,convert_to_tensor=True)
    return util.pytorch_cos_sim(sent_embedding0, sent_embedding1).numpy()[0, 0]


def get_similarity_of_pair_with_desc(conn, project_name, pair):
    sent0 = my_utils.get_descriptions(conn, project_name, pair[0])
    sent1 = my_utils.get_descriptions(conn, project_name, pair[1])
    sent_embedding0 = model.encode(sent0,convert_to_tensor=True)
    sent_embedding1 = model.encode(sent1,convert_to_tensor=True)
    return util.pytorch_cos_sim(sent_embedding0, sent_embedding1).numpy()[0, 0]

In [12]:
def remove_stacktrace(text):
    # stack_trace_pattern = re.compile(r'\n(Caused\sby:\s)*\w+\.(\w+\.)+\w+.*?\n(?:[ \t]+[at]*.*\n)+', re.DOTALL)
    # stack_trace_pattern = re.compile(r"\n(?:Caused by: )*(?:[a-zA-Z]+\.)+(?:[a-zA-Z]+:*.*)\n(.*\n)*\n(?:[ \t]+[at]*.*\n)*", re.DOTALL)
    
    # stack_trace_pattern = re.compile(r"\n\d*\s*(?:Caused by: )*(?:[a-zA-Z]+\.)+(?:[a-zA-Z]+:*.*)\n(.*\n)*\n*(?:\d*\s*[ \t]+[at]*.*\n)*", re.DOTALL)
    
    caused_by_pattern = re.compile(r"\n\d*\s*(?:Caused by: )*(?:[a-zA-Z]+\.)+[a-zA-Z]+.*", re.DOTALL)
    at_pattern = re.compile(r"\s*\t*at (?:.+\.)+.+.*", re.DOTALL)
    
    result_text = caused_by_pattern.sub('', text)
    result_text = at_pattern.sub('', text)
    return result_text

In [13]:
def is_java_path(word):
    is_long = len(word) >= 15
    has_dots = word.count('.') >= 4
    return is_long or has_dots

def contains_java_path(line):
    for word in line.split(" "):
        if is_java_path(word): return True
    return False

def java_path_is_majority(line):
    total_java_path_length = 0
    for word in line.split(" "):
        if is_java_path(word): total_java_path_length += len(word)
    return total_java_path_length / len(line) > 0.5

def startswith_datetime(line):
    first_word = line.split(" ")[0]
    match1 = re.match(r"\d+-\d+-\d+", first_word) is not None
    match2 = re.match(r"\d+:\d+:\d+.*", first_word) is not None
    return match1 or match2

def get_tag_name(line):
    pattern = re.compile(r'^\s*<([a-zA-Z][^\s>]*)\s*[^>]*>')
    match = pattern.match(line)
    return match.group(1) if match else None

def is_stacktrace_more(line):
    match = re.match(r"... \d+ more", line.strip()) is not None
    return match

def segregate_log_and_stacktrace(text):
    eng = ""
    log_and_stacktrace = ""
    
    for line in text.split("\n"):
        if get_tag_name(line) is not None:
            eng += line + "\n"
            continue
        if len(line) != 0\
            and (line.startswith("at ")\
            or line.startswith("Caused by: ")\
            or java_path_is_majority(line)\
            or startswith_datetime(line)
            or is_stacktrace_more(line)):
            log_and_stacktrace += line + "\n"
        else:
            eng += line + "\n"
    return eng.strip(), log_and_stacktrace.strip()
            

In [14]:
def get_similarity_of_pair_with_desc(conn, project_name, pair):
    sent0 = my_utils.get_descriptions(conn, project_name, pair[0])
    sent1 = my_utils.get_descriptions(conn, project_name, pair[1])
    sent_embedding0 = model.encode(sent0,convert_to_tensor=True)
    sent_embedding1 = model.encode(sent1,convert_to_tensor=True)
    return util.pytorch_cos_sim(sent_embedding0, sent_embedding1).numpy()[0, 0]

def get_similarity_of_pair_with_desc_no_stacktrace(conn, project_name, pair):
    sent0 = remove_stacktrace(my_utils.get_descriptions(conn, project_name, pair[0]))
    sent1 = remove_stacktrace(my_utils.get_descriptions(conn, project_name, pair[1]))
    sent_embedding0 = model.encode(sent0,convert_to_tensor=True)
    sent_embedding1 = model.encode(sent1,convert_to_tensor=True)
    return util.pytorch_cos_sim(sent_embedding0, sent_embedding1).numpy()[0, 0]

def load_content_from_file(file_path):
    try:
        with open(file_path, 'r') as file:
            content = file.read()
        return content
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def get_similarity_of_pair_with_desc_no_stacktrace_from_file(folder, pair):
    file_name1 = str(pair[0]) + ".txt"
    file_name2 = str(pair[1]) + ".txt"
    file_path1 = os.path.join(folder, file_name1)
    file_path2 = os.path.join(folder, file_name2)
    
    sent0 = load_content_from_file(file_path1)
    sent1 = load_content_from_file(file_path2)
    sent_embedding0 = model.encode(sent0,convert_to_tensor=True)
    sent_embedding1 = model.encode(sent1,convert_to_tensor=True)
    return util.pytorch_cos_sim(sent_embedding0, sent_embedding1).numpy()[0, 0]

# Connect to the database

In [15]:
database_path = "../dbrd_processed.db"


conn = sqlite3.connect(database_path)
cursor = conn.cursor()

# getting model

In [16]:
model = SentenceTransformer('all-MiniLM-L6-v2')

## Spark

In [30]:
table = "spark"
union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)

Processing spark


100%|██████████| 9579/9579 [00:00<00:00, 82382.40it/s]


In [31]:
# Get list of bug_ids that has stacktrace
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if len(my_utils.get_stacktrace(conn, table, bug_id)) != 0]

# bug ids what has duplicates
bug_ids_w_duplicates = union_find.get_all_children()

# intersection, bug_ids that has duplicates and stactrace
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates).intersection(set(bug_ids_w_stacktrace)))

In [32]:
len(bug_ids_w_duplicates_and_stacktrace)

109

In [33]:
len(bug_ids_w_stacktrace)

1659

In [34]:
positive_pairs_with_stacktrace = []
seen = []
for bug_id in bug_ids_w_duplicates_and_stacktrace:
    if bug_id in seen: continue
    children = union_find.get_children(bug_id)
    assert(bug_id in children)
    pairs = list(combinations(children, 2))
    for bug_id1, bug_id2 in pairs:
        if bug_id1 in bug_ids_w_duplicates_and_stacktrace and bug_id2 in bug_ids_w_duplicates_and_stacktrace:
            positive_pairs_with_stacktrace.append((bug_id1, bug_id2))
    seen += children
    
    

In [35]:
len(positive_pairs_with_stacktrace)

42

## Hadoop

In [17]:
table = "hadoop_old"
union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)

Processing hadoop_old


100%|██████████| 24083/24083 [00:00<00:00, 72276.88it/s]


In [18]:
# Get list of bug_ids that has stacktrace
bug_ids = my_utils.get_bug_ids(conn, table)
bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if len(my_utils.get_stacktrace(conn, table, bug_id)) != 0]

# bug ids what has duplicates
bug_ids_w_duplicates = union_find.get_all_children()

# intersection, bug_ids that has duplicates and stactrace
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates).intersection(set(bug_ids_w_stacktrace)))

In [19]:
len(bug_ids_w_duplicates_and_stacktrace)

238

In [20]:
len(bug_ids_w_stacktrace)

3771

In [21]:
positive_pairs_with_stacktrace = []
seen = []
for bug_id in bug_ids_w_duplicates_and_stacktrace:
    if bug_id in seen: continue
    children = union_find.get_children(bug_id)
    assert(bug_id in children)
    pairs = list(combinations(children, 2))
    for bug_id1, bug_id2 in pairs:
        if bug_id1 in bug_ids_w_duplicates_and_stacktrace and bug_id2 in bug_ids_w_duplicates_and_stacktrace:
            positive_pairs_with_stacktrace.append((bug_id1, bug_id2))
    seen += children
    
    

In [22]:
len(positive_pairs_with_stacktrace)

81

In [23]:
final_bug_ids_w_duplicates_and_stacktrace = list(set(chain.from_iterable(positive_pairs_with_stacktrace)))
final_bug_ids_w_duplicates_and_stacktrace.sort()

In [24]:
partial_bug_ids_w_duplicates_and_stacktrace = list(set(chain.from_iterable(positive_pairs_with_stacktrace[:20])))
partial_bug_ids_w_duplicates_and_stacktrace.sort()

In [25]:
def get_negative_pairs(union_find, bug_ids, sample_size):
    all_pairs = combinations(bug_ids, 2)
    negative_pairs = []
    for pair in all_pairs:
        if (not union_find.are_dups(pair[0], pair[1])):
            negative_pairs.append(pair)
    if (sample_size == -1):
        return negative_pairs
    return random.sample(negative_pairs, sample_size)

In [26]:
partial_positive_pairs_with_stacktrace = positive_pairs_with_stacktrace[:20]

In [27]:
partial_negative_pairs_with_stacktrace = get_negative_pairs(union_find, partial_bug_ids_w_duplicates_and_stacktrace, -1)

In [None]:
len(partial_positive_pairs_with_stacktrace)

20

In [None]:
len(partial_negative_pairs_with_stacktrace)

386

In [None]:
print(my_utils.get_descriptions(conn, table, 12537362))

MAPREDUCE-3617 addresses the default values of yarn.nodemanager.principal and yarn.resourcemanager.principal
I have enabled authorization with simple authentication. NM <=> RM still attempts kerberos authentication. If simple authentication is enabled yarn.nodemanager.principal and yarn.resourcemanager.principal values should be ignored and simple authentication should be used.
core-site.xml snippet
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
    <description></description>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>true</value>
    <description></description>
  </property>


yarn-site.xml snippet
<property>
  <description>The Kerberos principal for the resource manager.</description>
  <name>yarn.resourcemanager.principal</name>
  <value>rm/sightbusy-lx@LOCALHOST</value>
</property>
<property>
  <description>The kerberos principal for the node manager.</description>
  <name>yarn.nodemanager.principal<

In [None]:
print(segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, 12537362))[0])

I have enabled authorization with simple authentication. NM <=> RM still attempts kerberos authentication. If simple authentication is enabled yarn.nodemanager.principal and yarn.resourcemanager.principal values should be ignored and simple authentication should be used.
core-site.xml snippet
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
    <description></description>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>true</value>
    <description></description>
  </property>


yarn-site.xml snippet
<property>
  <description>The Kerberos principal for the resource manager.</description>
  <name>yarn.resourcemanager.principal</name>
  <value>rm/sightbusy-lx@LOCALHOST</value>
</property>
<property>
  <description>The kerberos principal for the node manager.</description>
  <name>yarn.nodemanager.principal</name>
  <value>nm/sightbusy-lx@LOCALHOST</value>
</property>



 
 Authorization of NM <=> RM with simple au

In [None]:
print(remove_stacktrace(my_utils.get_descriptions(conn, table, 12537362)))

MAPREDUCE-3617 addresses the default values of yarn.nodemanager.principal and yarn.resourcemanager.principal
I have enabled authorization with simple authentication. NM <=> RM still attempts kerberos authentication. If simple authentication is enabled yarn.nodemanager.principal and yarn.resourcemanager.principal values should be ignored and simple authentication should be used.
core-site.xml snippet
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
    <description></description>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>true</value>
    <description></description>
  </property>


yarn-site.xml snippet
<property>
  <description>The Kerberos principal for the resource manager.</description>
  <name>yarn.resourcemanager.principal</name>
  <value>rm/sightbusy-lx@LOCALHOST</value>
</property>
<property>
  <description>The kerberos principal for the node manager.</description>
  <name>yarn.nodemanager.principal<

In [None]:
union_find.get_children(12537362)

[12639198, 12537362]

In [None]:
print(my_utils.get_descriptions(conn, table, 12639198))

During BigTop 0.6.0 release test cycle, Roman Shaposhnik came around the following problem:

013-03-26 15:37:03,573 FATAL
org.apache.hadoop.yarn.server.nodemanager.NodeManager: Error starting
NodeManager
org.apache.hadoop.yarn.YarnException: Failed to Start
org.apache.hadoop.yarn.server.nodemanager.NodeManager
        at org.apache.hadoop.yarn.service.CompositeService.start(CompositeService.java:78)
        at org.apache.hadoop.yarn.server.nodemanager.NodeManager.start(NodeManager.java:199)
        at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:322)
        at org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:359)
Caused by: org.apache.avro.AvroRuntimeException:
java.lang.reflect.UndeclaredThrowableException
        at org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl.start(NodeStatusUpdaterImpl.java:162)
        at org.apache.hadoop.yarn.service.CompositeService.start(CompositeService.java:68)
 

In [None]:
print(segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, 12639198))[0])

During BigTop 0.6.0 release test cycle, Roman Shaposhnik came around the following problem:

NodeManager
authorized for protocol interface



The most significant part is 
User yarn/ip-10-46-37-244.ec2.internal@BIGTOP (auth:KERBEROS) is not authorized for protocol interface  org.apache.hadoop.yarn.server.api.ResourceTrackerPB indicating that ResourceTrackerPB hasn't been annotated with @KerberosInfo nor @TokenInfo 
 $var shell substitution in properties are not expanded in hadoop-policy.xml


In [None]:
print(remove_stacktrace(my_utils.get_descriptions(conn, table, 12639198)))

During BigTop 0.6.0 release test cycle, Roman Shaposhnik came around the following problem:

013-03-26 15:37:03,573 FATAL
org.apache.hadoop.yarn.server.nodemanager.NodeManager: Error starting
NodeManager
org.apache.hadoop.yarn.YarnException: Failed to Start
org.apache.hadoop.yarn.server.nodemanager.NodeManager


### with stacktrace

In [None]:
sim_scores_pos = np.array([get_similarity_of_pair_with_desc(conn, table, pair) for pair in partial_positive_pairs_with_stacktrace])

In [None]:
sim_scores_neg = np.array([get_similarity_of_pair_with_desc(conn, table, pair) for pair in tqdm(partial_negative_pairs_with_stacktrace)])

100%|██████████| 386/386 [00:26<00:00, 14.40it/s]


In [None]:
print(sim_scores_pos.mean(), sim_scores_pos.std())

0.74316394 0.17393094


In [None]:
print(sim_scores_neg.mean(), sim_scores_neg.std())

0.35438177 0.13968097


### without stacktrace

In [None]:
sim_scores_pos_wo_stacktrace = np.array([get_similarity_of_pair_with_desc_no_stacktrace_from_file("spark_wo_stacktrace", pair) for pair in tqdm(partial_positive_pairs_with_stacktrace)])

  0%|          | 0/20 [00:00<?, ?it/s]

File not found: spark_wo_stacktrace/12660224.txt
File not found: spark_wo_stacktrace/12658852.txt


TypeError: 'NoneType' object is not subscriptable

In [None]:
sim_scores_neg_wo_stacktrace = np.array([get_similarity_of_pair_with_desc_no_stacktrace_from_file("spark_wo_stacktrace", pair) for pair in tqdm(partial_negative_pairs_with_stacktrace)])

100%|██████████| 386/386 [00:14<00:00, 25.84it/s]


In [None]:
print(sim_scores_pos_wo_stacktrace.mean(), sim_scores_pos_wo_stacktrace.std())

0.7414124 0.1926197


In [None]:
print(sim_scores_neg_wo_stacktrace.mean(), sim_scores_neg_wo_stacktrace.std())

0.28094012 0.1169239


In [None]:
len(partial_bug_ids_w_duplicates_and_stacktrace)

29

In [None]:
sim_scores_neg[:20]

array([0.34981698, 0.15760064, 0.41249198, 0.41249198, 0.29618025,
       0.29618025, 0.3864901 , 0.09716301, 0.08086804, 0.36586148,
       0.34407666, 0.62588763, 0.59807384, 0.22106531, 0.46586314,
       0.4237856 , 0.5828459 , 0.4907854 , 0.52060944, 0.37772298],
      dtype=float32)

In [None]:
sim_scores_neg_wo_stacktrace[:20]

array([0.37045205, 0.15760064, 0.37853605, 0.37690347, 0.29618025,
       0.29618025, 0.270958  , 0.04622083, 0.07302547, 0.41119844,
       0.34407666, 0.5406481 , 0.4741854 , 0.22106531, 0.46841183,
       0.34125957, 0.5409507 , 0.45556837, 0.52060944, 0.21469279],
      dtype=float32)

In [None]:
partial_negative_pairs_with_stacktrace[11]

(12537362, 12658852)

In [None]:
union_find.are_dups(12537362, 12658852)

False

In [None]:
union_find.get_children(12658852)

[12660224, 12658852]

In [None]:
get_similarity_of_pair_with_desc_no_stacktrace_from_file("spark_wo_stacktrace", partial_negative_pairs_with_stacktrace[11])

0.5406481

In [None]:
get_similarity_of_pair_with_desc_no_stacktrace_from_file("spark_wo_stacktrace", (12660224, 12658852))

0.6042371

In [None]:
for bug_id in partial_bug_ids_w_duplicates_and_stacktrace:
    bug_id1 = 12658852
    sim_score = get_similarity_of_pair_with_desc_no_stacktrace_from_file("spark_wo_stacktrace", (bug_id, 12658852))
    print(bug_id, bug_id1, sim_score)

12537362 12658852 0.5406481
12540965 12658852 0.25529
12545575 12658852 0.13150539
12602372 12658852 0.36987168
12602373 12658852 0.37431195
12602374 12658852 0.20421995
12602377 12658852 0.20421995
12624404 12658852 0.21704678
12639198 12658852 0.54223627
12644379 12658852 0.115709916
12644449 12658852 0.15631133
12649623 12658852 0.32205003
12654402 12658852 0.31348854
12658852 12658852 1.0
12660224 12658852 0.6042371
12672191 12658852 0.2360063
12679949 12658852 0.34648585
12680074 12658852 0.26270974
12688395 12658852 0.37964186
12689278 12658852 0.38430497
12690959 12658852 0.5425202
12700876 12658852 0.12126547
12703250 12658852 0.29401445
12710926 12658852 0.19538037
12722180 12658852 0.36400878
12723052 12658852 0.2703671
12733025 12658852 0.37316945
12743684 12658852 0.20649233
12745107 12658852 0.20893043


In [None]:
sim_scores_pos

array([0.65443754, 0.78060997, 0.99999994, 0.80272967, 0.80272967,
       0.80272967, 0.80272967, 1.        , 0.74767995, 0.8932073 ,
       0.920344  , 0.22961201, 0.48272163, 0.6399281 , 0.8248001 ,
       0.82780427, 0.7060792 , 0.61524755, 0.59533626, 0.7345528 ],
      dtype=float32)

In [None]:
sim_scores_pos_wo_stacktrace

array([0.6042371 , 0.78060997, 0.9995721 , 0.9205553 , 0.9205553 ,
       0.92103803, 0.92103803, 1.        , 0.6384648 , 0.87005013,
       0.7594098 , 0.22961201, 0.67306983, 0.4193321 , 0.7903035 ,
       0.7834386 , 0.71531075, 0.58781797, 0.5592795 , 0.7345528 ],
      dtype=float32)

In [None]:
partial_positive_pairs_with_stacktrace[10]

(12710926, 12649623)

look at pos pairs: (12710926, 12649623)

In [None]:
partial_positive_pairs_with_stacktrace[12]

(12639198, 12537362)

In [None]:
folder_name = "spark_w_stacktrace"
if not os.path.exists(folder_name):
    # If it doesn't exist, create the folder
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created.")
else:
    print(f"Folder '{folder_name}' already exists.")
    
for bug_id in partial_bug_ids_w_duplicates_and_stacktrace:
    file_name = str(bug_id)+"w_stacktrace.txt"
    sent = my_utils.get_descriptions(conn, table, bug_id)
    file_path = os.path.join(folder_name, file_name)
    
    with open(file_path, 'w') as file:
        # Write the content to the file
        file.write(sent)
    

Folder 'spark_w_stacktrace' created.


# Find duplicates within a stacktrace containing search space

## Hadoop

In [None]:
table = "hadoop_old"
union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)

Processing hadoop_old


100%|██████████| 24083/24083 [00:00<00:00, 67240.18it/s]


In [None]:
# Get list of bug_ids that has stacktrace
bug_ids = my_utils.get_bug_ids(conn, table)
# bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if len(my_utils.get_stacktrace(conn, table, bug_id)) != 0]
bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if my_utils.has_log_or_stacktrace(my_utils.get_descriptions(conn, table, bug_id))]

# bug ids what has duplicates
bug_ids_w_duplicates = union_find.get_all_children()

# intersection, bug_ids that has duplicates and stactrace
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates).intersection(set(bug_ids_w_stacktrace)))

### Without stacktrace

In [None]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids_w_stacktrace.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

100%|██████████| 293/293 [00:00<00:00, 9529.92it/s]


In [None]:
len(search_space)

4635

In [None]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 4635/4635 [01:31<00:00, 50.53it/s]


In [None]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

In [None]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 293/293 [00:00<00:00, 610194.18it/s]


In [None]:
neighbors = index.query(np.array(Q_vects), 11)

In [None]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  12722180
Index of query  3479
Index of neighbors  [3479   81 3742 1256  861  612 3440 1994 1050 2929 1298]
Duplicates  [12733025, 12722180]
Index of duplicates  [3742, 3479]


In [None]:
found_in_top_k_wo_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_wo_stacktrace)):
                found_in_top_k_wo_stacktrace[f] += 1
            break
        

100%|██████████| 293/293 [00:00<00:00, 6545.75it/s]


In [None]:
found_in_top_k_wo_stacktrace / 293

array([0.        , 0.34812287, 0.43686007, 0.48464164, 0.5221843 ,
       0.54266212, 0.55290102, 0.57679181, 0.58020478, 0.58361775,
       0.58361775])

### with stacktrace

In [None]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids_w_stacktrace.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

100%|██████████| 293/293 [00:00<00:00, 9546.80it/s]


In [None]:
len(search_space)

4635

In [None]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = my_utils.get_descriptions(conn, table, bug_id)[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 4635/4635 [00:40<00:00, 115.65it/s]


In [None]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

  self._set_arrayXarray(i, j, x)


In [None]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 293/293 [00:00<00:00, 684260.06it/s]


In [None]:
neighbors = index.query(np.array(Q_vects), 11)

In [None]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  12722180
Index of query  3479
Index of neighbors  [ 991 1150  434  950 1087 1145   66  215  486  581 1170]
Duplicates  [12733025, 12722180]
Index of duplicates  [3742, 3479]


In [None]:
found_in_top_k_w_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_w_stacktrace)):
                found_in_top_k_w_stacktrace[f] += 1
            break
        

100%|██████████| 293/293 [00:00<00:00, 5764.05it/s]


In [None]:
found_in_top_k_w_stacktrace / 293

array([0.        , 0.01706485, 0.02730375, 0.03754266, 0.04778157,
       0.05460751, 0.07849829, 0.07849829, 0.0887372 , 0.09215017,
       0.09215017])

In [None]:
found_in_top_k_w_stacktrace

array([ 0,  5,  8, 11, 14, 16, 23, 23, 26, 27, 27])

## Eclipse

In [None]:
table = "eclipse"
union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)

Processing eclipse


100%|██████████| 27583/27583 [00:00<00:00, 57363.92it/s]


In [None]:
# Get list of bug_ids that has stacktrace
bug_ids = my_utils.get_bug_ids(conn, table)
# bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if len(my_utils.get_stacktrace(conn, table, bug_id)) != 0]
bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if my_utils.has_log_or_stacktrace(my_utils.get_descriptions(conn, table, bug_id))]

# bug ids what has duplicates
bug_ids_w_duplicates = union_find.get_all_children()

# intersection, bug_ids that has duplicates and stactrace
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates).intersection(set(bug_ids_w_stacktrace)))

### Without stacktrace

In [None]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids_w_stacktrace.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

100%|██████████| 471/471 [00:00<00:00, 4635.80it/s]


In [None]:
len(search_space)

7282

In [None]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 7282/7282 [02:25<00:00, 49.91it/s]


In [None]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

In [None]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 471/471 [00:00<00:00, 889471.94it/s]


In [None]:
neighbors = index.query(np.array(Q_vects), 11)

In [None]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  532492
Index of query  652
Index of neighbors  [ 652 6628 2510 6775 4618 6313  331 3063 1737  545 4283]
Duplicates  [531749, 529367, 531870, 532492]
Index of duplicates  [516, 18, 545, 652]


In [None]:
found_in_top_k_wo_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_wo_stacktrace)):
                found_in_top_k_wo_stacktrace[f] += 1
            break
        

100%|██████████| 471/471 [00:00<00:00, 3572.71it/s]


In [None]:
found_in_top_k_wo_stacktrace / 471

array([0.        , 0.37791932, 0.47346072, 0.51167728, 0.54352442,
       0.57324841, 0.59023355, 0.59872611, 0.60509554, 0.61358811,
       0.61358811])

### with stacktrace

In [None]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids_w_stacktrace.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

  0%|          | 0/471 [00:00<?, ?it/s]

100%|██████████| 471/471 [00:00<00:00, 4771.58it/s]


In [None]:
len(search_space)

7282

In [None]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = my_utils.get_descriptions(conn, table, bug_id)[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 7282/7282 [01:05<00:00, 110.55it/s]


In [None]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

  self._set_arrayXarray(i, j, x)


In [None]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 471/471 [00:00<00:00, 790839.55it/s]


In [None]:
neighbors = index.query(np.array(Q_vects), 11)

In [None]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  532492
Index of query  652
Index of neighbors  [377 508 165 360 393 445  95 142 172 177 759]
Duplicates  [531749, 529367, 531870, 532492]
Index of duplicates  [516, 18, 545, 652]


In [None]:
found_in_top_k_w_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_w_stacktrace)):
                found_in_top_k_w_stacktrace[f] += 1
            break
        

100%|██████████| 471/471 [00:00<00:00, 3323.84it/s]


In [None]:
found_in_top_k_w_stacktrace / 471

array([0.        , 0.00849257, 0.00849257, 0.01273885, 0.014862  ,
       0.01698514, 0.01698514, 0.01910828, 0.02123142, 0.02972399,
       0.02972399])

In [None]:
found_in_top_k_w_stacktrace

array([ 0,  4,  4,  6,  7,  8,  8,  9, 10, 14, 14])

# Find duplicates within a dataset

## Hadoop

In [19]:
table = "hadoop_old"
union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)

Processing hadoop_old


100%|██████████| 24083/24083 [00:00<00:00, 62315.00it/s]


In [20]:
# Get list of bug_ids that has stacktrace
bug_ids = my_utils.get_bug_ids(conn, table)
# bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if len(my_utils.get_stacktrace(conn, table, bug_id)) != 0]
bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if my_utils.has_log_or_stacktrace(my_utils.get_descriptions(conn, table, bug_id))]

# bug ids what has duplicates
bug_ids_w_duplicates = union_find.get_all_children()

# intersection, bug_ids that has duplicates and stactrace
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates).intersection(set(bug_ids_w_stacktrace)))

### Without stacktrace

In [19]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

100%|██████████| 293/293 [00:00<00:00, 3223.10it/s]


In [20]:
len(search_space)

24083

In [21]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 24083/24083 [06:59<00:00, 57.38it/s]


In [22]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

  self._set_arrayXarray(i, j, x)


In [23]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 293/293 [00:00<00:00, 350622.27it/s]


In [24]:
neighbors = index.query(np.array(Q_vects), 11)

In [25]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  12722180
Index of query  18632
Index of neighbors  [18632  2439   385 20014 12714  8646  9999  2474 16040 18525  6732]
Duplicates  [12733025, 12722180]
Index of duplicates  [20014, 18632]


In [26]:
found_in_top_k_wo_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_wo_stacktrace)):
                found_in_top_k_wo_stacktrace[f] += 1
            break
        

100%|██████████| 293/293 [00:00<00:00, 2052.79it/s]


In [28]:
found_in_top_k_wo_stacktrace / 293

array([0.        , 0.27645051, 0.35494881, 0.39249147, 0.41979522,
       0.43003413, 0.44368601, 0.45733788, 0.47098976, 0.4778157 ,
       0.4778157 ])

### with stacktrace

In [21]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

  0%|          | 0/293 [00:00<?, ?it/s]

100%|██████████| 293/293 [00:00<00:00, 3369.52it/s]


In [22]:
len(search_space)

24083

In [23]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = my_utils.get_descriptions(conn, table, bug_id)[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 24083/24083 [03:37<00:00, 110.98it/s]


In [24]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

  self._set_arrayXarray(i, j, x)


In [25]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 293/293 [00:00<00:00, 227790.75it/s]


In [26]:
neighbors = index.query(np.array(Q_vects), 11)

In [27]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  12722180
Index of query  18632
Index of neighbors  [7655 5664 5682 7525 4818 5206 2660 5681 6709 3472 7915]
Duplicates  [12733025, 12722180]
Index of duplicates  [20014, 18632]


In [28]:
found_in_top_k_w_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_w_stacktrace)):
                found_in_top_k_w_stacktrace[f] += 1
            break
        

100%|██████████| 293/293 [00:00<00:00, 2079.33it/s]


In [31]:
found_in_top_k_w_stacktrace / 293

array([0.        , 0.        , 0.00341297, 0.00341297, 0.00682594,
       0.01023891, 0.01023891, 0.01023891, 0.01706485, 0.01706485,
       0.01706485])

In [30]:
found_in_top_k_w_stacktrace

array([0, 0, 1, 1, 2, 3, 3, 3, 5, 5, 5])

## Eclipse

In [17]:
table = "eclipse"
union_find = my_utils.UnionFind()
union_find.process_project(conn, table, min_desc_length=10)

Processing eclipse


100%|██████████| 27583/27583 [00:00<00:00, 58947.47it/s]


In [18]:
# Get list of bug_ids that has stacktrace
bug_ids = my_utils.get_bug_ids(conn, table)
# bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if len(my_utils.get_stacktrace(conn, table, bug_id)) != 0]
bug_ids_w_stacktrace = [bug_id for bug_id in bug_ids if my_utils.has_log_or_stacktrace(my_utils.get_descriptions(conn, table, bug_id))]

# bug ids what has duplicates
bug_ids_w_duplicates = union_find.get_all_children()

# intersection, bug_ids that has duplicates and stactrace
bug_ids_w_duplicates_and_stacktrace = list(set(bug_ids_w_duplicates).intersection(set(bug_ids_w_stacktrace)))

### Without stacktrace

In [19]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

  0%|          | 0/471 [00:00<?, ?it/s]

100%|██████████| 471/471 [00:00<00:00, 2021.82it/s]


In [20]:
len(search_space)

27583

In [21]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 27583/27583 [08:05<00:00, 56.81it/s]


In [22]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

  self._set_arrayXarray(i, j, x)


In [23]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 471/471 [00:00<00:00, 471483.81it/s]


In [24]:
neighbors = index.query(np.array(Q_vects), 11)

In [25]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  532492
Index of query  2493
Index of neighbors  [ 2493 25873  9701 26476 17296 24525  1141 11677  6521  2020 15826]
Duplicates  [531749, 529367, 531870, 532492]
Index of duplicates  [1925, 34, 2020, 2493]


In [28]:
found_in_top_k_wo_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_wo_stacktrace)):
                found_in_top_k_wo_stacktrace[f] += 1
            break
        

  0%|          | 0/471 [00:00<?, ?it/s]

100%|██████████| 471/471 [00:00<00:00, 1383.14it/s]


In [29]:
found_in_top_k_wo_stacktrace / 471

array([0.        , 0.32484076, 0.41613588, 0.45859873, 0.48619958,
       0.49469214, 0.507431  , 0.54140127, 0.5477707 , 0.56050955,
       0.56050955])

### with stacktrace

In [19]:
# finalize search space by adding duplicates of bug_ids_w_duplicates_and_stacktrace
search_space = bug_ids.copy()
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    dups = union_find.get_children(bug_id)
    for dup in dups:
        if dup != bug_id and dup not in search_space:
            search_space.append(dup)

  0%|          | 0/471 [00:00<?, ?it/s]

100%|██████████| 471/471 [00:00<00:00, 1990.13it/s]


In [20]:
len(search_space)

27583

In [21]:
# convert to vectors

search_space_vects = {}
for bug_id in tqdm(search_space):
    eng = my_utils.get_descriptions(conn, table, bug_id)[0]
    vect = model.encode(eng,convert_to_tensor=True).numpy()
    search_space_vects[bug_id] = vect

100%|██████████| 27583/27583 [04:17<00:00, 107.33it/s]


In [22]:
index = pynndescent.NNDescent(np.array(list(search_space_vects.values())), n_neighbors=100, metric="cosine")
index.prepare()

  self._set_arrayXarray(i, j, x)


In [23]:

Q_vects = []
Q_indices = [search_space.index(bug_id) for bug_id in bug_ids_w_duplicates_and_stacktrace]
for bug_id in tqdm(bug_ids_w_duplicates_and_stacktrace):
    # eng = segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, bug_id))[0]
    vect = search_space_vects[bug_id]
    Q_vects.append(vect)
    

100%|██████████| 471/471 [00:00<00:00, 523870.91it/s]


In [24]:
neighbors = index.query(np.array(Q_vects), 11)

In [25]:
i = 2
q = bug_ids_w_duplicates_and_stacktrace[i]
print("Q = ", q)
print("Index of query ", Q_indices[i])
print("Index of neighbors ", neighbors[0][i])
print("Duplicates ", union_find.get_children(q))
print("Index of duplicates ", [search_space.index(id) for id in union_find.get_children(q)])


Q =  532492
Index of query  2493
Index of neighbors  [2870 4554 1565 2802 3604 3912  785  800 1706 1920 4595]
Duplicates  [531749, 529367, 531870, 532492]
Index of duplicates  [1925, 34, 2020, 2493]


In [26]:
found_in_top_k_w_stacktrace = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
for i in tqdm(range(len(bug_ids_w_duplicates_and_stacktrace))):
    q = bug_ids_w_duplicates_and_stacktrace[i]
    # print("Q = ", q)
    # print("Index of query ", Q_indices[i])
    # print("Index of neighbors ", neighbors[0][i])
    # print("Duplicates ", union_find.get_children(q))
    index_of_duplicates = [search_space.index(id) for id in union_find.get_children(q)]
    # print("Index of duplicates ", index_of_duplicates)
    for result_k in range(1, len(neighbors[0][i][1:])):
        if neighbors[0][i][result_k] in index_of_duplicates:
            # increment from k to 10
            for f in range(result_k, len(found_in_top_k_w_stacktrace)):
                found_in_top_k_w_stacktrace[f] += 1
            break
        

100%|██████████| 471/471 [00:00<00:00, 1377.72it/s]


In [29]:
found_in_top_k_w_stacktrace / 471

array([0.        , 0.        , 0.        , 0.00212314, 0.00424628,
       0.00424628, 0.00424628, 0.00636943, 0.00636943, 0.00636943,
       0.00636943])

In [28]:
found_in_top_k_w_stacktrace

array([0, 0, 0, 1, 2, 2, 2, 3, 3, 3, 3])

# Test

In [36]:
test_bug_ids = [bug_id for bug_id in bug_ids if my_utils.has_log_or_stacktrace(my_utils.get_descriptions(conn, table, bug_id))]

In [37]:
len(test_bug_ids)

2234

In [21]:
print(my_utils.get_descriptions(conn, table, test_bug_ids[3]))

The following mapred ant tests are failing.  This started on December 22nd.
 [junit] Running org.apache.hadoop.mapred.TestTrackerBlacklistAcrossJobs
[junit] Running org.apache.hadoop.mapred.TestMiniMRDFSSort
[junit] Running org.apache.hadoop.mapred.TestBadRecords
[junit] Running org.apache.hadoop.mapred.TestClusterMRNotification
[junit] Running org.apache.hadoop.mapred.TestDebugScript
[junit] Running org.apache.hadoop.mapred.TestJobCleanup
[junit] Running org.apache.hadoop.mapred.TestJobClient
[junit] Running org.apache.hadoop.mapred.TestJobHistory
[junit] Running org.apache.hadoop.mapred.TestJobInProgressListener
[junit] Running org.apache.hadoop.mapred.TestJobKillAndFail
[junit] Running org.apache.hadoop.mapred.TestJvmReuse
[junit] Running org.apache.hadoop.mapred.TestKillSubProcesses
[junit] Running org.apache.hadoop.mapred.TestNodeRefresh
[junit] Running org.apache.hadoop.mapred.TestSetupAndCleanupFailure
[junit] Running org.apache.hadoop.mapred.TestTaskFail
[junit] Running org.apa

In [34]:
print(my_utils.segregate_log_and_stacktrace(my_utils.get_descriptions(conn, table, test_bug_ids[11]))[0])

Hi folks,
I have problems to run penny in my clusters.
First of all, the hadoop cluster is fine, and the pig program performs well.
Yet, when I'm trying to run the penny tool in the MapReduce Mode with the command:
java -cp pig-0.9.1/contrib/penny/java/penny.jar:pig-0.9.1/pig-0.9.1.jar:$HADOOP_CONF_DIR org.apache.pig.penny.apps.ds.Main test.pig
I get the following errors:
The test.pig script just loads a file and then stores it back:
-----------
data = LOAD 'input/student';
STORE data INTO 'output';
-----------
Is it the problem that I miss some environmental variables for penny? Hope someone can help. 
Thanks 
 Can't run penny in MapReduce mode


In [23]:
my_utils.java_path_is_majority("[junit] Running org.apache.hadoop.mapred.TestJvmReuse")

False

In [24]:
len("org.apache.hadoop.mapred.TestJvmReuse") / len("[junit] Running org.apache.hadoop.mapred.TestJvmReuse")

0.6981132075471698

In [25]:
my_utils.contains_java_path("at org.apache.pig.PigServer.<init>(PigServer.java:244)")

False

In [26]:
my_utils.startswith_datetime("11/09/21 21:59:59 INFO mapreduce.Job:  map 50% reduce 0%")

True

In [38]:


def write_string_to_file(file_path, content):
    try:
        # Open the file in write mode ('w')
        with open(file_path, 'w') as file:
            # Write the content to the file
            file.write(content)
        print(f"String has been written to {file_path}.")
    except Exception as e:
        print(f"An error occurred: {e}")
        
def create_folder(folder_path):
    try:
        # Check if the folder exists
        if not os.path.exists(folder_path):
            # If it doesn't exist, create the folder
            os.makedirs(folder_path)
            print(f"Folder '{folder_path}' created.")
        else:
            print(f"Folder '{folder_path}' already exists.")
    except Exception as e:
        print(f"An error occurred: {e}")

def delete_file(file_path):
    try:
        # Check if the file exists
        if os.path.exists(file_path):
            # If it exists, delete the file
            os.remove(file_path)
            print(f"File '{file_path}' deleted.")
        else:
            print(f"File '{file_path}' does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")


create_folder("./"+table)

desc_folder = os.path.join("./"+table, "./desc")
eng_folder = os.path.join("./"+table, "./eng")
logStacktrace_folder = os.path.join("./"+table, "./logStackTrace")

create_folder(desc_folder)
create_folder(eng_folder)
create_folder(logStacktrace_folder)

for bug_id in test_bug_ids:
    filename = str(bug_id)+".txt"
    desc_path = os.path.join(desc_folder, filename)
    eng_path = os.path.join(eng_folder, filename)
    logStacktrace_path = os.path.join(logStacktrace_folder, filename)
    desc = my_utils.get_descriptions(conn, table, bug_id)
    eng, logStacktrace = my_utils.segregate_log_and_stacktrace(desc)
    
    
    write_string_to_file(desc_path, desc)
    write_string_to_file(eng_path, eng)
    write_string_to_file(logStacktrace_path, logStacktrace)
    
    # write_string_to_file(desc, desc_folder)
    # write_string_to_file(eng, eng_folder)
    # write_string_to_file(logStacktrace, logStacktrace_folder)
    # delete_file(desc)
    # delete_file(eng)
    # delete_file(logStacktrace)
    
    
    
    
    

Folder './spark' created.
Folder './spark/./desc' created.
Folder './spark/./eng' created.
Folder './spark/./logStackTrace' created.
String has been written to ./spark/./desc/13127872.txt.
String has been written to ./spark/./eng/13127872.txt.
String has been written to ./spark/./logStackTrace/13127872.txt.
String has been written to ./spark/./desc/13128008.txt.
String has been written to ./spark/./eng/13128008.txt.
String has been written to ./spark/./logStackTrace/13128008.txt.
String has been written to ./spark/./desc/13128077.txt.
String has been written to ./spark/./eng/13128077.txt.
String has been written to ./spark/./logStackTrace/13128077.txt.
String has been written to ./spark/./desc/13128259.txt.
String has been written to ./spark/./eng/13128259.txt.
String has been written to ./spark/./logStackTrace/13128259.txt.
String has been written to ./spark/./desc/13128389.txt.
String has been written to ./spark/./eng/13128389.txt.
String has been written to ./spark/./logStackTrace/1