# Import

In [1]:
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

import sqlite3
import argparse
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
import os
from numpy.linalg import norm
import sys
from itertools import combinations
import random
import json
import my_utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# exclude stacktrace

def remove_stacktrace(dataset):
    desc_wo_stacktrace = []
    for point in tqdm(dataset):
        desc = point["description"]
        stacktraces = point["stacktrace"]
        for stacktrace in stacktraces:
            desc = desc.replace(stacktrace["exception"], "")
            if stacktrace["message"] is not None: desc = desc.replace(stacktrace["message"], "")
            if stacktrace["frames"] is not None:
                for frame in stacktrace["frames"]:
                    desc = desc.replace(frame["function"], "")
                    desc = desc.replace(frame["file"], "")
                    desc = desc.replace(str(frame["fileline"]), "")
        desc = desc.replace("\tat ", "")
        desc = desc.replace("at\n", "")
        desc = desc.replace("(:)", "")
        desc = desc.split("\n\n\n\n", 1)[0]
        desc = desc.strip()
        desc_wo_stacktrace.append(desc)
    return desc_wo_stacktrace
        

In [28]:
def get_stacktrace_as_string(point):
    stacktraces = point["stacktrace"]
    ret = ""
    for stacktrace in stacktraces:
        ret += stacktrace["exception"] + " "
        if stacktrace["message"] is not None: ret += stacktrace["message"] + " "
        if stacktrace["frames"] is not None:
            for frame in stacktrace["frames"]:
                ret += frame["function"] + " "
                if frame["file"] is not None: ret += frame["file"]
                if frame["fileline"] is not None: ret += ":" + str(frame["fileline"]) + " "
    return ret

In [3]:
def get_duplicated_pairs(union_find):
    roots = union_find.get_roots()
    pairs = []
    for root in tqdm(roots):
        group = union_find.get_children(root)
        pairs += list(combinations(group, 2))
    for pair in tqdm(pairs):
        assert(union_find.are_dups(pair[0], pair[1]))
    return pairs

In [4]:
def get_non_duplicated_pairs(union_find, dataset, idx_to_bug_id, size):
    from_dup = union_find.get_all_children()
    #sample in some other single reports
    assert(union_find.processed)
    samples = random.sample(idx_to_bug_id, len(from_dup))
    
    pairs = []
    count = 0
    while (count < size):
        pair = random.sample(samples, 2)
        if pair[0] == pair[1] or union_find.are_dups(pair[0], pair[1]):
            continue
        pairs += [(pair[0], pair[1]),]
        count += 1
    for pair in tqdm(pairs):
        assert(not union_find.are_dups(pair[0], pair[1]))
    return pairs

# Load data

Data is downloaded from https://zenodo.org/records/5746044#.Yej5HvtyZH6

In [5]:
path_to_datasets = "/home/grads/t/tiendat.ng.cs/github_repos/MLDatasets/EMSE_data"

projects = {"campbell_dataset" : "campbell_stacktraces.json", 
            "eclipse_2018" : "eclipse_stacktraces.json", 
            "gnome_2011" : "gnome_stacktraces.json", 
            "netbeans_2016" : "netbeans_stacktraces.json"}

In [6]:
# loading netbeans 2016 dataset
file_path = os.path.join(path_to_datasets, "netbeans_2016", projects["netbeans_2016"])

try:
    with open(file_path, 'r') as json_file:
        dataset = json.load(json_file)
    # print(dataset)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
except FileNotFoundError:
    print(f"File not found: {file_path}")

In [7]:
len(dataset)

65417

In [8]:
num_br_withduplication = 0
for point in dataset:
    if point["dup_id"] is not None:
        num_br_withduplication += 1
print(num_br_withduplication)

13703


In [9]:
idx_to_bug_id = []
for point in dataset:
    idx_to_bug_id.append(point["bug_id"])

In [10]:
union_find = my_utils.UnionFind()
union_find.process_json_data(dataset, "netbeans_2016")

In [11]:
len(union_find.get_roots())

6840

In [12]:
descs_wo_stacktraces = remove_stacktrace(dataset)

  1%|          | 757/65417 [00:00<00:08, 7560.02it/s]

100%|██████████| 65417/65417 [02:26<00:00, 445.68it/s] 


In [13]:
dataset[1]["description"]

'This is the Windows Eastern European character encoding, and it is being used to load various resources. (The same problem exists in pre-Gandalf versions.) Presumably you set the encoding to this so\n that you can load the _cs.properties files internally without hassle, but it is also the encoder that makes its way into the release builds. This converter is supported by the JDK and should be inclu\nded in any port, but apparently the SGI JDK does not include full I18N support (maybe?) and one person had the following message on startup on Irix (SGI port of JDK 1.1.6):\n\njava.io.UnsupportedEncodingException\n        at sun.io.ByteToCharConverter.getConverter(ByteToCharConverter.java:97)\n\n\n        at java.io.InputStreamReader.<init>(InputStreamReader.java:82)\n        at\ncom.netbeans.developer.util.NetbeansBundle.createResourceBundleFromURL(NetbeansBundle.java:319)\n\n        at\ncom.netbeans.developer.util.NetbeansBundle.findBundle(NetbeansBundle.java:294)\n        at\ncom.netbean

In [14]:
descs_wo_stacktraces[1]

'This is the Windows Eastern European character encoding, and it is being used to load various resources. (The same problem exists in pre-Gandalf versions.) Presumably you set the encoding to this so\n that you can load the _cs.properties files internally without hassle, but it is also the encoder that makes its way into the release builds. This converter is supported by the JDK and should be inclu\nded in any port, but apparently the SGI JDK does not include full I18N support (maybe?) and one person had the following message on startup on Irix (SGI port of JDK 1.1.6):\n\n\n        at \n\n\n        at'

# retry sbert

In [15]:
model = SentenceTransformer('all-MiniLM-L6-v2')

## save vector representation of short desc

In [18]:
data_to_save = {}
for point in tqdm(dataset):
    bug_id = point["bug_id"]
    sent = point["short_desc"]
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('netbeans_sbert_short_desc.npz', **data_to_save)

100%|██████████| 65417/65417 [12:17<00:00, 88.72it/s] 


In [20]:
loaded_data = np.load('netbeans_sbert_short_desc.npz')

short_desc_embeddings = []
for point in tqdm(dataset):
    bug_id = str(point["bug_id"])
    short_desc_embeddings.append(loaded_data[bug_id])

100%|██████████| 65417/65417 [01:23<00:00, 779.29it/s]


## Save vector representation of description

In [21]:
data_to_save = {}
for point in tqdm(dataset):
    bug_id = point["bug_id"]
    sent = point["description"]
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('netbeans_sbert_description.npz', **data_to_save)

100%|██████████| 65417/65417 [31:11<00:00, 34.96it/s]   


## Save vector representation of description wo stacktrace

In [22]:
data_to_save = {}
for point, desc_wo_stacktrace in tqdm(zip(dataset, descs_wo_stacktraces)):
    bug_id = point["bug_id"]
    sent = desc_wo_stacktrace
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('netbeans_sbert_description_wo_stacktrace.npz', **data_to_save)

65417it [21:16, 51.23it/s]


## Save vector representation of stacktrace

In [29]:
get_stacktrace_as_string(dataset[0])

'java.lang.ArrayIndexOutOfBoundsException 1 >= 1 java.util.Vector.elementAt Vector.java:328 com.sun.java.swing.JTabbedPane.setIconAt JTabbedPane.java:772 com.netbeans.developer.base.windows.MultiObjectFrame.updateIcons MultiObjectFrame.java:336 com.netbeans.developer.base.windows.MultiObjectFrame.access$8 MultiObjectFrame.java:330 com.netbeans.developer.base.windows.MultiObjectFrame$5.propertyChange MultiObjectFrame.java:250 com.netbeans.developer.util.node.Node.fireOwnPropertyChange Node.java:280 com.netbeans.developer.util.node.Node.fireIconChange Node.java:213 com.netbeans.developerx.loaders.java.JavaNode.resolveIcons JavaNode.java:357 com.netbeans.developerx.loaders.java.JavaNode$3.run JavaNode.java:321 com.netbeans.developer.util.RequestProcessor$1.execute RequestProcessor.java:64 sunw.hotjava.misc.RequestProcessor.run RequestProcessor.java:130 java.lang.Thread.run Thread.java:474 '

In [30]:
data_to_save = {}
for point in tqdm(dataset):
    bug_id = point["bug_id"]
    sent = get_stacktrace_as_string(point)
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('netbeans_sbert_stacktrace.npz', **data_to_save)

100%|██████████| 65417/65417 [44:23<00:00, 24.56it/s]  


# Save eclipse

In [32]:
# loading netbeans 2016 dataset
file_path = os.path.join(path_to_datasets, "eclipse_2018", projects["eclipse_2018"])

try:
    with open(file_path, 'r') as json_file:
        dataset = json.load(json_file)
    # print(dataset)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
except FileNotFoundError:
    print(f"File not found: {file_path}")

In [33]:
len(dataset)

55968

In [34]:
num_br_withduplication = 0
for point in dataset:
    if point["dup_id"] is not None:
        num_br_withduplication += 1
print(num_br_withduplication)

8332


In [35]:
idx_to_bug_id = []
for point in dataset:
    idx_to_bug_id.append(point["bug_id"])

In [36]:
union_find = my_utils.UnionFind()
union_find.process_json_data(dataset, "eclipse_2018")

In [37]:
len(union_find.get_roots())

4297

In [38]:
descs_wo_stacktraces = remove_stacktrace(dataset)

  0%|          | 0/55968 [00:00<?, ?it/s]

100%|██████████| 55968/55968 [00:25<00:00, 2219.44it/s]


In [39]:
dataset[1]["description"]



In [40]:
descs_wo_stacktraces[1]



## save vector representation of short desc

In [41]:
data_to_save = {}
for point in tqdm(dataset):
    bug_id = point["bug_id"]
    sent = point["short_desc"]
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('eclipse_sbert_short_desc.npz', **data_to_save)

100%|██████████| 55968/55968 [09:13<00:00, 101.21it/s]


## Save vector representation of description

In [42]:
data_to_save = {}
for point in tqdm(dataset):
    bug_id = point["bug_id"]
    sent = point["description"]
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('eclipse_sbert_description.npz', **data_to_save)

100%|██████████| 55968/55968 [29:25<00:00, 31.70it/s] 


## Save vector representation of description wo stacktrace

In [43]:
data_to_save = {}
for point, desc_wo_stacktrace in tqdm(zip(dataset, descs_wo_stacktraces)):
    bug_id = point["bug_id"]
    sent = desc_wo_stacktrace
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('eclipse_sbert_description_wo_stacktrace.npz', **data_to_save)

55968it [17:30, 53.29it/s]


## Save vector representation of stacktrace

In [None]:
get_stacktrace_as_string(dataset[0])

'java.lang.ArrayIndexOutOfBoundsException 1 >= 1 java.util.Vector.elementAt Vector.java:328 com.sun.java.swing.JTabbedPane.setIconAt JTabbedPane.java:772 com.netbeans.developer.base.windows.MultiObjectFrame.updateIcons MultiObjectFrame.java:336 com.netbeans.developer.base.windows.MultiObjectFrame.access$8 MultiObjectFrame.java:330 com.netbeans.developer.base.windows.MultiObjectFrame$5.propertyChange MultiObjectFrame.java:250 com.netbeans.developer.util.node.Node.fireOwnPropertyChange Node.java:280 com.netbeans.developer.util.node.Node.fireIconChange Node.java:213 com.netbeans.developerx.loaders.java.JavaNode.resolveIcons JavaNode.java:357 com.netbeans.developerx.loaders.java.JavaNode$3.run JavaNode.java:321 com.netbeans.developer.util.RequestProcessor$1.execute RequestProcessor.java:64 sunw.hotjava.misc.RequestProcessor.run RequestProcessor.java:130 java.lang.Thread.run Thread.java:474 '

In [44]:
data_to_save = {}
for point in tqdm(dataset):
    bug_id = point["bug_id"]
    sent = get_stacktrace_as_string(point)
    sent_embedding = model.encode(sent,convert_to_tensor=True).numpy()
    data_to_save[str(bug_id)] = sent_embedding

np.savez('eclipse_sbert_stacktrace.npz', **data_to_save)

100%|██████████| 55968/55968 [27:45<00:00, 33.61it/s]  
