# Data Analysis for Bug Localization 

In [22]:
from src.utils.jsonl_utils import get_jsonl_data, get_repos

In [23]:
import os

from omegaconf import OmegaConf

# Location of the YAML data config. The default is the original server path;
# set BUG_LOCALIZATION_DATA_CONFIG to run the notebook on another machine
# without editing this cell.
CONFIG_PATH = os.environ.get(
    'BUG_LOCALIZATION_DATA_CONFIG',
    '/home/tigina/bug-localization/configs/data/server.yaml',
)

# The cells below read the data locations from this object
# (e.g. config.repos_list_path, config.issues_path, config.pulls_path).
config = OmegaConf.load(CONFIG_PATH)

In [24]:
def count_jsonl_data(jsonls_path: str, repo_owner: str, repo_name: str) -> int:
    """Count the records available for one repository.

    :param jsonls_path: passed to ``get_jsonl_data`` as its first argument.
    :param repo_owner: owner of the repository to look up.
    :param repo_name: name of the repository to look up.
    :return: ``len()`` of the value returned by ``get_jsonl_data``, or ``0``
        when it returns ``None``.
    """
    # Look the repository up through the function's own arguments so the
    # result never depends on a notebook-global loop variable.
    jsonl_data = get_jsonl_data(jsonls_path, repo_owner, repo_name)
    if jsonl_data is None:
        return 0
    return len(jsonl_data)

In [25]:
# Running totals accumulated over every repository in the configured list.
issues_count, issue_comments_count = 0, 0
prs_count, prs_comments_count = 0, 0

for repo in get_repos(config.repos_list_path):
    repo_owner, repo_name = repo['owner'], repo['name']
    # Same lookup order as the totals above: issues, issue comments,
    # pulls, pull-request comments.
    issues_count += count_jsonl_data(config.issues_path, repo_owner, repo_name)
    issue_comments_count += count_jsonl_data(config.issues_comments_path, repo_owner, repo_name)
    prs_count += count_jsonl_data(config.pulls_path, repo_owner, repo_name)
    prs_comments_count += count_jsonl_data(config.pull_requests_comments_path, repo_owner, repo_name)

Path /mnt/data/shared-data/lca/pulls_comments_updated/jformdesigner__flatlaf.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comments_updated/cms-sw__cmssw.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comments_updated/yelp__paasta.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comments_updated/zephyrproject-rtos__zephyr.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comments_updated/shipshapecode__tether.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comments_updated/lightninglabs__loop.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comments_updated/odyseeteam__odysee-api.jsonl does not exists
Path /mnt/data/shared-data/lca/issues_prs_updated_dedup/draios__agent-libs.jsonl does not exists
Path /mnt/data/shared-data/lca/comments_updated_dedup/draios__agent-libs.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_updated_dedup/draios__agent-libs.jsonl does not exists
Path /mnt/data/shared-data/lca/pulls_comme

In [26]:
# Report the four totals gathered over all repositories.
totals_report = f"""
    Issues count: {issues_count}
    Issues comments count: {issue_comments_count}
    Pulls count: {prs_count}
    Pulls comments count: {prs_comments_count}
"""
print(totals_report)


    Issues count: 15580465
    Issues comments count: 34437308
    Pulls count: 7027484
    Pulls comments count: 17251762



In [27]:
# Number of entries in the configured repository list.
repos_total = len(get_repos(config.repos_list_path))
print(f"""
    Repos count: {repos_total}
""")


    Repos count: 7755



In [37]:
from collections import defaultdict

# Tallies for issue links. A repository contributes to any of them only when
# get_jsonl_data returns filtered links for it (i.e. not None).
parsed_links_count, links_with_status_count = 0, 0
links_by_status_count = defaultdict(int)

for repo in get_repos(config.repos_list_path):
    repo_owner, repo_name = repo['owner'], repo['name']
    issue_links = get_jsonl_data(config.issues_links_filtered_path, repo_owner, repo_name)
    if issue_links is not None:
        # Per-status histogram of the filtered links.
        for link in issue_links:
            links_by_status_count[link['status']] += 1
        parsed_links_count += count_jsonl_data(config.issues_links_path, repo_owner, repo_name)
        links_with_status_count += len(issue_links)

Path /mnt/data/shared-data/lca/issues_links_filtered_updated/draios__agent-libs.jsonl does not exists
Path /mnt/data/shared-data/lca/issues_links_filtered_updated/mintlayer__mintlayer-core.jsonl does not exists


In [38]:
# Report how many links were parsed and how many of them carry a status.
links_report = f"""
    Issues links count: {parsed_links_count}
    Issues links with status count: {links_with_status_count}
"""
print(links_report)


    Issues links count: 25544771
    Issues links with status count: 25544771



In [39]:
# One "status: count" line per status key, in the mapping's iteration order.
for status in links_by_status_count:
    status_count = links_by_status_count[status]
    print(f"{status}: {status_count}")

not_enough_info: 21193707
issue_not_a_bug: 3472057
ok: 22457
pr_to_multi_issues: 7946
issue_to_multi_prs: 2046
diff_has_new_files: 30574
diff_can_not_extract: 475418
diff_can_not_extract_changed_files: 6809
issue_not_english: 35883
issue_has_media: 145225
diff_non_code_files: 136365
issue_empty: 16265
diff_non_utf8: 19


In [49]:
# Status keys of links_by_status_count, in the order the funnel below
# subtracts them from the running count.
filters_list = ['not_enough_info',
               'issue_not_a_bug', 'issue_empty', 'issue_has_media', 'issue_not_english',
               'diff_can_not_extract', 'diff_has_new_files', 'diff_non_code_files', 'diff_non_utf8',
               'diff_can_not_extract_changed_files',
               'pr_to_multi_issues', 'issue_to_multi_prs'
              ]

# Baseline for the second percentage column: the links that remain once the
# 'not_enough_info' ones are removed. Derived from the counts instead of being
# a hardcoded number, so it stays correct when the underlying data changes.
initial_count = parsed_links_count - links_by_status_count.get('not_enough_info', 0)


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


cur_count = parsed_links_count
print(cur_count)
for f in filters_list:
    # .get() avoids inserting missing statuses into the defaultdict as a
    # side effect of merely reading them.
    removed = links_by_status_count.get(f, 0)
    # First percentage: share of the links still remaining before this filter;
    # second percentage: share of the initial_count baseline.
    print(f, removed, '{:.2f}% {:.2f}%'.format(_percent(removed, cur_count), _percent(removed, initial_count)))
    cur_count -= removed
    print(cur_count)
    

25544771
not_enough_info 21193707 82.97% 487.09%
4351064
issue_not_a_bug 3472057 79.80% 79.80%
879007
issue_empty 16265 1.85% 0.37%
862742
issue_has_media 145225 16.83% 3.34%
717517
issue_not_english 35883 5.00% 0.82%
681634
diff_can_not_extract 475418 69.75% 10.93%
206216
diff_has_new_files 30574 14.83% 0.70%
175642
diff_non_code_files 136365 77.64% 3.13%
39277
diff_non_utf8 19 0.05% 0.00%
39258
diff_can_not_extract_changed_files 6809 17.34% 0.16%
32449
pr_to_multi_issues 7946 24.49% 0.18%
24503
issue_to_multi_prs 2046 8.35% 0.05%
22457
