## Data features


In [7]:
from collections import defaultdict
import json
import os
import sys
from types import SimpleNamespace
import numpy as np
import typing
from matplotlib import pyplot as plt

# Put the parent of the working directory on the import path (once) so the
# project-local imports that follow can be resolved.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

import downloader.typing.bugswarm as typ_bugswarm
from preprocess.cmd.export_json import Task, load_artifact

from consts import (
    ARTIFACTS_JSON_PATH,
    LOG_FILTERED_DIR,
    EXPORT_DIR,
    PASSED_LOG_NAME,
    FAILED_LOG_NAME,
    DIFF_NO_CTX_NAME,
)

def load_artifact(artifacts_json_path: str):
    """Load the artifacts JSON file and build one ``Task`` per image tag.

    Args:
        artifacts_json_path: Path of the JSON file to read.

    Returns:
        A ``(tasks, artifacts)`` tuple. ``tasks`` is a ``defaultdict`` keyed by
        ``image_tag`` whose values carry ``id``, ``passed_log_id`` and
        ``failed_log_id`` (the job ids converted to strings). ``artifacts`` is
        the decoded JSON content, with every JSON object turned into a
        ``SimpleNamespace``.
    """
    with open(artifacts_json_path) as fp:
        # Decode objects as namespaces so fields are read as attributes below.
        entries = json.load(fp, object_hook=lambda obj: SimpleNamespace(**obj))

    task_by_tag = defaultdict(Task)  # type: dict[str, Task]
    for entry in entries:
        tag = entry.image_tag
        # Indexing the defaultdict creates the Task the first time a tag is seen.
        task = task_by_tag[tag]
        task.id = tag
        task.passed_log_id = str(entry.passed_job.job_id)
        task.failed_log_id = str(entry.failed_job.job_id)
    return task_by_tag, entries

# Both inputs are located two directories above the notebook's working directory.
art_path = os.path.join(os.getcwd(), "../..", ARTIFACTS_JSON_PATH)
tasks, art = load_artifact(art_path)
print(f"Loaded {len(tasks)} tasks")
hunk_path = os.path.abspath(os.path.join(os.getcwd(), "../..", EXPORT_DIR, "data.json"))
# Read through a context manager so the file handle is closed right after
# parsing instead of being left open for the lifetime of the kernel.
with open(hunk_path) as hunk_file:
    hunk_json = json.load(hunk_file)
print(f"Loaded {len(hunk_json)} log hunk groups")
print(f"Preview: {hunk_json[0]}")

Loaded 4478 tasks
Loaded 7821 log hunk groups


In [13]:
# length
# length-after filter
# length-delta
# per type/all
# hunk avg length
# hunk nums
import consts


class Item(typing.TypedDict):
    """Per-artifact record of log sizes and hunk statistics.

    "passed" and "failed" name the two logs of an artifact; the "filtered"
    fields describe the filtered versions of those logs.
    """

    # Sizes of the raw logs, in lines.
    len_passed_by_line: int
    len_failed_by_line: int

    # Sizes of the filtered logs, in lines.
    len_filtered_passed_by_line: int
    len_filtered_failed_by_line: int

    # Sizes of the raw logs, in characters.
    len_passed_by_char: int
    len_failed_by_char: int

    # Sizes of the filtered logs, in characters.
    len_filtered_passed_by_char: int
    len_filtered_failed_by_char: int

    # Language recorded on the artifact.
    lang: str

    # Number of hunk records for the artifact, and the mean length of their
    # "hunk" and "context" values.
    num_hunk: int
    len_hunks_avg: float
    len_context_avg: float


def _read_text(path: str) -> str:
    """Return the whole contents of the text file at ``path``.

    The file is opened with a context manager so the handle is released as soon
    as the contents are read; the loop below reads four files per artifact.
    """
    with open(path) as handle:
        return handle.read()


res = {}  # type: dict[str, Item]

# Bucket the hunk records by artifact id in one pass, so the loop below does
# not rescan all of hunk_json for every id.
hunks_by_id = {}  # type: dict[str, list]
for hunk in hunk_json:
    hunks_by_id.setdefault(hunk["id"], []).append(hunk)

# Index artifacts by image tag for constant-time lookup. setdefault keeps the
# first artifact when a tag repeats, i.e. the same entry a "first match" scan
# would return.
art_by_tag = {}
for entry in art:
    art_by_tag.setdefault(entry.image_tag, entry)

has_log = set(hunks_by_id)
print(f"Has log: {len(has_log)}")
data_folder = os.path.abspath(os.path.join(os.getcwd(), "../.."))
log_dir = os.path.join(data_folder, consts.LOG_DIR)
filtered_dir = os.path.join(data_folder, consts.DIFF_HUNK_FILTERED_DIR)

# Walk the ids in the order they first appear in hunk_json (dicts preserve
# insertion order) rather than in set order, so the processing order and the
# preview printed at the end are the same on every run.
for idx, (task_id, hunk_items) in enumerate(hunks_by_id.items()):
    if idx % 100 == 0:
        print(f"Processing {idx}/{len(has_log)}")
    art_item = art_by_tag[task_id]
    task = tasks[task_id]
    passed_log = _read_text(os.path.join(log_dir, f"{task.passed_log_id}.log"))
    failed_log = _read_text(os.path.join(log_dir, f"{task.failed_log_id}.log"))
    filtered_passed_log = _read_text(
        os.path.join(filtered_dir, task_id, PASSED_LOG_NAME)
    )
    filtered_failed_log = _read_text(
        os.path.join(filtered_dir, task_id, FAILED_LOG_NAME)
    )
    res[task_id] = Item(
        # NOTE(review): count("\n") + 1 reports one line more than the number
        # of newline-terminated lines when a log ends with "\n", and reports 1
        # for an empty log. Kept as-is so the numbers stay comparable with
        # earlier runs.
        len_passed_by_line=passed_log.count("\n") + 1,
        len_failed_by_line=failed_log.count("\n") + 1,
        len_filtered_passed_by_line=filtered_passed_log.count("\n") + 1,
        len_filtered_failed_by_line=filtered_failed_log.count("\n") + 1,
        len_passed_by_char=len(passed_log),
        len_failed_by_char=len(failed_log),
        len_filtered_passed_by_char=len(filtered_passed_log),
        len_filtered_failed_by_char=len(filtered_failed_log),
        lang=art_item.lang,
        num_hunk=len(hunk_items),
        len_hunks_avg=float(np.mean([len(h["hunk"]) for h in hunk_items])),
        len_context_avg=float(np.mean([len(h["context"]) for h in hunk_items])),
    )
print(f"Processed {len(res)} items")
print(f"Preview: {next(iter(res.values()))}")

Has log: 4455
Processing 0/4455


Processing 100/4455
Processing 200/4455
Processing 300/4455
Processing 400/4455
Processing 500/4455
Processing 600/4455
Processing 700/4455
Processing 800/4455
Processing 900/4455
Processing 1000/4455
Processing 1100/4455
Processing 1200/4455
Processing 1300/4455
Processing 1400/4455
Processing 1500/4455
Processing 1600/4455
Processing 1700/4455
Processing 1800/4455
Processing 1900/4455
Processing 2000/4455
Processing 2100/4455
Processing 2200/4455
Processing 2300/4455
Processing 2400/4455
Processing 2500/4455
Processing 2600/4455
Processing 2700/4455
Processing 2800/4455
Processing 2900/4455
Processing 3000/4455
Processing 3100/4455
Processing 3200/4455
Processing 3300/4455
Processing 3400/4455
Processing 3500/4455
Processing 3600/4455
Processing 3700/4455
Processing 3800/4455
Processing 3900/4455
Processing 4000/4455
Processing 4100/4455
Processing 4200/4455
Processing 4300/4455
Processing 4400/4455
Processed 4455 items
Preview: {'len_passed_by_line': 7579, 'len_failed_by_line': 3191

In [18]:
# Report the mean of every collected feature over all processed artifacts.
summary_fields = (
    ("Line count passed", "len_passed_by_line"),
    ("Line count failed", "len_failed_by_line"),
    ("Line count filtered passed", "len_filtered_passed_by_line"),
    ("Line count filtered failed", "len_filtered_failed_by_line"),
    ("Char count passed", "len_passed_by_char"),
    ("Char count failed", "len_failed_by_char"),
    ("Char count filtered passed", "len_filtered_passed_by_char"),
    ("Char count filtered failed", "len_filtered_failed_by_char"),
    ("Hunk count", "num_hunk"),
    ("Hunk avg length", "len_hunks_avg"),
    ("Context avg length", "len_context_avg"),
)
summary_rows = "\n".join(
    f"- {label}: {np.mean([item[key] for item in res.values()])}"
    for label, key in summary_fields
)
# The leading and trailing newlines reproduce the blank lines around the report.
print(f"\nSummary:\n{summary_rows}\n")



Summary:
- Line count passed: 5655.947025813693
- Line count failed: 4084.4466891133557
- Line count filtered passed: 3031.9959595959594
- Line count filtered failed: 2197.5957351290685
- Char count passed: 492095.23501683504
- Char count failed: 359556.30774410773
- Char count filtered passed: 274139.2307519641
- Char count filtered failed: 191430.64915824917
- Hunk count: 1.7555555555555555
- Hunk avg length: 337.9268237934905
- Context avg length: 451.624354657688



In [19]:
# Report the mean of every collected feature over the artifacts whose
# language is Python.
python = [item for item in res.values() if item["lang"] == "Python"]
python_fields = (
    ("Line count passed", "len_passed_by_line"),
    ("Line count failed", "len_failed_by_line"),
    ("Line count filtered passed", "len_filtered_passed_by_line"),
    ("Line count filtered failed", "len_filtered_failed_by_line"),
    ("Char count passed", "len_passed_by_char"),
    ("Char count failed", "len_failed_by_char"),
    ("Char count filtered passed", "len_filtered_passed_by_char"),
    ("Char count filtered failed", "len_filtered_failed_by_char"),
    ("Hunk count", "num_hunk"),
    ("Hunk avg length", "len_hunks_avg"),
    ("Context avg length", "len_context_avg"),
)
python_rows = "\n".join(
    f"- {label}: {np.mean([item[key] for item in python])}"
    for label, key in python_fields
)
# The leading and trailing newlines reproduce the blank lines around the report.
print(f"\nPython:\n{python_rows}\n")


Python:
- Line count passed: 2613.2181818181816
- Line count failed: 2802.677777777778
- Line count filtered passed: 893.0454545454545
- Line count filtered failed: 1106.6338383838383
- Char count passed: 202300.68383838385
- Char count failed: 215747.87727272726
- Char count filtered passed: 65735.13939393939
- Char count filtered failed: 78639.1005050505
- Hunk count: 1.7515151515151515
- Hunk avg length: 180.09873737373738
- Context avg length: 245.74015151515152



In [22]:
# Report the mean of every collected feature over the artifacts whose
# language is Java.
java = [item for item in res.values() if item["lang"] == "Java"]
java_fields = (
    ("Line count passed", "len_passed_by_line"),
    ("Line count failed", "len_failed_by_line"),
    ("Line count filtered passed", "len_filtered_passed_by_line"),
    ("Line count filtered failed", "len_filtered_failed_by_line"),
    ("Char count passed", "len_passed_by_char"),
    ("Char count failed", "len_failed_by_char"),
    ("Char count filtered passed", "len_filtered_passed_by_char"),
    ("Char count filtered failed", "len_filtered_failed_by_char"),
    ("Hunk count", "num_hunk"),
    ("Hunk avg length", "len_hunks_avg"),
    ("Context avg length", "len_context_avg"),
)
java_rows = "\n".join(
    f"- {label}: {np.mean([item[key] for item in java])}"
    for label, key in java_fields
)
# The leading and trailing newlines reproduce the blank lines around the report.
print(f"\nJava:\n{java_rows}\n")


Java:
- Line count passed: 8257.88316008316
- Line count failed: 5195.625363825364
- Line count filtered passed: 4835.913097713098
- Line count filtered failed: 3117.0553014553016
- Char count passed: 740882.3600831601
- Char count failed: 484424.6074844075
- Char count filtered passed: 450832.3405405405
- Char count filtered failed: 287112.9663201663
- Hunk count: 1.7563409563409564
- Hunk avg length: 475.0609147609148
- Context avg length: 628.8386694386694

