In [1]:
%matplotlib inline
import os
import pickle
import json
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import dotenv
dotenv.load_dotenv()

True

In [2]:
plt.style.use('../style/style-formal.mplstyle')

In [3]:
RESEARCH_DATA = os.environ.get("RESEARCH_DATA")

EL = "attempt_1"
PIDS = ["Lang", "Mockito", "Math", "Time", "Chart"]

SBFL_FORMULA = [
    "tarantula", "ochiai", "dstar",
    "naish1", "naish2", "gp13"
]

TRANSITION_TYPES = {
    "type1": "result_transition",
    "type2": "exception_type_transition",
    "type3": "exception_msg_transition",
    "type4": "stacktrace_transition",
    "type5": "all_types_transition"
}

curr_dir = os.getcwd()
root_dir = os.path.dirname(curr_dir)
dot_exp_config_file = os.path.join(root_dir, ".experiment_config-rq3")
EXP_CONFIG = json.load(open(dot_exp_config_file, "r"))

LINE_CNT = EXP_CONFIG["target_lines"][-1]
MUT_CNT = EXP_CONFIG["mutation_cnt"][-1]
TCS_REDUCTION = EXP_CONFIG["tcs_reduction"]
TCS_EXP_LIST = ["All"]

if len(EXP_CONFIG["target_lines"]) > 1:
    EXPERIMENT_TYPE = "lineCnt"
elif len(EXP_CONFIG["mutation_cnt"]) > 1:
    EXPERIMENT_TYPE = "mutCnt"
else:
    EXPERIMENT_TYPE = "tcsReduction"
    TCS_EXP_LIST.append("Reduced")

TOP_N = [1, 3, 5, 10]

# Create output directories for each PID and a combined results directory
PID_OUT_DIRS = {}
for PID in PIDS:
    # PID_OUT_DIR = os.path.join(RESEARCH_DATA, EL, f"{PID}-v1", "experiment_information_results")
    PID_OUT_DIR = os.path.join(RESEARCH_DATA, EL, f"{PID}", "experiment_information_results")
    if not os.path.exists(PID_OUT_DIR):
        os.makedirs(PID_OUT_DIR, exist_ok=True)
    PID_OUT_DIRS[PID] = PID_OUT_DIR

# Create a combined results directory
COMBINED_OUT_DIR = os.path.join(RESEARCH_DATA, EL, "combined_experiment_results")
if not os.path.exists(COMBINED_OUT_DIR):
    os.makedirs(COMBINED_OUT_DIR, exist_ok=True)

print(f"Processing {len(PIDS)} projects: {PIDS}")
print(f"Individual project results will be saved to respective directories")
print(f"Combined results will be saved to: {COMBINED_OUT_DIR}")

Processing 5 projects: ['Lang', 'Mockito', 'Math', 'Time', 'Chart']
Individual project results will be saved to respective directories
Combined results will be saved to: /ssd_home/yangheechan/d4j_research_data/attempt_1/combined_experiment_results


In [4]:
# set the directory to upper one level so that I can import lib.database
os.chdir(os.path.dirname(curr_dir))
from lib.database import CRUD

In [5]:
def measure_sbfl_execution_time():
    # Initiate CRUD
    db = CRUD(
        host=os.environ.get("DB_HOST"),
        port=os.environ.get("DB_PORT"),
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASSWORD"),
        database=os.environ.get("DB_NAME"),
        slack_channel=os.environ.get("SLACK_CHANNEL"),
        slack_token=os.environ.get("SLACK_TOKEN"),
    )

    sbfl_time = {}
    for pid in PIDS:
        sbfl_time[pid] = {}
        fault_info_res = db.read(
            "d4j_fault_info",
            columns="fault_idx, bug_id",
            conditions={
                "project": pid,
                "experiment_label": EL,
            }
        )

        for fault_idx, bug_id in fault_info_res:
            sbfl_time[pid][bug_id] = {}
            tc_info_res = db.read(
                "d4j_tc_info",
                columns="tc_idx, result, execution_time_ms",
                conditions={
                    "fault_idx": fault_idx,
                }
            )

            for tc_idx, result, execution_time_ms in tc_info_res:
                sbfl_time[pid][bug_id][tc_idx] = execution_time_ms
    
    # with open("exec_ms", "w") as f:
    #     json.dump(sbfl_time, f, indent=4)

    sbfl_total_time_ms = 0
    for pid, bug_data in sbfl_time.items():
        for bug_id, tc_data in bug_data.items():
            for tc_idx, exec_time in tc_data.items():
                sbfl_total_time_ms += exec_time
    
    db.__del__()
    return sbfl_total_time_ms


In [11]:
def record_time_per_method(pid, bid, rid, lineIdx2lineData, mbfl_time):
    if EXPERIMENT_TYPE == "lineCnt":
        exp_list = EXP_CONFIG["target_lines"]
    elif EXPERIMENT_TYPE == "mutCnt":
        exp_list = EXP_CONFIG["mutation_cnt"]
    elif EXPERIMENT_TYPE == "tcsReduction":
        exp_list = TCS_EXP_LIST

    for exp_cnt in exp_list:
        if EXPERIMENT_TYPE == "lineCnt":
            time_key = f"lineCnt{exp_cnt}_mutCnt{MUT_CNT}_tcs{TCS_REDUCTION}_result_transition_total_execution_time_ms"
        elif EXPERIMENT_TYPE == "mutCnt":
            time_key = f"lineCnt{LINE_CNT}_mutCnt{exp_cnt}_tcs{TCS_REDUCTION}_result_transition_total_execution_time_ms"
        elif EXPERIMENT_TYPE == "tcsReduction":
            time_key = f"lineCnt{LINE_CNT}_mutCnt{MUT_CNT}_tcs{exp_cnt}_result_transition_total_execution_time_ms"

        # Get first_data in lineIdx2lineData
        first_data = next(iter(lineIdx2lineData.values()))
        total_execution_time_ms = first_data[time_key]
        # # WARNING WILL BE REMOVED AFTER CONSTRUCTING THE DATA AGAIN
        # first_data = {}
        # total_execution_time_ms = -1
        # for key, data in lineIdx2lineData.items():
        #     if time_key in data.keys():
        #         total_execution_time_ms = data[time_key]

        # if total_execution_time_ms == -1:
        #     time_key = time_key.replace("type1", "result_transition")
        #     # print(f"Warning: No execution time found for {pid}, {bid}, {rid}, {time_key}. Setting to -1.")

        # first_data = {}
        # total_execution_time_ms = -1
        # for key, data in lineIdx2lineData.items():
        #     if time_key in data.keys():
        #         total_execution_time_ms = data[time_key]
        
        # if total_execution_time_ms == -1:
        #     # print(f"Warning: No execution time found for {pid}, {bid}, {rid}, {time_key}. Setting to -1.")
        #     raise ValueError(f"No execution time found for {pid}, {bid}, {rid}, {time_key}. Please check the data.")


        if time_key not in mbfl_time:
            mbfl_time[time_key] = {}
        
        if pid not in mbfl_time[time_key]:
            mbfl_time[time_key][pid] = {}
        
        if bid not in mbfl_time[time_key][pid]:
            mbfl_time[time_key][pid][bid] = {}
        
        if rid not in mbfl_time[time_key][pid][bid]:
            mbfl_time[time_key][pid][bid][rid] = {}

        # total_execution_time_ms = first_data[time_key]
        mbfl_time[time_key][pid][bid][rid] = total_execution_time_ms

def measure_mbfl_execution_time():
    mbfl_time = {}

    for pid in PIDS:
        for rid in range(1, EXP_CONFIG["num_repeats"]+1):
            RID_DIR_NAME = f"repeat_{rid}"
            # RID_DIR = os.path.join(RESEARCH_DATA, EL, f"{pid}-v1", "experiment_raw_results", RID_DIR_NAME)
            RID_DIR = os.path.join(RESEARCH_DATA, EL, f"{pid}", "experiment_raw_results", RID_DIR_NAME)

            for bid_res_file in os.listdir(RID_DIR):
                pck_file = os.path.join(RID_DIR, bid_res_file)
                with open(pck_file, "rb") as f:
                    bid = int(bid_res_file.split("_")[0])

                    lineIdx2lineData = pickle.load(f)
                    record_time_per_method(pid, bid, RID_DIR_NAME, lineIdx2lineData, mbfl_time)
    
    # Measure the average for each repeat across each bugs
    # and measure the total execution time ms
    # for each method
    method2time = {}
    for method, pid_data in mbfl_time.items():
        method2time[method] = []

        pid_total = 0
        for pid, bid_data in pid_data.items():
            bid_total = 0
            for bid, rid_data in bid_data.items():
                rid_time_list = []
                for rid, exec_time in rid_data.items():
                    rid_time_list.append(exec_time)
                bid_total += sum(rid_time_list) / len(rid_time_list)
            pid_total += bid_total
        method2time[method].append(pid_total)
    
    for method, times in method2time.items():
        total_time = sum(times)
        print(f"Method: {method}, Total Execution Time: {total_time} ms")
        
            

    with open("mbfl_exec_ms.json", "w") as f:
        json.dump(mbfl_time, f, indent=4)

    return method2time


In [12]:
def measure_dlfl_execution_time():
    if EXPERIMENT_TYPE == "lineCnt":
        exp_list = EXP_CONFIG["target_lines"]
    elif EXPERIMENT_TYPE == "mutCnt":
        exp_list = EXP_CONFIG["mutation_cnt"]
    elif EXPERIMENT_TYPE == "tcsReduction":
        exp_list = TCS_EXP_LIST
    
    dlfl_time = {}
    for exp_cnt in exp_list:
        if EXPERIMENT_TYPE == "lineCnt":
            method_key = f"lineCnt{exp_cnt}_mutCnt{MUT_CNT}_tcs{TCS_REDUCTION}"
        elif EXPERIMENT_TYPE == "mutCnt":
            method_key = f"lineCnt{LINE_CNT}_mutCnt{exp_cnt}_tcs{TCS_REDUCTION}"
        elif EXPERIMENT_TYPE == "tcsReduction":
            method_key = f"lineCnt{LINE_CNT}_mutCnt{MUT_CNT}_tcs{exp_cnt}"

        if method_key not in dlfl_time:
            dlfl_time[method_key] = {}

        for rid in range(1, EXP_CONFIG["num_repeats"]+1):
            RID_DIR_NAME = f"repeat_{rid}"
            # final_results_json = os.path.join(RESEARCH_DATA, EL, "dlfl_out-v1/experiment_raw_results", RID_DIR_NAME, "methods", method_key, "final_results.json")
            final_results_json = os.path.join(RESEARCH_DATA, EL, "dlfl_out/experiment_raw_results", RID_DIR_NAME, "methods", method_key, "final_results.json")

            if RID_DIR_NAME not in dlfl_time[method_key]:
                dlfl_time[method_key][RID_DIR_NAME] = {}
            
            results_data = json.load(open(final_results_json, "r"))
            time_secs = results_data["total"]["train_time_seconds"]
            dlfl_time[method_key][RID_DIR_NAME] = time_secs
    
    # Measure the average for each repeat across each bugs
    # and measure the total execution time ms
    method2time = {}
    for method, rid_data in dlfl_time.items():
        method2time[method] = []

        rid_total = 0
        for rid, exec_time in rid_data.items():
            rid_total += exec_time
        method2time[method].append(rid_total)

    for method, times in method2time.items():
        total_time = sum(times)
        print(f"Method: {method}, Total Execution Time: {total_time} seconds")

    return method2time


In [13]:
def make_csv_time(sbfl_total_time_ms, mbfl_time, dlfl_time):
    methods2time = {}
    for method, time in dlfl_time.items():
        methods2time[method] = {
            "sbfl": sbfl_total_time_ms/1000,
            "mbfl": 0,
            "dlfl": sum(time)
        }

        for key, mbfl_data in mbfl_time.items():
            if method in key and sum(mbfl_data) != 0:
                methods2time[method]["mbfl"] = sum(mbfl_data)/1000

        methods2time[method]["total_seconds"] = methods2time[method]["sbfl"] + methods2time[method]["mbfl"] + methods2time[method]["dlfl"]
        methods2time[method]["total_minutes"] = methods2time[method]["total_seconds"] / 60
        methods2time[method]["total_hours"] = methods2time[method]["total_minutes"] / 60

    df = pd.DataFrame.from_dict(methods2time, orient='index')
    df = df.reset_index().rename(columns={"index": "method"})
    df = df[["method", "sbfl", "mbfl", "dlfl", "total_seconds", "total_minutes", "total_hours"]]
    df.to_csv(os.path.join(COMBINED_OUT_DIR, f"execution_time_summary-{EXPERIMENT_TYPE}.csv"), index=False)

In [14]:
# 1. Measure SBFL execution time
sbfl_total_time_ms = measure_sbfl_execution_time()
print(f"Total SBFL execution time: {sbfl_total_time_ms} ms")

Total SBFL execution time: 548777.340402993 ms


In [15]:
# 2. Measure MBFL execution time (average across repeats)
method2mbfltime = measure_mbfl_execution_time()

Method: lineCnt70_mutCnt7_tcsAll_result_transition_total_execution_time_ms, Total Execution Time: 1050136324.1384995 ms
Method: lineCnt70_mutCnt7_tcsReduced_result_transition_total_execution_time_ms, Total Execution Time: 404624208.2690169 ms


In [16]:
# 3. Measure DLFL execution time (average across repeats)
method2dlfltime = measure_dlfl_execution_time()

Method: lineCnt70_mutCnt7_tcsAll, Total Execution Time: 1573.4424171447754 seconds
Method: lineCnt70_mutCnt7_tcsReduced, Total Execution Time: 1511.7837388515472 seconds


In [17]:
make_csv_time(sbfl_total_time_ms, method2mbfltime, method2dlfltime)