In [1]:
from datasets import load_from_disk

## 加载验证数据集

In [19]:
def load_validation_dataset(dataset_name):
    """Load one validated dataset from disk, print a banner with it, and return it.

    Args:
        dataset_name: Name of the sub-directory under
            ``../defects4j_validation/dataset_validated/`` to load.

    Returns:
        The object returned by ``load_from_disk`` for that directory.
    """
    dataset_path = f'../defects4j_validation/dataset_validated/{dataset_name}'
    loaded = load_from_disk(dataset_path)
    # Banner with the dataset name, followed by the loaded object's repr.
    banner = f' \n==========\n{dataset_name}\n==========\n'
    print(banner, loaded)
    return loaded

In [None]:
# Load every validated result set; each call also prints a banner for the dataset.
# CodeLlama-based variants
codellama_vanilla = load_validation_dataset('codellama_vanilla')
codellama_classinfo = load_validation_dataset('codellama_classinfo')
codellama_classinfo_lora = load_validation_dataset('codellama_classinfo_lora')
codellama_no_classinfo_lora = load_validation_dataset('codellama_no_classinfo_lora')
# RepairLlama-based variants
repairllama = load_validation_dataset('repairllama')
repairllama_classinfo = load_validation_dataset('repairllama_classinfo')
repairllama_classinfo_lora = load_validation_dataset('repairllama_classinfo_lora')

In [21]:
# Name -> loaded dataset, in the order the summaries below are printed.
dataset_dict = {
    'codellama_vanilla': codellama_vanilla,
    'codellama_classinfo': codellama_classinfo,
    'codellama_classinfo_lora': codellama_classinfo_lora,
    'codellama_no_classinfo_lora': codellama_no_classinfo_lora,
    'repairllama': repairllama,
    'repairllama_classinfo': repairllama_classinfo,
    'repairllama_classinfo_lora': repairllama_classinfo_lora,
}

## 统计correctness

In [22]:
def determine_correctness(correctness_list):
    """Collapse all correctness labels collected for one bug into a single label.

    The labels are checked in priority order ``plausible`` > ``wrong`` >
    ``uncompilable``; the first one present in ``correctness_list`` wins.

    Args:
        correctness_list: Collection of correctness labels gathered for one bug.

    Returns:
        The highest-priority label found, or ``'timeout'`` when none of the
        three known labels is present (this includes an empty collection).
    """
    for label in ('plausible', 'wrong', 'uncompilable'):
        if label in correctness_list:
            return label
    # None of the known labels occurred.
    return 'timeout'

def statistics_by_correctness(dataset):
    """Group bug ids by their overall correctness label.

    Args:
        dataset: Iterable of rows. Each row is indexed with ``'bug_id'`` and
            ``'test_res'``; ``'test_res'`` is a list of dicts that each carry
            a ``'correctness'`` entry.

    Returns:
        A dict mapping each label returned by ``determine_correctness`` to the
        list of bug ids that received it. Only labels that actually occur
        become keys, and rows with an empty ``test_res`` are skipped entirely.
    """
    # Step 1: gather every correctness label seen for each bug id.
    labels_per_bug = {}
    for row in dataset:
        bug_id = row['bug_id']
        outcomes = row['test_res']  # a list of result dicts, not a single dict
        if outcomes:
            labels_per_bug.setdefault(bug_id, []).extend(
                outcome['correctness'] for outcome in outcomes
            )

    # Step 2: reduce each bug to one label and bucket the bug ids by that label.
    bugs_by_label = {}
    for bug_id, labels in labels_per_bug.items():
        bugs_by_label.setdefault(determine_correctness(labels), []).append(bug_id)

    return bugs_by_label



In [30]:
# NOTE(review): 479 is carried over from the original cell; presumably the total
# number of bug ids evaluated per dataset -- confirm before reusing.
TOTAL_BUGS = 479

# Per-dataset summary: number of bugs per overall label, plus the remainder.
res_dict = {}
for name, dataset in dataset_dict.items():
    print(f'========={name}=========')
    res = statistics_by_correctness(dataset)
    lst = ['plausible', 'wrong', 'uncompilable']
    s = 0
    for i in lst:
        # statistics_by_correctness only creates keys for labels that occur,
        # so a dataset with no bug in a category must count as 0 instead of
        # raising KeyError.
        n_bugs = len(res.get(i, []))
        print(f'{i}:', n_bugs)
        s += n_bugs
    # Everything not counted above is reported as timeout or error.
    print('timeout or error', TOTAL_BUGS - s)
    res_dict[name] = res

plausible: 116
wrong: 238
uncompilable: 119
timeout or error 6
plausible: 87
wrong: 274
uncompilable: 113
timeout or error 5
plausible: 87
wrong: 284
uncompilable: 102
timeout or error 6
plausible: 86
wrong: 269
uncompilable: 119
timeout or error 5
plausible: 114
wrong: 254
uncompilable: 106
timeout or error 5
plausible: 114
wrong: 255
uncompilable: 104
timeout or error 6
plausible: 97
wrong: 261
uncompilable: 116
timeout or error 5


In [32]:
def count_correctness(dataset):
    """Count how often each correctness label occurs across all test results.

    Args:
        dataset: Iterable of rows. Each row is indexed with ``'test_res'``,
            a list of dicts that each carry a ``'correctness'`` entry.

    Returns:
        A dict mapping each label to its number of occurrences, with keys in
        first-seen order. Rows with an empty ``test_res`` contribute nothing.
    """
    tally = {}
    for row in dataset:
        outcomes = row['test_res']
        if not outcomes:
            continue
        for outcome in outcomes:
            label = outcome['correctness']
            # A label seen for the first time starts from 0.
            tally[label] = tally.get(label, 0) + 1
    return tally


# Call the function on the codellama_vanilla dataset and print the per-label counts
correctness_count = count_correctness(codellama_vanilla)
print(correctness_count)


{'uncompilable': 2311, 'plausible': 380, 'wrong': 1852, 'timeout': 184}


In [35]:
# Per-dataset summary: number of individual test results per label.
for name, dataset in dataset_dict.items():
    print(f'========={name}=========')
    res = count_correctness(dataset)
    lst = ['plausible', 'wrong', 'uncompilable']
    for i in lst:
        # count_correctness only creates keys for labels it actually saw;
        # report 0 for a missing label instead of raising KeyError.
        print(f'{i}:', res.get(i, 0))

plausible: 380
wrong: 1852
uncompilable: 2311
plausible: 246
wrong: 1748
uncompilable: 2624
plausible: 199
wrong: 1845
uncompilable: 2324
plausible: 210
wrong: 1802
uncompilable: 2449
plausible: 342
wrong: 1910
uncompilable: 2246
plausible: 276
wrong: 1689
uncompilable: 2483
plausible: 199
wrong: 1585
uncompilable: 2636
