In [1]:
%matplotlib inline
from datasets import *
from pyvenn import *
from tqdm import tqdm

# LOAD DATASET

In [2]:
def load_validation_dataset(dataset_name):
    """Load one validated result set from disk and print a short summary.

    Args:
        dataset_name: Name of the sub-directory under
            ``../defects4j_validation/dataset_validated/`` to load.

    Returns:
        Whatever ``load_from_disk`` returns for that directory.
    """
    location = f'../defects4j_validation/dataset_validated/{dataset_name}'
    loaded = load_from_disk(location)
    # Banner with the dataset name, followed by the dataset's own repr.
    banner = f' \n==========\n{dataset_name}\n==========\n'
    print(banner, loaded)
    return loaded

In [3]:
# Load every validated result set by directory name. Each call also prints a
# banner and the dataset summary (see load_validation_dataset). The variables
# are kept as explicit module-level names because later cells refer to them.
codellama_vanilla=load_validation_dataset('codellama_vanilla')
codellama_classinfo=load_validation_dataset('codellama_classinfo')
codellama_classinfo_lora=load_validation_dataset('codellama_classinfo_lora')
codellama_no_classinfo_lora=load_validation_dataset('codellama_no_classinfo_lora')
repairllama=load_validation_dataset('repairllama')
repairllama_classinfo=load_validation_dataset('repairllama_classinfo')
repairllama_classinfo_lora=load_validation_dataset('repairllama_classinfo_lora')
repairllama_paper=load_validation_dataset('repairllama_paper')

 
codellama_vanilla
 Dataset({
    features: ['methodInformation', 'involvedTypesInformation', 'filePath', 'classInformation', 'buggyInfo', 'projectName', 'bug_id', 'start_line', 'end_line', 'path', 'fix_code', 'pre_context', 'post_context', 'buggy_code', 'input', 'gen', 'test_res'],
    num_rows: 479
})
 
codellama_classinfo
 Dataset({
    features: ['methodInformation', 'involvedTypesInformation', 'filePath', 'classInformation', 'buggyInfo', 'projectName', 'bug_id', 'start_line', 'end_line', 'path', 'fix_code', 'pre_context', 'post_context', 'buggy_code', 'input', 'gen', 'test_res'],
    num_rows: 479
})
 
codellama_classinfo_lora
 Dataset({
    features: ['methodInformation', 'involvedTypesInformation', 'filePath', 'classInformation', 'buggyInfo', 'projectName', 'bug_id', 'start_line', 'end_line', 'path', 'fix_code', 'pre_context', 'post_context', 'buggy_code', 'input', 'gen', 'test_res'],
    num_rows: 479
})
 
codellama_no_classinfo_lora
 Dataset({
    features: ['methodInformatio

In [4]:
# Name -> dataset lookup used by every analysis cell below; insertion order
# determines the order in which results are reported.
dataset_dict = dict(
    codellama_vanilla=codellama_vanilla,
    codellama_classinfo=codellama_classinfo,
    codellama_classinfo_lora=codellama_classinfo_lora,
    codellama_no_classinfo_lora=codellama_no_classinfo_lora,
    repairllama=repairllama,
    repairllama_classinfo=repairllama_classinfo,
    repairllama_classinfo_lora=repairllama_classinfo_lora,
    repairllama_paper=repairllama_paper,
)

# STATISTICS ANALYSIS

## correctness

In [5]:
def determine_correctness(correctness_list):
    """Collapse the correctness labels of one bug into a single verdict.

    The best label present wins, in the priority order
    ``plausible`` > ``wrong`` > ``uncompilable``.

    Args:
        correctness_list: Container of correctness labels (one per patch).

    Returns:
        The highest-priority label found, or ``'timeout'`` when none of the
        three known labels is present (including an empty container).
    """
    for label in ('plausible', 'wrong', 'uncompilable'):
        if label in correctness_list:
            return label
    # No known label at all: reported as a timeout.
    return 'timeout'

def statistics_by_correctness(dataset):
    """Group bug ids by the best correctness verdict among their patches.

    Args:
        dataset: Iterable of rows; each row provides ``'bug_id'`` and
            ``'test_res'``, where ``'test_res'`` is a list of dicts that each
            carry a ``'correctness'`` label.

    Returns:
        Dict mapping a verdict (as produced by ``determine_correctness``) to
        the list of bug ids that received it. Rows whose ``'test_res'`` is
        empty or missing are skipped, so their bug ids appear in no list.
    """
    # Pass 1: gather every correctness label seen for each bug id.
    labels_by_bug = {}
    for row in dataset:
        bug_id = row['bug_id']
        outcomes = row['test_res']  # a list of per-patch dicts, not a single dict
        if not outcomes:
            continue
        labels_by_bug.setdefault(bug_id, []).extend(
            outcome['correctness'] for outcome in outcomes
        )

    # Pass 2: reduce each bug to one verdict and bucket the bug ids by it.
    bugs_by_verdict = {}
    for bug_id, labels in labels_by_bug.items():
        verdict = determine_correctness(labels)
        bugs_by_verdict.setdefault(verdict, []).append(bug_id)

    return bugs_by_verdict



In [6]:
# Per-dataset verdict breakdown. res_dict keeps the full grouping for the
# later "unique bug_id" analysis.
res_dict = {}
for name, dataset in dataset_dict.items():
    print(f'========={name}=========')
    res = statistics_by_correctness(dataset)
    counted = 0
    for label in ('plausible', 'wrong', 'uncompilable'):
        # A verdict with no bugs has no key in res; report it as 0 instead of
        # raising KeyError.
        bucket = res.get(label, [])
        print(f'{label}:', len(bucket))
        counted += len(bucket)
    # Everything not counted above: bugs whose verdict is 'timeout' plus rows
    # skipped for having no test results. Uses the dataset's real size rather
    # than a hard-coded row count (assumes one row per bug_id - TODO confirm).
    print('timeout or error', len(dataset) - counted)
    res_dict[name] = res

plausible: 116
wrong: 238
uncompilable: 119
timeout or error 6
plausible: 87
wrong: 274
uncompilable: 113
timeout or error 5
plausible: 87
wrong: 284
uncompilable: 102
timeout or error 6
plausible: 86
wrong: 269
uncompilable: 119
timeout or error 5
plausible: 114
wrong: 254
uncompilable: 106
timeout or error 5
plausible: 114
wrong: 255
uncompilable: 104
timeout or error 6
plausible: 97
wrong: 261
uncompilable: 116
timeout or error 5
plausible: 150
wrong: 254
uncompilable: 70
timeout or error 5


## length

In [7]:
def average_patch_length(dataset):
    """Return the mean character length of all patches in ``dataset``.

    Args:
        dataset: Iterable of rows; each row's ``'test_res'`` is a list of
            dicts with a ``'patch'`` entry. Rows with an empty or missing
            ``'test_res'`` contribute nothing.

    Returns:
        Total patch length divided by the number of patches, or ``0`` when
        there are no patches at all (avoids dividing by zero).
    """
    patch_lengths = [
        len(outcome['patch'])
        for row in dataset
        for outcome in (row['test_res'] or ())
    ]
    if not patch_lengths:
        return 0
    return sum(patch_lengths) / len(patch_lengths)


# Report the mean patch length of every loaded dataset.
for name, dataset in dataset_dict.items():
    print(f'========={name}=========')
    avg_length = average_patch_length(dataset)
    print('average patch length', avg_length)

average patch length 174.16966363444044
average patch length 199.437156157427
average patch length 54.74960594460707
average patch length 81.9032400264492
average patch length 147.90901231899718
average patch length 169.2356182499449
average patch length 49.2953125
average patch length 156.5341963322546


## similarity

In [8]:
def levenshtein_distance(s1, s2):
    """Edit distance between two strings, ignoring surrounding whitespace.

    Both inputs are stripped first, so leading/trailing whitespace never
    counts as a difference.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Minimum number of single-character insertions, deletions and
        substitutions that turn one stripped string into the other.
    """
    longer, shorter = s1.strip(), s2.strip()
    # Keep the shorter string on the column axis so each row stays small.
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    if not shorter:
        return len(longer)

    # Two-row dynamic programme: `above` is the previous row of the DP table.
    above = list(range(len(shorter) + 1))
    for row_number, char_long in enumerate(longer, start=1):
        row = [row_number]
        for col, char_short in enumerate(shorter):
            insert_cost = above[col + 1] + 1
            delete_cost = row[col] + 1
            replace_cost = above[col] + (char_long != char_short)
            row.append(min(insert_cost, delete_cost, replace_cost))
        above = row

    return above[-1]

def similarity_score(distance, max_length):
    """Turn an edit distance into a similarity value.

    Args:
        distance: Edit distance between two strings.
        max_length: Length used to normalise the distance.

    Returns:
        ``1 - distance / max_length``, or ``1.0`` when ``max_length`` is zero
        (two empty strings are treated as identical).
    """
    if max_length != 0:
        return 1 - (distance / max_length)
    return 1.0

def average_patch_similarity(dataset):
    """Measure how similar the patches generated for the same bug are.

    For every row, all unordered pairs of its patches are compared with a
    normalised Levenshtein similarity.

    Args:
        dataset: Iterable of rows providing ``'bug_id'`` and ``'test_res'``
            (a list of dicts that each carry a ``'patch'`` string). Rows with
            an empty or missing ``'test_res'`` are skipped.

    Returns:
        A tuple ``(similarity_results, dataset_average_similarity)``:
        ``similarity_results`` maps bug id to the mean pairwise similarity of
        its patches (``1.0`` when the bug has a single patch), and
        ``dataset_average_similarity`` is the mean over every pair of every
        bug (``1.0`` when there are no pairs at all).
    """
    similarity_results = {}
    all_scores = []

    for row in tqdm(dataset):
        bug_id = row['bug_id']
        test_res_list = row['test_res']

        if not test_res_list:
            continue

        # levenshtein_distance() compares the *stripped* strings, so the
        # normalising length must be taken from the stripped strings as well.
        # Otherwise indentation and trailing newlines inflate the similarity.
        patches = [test_res['patch'].strip() for test_res in test_res_list]

        scores = []
        for i in range(len(patches)):
            for j in range(i + 1, len(patches)):
                max_length = max(len(patches[i]), len(patches[j]))
                distance = levenshtein_distance(patches[i], patches[j])
                scores.append(similarity_score(distance, max_length))

        if scores:
            similarity_results[bug_id] = sum(scores) / len(scores)
            # Keep every pair score for the dataset-wide average.
            all_scores.extend(scores)
        else:
            # A single patch has nothing to differ from.
            similarity_results[bug_id] = 1.0

    dataset_average_similarity = sum(all_scores) / len(all_scores) if all_scores else 1.0
    return similarity_results, dataset_average_similarity

In [None]:
# Report the dataset-wide average patch similarity for every loaded dataset.
for name, dataset in dataset_dict.items():
    print(f'========={name}=========')
    sim_res, avg_sim = average_patch_similarity(dataset)
    print(f"Dataset Average Similarity Score = {avg_sim:.2%}")

In [None]:
def max_patch_list_similarity(listA, listB):
    """Return the best similarity between any patch in listA and any in listB.

    Args:
        listA: Iterable of patch strings.
        listB: Iterable of patch strings.

    Returns:
        The maximum normalised Levenshtein similarity over all (A, B) pairs,
        or ``0`` when either list is empty.
    """
    # levenshtein_distance() ignores surrounding whitespace, so strip once up
    # front and normalise by the stripped lengths to stay consistent with it.
    patches_a = [patch.strip() for patch in listA]
    patches_b = [patch.strip() for patch in listB]

    max_similarity = 0
    for patch_a in patches_a:
        for patch_b in patches_b:
            distance = levenshtein_distance(patch_a, patch_b)
            max_length = max(len(patch_a), len(patch_b))
            max_similarity = max(max_similarity, similarity_score(distance, max_length))
            # Similarity cannot exceed 1, so an exact match ends the search.
            if max_similarity >= 1:
                return max_similarity
    return max_similarity

def compare_datasets(dataset_dict, base_dataset_name='codellama_vanilla'):
    """Compare the base dataset's patches with other datasets' plausible ones.

    For every bug that has patches in the base dataset and at least one
    plausible patch in another dataset, record the highest similarity between
    any base patch and any of that dataset's plausible patches.

    Args:
        dataset_dict: Mapping of dataset name to dataset. Each dataset is an
            iterable of rows providing ``'bug_id'`` and ``'test_res'`` (a list
            of dicts with ``'patch'`` and ``'correctness'``).
        base_dataset_name: Key of the dataset whose patches are the reference.

    Returns:
        ``{dataset_name: {bug_id: similarity}}`` for every dataset except the
        base one. Bugs without base patches, without a row in the other
        dataset, or without plausible patches there are omitted.
    """
    results = {}
    base_dataset = dataset_dict[base_dataset_name]

    # The base patches do not depend on the dataset being compared, so they
    # are collected once instead of once per dataset.
    base_patches = [
        (sample['bug_id'], [outcome['patch'] for outcome in sample['test_res']])
        for sample in base_dataset
        if sample['test_res']
    ]

    for dataset_name, dataset in dataset_dict.items():
        if dataset_name == base_dataset_name:
            continue  # Skip comparing the dataset with itself
        print(f'{dataset_name} start!')

        # Index the plausible patches by bug id in a single pass. The previous
        # `dataset.filter(...)[0]` per base row rescanned the whole dataset for
        # every bug and raised IndexError when the bug was absent. The first
        # row per bug id is kept, matching what `filter(...)[0]` selected.
        plausible_by_bug = {}
        for row in dataset:
            row_bug_id = row['bug_id']
            if row_bug_id in plausible_by_bug:
                continue
            plausible_by_bug[row_bug_id] = [
                outcome['patch']
                for outcome in (row['test_res'] or [])
                if outcome['correctness'] == 'plausible'
            ]

        per_bug = {}
        for bug_id, listA in base_patches:
            listB = plausible_by_bug.get(bug_id)
            if not listB:
                continue  # bug missing from this dataset, or nothing plausible
            per_bug[bug_id] = max_patch_list_similarity(listA, listB)
        results[dataset_name] = per_bug

    return results


# Run the cross-dataset comparison against the default base dataset; the
# result is consumed by the "same patch analysis" cell below.
dataset_sim = compare_datasets(dataset_dict)

## same patch analysis

In [31]:
# Count, per dataset, the bugs whose best similarity to a base patch reaches
# the threshold (1 means identical after stripping whitespace), and collect
# those bug ids across all datasets.
sim_threshold = 1
sim_bug_ids = []
for dataset_name, similarities in dataset_sim.items():
    print(f"Comparing with {dataset_name}:")
    # The loop variable must not be called `similarity_score`: at notebook
    # (module) scope that would overwrite the similarity_score() function and
    # break every later call to the similarity helpers.
    matched_ids = [
        bug_id for bug_id, score in similarities.items() if score >= sim_threshold
    ]
    sim_num = len(matched_ids)
    sim_bug_ids.extend(matched_ids)
    print(f"total_num:{len( similarities)}, sim_num: {sim_num}")

Comparing with codellama_classinfo:
total_num:87, sim_num: 53
Comparing with codellama_classinfo_lora:
total_num:87, sim_num: 40
Comparing with codellama_no_classinfo_lora:
total_num:86, sim_num: 50
Comparing with repairllama:
total_num:114, sim_num: 71
Comparing with repairllama_classinfo:
total_num:114, sim_num: 63
Comparing with repairllama_classinfo_lora:
total_num:97, sim_num: 42
Comparing with repairllama_paper:
total_num:150, sim_num: 82


In [None]:
# For every bug collected in sim_bug_ids, show the prompt, the reference fix
# and the plausible patches recorded in repairllama_paper.
for sample in repairllama_paper:
    bug_id = sample['bug_id']
    if bug_id not in sim_bug_ids:
        continue
    print(f"\n====={bug_id}=====\n")
    print(sample['input'])
    print('fix: ', sample['fix_code'])
    for outcome in sample['test_res']:
        if outcome['correctness'] != 'plausible':
            continue
        print('------------------')
        print(outcome['patch'])

# PATCH analysis

## codellama vanilla plausible patches

In [40]:
# Print the prompt, the reference fix and every plausible patch for each
# codellama_vanilla bug that has at least one plausible patch.
for sample in codellama_vanilla:
    outcomes = sample['test_res']
    if not outcomes:
        continue
    plausible_patches = [
        outcome['patch'] for outcome in outcomes if outcome['correctness'] == 'plausible'
    ]
    if not plausible_patches:
        continue

    bug_id = sample['bug_id']
    print(f"\n====={bug_id}=====\n")
    print(sample['input'])
    print('fix: ', sample['fix_code'])

    for patch in plausible_patches:
        print('------------------')
        print(patch)


=====Math-30=====

<PRE>     private double calculateAsymptoticPValue(final double Umin,
                                             final int n1,
                                             final int n2)
        throws ConvergenceException, MaxCountExceededException {
 <SUF>        final double EU = n1n2prod / 2.0;
        final double VarU = n1n2prod * (n1 + n2 + 1) / 12.0;
        final double z = (Umin - EU) / FastMath.sqrt(VarU);
        final NormalDistribution standardNormal = new NormalDistribution(0, 1);
        return 2 * standardNormal.cumulativeProbability(z);
    }
 <MID>
fix:  final double n1n2prod = n1 * n2;
------------------
       final double n1n2prod = n1 * n2;

------------------
       final double n1n2prod = (double) n1 * n2;

------------------
       final double n1n2prod = n1 * n2;
        if (n1n2prod == 0) {
            return 1;
        }

------------------
       final double n1n2prod = ((double) n1) * ((double) n2);

------------------
       final do

## unique bug_id

In [36]:
def find_unique_plausible_ids(res_dict):
    """Find the bugs that only one dataset fixed plausibly.

    Args:
        res_dict: ``{dataset_name: {verdict: [bug_id, ...]}}`` as built from
            ``statistics_by_correctness``. A dataset without a
            ``'plausible'`` entry is treated as having no plausible bugs.

    Returns:
        ``{dataset_name: [bug_id, ...]}`` listing, in their original order,
        the plausible bug ids of each dataset that are not plausible in any
        other dataset. Every dataset in ``res_dict`` gets a (possibly empty)
        list.
    """
    plausible_sets = {
        name: set(correctness_dict.get('plausible', []))
        for name, correctness_dict in res_dict.items()
    }

    unique_plausible_ids = {}
    for name, correctness_dict in res_dict.items():
        # Everything that is plausible in at least one *other* dataset.
        plausible_elsewhere = set().union(
            *(ids for other, ids in plausible_sets.items() if other != name)
        )
        # Exact set membership: the ids are strings, so testing
        # `bug_id in other_id` would be a substring match and would wrongly
        # treat e.g. 'Lang-3' as already covered by 'Lang-37'.
        unique_plausible_ids[name] = [
            bug_id
            for bug_id in correctness_dict.get('plausible', [])
            if bug_id not in plausible_elsewhere
        ]

    return unique_plausible_ids


# Compute the per-dataset unique plausible bug ids from res_dict and print them.
unique_plausible_ids = find_unique_plausible_ids(res_dict)
print(unique_plausible_ids)


{'codellama_vanilla': ['Csv-14', 'Csv-15', 'Compress-41', 'Chart-10', 'Closure-78', 'Mockito-22', 'Gson-18', 'Compress-26', 'Closure-58', 'Lang-37', 'Chart-24', 'Compress-32'], 'codellama_classinfo': ['JacksonXml-4', 'Lang-40'], 'codellama_classinfo_lora': ['Closure-109', 'Lang-38'], 'codellama_no_classinfo_lora': ['Closure-124', 'Jsoup-49'], 'repairllama': ['Compress-24', 'Closure-65', 'Time-18', 'Cli-37', 'Compress-44'], 'repairllama_classinfo': ['JacksonDatabind-24', 'Lang-16', 'JacksonDatabind-39', 'Lang-55'], 'repairllama_classinfo_lora': ['Math-105', 'Codec-2', 'Csv-5']}


In [None]:
# For each dataset, print the prompt, the reference fix and the plausible
# patches of the bugs that only this dataset fixed plausibly.
print('unique bug_id results')
for name, id_lst in unique_plausible_ids.items():
    dataset = dataset_dict[name]

    print(f'========={name}=========')

    # Look all wanted rows up in one pass instead of running a full
    # `dataset.filter(...)` scan per bug id. The first row per id is kept,
    # which is the row `filter(...)[0]` would have returned.
    wanted_ids = set(id_lst)
    rows_by_id = {}
    if wanted_ids:
        for row in dataset:
            if row['bug_id'] in wanted_ids:
                rows_by_id.setdefault(row['bug_id'], row)

    for ids in id_lst:
        sample = rows_by_id[ids]
        print(f'====={ids}=====')
        print(sample['input'])
        print('fix: ', sample['fix_code'])
        # Only the patch text is printed, so there is no need to delete
        # 'test_message' from the result dicts first.
        for i in sample['test_res']:
            if i['correctness']=='plausible':
                print('-----------------')
                print(i['patch'])
    

# repairllama check

In [54]:
# NOTE(review): machine-specific absolute path; anyone re-running this notebook
# elsewhere has to point it at their own copy of the results file.
REPAIRLLAMA_RESULTS_PATH = '/Users/17988/PycharmProjects/repairllama/results/defects4j/repairllama/lora/RepairLLaMA_defects4j_f2f_bugs_results_ir4_or2.jsonl'
repairllama_result = load_dataset(
    'json',
    data_files=REPAIRLLAMA_RESULTS_PATH,
    split='all',
)

Found cached dataset json (C:/Users/17988/.cache/huggingface/datasets/json/default-bfbb90bc52752341/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [60]:
# Collect the bug ids whose 'test_results' field contains any of the three
# success markers below.
SUCCESS_MARKERS = ('Line match', 'Plausible', 'AST match')
plausible_id_list = []
for sample in repairllama_result:
    bug_id, res = sample['bug_id'], sample['test_results']
    if any(marker in res for marker in SUCCESS_MARKERS):
        plausible_id_list.append(bug_id)

In [61]:
# Number of bug ids collected in plausible_id_list (shown as the cell output).
len(plausible_id_list)

196

In [None]:
# Show the bugs listed in plausible_id_list for which repairllama_paper holds
# no plausible patch: print the prompt, the reference fix and every recorded
# result (without its 'test_message').
for sample in repairllama_paper:
    bug_id = sample['bug_id']
    # Other cells skip rows whose 'test_res' is empty or missing; here such a
    # row simply counts as "no plausible patch" instead of failing on
    # iteration (presumably the field can be None - TODO confirm).
    res = sample['test_res'] or []
    if bug_id not in plausible_id_list:
        continue
    if any(patch['correctness'] == 'plausible' for patch in res):
        continue

    print(sample['input'])
    print('fix: ', sample['fix_code'])
    for i in res:
        # Hide the test message by filtering while printing rather than
        # deleting the key from the row's dict.
        print({key: value for key, value in i.items() if key != 'test_message'})