## list defects4j projects used by repairllama

In [38]:
import os

def list_subfolders(directory):
    """列出指定目录下所有子文件夹的名称"""
    subfolders = [f.name for f in os.scandir(directory) if f.is_dir()]
    return subfolders

# 使用示例
directory_path = '/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/repairllama/fft/patches'
# directory_path = '/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/codellama/patches'
subfolders = list_subfolders(directory_path)
print(len(subfolders))

479


## find the patch file

In [39]:
def find_path_file(project_id):
    project, number=project_id.split('-')[0], project_id.split('-')[1]
    base_path='/Users/alex.wu/defects4j/framework/projects'
    file_path=f'{base_path}/{project}/patches/{number}.src.patch'
    return file_path

def find_project_root_file(project_id):
    project, number=project_id.split('-')[0].lower(), project_id.split('-')[1]
    base_path='/Users/alex.wu/defects4j_projects_buggy'
    file_path=f'{base_path}/{project.lower()}_{number}'
    return file_path

## get file startline

In [40]:
import re

def extract_line_number_and_code_path_from_diff_file(file_path):
    """
    从存储在文件中的diff内容提取更改开始的行号和代码地址。
    
    参数:
    - file_path: 字符串，指向包含diff的文本文件的路径。
    
    返回:
    - tuple: (行号, 代码地址)。如果没有找到，返回(None, None)。
    """
    try:
        with open(file_path, 'r') as file:
            diff_content = file.read()
            # 使用正则表达式匹配文件路径
            path_match = re.search(r'diff --git a/(.*?) b/', diff_content)
            code_path = path_match.group(1) if path_match else None
            
            # 使用正则表达式匹配@@ -x,y +x,y @@格式以提取行号
            line_match = re.search(r'@@ -\d+,\d+ \+(\d+),\d+ @@', diff_content)
            line_number = int(line_match.group(1)) + 3 if line_match else None
            
            return line_number, code_path
    except FileNotFoundError:
        print(f"文件'{file_path}'未找到。")
        return None, None
    except Exception as e:
        print(f"读取文件时发生错误：{e}")
        return None, None

# 注意：这里的find_path_file函数或变量似乎未定义。请确保替换file_path为正确的diff文件路径。
file_path = find_path_file('Chart-1')

# 使用函数并打印结果
line_number, code_path = extract_line_number_and_code_path_from_diff_file(file_path)
if line_number is not None and code_path is not None:
    print(f"更改开始的行号是：{line_number}")
    print(f"代码地址是：{code_path}")
else:
    print("未能从diff中提取出行号和代码地址。")



更改开始的行号是：1797
代码地址是：source/org/jfree/chart/renderer/category/AbstractCategoryItemRenderer.java


In [41]:
import os

def find_file_by_tail(root_dir, tail_path):
    """
    在指定的根目录下查找路径尾部匹配给定尾部路径的文件地址。

    参数:
    - root_dir: 字符串，要搜索的根目录路径。
    - tail_path: 字符串，需要匹配的文件地址路径尾部。

    返回:
    - 完整的文件地址，如果找到匹配的文件。否则返回None。
    """
    # 确保尾部路径以斜杠开头
    if not tail_path.startswith('/'):
        tail_path = '/' + tail_path

    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            # 构造当前文件的完整路径
            file_path = os.path.join(dirpath, filename)
            # 检查文件路径是否以指定的尾部匹配
            if file_path.endswith(tail_path):
                return file_path  # 返回匹配的完整文件地址

    # 如果遍历了整个根目录但没有找到匹配，返回None
    return None

# 示例用法
root_dir = '/Users/alex.wu/defects4j_projects_buggy/chart_1'  # 要搜索的根目录
tail_path = 'source/org/jfree/chart/renderer/category/AbstractCategoryItemRenderer.java'  # 需要匹配的文件地址路径尾部
matched_file = find_file_by_tail(root_dir, tail_path)

if matched_file:
    print(f"找到匹配的文件地址：{matched_file}")
else:
    print("没有找到匹配的文件。")


找到匹配的文件地址：/Users/alex.wu/defects4j_projects_buggy/chart_1/source/org/jfree/chart/renderer/category/AbstractCategoryItemRenderer.java


## find repairllama prompt information

In [42]:
def find_repairllama_path_file(project_id):
    base_path='/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/repairllama/fft/patches'
    prompt_file_path=f'{base_path}/{project_id}/prompt.txt'
    target_file_path=f'{base_path}/{project_id}/target.diff'
    return prompt_file_path, target_file_path

In [43]:
find_repairllama_path_file("Chart-1")

('/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/repairllama/fft/patches/Chart-1/prompt.txt',
 '/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/repairllama/fft/patches/Chart-1/target.diff')

In [44]:
def find_and_extract_code_blocks(file_path, start_marker="// buggy code", end_marker="<FILL_ME>"):
    """
    在文件中查找特定注释代码块的开始行数和结束行数，并提取相关代码段。

    参数:
    - file_path: 字符串，指向文本文件的路径。
    - start_marker: 字符串，标记注释代码块开始的注释内容。
    - end_marker: 字符串，标记注释代码块结束的内容。

    返回:
    - tuple: 包含开始行数、结束行数、开始标记前的代码字符串、结束标记后的代码字符串、被注释的代码字符串。
             如果未找到，相应位置返回None。
    """
    start_line = None
    end_line = None
    pre_code = ""  # 开始标记前的代码
    post_code = ""  # 结束标记后的代码
    commented_code = ""  # 被注释的代码
    in_commented_block = False
    with open(file_path, 'r') as file:
        lines = file.readlines()
        for i, line in enumerate(lines):
            if start_marker in line:
                start_line = i + 1  # 文件行数从1开始
                in_commented_block = True
                continue
            if end_marker in line:
                end_line = i + 1  # 结束行的处理稍有不同
                in_commented_block = False
                # 继续累积post_code，直到文件结束
                continue
            if in_commented_block:
                commented_code += line
            elif end_line is not None:
                post_code += line
            else:
                pre_code += line

    return start_line, end_line, pre_code, post_code, commented_code.replace('// ', '').strip()

# 将这里的文件路径替换为你的实际文件路径
file_path = find_repairllama_path_file("Chart-5")[0]

# 使用函数并打印结果
start_line, end_line, pre_code, post_code, commented_code = find_and_extract_code_blocks(file_path)
if start_line is not None and end_line is not None:
    print(f"被注释的代码块开始于第{start_line}行，结束于第{end_line}行。")
    print("开始标记前的代码:\n", pre_code)
    print("结束标记后的代码:\n", post_code)
    print("被注释的代码:\n", commented_code)
else:
    print("未找到指定的被注释代码块。")



被注释的代码块开始于第5行，结束于第9行。
开始标记前的代码:
     public XYDataItem addOrUpdate(Number x, Number y) {
        if (x == null) {
            throw new IllegalArgumentException("Null 'x' argument.");
        }

结束标记后的代码:
             XYDataItem existing = (XYDataItem) this.data.get(index);
            try {
                overwritten = (XYDataItem) existing.clone();
            }
            catch (CloneNotSupportedException e) {
                throw new SeriesException("Couldn't clone XYDataItem!");
            }
            existing.setY(y);
        }
        else {
            if (this.autoSort) {
                this.data.add(-index - 1, new XYDataItem(x, y));
            }
            else {
                this.data.add(new XYDataItem(x, y));
            }
            if (getItemCount() > this.maximumItemCount) {
                this.data.remove(0);
            }
        }
        fireSeriesChanged();
        return overwritten;
    }

被注释的代码:
 XYDataItem overwritten = null;
       int index =

In [None]:
def process_diff_file(file_path):
    """
    从diff文件中提取修改后的内容，去除不需要的diff标记和删除的内容。

    参数:
    - file_path: 字符串，指向diff文件的路径。

    返回:
    - processed_content: 字符串，处理后的文件内容。
    """
    processed_lines = []  # 存储处理后的行

    try:
        with open(file_path, 'r') as file:
            for line in file:
                # 跳过不需要的行
                if line.startswith('--- ') or line.startswith('+++ ') or line.startswith('@@ '):
                    continue
                elif line.startswith('-'):
                    continue
                # 移除以+开头的行的+符号
                elif line.startswith('+'):
                    processed_lines.append(' '+line[1:])
                else:
                    processed_lines.append(line)
    except FileNotFoundError:
        print(f"文件'{file_path}'未找到。")
        return ""
    except Exception as e:
        print(f"读取文件时发生错误：{e}")
        return ""

    # 合并处理后的行为一个字符串并返回
    processed_content = ''.join(processed_lines)
    return processed_content

# 使用示例
file_path = find_repairllama_path_file('Chart-4')[1]  # 将这里的路径替换为你的实际文件路径

# 调用函数并打印结果
processed_content = process_diff_file(file_path)
print(processed_content)


## 构建info jsonl

In [46]:
import re

def remove_java_comments_correctly(code):
    """
    正确地去除Java代码中的单行和多行注释，不移除注释前的换行符。

    参数:
    - code: 字符串，包含Java代码。

    返回:
    - 去除注释的代码字符串，保留其他内容不变。
    """
    # 移除多行注释，这里假设多行注释不跨越多个非空白行
    code_no_multiline_comments = re.sub(r'/\*[\s\S]*?\*/', '', code)
    # 移除单行注释及其后的换行符，如果存在
    cleaned_code = re.sub(r'(?m)^ *//.*\n?', '', code_no_multiline_comments)

    return cleaned_code.strip()

# 示例Java代码字符串
java_code = """
public class HelloWorld {
    public static void main(String[] args) {
        // 这是单行注释
        System.out.println("Hello, world!");
    }
}
"""

# 使用函数并打印结果
cleaned_code = remove_java_comments_correctly(java_code)
print(cleaned_code)


public class HelloWorld {
    public static void main(String[] args) {
        System.out.println("Hello, world!");
    }
}


In [47]:
def remove_spaces_newlines_and_get_indices(java_code):
    cleaned_code = ""
    indices = []
    for index, char in enumerate(java_code):
        if char not in [' ', '\n', '\r', '\t']:
            cleaned_code += char
            indices.append(index)
    return indices, cleaned_code

def find_substring_indices(main_string, substring):
    start_index = main_string.find(substring)
    
    # 如果找不到子字符串，则返回-1
    if start_index == -1:
        return -1, -1

    end_index = start_index + len(substring) - 1
    return start_index, end_index

def replace_sub(java_code, java_patch):
    rm_code_ind_lst, rm_java_code=remove_spaces_newlines_and_get_indices(java_code)
    _, rm_java_patch=remove_spaces_newlines_and_get_indices(java_patch)
    start_ind, end_ind=find_substring_indices(rm_java_code, rm_java_patch)
    code_start_ind, code_end_ind=rm_code_ind_lst[start_ind], rm_code_ind_lst[end_ind]
    patch=java_code[:code_start_ind]+java_code[code_end_ind+1:]
    return patch

In [48]:
def build_info_jsonl_line(project_id):
    json={}
    project, number=project_id.split('-')[0].lower(), project_id.split('-')[1]

    patch_file_path = find_path_file(project_id)
    root_dir = find_project_root_file(project_id)
    
    file_start_line, tail_file_path = extract_line_number_and_code_path_from_diff_file(patch_file_path)
    real_file_path = find_file_by_tail(root_dir, tail_file_path)
    json['startLine'] = file_start_line
    json['filePath']= real_file_path
    json['mutationStrategy']='RELATIONAL_EXPRESSION_MUTATION'
    json['projectName']=f'{project.lower()}_{number}'
    
    prompt_file_path, target_file_path = find_repairllama_path_file(project_id)
    patch_start_line, patch_end_line, before_code, after_code, buggy_code=find_and_extract_code_blocks(prompt_file_path)
    json['endLine'] = file_start_line + patch_end_line - patch_start_line - 1
    json['methodPreContext'] = before_code
    json['methodPostContext'] = after_code
    json['buggyCode'] = buggy_code
    
    target=process_diff_file(target_file_path)
    target=remove_java_comments_correctly(target)
    target=replace_sub(target, before_code)
    target=replace_sub(target, after_code)
    json['originalCode'] = target.strip()
    return json

build_info_jsonl_line('Chart-4')

{'startLine': 4493,
 'filePath': '/Users/alex.wu/defects4j_projects_buggy/chart_4/source/org/jfree/chart/plot/XYPlot.java',
 'mutationStrategy': 'RELATIONAL_EXPRESSION_MUTATION',
 'projectName': 'chart_4',
 'endLine': 4501,
 'methodPreContext': '    public Range getDataRange(ValueAxis axis) {\n        Range result = null;\n        List mappedDatasets = new ArrayList();\n        List includedAnnotations = new ArrayList();\n        boolean isDomainAxis = true;\n        int domainIndex = getDomainAxisIndex(axis);\n        if (domainIndex >= 0) {\n            isDomainAxis = true;\n            mappedDatasets.addAll(getDatasetsMappedToDomainAxis(\n                    new Integer(domainIndex)));\n            if (domainIndex == 0) {\n                Iterator iterator = this.annotations.iterator();\n                while (iterator.hasNext()) {\n                    XYAnnotation annotation = (XYAnnotation) iterator.next();\n                    if (annotation instanceof XYAnnotationBoundsInfo) {\n

## 保存jsonl文件

In [49]:
import json
with open('defects4j.jsonl', 'w') as file:
    for i in subfolders:
        try:
            jsonline = build_info_jsonl_line(i)
            json_string = json.dumps(jsonline)  
            file.write(json_string + '\n')
        except Exception as e:
            print(i, e)

Lang-29 list index out of range
Chart-7 'NoneType' object has no attribute 'startswith'
Math-8 list index out of range
Collections-27 list index out of range
Collections-26 list index out of range
Chart-23 list index out of range


## 保存context dataset

In [50]:
from datasets import load_dataset

# 步骤1: 加载JSONL文件作为Dataset
# 假设你的JSONL文件路径为"path/to/your/file.jsonl"
dataset = load_dataset('json', data_files='defects4j_with_context.jsonl')

# 步骤2: 保存Dataset到磁盘
# 指定保存路径
output_path = 'data/defects4j_context'
dataset.save_to_disk(output_path)

print(f'Dataset saved to {output_path}')


Downloading and preparing dataset json/default to /Users/alex.wu/.cache/huggingface/datasets/json/default-2f08f9006d6b2262/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/alex.wu/.cache/huggingface/datasets/json/default-2f08f9006d6b2262/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/448 [00:00<?, ? examples/s]



Dataset saved to data/defects4j_context


## validation

In [2]:
from datasets import load_from_disk

dataset=load_from_disk('/Users/alex.wu/PycharmProjects/apr_datasets_processing/defects4j_validation/dataset_validated/defects4j_context_gen_validation')

In [5]:
project_ids=[i.capitalize().replace('Jacksondatabind', 'JacksonDatabind').replace('Jacksoncore', 'JacksonCore').replace(
            'Jacksonxml', 'JacksonXml').replace('Jxpath', 'JxPath').replace('_', '-') for i in dataset['projectName']]

In [None]:
project_ids

### repairllama result

In [10]:
file_path='/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/repairllama/lora/RepairLLaMA_defects4j_f2f_bugs_results_ir4_or2.jsonl'

In [9]:
import json

json_lst=[]
# 使用with语句打开文件，确保文件会被正确关闭
with open(json_path, 'r') as file:
    for line in file:
        # 解析每一行的JSON内容
        json_obj = json.loads(line)
        
        # 处理json_obj
        json_lst.append(json_obj)

In [15]:
from datasets import load_dataset

repairllama_dataset=load_dataset('json', data_files=file_path)['train']

Found cached dataset json (/Users/alex.wu/.cache/huggingface/datasets/json/default-086715474a996e88/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
repairllama_dataset=repairllama_dataset.filter(lambda x:x['bug_id'] in project_ids)

Filter:   0%|          | 0/480 [00:00<?, ? examples/s]

In [26]:
repairllama_dataset

Dataset({
    features: ['bug_id', 'buggy_code', 'patches', 'test_results'],
    num_rows: 223
})

In [24]:
pluasible_bug_id=repairllama_dataset.filter(lambda x: 'Plausible' in x['test_results'])['bug_id']

Filter:   0%|          | 0/223 [00:00<?, ? examples/s]

In [None]:
for i in repairllama_dataset:
    if i['bug_id'] in pluasible_bug_id:
        print(f"------------------{i['bug_id']}------------------")
        print(i['buggy_code'])
        indexes = [index for index, value in enumerate(i['test_results']) if value == 'Plausible']
        print('plausible patches')
        for j in indexes:
            print('-------')            
            print(i['patches'][j])

### our result

In [71]:
dataset=load_from_disk('/Users/alex.wu/PycharmProjects/apr_datasets_processing/defects4j_validation/dataset_to_be_validated/defects4j_context_gen_10')

In [75]:

def name_map(x):
    x['bug_id']=x['projectName'].capitalize().replace('Jacksondatabind', 'JacksonDatabind').replace('Jacksoncore', 'JacksonCore').replace(
            'Jacksonxml', 'JacksonXml').replace('Jxpath', 'JxPath').replace('_', '-')
    x['gen']=[i.replace(x['input'], '').replace('<s>','').replace('</s>','').replace('<unk>','') for i in x['gen']]
    return x

dataset=dataset.map(name_map)

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [76]:
dataset[0]['gen']

[' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();',
 ' StringBuffer buff = new StringBuffer();']

In [45]:
def name_map(x):
    x['bug_id']=x['projectName'].capitalize().replace('Jacksondatabind', 'JacksonDatabind').replace('Jacksoncore', 'JacksonCore').replace(
            'Jacksonxml', 'JacksonXml').replace('Jxpath', 'JxPath').replace('_', '-')
    return x

In [46]:
dataset=dataset.map(name_map)

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [None]:
dataset['test_res']

In [49]:
our_pluasible_bug_id=dataset.filter(lambda x: 'plausible' in [i['correctness'] for i in x['test_res']])['bug_id']

Filter:   0%|          | 0/223 [00:00<?, ? examples/s]

In [55]:
unique_id=list(set(our_pluasible_bug_id).difference(set(pluasible_bug_id)))

In [59]:
unique_dataset=dataset.filter(lambda x:x['bug_id'] in unique_id)

Filter:   0%|          | 0/223 [00:00<?, ? examples/s]

In [61]:
unique_dataset

Dataset({
    features: ['methodInformation', 'involvedTypesInformation', 'filePath', 'classInformation', 'buggyInfo', 'projectName', 'input', 'output', 'gen', 'test_res', 'bug_id'],
    num_rows: 11
})

In [None]:
for i in unique_dataset:
    print(f"\n\n")
    print(f"------------------{i['bug_id']}------------------")
    print('buggy_code:', i['buggyInfo']['buggyCode'])
    print('fix_code:', i['buggyInfo']['originalCode'])
    print('--our plausible patches:--')
    for j in i['test_res']:
        if j['correctness']=='plausible':
            print('-------')            
            print(j['patch'])
    print('--repairllama patches:--')
    print(repairllama_dataset.filter(lambda x:x['bug_id']==i['bug_id'])[0]['patches'][0])