In [1]:
from datasets import *

## base dataset

In [2]:
# Path of the JSON-lines file that is loaded as the base dataset.
file_path='defects4j_with_context.jsonl'

In [3]:
# Load the JSONL file with the `json` builder; split='all' yields a single
# Dataset (see the summary printed further down) instead of a DatasetDict.
base_dataset =load_dataset('json', data_files=file_path, split='all')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
def process_base_dataset(sample):
    """Copy the fields nested under ``sample['buggyInfo']`` into top-level columns.

    The sample is mutated in place and returned. Added keys, in order:
    ``bug_id``, ``start_line``, ``end_line``, ``path``, ``fix_code``,
    ``pre_context``, ``post_context``, ``buggy_code``.
    """
    info = sample['buggyInfo']
    project_name = info['projectName']

    # projectName has the form "<project>_<bug number>"; only the first two
    # underscore-separated parts are used.
    name_parts = project_name.split('_')
    proj, bug_id = name_parts[0], name_parts[1]

    start_line = info['startLine']
    raw_end_line = info['endLine']

    # Remove the local checkout prefix from the recorded file path.
    checkout_prefix = f"/Users/alex.wu/defects4j_projects_buggy/{project_name}"
    path = info['filePath'].replace(checkout_prefix, '')

    # The stored end line is shifted down by one.
    end_line = int(raw_end_line) - 1

    # capitalize() lower-cases everything after the first letter, so the
    # mixed-case project names have to be restored explicitly.
    proj = proj.capitalize()
    for flattened, mixed_case in (
        ('Jacksondatabind', 'JacksonDatabind'),
        ('Jacksoncore', 'JacksonCore'),
        ('Jacksonxml', 'JacksonXml'),
        ('Jxpath', 'JxPath'),
    ):
        proj = proj.replace(flattened, mixed_case)

    sample['bug_id'] = proj + '-' + bug_id
    sample['start_line'] = start_line
    sample['end_line'] = end_line
    sample['path'] = path
    sample['fix_code'] = info['originalCode']
    sample['pre_context'] = info['methodPreContext']
    sample['post_context'] = info['methodPostContext']
    sample['buggy_code'] = info['buggyCode']
    return sample

In [5]:
# Add the flattened columns produced by process_base_dataset to every row.
# Note: this rebinds `base_dataset` to the mapped dataset.
base_dataset=base_dataset.map(process_base_dataset)

Map:   0%|          | 0/455 [00:00<?, ? examples/s]

In [6]:
# Show the dataset summary (column names and row count).
base_dataset

Dataset({
    features: ['methodInformation', 'involvedTypesInformation', 'filePath', 'classInformation', 'buggyInfo', 'projectName', 'bug_id', 'start_line', 'end_line', 'path', 'fix_code', 'pre_context', 'post_context', 'buggy_code'],
    num_rows: 455
})

In [8]:
# Load the companion file 'defects4j.jsonl' as a single Dataset. Its rows are
# read below through top-level keys such as startLine, endLine and filePath.
full_dataset_without_context=load_dataset('json', data_files='defects4j.jsonl', split='all')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

#### complete base dataset

In [9]:
# bug_id values already present in base_dataset; the next cell uses them to
# decide which rows of the context-free file still have to be added.
base_bug_ids=base_dataset['bug_id']

In [10]:
# Append to base_dataset every bug from the context-free file that it does not
# contain yet. Columns that only exist in the context-aware file are set to None.
#
# NOTE: `base_bug_ids` is computed in the previous cell. Re-running this cell
# on its own uses the stale id list and appends the same rows a second time.
sample_list=[]

# A set makes each membership test O(1) instead of a scan over all known ids.
known_bug_ids = set(base_bug_ids)

for sample in full_dataset_without_context:
    # projectName has the form "<project>_<bug number>".
    name_parts = sample['projectName'].split('_')
    proj, ids = name_parts[0], name_parts[1]
    # capitalize() lower-cases the tail, so restore the mixed-case project names.
    proj= proj.capitalize().replace('Jacksondatabind', 'JacksonDatabind').replace('Jacksoncore', 'JacksonCore').replace(
            'Jacksonxml', 'JacksonXml').replace('Jxpath', 'JxPath')
    bug_id=proj+'-'+ids

    if bug_id not in known_bug_ids:
        # Same column set as the rows produced by process_base_dataset.
        add_sample={
            "buggyInfo": None,
            "bug_id":bug_id,
            "start_line":sample['startLine'],
            "end_line":int(sample['endLine']) - 1,
            "path":sample['filePath'].replace(f"/Users/alex.wu/defects4j_projects_buggy/{sample['projectName']}", ''),
            "fix_code":sample['originalCode'],
            "methodInformation":None,
            "involvedTypesInformation":None,
            "filePath":sample['filePath'],
            "classInformation":None,
            "projectName":sample['projectName'],
            'pre_context':sample['methodPreContext'],
            'post_context':sample['methodPostContext'],
            'buggy_code':sample['buggyCode'],
        }
        sample_list.append(add_sample)

base_dataset = concatenate_datasets([base_dataset, Dataset.from_list(sample_list)])

## vanilla dataset preparation

In [84]:
def get_vanilla_input(sample):
    """Store the infilling prompt ``<PRE> {pre} <SUF>{suf} <MID>`` under ``sample['input']``.

    ``pre`` and ``suf`` are the sample's ``pre_context`` and ``post_context``,
    inserted verbatim. The sample is mutated in place and returned.
    """
    prefix = sample['pre_context']
    suffix = sample['post_context']
    # str.join rejects non-string parts, just like string concatenation would.
    sample['input'] = ''.join(['<PRE> ', prefix, ' <SUF>', suffix, ' <MID>'])
    return sample

# Build the plain infilling prompt for every row, then persist the result
# under datasets/defects4j_vanilla.
vanilla_dataset=base_dataset.map(get_vanilla_input)
vanilla_dataset.save_to_disk('datasets/defects4j_vanilla')

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]

In [14]:
## vanilla dataset with buggy lines preparation
def get_vanilla_buggy_input(sample):
    """Store an infilling prompt that also shows the buggy lines as ``//`` comments.

    Layout of ``sample['input']``::

        <PRE> {pre_context, stripped}
        // buggy lines
        // {each line of buggy_code} <SUF>{post_context, stripped} <MID>

    The sample is mutated in place and returned.
    """
    head = sample['pre_context'].strip()
    # Every line of the buggy code is turned into a line comment.
    commented_bug = '\n'.join('// ' + line for line in sample['buggy_code'].split('\n'))
    tail = sample['post_context'].strip()
    sample['input'] = ''.join(
        ['<PRE> ', head, '\n// buggy lines\n', commented_bug, ' <SUF>', tail, ' <MID>']
    )
    return sample

# Build the prompt variant that includes the commented-out buggy lines, then
# persist it under datasets/defects4j_vanilla_with_buggy.
buggy_dataset=base_dataset.map(get_vanilla_buggy_input)
buggy_dataset.save_to_disk('datasets/defects4j_vanilla_with_buggy')

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]

In [15]:
# Spot-check the prompt generated for the first row.
print(buggy_dataset['input'][0])

<PRE> public char[] expandCurrentSegment()
    {
        final char[] curr = _currentSegment;
        final int len = curr.length;
// buggy lines
// int newLen = (len == MAX_SEGMENT_LEN) ? (MAX_SEGMENT_LEN+1) : Math.min(MAX_SEGMENT_LEN, len + (len >> 1)); <SUF>return (_currentSegment = Arrays.copyOf(curr, newLen));
    } <MID>


## repairllama dataset preparation

In [85]:
import os
from datasets import *

In [86]:
# Directory scanned by the next cell: each sub-directory is treated as one bug
# and is expected to contain a prompt.txt file.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
directory='/Users/alex.wu/PycharmProjects/repairllama/results/defects4j/repairllama/fft/patches'

In [87]:
# Collect one {'input', 'bug_id'} record per sub-directory of `directory`.
# Note: this rebinds `sample_list`, replacing the list built while completing
# the base dataset.
sample_list=[]
for entry in os.listdir(directory):
    sample={}
    path = os.path.join(directory, entry)
    # Plain files are ignored; only directories are treated as bug folders.
    if os.path.isdir(path):
        # The content of prompt.txt becomes the model input for this bug.
        with open(path+'/prompt.txt', 'r') as file:
            sample['input']=file.read()
        # The directory name is used as the bug id (compared with the dataset's
        # bug_id column in get_repair_input).
        sample['bug_id']=entry
        sample_list.append(sample)

In [89]:
def get_repair_input(sample):
    """Copy the prompt collected for this bug from the global ``sample_list``.

    Every record in ``sample_list`` is compared by ``bug_id``. When several
    records match, the last one wins. When none matches, the sample is returned
    unchanged, so no ``input`` key is added by this function.
    """
    prompts = [
        candidate['input']
        for candidate in sample_list
        if candidate['bug_id'] == sample['bug_id']
    ]
    if prompts:
        # Keep the last match, as a full scan with repeated assignment would.
        sample['input'] = prompts[-1]
    return sample

In [90]:
# Attach the collected RepairLLaMA prompts to the base rows, then persist the
# result under datasets/defects4j_repairllama.
repairllama_dataset = base_dataset.map(get_repair_input)
repairllama_dataset.save_to_disk('datasets/defects4j_repairllama')

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]

## classinfo dataset preparation

In [10]:
def remove_comments(java_code):
    """Strip ``//`` and ``/* ... */`` comments from a Java source fragment.

    The text is scanned character by character so that:

    * code sharing a line with a block comment (before ``/*`` or after ``*/``)
      is kept instead of the whole line being discarded;
    * comment markers inside string or character literals (for example
      ``"http://host"``) are left untouched.

    Lines that are empty or whitespace-only after stripping are dropped.
    Whitespace next to a removed comment is kept as is.

    Args:
        java_code: Java source text; it may be a fragment of a method.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    in_block_comment = False
    result = []

    for line in java_code.splitlines():
        kept = []
        quote = None  # Opening quote character while inside a literal.
        pos, length = 0, len(line)

        while pos < length:
            char = line[pos]
            pair = line[pos:pos + 2]

            if in_block_comment:
                # Skip everything up to and including the terminator.
                if pair == "*/":
                    in_block_comment = False
                    pos += 2
                else:
                    pos += 1
                continue

            if quote is not None:
                kept.append(char)
                if char == "\\" and pos + 1 < length:
                    # Keep the escaped character so \" or \' cannot end the literal.
                    kept.append(line[pos + 1])
                    pos += 2
                    continue
                if char == quote:
                    quote = None
                pos += 1
                continue

            if pair == "//":
                # Line comment: the rest of the line is discarded.
                break
            if pair == "/*":
                in_block_comment = True
                pos += 2
                continue
            if pair == "*/":
                # A terminator with no opener means the fragment started inside
                # a block comment, so what precedes it on this line is comment text.
                kept = []
                pos += 2
                continue

            if char in ('"', "'"):
                quote = char
            kept.append(char)
            pos += 1

        cleaned = "".join(kept)
        # Keep the line only if something other than whitespace is left.
        if cleaned.strip():
            result.append(cleaned)

    return "\n".join(result)


In [95]:
def build_class_info(class_info):
    """Render a class summary as a Java-like skeleton string.

    ``class_info`` is read through the keys ``classHead``, ``memberVariables``
    and ``memberMethods``. The result is the head, an opening brace, each
    member variable and then each member method on its own four-space-indented
    line, and a closing brace. A falsy ``class_info`` yields ``''``.
    """
    if not class_info:
        return ''

    header = class_info['classHead']
    fields = class_info['memberVariables']
    methods = class_info['memberMethods']

    indent = '    '
    field_block = '\n'.join(indent + field for field in fields)
    # Each pair of consecutive spaces in a method entry is replaced by one space.
    method_block = '\n'.join(indent + method.replace('  ', ' ') for method in methods)

    return ''.join([header, "{\n", field_block, "\n", method_block, '\n}'])

In [99]:
def build_buggy_info(sample):
    """Store a prompt made of the class skeleton, the method and the buggy lines.

    The class skeleton comes from ``build_class_info``. ``buggy_code``,
    ``pre_context`` and ``post_context`` are passed through ``remove_comments``
    first. The buggy lines are then re-added as ``//`` comments between the
    prefix and the ``<SUF>`` marker. The sample is mutated in place and returned.
    """
    class_info = build_class_info(sample['classInformation'])

    raw_buggy = sample['buggy_code']
    raw_pre = sample['pre_context']
    raw_post = sample['post_context']
    buggy = remove_comments(raw_buggy)
    pre_context = remove_comments(raw_pre)
    post_context = remove_comments(raw_post)

    # Every remaining buggy line is shown to the model as a line comment.
    commented_bug = '\n'.join('// ' + line for line in buggy.split('\n'))

    sample['input'] = ''.join([
        '<PRE> // class information of method\n',
        class_info,
        '\n\n// method to be repaired\n',
        pre_context,
        '\n// buggy lines\n',
        commented_bug,
        " <SUF>",
        post_context,
        ' <MID>',
    ])
    return sample

In [102]:
# Build the class-information prompt for every row of the base dataset.
classinfo_dataset=base_dataset.map(build_buggy_info)

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

In [104]:
# Persist the class-information prompts under datasets/defects4j_classinfo.
classinfo_dataset.save_to_disk('datasets/defects4j_classinfo')

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]

In [3]:
# Load a previously saved dataset from disk.
# NOTE(review): machine-specific absolute path, and no cell visible in this
# notebook writes this dataset, so it presumably comes from another project.
dataset=load_from_disk('/Users/alex.wu/PycharmProjects/apr_datasets_processing/defects4j_validation/dataset_validated/defects4j_vanilla_gen_validation')

In [None]:
# Inspect the `test_res` column of the dataset loaded above.
dataset['test_res']

## repairllama result dataset

In [29]:
# Load the RepairLLaMA result file (JSONL) as a single Dataset; rows are later
# read through their `bug_id` and `patches` fields.
# NOTE(review): machine-specific absolute path, under a different user
# directory than the other absolute paths in this notebook.
repairllama_result=load_dataset('json', data_files='/Users/17988/PycharmProjects/repairllama/results/defects4j/repairllama/lora/RepairLLaMA_defects4j_f2f_bugs_results_ir4_or2.jsonl', split='all')

Found cached dataset json (C:/Users/17988/.cache/huggingface/datasets/json/default-bfbb90bc52752341/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [30]:
# Reload the prompt dataset from the location the RepairLLaMA preparation
# cell saved it to.
repairllama_dataset=load_from_disk('datasets/defects4j_repairllama')

In [31]:
# Show the dataset summary (column names and row count).
repairllama_dataset

Dataset({
    features: ['methodInformation', 'involvedTypesInformation', 'filePath', 'classInformation', 'buggyInfo', 'projectName', 'bug_id', 'start_line', 'end_line', 'path', 'fix_code', 'pre_context', 'post_context', 'buggy_code', 'input'],
    num_rows: 479
})

In [None]:
def get_repairllama_output(sample):
    """Store the cleaned RepairLLaMA candidate patches under ``sample['gen']``.

    The first row of the global ``repairllama_result`` with the same ``bug_id``
    supplies the ``patches`` list. Each patch is passed through
    ``remove_comments``, has the sample's ``pre_context`` and stripped
    ``post_context`` removed, and is then stripped of surrounding whitespace.

    An IndexError is raised by the ``[0]`` lookup when no result row matches.

    NOTE(review): ``pre_context`` is removed verbatim while the patch has
    already been through ``remove_comments``. If the prefix contains comments
    or blank lines, the replacement presumably will not match. Confirm against
    the data.
    """
    bug_id, pre, after = sample['bug_id'], sample['pre_context'], sample['post_context']

    # One full filter pass over the result dataset per sample.
    matching_rows = repairllama_result.filter(lambda row: row['bug_id'] == bug_id)
    patches = matching_rows[0]['patches']

    sample['gen'] = [
        remove_comments(patch).replace(pre, '').replace(after.strip(), '').strip()
        for patch in patches
    ]
    return sample

# Attach the cleaned candidate patches to every row as the `gen` column.
# NOTE(review): get_repairllama_output filters the whole result dataset once
# per row, so this map scales quadratically with the number of bugs.
repairllama_res_dataset=repairllama_dataset.map(get_repairllama_output)

In [39]:
# Inspect the generated patch lists. This prints the whole column.
repairllama_res_dataset['gen']

[['int newLen = (len == MAX_SEGMENT_LEN) ? MAX_SEGMENT_LEN : Math.min(MAX_SEGMENT_LEN, len + (len >> 1));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? (MAX_SEGMENT_LEN+1) : Math.min(MAX_SEGMENT_LEN, len + (len >> 2));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? (len+1) : Math.min(MAX_SEGMENT_LEN, len + (len >> 1));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? MAX_SEGMENT_LEN+1 : Math.min(MAX_SEGMENT_LEN, len + (len >> 1));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? (MAX_SEGMENT_LEN+1) : Math.min(MAX_SEGMENT_LEN, len << 1);',
  'int newLen = (len == MAX_SEGMENT_LEN) ? (len + 1) : Math.min(MAX_SEGMENT_LEN, len + (len >> 1));',
  'int newLen = Math.min(MAX_SEGMENT_LEN, len + (len >> 1));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? (MAX_SEGMENT_LEN+1) : Math.min(MAX_SEGMENT_LEN, len + (len >>> 1));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? MAX_SEGMENT_LEN + 1 : Math.min(MAX_SEGMENT_LEN, len + (len >> 1));',
  'int newLen = (len == MAX_SEGMENT_LEN) ? (MAX_SEGMENT_LEN+1) : Math.min(MA

In [36]:
# Persist the rows together with their `gen` column under
# datasets/defects4j_repairllama_res.
repairllama_res_dataset.save_to_disk('datasets/defects4j_repairllama_res')

Saving the dataset (0/1 shards):   0%|          | 0/479 [00:00<?, ? examples/s]