In [131]:
import os
from tqdm import tqdm

os.environ['TRANSFORMERS_CACHE'] = '/datasets/Large_Language_Models'

In [104]:
from datasets import load_dataset

dataset = load_dataset('h4iku/coconut_java2006_preprocessed', split='train')

In [105]:
dataset

Dataset({
    features: ['rem', 'add', 'context'],
    num_rows: 1125599
})

In [17]:
df=dataset.to_pandas()

In [None]:
for index, row in df.iterrows():
    if index<100:
        print("Index:", index)  # 注意这里是大写的 'Index'
        print("Row Data:", row['rem'])

In [133]:
from transformers import AutoTokenizer



In [134]:
MODEL_NAME = 'codefuse-ai/CodeFuse-CodeLlama-34B'
TOKEN = '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'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=TOKEN)

In [135]:
def get_token_len(input):
    input_tokens = tokenizer.encode(input, return_tensors="pt")
    return len(input_tokens[0])

## java code format

In [126]:
import subprocess
import tempfile

def format_java_code(java_code):
    # 创建一个临时文件来保存Java代码
    with tempfile.NamedTemporaryFile(mode='w+', suffix='.java', delete=False) as temp_file:
        temp_file_name = temp_file.name
        # print(java_code)
        temp_file.write('public class A {  '+java_code+'}')

    # 使用java命令行工具格式化代码
    try:
        subprocess.run(['java', '-jar', '/home/dwu25/google-java-format-1.19.1-all-deps.jar', '--replace', temp_file_name], check=True)
        
        # 读取格式化后的代码
        with open(temp_file_name, 'r') as file:
            formatted_code = file.read()
        return formatted_code[19:-3]
    except subprocess.CalledProcessError as e:
        # print(java_code)
        return None
    finally:
        # 删除临时文件
        if temp_file_name:
            os.remove(temp_file_name)

# 示例Java代码
# java_code = """
# public class A {  void updateCurrentVisiblePath()  {    if (treeModel == null)      return;    Object next = treeModel.getRoot();    Rectangle bounds = getCellBounds(0, 0, next);           if ((bounds.width == 0 && bounds.height == 0) || (!isRootVisible()         && tree.isExpanded(new TreePath(next))))      next = getNextNode(next);    TreePath current = null;    while (next != null)      {        if (current == null)          current = new TreePath(next);        else             current = current.pathByAddingChild(next);        do          {            TreePath path = new TreePath(getPathToRoot(next, 0));            if (tree.isVisible(path) && tree.isExpanded(path))          next = getNextNode(next);            else next = getNextSibling(next);          }        while (next != null && !tree.isVisible(new TreePath(getPathToRoot(next, 0))));      }    currentVisiblePath = current;    tree.setVisibleRowCount(getRowCount(tree));    if (tree.getSelectionModel() != null && tree.getSelectionCount() == 0 &&        currentVisiblePath != null)      tree.addSelectionRow(0);  }}"""

# formatted_code = format_java_code(java_code)
# if formatted_code:
#     print("Formatted Java Code:\n")
#     print(formatted_code)
# formatted_code[19:-3]

## coconut preprocess

In [122]:
import re

## 去掉注释
def remove_comments(text):
    # 正则表达式匹配以 // 开头，后面跟任意字符（除了换行符），并以两个或更多空格结尾的子字符串
    pattern = r'//.*?(  +|\t)'
    # 使用空字符串替换匹配的文本
    return re.sub(pattern, '', text)

In [123]:
remove_comments('protected void ensureRowsAreVisible(int beginRow, int endRow)	{    // FIXME: not implemented	}')

'protected void ensureRowsAreVisible(int beginRow, int endRow)\t{    }'

In [136]:
def format_context(sample):
    context=sample['context']
    context=remove_comments(context)
    
    if sample['rem'].replace('{', '').replace('}', '')=='':
        sample['formatted_context']=None
        return sample
    
    if len(context)>1000:
        sample['formatted_context']=None
        return sample
    
    sample['formatted_context']=format_java_code(context)
    return sample

In [None]:
format_dataset=dataset.map(format_context, num_proc=12)
format_dataset

In [12]:
import re

def replace_multiple_spaces_with_single(s):
    return re.sub(r'\s+', ' ', s)

def preprocess(sample):
    rem, add, context = replace_multiple_spaces_with_single(sample['rem']).strip(), replace_multiple_spaces_with_single(sample['add']).strip(), replace_multiple_spaces_with_single(sample['context']).strip()
    infill =context.replace(rem, '<INFILL>')
    return rem.strip()=='' or infill.count('<INFILL>')!=1
    
dataset.filter(preprocess)

Filter:   0%|          | 0/1125599 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['rem', 'add', 'context'],
        num_rows: 219320
    })
})

In [58]:
fixme=dataset['train'].filter(lambda x:'//' in x['context'])

In [88]:
fixme[100]

{'rem': '}',
 'add': '',
 'context': '  public void setSelectedIndex(int index)  {  \tif(index < -1 || index >= dataModel.getSize()) {  \t\t// Fails because index is out of bounds.   \t\tthrow new IllegalArgumentException("illegal index: " + index);  \t} else {  \t\t/* Selects the item at the given index or clears the selection if the  \t\t * index value is -1.  \t\t */\t\tsetSelectedItem((index == -1) ? null : dataModel.getElementAt(index));  \t}  }'}

In [29]:
for i in fixme[0:1000]:
    print(i)

rem
add
context


In [None]:
## 输入是patch，格式化的code
def replace_patch(patch, code):
    code_list=code.split('\n')

## coconut comparison finetune dataset for MFTcoder

In [19]:
import random

def process(ind, sample): 
    correct_patch_choice=''
    
    random_number = random.randint(0, 1)
    if random_number==0:
        A_patch, B_patch, correct_patch_choice=sample['add'], sample['rem'],'A'
    else:
        A_patch, B_patch, correct_patch_choice=sample['rem'], sample['add'],'B'
        
    
    system_content="You are an intelligent programming assistant for JAVA."
    human_content="Choose a correct patch from the following two patches to infill the Java code.\n\nJava code:\n"+sample['context'].replace(sample['rem'], '<INFILL>')+ "\n\nPatches:\nA. "+A_patch.strip()+"\nB. "+B_patch.strip()+'<|role_start|>bot<|role_end|>'
    bot_content="The correct patch is "+correct_patch_choice
    
    jsonl_sample={"id":ind,
                  "data_name":"comparison_finetune",
                  "chat_rounds":[
                      {
                          "role": "system",
                          "content": system_content,
                          "chat_round_id": 0
                      },
                      {
                          "role": "human",
                          "content": human_content,
                          "chat_round_id": 1
                      },
                      {
                          "role": "bot",
                          "content": bot_content,
                          "chat_round_id": 2
                      }
                  ]
                 }
    if ind<10:
        print(human_content)
    return ind, jsonl_sample, correct_patch_choice

In [21]:
from tqdm import tqdm
import json

sample_num=0

with open('data/coconut_comparison.jsonl', 'w') as file:
    with open('data/coconut_comparison_label.txt', 'w') as label_file:
        
        for ind, sample in tqdm(df.iterrows()):
            ind, jsonl_sample, correct_patch_choice = process(ind, sample)
            
            json_string = json.dumps(jsonl_sample)
            if get_token_len(json_string)<=500:
                file.write(json_string)
                file.write('\n')  

                label_file.write(f"{ind} {correct_patch_choice}\n")
                
                sample_num+=1
print(sample_num)
    

TypeError: 'generator' object is not subscriptable