In [1]:
# Load the data

import glob
import shutil

# Directory that is passed to load_valid_data() below to collect the *.py sample files.
valid_dir = "output/hf-eval-data-v4-valid/"

def load_valid_data(dir_path):
    """Read every ``*.py`` file located directly inside ``dir_path``.

    Args:
        dir_path (str): Directory to scan. A trailing path separator is
            optional.

    Returns:
        list[dict]: One ``{"path": ..., "content": ...}`` record per file that
        could be read, ordered by path so repeated runs yield the same
        sequence. Files that cannot be read are reported on stdout and skipped.
    """
    import os  # local import: the surrounding cell only imports glob and shutil

    data = []
    file_pattern = "*.py"

    # os.path.join works with or without a trailing separator on dir_path;
    # plain string concatenation silently matched nothing for "some/dir".
    # glob returns names in arbitrary order, so sort for a stable result.
    python_files = sorted(glob.glob(os.path.join(dir_path, file_pattern)))

    for file_path in python_files:
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            # The file can disappear between the glob call and the open call.
            print(f"文件 {file_path} 未找到。")
        except Exception as e:
            print(f"发生错误：{e}")
        else:
            data.append({
                "path": file_path,
                "content": content,
            })
            print(f"内容来自文件: {file_path}\n")

    return data

# Load every sample from valid_dir, then show the first record together with the
# number of records (the bare tuple on the last line is the cell's displayed output).
valid_data = load_valid_data(valid_dir)
valid_data[0], len(valid_data)

内容来自文件: output/hf-eval-data-v4-valid/f00055_translate_french_to_english.py

内容来自文件: output/hf-eval-data-v4-valid/f00064_generate_story_start.py

内容来自文件: output/hf-eval-data-v4-valid/f00065_predict_missing_text.py

内容来自文件: output/hf-eval-data-v4-valid/f00066_fill_in_the_blank_chinese_sentence.py

内容来自文件: output/hf-eval-data-v4-valid/f00070_calculate_sentence_similarity.py

内容来自文件: output/hf-eval-data-v4-valid/f00085_predict_vehicle_co2_emissions.py

内容来自文件: output/hf-eval-data-v4-valid/f00111_detect_objects_in_image.py

内容来自文件: output/hf-eval-data-v4-valid/f00126_find_most_relevant_passage.py

内容来自文件: output/hf-eval-data-v4-valid/f00156_transcribe_audio.py

内容来自文件: output/hf-eval-data-v4-valid/f00169_predict_prosthetic_leg_actions.py

内容来自文件: output/hf-eval-data-v4-valid/f00184_estimate_depth_from_footage.py

内容来自文件: output/hf-eval-data-v4-valid/f00189_classify_image.py

内容来自文件: output/hf-eval-data-v4-valid/f00232_convert_chinese_text_to_speech.py

内容来自文件: output/hf-eval-data-v4-valid/f

({'path': 'output/hf-eval-data-v4-valid/f00055_translate_french_to_english.py',
  'content': '# requirements_file --------------------\n\nimport subprocess\n\nrequirements = ["transformers"]\n\nfor package in requirements:\n    subprocess.run([\'pip\', \'install\', \'-U\', package])\n\n# function_import --------------------\n\nfrom transformers import pipeline\n\n# function_code --------------------\n\ndef translate_french_to_english(text):\n    """\n    Translates a given text from French to English using Hugging Face\'s Transformers library.\n\n    Args:\n        text (str): The text in French to be translated.\n\n    Returns:\n        str: The translated text in English.\n\n    Raises:\n        ValueError: If the input text is not a string or if it\'s empty.\n    """\n    if not isinstance(text, str) or not text:\n        raise ValueError(\'Input text must be a non-empty string.\')\n    \n    translation_pipeline = pipeline(\'translation_fr_to_en\', model=\'Helsinki-NLP/opus-mt-fr-e

In [5]:
# Split each file into its instruct / output / testing sections
import re

def extract_between_strings(input_string, start_string, end_string):
    """Return the text from the first ``start_string`` up to the next ``end_string``.

    The returned text includes ``start_string`` itself and excludes
    ``end_string``.

    Args:
        input_string (str): Text to search.
        start_string (str): Marker that opens the wanted span.
        end_string (str): Marker that closes the span. Pass ``""`` to take
            everything up to the end of ``input_string``. If the marker does
            not occur after ``start_string``, the rest of the text is returned.

    Returns:
        str: ``start_string`` followed by the extracted text.

    Raises:
        ValueError: If ``start_string`` does not occur in ``input_string``.
    """
    # partition splits on the FIRST occurrence only. The previous unbounded
    # split(start_string)[1] threw away everything after a second occurrence
    # of the start marker.
    _, found, after = input_string.partition(start_string)
    if not found:
        raise ValueError(f"start marker not found: {start_string!r}")

    if end_string != "":
        after = after.split(end_string, 1)[0]

    return start_string + after

def extract_section(data):
    """Split ``data['content']`` into its marker-delimited sections.

    Each section starts at its own ``# <name> --------------------`` header
    and is cut at the header of the section that follows it; the last section
    runs to the end of the content. All extraction is delegated to
    ``extract_between_strings``.

    Args:
        data (dict): Record holding the file text under ``'content'``.

    Returns:
        dict: The same ``data`` object, updated in place with the keys
        ``function_req``, ``function_import``, ``function_code``,
        ``test_function_code`` and ``call_test_function_line``.
    """
    source_text = data['content']

    # (output key, start marker, end marker); an empty end marker means
    # "take everything to the end of the text".
    layout = (
        ('function_req',
         "# requirements_file --------------------",
         "# function_import --------------------"),
        ('function_import',
         "# function_import --------------------",
         "# function_code --------------------"),
        ('function_code',
         "# function_code --------------------",
         "# test_function_code --------------------"),
        ('test_function_code',
         "# test_function_code --------------------",
         "# call_test_function_line --------------------"),
        ('call_test_function_line',
         "# call_test_function_line --------------------",
         ""),
    )

    # Extract everything first so that a failure leaves ``data`` untouched.
    sections = {
        key: extract_between_strings(source_text, opening, closing)
        for key, opening, closing in layout
    }
    data.update(sections)

    return data

# Try the splitter on the first sample. extract_section adds its keys to the dict it
# receives and returns that same dict, so valid_data[0] itself gains the new fields.
result = extract_section(valid_data[0])
result

{'path': 'output/hf-eval-data-v4-valid/f00055_translate_french_to_english.py',
 'content': '# requirements_file --------------------\n\nimport subprocess\n\nrequirements = ["transformers"]\n\nfor package in requirements:\n    subprocess.run([\'pip\', \'install\', \'-U\', package])\n\n# function_import --------------------\n\nfrom transformers import pipeline\n\n# function_code --------------------\n\ndef translate_french_to_english(text):\n    """\n    Translates a given text from French to English using Hugging Face\'s Transformers library.\n\n    Args:\n        text (str): The text in French to be translated.\n\n    Returns:\n        str: The translated text in English.\n\n    Raises:\n        ValueError: If the input text is not a string or if it\'s empty.\n    """\n    if not isinstance(text, str) or not text:\n        raise ValueError(\'Input text must be a non-empty string.\')\n    \n    translation_pipeline = pipeline(\'translation_fr_to_en\', model=\'Helsinki-NLP/opus-mt-fr-en\

In [10]:
# Split out the docstring and the body
def extract_all_section(data):
    """Split a sample into a prompt (``instruct``) and a completion (``answer``).

    The sample is first split by ``extract_section``. Its ``function_code``
    section is then cut at the LAST triple-quote delimiter it contains:
    everything up to and including that delimiter goes into the prompt,
    everything after it is the answer.

    Args:
        data (dict): Record with the file text under ``'content'``.

    Returns:
        dict: The record returned by ``extract_section`` with two extra keys:
        ``instruct`` (``function_req`` + ``function_import`` + the head of
        ``function_code`` through the last delimiter) and ``answer`` (the
        remainder of ``function_code``).

    Raises:
        ValueError: If ``function_code`` contains no triple-quote delimiter.
            Previously this case silently produced a prompt with a dangling
            ``\"\"\"`` appended and an answer equal to the whole section.
    """
    result = extract_section(data)
    function_code = result['function_code']

    # Use ''' when it occurs anywhere in the section, otherwise fall back to """.
    e = "'''"
    if e not in function_code:
        e = '"""'

    # rpartition cuts at the last occurrence, matching the earlier
    # rsplit(e, 1) / split(e)[-1] pair, and reports a missing delimiter.
    head, delimiter, answer = function_code.rpartition(e)
    if not delimiter:
        raise ValueError(
            "no docstring delimiter found in function_code of "
            f"{result.get('path', '<unknown>')}"
        )

    instruct = head + delimiter

    result['instruct'] = result['function_req'] + result['function_import'] + instruct
    result['answer'] = answer

    return result


import traceback

# Run the full split over every loaded sample. A sample whose extraction raises is
# reported with its traceback and left out of section_valid_data.
section_valid_data = []
for d in valid_data:
    try:
        result = extract_all_section(d)
        section_valid_data.append(result)
    except Exception as e:
        # Report the failure and continue with the remaining samples
        print("An exception occurred:", e)
        traceback.print_exc()
    
# Displayed as the cell output: number of successfully split samples and the first one
len(section_valid_data), section_valid_data[0]

(49,
 {'path': 'output/hf-eval-data-v4-valid/f00055_translate_french_to_english.py',
  'content': '# requirements_file --------------------\n\nimport subprocess\n\nrequirements = ["transformers"]\n\nfor package in requirements:\n    subprocess.run([\'pip\', \'install\', \'-U\', package])\n\n# function_import --------------------\n\nfrom transformers import pipeline\n\n# function_code --------------------\n\ndef translate_french_to_english(text):\n    """\n    Translates a given text from French to English using Hugging Face\'s Transformers library.\n\n    Args:\n        text (str): The text in French to be translated.\n\n    Returns:\n        str: The translated text in English.\n\n    Raises:\n        ValueError: If the input text is not a string or if it\'s empty.\n    """\n    if not isinstance(text, str) or not text:\n        raise ValueError(\'Input text must be a non-empty string.\')\n    \n    translation_pipeline = pipeline(\'translation_fr_to_en\', model=\'Helsinki-NLP/opus-mt

In [11]:
# Write the results to a file
import json

# JSON Lines output: one JSON object per line, one line per record in
# section_valid_data.
with open(f"output/hf-eval-data-v4-valid.jsonl", 'w') as f:
    for d in section_valid_data:
        f.write(json.dumps(d) + "\n")

In [12]:
import pprint
# Pretty-print the first record to inspect the extracted fields
pprint.pp(section_valid_data[0])

{'path': 'output/hf-eval-data-v4-valid/f00055_translate_french_to_english.py',
 'content': '# requirements_file --------------------\n'
            '\n'
            'import subprocess\n'
            '\n'
            'requirements = ["transformers"]\n'
            '\n'
            'for package in requirements:\n'
            "    subprocess.run(['pip', 'install', '-U', package])\n"
            '\n'
            '# function_import --------------------\n'
            '\n'
            'from transformers import pipeline\n'
            '\n'
            '# function_code --------------------\n'
            '\n'
            'def translate_french_to_english(text):\n'
            '    """\n'
            '    Translates a given text from French to English using Hugging '
            "Face's Transformers library.\n"
            '\n'
            '    Args:\n'
            '        text (str): The text in French to be translated.\n'
            '\n'
            '    Returns:\n'
            '        str

In [67]:
# ! cp output/hf-eval-v3-240.json /root/autodl-tmp/LLaMA-Factory/data/

In [69]:
# import json

# d = {}

# with open("/root/autodl-tmp/LLaMA-Factory/data/dataset_info.json") as f:
#     d = json.loads(f.read())
#     d['hf_eval_v3_240'] = {
#         "file_name": "hf-eval-v3-240.json",
#         "columns": {
#           "prompt": "instruction",
#           "query": "input",
#           "response": "output",
#           "history": "history"
#         }
#     }
    
# with open("/root/autodl-tmp/LLaMA-Factory/data/dataset_info.json", 'w') as f:
#     f.write(json.dumps(d))

In [1]:
# SECURITY: the API token that used to be written on this line has been removed.
# Credentials must never be stored in a notebook, and the old token should be
# revoked because it has already been saved in this file.
# The `! export ...` line also had no effect: `!` runs the command in a
# short-lived subshell, so the variable never reached the kernel process.
# Export REPLICATE_API_TOKEN in the shell that launches Jupyter instead.

In [7]:
import replicate
import os

# The token is read from the environment set here and must not be hard-coded in
# the notebook (the previously committed token should be revoked).
# If it is not already exported, ask for it interactively; getpass keeps the
# value out of the cell source and the saved output.
if not os.environ.get('REPLICATE_API_TOKEN'):
    import getpass
    os.environ['REPLICATE_API_TOKEN'] = getpass.getpass("REPLICATE_API_TOKEN: ")

def get_prediction(prompt, model="vixuowis/codellama-7b-python"):
    """Run ``prompt`` through a Replicate deployment and return the generated text.

    Args:
        prompt (str): Text sent as the ``prompt`` input of the prediction.
        model (str): Deployment identifier passed to
            ``replicate.deployments.get``.

    Returns:
        str: The prediction output, with list-like output joined into one string.

    Raises:
        RuntimeError: If the prediction finishes in any state other than
            ``"succeeded"`` or finishes without output.
    """
    deployment = replicate.deployments.get(model)
    prediction = deployment.predictions.create(
      input={"prompt": prompt}
    )
    prediction.wait()

    # A failed or cancelled prediction has no output; joining None used to
    # raise an opaque TypeError that hid the real cause.
    if prediction.status != "succeeded":
        raise RuntimeError(
            f"Replicate prediction {prediction.id} ended with status "
            f"{prediction.status!r}: {prediction.error}"
        )

    output = prediction.output
    if output is None:
        raise RuntimeError(f"Replicate prediction {prediction.id} returned no output")
    if isinstance(output, str):
        return output
    return "".join(output)

In [10]:
import json
import traceback
import os

# Where the per-sample result files go: <target_dir>/<model_name>/result-NNNNN.json
target_dir = "output/hf-eval-data-v4-valid-result"
model_name = "codellama-34b-python"

# makedirs creates both directory levels in one call and tolerates directories that
# already exist, while real failures (e.g. permissions) are no longer swallowed by
# a bare `except: pass`.
os.makedirs(f"{target_dir}/{model_name}", exist_ok=True)


# Query the model once per record of the JSONL file and store record + prediction.
with open("output/hf-eval-data-v4-valid.jsonl") as f:
    for idx, l in enumerate(f):
        d = json.loads(l)
        # 1-based, zero-padded index used in the result file name
        idx_str = str(idx + 1).zfill(5)
        try:
            print(idx_str, end="...")
            d['prediction'] = get_prediction(d['instruct'], f"vixuowis/{model_name}")
            # Open the result file only after the prediction succeeded, so a failed
            # request no longer leaves an empty result file behind.
            with open(f"{target_dir}/{model_name}/result-{idx_str}.json", 'w') as fw:
                fw.write(json.dumps(d))
        except Exception as e:
            # Report the failure and continue with the next record
            print("An exception occurred:", e)
            traceback.print_exc()

00001...00002...00003...00004...00005...00006...00007...00008...00009...00010...00011...00012...00013...00014...00015...00016...00017...00018...00019...00020...00021...00022...00023...00024...00025...00026...00027...00028...00029...00030...00031...00032...00033...00034...00035...00036...00037...00038...00039...00040...00041...00042...00043...