- 准备数据集，处理 gorilla 的 instruction + code example
    - Instruction 任务说明
    - Function，接受端到端任务
    - Test function
    - Test dataset

In [2]:
import json
import os
import pprint

def load_jsonl_data(path):
    """Load a JSON Lines file into a list.

    Args:
        path: Path of a text file holding one JSON document per line.

    Returns:
        list: The decoded documents, in file order. Blank lines are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank / whitespace-only lines (e.g. an extra empty line
            # at the end of the file) instead of failing on them.
            if not line.strip():
                continue
            data.append(json.loads(line))

    return data

# Load the Hugging Face API catalogue; load_jsonl_data parses one JSON object per line.
hf_api_data = load_jsonl_data("gorilla/data/api/huggingface_api.jsonl")
# Display the number of records and the first record as a quick sanity check.
len(hf_api_data), hf_api_data[0]

(936,
 {'domain': 'Natural Language Processing Feature Extraction',
  'framework': 'Hugging Face Transformers',
  'functionality': 'Feature Extraction',
  'api_name': 'YituTech/conv-bert-base',
  'api_call': "AutoModel.from_pretrained('YituTech/conv-bert-base')",
  'api_arguments': 'N/A',
  'python_environment_requirements': 'transformers',
  'example_code': 'N/A',
  'performance': {'dataset': 'N/A', 'accuracy': 'N/A'},
  'description': 'A pre-trained ConvBERT model for feature extraction provided by YituTech, based on the Hugging Face Transformers library.'})

In [3]:
from collections import Counter

# Tally how many API entries fall under each domain, then list the domains
# from the most to the least frequent.
domain_counter_dict = Counter(entry['domain'] for entry in hf_api_data)

pprint.pp(domain_counter_dict.most_common())

[('Natural Language Processing Text2Text Generation', 41),
 ('Natural Language Processing Text Generation', 39),
 ('Natural Language Processing Sentence Similarity', 33),
 ('Computer Vision Image Classification', 33),
 ('Natural Language Processing Token Classification', 33),
 ('Natural Language Processing Zero-Shot Classification', 33),
 ('Natural Language Processing Text Classification', 32),
 ('Audio Automatic Speech Recognition', 31),
 ('Natural Language Processing Table Question Answering', 31),
 ('Computer Vision Video Classification', 30),
 ('Multimodal Text-to-Image', 30),
 ('Multimodal Image-to-Text', 30),
 ('Computer Vision Object Detection', 30),
 ('Computer Vision Image Segmentation', 30),
 ('Natural Language Processing Fill-Mask', 30),
 ('Natural Language Processing Question Answering', 29),
 ('Multimodal Document Question Answer', 29),
 ('Computer Vision Depth Estimation', 29),
 ('Computer Vision Unconditional Image Generation', 29),
 ('Audio Text-to-Speech', 29),
 ('Audi

In [4]:
# The file has a .json extension but is read line by line as JSON Lines.
hf_train_data = load_jsonl_data("gorilla/data/apibench/huggingface_train.json")
# Display the number of training records and the first record.
len(hf_train_data), hf_train_data[0]

(8191,
 {'code': "###Instruction: Write an API implementation that takes customer reviews as input and extracts features to analyze customer sentiment.\n###Output: <<<domain>>>: Natural Language Processing Feature Extraction\n<<<api_call>>>: AutoModel.from_pretrained('YituTech/conv-bert-base')\n<<<api_provider>>>: Hugging Face Transformers\n<<<explanation>>>: 1. We import the necessary classes from the transformers package. This includes AutoTokenizer and AutoModel for tokenizing and processing customer review text.\n2. We use the from_pretrained method of the AutoModel class to load the pre-trained model 'YituTech/conv-bert-base'. This model is based on ConvBERT and is suitable for feature extraction in text data.\n3. We load the customer review text, tokenize it, and use the model to extract features from the review. These features can then be used to analyze customer sentiment.\n<<<code>>>: from transformers import AutoTokenizer, AutoModel\ntokenizer = AutoTokenizer.from_pretrained(

In [5]:
# The file has a .json extension but is read line by line as JSON Lines.
hf_eval_data = load_jsonl_data("gorilla/data/apibench/huggingface_eval.json")
# Display the number of evaluation records and the second record (index 1).
len(hf_eval_data), hf_eval_data[1]

(911,
 {'code': '###Instruction: The user is interested in a tool to find relationships between medical terms.\n###Output: <<<domain>>>: Multimodal Feature Extraction\n<<<api_call>>>: AutoModel.from_pretrained(\'GanjinZero/UMLSBert_ENG\')\n<<<api_provider>>>: Hugging Face Transformers\n<<<explanation>>>: 1. We import the necessary classes from the transformers package provided by Hugging Face.\n2. We then call the "AutoModel.from_pretrained" method with the argument \'GanjinZero/UMLSBert_ENG\' to load this pretrained model.\n3. This model, which is particularly suitable for finding relationships between medical terms, can be used to convert medical terms into embeddings (dense vectors).\n4. These embeddings can then be compared to find similarities and relationships between various medical terms.\n<<<code>>>: from transformers import AutoTokenizer, AutoModel\ntokenizer = AutoTokenizer.from_pretrained(\'GanjinZero/UMLSBert_ENG\')\nmodel = AutoModel.from_pretrained(\'GanjinZero/UMLSBert_

# 1. Instruction

In [5]:
# instruction: apibench - {lib}_train.json - code - instruction

import re

def get_code_parts_from_apibench_data(data):
    """Split one APIBench record into its instruction and tagged output fields.

    The record text is expected to look like::

        ###Instruction: ...
        ###Output: <<<domain>>>: ...
        <<<api_call>>>: ...
        <<<api_provider>>>: ...
        <<<explanation>>>: ...
        <<<code>>>: ...

    Args:
        data: Mapping whose 'code' entry holds the full record text.

    Returns:
        dict: Keys 'instruction' (everything before the first "###Output"
        marker, including the "###Instruction:" prefix), 'domain', 'api_call',
        'api_provider' (each a single line) and 'code' (everything after the
        code tag, stripped of surrounding whitespace).

    Raises:
        KeyError: If data has no 'code' entry.
        ValueError: If the "###Output" marker or one of the tagged fields is
            missing from the record.
    """
    text = data['code']
    # maxsplit=1: only the first marker separates the instruction from the
    # output. A later "###Output" (e.g. inside the code sample) belongs to the
    # output and must not break the two-way unpacking.
    instruction, output = text.split("\n###Output", 1)

    def _extract(tag, pattern, flags=0):
        # Search the output part only, so tag-like text inside the instruction
        # can never be mistaken for a field.
        match = re.search(pattern, output, flags)
        if match is None:
            raise ValueError(f"APIBench record has no <<<{tag}>>> field")
        return match.group(1)

    # Single-line fields end at the next newline; the code field runs to the
    # end of the record, hence DOTALL and the greedy match.
    domain = _extract('domain', r'<<<domain>>>: (.+?)\n')
    api_call = _extract('api_call', r'<<<api_call>>>: (.+?)\n')
    api_provider = _extract('api_provider', r'<<<api_provider>>>: (.+?)\n')
    code = _extract('code', r'<<<code>>>: (.+)', re.DOTALL).strip()

    return {
        'instruction': instruction,
        'domain': domain,
        'api_call': api_call,
        'api_provider': api_provider,
        'code': code
    }

# Parse the first evaluation record to check the extraction.
d = hf_eval_data[0]
code_parts = get_code_parts_from_apibench_data(d)
# Display all parsed fields, plus the instruction on its own.
code_parts, code_parts['instruction']

({'instruction': '###Instruction: Design a feature for a social media website to recommend articles to users based on how similar the articles are to their previously liked articles.',
  'domain': 'Natural Language Processing Sentence Similarity',
  'api_call': "AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base')",
  'api_provider': 'Hugging Face Transformers',
  'code': "from transformers import AutoTokenizer, AutoModel\ntokenizer = AutoTokenizer.from_pretrained('princeton-nlp/unsup-simcse-roberta-base')\nmodel = AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base')"},
 '###Instruction: Design a feature for a social media website to recommend articles to users based on how similar the articles are to their previously liked articles.')

In [6]:
# Print the parsed instruction next to the raw record for comparison.
for d in hf_eval_data:
    code_parts = get_code_parts_from_apibench_data(d)
    print(code_parts['instruction'])
    pprint.pp(d)
    # Only the first record is inspected.
    break

###Instruction: Design a feature for a social media website to recommend articles to users based on how similar the articles are to their previously liked articles.
{'code': '###Instruction: Design a feature for a social media website to '
         'recommend articles to users based on how similar the articles are to '
         'their previously liked articles.\n'
         '###Output: <<<domain>>>: Natural Language Processing Sentence '
         'Similarity\n'
         '<<<api_call>>>: '
         "AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base')\n"
         '<<<api_provider>>>: Hugging Face Transformers\n'
         '<<<explanation>>>:1. We first import the necessary classes and '
         'modules from the transformers package. This includes AutoTokenizer '
         'and AutoModel for loading the pre-trained models from Hugging Face.\n'
         '2. We use the AutoModel.from_pretrained() method to load the '
         "'princeton-nlp/unsup-simcse-roberta-base' model,

# 2. Function / Test Function
- code part -> gpt -> function
- dataset 问题，先通过 prompt 解决一部分，需要对应到 huggingface dataset 名称才能对应
- prompt:
    generate the following code based on the above information:
    1. function with：
    - detailed comments
    - function description
    2. test function with：
    - test dataset
    - using assert in test function
    - do not compare number strictly
    - if dataset is provided in performance - dataset, load the dataset, then select several sample from the dataset, otherwise, using online source, do not leave blank


In [7]:
! pip install langchain -U

Looking in indexes: http://mirrors.aliyun.com/pypi/simple


In [13]:
from typing import Optional

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
    create_openai_fn_runnable,
    create_structured_output_runnable,
)
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import ChatPromptTemplate

def get_chat_model():
    """Build the Azure OpenAI chat model used for code generation.

    Credentials are read from the environment so that no secret is stored in
    the notebook:

    - AZURE_OPENAI_API_KEY (required): the API key.
    - AZURE_OPENAI_ENDPOINT (optional): the endpoint URL; defaults to the
      endpoint this notebook has always used.

    Returns:
        AzureChatOpenAI: Chat model bound to the "gpt-4-32k" deployment with
        temperature 0.0.

    Raises:
        RuntimeError: If AZURE_OPENAI_API_KEY is not set.
    """
    base_url = os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://autoagents-ca-east.openai.azure.com/"
    )
    # The key used to be hard-coded here; any key that was ever committed
    # with this notebook should be considered leaked and rotated.
    api_key = os.environ.get("AZURE_OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "AZURE_OPENAI_API_KEY is not set; export it before running this notebook"
        )

    model = AzureChatOpenAI(
        temperature=0.0,
        openai_api_base=base_url,
        openai_api_version="2023-08-01-preview",
        deployment_name="gpt-4-32k",
        openai_api_key=api_key,
        openai_api_type="azure",
    )
    return model

# def get_chat_model():
#     BASE_URL = "https://autoagents-global.openai.azure.com"
#     API_KEY = os.environ["AZURE_OPENAI_API_KEY"]  # key redacted: never commit credentials
#     model = AzureChatOpenAI(
#         temperature=0.0,
#         openai_api_base=BASE_URL,
#         openai_api_version="2023-08-01-preview",
#         deployment_name="gpt-35-turbo-16k",
#         openai_api_key=API_KEY,
#         openai_api_type="azure",
#     )
#     return model


######################################################################
from langchain.pydantic_v1 import BaseModel, Field

# Structured-output schema for one generated sample. Every field is a required
# string (Field(...)).
# NOTE(review): the class docstring and the field descriptions are presumably
# forwarded to the model as the function schema, so they act as prompt text
# rather than ordinary documentation -- confirm before rewording them.
class CodeResp(BaseModel):
    """
    generate function_code and test_function_code based on input.
    
    function comments should follow Google Python Style Guide, includes args, returns, and raises
    """
    # Name of the generated function; the generation loop uses it in output file names.
    function_name: str = Field(..., description="function name")
    # First-pass import lines for the generated function.
    function_import: str = Field(..., description="import nessary lib before function code")
    # Source of the generated function itself.
    function_code: str = Field(..., description="standalone function with:\n- detailed comments\n- function description")
    # Source of the generated test function.
    test_function_code: str = Field(..., description="standalone test function")
    # Statement(s) that invoke the test function at the end of the file.
    call_test_function_code: str = Field(..., description="test function call in the end of test function file")
    # Import lines after the model re-checks them against the function body;
    # this is the variant written to the generated .py files.
    function_import_fixed: str = Field(..., description="import nessary lib before function code, fixed by checking the function, e.g. 'import torch' or 'import numpy as np' if needed")
    # Contents of the requirements file for the generated code.
    requirements_file: str =  Field(..., description="dependency packages needed to install, usage: pip install -r requirements.txt, do not specific the version")
    
######################################################################
from langchain.chains.openai_functions import (
    convert_to_openai_function,
    get_openai_output_parser,
)
    
def get_function_from_data(data, err):
    """Ask the chat model to generate a function and its test for one record.

    Builds a chat prompt from the record, a previously captured error log and
    a docstring style description, then runs it through an OpenAI
    function-calling runnable constrained to the CodeResp schema.

    Args:
        data: The API record; it is inserted into the prompt as str(data).
        err: Error text from an earlier attempt (may be empty); it is inserted
            into the prompt as str(err).

    Returns:
        The value returned by the runnable; the cell outputs in this notebook
        show a CodeResp instance.

    Raises:
        Exception: Whatever the runnable raises. The tracebacks captured in
            this notebook show json.decoder.JSONDecodeError ("Invalid \\escape")
            being raised while the model's reply is parsed.
    """
    # Docstring style rules that are pasted into the prompt verbatim.
    code_desc = """
    generate function_code and test_function_code based on input.
    
    function comment can be descripted as following
    
    Certain aspects of a function should be documented in special sections, listed below. Each section begins with a heading line, which ends with a colon. All sections other than the heading should maintain a hanging indent of two or four spaces (be consistent within a file). These sections can be omitted in cases where the function’s name and signature are informative enough that it can be aptly described using a one-line docstring.

    Args:
        List each parameter by name. A description should follow the name, and be separated by a colon followed by either a space or newline. If the description is too long to fit on a single 80-character line, use a hanging indent of 2 or 4 spaces more than the parameter name (be consistent with the rest of the docstrings in the file). The description should include required type(s) if the code does not contain a corresponding type annotation. If a function accepts *foo (variable length argument lists) and/or **bar (arbitrary keyword arguments), they should be listed as *foo and **bar.
    Returns: (or Yields: for generators)
        Describe the semantics of the return value, including any type information that the type annotation does not provide. If the function only returns None, this section is not required. It may also be omitted if the docstring starts with Returns or Yields (e.g. \"\"\"Returns row from Bigtable as a tuple of strings.\"\"\") and the opening sentence is sufficient to describe the return value. Do not imitate older ‘NumPy style’ (example), which frequently documented a tuple return value as if it were multiple return values with individual names (never mentioning the tuple). Instead, describe such a return value as: “Returns: A tuple (mat_a, mat_b), where mat_a is …, and …”. The auxiliary names in the docstring need not necessarily correspond to any internal names used in the function body (as those are not part of the API).
    Raises:
        List all exceptions that are relevant to the interface followed by a description. Use a similar exception name + colon + space or newline and hanging indent style as described in Args:. You should not document exceptions that get raised if the API specified in the docstring is violated (because this would paradoxically make behavior under violation of the API part of the API).
    """
    
    # {input}, {err} and {code_desc} are template variables filled in by invoke() below.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a world class algorithm for recording entities."),
            (
                "human",
                """
basic api info:
{input}

consider potential err:
{err}

function comments should follow:
{code_desc}

generate following code based on above information:
1. function with:
    - detailed comments
    - function description
    - not using " in file
2. test function with:
    - 3-5 test cases
        - for multi-media task: try to load official dataset, or using online resource url
        - e.g. https://placekitten.com/200/300
    - using assert in test function
    - do not compare number strictly
    - return 'All Tests Passed' in final if every assertion passed
    - do not use fake file url in test case
    - not using " in file
3. requirements_file with no specific package version
""",
            ),
            ("human", "Tip: Make sure to answer in the correct format"),
        ]
    )
    
    # A new model client and runnable are created on every call.
    llm = get_chat_model()
    runnable = create_openai_fn_runnable([CodeResp], llm, prompt)
    
    resp = runnable.invoke({"input": str(data), "err": str(err), "code_desc": code_desc})
    return resp

import glob

def get_v2_err(base_dir, idx, max_lines=50):
    """Return the tail of the error log recorded for sample number idx.

    Logs are looked up as "<base_dir>/f<idx zero-padded to 5 digits>_*.err".
    Lines containing 'Downloading' (download progress noise) are dropped
    before the tail is taken.

    Args:
        base_dir: Directory that holds the .err files.
        idx: Sample number as used in the file names.
        max_lines: Maximum number of trailing log lines to keep. Defaults to 50.

    Returns:
        str: The last max_lines retained lines joined together, or "" when no
        log exists for idx or max_lines is not positive.
    """
    formatted_number = str(idx).zfill(5)
    # glob returns paths in arbitrary order; sort so that the same log is
    # chosen on every run when several files match.
    matching_files = sorted(glob.glob(f"{base_dir}/f{formatted_number}_*.err"))
    if not matching_files:
        return ""

    # Logs may contain bytes that are not valid UTF-8 (progress bars, binary
    # noise); replace them rather than failing to read the log at all.
    with open(matching_files[0], encoding="utf-8", errors="replace") as f:
        lines = [line for line in f if 'Downloading' not in line]

    if max_lines <= 0:
        # lines[-0:] would return the whole list, so handle this explicitly.
        return ""
    return "".join(lines[-max_lines:])

In [16]:
# Try the generation on a single evaluation record before running the full loop.
idx = 6
print(hf_eval_data[idx])

# Error logs are looked up by idx + 1, the same numbering the generation loop
# uses for its file names.
err = get_v2_err("output/hf-eval-data-v2", idx + 1)
print(err)

resp = get_function_from_data(hf_eval_data[idx], err)
# Display the structured response.
resp

{'code': "###Instruction: Create a program to generate a description for an image provided as input.\n###Output: <<<domain>>>: Multimodal Image-to-Text\n<<<api_call>>>: pipeline('text-generation', model='microsoft/git-large-r-textcaps')\n<<<api_provider>>>: Hugging Face Transformers\n<<<explanation>>>:1. Import the pipeline function from the transformers library provided by Hugging Face.\n2. Use the pipeline function to create a text generation model.\n3. Specify the model 'microsoft/git-large-r-textcaps' to be loaded. This model has been fine-tuned on the TextCaps dataset and is capable of generating image descriptions based on the content of the image.\n4. The created model can be used to generate a description for a given input image by simply passing the image into the pipeline's generate method.\n<<<code>>>: from transformers import pipeline\ndescription_generator = pipeline('text-generation', model='microsoft/git-large-r-textcaps')\nimage_description = description_generator(image

CodeResp(function_name='generate_image_description', function_import='from transformers import pipeline\nfrom PIL import Image\nimport requests\nfrom io import BytesIO', function_code='def generate_image_description(image_url: str) -> str:\n    """\n    Generate a description for an image provided as input.\n\n    Args:\n        image_url (str): The URL of the image to be described.\n\n    Returns:\n        str: The generated description of the image.\n\n    Raises:\n        Exception: If the image cannot be loaded from the provided URL.\n    """\n    try:\n        response = requests.get(image_url)\n        image = Image.open(BytesIO(response.content))\n        description_generator = pipeline(\'text-generation\', model=\'microsoft/git-large-r-textcaps\')\n        image_description = description_generator(image)\n        return image_description\n    except Exception as e:\n        raise Exception(\'Failed to load image from URL: \' + str(e))', test_function_code='def test_generate_im

In [17]:
# Show the source record, then each generated field preceded by a separator line.
print(hf_eval_data[idx])
for field_name in (
    "requirements_file",
    "function_import",
    "function_import_fixed",
    "function_code",
    "test_function_code",
    "call_test_function_code",
):
    print("----------------------")
    print(getattr(resp, field_name))

{'code': "###Instruction: Create a program to generate a description for an image provided as input.\n###Output: <<<domain>>>: Multimodal Image-to-Text\n<<<api_call>>>: pipeline('text-generation', model='microsoft/git-large-r-textcaps')\n<<<api_provider>>>: Hugging Face Transformers\n<<<explanation>>>:1. Import the pipeline function from the transformers library provided by Hugging Face.\n2. Use the pipeline function to create a text generation model.\n3. Specify the model 'microsoft/git-large-r-textcaps' to be loaded. This model has been fine-tuned on the TextCaps dataset and is capable of generating image descriptions based on the content of the image.\n4. The created model can be used to generate a description for a given input image by simply passing the image into the pipeline's generate method.\n<<<code>>>: from transformers import pipeline\ndescription_generator = pipeline('text-generation', model='microsoft/git-large-r-textcaps')\nimage_description = description_generator(image

In [18]:
import traceback

# Generate a function + test file for every evaluation record.
#
# For record number N (1-based, zero-padded to 5 digits) three files are
# written to output_dir: f<N>_<function_name>.prompt (the model input),
# .txt (requirements) and .py (imports, function, test, test call).
err_dir = "output/hf-eval-data-v2"
output_dir = "output/hf-eval-data-v3"
max_retries = 3

# Create the destination up front; otherwise every open(..., 'w') below fails
# and is misreported as a generation failure after three wasted model calls.
os.makedirs(output_dir, exist_ok=True)

for idx, d in enumerate(hf_eval_data):
    print(idx + 1, end="...")
    formatted_number = str(idx + 1).zfill(5)

    # Skip records that were already generated. The .py file is written last,
    # so its presence means the record is complete; a run interrupted after
    # only the .prompt/.txt files were written is regenerated, not skipped.
    matching_files = glob.glob(f"{output_dir}/f{formatted_number}_*.py")
    if matching_files:
        print("skip", end="...")
        continue

    retry_count = 0

    while retry_count < max_retries:
        try:
            err = get_v2_err(err_dir, idx + 1)
            resp = get_function_from_data(d, err)

            # function_name comes from the model: keep only characters that are
            # safe in a file name so it cannot escape output_dir or break open().
            safe_name = re.sub(r"[^0-9A-Za-z_]+", "_", resp.function_name) or "unnamed"
            base_path = f"{output_dir}/f{formatted_number}_{safe_name}"

            # Generated text may contain non-ASCII characters, so always write
            # UTF-8 instead of relying on the platform default encoding.

            # Write the prompt inputs.
            with open(f"{base_path}.prompt", 'w', encoding="utf-8") as f:
                f.write(str(d) + "\n\n")
                f.write(str(err))

            # Write the requirements.
            with open(f"{base_path}.txt", 'w', encoding="utf-8") as f:
                f.write(resp.requirements_file)

            # Write the Python file (last: it marks the record as complete).
            with open(f"{base_path}.py", 'w', encoding="utf-8") as f:
                f.write("# function_import --------------------\n\n")
                f.write(resp.function_import_fixed)

                f.write("\n\n# function_code --------------------\n\n")
                f.write(resp.function_code)

                f.write("\n\n# test_function_code --------------------\n\n")
                f.write(resp.test_function_code)

                f.write("\n\n# call_test_function_code --------------------\n\n")
                f.write(resp.call_test_function_code)

            break  # Break out of the loop if the operation is successful
        except Exception:
            retry_count += 1
            if retry_count < max_retries:
                print(f"Retrying... (Attempt {retry_count}/{max_retries})", end="...")
            else:
                # Give up on this record only; the outer loop moves on to the next one.
                print("Max retries reached. Exiting.", end="...")
                traceback.print_exc()


1...skip...2...skip...3...skip...4...skip...5...skip...6...skip...7...skip...8...9...skip...10...skip...11...skip...12...13...14...skip...15...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....16...skip...17...skip...18...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 222 (char 1388)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

19...skip...20...21...22...skip...23...skip...24...25...26...skip...27...skip...28...29...skip...30...31...skip...32...skip...33...34...35...skip...36...37...skip...38...39...skip...40...41...skip...42...skip...43...44...45...46...47...48...49...50...51...52...53...54...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....55...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 296 (char 909)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

56...57...58...59...60...61...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....62...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 763 (char 1804)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

63...64...65...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....66...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 168 (char 741)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

67...68...69...70...71...72...73...74...75...76...77...78...79...80...81...82...83...84...85...86...87...88...89...90...91...92...93...94...95...96...97...98...99...100...101...102...103...104...105...106...107...108...109...110...111...112...113...114...115...116...117...118...119...120...121...122...123...124...125...126...127...128...129...130...Retrying... (Attempt 1/3)...131...132...133...134...135...136...137...138...139...140...141...142...143...144...145...146...147...148...149...150...151...152...153...154...155...156...157...158...159...160...161...162...163...164...165...166...167...168...169...170...171...172...173...174...175...176...177...178...179...Retrying... (Attempt 1/3)...180...181...182...183...184...185...186...187...188...189...190...191...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...192...193...194...195...196...197...198...199...200...201...202...203...204...205...206...207...208...209...210...211...212...213...214...215...Retrying... (Attempt 1/3)..

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 365 (char 1319)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

288...289...290...291...292...293...294...295...296...297...298...299...300...301...302...303...304...305...306...307...308...309...310...311...312...313...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...314...315...316...317...318...319...320...321...322...323...324...325...326...327...Retrying... (Attempt 1/3)...328...329...330...331...332...333...334...335...336...337...338...339...340...341...342...343...344...345...346...347...348...349...350...351...352...Retrying... (Attempt 1/3)...353...354...355...356...357...358...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....359...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 207 (char 916)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

360...361...362...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....363...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 398 (char 1103)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

364...365...366...367...368...369...370...371...372...373...374...375...376...377...378...379...380...381...382...383...384...385...386...387...388...389...390...391...392...393...394...395...396...397...398...399...400...401...402...403...404...405...406...407...408...409...410...411...412...413...414...415...416...417...418...419...420...421...422...423...424...Retrying... (Attempt 1/3)...425...426...427...428...429...430...431...432...433...434...435...436...437...438...439...440...441...442...443...444...445...446...447...448...449...450...451...452...453...454...455...456...457...458...459...460...461...462...463...464...465...466...467...468...469...470...471...472...473...474...475...476...477...478...479...480...481...482...483...484...485...486...487...488...489...490...491...492...493...494...495...496...497...498...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...499...500...501...502...503...504...505...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retr

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 147 (char 1213)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

507...508...509...510...511...512...513...514...Retrying... (Attempt 1/3)...515...516...517...518...519...520...521...522...523...524...525...526...527...528...529...530...531...532...533...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...534...535...536...537...538...539...540...541...542...543...544...545...546...547...548...549...550...551...552...553...554...555...556...557...558...559...560...561...562...563...564...565...566...567...568...569...570...571...572...573...574...575...576...577...578...579...580...581...582...583...584...585...586...587...588...589...590...591...592...593...594...595...596...597...598...599...600...601...602...603...604...605...606...607...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....608...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 202 (char 821)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

609...610...611...612...613...614...615...616...617...618...619...620...621...622...623...624...625...626...627...628...Retrying... (Attempt 1/3)...629...630...631...632...633...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....634...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 209 (char 887)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

635...Retrying... (Attempt 1/3)...636...637...638...639...640...641...642...643...644...645...646...647...648...649...650...651...652...653...Retrying... (Attempt 1/3)...654...655...656...657...658...659...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...660...661...662...663...664...665...666...667...668...669...670...671...672...673...674...675...676...677...678...679...680...681...682...683...684...685...686...687...688...689...690...691...692...693...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...694...695...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....696...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 171 (char 800)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

697...698...699...700...701...702...703...704...705...706...707...708...709...710...711...712...713...714...715...716...717...718...719...720...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....721...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 402 (char 1251)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

722...723...724...725...726...727...728...729...730...731...732...733...734...735...736...737...738...739...740...741...742...743...744...745...746...747...748...749...750...751...752...753...754...755...756...757...758...759...760...761...762...763...764...765...766...767...768...769...770...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....771...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 202 (char 1081)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

772...773...774...775...776...777...778...779...780...781...782...783...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....784...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 276 (char 1104)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

785...786...787...788...789...790...791...792...793...794...795...796...797...798...799...800...801...802...803...804...805...806...807...808...809...810...811...812...813...814...815...816...817...818...819...820...821...822...823...824...825...826...827...828...829...830...831...832...833...834...835...836...837...838...839...840...841...842...843...844...845...846...847...848...849...850...851...852...853...854...855...856...857...858...859...860...861...862...863...864...865...866...867...868...869...870...871...872...873...874...875...876...877...878...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...879...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....880...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 468 (char 1371)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

881...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....882...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 5 column 290 (char 1071)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117,

883...884...885...Retrying... (Attempt 1/3)...886...887...888...889...890...891...892...893...894...895...896...Retrying... (Attempt 1/3)...Retrying... (Attempt 2/3)...Max retries reached. Exiting....897...

Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/main.py", line 539, in parse_raw
    obj = load_str_bytes(
  File "/root/miniconda3/lib/python3.8/site-packages/pydantic/v1/parse.py", line 37, in load_str_bytes
    return json_loads(b)
  File "/root/miniconda3/lib/python3.8/json/__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/root/miniconda3/lib/python3.8/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Invalid \escape: line 4 column 699 (char 946)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/tmp/ipykernel_5093/1340527589.py", line 22, in <cell line: 6>
    resp = get_function_from_data(d, err)
  File "/tmp/ipykernel_5093/1013548028.py", line 117, 

898...899...900...901...902...903...904...Retrying... (Attempt 1/3)...905...906...907...908...909...910...911...

# 3. Dataset
- search from {lib}

# 4. Analyze

In [28]:
import glob
import os
import shutil

# Filter the generated evaluation artifacts: for every sample in `hf_eval_data`
# look up its `f<5-digit index>_*` files in `output_dir`, keep only the samples
# whose `.err` file has no 'Error' and whose `.out` file has no 'failed', and
# copy all files of those samples into `valid_dir`.
output_dir = "output/hf-eval-data-v3"
valid_dir = "output/hf-eval-data-v3-valid"

# shutil.copy does not create missing parent directories, so make sure the
# destination exists before anything is copied into it.
os.makedirs(valid_dir, exist_ok=True)

matched = 0  # samples for which a .err file was found
valid = 0    # samples that passed both the .err and the .out check

for idx, d in enumerate(hf_eval_data):
    print(idx + 1, end="...")
    # File names carry the 1-based sample index, zero-padded to 5 digits.
    formatted_number = str(idx + 1).zfill(5)

    # check err: samples without a .err file are not counted at all
    matching_files_err = glob.glob(f"{output_dir}/f{formatted_number}_*.err")
    if not matching_files_err:
        continue

    matched += 1

    with open(matching_files_err[0]) as f:
        if 'Error' in f.read():
            continue

    # check out: a missing .out file also disqualifies the sample
    matching_files_out = glob.glob(f"{output_dir}/f{formatted_number}_*.out")
    if not matching_files_out:
        continue

    with open(matching_files_out[0]) as f:
        if 'failed' in f.read():
            continue

    # copy files: every artifact of the sample, not only .err/.out
    for source_file in glob.glob(f"{output_dir}/f{formatted_number}_*"):
        # Build the destination from the bare file name instead of splitting
        # on `output_dir`, which depends on the exact path text glob returns.
        dest_file = os.path.join(valid_dir, os.path.basename(source_file))
        shutil.copy(source_file, dest_file)

    valid += 1

# (valid count, matched count, pass rate); the rate falls back to 0.0 when no
# .err file matched, instead of raising ZeroDivisionError.
valid, matched, (valid / matched if matched else 0.0)

1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...40...41...42...43...44...45...46...47...48...49...50...51...52...53...54...55...56...57...58...59...60...61...62...63...64...65...66...67...68...69...70...71...72...73...74...75...76...77...78...79...80...81...82...83...84...85...86...87...88...89...90...91...92...93...94...95...96...97...98...99...100...101...102...103...104...105...106...107...108...109...110...111...112...113...114...115...116...117...118...119...120...121...122...123...124...125...126...127...128...129...130...131...132...133...134...135...136...137...138...139...140...141...142...143...144...145...146...147...148...149...150...151...152...153...154...155...156...157...158...159...160...161...162...163...164...165...166...167...168...169...170...171...172...173...174...175...176...177...178...179...180...181...182...183...184...185.

(237, 894, 0.2651006711409396)

In [29]:
# Scale the pass rate back to 894 samples; the literal equals the `matched`
# count shown by the previous cell's output, so this reproduces `valid` as a float.
valid * 894 / matched

237.0

# 5. Evaluation

- 将文件处理为 prompt
- 运行 CodeLlama
- 评估结果

In [19]:
# Example completion prompt: the import section, a function signature and its
# docstring. Single-quoted triple delimiters are used so the embedded docstring
# quotes need no escaping; the resulting string is unchanged.
prompt = '''
# function_import --------------------

from transformers import AutoTokenizer, AutoModel
import torch

# function_code --------------------

def extract_medical_term_relationships(medical_term):
    """
    This function uses the pretrained model 'GanjinZero/UMLSBert_ENG' from Hugging Face Transformers to find relationships between medical terms.
    It converts the medical terms into embeddings (dense vectors) which can be compared to find similarities and relationships.

    Args:
        medical_term (str): The medical term to be converted into an embedding.

    Returns:
        torch.Tensor: The embedding of the input medical term.
    """
'''

In [9]:
# NOTE(review): IPython runs each `!` command in its own subshell, so this
# activation presumably does not carry over to the notebook kernel or to later
# cells — confirm which environment the kernel actually uses.
! source activate py38

In [11]:
# Smoke test: build a sentiment-analysis pipeline without naming a model.
# NOTE(review): the recorded run of this cell failed with an ImportError for
# `get_full_repo_name` from `huggingface_hub`; presumably the installed
# transformers and huggingface_hub versions are incompatible — TODO confirm
# and align them before re-running.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

ImportError: cannot import name 'get_full_repo_name' from 'huggingface_hub' (/root/miniconda3/lib/python3.8/site-packages/huggingface_hub/__init__.py)

In [10]:
# Use a pipeline as a high-level helper
# Builds a text-generation pipeline for the CodeLlama 7B Python checkpoint.
# NOTE(review): the recorded run of this cell failed with an ImportError for
# `get_full_repo_name` from `huggingface_hub`; presumably the installed
# transformers and huggingface_hub versions are incompatible — TODO confirm
# and align them before re-running.
from transformers import pipeline

pipe = pipeline("text-generation", model="codellama/CodeLlama-7b-Python-hf")

ImportError: cannot import name 'get_full_repo_name' from 'huggingface_hub' (/root/miniconda3/lib/python3.8/site-packages/huggingface_hub/__init__.py)