# ToolCall Experiment Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json, re, os, sys
from tqdm import tqdm
from loguru import logger
import random
from typing import List, Dict, Any, Tuple, Union
from copy import deepcopy
import numpy as np
from textwrap import dedent

In [3]:
current_dir = os.path.dirname(os.path.abspath(os.getcwd()))
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

In [4]:
from utils import build_instruction, build_response_from_functioncalls, toolcall_to_functions, format_chat

### APIs

In [6]:
round_a_apis_file = "./round_a_apis.json"
round_b_apis_file = "./round_b_apis.json"
round_a_apis = json.load(open(round_a_apis_file))
round_b_apis = json.load(open(round_b_apis_file))
toolcall_apis = {}
toolcall_apis.update(round_a_apis)
toolcall_apis.update(round_b_apis)
len(toolcall_apis), len(round_a_apis), len(round_b_apis)

(5991, 887, 5188)

In [6]:
toolcall_apis_file = "toolcall_apis.json"
with open(toolcall_apis_file, "w", encoding="utf-8") as f:
    json.dump(toolcall_apis, f, ensure_ascii=False, indent=2)
print(f"Saved {len(toolcall_apis)} toolcall apis to {toolcall_apis_file}")

Saved 5991 toolcall apis to toolcall_apis.json


## Synthetic Dataset

In [5]:
synthetic_data_single_api_file = "./synthetic_data/synthetic_data_native-single_api-v6_0314.jsonl"
synthetic_data_multi_api_file = "./synthetic_data/synthetic_data_native-multi_apis-0316.jsonl"
synthetic_data_dev_file = "./synthetic_data/synthetic_data_native-multi_apis-0318.jsonl"
synthetic_data_multi_api_0315_file = "./synthetic_data/synthetic_data_native-multi_apis-0315.jsonl"

synthetic_data_single_api = [json.loads(line) for line in open(synthetic_data_single_api_file)]
synthetic_data_multi_apis = [json.loads(line) for line in open(synthetic_data_multi_api_file)]
synthetic_data_multi_apis_0315 = [json.loads(line) for line in open(synthetic_data_multi_api_0315_file)]
synthetic_data_dev = [json.loads(line) for line in open(synthetic_data_dev_file)]

In [13]:

num_single_api = 0
num_2_apis = 0
num_3_apis = 0
num_multi_apis = 0
num_dup_functionalls = 0
num_empty_apis = 0

# output_single_file = "./synthetic_data/synthetic_data-single_api-0314.jsonl"
# output_multi_file = "./synthetic_data/synthetic_data-multi_apis-0314.jsonl"
# output_empty_file = "./synthetic_data/synthetic_data-empty_apis-0314.jsonl"
# synthetic_data = synthetic_data_single_api
# empty_threshold = 0.0

output_single_file = "./synthetic_data/synthetic_data-single_api-0315.jsonl"
output_multi_file = "./synthetic_data/synthetic_data-multi_apis-0315.jsonl"
output_empty_file = "./synthetic_data/synthetic_data-empty_apis-0315.jsonl"
synthetic_data = synthetic_data_multi_apis_0315
empty_threshold = 0.9

# output_single_file = "./synthetic_data/synthetic_data-single_api-0316.jsonl"
# output_multi_file = "./synthetic_data/synthetic_data-multi_apis-0316.jsonl"
# output_empty_file = "./synthetic_data/synthetic_data-empty_apis-0316.jsonl"
# empty_threshold = 0.9
# synthetic_data = synthetic_data_multi_apis


# output_single_file = "./synthetic_data/synthetic_data-single_api-0318.jsonl"
# output_multi_file = "./synthetic_data/synthetic_data-multi_apis-0318.jsonl"
# output_empty_file = "./synthetic_data/synthetic_data-empty_apis-0318.jsonl"
# empty_threshold = 0.9
# synthetic_data = synthetic_data_dev


single_apis = []
multi_apis = []
empty_apis = []
for data in tqdm(synthetic_data):
    # print(data)
    if "api_name" in data:
        _ = data.pop("api_name")
    dialogue = data["dialogue"]
    # print(dialogue)

    if not isinstance(dialogue, list):
        continue
    bad = False
    for turn in dialogue:
        if not isinstance(turn, dict):
            bad = True
            break
        if "from" not in turn:
            bad = True
            break
        if "value" not in turn:
            bad = True
            break
    if bad:
        continue

    # 找到最后一个assistant的回复内容，可能是functioncall，也可能是其他内容
    has_functioncall = False
    has_empty = False
    for i, turn in enumerate(reversed(dialogue)):
        if turn["from"] == "ASSISTANT":
            if "value" not in turn:
                break
            content = turn["value"]
            # print(content)
            if "<functioncall>" in content:
                if len(re.findall(r"<functioncall>", content)) > 1 or len(re.findall(r"</functioncall>", content)) > 1:
                    logger.warning(f"Must only 1 <functioncall> pair in {content=}")
                    break
                # if has_functioncall:
                #     num_dup_functionalls += 1
                #     continue
                functioncall = content.split("<functioncall>")[1].split("</functioncall>")[0]
                try:
                    functions = json.loads(functioncall)
                except:
                    logger.warning(f"{functioncall=}")
                    break

                if not isinstance(functions, list):
                    functions = [functions]


                if not all([isinstance(function, dict) for function in functions]):
                    break


                if len(functions) == 0:
                    print(functions)
                    pass
                if len(functions) == 1:
                    if not isinstance(functions[0], dict):
                        break
                    has_functioncall = True
                    num_single_api += 1
                    turn["value"] = ("<functioncall>" + json.dumps(functions, ensure_ascii=False) + "</functioncall>")
                    new_data = deepcopy(data)
                    new_data["dialogue"] = dialogue[:len(dialogue) - i]
                    single_apis.append(new_data)
                elif len(functions) >= 2:
                    has_functioncall = True
                    num_multi_apis += 1
                    turn["value"] = ("<functioncall>" + json.dumps(functions, ensure_ascii=False) + "</functioncall>")
                    new_data = deepcopy(data)
                    new_data["dialogue"] = dialogue[:len(dialogue) - i]
                    multi_apis.append(new_data)
                    if len(functions) == 2:
                        num_2_apis += 1
                    elif len(functions) == 3:
                        num_3_apis += 1
                else:
                    raise ValueError(f"{functions=}")
                break
            else:
                if not has_empty:
                    if random.random() > empty_threshold:
                        empty_apis.append(data)
                    num_empty_apis += 1
                    has_empty = True
                # logger.debug(f"{data=}")

# Only keep USER and ASSISTANT turns
for api_data in [single_apis, multi_apis, empty_apis]:
    for data in api_data:
        dialogue = data["dialogue"]
        new_dialogue = []
        if isinstance(dialogue[0], list):
            dialogue = dialogue[0]
        # print(dialogue)
        for turn in dialogue:
            if turn["from"] in ["USER", "ASSISTANT"]:
                new_dialogue.append(turn)
        data["dialogue"] = new_dialogue
                
print(num_dup_functionalls)
print(num_single_api, num_2_apis, num_3_apis, num_multi_apis, num_empty_apis, len(synthetic_data) - num_single_api - num_2_apis - num_3_apis - num_multi_apis - num_empty_apis)

with open(output_single_file, "w") as f:
    for data in single_apis:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(single_apis)} single-apis to {output_single_file}")
with open(output_multi_file, "w") as f:
    for data in multi_apis:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(multi_apis)} multi-apis to {output_multi_file}")
with open(output_empty_file, "w") as f:
    for data in empty_apis:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(empty_apis)} empty-apis to {output_empty_file}")

100%|██████████| 13287/13287 [00:00<00:00, 56520.95it/s]


0
2834 10329 67 10414 13221 -23578
Saved 2834 single-apis to ./synthetic_data/synthetic_data-single_api-0315.jsonl
Saved 10414 multi-apis to ./synthetic_data/synthetic_data-multi_apis-0315.jsonl
Saved 1295 empty-apis to ./synthetic_data/synthetic_data-empty_apis-0315.jsonl


In [14]:

synthetic_single_files = ["./synthetic_data/synthetic_data-single_api-0314.jsonl",
                          "./synthetic_data/synthetic_data-single_api-0315.jsonl",  
                          "./synthetic_data/synthetic_data-single_api-0316.jsonl",  
                          "./synthetic_data/synthetic_data-single_api-0318.jsonl"]
synthetic_multi_files = ["./synthetic_data/synthetic_data-multi_apis-0314.jsonl",
                         "./synthetic_data/synthetic_data-multi_apis-0315.jsonl",
                         "./synthetic_data/synthetic_data-multi_apis-0316.jsonl",
                         "./synthetic_data/synthetic_data-multi_apis-0318.jsonl"]
synthethic_empty_files = ["./synthetic_data/synthetic_data-empty_apis-0314.jsonl",
                          "./synthetic_data/synthetic_data-empty_apis-0315.jsonl",
                          "./synthetic_data/synthetic_data-empty_apis-0316.jsonl",
                          "./synthetic_data/synthetic_data-empty_apis-0318.jsonl"]
os.makedirs("./synthetic_data/0324", exist_ok=True)
output_synthetic_single_api_file = "./synthetic_data/0324/synthetic_data-single_api-0324.jsonl"
output_sythetic_multi_apis_file = "./synthetic_data/0324/synthetic_data-multi_apis-0324.jsonl"
output_sythetic_empty_apis_file = "./synthetic_data/0324/synthetic_data-empty_apis-0324.jsonl"


synthetic_single_api_data = [json.loads(line) for file in synthetic_single_files for line in open(file) ]
synthetic_multi_apis_data = [json.loads(line) for file in synthetic_multi_files for line in open(file) ]
synthetic_empty_apis_data = [json.loads(line) for file in synthethic_empty_files for line in open(file) ]
# print(f"{len(synthetic_empty_apis_data)}")

synthetic_single_api_data = random.sample(synthetic_single_api_data, len(synthetic_single_api_data))
synthetic_multi_apis_data = random.sample(synthetic_multi_apis_data, len(synthetic_multi_apis_data))
synthetic_empty_apis_data = random.sample(synthetic_empty_apis_data, len(synthetic_empty_apis_data))
# print(f"{len(synthetic_empty_apis_data)}")

with open(output_synthetic_single_api_file, "w", encoding="utf-8") as f:
    for data in synthetic_single_api_data:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(synthetic_single_api_data)} single-apis to {output_synthetic_single_api_file}")
with open(output_sythetic_multi_apis_file, "w", encoding="utf-8") as f:
    for data in synthetic_multi_apis_data:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(synthetic_multi_apis_data)} multi-apis to {output_sythetic_multi_apis_file}")
with open(output_sythetic_empty_apis_file, "w", encoding="utf-8") as f:
    for data in synthetic_empty_apis_data:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(synthetic_empty_apis_data)} empty-apis to {output_sythetic_empty_apis_file}")

Saved 25451 single-apis to ./synthetic_data/0324/synthetic_data-single_api-0324.jsonl
Saved 14181 multi-apis to ./synthetic_data/0324/synthetic_data-multi_apis-0324.jsonl
Saved 4397 empty-apis to ./synthetic_data/0324/synthetic_data-empty_apis-0324.jsonl


### toolcall-single_api-0324.jsonl & toocall-multi_apis-0324.jsonl

In [33]:

synthetic_single_api_file = "./synthetic_data/0324/synthetic_data-single_api-0324.jsonl"
sythetic_multi_apis_file = "./synthetic_data/0324/synthetic_data-multi_apis-0324.jsonl"
sythetic_empty_apis_file = "./synthetic_data/0324/synthetic_data-empty_apis-0324.jsonl"

synthetic_single_api_data = [json.loads(line) for line in open(synthetic_single_api_file)]
synthetic_multi_apis_data = [json.loads(line) for line in open(sythetic_multi_apis_file)]
synthetic_empty_apis_data = [json.loads(line) for line in open(sythetic_empty_apis_file)]

toolcall_apis = json.load(open(toolcall_apis_file))
toolcall_apis_keys = list(toolcall_apis.keys())

def get_functions_from_synthetic_data(data) -> List:
    selected_functions = []
    dialogue = data["dialogue"]
    bad = False
    for turn in reversed(dialogue):
        if turn["from"] == "ASSISTANT":
            content = turn["value"]
            if "<functioncall>" in content:
                functioncall = content.split("<functioncall>")[1].split("</functioncall>")[0]
                try:
                    functions = json.loads(functioncall)
                except:
                    logger.error(f"{functioncall=}")
                    bad = True
                    break

                if not isinstance(functions, list):
                    functions = [functions]

                if not all([isinstance(function, dict) for function in functions]):
                    bad = True
                    break

                for function in functions:
                    selected_functions.append(function)
                break

    last_response = selected_functions
    if not bad:
        last_turn = dialogue[-1]
        if last_turn["from"] != "ASSISTANT":
            bad = True
        else:
            content = last_turn["value"]
            if not "<functioncall>" in content:
                last_response = content


    return selected_functions, last_response, bad

def get_user_messages_from_synthetic_data(data) -> List:
    user_messages = []
    dialogue = data["dialogue"]
    for turn in dialogue:
        if turn["from"] == "USER":
            content = turn["value"]
            user_messages.append(content)
    return user_messages

def get_llm_responses_from_synthetic_data(data) -> List:
    llm_responses = []
    dialogue = data["dialogue"]
    for turn in dialogue:
        if turn["from"] == "ASSISTANT":
            content = turn["value"]
            llm_responses.append(content)
    return llm_responses

toolcall_single_api_data = []
toolcall_multi_apis_data = []
toolcall_empty_apis_data = []

output_single_api_file = "./toolcall-single_api-0324.jsonl"
output_multi_apis_file = "./toolcall-multi_apis-0324.jsonl"
output_empty_apis_file = "./toolcall-empty_apis-0324.jsonl"

# 25451
# synthetic_datas = synthetic_single_api_data
# toolcall_datas = toolcall_single_api_data 
# output_file = output_single_api_file
# prefix = "s"

# 14181
# synthetic_datas = synthetic_multi_apis_data
# toolcall_datas = toolcall_multi_apis_data 
# output_file = output_multi_apis_file
# prefix = "m"

# 3785
synthetic_datas = synthetic_empty_apis_data
toolcall_datas = toolcall_empty_apis_data
output_file = output_empty_apis_file
prefix = "e"


for i, s_data in enumerate(tqdm(synthetic_datas, desc="Processing synthetic data")):
    selected_functions, functions, bad = get_functions_from_synthetic_data(s_data)
    if bad:
        continue

    assert isinstance(selected_functions, list), f"{selected_functions=}"
    if len(selected_functions) == 0:
        continue
        

    user_messages = get_user_messages_from_synthetic_data(s_data)
    llm_responses = get_llm_responses_from_synthetic_data(s_data)

    id = f"{prefix}-{len(toolcall_datas)}"
    if not isinstance(functions, list):
        data = {
            "id": id,
            "user_messages": user_messages,
            "functions": [[] * len(user_messages)],
            "llm_responses": llm_responses,
            "apis": selected_apis,
        }
        toolcall_datas.append(data)

    else:
        num_functions = len(functions)
        if num_functions == 0:
            continue


        function_names = [function["name"] for function in selected_functions]
        # Random select 4 - 6 apis that are different from the function_names
        num_toolcall_apis = random.randint(4, 6)
        toolcall_api_names = random.sample(toolcall_apis_keys, num_toolcall_apis + len(function_names))
        toolcall_api_names = [name for name in toolcall_api_names if name not in function_names][:num_toolcall_apis]

        selected_apis = [toolcall_apis[name] for name in toolcall_api_names + function_names]
        selected_apis = random.sample(selected_apis, len(selected_apis))

        data = {
            "id": id,
            "user_messages": user_messages,
            "functions": [[] * (len(user_messages) -1)] + [[ {"type": "function", "function": function} for function in functions]],
            "llm_responses": llm_responses,
            "apis": selected_apis,
        }

        toolcall_datas.append(data)

toolcall_datas = random.sample(toolcall_datas, len(toolcall_datas))
with open(output_file, "w", encoding="utf-8") as f:
    for data in toolcall_datas:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(toolcall_datas)} to {output_file}")  




        
    

Processing synthetic data:   0%|          | 0/4397 [00:00<?, ?it/s][32m2025-03-24 15:08:05.596[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mget_functions_from_synthetic_data[0m:[36m24[0m - [31m[1mfunctioncall=' {"name": "calculate_employee_engagement_score", "arguments": {"employee_id": "E789", "survey_results": "{\\"satisfaction\\": 6, \\"motivation\\": 5}", "peer_feedback": \\"很努力\\", "manager_feedback": \\"表现良好\\", "job_satisfaction_level": 7}}'[0m
Processing synthetic data: 100%|██████████| 4397/4397 [00:00<00:00, 172912.75it/s]

Saved 3785 to ./toolcall-empty_apis-0324.jsonl





## Module Datasets

#### Load Full Dataset

In [12]:

ratios = [6, 3, 1]
# ratios = [5, 4, 1]
tag = "_".join([f"{num}" for num in ratios])
total_train_data = 3000
total_test_data = 1000
data_dir = "./" + tag
full_train_file = os.path.join(data_dir, f"toolcall-train-{total_train_data}-{tag}.jsonl")
full_test_file = os.path.join(data_dir, f"toolcall-test-{total_test_data}-{tag}.jsonl")    

full_train_data = [json.loads(line) for line in open(full_train_file).readlines()]
full_test_data = [json.loads(line) for line in open(full_test_file).readlines()]


### Save Full Dataset

In [None]:

single_api_file = "./toolcall-single_api-0324.jsonl"
multi_apis_file = "./toolcall-multi_apis-0324.jsonl"
empty_apis_file = "./toolcall-empty_apis-0324.jsonl"
single_api_data = [json.loads(line) for line in open(single_api_file).readlines()]
multi_apis_data = [json.loads(line) for line in open(multi_apis_file).readlines()]
empty_apis_data = [json.loads(line) for line in open(empty_apis_file).readlines()]
print(f"{len(single_api_data)}, {len(multi_apis_data)}, {len(empty_apis_data)}")

weights = list(np.array(ratios) * 0.1)
train_data = single_api_data[:int(total_train_data * weights[0])] + multi_apis_data[:int(total_train_data * weights[1])] + empty_apis_data[:int(total_train_data * weights[2])]
test_data = single_api_data[int(-total_test_data * weights[0]):] + multi_apis_data[int(-total_test_data * weights[1]):] + empty_apis_data[int(-total_test_data * weights[2]):]
full_train_data = random.sample(train_data, len(train_data))
full_test_data = random.sample(test_data, len(test_data))

In [None]:

os.makedirs(data_dir, exist_ok=True)
with open(full_train_file, "w", encoding="utf-8") as f:
    for data in full_train_data:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(train_data)} to {full_train_file}")
with open(full_test_file, "w", encoding="utf-8") as f:
    for data in full_test_data:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")
print(f"Saved {len(test_data)} to {full_test_file}")


### Intent Dataset

In [13]:

# Adjust these numbers to change the number of training and testing data
num_train_data = 1000
num_test_data = 1000

train_data = full_train_data[:num_train_data]
test_data = full_test_data[:num_test_data]
train_data_file = os.path.join(data_dir, f"toolcall-instructions-intent-train-{num_train_data}-{total_train_data}-{tag}.jsonl")
test_data_file = os.path.join(data_dir, f"toolcall-instructions-intent-test-{num_test_data}-{total_test_data}-{tag}.jsonl")

In [14]:

from modules.intent.intent_prompt import system_content as intent_system_content

def build_instruction_response_from_data(data):
    instruction = ""
    response = ""
    user_messages = data['user_messages']
    apis = data['apis']
    toolcalls = data['functions']
    output_format = "json"
    system_content = intent_system_content
    instruction = build_instruction(user_messages=user_messages, tools=apis, output_format=output_format, system_content=system_content, include_toolcall_example=False)
    # response = build_response_from_functioncalls(functions, output_format=output_format)
    # logger.info(f"{functions=}")

    toolcall = toolcalls[-1]
    function_names = [ function["function"]["name"] for function in toolcall]
    response = "<function-names>[" + ", ".join(function_names) + "]</function-names>"

    # functions = [ function["function"] for function in toolcall]
    # function_names = [ function["name"] for function in functions]
    # response = "<function-names>[" + ", ".join(function_names) + "]</function-names>"

    return instruction, response

#### toolcall-instructions-intent-train-3000-6_3_1.jsonl

In [None]:


with open(train_data_file, "w", encoding="utf-8") as f:
    for data in tqdm(train_data, desc="Building training data"):
        instruction, response = build_instruction_response_from_data(data)
        f.write(json.dumps({
                "id": data["id"],
                "instruction": instruction,
                "response": response,
                # "user_messages": data['user_messages'],
                # "llm_responses": data["llm_responses"], 
                # "functions": data['functions'],
                # "apis": data['apis'],
            }, 
            ensure_ascii=False) + "\n")

print(f"Saved {len(train_data)} to {train_data_file}")

#### toolcall-instructions-intent-test-1000-6_3_1.jsonl

In [15]:

with open(test_data_file, "w", encoding="utf-8") as f:
    for data in tqdm(test_data, desc="Building test data"):
        instruction, response = build_instruction_response_from_data(data)
        f.write(json.dumps({
                "id": data["id"],
                "instruction": instruction,
                "response": response,
                "user_messages": data['user_messages'],
                "llm_responses": data["llm_responses"], 
                "functions": data['functions'],
                "apis": data['apis'],
            }, 
            ensure_ascii=False) + "\n")
print(f"Saved {len(test_data)} to {test_data_file}")

Building test data: 100%|██████████| 1000/1000 [00:00<00:00, 11488.19it/s]

Saved 1000 to ./6_3_1/toolcall-instructions-intent-test-1000-1000-6_3_1.jsonl





### NER Dataset

In [8]:
# Adjust these numbers to change the number of training and testing data
num_train_data = 3000
num_test_data = 1000

train_data = full_train_data[:num_train_data]
test_data = full_test_data[:num_test_data]
train_data_file = os.path.join(
    data_dir, f"toolcall-instructions-ner-train-{num_train_data}-{total_train_data}-{tag}.jsonl"
)
test_data_file = os.path.join(data_dir, f"toolcall-instructions-ner-test-{num_test_data}-{total_test_data}-{tag}.jsonl")

In [9]:

from modules.ner.ner_prompt import system_content as ner_system_content

def build_instruction_response_from_data(data):
    instruction = ""
    response = ""
    user_messages = data['user_messages']
    apis = data['apis']
    apis_dict = {
        api["name"]: api
        for api in apis
    }
    toolcalls = data['functions']
    output_format = "json"
    system_content = ner_system_content

    instruction = build_instruction(
        user_messages=user_messages,
        tools=apis,
        output_format=output_format,
        system_content=system_content,
        include_toolcall_example=False
    )
    # response = build_response_from_functioncalls(functions, output_format=output_format)
    # logger.info(f"{functions=}")

    toolcall = toolcalls[-1]
    functions = [function["function"] for function in toolcall]

    response = ""
    for function in functions:
        func_name = function["name"]
        func_arguments = function["arguments"]
        api = apis_dict[func_name]
        api_description = api['description']
        api_parameters = api['parameters']['properties']

        func_arguments_list = []
        for arg_name, arg_value in func_arguments.items():
            if len(str(arg_value)) == 0:
                continue
            api_param = api_parameters.get(arg_name)
            if api_param is None:
                continue
            arg_type = api_param['type']
            arg_description = api_param['description']

            data = f"| {arg_name} | {str(arg_value)} | {arg_type} | {arg_description} |"
            func_arguments_list.append(data)
        func_arguments_str = "\n".join(func_arguments_list)
        response += f"<function-name>| {func_name} | {api_description} |</function-name>\n<function-arguments>\n{func_arguments_str}\n</function-arguments>"

    return instruction, response


#### toolcall-instructions-ner-train-3000-6_3_1.jsonl

In [None]:
with open(train_data_file, "w", encoding="utf-8") as f:
    for data in tqdm(train_data, desc="Building training data"):
        instruction, response = build_instruction_response_from_data(data)
        f.write(
            json.dumps(
                {
                    "id": data["id"],
                    "instruction": instruction,
                    "response": response,
                    # "user_messages": data['user_messages'],
                    # "llm_responses": data["llm_responses"],
                    # "functions": data['functions'],
                    # "apis": data['apis'],
                },
                ensure_ascii=False
            ) + "\n"
        )

print(f"Saved {len(train_data)} to {train_data_file}")

#### toolcall-instructions-ner-test-1000-6_3_1.jsonl

In [10]:

with open(test_data_file, "w", encoding="utf-8") as f:
    for data in tqdm(test_data, desc="Building test data"):
        instruction, response = build_instruction_response_from_data(data)
        f.write(
            json.dumps(
                {
                    "id": data["id"],
                    "instruction": instruction,
                    "response": response,
                    "user_messages": data['user_messages'],
                    "llm_responses": data["llm_responses"],
                    "functions": data['functions'],
                    "apis": data['apis'],
                },
                ensure_ascii=False
            ) + "\n"
        )
print(f"Saved {len(test_data)} to {test_data_file}")

Building test data: 100%|██████████| 1000/1000 [00:00<00:00, 10558.43it/s]

Saved 1000 to ./6_3_1/toolcall-instructions-ner-test-1000-1000-6_3_1.jsonl





## Evaluation

### Intent Model

In [None]:
test_file = ""
results_file = ""
test_datas = [json.loads(line) for line in open(test_file).readlines()]
results_data = [json.loads(line) for line in open(results_file).readlines()]
assert len(test_datas) == len(results_data)

from collections import Counter
def get_function_names_from_data(data):
    function_names = data.split("<function-names>")[1].split("</function-names>")[0].strip()
    function_names = function_names.split(",")
    function_names = [name.strip() for name in function_names]
    function_names = sorted(function_names)

    return function_names


all_scores = []
for result_data, test_data  in tqdm(zip(results_data, test_datas), desc="Evaluating results"):
    test_response = test_data['response']
    test_function_names = get_function_names_from_data(test_response)

    result_response = result_data['response']
    result_function_names = get_function_names_from_data(result_response)

    if test_function_names == result_function_names:
        score = 1
        all_scores.append(score)
        continue

    mc_test_function_names = Counter(test_function_names).most_common()
    test_names_count = {name: count for name, count in mc_test_function_names}

    mc_result_function_names = Counter(result_function_names).most_common()
    result_name_count = { name: count for name, count in mc_result_function_names}

    common_names = set(test_function_names) & set(result_function_names)
    TP = [ min(test_names_count.get(name, 0), result_name_count.get(name, 0)) for name in common_names]
    FP = [ min(result_name_count.get(name, 0) - test_names_count.get(name, 0), 0) for name in set(test_function_names)]
    # FN = [ min(result_name_count.get(name, 0) - test_names_count.get(name, 0), 0) for name in set(result_function_names) - set(test_function_names)]
    # TP, FP, FN = sum(TP), sum(FP), sum(FN)
    # # score = TP / (TP + FP + FN)
    # acc = TP / (TP + FN)
    # callback = TP / (TP + FP)
    # F1 = 2 * acc * callback / (acc + callback)
    # score = F1 
    acc = TP / (TP + FP)
    score = acc 

    all_scores.append(score)

score = sum(all_scores) / len(all_scores)
print(f"{score=}")




### NER Model

In [None]:

test_file = ""
results_file = ""
test_datas = [json.loads(line) for line in open(test_file).readlines()]
results_data = [json.loads(line) for line in open(results_file).readlines()]
assert len(test_datas) == len(results_data)

from collections import Counter
def get_function_arguments_from_data(data):
    function_names = data.split("<function-names>")[1].split("</function-names>")[0].strip()
    function_names = function_names.split(",")
    function_names = [name.strip() for name in function_names]
    function_names = sorted(function_names)

    return function_names


all_scores = []
for result_data, test_data  in tqdm(zip(results_data, test_datas), desc="Evaluating results"):
    test_response = test_data['response']
    test_function_names = get_function_names_from_data(test_response)

    result_response = result_data['response']
    result_function_arguments = get_function_arguments_from_data(result_response)

    if test_function_names == result_function_names:
        score = 1
    else:
        mc_test_function_names = Counter(test_function_names).most_common()
        test_names_count = {name: count for name, count in mc_test_function_names}

        mc_result_function_names = Counter(result_function_names).most_common()
        result_name_count = { name: count for name, count in mc_result_function_names}

        common_names = set(test_function_names) & set(result_function_names)
        TP = [ min(test_names_count.get(name, 0), result_name_count.get(name, 0)) for name in common_names]
        FP = [ min(test_names_count.get(name, 0) - result_name_count.get(name, 0), 0) for name in set(test_function_names) - set(result_function_names)]
        # FN = [ min(result_name_count.get(name, 0) - test_names_count.get(name, 0), 0) for name in set(result_function_names) - set(test_function_names)]
        # TP, FP, FN = sum(TP), sum(FP), sum(FN)
        # # score = TP / (TP + FP + FN)
        # acc = TP / (TP + FN)
        # callback = TP / (TP + FP)
        # F1 = 2 * acc * callback / (acc + callback)
        # score = F1 
        acc = TP / (TP + FP)
        score = acc 
        

    all_scores.append(score)

score = sum(all_scores) / len(all_scores)
print(f"{score=}")


