## Convert ToolRet dataset into MTEB format

In [1]:
import os
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets

# Config names passed to load_dataset for the queries dataset, one per task.
tasks = [
    "reversechain",
    "gorilla-huggingface",
    "tool-be-honest",
    "toolbench",
    "rotbench",
    "gpt4tools",
    "apibank",
    "gta",
    "metatool",
    "toolemu",
    "taskbench-multimedia",
    "craft-math-algebra",
    "toollens",
    "autotools-weather",
    "gorilla-tensor",
    "ultratool",
    "autotools-music",
    "t-eval-step",
    "craft-vqa",
    "mnms",
    "t-eval-dialog",
    "restgpt-spotify",
    "toolbench-sam",
    "toolace",
    "toolink",
    "craft-tabmwp",
    "gorilla-pytorch",
    "tooleyes",
    "autotools-food",
    "appbench",
    "toolalpaca",
    "apigen",
    "taskbench-daily",
    "restgpt-tmdb",
    "taskbench-huggingface",
]

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
queries_path = "mangopy/ToolRet-Queries"
tools_path = "mangopy/ToolRet-Tools"

# Load the "queries" split of every task config first, then merge them with a
# single concatenate_datasets call. Re-concatenating the running total inside
# the loop rebuilt the merged dataset once per task for no benefit.
per_task_queries = [
    load_dataset(queries_path, task, split="queries")
    for task in tqdm(tasks)
]

# Keep the previous result for an empty task list (None) instead of letting
# concatenate_datasets reject an empty input.
total_queries = concatenate_datasets(per_task_queries) if per_task_queries else None
total_queries


Generating queries split: 100%|██████████| 500/500 [00:00<00:00, 38684.99 examples/s]
Generating queries split: 100%|██████████| 350/350 [00:00<00:00, 43430.86 examples/s]
Generating queries split: 100%|██████████| 550/550 [00:00<00:00, 63833.18 examples/s]
Generating queries split: 100%|██████████| 32/32 [00:00<00:00, 5346.68 examples/s]
Generating queries split: 100%|██████████| 14/14 [00:00<00:00, 2338.24 examples/s]
Generating queries split: 100%|██████████| 200/200 [00:00<00:00, 31831.70 examples/s]
Generating queries split: 100%|██████████| 38/38 [00:00<00:00, 3088.59 examples/s]
Generating queries split: 100%|██████████| 40/40 [00:00<00:00, 6293.50 examples/s]
Generating queries split: 100%|██████████| 280/280 [00:00<00:00, 31530.19 examples/s]
Generating queries split: 100%|██████████| 314/314 [00:00<00:00, 27292.75 examples/s]
Generating queries split: 100%|██████████| 11/11 [00:00<00:00, 1071.79 examples/s]
Generating queries split: 100%|██████████| 55/55 [00:00<00:00, 8402.9

Dataset({
    features: ['id', 'query', 'instruction', 'labels', 'category'],
    num_rows: 7961
})

In [18]:
# Show the first merged query record to inspect its fields.
total_queries[0]

{'id': 'reversechain_query_0',
 'query': 'What is the weather forecast for the location of the Burning Man Festival on the day it starts?',
 'instruction': "Given a `weather forecast` task, retrieve tools that can determine the weather conditions for an event by first identifying the event's date and location, followed by processing these details to provide the forecast for that specific date and location.",
 'labels': '[{"id": "reversechain_tool_409", "doc": {"name": "FindEventCity", "input_params": {"event_name": {"description": "the name of the event", "type": "String"}}, "output_params": {"city_name": {"description": "the name of the city where the event is located", "type": "String"}}, "format": "FindEventCity(event_name) -> city_name", "description": "This API is to find the city location of an event."}, "relevance": 1}, {"id": "reversechain_tool_410", "doc": {"name": "FindEventDate", "input_params": {"event_name": {"description": "the name of the event", "type": "String"}}, "out

In [34]:
# Reduce the queries to two columns: `_id` (was `id`) and `text` (was `query`).
# NOTE(review): this rebinds `total_queries` without the `labels` column and
# with `id` renamed, while convert_to_mteb_format reads both `id` and `labels`.
# On a top-to-bottom run the qrels cell further down would receive this
# stripped dataset — confirm the intended cell order.
total_queries = (
    total_queries
    .rename_column("id", "_id")
    .rename_column("query", "text")
    .remove_columns(["instruction", "labels", "category"])
)
total_queries


Dataset({
    features: ['_id', 'text'],
    num_rows: 7961
})

In [2]:
# Fetch the "tools" split of each of the three tool configs, in this order.
web_tools, code_tools, customized_tools = (
    load_dataset("mangopy/ToolRet-Tools", config_name, split="tools")
    for config_name in ("web", "code", "customized")
)

Downloading readme: 100%|██████████| 1.80k/1.80k [00:00<00:00, 7.65kB/s]


In [3]:
from datasets import concatenate_datasets
# Stack the three tool sets into a single corpus (web, then code, then customized).
tools_corpus = concatenate_datasets([web_tools, code_tools, customized_tools])

In [4]:
# Display the merged corpus summary (columns and row count).
tools_corpus

Dataset({
    features: ['id', 'documentation'],
    num_rows: 44453
})

In [5]:
# Reshape the corpus columns: `id` -> `_id`, `documentation` -> `text`, plus a
# `title` column holding an empty string for every row.
tools_corpus = (
    tools_corpus
    .rename_column("id", "_id")
    .rename_column("documentation", "text")
    .add_column("title", [""] * len(tools_corpus))
)
tools_corpus

Dataset({
    features: ['_id', 'text', 'title'],
    num_rows: 44453
})

In [32]:
# Show the first corpus record to check the renamed columns.
tools_corpus[0]

{'_id': 'ultraTool_tool_0',
 'text': '{"doc_arguments": {"type": "object", "properties": {"file_path": {"type": "string", "description": "The file path for which to count the number of characters"}}}, "name": "file_character_count", "description": "Count the number of characters in a file and return"}',
 'title': ''}

In [26]:
import json
from datasets import Dataset

def convert_to_mteb_format(queries):
    """Flatten per-query labels into one relevance row per (query, tool) pair.

    Args:
        queries: Iterable of records. Each record must provide an "id" value
            and a "labels" value containing a JSON-encoded list whose items
            carry "id" and "relevance".

    Returns:
        A Dataset built from dicts with the keys "query-id", "corpus-id"
        and "score".
    """
    rows = []

    for record in tqdm(queries):
        qid = record["id"]
        # "labels" is stored as a JSON string, so decode it before iterating.
        rows.extend(
            {
                "query-id": qid,
                "corpus-id": judgement["id"],
                "score": judgement["relevance"],
            }
            for judgement in json.loads(record["labels"])
        )

    return Dataset.from_list(rows)

In [27]:
# Build the (query-id, corpus-id, score) rows from the queries' `labels` column.
# NOTE(review): convert_to_mteb_format reads `id` and `labels`, which the
# earlier rename/remove_columns cell strips from `total_queries`. The execution
# counts (In [27] here vs In [34] there) suggest this cell was run first —
# confirm the order before a top-to-bottom re-run.
mteb_dataset = convert_to_mteb_format(total_queries)
mteb_dataset

100%|██████████| 7961/7961 [00:00<00:00, 14070.27it/s]


Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 14106
})

In [29]:
# Show the second relevance row to check the output schema.
mteb_dataset[1]

{'query-id': 'reversechain_query_0',
 'corpus-id': 'reversechain_tool_410',
 'score': 1}

In [38]:
repo_id = "yjoonjang/toolret"

# Relevance rows go to the "default" config as the "dev" split.
mteb_dataset.push_to_hub(repo_id=repo_id, config_name="default", split="dev", data_dir="default")

# Queries go to the "queries" config and split.
total_queries.push_to_hub(repo_id=repo_id, config_name="queries", split="queries", data_dir="queries")

# Tool corpus goes to the "corpus" config and split; as the last expression,
# this call's return value is what the cell displays.
tools_corpus.push_to_hub(repo_id=repo_id, config_name="corpus", split="corpus", data_dir="corpus")

Creating parquet from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 1182.54ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]
Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 473.93ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]
Creating parquet from Arrow format: 100%|██████████| 45/45 [00:00<00:00, 424.86ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.35s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/yjoonjang/toolret/commit/50e91b691d2a347735bc2c1b4dd235f4047d1984', commit_message='Upload dataset', commit_description='', oid='50e91b691d2a347735bc2c1b4dd235f4047d1984', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yjoonjang/toolret', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yjoonjang/toolret'), pr_revision=None, pr_num=None)