## Imports

In [1]:
import re
from typing import Dict

from datasets import load_dataset, Dataset, DatasetDict, load_from_disk, concatenate_datasets
from huggingface_hub import notebook_login

## Global variables

In [2]:
SAVE_GATH_BAIZE = True
SAVE_GLAIVE_FC_V2 = True
MERGE_BOTH = True
FINAL = True

## Dataset Names

In [3]:
dataset_name, dataset_name2 = "gathnex/Gath_baize", "glaiveai/glaive-function-calling-v2"

## Gath_baize

In [4]:
# Loading a Gath_baize dataset
dataset = load_dataset(dataset_name, split="train")

In [5]:
print(dataset)
dataset["chat_sample"][1]

Dataset({
    features: ['chat_sample', 'dataset_origin'],
    num_rows: 210311
})


'The conversation between Human and AI assisatance named Gathnex [INST] I would like to know how to find the path with the maximum minimal weight in a graph.\n[/INST] That\'s a great question. To find the path with the maximum minimal weight in a graph, you can use the algorithm called "Floyd–Warshall algorithm". Have you heard about it before?\n[INST] Yes, I have. Can you explain how it works?\n[/INST] Sure. The Floyd–Warshall algorithm is a dynamic programming algorithm that finds the shortest paths in a weighted graph with positive or negative edge weights. It does this by repeatedly improving an estimate of the shortest path between two points, until the estimate is optimal.\n[INST] How does it guarantee to find the maximum minimal weight path?\n[/INST] The algorithm works by computing a matrix of all the possible pairwise shortest paths. From that matrix, we can pick the path with the largest minimum weight. That path will be the maximum minimal weight path.\n[INST] That sounds in

### Function to split System and chat

In [6]:
def convert_row(row: str) -> Dict[str, str]:
    regex = r"(?P<system>^.+?)(?=\[INST\])|\[INST\](?P<user>.+?)\[\/INST\]|(?<=\[\/INST\])(?P<assistant>.+?)(?=\[INST\]|$)"
    # Compile the regex for better performance
    compiled_regex = re.compile(regex, re.DOTALL)
    # Search for matches
    matches = compiled_regex.finditer(row)

    system_prompt = []
    conversation = []
    # Process matches
    for match in matches:
        if match.group("system"):
            system = match.group("system").strip()
            system_prompt += [{"role": "system", "content": "The conversation between Human and AI assistant"}]
        if match.group("user"):
            user = match.group("user").strip()
            conversation += [{"role": "user", "content": user}]
        if match.group("assistant"):
            assistant = match.group("assistant").strip()
            conversation += [{"role": "assistant", "content": assistant}]
    
    return {'system': system_prompt, 'chat': conversation}

In [7]:
convert_row(dataset["chat_sample"][1])

{'system': [{'role': 'system',
   'content': 'The conversation between Human and AI assistant'}],
 'chat': [{'role': 'user',
   'content': 'I would like to know how to find the path with the maximum minimal weight in a graph.'},
  {'role': 'assistant',
   'content': 'That\'s a great question. To find the path with the maximum minimal weight in a graph, you can use the algorithm called "Floyd–Warshall algorithm". Have you heard about it before?'},
  {'role': 'user', 'content': 'Yes, I have. Can you explain how it works?'},
  {'role': 'assistant',
   'content': 'Sure. The Floyd–Warshall algorithm is a dynamic programming algorithm that finds the shortest paths in a weighted graph with positive or negative edge weights. It does this by repeatedly improving an estimate of the shortest path between two points, until the estimate is optimal.'},
  {'role': 'user',
   'content': 'How does it guarantee to find the maximum minimal weight path?'},
  {'role': 'assistant',
   'content': 'The algori

### Splitting System and Chat

In [8]:
if SAVE_GATH_BAIZE:
    # Use the .map() method to apply convert_row to each row
    # Set batched=True if your convert_row function can handle batch processing for efficiency
    updated_dataset = dataset.map(lambda examples: convert_row(examples['chat_sample']), remove_columns=['chat_sample'])
    
    print(updated_dataset)

Dataset({
    features: ['dataset_origin', 'system', 'chat'],
    num_rows: 210311
})


In [9]:
if SAVE_GATH_BAIZE:
    print(updated_dataset[0])

{'dataset_origin': 'alpaca', 'system': [{'content': 'The conversation between Human and AI assistant', 'role': 'system'}], 'chat': [{'content': 'Generate a headline given a content block.\nThe Sony Playstation 5 is the latest version of the console. It has improved graphics and faster processing power.', 'role': 'user'}, {'content': 'Experience Amazing Graphics and Speed with the New Sony Playstation 5', 'role': 'assistant'}]}


### Saving new dataset

In [10]:
# Specify the path where you want to save the dataset
save_path = './dataset/hibo-gath_baize'

In [11]:
if SAVE_GATH_BAIZE:
    # Save the dataset to disk
    updated_dataset.save_to_disk(save_path)
    
    print(f"Dataset saved to {save_path}")

Saving the dataset (0/1 shards):   0%|          | 0/210311 [00:00<?, ? examples/s]

Dataset saved to ./dataset/hibo-gath_baize


### Reloading new dataset

In [12]:
# Load the dataset from disk
reloaded_dataset = load_from_disk(save_path)

print(reloaded_dataset)
reloaded_dataset[0]

Dataset({
    features: ['dataset_origin', 'system', 'chat'],
    num_rows: 210311
})


{'dataset_origin': 'alpaca',
 'system': [{'content': 'The conversation between Human and AI assistant',
   'role': 'system'}],
 'chat': [{'content': 'Generate a headline given a content block.\nThe Sony Playstation 5 is the latest version of the console. It has improved graphics and faster processing power.',
   'role': 'user'},
  {'content': 'Experience Amazing Graphics and Speed with the New Sony Playstation 5',
   'role': 'assistant'}]}

## glaive-function-calling-v2

In [13]:
# Loading a Gath_baize_function_calling dataset
dataset2 = load_dataset(dataset_name2, split="train")

In [14]:
print(dataset2)
dataset2['chat'][112911]

Dataset({
    features: ['chat', 'system'],
    num_rows: 112960
})


'USER: Hi, I would like to create a new user profile.\n\n\nASSISTANT: Sure, I can help with that. Could you please provide me with your name, age, and email address? <|endoftext|>\n\n\nUSER: My name is John Doe, I am 30 years old and my email is johndoe@example.com.\n\n\nASSISTANT: <functioncall> {"name": "create_user_profile", "arguments": \'{"name": "John Doe", "age": 30, "email": "johndoe@example.com"}\'} <|endoftext|>\n\n\nFUNCTION RESPONSE: {"status": "success", "message": "User profile created successfully", "user_id": "12345"}\n\n\nASSISTANT: Your user profile has been created successfully. Your user ID is 12345. Please keep it safe as you will need it for future reference. <|endoftext|>\n\n\n'

### Function to split System and chat

In [15]:
def convert_row_d2(chat: str, system: str) -> Dict[str, str]:
    regex = r"(?=USER:\s)|USER:\s(?P<user>.+?)(?=ASSISTANT:\s|FUNCTION RESPONSE:\s|$)|ASSISTANT:\s(?P<assistant>.+?)(?=USER:\s|FUNCTION RESPONSE:\s|ASSISTANT:\s|$)|FUNCTION RESPONSE:\s(?P<function_response>.+?)(?=USER:\s|ASSISTANT:\s|FUNCTION RESPONSE:\s|$)"
    # Compile the regex for better performance
    compiled_regex = re.compile(regex, re.DOTALL)
    # Search for matches
    matches = compiled_regex.finditer(chat)

    system_conv = [{"role": "system", "content": system.replace("SYSTEM: ", "").strip()}]
    conversation = []
    # Process matches
    for match in matches:
        if match.group("user"):
            user = match.group("user").replace("<|endoftext|>", "").strip()
            conversation += [{"role": "user", "content": user}]
        if match.group("assistant"):
            assistant = match.group("assistant").replace("<|endoftext|>", "").strip()
            conversation += [{"role": "assistant", "content": assistant}]
        if match.group("function_response"):
            function_response = match.group("function_response").strip()
            conversation += [{"role": "function_response", "content": function_response}]
    return {'system': system_conv, 'chat': conversation, 'dataset_origin': 'glaiveai/glaive-function-calling-v2'}


In [16]:
text = """USER: Can you book a flight for me from New York to London?\n\n\nASSISTANT: I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask! <|endoftext|>\n\n\n"""
system = """SYSTEM: You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_exchange_rate",\n    "description": "Get the exchange rate between two currencies",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "base_currency": {\n                "type": "string",\n                "description": "The currency to convert from"\n            },\n            "target_currency": {\n                "type": "string",\n                "description": "The currency to convert to"\n            }\n        },\n        "required": [\n            "base_currency",\n            "target_currency"\n        ]\n    }\n}\n"""
convert_row_d2(text, system)

{'system': [{'role': 'system',
   'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_exchange_rate",\n    "description": "Get the exchange rate between two currencies",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "base_currency": {\n                "type": "string",\n                "description": "The currency to convert from"\n            },\n            "target_currency": {\n                "type": "string",\n                "description": "The currency to convert to"\n            }\n        },\n        "required": [\n            "base_currency",\n            "target_currency"\n        ]\n    }\n}'}],
 'chat': [{'role': 'user',
   'content': 'Can you book a flight for me from New York to London?'},
  {'role': 'assistant',
   'content': "I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currenci

In [17]:
if SAVE_GLAIVE_FC_V2:
    # Adding 'dataset_origin' column directly within the .map() call
    updated_dataset = dataset2.map(lambda examples: convert_row_d2(examples['chat'], examples['system']), batched=False)
    print(updated_dataset)
    print(updated_dataset[0])

Map:   0%|          | 0/112960 [00:00<?, ? examples/s]

Dataset({
    features: ['chat', 'system', 'dataset_origin'],
    num_rows: 112960
})
{'chat': [{'content': 'Can you book a flight for me from New York to London?', 'role': 'user'}, {'content': "I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!", 'role': 'assistant'}], 'system': [{'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_exchange_rate",\n    "description": "Get the exchange rate between two currencies",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "base_currency": {\n                "type": "string",\n                "description": "The currency to convert from"\n            },\n            "target_currency": {\n                "type": "string",\n                "description": "The currency to convert to"\n            }\n        },

### Saving new dataset

In [18]:
# Specify the path where you want to save the dataset
save_path = './dataset/hibo-glaive-function-calling'

In [19]:
if SAVE_GLAIVE_FC_V2:
    # Save the dataset to disk
    updated_dataset.save_to_disk(save_path)
    
    print(f"Dataset saved to {save_path}")

Saving the dataset (0/1 shards):   0%|          | 0/112960 [00:00<?, ? examples/s]

Dataset saved to ./dataset/hibo-glaive-function-calling


### Reloading new dataset

In [20]:
# Load the dataset from disk
reloaded_dataset2 = load_from_disk(save_path)

print(reloaded_dataset2)
reloaded_dataset2[0]

Dataset({
    features: ['chat', 'system', 'dataset_origin'],
    num_rows: 112960
})


{'chat': [{'content': 'Can you book a flight for me from New York to London?',
   'role': 'user'},
  {'content': "I'm sorry, but I don't have the capability to book flights. My current function allows me to get the exchange rate between two currencies. If you need help with that, feel free to ask!",
   'role': 'assistant'}],
 'system': [{'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_exchange_rate",\n    "description": "Get the exchange rate between two currencies",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "base_currency": {\n                "type": "string",\n                "description": "The currency to convert from"\n            },\n            "target_currency": {\n                "type": "string",\n                "description": "The currency to convert to"\n            }\n        },\n        "required": [\n            "base_currency",\n            "target_cu

## hibo-function-calling-v1

### Merge both

In [21]:
print(reloaded_dataset['system'][0])

[{'content': 'The conversation between Human and AI assistant', 'role': 'system'}]


In [22]:
print(reloaded_dataset2['system'][1])

[{'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "get_news_headlines",\n    "description": "Get the latest news headlines",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "country": {\n                "type": "string",\n                "description": "The country for which to fetch news"\n            }\n        },\n        "required": [\n            "country"\n        ]\n    }\n}', 'role': 'system'}]


In [23]:
if MERGE_BOTH:
    # Assuming updated_dataset and updated_dataset_with_origin are your two datasets ready to be merged
    merged_dataset = concatenate_datasets([reloaded_dataset, reloaded_dataset2])
    
    print(merged_dataset)

Dataset({
    features: ['dataset_origin', 'system', 'chat'],
    num_rows: 323271
})


### Saving merged dataset

In [24]:
# Specify the path where you want to save the dataset
save_path = './dataset/hibo-function-calling-merge'

In [25]:
if MERGE_BOTH:
    # Save the dataset to disk
    merged_dataset.save_to_disk(save_path)
    
    print(f"Dataset saved to {save_path}")

Saving the dataset (0/1 shards):   0%|          | 0/323271 [00:00<?, ? examples/s]

Dataset saved to ./dataset/hibo-function-calling-merge


### Reloading new dataset

In [26]:
# Load the dataset from disk
reloaded_dataset3 = load_from_disk(save_path)
print(reloaded_dataset3)

Dataset({
    features: ['dataset_origin', 'system', 'chat'],
    num_rows: 323271
})


In [27]:
reloaded_dataset3[0]

{'dataset_origin': 'alpaca',
 'system': [{'content': 'The conversation between Human and AI assistant',
   'role': 'system'}],
 'chat': [{'content': 'Generate a headline given a content block.\nThe Sony Playstation 5 is the latest version of the console. It has improved graphics and faster processing power.',
   'role': 'user'},
  {'content': 'Experience Amazing Graphics and Speed with the New Sony Playstation 5',
   'role': 'assistant'}]}

In [28]:
reloaded_dataset3[323222]

{'dataset_origin': 'glaiveai/glaive-function-calling-v2',
 'system': [{'content': 'You are a helpful assistant with access to the following functions. Use them if required -\n{\n    "name": "create_user_profile",\n    "description": "Create a user profile",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "name": {\n                "type": "string",\n                "description": "The user\'s name"\n            },\n            "age": {\n                "type": "integer",\n                "description": "The user\'s age"\n            },\n            "email": {\n                "type": "string",\n                "description": "The user\'s email address"\n            }\n        },\n        "required": [\n            "name",\n            "age",\n            "email"\n        ]\n    }\n}',
   'role': 'system'}],
 'chat': [{'content': 'Hi, I would like to create a new user profile.',
   'role': 'user'},
  {'content': 'Sure, I can help with that. Could yo

## Split train

In [29]:
dataset_dict = DatasetDict({
    'train': reloaded_dataset3
})

# Shuffle the 'train' dataset
dataset_dict = dataset_dict['train'].shuffle(seed=42)

In [30]:
dataset_dict

Dataset({
    features: ['dataset_origin', 'system', 'chat'],
    num_rows: 323271
})

In [31]:
# Specify the path where you want to save the dataset
save_path = './dataset/hibo-function-calling-v1'

In [32]:
if FINAL:
    # Save the dataset to disk
    dataset_dict.save_to_disk(save_path)
    
    print(f"Dataset saved to {save_path}")

Saving the dataset (0/1 shards):   0%|          | 0/323271 [00:00<?, ? examples/s]

Dataset saved to ./dataset/hibo-function-calling-v1


In [33]:
if FINAL:
    dataset_dict.push_to_hub("thibaud-perrin/hibo-function-calling-v1")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/324 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
