### Generates batches for Multiturn chat creations

* Cell 0: import necesary libraries
* Cell 1: Create system prompts for user and chatbot (called situations) and store them for later use -> do not redo that!
* Cell 2: Create file that stores the multiturn conversation and add the initial prompt to it -> do not redo that
* Cell 3: Helper function to build prompt from chat history
* Cell 4: Creates batches of prompts

In [1]:
import json
from tqdm import tqdm
from multiturn_modul import MultiturnStyle, get_multiturn_style, get_multiturn_style_additional, find_profession_by_specialty
import pandas as pd
import random

In [2]:
# Create situations
# Creates randomly system prompts for the user and the chatbot

# Path for input data and to store prompts
initial_prompts_name = "task_x_specialties_x_demographic_x_answerstyle"
initial_prompts_path = "../results/parsed_prompts_" + initial_prompts_name + ".json"
output_path = "../results/situations_" + initial_prompts_name + ".jsonl"

# store initial prompts in a list
with open(initial_prompts_path, 'r') as file:
    initial_prompts_list = json.load(file)  # Load JSON data from file

# Load countries necessary to give the medical setting
countries = pd.read_csv("../resources/countries_by_income_category.csv")

print("Creating list of situations and initial prompts")

with open(output_path, 'w') as file: ### only remove if the situations are new and the situations file is never used anywhere
    for initial_prompt in tqdm(initial_prompts_list):
        sampled_country = random.choice(countries.iloc[:, 0].tolist())
        id = initial_prompt["id"]
        if initial_prompts_name == "specialty_x_domain":
            profession, specialty, domain = initial_prompt["context"].lower().split(",")
            multiturn_style = get_multiturn_style(id=id, sampled_country=sampled_country, profession=profession, specialty=specialty, domain=domain)
            print("error")
        elif initial_prompts_name == "task_x_specialties_x_demographic_x_answerstyle_2":
            context = initial_prompt["context"]
            multiturn_style = get_multiturn_style_additional(id=id, context=context)
        else:
            multiturn_style = get_multiturn_style(id=id, sampled_country=sampled_country)
            print("error")

        system_prompt_chatbot = multiturn_style.system_prompt_chatbot()
        system_prompt_user = multiturn_style.system_prompt_user()
        nbr_of_turns = multiturn_style.number_of_turns
        line = {
            "id": id,
            "nbr_of_turns": nbr_of_turns,
            "initial_prompt": initial_prompt["prompt"],
            "system_prompt_chatbot": system_prompt_chatbot,
            "system_prompt_user": system_prompt_user,
            "multiturn_style_parameters": multiturn_style.get_all_system_prompts_atributes()
        }
        file.write(json.dumps(line) + '\n')

print(f"Up to {id} prompts with situations saved to {output_path}")

Creating list of situations and initial prompts


100%|██████████| 20395/20395 [00:06<00:00, 3189.30it/s]

Up to D-2586-6B-Burkina Faso prompts with situations saved to ../results/situations_task_x_specialties_x_demographic_x_answerstyle_2.jsonl





In [3]:
# Create initial multiturn_situation .jsonl
# Function to get the initial prompt

situations_name = "task_x_specialties_x_demographic_x_answerstyle_2"

situations_path = "../results/situations_" + situations_name + ".jsonl"
output_path = "../results/multiturn_" + situations_name + ".jsonl"

multiturn_conversations = []
with ###open(situations_path, 'r') as file: ### do only exectue if it is the first time
    for entry in file:
        data = json.loads(entry)
        line = {
            "id": data["id"],
            "conversation": [
                {"role": "user", "value": data["initial_prompt"]},
            ]
        }
        multiturn_conversations.append(line)
# Save to JSON file
with open(output_path, 'w') as jsonl_file:
    for conv in tqdm(multiturn_conversations):
        jsonl_file.write(json.dumps(conv) + '\n')


print(f"The file to store the multiturn conversations for {situations_name} has been created")
print(f"The file contains the initial prompt.")

100%|██████████| 20395/20395 [00:00<00:00, 95590.76it/s]

The file to store the multiturn conversations for task_x_specialties_x_demographic_x_answerstyle_2 has been created
The file contains the initial prompt.





In [4]:
def build_prompt(conv_list):
    prompt = []
    for entry in conv_list:
        prompt.append(f"({entry["role"]}) {entry["value"]} \n")
    return "".join(prompt)

In [5]:
# Create batch of prompts

# Specify which model should be used to answer prompts
gpt_model = "gpt-4o"

# Path for input data and to store prompts
situations_name = "task_x_specialties_x_demographic_x_answerstyle_2" # Change this
situations_path = "../results/situations_" + situations_name + ".jsonl"
multiturn_conv_path = "../results/multiturn_" + situations_name + ".jsonl"
turn = 1 # can be 1, 2 or 3 # Change this
system_is = "chatbot" # can be user or chatbot # Change this
output_path = "../results/batched_prompts_" + situations_name + "_turn_" + str(turn) + system_is[0] + ".jsonl"


# store prompts in a list
situations = []
with open(situations_path, 'r') as file:
    for i, line in enumerate(file): # i could be used in test mode with if i >= 100; break
        situations.append(json.loads(line))

# store multiturn convs in a dict
multiturn_conv = {}
with open(multiturn_conv_path, 'r') as file:
    for i, line in enumerate(file): # can use i if we want to test -> if i >= 100; break
        line_as_json = json.loads(line)
        multiturn_conv[line_as_json["id"]] = line_as_json["conversation"]


print(f"Creating a batch of prompts that can be processed by {gpt_model} in batch mode")

with open(output_path, 'w') as file:
    nbr_of_prompts = 0
    for situation in tqdm(situations):
        if situation["nbr_of_turns"] >= turn:
            line = {
                "custom_id": situation["id"],
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": gpt_model,
                    "messages": [
                        {"role": "system", "content": situation["system_prompt_" + system_is]},
                        {"role": "user", "content": build_prompt(multiturn_conv[situation["id"]])}
                    ],
                    "max_tokens": 1000
                }
            }
            file.write(json.dumps(line) + '\n')
            nbr_of_prompts += 1

print(f"batch of {nbr_of_prompts} prompts saved to {output_path}")
print(f"IMPORTANT: batches were created for turn {turn} and {system_is}")

Creating a batch of prompts that can be processed by gpt-4o in batch mode


100%|██████████| 20395/20395 [00:00<00:00, 28415.74it/s]

batch of 20395 prompts saved to ../results/batched_prompts_task_x_specialties_x_demographic_x_answerstyle_2_turn_1c.jsonl
IMPORTANT: batches were created for turn 1 and chatbot



