### Create batch of prompts
#### This prompts are then sent to and processed by gpt-4o to generate the initial prompt a healthcare worker asks meditron in a multiturn interaction

* Cell 1: load libraries
* Cell 2: Inputs per prompt to generate is a health care worker profession and its specialty as well as a medical domain about which the question is (e.g Physician sepcialized in Allergy and Immunology asks a question about Human Anatomy and Physiology)
* Cell 3: Inputs per per prompt to generate are a specific topic and a task a physician wants to solve using Meditron (e.g. a question about Cell Biology when using Meditron to assist with Diagnosing Symptoms)
* Cell 4: Inputs per prompt to generate are a medical health care work and their specialization, a task they want to solve using meditron and an additional instruction they give on how the format/style of the answer should be (e.g. Give a beginner-friendly answer, for a health are worker specialized in Allergy and Immunology when they are trying to Diagnosing Symptoms"). Note: the style instruction (e.g. Give a beginner-friendly answer) is added manually to the prompt generated by gpt-4o

In [4]:
import json
from tqdm import tqdm
import pandas as pd
import random

#### Version 1 -> profession sepcialties x domains

In [None]:
# Create batch of prompts

# Specify which model should be used to answer prompts
gpt_model = "gpt-4o"

# Path for input data and to store prompts
path_medical_settings = "../results/medical_settings.json"
path_medical_subtopics = "../results/medical_subtopics.json"
path_medical_professions = "../results/medical_professions.json"

output_path = "../results/batched_prompts.jsonl"

print("Creating a batch of prompts")
print(f"that can be processed by {gpt_model} in batch mode")

# Load data
with open(path_medical_settings, "r") as f:
    medical_settings = json.load(f)
with open(path_medical_subtopics, "r") as f:
    medical_subtopics = json.load(f)
with open(path_medical_professions, "r") as f:
    medical_professions = json.load(f)

with open(output_path, 'w') as file:
    id = 0
    for prof_entry in tqdm(medical_professions):
        profession = prof_entry["profession"]
        specialties = prof_entry["specialties"]
        for top_entry in medical_subtopics:
            domain = top_entry["domain"]
            subtopics = top_entry["subtopic"]
            for specialty in specialties:
                # for subtopic in subtopics: -> TODO: removed because gives too much prompts
                    content = "You are an assistant responsible for creating prompts that healthcare workers would ask a medical AI chatbot."
                    prompt = f'''Generate a prompt that a {profession} specializing in {specialty} might ask an AI chatbot about the {domain} regarding a practical issue they are facing with a patient.
                    Only include the prompt from the {profession} specializing in {specialty} about the {domain} in your response, with no additional text.'''
                    # TODO: Maybe add some examples
                    # TODO: Different style of prompts (here all are: practical issue they face with a patient)
                    line = {
                        "custom_id": str(id),
                        "method": "POST",
                        "url": "/v1/chat/completions",
                        "body": {
                            "model": gpt_model,
                            "messages": [
                                {"role": "system", "content": content},
                                {"role": "user", "content": f"{prompt}"}
                            ],
                            "max_tokens": 150 ### TODO: can be changed... (based on some examples that where in average 105 tokens long)
                        }
                    }
                    file.write(json.dumps(line) + '\n')
                    id += 1

print(f"batch of {id} prompts saved to {output_path}")

#### Version 2 -> medical_ai_tasks x subtopics

In [2]:
# Create batch of prompts

# Specify which model should be used to answer prompts
gpt_model = "gpt-4o"

# Path for input data and to store prompts
medical_ai_tasks_file = "../results/medical_ai_tasks_part2.json"
subtopics_file = "../results/medical_subtopics.json"
output_path = "../results/batched_prompts.jsonl"



# store medical AI tasks in a list
with open(medical_ai_tasks_file, 'r') as file:
    medical_ai_tasks = json.load(file)  # Load JSON data from file

# store subtopics in a list
with open(subtopics_file, 'r') as file:
    subtopics_data = json.load(file)  # Load JSON data from file

# Extract subtopics
subtopics = []
for item in subtopics_data:
    subtopics.extend(item['subtopic'])  # Add each subtopic list to the main list

# prompt and context
content = "You are an assistant responsible for creating prompts that healthcare workers would ask a medical AI chatbot."
def get_prompt(task, description, additional_instruction, subtopic):
    prompt = f'''Generate a prompt that a physician might ask an AI chatbot when tasked with {task} in the context of {subtopic} in the medical field.
    {task} is described as: {description}
    To create a realistic prompt, follow these additional instructions: {additional_instruction}
    Only include the generated prompt, adding extra details only if specified. Focus solely on a realistic prompt a physician might ask a medical AI chatbot.'''
    return prompt


print("Creating a batch of prompts")
print(f"that can be processed by {gpt_model} in batch mode")

with open(output_path, 'w') as file:
    task_id = -1
    for medical_ai_task in tqdm(medical_ai_tasks):
        subtopic_id = 0
        task_id += 1
        for subtopic in subtopics:
            content = content
            prompt = get_prompt(medical_ai_task["task"], medical_ai_task["description"], medical_ai_task["additional_instruction"], subtopic)
            line = {
                "custom_id": str(task_id) + "-" + str(subtopic_id),
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": gpt_model,
                    "messages": [
                        {"role": "system", "content": content},
                        {"role": "user", "content": f"{prompt}"}
                    ],
                    "max_tokens": medical_ai_task["max_token"]
                }
            }
            file.write(json.dumps(line) + '\n')
            subtopic_id += 1

print(f"batch of {id} prompts saved to {output_path}")

Creating a batch of prompts
that can be processed by gpt-4o in batch mode


100%|██████████| 2/2 [00:00<00:00, 56.00it/s]

batch of <built-in function id> prompts saved to ../results/batched_prompts.jsonl





### Part 3 -> medical_ai_tasks x specialties x demographics

In [None]:
import json
from tqdm import tqdm
import pandas as pd
import random
import helper
# Create batch of prompts

# Specify which model should be used to answer prompts
gpt_model = "gpt-4o"

# Path for input data and to store prompts
medical_ai_tasks_file = "../resources/medical_ai_tasks.json"
medical_professions_file = "../resources/patient_age_medical_profession.json"
medical_topics_file = "../resources/medical_topics.json"
age_ranges_file = "../resources/age_categories.json"
sex_and_gender_file = "../resources/sex_and_gender.json"
countries_file = "../resources/countries_by_income_category.csv"
output_path = "../results/batched_prompts_task_x_specialties_x_demographic_2.jsonl"

# store medical AI tasks in a list
with open(medical_ai_tasks_file, 'r') as file:
    medical_ai_tasks = json.load(file)  # Load JSON data from file

# store specialties in a list
with open(medical_professions_file, 'r') as file:
    professions_data = json.load(file)  # Load JSON data from file  

# store specialties in a list
with open(medical_topics_file, 'r') as file:
    topics_data = json.load(file)  # Load JSON data from file  

# store age ranges in a list
with open(age_ranges_file, 'r') as file:
    age_categories = json.load(file)  # Load JSON data from file

# store sex and gender in a list
with open(sex_and_gender_file, 'r') as file:
    sex_and_gender = json.load(file)  # Load JSON data from file

countries_df = pd.read_csv(countries_file)
print("Creating a batch of prompts")
print(f"that can be processed by {gpt_model} in batch mode")

content = "You are an assistant responsible for creating prompts that healthcare workers would ask a medical AI chatbot."
nbr_of_questions_per_category = 4
id = 0
with open(output_path, 'w') as file:
    for medical_ai_task in tqdm(medical_ai_tasks):
        task = medical_ai_task["task"]
        description = medical_ai_task["description"]
        add_instruction = medical_ai_task["additional_instruction"]
        for profession in professions_data:
            main_profession = profession["profession"]
            for specialty in profession["specialties"]:
                demographics = helper.get_demographics(countries_df=countries_df, age_categories=age_categories,
                                                       sex_and_gender=sex_and_gender, nbr_samples=nbr_of_questions_per_category,
                                                       possible_ages=specialty[1], specialty=specialty[0])
                prompt = helper.get_prompt(nbr_of_questions=nbr_of_questions_per_category, task=task, description=description,
                                           additional_instruction=add_instruction, demographics=demographics, profession=main_profession,
                                           specialty=specialty[0])
                countries_used = ",".join([dict["country"] for dict in demographics])
                prompt_id = str(id) + "-" + countries_used
                line = helper.fill_line(prompt_id=prompt_id, gpt_model=gpt_model, content=content,
                                        prompt=prompt, max_tokens=nbr_of_questions_per_category*500)
                file.write(json.dumps(line) + '\n')
                id += 1
        for profession in professions_data:
            demographics = helper.get_demographics(countries_df=countries_df, age_categories=age_categories,
                                                    sex_and_gender=sex_and_gender, nbr_samples=nbr_of_questions_per_category)
            prompt = helper.get_prompt(nbr_of_questions=nbr_of_questions_per_category, task=task, description=description,
                                       additional_instruction=add_instruction, demographics=demographics, profession=profession["profession"])
            countries_used = ",".join([dict["country"] for dict in demographics])
            prompt_id = str(id) + "-" + countries_used
            line = helper.fill_line(prompt_id=prompt_id, gpt_model=gpt_model, content=content,
                                    prompt=prompt, max_tokens=nbr_of_questions_per_category*500)
            file.write(json.dumps(line) + '\n')
            id += 1
        for topic in topics_data:
            demographics = helper.get_demographics(countries_df=countries_df, age_categories=age_categories,
                                                    sex_and_gender=sex_and_gender, nbr_samples=nbr_of_questions_per_category,
                                                    medical_topic=topic["topic"])
            prompt = helper.get_prompt(nbr_of_questions=nbr_of_questions_per_category, task=task, description=description,
                                       additional_instruction=add_instruction, demographics=demographics, topic=topic["topic"])
            countries_used = ",".join([dict["country"] for dict in demographics])
            prompt_id = str(id) + "-" + countries_used
            line = helper.fill_line(prompt_id=prompt_id, gpt_model=gpt_model, content=content,
                                    prompt=prompt, max_tokens=nbr_of_questions_per_category*500)
            file.write(json.dumps(line) + '\n')
            id += 1

print(f"batch of {id} prompts saved to {output_path}")