### Part 3 -> medical_ai_tasks x specialties x demographics

In [1]:
import json
from tqdm import tqdm
from multiturn_modul import MultiturnStyle, get_multiturn_style
import pandas as pd
import random

In [None]:
# Create batch of prompts

# Specify which model should be used to answer prompts
gpt_model = "gpt-4o"

# Paths for the input data and for the generated batch file
medical_ai_tasks_file = "../resources/medical_ai_tasks.json"
medical_professions_file = "../resources/patient_age_medical_profession.json"
medical_topics_file = "../resources/medical_topics.json"
age_ranges_file = "../resources/age_categories.json"
sex_and_gender_file = "../resources/sex_and_gender.json"
countries_file = "../resources/countries_by_income_category.csv"
output_path = "../results/batched_prompts_task_x_specialties_x_demographic.jsonl"


def _load_json(path):
    """Return the parsed content of the JSON file at *path*."""
    # JSON is exchanged as UTF-8; do not depend on the platform's default
    # encoding, which differs between operating systems.
    with open(path, 'r', encoding="utf-8") as file:
        return json.load(file)


# Medical AI tasks (entries are read via "task", "description" and
# "additional_instruction" further down)
medical_ai_tasks = _load_json(medical_ai_tasks_file)

# Professions together with their specialties
professions_data = _load_json(medical_professions_file)

# Medical topics
topics_data = _load_json(medical_topics_file)

# Age categories (entries are read via their "category" key)
age_ranges = _load_json(age_ranges_file)

# Sex and gender options
sex_and_gender = _load_json(sex_and_gender_file)

# Countries table (uses the "Country" and "Income Category" columns)
countries = pd.read_csv(countries_file)

# System message sent with every request
content = "You are an assistant responsible for creating prompts that healthcare workers would ask a medical AI chatbot."
def get_prompt(task, description, additional_instruction, demographics, profession="physician", specialty="no", topic="no"):
    """Build the instruction that asks the model to generate one prompt.

    Args:
        task: Name of the medical AI task.
        description: Description of the task.
        additional_instruction: Extra instructions for a realistic prompt.
        demographics: Mapping with the keys "country", "age", "sex" and
            "gender" describing where the professional works and the
            patient the question is about.
        profession: Profession of the person asking the chatbot.
        specialty: Specialty of that person; "no" leaves it out.
        topic: Medical field used as context; "no" leaves it out.

    Returns:
        The complete instruction text as a single string.
    """
    prompt_parts = [f"Generate a prompt that a {profession} "]
    if specialty != "no":
        prompt_parts.append(f"specializing in {specialty} and ")
    # Single quotes inside the f-strings: reusing the outer quote character
    # is a SyntaxError on Python versions before 3.12.
    prompt_parts.append(f"working in {demographics['country']} ")
    prompt_parts.append(f"might ask an AI chatbot when tasked with {task}")
    if topic != "no":
        prompt_parts.append(f" in the context of the medical field \"{topic}\".\n")
    else:
        prompt_parts.append(".\n")
    prompt_parts.append(f"{task} is described as: {description}\n")
    prompt_parts.append(f"To create a realistic prompt, follow these additional instructions: {additional_instruction}\n")
    # The trailing newline keeps "age:" from being glued to the heading.
    prompt_parts.append("The question you generate has to be about a patient with the following demographic attributes:\n")
    prompt_parts.append(f"age: {demographics['age']}\n")
    prompt_parts.append(f"sex: {demographics['sex']}\n")
    prompt_parts.append(f"gender: {demographics['gender']}\n")
    # Name the requested profession here too, so the closing sentence does
    # not contradict the opening one for non-physician professions.
    prompt_parts.append(
        "Only include the generated prompt, adding extra details only if specified. "
        f"Focus solely on a realistic prompt a {profession} might ask a medical AI chatbot."
    )
    return "".join(prompt_parts)

def fill_line(prompt_id, gpt_model, content, prompt, max_tokens):
    """Assemble one request record for the batch file.

    Args:
        prompt_id: Value stored under "custom_id".
        gpt_model: Model name placed in the request body.
        content: Text of the system message.
        prompt: Text of the user message.
        max_tokens: Value stored under "max_tokens" in the request body.

    Returns:
        A dict describing a POST request to "/v1/chat/completions".
    """
    system_message = {"role": "system", "content": content}
    user_message = {"role": "user", "content": f"{prompt}"}

    request_body = {
        "model": gpt_model,
        "messages": [system_message, user_message],
        "max_tokens": max_tokens,
    }

    return {
        "custom_id": prompt_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

def get_countries(countries_df, nbr_samples):
    """Pick random countries, each from a different income category.

    Args:
        countries_df: DataFrame with the columns "Income Category" and
            "Country".
        nbr_samples: Number of countries to return.

    Returns:
        A list of ``nbr_samples`` country names.

    Raises:
        ValueError: If there are fewer income categories than
            ``nbr_samples``.
    """
    by_income = countries_df.groupby("Income Category")

    # Every returned country comes from its own category, so there must be
    # at least as many categories as requested samples.
    if len(by_income) < nbr_samples:
        raise ValueError("Not enough unique income categories to sample from.")

    # Draw one random country per category ...
    one_per_category = pd.concat(
        [rows.sample(n=1) for _, rows in by_income]
    ).reset_index(drop=True)

    # ... then keep a random subset of those draws.
    chosen = one_per_category.sample(nbr_samples)
    return chosen["Country"].to_list()

def get_ages(age_categories, possible_ages, nbr_sample):
    """Draw random age categories for a set of patients.

    Args:
        age_categories: List of dicts, each with a "category" key.
        possible_ages: Age categories to draw from. If its first entry is
            "All age categories", every category in ``age_categories`` is
            eligible.
        nbr_sample: Number of ages to draw.

    Returns:
        A list of ``nbr_sample`` age categories, drawn with replacement.
    """
    if possible_ages[0] == "All age categories":
        possible_ages = [entry["category"] for entry in age_categories]
    # The result must be returned: the caller indexes into it per patient.
    return random.choices(possible_ages, k=nbr_sample)

# Fields whose patients fall into a restricted set of age categories.
# NOTE(review): the labels are assumed to match the "category" values in the
# age categories file - TODO confirm, in particular the adult labels.
_AGE_OVERRIDES = {
    # infants, toddlers, children, and adolescents
    "Pediatrics": ["Infants", "Toddlers", "Preschoolers", "School-age Children", "Adolescents"],
    # older adults, elderly, and centenarians
    "Geriatrics": ["Older Adults", "Elderly", "Centenarians"],
    # infants (newborns)
    "Neonatology": ["Infants"],
    # adults, middle-aged, older adults, and elderly due to chronic pain conditions
    "Pain Management": ["Adults", "Middle-aged Adults", "Older Adults", "Elderly"],
    # adults, middle-aged, and older populations
    "Physical Medicine and Rehabilitation": ["Adults", "Middle-aged Adults", "Older Adults"],
    # young adults to middle-aged adults
    "Occupational Health": ["Young Adults", "Adults", "Middle-aged Adults"],
}


def adjust_ages(possible_ages, domain="no"):
    """Restrict the eligible age categories for age-specific fields.

    Args:
        possible_ages: Age categories that are eligible by default.
        domain: Name of the medical field; "no" means none was given.

    Returns:
        The age categories listed for ``domain`` in the override table, or
        ``possible_ages`` unchanged when ``domain`` is not listed there.
    """
    override = _AGE_OVERRIDES.get(domain)
    if override is None:
        return possible_ages
    # Hand out a copy so callers cannot modify the shared table.
    return list(override)

def get_sex_gender(sex_and_gender=None, specialty="no"):
    """Placeholder for drawing the sex and gender of the sampled patients.

    The original draft of this function was an unfinished stub that did not
    parse, which made the whole cell unusable. Until the sampling logic is
    written, the function fails loudly at call time instead.

    Args:
        sex_and_gender: Sex and gender options loaded from the resource file.
        specialty: Medical specialty; "no" means none was given.

    Raises:
        NotImplementedError: Always.
    """
    # TODO: implement. get_demographics unpacks the result as ``sex, gender``
    # and indexes both by sample number, so two sequences with one entry per
    # sampled patient are expected.
    # Special cases named in the draft: "Urology" and
    # "Obstetrics and Gynecology".
    raise NotImplementedError(
        "get_sex_gender is not implemented yet: the sampling rules for sex "
        "and gender still have to be defined."
    )




def get_demographics(countries_df, age_categories, possible_ages, nbr_samples, domain="no", specialty="no"):
    """Sample the demographic attributes for ``nbr_samples`` patients.

    Args:
        countries_df: DataFrame passed on to ``get_countries``.
        age_categories: Age category entries passed on to ``get_ages``.
        possible_ages: Age categories eligible before any adjustment.
        nbr_samples: Number of patients to create.
        domain: Medical field passed on to ``adjust_ages``.
        specialty: Medical specialty passed on to ``get_sex_gender``.

    Returns:
        A list of ``nbr_samples`` dicts with the keys "country", "age",
        "sex" and "gender".
    """
    adjusted_possible_ages = adjust_ages(possible_ages, domain)
    sampled_countries = get_countries(countries_df, nbr_samples)
    ages = get_ages(age_categories, adjusted_possible_ages, nbr_samples)
    # sex_and_gender is the module-level data loaded from the resource file.
    sex, gender = get_sex_gender(sex_and_gender, specialty)

    # One patient per sample index. Iterating over the int itself, as the
    # earlier version did, raises TypeError, and the patients must actually
    # be collected or the result is always empty.
    return [
        {
            "country": sampled_countries[i],
            "age": ages[i],
            "sex": sex[i],
            "gender": gender[i],
        }
        for i in range(nbr_samples)
    ]


In [None]:
print("Creating a batch of prompts")
print(f"that can be processed by {gpt_model} in batch mode")

# NOTE(review): neither value was defined anywhere in the notebook, although
# get_demographics and fill_line need them - TODO confirm both.
nbr_samples = 2    # patients sampled per task/profession/topic combination
max_tokens = 1000  # upper bound for the length of each generated prompt

# Marker understood by get_ages as "every age category is eligible".
all_ages = ["All age categories"]

# Each variant is a tuple of
#   (extra keyword arguments for get_prompt,
#    eligible age categories,
#    extra keyword arguments for get_demographics).
# The variants do not depend on the task, so they are built only once.
variants = []
# 1) every specialty of every profession; each specialty entry is a pair of
#    the specialty name and its eligible age categories
for profession in professions_data:
    main_profession = profession["profession"]
    for specialty_name, possible_ages in profession["specialties"]:
        variants.append((
            {"profession": main_profession, "specialty": specialty_name},
            possible_ages,
            {"specialty": specialty_name},
        ))
# 2) every profession without a specialty
for profession in professions_data:
    variants.append(({"profession": profession["profession"]}, all_ages, {}))
# 3) every medical topic
# NOTE(review): assumes each entry of topics_data is the topic name itself -
# TODO confirm against the topics file.
for topic in topics_data:
    variants.append(({"topic": topic}, all_ages, {"domain": topic}))

prompt_count = 0  # also used as the running prompt id
with open(output_path, 'w', encoding="utf-8") as file:
    for medical_ai_task in tqdm(medical_ai_tasks):
        task = medical_ai_task["task"]
        description = medical_ai_task["description"]
        add_instruction = medical_ai_task["additional_instruction"]
        for prompt_kwargs, possible_ages, demographic_kwargs in variants:
            demographics = get_demographics(
                countries, age_ranges, possible_ages, nbr_samples, **demographic_kwargs
            )
            for dem in demographics:
                prompt = get_prompt(task, description, add_instruction, dem, **prompt_kwargs)
                line = fill_line(str(prompt_count), gpt_model, content, prompt, max_tokens)
                file.write(json.dumps(line) + '\n')
                prompt_count += 1

print(f"batch of {prompt_count} prompts saved to {output_path}")

{'profession': 'Nurse', 'specialties': [['Critical Care Nursing', ['All age categories']], ['Emergency Nursing', ['All age categories']], ['Geriatric Nursing', ['Older Adults', 'Elderly', 'Centenarians']], ['Medical-Surgical Nursing', ['All age categories']], ['Neonatal Nursing', ['Infants']], ['Oncology Nursing', ['All age categories']], ['Pediatric Nursing', ['Infants', 'Toddlers', 'Preschoolers', 'School-age Children', 'Adolescents']], ['Psychiatric Nursing', ['All age categories']], ['Public Health Nursing', ['All age categories']]]}
Nurse
[['Critical Care Nursing', ['All age categories']], ['Emergency Nursing', ['All age categories']], ['Geriatric Nursing', ['Older Adults', 'Elderly', 'Centenarians']], ['Medical-Surgical Nursing', ['All age categories']], ['Neonatal Nursing', ['Infants']], ['Oncology Nursing', ['All age categories']], ['Pediatric Nursing', ['Infants', 'Toddlers', 'Preschoolers', 'School-age Children', 'Adolescents']], ['Psychiatric Nursing', ['All age categories']