### Part 3 -> medical_ai_tasks x specialties x demographics

In [1]:
import json
from tqdm import tqdm
from multiturn_modul import MultiturnStyle, get_multiturn_style
import pandas as pd
import random
import helper

In [2]:
# Configuration and input data for the batch of prompts

# Specify which model should be used to answer prompts
gpt_model = "gpt-4o"

# Paths for the input data and for the generated batch of prompts
medical_ai_tasks_file = "../resources/medical_ai_tasks.json"
medical_professions_file = "../resources/patient_age_medical_profession.json"
medical_topics_file = "../resources/medical_topics.json"
age_ranges_file = "../resources/age_categories.json"
sex_and_gender_file = "../resources/sex_and_gender.json"
countries_file = "../resources/countries_by_income_category.csv"
output_path = "../results/batched_prompts_task_x_specialties_x_demographic.jsonl"


def load_json(path):
    """Return the parsed content of the JSON file at ``path``.

    The file is decoded as UTF-8 explicitly (the encoding JSON text is
    exchanged in), so the result does not depend on the platform's
    default text encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, 'r', encoding="utf-8") as json_file:
        return json.load(json_file)


# medical AI tasks
medical_ai_tasks = load_json(medical_ai_tasks_file)

# medical professions together with their specialties
professions_data = load_json(medical_professions_file)

# medical topics
topics_data = load_json(medical_topics_file)

# age categories
age_categories = load_json(age_ranges_file)

# sex and gender
sex_and_gender = load_json(sex_and_gender_file)

# countries table (the file name suggests countries grouped by income category)
countries_df = pd.read_csv(countries_file)

In [4]:
print("Creating a batch of prompts")
print(f"that can be processed by {gpt_model} in batch mode")

# Text passed as `content` to helper.fill_line for every request of the batch
content = "You are an assistant responsible for creating prompts that healthcare workers would ask a medical AI chatbot."
nbr_of_questions_per_category = 1
# Token budget granted per requested question (max_tokens = questions * this value)
max_tokens_per_question = 500

# Inputs shared by every helper.get_demographics call below
demographic_inputs = {
    "countries_df": countries_df,
    "age_categories": age_categories,
    "sex_and_gender": sex_and_gender,
    "nbr_samples": nbr_of_questions_per_category,
}

# NOTE(review): helper.get_demographics presumably samples at random and no seed
# is set in this notebook, so the generated batch is likely not reproducible
# between runs - confirm and seed the relevant generator if that matters.


def write_request(file, prompt_index, medical_ai_task, demographics, **prompt_target):
    """Build one batch request and append it to ``file`` as a JSON line.

    Args:
        file: Writable text file that receives one JSON object per line.
        prompt_index: Running number of the request; it prefixes the request id.
        medical_ai_task: Task record providing the "task", "description" and
            "additional_instruction" entries.
        demographics: Result of ``helper.get_demographics``; every entry is
            read through its "country" key to build the request id.
        **prompt_target: Keyword arguments forwarded unchanged to
            ``helper.get_prompt`` (``profession``, ``specialty`` or ``topic``).
    """
    prompt = helper.get_prompt(nbr_of_questions=nbr_of_questions_per_category,
                               task=medical_ai_task["task"],
                               description=medical_ai_task["description"],
                               additional_instruction=medical_ai_task["additional_instruction"],
                               demographics=demographics, **prompt_target)
    # Request id has the form "<running number>-<comma separated countries>"
    countries_used = ",".join(entry["country"] for entry in demographics)
    prompt_id = f"{prompt_index}-{countries_used}"
    line = helper.fill_line(prompt_id=prompt_id, gpt_model=gpt_model, content=content,
                            prompt=prompt,
                            max_tokens=nbr_of_questions_per_category * max_tokens_per_question)
    file.write(json.dumps(line) + '\n')


# `prompt_count` replaces the former counter named `id`, which shadowed the
# builtin id() for the rest of the kernel session.
prompt_count = 0
with open(output_path, 'w', encoding="utf-8") as file:
    for medical_ai_task in tqdm(medical_ai_tasks):
        # 1) one request per (profession, specialty) pair
        for profession in professions_data:
            main_profession = profession["profession"]
            for specialty in profession["specialties"]:
                # a specialty entry holds its name at index 0 and the possible ages at index 1
                specialty_name = specialty[0]
                demographics = helper.get_demographics(possible_ages=specialty[1], specialty=specialty_name,
                                                       **demographic_inputs)
                write_request(file, prompt_count, medical_ai_task, demographics,
                              profession=main_profession, specialty=specialty_name)
                prompt_count += 1

        # 2) one request per profession, without a specialty
        for profession in professions_data:
            demographics = helper.get_demographics(**demographic_inputs)
            # Pass the profession name, exactly as the specialty loop does; the
            # whole profession record (name + specialties) was passed here before.
            write_request(file, prompt_count, medical_ai_task, demographics,
                          profession=profession["profession"])
            prompt_count += 1

        # 3) one request per medical topic
        for topic in topics_data:
            demographics = helper.get_demographics(medical_topic=topic, **demographic_inputs)
            write_request(file, prompt_count, medical_ai_task, demographics, topic=topic)
            prompt_count += 1

print(f"batch of {prompt_count} prompts saved to {output_path}")

Creating a batch of prompts
that can be processed by gpt-4o in batch mode


100%|██████████| 13/13 [00:03<00:00,  3.91it/s]

batch of 2587 prompts saved to ../results/batched_prompts_task_x_specialties_x_demographic.jsonl



