### Part 1 -> specialties x domains

In [None]:
# Build a lookup from prompt id to the context (profession, specialty, domain) that the prompt was generated for
import json
from tqdm import tqdm

# Paths of the JSON inputs describing settings, subtopics and professions
path_medical_settings = "../results/medical_settings.json"
path_medical_subtopics = "../results/medical_subtopics.json"
path_medical_professions = "../results/medical_professions.json"

# Load data
with open(path_medical_settings, "r") as f:
    medical_settings = json.load(f)
with open(path_medical_subtopics, "r") as f:
    medical_subtopics = json.load(f)
with open(path_medical_professions, "r") as f:
    medical_professions = json.load(f)

# Map every prompt id to its "profession, specialty, domain" context string.
# Ids are consecutive integers starting at 0, assigned in iteration order
# (profession -> domain -> specialty).
# NOTE(review): presumably this order must match the one used when the prompts
# were generated, otherwise ids and contexts drift apart -- confirm.
contexts = {}
for prof_entry in tqdm(medical_professions):
    profession = prof_entry["profession"]
    specialties = prof_entry["specialties"]
    for top_entry in medical_subtopics:
        domain = top_entry["domain"]
        # The per-domain subtopics are deliberately not expanded here:
        # crossing them in as well produced too many prompts.
        for specialty in specialties:
            # len(contexts) is the next free id; this avoids a separate
            # counter named `id`, which would shadow the builtin for the
            # rest of the kernel session.
            contexts[len(contexts)] = ", ".join((profession, specialty, domain))

print(f"Contexts of size {len(contexts)} of generated prompts saved in contexts dictionary")

100%|██████████| 21/21 [00:00<00:00, 5132.89it/s]

Context of size 6321 of generated prompts saved in context dictionary





In [14]:
# Function to parse the string and create a JSON object
def parse_gpt_response(string, id, contexts):
    """
    Build the output record for a single GPT response.

    Args:
        string: Raw response text. Escape sequences that spell out UTF-8
            bytes (e.g. a literal ``\\xc3\\xa9``) are decoded to the
            characters they encode; plain text passes through unchanged.
        id: Prompt id, as an int or a numeric string (e.g. ``"1"``).
        contexts: Mapping from integer prompt id to its context string.

    Returns:
        A dict with the keys ``"id"`` (int), ``"prompt"`` (decoded text) and
        ``"context"``, or ``None`` when the record cannot be built: the id is
        not an integer, the id has no entry in ``contexts``, or the text
        cannot be decoded.
    """
    try:
        prompt_id = int(id)

        # Decode escape sequences and re-interpret the result as UTF-8.
        # The unicode_escape step maps every byte to one latin-1 character,
        # so encoding back with latin-1 restores the bytes for the final
        # UTF-8 decode. This raises UnicodeError for malformed escapes, for
        # escapes above U+00FF and for bytes that are not valid UTF-8.
        decoded_string = (string.encode().decode('unicode_escape')).encode('latin1').decode('utf-8')

        return {
            "id": prompt_id,
            "prompt": decoded_string,
            "context": contexts[prompt_id],
        }

    except (ValueError, TypeError, LookupError):
        # ValueError covers both a non-numeric id and UnicodeError (its
        # subclass); LookupError covers an id missing from `contexts`.
        # Returning None lets the caller record the response as failed
        # instead of aborting the whole run.
        return None
    

In [15]:
import json
from tqdm import tqdm
import re

path_to_results = "../results/gpt_results.jsonl"
output_path = "../results/parsed_health_care_worker_prompts.json"


# Parse the results file line by line. Every line is expected to be one JSON
# object holding the request's "custom_id" and the chat completion under
# response -> body -> choices[0] -> message -> content.
print("Parsing GPT responses...")
medical_prompts = []
questions_failed_to_parse = []
with open(path_to_results, 'r', encoding="utf-8") as file:
    for line_number, line in enumerate(tqdm(file), start=1):
        if not line.strip():
            continue  # ignore blank lines such as a trailing newline

        # Label used to report lines that carry no usable custom_id
        line_label = f"line {line_number}"
        try:
            data = json.loads(line)  # Parse each line as JSON
        except json.JSONDecodeError:
            # The id cannot be known for an unparsable line, so report its
            # position instead of reusing the id of a previous line.
            questions_failed_to_parse.append(line_label)
            continue

        custom_id = data.get("custom_id") if isinstance(data, dict) else None  # format "1"
        request_label = custom_id if custom_id not in (None, "") else line_label

        # Any level of the response may be missing, null or empty (e.g. an
        # entry without choices); treat all of these as "no content".
        try:
            response_content = data["response"]["body"]["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            response_content = None

        if not response_content or request_label is line_label:
            questions_failed_to_parse.append(request_label)
            continue

        parsed_response = parse_gpt_response(response_content, custom_id, contexts)
        if parsed_response:
            medical_prompts.append(parsed_response)
        else:
            questions_failed_to_parse.append(custom_id)
print("Parsing completed")

# Save to JSON file
with open(output_path, 'w') as json_file:
    json.dump(medical_prompts, json_file, indent=4)

# Output the parsed data (for verification)
print(f"Medical prompts have been saved to {output_path}")
print("See an example below:")
# Show the last successfully parsed prompt (prints null when nothing parsed)
print(json.dumps(medical_prompts[-1] if medical_prompts else None, indent=4))
print(f"Failed to parse {len(questions_failed_to_parse)} questions:")
print(questions_failed_to_parse)

Parsing GPT responses...


6321it [00:00, 45962.76it/s]

Parsing completed





Medical prompts have been saved to ../results/parsed_health_care_worker_prompts.json
See an example below:
{
    "id": 6320,
    "prompt": "\"What preventive strategies can be recommended for a patient who is at risk of depression due to prolonged physical rehabilitation, to improve their mental resilience and overall mental health?\"",
    "context": "Clinical Psychologist, Rehabilitation Psychology, Preventive Medicine"
}
Failed to parse 0 questions:
[]


### Part 2 - ai_tasks x subtopics

In [1]:
# Load files necessary for context and store them in lists
import json
from tqdm import tqdm

# Locations of the JSON inputs needed to rebuild each prompt's context
medical_ai_tasks_file = "../results/medical_ai_tasks.json"
subtopics_file = "../results/medical_subtopics.json"

# Read the medical AI task entries
with open(medical_ai_tasks_file, 'r') as file:
    medical_ai_tasks = json.load(file)

# Read the subtopic entries; each one carries its subtopics under 'subtopic'
with open(subtopics_file, 'r') as file:
    subtopics_data = json.load(file)

# Flatten the per-entry subtopics into a single list, preserving file order
subtopics = [subtopic for item in subtopics_data for subtopic in item['subtopic']]

In [2]:
# Function to parse the string and create a JSON object
def parse_gpt_response_2(string, task_id, subtopic_id, medical_ai_tasks, subtopics):
    """
    Build the output record for a single GPT response of the
    ai_tasks x subtopics run.

    Args:
        string: Raw response text. Escape sequences that spell out UTF-8
            bytes (e.g. a literal ``\\xc3\\xa9``) are decoded to the
            characters they encode; plain text passes through unchanged.
        task_id: Index into ``medical_ai_tasks``.
        subtopic_id: Index into ``subtopics``.
        medical_ai_tasks: Sequence of entries that each have a ``"task"`` key.
        subtopics: Sequence of subtopic strings.

    Returns:
        A dict with the keys ``"id"`` (the ``(task_id, subtopic_id)`` pair),
        ``"prompt"`` (decoded text) and ``"context"`` (``"<task>, <subtopic>"``),
        or ``None`` when the record cannot be built: an index is out of
        range, the task entry has no ``"task"`` key, or the text cannot be
        decoded.
    """
    try:
        # Decode escape sequences and re-interpret the result as UTF-8.
        # The unicode_escape step maps every byte to one latin-1 character,
        # so encoding back with latin-1 restores the bytes for the final
        # UTF-8 decode. This raises UnicodeError for malformed escapes, for
        # escapes above U+00FF and for bytes that are not valid UTF-8.
        decoded_string = (string.encode().decode('unicode_escape')).encode('latin1').decode('utf-8')

        context = medical_ai_tasks[task_id]["task"] + ", " + subtopics[subtopic_id]

        return {
            "id": (task_id, subtopic_id),
            "prompt": decoded_string,
            "context": context,
        }

    except (ValueError, TypeError, LookupError):
        # ValueError covers UnicodeError (its subclass); LookupError covers
        # out-of-range indices and a missing "task" key; TypeError covers
        # non-string context parts. Returning None lets the caller record
        # the response as failed instead of aborting the whole run.
        return None

In [None]:
import json
from tqdm import tqdm

# NOTE(review): this is the same results path that Part 1 reads, although the
# custom_id format differs ("1" there, "0-0" here) -- confirm the file on disk
# belongs to the ai_tasks x subtopics run before executing this cell.
path_to_results = "../results/gpt_results.jsonl"
output_path = "../results/parsed_prompts_tasks_x_subtopics.json"


# Parse the results file line by line. Every line is expected to be one JSON
# object holding the request's "custom_id" and the chat completion under
# response -> body -> choices[0] -> message -> content.
print("Parsing GPT responses...")
medical_prompts = []
questions_failed_to_parse = []
with open(path_to_results, 'r', encoding="utf-8") as file:
    for line_number, line in enumerate(tqdm(file), start=1):
        if not line.strip():
            continue  # ignore blank lines such as a trailing newline

        # Label used to report lines that carry no usable custom_id
        line_label = f"line {line_number}"
        try:
            data = json.loads(line)  # Parse each line as JSON
        except json.JSONDecodeError:
            # The id cannot be known for an unparsable line, so report its
            # position instead of reusing the id of a previous line.
            questions_failed_to_parse.append(line_label)
            continue

        custom_id = data.get("custom_id") if isinstance(data, dict) else None  # format "0-0" "task_id - subtopic_id"
        request_label = custom_id if custom_id not in (None, "") else line_label

        # Any level of the response may be missing, null or empty (e.g. an
        # entry without choices); treat all of these as "no content".
        try:
            response_content = data["response"]["body"]["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            response_content = None

        # Split the custom_id into its two integer parts; an id that does not
        # follow the "task_id-subtopic_id" format counts as a failure.
        try:
            id_parts = custom_id.split("-")
            task_id, subtopic_id = int(id_parts[0]), int(id_parts[1])
        except (AttributeError, IndexError, ValueError):
            questions_failed_to_parse.append(request_label)
            continue

        if not response_content:
            questions_failed_to_parse.append(custom_id)
            continue

        parsed_response = parse_gpt_response_2(response_content, task_id, subtopic_id, medical_ai_tasks, subtopics)
        if parsed_response:
            medical_prompts.append(parsed_response)
        else:
            questions_failed_to_parse.append(custom_id)
print("Parsing completed")

# Save to JSON file
with open(output_path, 'w') as json_file:
    json.dump(medical_prompts, json_file, indent=4)

# Output the parsed data (for verification)
print(f"Medical prompts have been saved to {output_path}")
print("See an example below:")
# Show the last successfully parsed prompt (prints null when nothing parsed)
print(json.dumps(medical_prompts[-1] if medical_prompts else None, indent=4))
print(f"Failed to parse {len(questions_failed_to_parse)} questions:")
print(questions_failed_to_parse)