In [2]:
import json
import random
from tqdm import tqdm

### Part 3 -> task x specialty x demographics

In [2]:
import json
import random

# Load the .jsonl file
def load_jsonl(file_path):
    """
    Loads a .jsonl file and returns its content as a list of dictionaries.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file]

# Randomly sample an entry and extract custom_id and content
def sample_entry(jsonl_data):
    """
    Randomly samples an entry from the jsonl data and extracts the custom_id and assistant content.
    """
    sampled_entry = random.choice(jsonl_data)
    
    # Extract custom_id
    custom_id = sampled_entry.get("custom_id", "N/A")

    # Extract assistant content
    assistant_content = None
    body = sampled_entry.get("response", {}).get("body", {})
    choices = body.get("choices", [])
    if choices:
        assistant_content = choices[0].get("message", {}).get("content", "N/A")

    return custom_id, assistant_content

In [9]:
# Load data
data = load_jsonl("../results/gpt_results.jsonl")

# Sample an entry
custom_id, assistant_content = sample_entry(data)

# Display the results
print(f"Custom ID: {custom_id}")
print(f"Assistant Content: {assistant_content}")


Custom ID: 1028-Uganda,Cyprus,Paraguay,Angola
Assistant Content: How can I explain to a 4-year-old boy in Uganda what leukemia is and what to expect when undergoing treatment, in simple and reassuring terms, to help him understand his diagnosis and the importance of treatment compliance?

###

What are some effective ways to teach an 8-year-old boy in Cyprus and his family about the diagnosis of Hodgkin's lymphoma, including what the disease is, how it can be treated, and what side effects to expect from chemotherapy, using language that is easy for the child to comprehend?

###

How do I educate an elderly transgender male patient, aged 82, in Paraguay, on the specifics of their breast cancer diagnosis and the treatment options, such as surgery and hormone therapy, ensuring the information is respectful, sensitive to gender identity, and easy to understand?

###

What are the key points I should cover when educating a 22-year-old man in Angola about his recent diagnosis of testicular 

In [32]:
# Function to parse the string and create a JSON object
def split_questions(input_text):
    """
    Splits the input text into separate questions using '###' as the delimiter.

    Args:
        input_text (str): A string containing multiple questions separated by '###'.

    Returns:
        list: A list of individual questions as strings.
    """
    # Split the text by '###' and strip extra whitespace from each part
    questions = [question.strip() for question in input_text.split('###') if question.strip()]
    return questions

import re

def format_prompts(prompt):
    """
    Formats the prompts in the given JSON data.

    Args:
        json_data (list): A list of dictionaries containing "id" and "prompt" fields.

    Returns:
        list: A list of formatted dictionaries with updated prompts.
    """
    # Remove numbering at the start (e.g., "1.", "2.", etc.)
    prompt = re.sub(r"^\d+\.\s*", "", prompt)  # Matches "1. " or "2. " at the beginning

    # Remove content between ** and **, including the **
    while "**" in prompt:
        start = prompt.find("**")
        end = prompt.find("**", start + 2)
        if end == -1:
            break
        prompt = prompt[:start] + prompt[end + 2:]
    return prompt.strip()

def extract_country(input_string):
    # Split the string into the letter, number, and countries part
    try:
        parts = input_string.split('-', 2)
        letter = parts[0]
        id = int(parts[1])
        countries_part = parts[2]

        # Split countries by commas while accounting for special cases
        countries = [country.strip() for country in countries_part.split(',')]

        # Extract the country based on the number
        if letter in "ABCD":
            number = ord(letter) - ord('A')
        else:
            raise ValueError("Input must be one of 'A', 'B', 'C', or 'D'.")
        
        selected_country = countries[number]

        # Reassemble the output
        output_string = f"{letter}-{id}-{selected_country}"
        return output_string
    except (IndexError, ValueError):
        raise ValueError("Input string is not in the correct format or index is out of range.")



def parse_gpt_response(content, id):
    """
    Helper function fur parse_results()
    Parses the gpt responses from string to dict
    """
# Create a dictionary to hold the data in the desired format
# Only successfull if all that is expected is present

    try:
        # Attempt to load the JSON string
        # TODO: verify that gpt_response is like that

        # Decode any UTF-8 character codes in the input string
        decoded_content = (content.encode().decode('unicode_escape')).encode('latin1').decode('utf-8')
        split_content = split_questions(decoded_content)
        if len(split_content) != 4:
            raise ValueError(f"lenght of split content for {id} is not as expected: {len(split_content)}")

        return_list = []
        for c, l in zip(split_content, ["A", "B", "C", "D"]):
            full_id = l + "-" + id
            parsed_id = extract_country(full_id)
            prompt = format_prompts(c)
            data_dict = {}
            data_dict["id"] = parsed_id
            data_dict["prompt"] = prompt
            return_list.append(data_dict)
        
        return return_list

    except json.JSONDecodeError as e:
        return None

In [None]:
import json
from tqdm import tqdm

path_to_results = "../results/gpt_results.jsonl"
output_path = "../results/parsed_prompts_task_x_specialties_x_demographic_2.json"


# TODO: do both in the same for loop, no need to store contents
# Extract content from json
print("Parsing GPT responses...")
medical_prompts = []
questions_failed_to_parse = []
with open(path_to_results, 'r') as file:
    for line in tqdm(file):
        try:
            data = json.loads(line)  # Parse each line as JSON
            response_content = data.get("response", {}).get("body", {}).get("choices", [])[0].get("message", {}).get("content", None)
            id = data.get("custom_id") # format "0-0" "task_id - subtopic_id"
            if response_content and id:
                parsed_responses = parse_gpt_response(response_content, id)
                for response in parsed_responses:
                    if bool(response):
                        medical_prompts.append(response)
                    else:
                        questions_failed_to_parse.append(id)
        except json.JSONDecodeError as e:
            questions_failed_to_parse.append(id)
print("Parsing completed")

# Save to JSON file
with open(output_path, 'w') as json_file:
    json.dump(medical_prompts, json_file, indent=4)

# Output the parsed data (for verification)
print(f"Medical prompts have been saved to {output_path}")
print("See an example below:")
print(json.dumps(response, indent=4))
print(f"Failed to parse {len(questions_failed_to_parse)} questions:")
print(questions_failed_to_parse)

Parsing GPT responses...


2586it [00:00, 13917.60it/s]


Parsing completed
Medical prompts have been saved to ../results/parsed_prompts_task_x_specialties_x_demographic_3.json
See an example below:
{
    "id": "D-2586-Burkina Faso",
    "prompt": "A 10-year-old female student in Burkina Faso requires a school health assessment. To ensure compliance with Burkina Faso's preventive health regulations, what are the specific legal requirements regarding vaccinations and health screenings that are mandated for school-age children to both protect individual health and prevent disease outbreaks in educational settings?"
}
Failed to parse 0 questions:
[]


In [3]:
import json
import random

# Add answer styles to the prompts
def load_json_file(file_path):
    """Load and return data from a JSON file."""
    with open(file_path, 'r') as file:
        return json.load(file)
    
# Sample two titles without replacement, considering weights
def weighted_sample_no_replacement(titles, weights, k):
    chosen_titles = []
    available_titles = titles.copy()
    available_weights = weights.copy()
    
    for _ in range(k):
        selected = random.choices(available_titles, weights=available_weights, k=1)[0]
        chosen_titles.append(selected)
        
        # Remove selected title and its weight to ensure no replacement
        idx = available_titles.index(selected)
        available_titles.pop(idx)
        available_weights.pop(idx)
    
    return chosen_titles


def append_random_options_twice(entries, options_data, emergency_option):
    """Append two different options to each prompt based on given weights and emergency rules."""
    updated_entries = []
    nbr_of_emergencies = 0
    for entry in entries:
        treat_as_emergency = random.choice([True, False])
        if 'emergency' in entry['prompt'].lower() and treat_as_emergency:
            # If "emergency" is in the prompt, assign the hardcoded option
            entry_copy = entry.copy()
            entry_copy['prompt'] += f"\n{emergency_option}"
            parts = entry_copy['id'].split('-', 2)  # Split into three parts: letter, number, and the rest (country)
            letter = parts[0]
            number = parts[1]
            country = parts[2]

            # Update the ID with the new identifier
            entry_copy['id'] = f"{letter}-{number}-EM-{country}"
            updated_entries.append(entry_copy)
            nbr_of_emergencies += 1

        else:
            # Weighted random choice for two different titles without replacement
            titles = [item['title'] for item in options_data]

            # Set weights for titles (must correspond to the order in the options JSON file)
            if 'translate' in entry['prompt'].lower():
                weights = [2, 2, 2, 0, 1, 1]  # do not want the output given in a specified language, since it is already specified in the prompt
            else:
                weights = [2, 2, 2, 2, 1, 1]

            chosen_titles = weighted_sample_no_replacement(titles=titles, weights=weights, k=2)

            for chosen_title in chosen_titles:
                # Get options for the chosen title
                chosen_options = next(item['options'] for item in options_data if item['title'] == chosen_title)

                # Uniform random choice among options
                chosen_option = random.choice(chosen_options)

                # Create a copy of the entry and append the chosen option to the prompt
                entry_copy = entry.copy()
                entry_copy['prompt'] += f"\n{chosen_option}"

                # Determine the identifier to append
                title_index = options_data.index(next(item for item in options_data if item['title'] == chosen_title)) + 1
                option_index = chr(chosen_options.index(chosen_option) + ord('A'))
                identifier = f"{title_index}{option_index}"

                # Split the ID into its parts while preserving the country name
                parts = entry['id'].split('-', 2)  # Split into three parts: letter, number, and the rest (country)
                letter = parts[0]
                number = parts[1]
                country = parts[2]

                # Update the ID with the new identifier
                entry_copy['id'] = f"{letter}-{number}-{identifier}-{country}"
                updated_entries.append(entry_copy)

    print(f"nbr of emergencies: {nbr_of_emergencies}")
    return updated_entries

def save_to_json_file(data, file_path):
    """Save data to a JSON file."""
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)

# Paths to the JSON files
entries_file = "../results/parsed_prompts_task_x_specialties_x_demographic_2.json"  # Replace with your file path
options_file = "../resources/answer_styles.json"  # Replace with your file path

# Load data
entries = load_json_file(entries_file)
options_data = load_json_file(options_file)

# Set weights for titles (must correspond to the order in the options JSON file)
weights = [2, 2, 2, 2, 1, 1]  # Adjust weights as desired

# Hardcoded option for prompts containing "emergency"
emergency_option = "Highlight the urgent steps to take." # this option was deleted from the answer_style.json because it only makes sense in the context of an emergency

# Append random options to the prompts
updated_entries = append_random_options_twice(entries, options_data, emergency_option)

# Output the updated entries
for i, entry in enumerate(updated_entries):
    if i >= 10:
        break
    print(f"ID: {entry['id']}")
    print(f"Prompt: {entry['prompt']}\n")

# After processing the entries:
output_file = "../results/parsed_prompts_task_x_specialties_x_demographic_x_answerstyle_2.json"  # Replace with your desired output file path
save_to_json_file(updated_entries, output_file)
print(f"Updated entries saved to {output_file}")

nbr of emergencies: 293
ID: A-0-2E-Cameroon
Prompt: A 54-year-old female patient in Cameroon presents with persistent nasal congestion, itching of the eyes, and sneezing that have been ongoing for several weeks. Her medical history reveals a previous diagnosis of seasonal allergic rhinitis, but her symptoms seem to persist beyond the usual allergy season. She reports that the symptoms are particularly aggravated at nighttime when indoors. The family has recently acquired a new pet dog. Considering her medical history and current symptoms, what differential diagnoses should be considered, and what specific tests or management approaches are recommended in this setting?
Format the answer as a script or dialogue.

ID: A-0-3C-Cameroon
Prompt: A 54-year-old female patient in Cameroon presents with persistent nasal congestion, itching of the eyes, and sneezing that have been ongoing for several weeks. Her medical history reveals a previous diagnosis of seasonal allergic rhinitis, but her sym

In [5]:
import json

def get_ai_task(number, file_path):
    """
    Retrieves the task name and the number for a given input number between 1 and 13.

    Parameters:
    - file_path (str): Path to the .json file.
    - number (int): The input number (1-13).

    Returns:
    - tuple: A tuple containing the task name and the number.

    Raises:
    - ValueError: If the number is not between 1 and 13.
    """
    if not (1 <= number <= 13):
        raise ValueError("The number must be between 1 and 13.")

    # Read the .json file
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Ensure the number corresponds to an entry index
    if number > len(data):
        raise ValueError("The number exceeds the available entries in the file.")

    # Retrieve the task name
    task_name = data[number - 1].get("task", "Task name not found")

    return task_name, number

def get_specialty_by_number(file_path, number):
    """
    Retrieves the specialty and the input number for a given number between 1 and 129.

    Parameters:
    - file_path (str): Path to the .json file.
    - number (int): The input number (1-129).

    Returns:
    - tuple: A tuple containing the specialty and the number.

    Raises:
    - ValueError: If the number is not between 1 and 129 or exceeds the total specialties.
    """
    if not (1 <= number <= 129):
        raise ValueError("The number must be between 1 and 129.")

    # Read the .json file
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Flatten the specialties list
    specialties = []
    for entry in data:
        specialties.extend(entry.get("specialties", []))

    # Check if the number exceeds the available specialties
    if number > len(specialties):
        raise ValueError("The number exceeds the available specialties in the file.")

    # Retrieve the specialty
    specialty = specialties[number - 1]

    return specialty, number


def get_profession_by_number(file_path, number):
    """
    Retrieves the profession and the input number for a given number between 130 and 150.

    Parameters:
    - file_path (str): Path to the .json file.
    - number (int): The input number (130-150).

    Returns:
    - tuple: A tuple containing the profession and the number.

    Raises:
    - ValueError: If the number is not between 130 and 150.
    """
    if not (130 <= number <= 150):
        raise ValueError("The number must be between 130 and 150.")

    # Read the .json file
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Calculate the index corresponding to the number
    index = number - 130

    # Ensure the index is within bounds
    if index >= len(data):
        raise ValueError("The number exceeds the available professions in the file.")

    # Retrieve the profession
    profession = data[index].get("profession", "Profession not found")

    return profession, number

import json

def get_topic_by_number(file_path, number):
    """
    Retrieves the topic and the input number for a given number between 151 and 199.

    Parameters:
    - file_path (str): Path to the .json file.
    - number (int): The input number (151-199).

    Returns:
    - tuple: A tuple containing the topic and the number.

    Raises:
    - ValueError: If the number is not between 151 and 199.
    """
    if not (151 <= number <= 199):
        raise ValueError("The number must be between 151 and 199.")

    # Read the .json file
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Calculate the index corresponding to the number
    index = number - 151

    # Ensure the index is within bounds
    if index >= len(data):
        raise ValueError("The number exceeds the available topics in the file.")

    # Retrieve the topic
    topic = data[index].get("topic", "Topic not found")

    return topic, number


def get_context(number):
    if number >= 1 and number <= 129:
        return get_specialty_by_number("../resources/medical_professions.json", number)
    elif number >= 130 and number <= 150:
        return get_profession_by_number("../resources/medical_professions.json", number)
    elif number >= 151 and number <=199:
        return get_topic_by_number("../resources/medical_topics.json", number)
    else:
        raise ValueError(f"{number} is not valid")


In [11]:
import json
import math

# Function to read, process, and update the .json file
def process_json(file_path):
    """
    Reads a .json file, processes the 'id' field for each entry, adds a 'context' field, and writes it back.
    """
    # Read the .json file
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Process each entry
    for i, entry in enumerate(data):
        entry_id = entry.get("id")
        parts = entry_id.split('-', 3)  # Split into three parts: letter, number, and the rest (country)
        letter = parts[0]
        number = int(parts[1])
        answer_style = parts[2]
        country = parts[3]
        print(number)

        ai_task, ai_task_nbr = get_ai_task((math.floor(number/199)+1), "../resources/medical_ai_tasks.json")
        print(ai_task)
        print(ai_task_nbr)
        context, context_nbr = get_context(number - ((ai_task_nbr - 1)*199) +1)

        # Add a new 'context' field (content to be defined manually)
        entry["context"] = [ai_task_nbr, ai_task, context_nbr, context]

    # Write the updated data back to the same .json file
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

# Example usage
file_path = "../results/parsed_prompts_task_x_specialties_x_demographic_x_answerstyle_2.json"  # Replace with your actual file path
process_json(file_path)


0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
0
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
1
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
2
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
3
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
4
Diagnosing Symptoms
1
5
Diagnosing Symptoms
1
5
Diagnosing Sym