In [1]:
import json
import random
from tqdm import tqdm

### Part 3 -> task x specialty x demographics

In [2]:
import json
import random

# Load the .jsonl file
def load_jsonl(file_path):
    """
    Loads a .jsonl file and returns its content as a list of parsed records.

    Each non-blank line is parsed as one JSON document. Whitespace-only lines
    (for example a trailing empty line at the end of the file) are skipped
    instead of being handed to the JSON parser.

    Args:
        file_path (str): Path of the .jsonl file; it is read as UTF-8.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # `line.strip()` is only used as a filter; json.loads itself tolerates
        # the trailing newline of a real record.
        return [json.loads(line) for line in file if line.strip()]

# Randomly sample an entry and extract custom_id and content
def sample_entry(jsonl_data):
    """
    Randomly samples an entry from the jsonl data and extracts the custom_id and assistant content.

    Args:
        jsonl_data (list): Non-empty list of record dicts. The assistant text is
            looked up at response -> body -> choices[0] -> message -> content.

    Returns:
        tuple: (custom_id, assistant_content).
            custom_id is "N/A" when the record has no "custom_id" key.
            assistant_content is None when the record has no usable choices,
            and "N/A" when the first choice's message has no "content" key.

    Raises:
        IndexError: If jsonl_data is empty (raised by random.choice).
    """
    sampled_entry = random.choice(jsonl_data)

    custom_id = sampled_entry.get("custom_id", "N/A")

    # `dict.get(key, default)` only falls back when the key is absent. A key
    # that is present but holds JSON null arrives as None, so `or {}` / `or []`
    # is needed to keep the lookup chain from raising AttributeError.
    response = sampled_entry.get("response") or {}
    body = response.get("body") or {}
    choices = body.get("choices") or []

    assistant_content = None
    if choices:
        message = choices[0].get("message") or {}
        assistant_content = message.get("content", "N/A")

    return custom_id, assistant_content

In [9]:
# Read every record of the GPT results file into memory
data = load_jsonl("../results/gpt_results.jsonl")

# Draw one record at random to inspect what a raw response looks like
custom_id, assistant_content = sample_entry(data)

# Show the sampled record's id followed by the assistant's reply
print(
    f"Custom ID: {custom_id}",
    f"Assistant Content: {assistant_content}",
    sep="\n",
)


Custom ID: 1028-Uganda,Cyprus,Paraguay,Angola
Assistant Content: How can I explain to a 4-year-old boy in Uganda what leukemia is and what to expect when undergoing treatment, in simple and reassuring terms, to help him understand his diagnosis and the importance of treatment compliance?

###

What are some effective ways to teach an 8-year-old boy in Cyprus and his family about the diagnosis of Hodgkin's lymphoma, including what the disease is, how it can be treated, and what side effects to expect from chemotherapy, using language that is easy for the child to comprehend?

###

How do I educate an elderly transgender male patient, aged 82, in Paraguay, on the specifics of their breast cancer diagnosis and the treatment options, such as surgery and hormone therapy, ensuring the information is respectful, sensitive to gender identity, and easy to understand?

###

What are the key points I should cover when educating a 22-year-old man in Angola about his recent diagnosis of testicular 

In [32]:
# Helper functions that split a raw GPT response into individual questions and clean them up
def split_questions(input_text):
    """
    Breaks a multi-question response into its individual questions.

    The text is cut at every '###' marker. Each piece is trimmed, and pieces
    that are empty after trimming are discarded.

    Args:
        input_text (str): A string containing multiple questions separated by '###'.

    Returns:
        list: The non-empty, whitespace-trimmed questions in their original order.
    """
    questions = []
    for chunk in input_text.split('###'):
        trimmed = chunk.strip()
        if trimmed:
            questions.append(trimmed)
    return questions

import re

def format_prompts(prompt):
    """
    Cleans a single generated prompt string.

    Removes every complete **bold** span (the markers and the text between
    them) and a leading list number such as "1. ", then trims surrounding
    whitespace.

    Args:
        prompt (str): The raw prompt text.

    Returns:
        str: The cleaned prompt.
    """
    # Remove each complete **...** span, leftmost first. DOTALL lets a span
    # cross line breaks. A "**" without a closing partner is left untouched.
    prompt = re.sub(r"\*\*.*?\*\*", "", prompt, flags=re.DOTALL)

    # Strip the list number only after the bold spans are gone and the text is
    # trimmed. Otherwise a number that follows a bold header
    # ("**Header** 1. text") or leading whitespace would survive, because the
    # pattern is anchored to the very start of the string.
    prompt = re.sub(r"^\d+\.\s*", "", prompt.strip())
    return prompt.strip()

def extract_country(input_string):
    """
    Picks the country that belongs to one lettered prompt.

    The input has the form "<letter>-<number>-<country>,<country>,...". The
    letter ('A' to 'D') selects the country by position: 'A' the first, 'B' the
    second, and so on.

    Args:
        input_string (str): The combined id, e.g. "A-1028-Uganda,Cyprus,Paraguay,Angola".

    Returns:
        str: "<letter>-<number>-<selected country>", e.g. "A-1028-Uganda".

    Raises:
        ValueError: If the string does not have three '-'-separated parts, the
            number is not an integer, the letter is not exactly one of
            'A'-'D', or fewer countries are listed than the letter requires.
    """
    try:
        # maxsplit=2 keeps hyphens inside country names ("Guinea-Bissau") intact.
        letter, raw_number, countries_part = input_string.split('-', 2)
        entry_number = int(raw_number)

        # Plain comma split: a country name that itself contains a comma
        # would be split into two entries.
        countries = [country.strip() for country in countries_part.split(',')]

        # Require exactly one character. A bare `letter in "ABCD"` is a
        # substring test that also accepts "" and "AB", which then fail in
        # ord() with a TypeError that the handler below does not catch.
        if len(letter) != 1 or letter not in "ABCD":
            raise ValueError("Input must be one of 'A', 'B', 'C', or 'D'.")

        selected_country = countries[ord(letter) - ord('A')]
        return f"{letter}-{entry_number}-{selected_country}"
    except (IndexError, ValueError) as exc:
        # Chain the original error so the specific cause stays in the traceback.
        raise ValueError("Input string is not in the correct format or index is out of range.") from exc



def parse_gpt_response(content, id):
    """
    Parses one GPT response string into a list of prompt dicts.

    The response must contain exactly four questions separated by '###'.
    They are paired, in order, with the letters A-D; extract_country() uses
    the letter to pick the matching country out of `id`.

    Args:
        content (str): The assistant message text.
        id (str): The record's custom_id, which extract_country() expects in
            the form "<number>-<country>,<country>,<country>,<country>".

    Returns:
        list: Four dicts, each with the keys "id" (e.g. "A-1028-Uganda") and
            "prompt" (the cleaned question text).

    Raises:
        ValueError: If the response does not split into exactly four
            questions, if extract_country() rejects the id, or if the text
            cannot be decoded (the Unicode errors raised by the decoding step
            are ValueError subclasses).
    """
    # There used to be an `except json.JSONDecodeError` handler here, but
    # nothing in this function parses JSON, so it could never run. Failures
    # have always reached the caller as ValueError; that is kept and is now
    # the documented contract.

    # Interpret backslash escapes that are present as literal text in the
    # response: unicode_escape resolves them (reading the bytes as Latin-1),
    # the Latin-1 encode turns the result back into raw bytes, and those bytes
    # are finally read as UTF-8.
    # TODO: verify that the GPT responses really are escaped like that
    decoded_content = (content.encode().decode('unicode_escape')).encode('latin1').decode('utf-8')

    split_content = split_questions(decoded_content)
    if len(split_content) != 4:
        raise ValueError(f"lenght of split content for {id} is not as expected: {len(split_content)}")

    return_list = []
    for question, letter in zip(split_content, ["A", "B", "C", "D"]):
        return_list.append({
            "id": extract_country(letter + "-" + id),
            "prompt": format_prompts(question),
        })

    return return_list

In [None]:
import json
from tqdm import tqdm

path_to_results = "../results/gpt_results.jsonl"
output_path = "../results/parsed_prompts_task_x_specialties_x_demographic_2.json"


# Turn every line of the results file into prompt dicts. A record that cannot
# be parsed is noted in `questions_failed_to_parse` and skipped, so a single
# malformed response does not abort the whole run.
print("Parsing GPT responses...")
medical_prompts = []
questions_failed_to_parse = []
with open(path_to_results, 'r', encoding="utf-8") as file:
    for line_number, line in enumerate(tqdm(file), start=1):
        if not line.strip():
            continue  # blank line (e.g. a trailing newline), not a record

        # Label used when reporting a failure. It is replaced by the record's
        # own custom_id once that has been read, so a failure is never
        # attributed to the previous record.
        record_id = f"line {line_number}"
        parsed_responses = None
        try:
            data = json.loads(line)  # Parse each line as JSON
            custom_id_of_record = data.get("custom_id")
            record_id = custom_id_of_record or record_id

            # `or {}` / `or []` keep the lookup chain alive when a key is
            # present but holds JSON null.
            body = (data.get("response") or {}).get("body") or {}
            choices = body.get("choices") or []
            response_content = (choices[0].get("message") or {}).get("content") if choices else None

            if response_content and custom_id_of_record:
                parsed_responses = parse_gpt_response(response_content, custom_id_of_record)
        except (ValueError, AttributeError, TypeError):
            # ValueError covers json.JSONDecodeError as well as the errors
            # parse_gpt_response raises for an unexpected response layout.
            # AttributeError / TypeError cover records whose JSON structure
            # is not the expected nesting of objects.
            parsed_responses = None

        if parsed_responses:
            medical_prompts.extend(response for response in parsed_responses if response)
        else:
            questions_failed_to_parse.append(record_id)
print("Parsing completed")

# Save to JSON file
with open(output_path, 'w', encoding="utf-8") as json_file:
    json.dump(medical_prompts, json_file, indent=4)

# Output the parsed data (for verification)
print(f"Medical prompts have been saved to {output_path}")
if medical_prompts:
    # Show the most recently parsed prompt; guarded so an empty result set
    # does not raise.
    print("See an example below:")
    print(json.dumps(medical_prompts[-1], indent=4))
print(f"Failed to parse {len(questions_failed_to_parse)} questions:")
print(questions_failed_to_parse)

Parsing GPT responses...


2586it [00:00, 13917.60it/s]


Parsing completed
Medical prompts have been saved to ../results/parsed_prompts_task_x_specialties_x_demographic_3.json
See an example below:
{
    "id": "D-2586-Burkina Faso",
    "prompt": "A 10-year-old female student in Burkina Faso requires a school health assessment. To ensure compliance with Burkina Faso's preventive health regulations, what are the specific legal requirements regarding vaccinations and health screenings that are mandated for school-age children to both protect individual health and prevent disease outbreaks in educational settings?"
}
Failed to parse 0 questions:
[]
