In [5]:
import json
import random

def print_random_entry_from_jsonl(file_path):
    """
    Print the fields of one entry chosen uniformly at random from a .jsonl file.

    The file is read as UTF-8. Blank lines (for example a trailing empty line at
    the end of the file) are skipped instead of being reported as malformed JSON.
    Problems are reported on stdout; nothing is raised for a missing or
    malformed file.

    Args:
        file_path (str): Path to the .jsonl file (one JSON object per line).

    Returns:
        None. Prints either the chosen entry as "key: value" lines, or a
        message when the file is missing, empty, or not valid JSON.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            entries = [json.loads(line) for line in file if line.strip()]
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return
    except json.JSONDecodeError:
        print("Error decoding JSON. Please check the file format.")
        return

    if not entries:
        print("The file is empty.")
        return

    random_entry = random.choice(entries)

    print("Random Entry:")
    for key, value in random_entry.items():
        print(f"{key}: {value}")

# Example usage: print one randomly chosen entry of the situations .jsonl file
file_path = "../results/situations_task_x_specialties_x_demographic_x_answerstyle_2.jsonl"
print_random_entry_from_jsonl(file_path)


Random Entry:
id: B-2218-2A-Central African Republic
nbr_of_turns: 3
initial_prompt: In the Central African Republic, an 8-month-old female is brought in with high fever, persistent crying, and refusal to feed. She also has a bulging fontanelle and is lethargic. Given these symptoms, can you guide me on how to prioritize this case and what immediate interventions should be considered?
Answer in bullet points.
system_prompt_chatbot: You are a medical AI chatbot designed to assist health care workers working in Central African Republic by answering their questions. 
The style of your answers must follow these rules: 
- Respectful, polite interaction: You must always engage in a respectful, polite, and courteous manner, maintaining professionalism in all interactions.
- Honest, evidence-based information: All responses have to be based on the latest medical evidence and guidelines
- Ethical and safe content: Under no circumstances should you provide fake, harmful, unethical, racist, sexis

In [1]:
import json

# Function to load and count entries in a .json file
def count_entries(file_path):
    """
    Return the number of top-level items in a UTF-8 encoded .json file.

    The whole document is parsed with json.load and its len() is returned,
    so a JSON array yields its element count and a JSON object its key count.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return len(parsed)

# Count the entries of the parsed-prompts .json file and print the total
file_path = "../results/parsed_prompts_task_x_specialties_x_demographic_x_answerstyle_2.json"  # Replace with the actual path to your .json file
entry_count = count_entries(file_path)
print(f"The file contains {entry_count} entries.")


The file contains 20395 entries.


In [13]:
import json

def count_entries_and_screen_for_emergency(json_file_path):
    """
    Print how many entries a JSON file holds and how many prompts mention 'emergency'.

    The file is read as UTF-8 and parsed as a single JSON document whose items
    are dictionaries. The match is a case-insensitive substring test on each
    entry's "prompt" value; entries whose prompt is missing or null count as
    non-matching instead of raising.

    Args:
        json_file_path (str): Path to the .json file.

    Returns:
        None. The two counts, or an error message if the file is missing or
        is not valid JSON, are printed to stdout.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the platform's default encoding.
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        print("Error: Unable to read or parse the JSON file.")
        return

    # Count the number of entries
    total_entries = len(data)
    print(f"Total number of entries: {total_entries}")

    # Screen for the word 'emergency' in the prompt and count matches
    emergency_count = sum(
        'emergency' in str(entry.get('prompt') or '').lower() for entry in data
    )
    print(f"Number of prompts containing the word 'emergency': {emergency_count}")

# Print the entry count and the number of prompts mentioning 'emergency' for the parsed-prompts file
json_file_path = "../results/parsed_prompts_task_x_specialties_x_demographic_x_answerstyle_2.json"
count_entries_and_screen_for_emergency(json_file_path)

Total number of entries: 20395
Number of prompts containing the word 'emergency': 973


In [17]:
import json
import random

# Load the .json file
def load_json(file_path):
    """
    Parse a UTF-8 encoded .json file and return the decoded Python object.

    The return value is whatever json.load produces for the document
    (for example a list of dictionaries when the file holds a JSON array of objects).
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content

# Randomly sample an entry and extract id and prompt
def sample_entry(json_data):
    """
    Pick one entry at random and return its (id, prompt) pair.

    The entry is drawn with random.choice; a missing "id" or "prompt" key
    is reported as the placeholder string "N/A".
    """
    chosen = random.choice(json_data)
    return chosen.get("id", "N/A"), chosen.get("prompt", "N/A")

# Load the parsed-prompts file once so that sampling below works on in-memory data
file_path = "../results/parsed_prompts_task_x_specialties_x_demographic_x_answerstyle_2.json"  # Replace with the actual path to your .json file
json_data = load_json(file_path)

# Draw one random entry and print its id and prompt ("N/A" stands in for a missing field)
entry_id, prompt = sample_entry(json_data)
print(f"ID: {entry_id}\nPrompt: {prompt}")


ID: A-1092-3F-North Macedonia
Prompt: How can I explain the condition of plantar fasciitis to a 35-year-old female patient in North Macedonia, in simple terms, including why it causes heel pain and what treatment options, such as stretching exercises or orthotics, are available to manage the condition?
In your answer highlight the risks and benefits.


In [None]:
import json
import random
age_ranges_file = "../resources/age_categories.json"
# Read the age-range records. UTF-8 is stated explicitly so that parsing does
# not depend on the platform's default encoding.
with open(age_ranges_file, 'r', encoding='utf-8') as file:
    age_ranges = json.load(file)  # Load JSON data from file
# Keep only each record's "category" label (the loop variable no longer shadows the built-in `dict`)
ages = [age_range["category"] for age_range in age_ranges]
# Draw 30 labels with replacement; as the cell's last expression the list is displayed
random.choices(ages, k=30)

['Elderly',
 'Centenarians',
 'Older Adults',
 'Preschoolers',
 'Toddlers',
 'School-age Children',
 'Infants',
 'Adolescents',
 'Infants',
 'Adolescents',
 'Adults',
 'Middle-aged Adults',
 'Older Adults',
 'Elderly',
 'Elderly',
 'Elderly',
 'Elderly',
 'Preschoolers',
 'Infants',
 'School-age Children',
 'Young Adults',
 'Infants',
 'School-age Children',
 'Adults',
 'Centenarians',
 'Young Adults',
 'Infants',
 'Toddlers',
 'Preschoolers',
 'Middle-aged Adults']

In [3]:
import pandas as pd
countries_file = "../resources/countries_by_income_category.csv"
countries = pd.read_csv(countries_file)

# Bucket the countries by the value of their "Income Category" column
groups = countries.groupby("Income Category")

# Stop early when fewer than two income categories are present
if len(groups) < 2:
    raise ValueError("Not enough unique income categories to sample from.")

# Draw two random countries from EVERY income category (so each category needs
# at least two rows), then stack the per-category samples into one frame
sampled_countries = [members.sample(n=2) for _category, members in groups]
sampled_countries_df = pd.concat(sampled_countries).reset_index(drop=True)
sampled_countries_df["Country"].to_list()

['Oman',
 'Greece',
 'Uganda',
 'Ethiopia',
 'Uzbekistan',
 'El Salvador',
 'Thailand',
 'Mauritius']

In [40]:
import json

# Both files are inputs to this comparison; nothing is written.
path_to_file1 = "../resources/patient_age_medical_profession.json"  # Entries whose "specialties" items carry the specialty name at index 0
path_to_file2 = "../resources/medical_professions.json"  # Entries whose "specialties" value is compared as a list of names


# UTF-8 is stated explicitly so that parsing does not depend on the platform's default encoding.
with open(path_to_file1, 'r', encoding='utf-8') as file:
    data1 = json.load(file)

with open(path_to_file2, 'r', encoding='utf-8') as file:
    data2 = json.load(file)

# Walk both files in parallel and print the specialty names of every position
# where the two files list exactly the same specialties in the same order.
# NOTE(review): zip() stops at the shorter file and mismatching positions are
# silently skipped — confirm that is sufficient for this check.
for dict1, dict2 in zip(data1, data2):
    list_dict1 = [element[0] for element in dict1["specialties"]]
    if dict2["specialties"] == list_dict1:
        print(list_dict1)

['Allergy and Immunology', 'Anesthesiology', 'Cardiology', 'Dermatology', 'Emergency Medicine', 'Endocrinology', 'Family Medicine', 'Gastroenterology', 'Geriatrics', 'Hematology', 'Infectious Disease', 'Internal Medicine', 'Nephrology', 'Neurology', 'Obstetrics and Gynecology', 'Oncology', 'Ophthalmology', 'Orthopedic Surgery', 'Otolaryngology (ENT)', 'Pathology', 'Pediatrics', 'Physical Medicine and Rehabilitation', 'Psychiatry', 'Pulmonology', 'Radiology', 'Rheumatology', 'Surgery', 'Urology']
['Critical Care Nursing', 'Emergency Nursing', 'Geriatric Nursing', 'Medical-Surgical Nursing', 'Neonatal Nursing', 'Oncology Nursing', 'Pediatric Nursing', 'Psychiatric Nursing', 'Public Health Nursing']
['Family Medicine', 'Emergency Medicine', 'Surgery', 'Dermatology', 'Orthopedics', 'Psychiatry']
['Clinical Pharmacy', 'Community Pharmacy', 'Consultant Pharmacy', 'Hospital Pharmacy', 'Industrial Pharmacy', 'Nuclear Pharmacy', 'Oncology Pharmacy']
['Cardiovascular and Pulmonary', 'Geriatric',

In [23]:
import json

def process_file(input_file, output_file):
    """
    Convert a text listing of professions and their specialties into a JSON file.

    Input format, as parsed here:
      - a line starting with "###" opens a new profession; the "###" marker and
        any surrounding spaces or asterisks are removed to obtain its name;
      - a line starting with a digit is a specialty of the current profession,
        written as "<number>. <specialty>: <age 1>, <age 2>, ...";
        asterisks are removed from the specialty name;
      - blank lines and all other lines are ignored.

    Only the first ". " (after the number) and the first ":" are treated as
    separators, so specialty names containing ". " and age lists containing
    ":" are kept intact.

    The output is a JSON list of objects of the form
    {"profession": <name>, "specialties": [[<specialty>, [<age>, ...]], ...]}.

    Args:
        input_file (str): Path to the text file to read (UTF-8).
        output_file (str): Path of the JSON file to write; it is overwritten.

    Raises:
        ValueError: If a specialty line contains no ":".
        IndexError: If a specialty line's number is not followed by ". ".
    """
    data = []
    current_profession = None
    specialties = []

    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            # Skip empty lines
            if not line:
                continue

            # Process profession (### lines)
            if line.startswith("###"):
                if current_profession:
                    # Save the previous profession and its specialties
                    data.append({
                        "profession": current_profession,
                        "specialties": specialties
                    })
                # Extract profession name (strip() removes any mix of spaces and asterisks)
                current_profession = line.replace("###", "").strip(" *")
                specialties = []

            # Process specialties (lines starting with numbers)
            elif line[0].isdigit():
                # Split on the first ":" only, so a colon inside the age list is preserved
                specialty, age_categories = line.split(":", 1)
                # Drop the leading "<number>. " but keep any later ". " inside the name
                specialty = specialty.strip().split(". ", 1)[1].replace("*", "")
                age_categories = [age.strip() for age in age_categories.split(",")]
                specialties.append([specialty, age_categories])

    # Add the last profession to data
    if current_profession:
        data.append({
            "profession": current_profession,
            "specialties": specialties
        })

    # Write the data to a JSON file
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

# Usage: convert the profession/specialty text listing into JSON (an existing output file is overwritten)
input_file = "../resources/patient_age_medical_profession.txt"  # Replace with your input file name
output_file = "../resources/patient_age_medical_profession.json"  # Replace with your desired output file name
process_file(input_file, output_file)

In [6]:
import json

# Path of the .jsonl file that replace_unicode_in_jsonl() below rewrites in place
file_path = '../results/multiturn_tasks_x_subtopics.jsonl'

# Function to replace Unicode escape sequences with real characters
def replace_unicode_in_jsonl(file_path):
    """
    Rewrite a .jsonl file in place so that \\uXXXX escapes become real characters.

    Every non-blank line is parsed as JSON and re-serialised with
    ensure_ascii=False. The file is read and written as UTF-8, so the function
    can be run again on its own output. Blank lines are dropped and every
    record, including the last one, ends with a newline.

    All lines are parsed before the file is reopened for writing, so a
    malformed line leaves the original file untouched.

    Args:
        file_path (str): Path to the .jsonl file to rewrite.

    Returns:
        None. A success or error message is printed to stdout; exceptions are
        reported there rather than raised.
    """
    try:
        # Read the file and decode Unicode escape sequences
        updated_lines = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing on them
                entry = json.loads(line)  # Parse JSON
                entry_str = json.dumps(entry, ensure_ascii=False)  # Convert to string with real characters
                updated_lines.append(entry_str)

        # Write the updated content back to the same file, one record per line
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(f"{entry_str}\n" for entry_str in updated_lines)

        print(f"File '{file_path}' updated successfully!")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Rewrite the file in place (its previous content is overwritten)
replace_unicode_in_jsonl(file_path)

File '../results/multiturn_tasks_x_subtopics.jsonl' updated successfully!


In [1]:
# Script to convert prompts into embeddings and cluster them
# Script from Bastien

import os
import json
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
import numpy as np
data = []

name_of_prompts = "task_x_subtopics"
path_to_prompts = "../results/parsed_prompts_" + name_of_prompts + ".json"
path_to_embeddings = "../clustering/embeddings_pp_" + name_of_prompts + ".npy"

# Control test mode
test_mode = True  # Set to False to load all data

# Load prompts, one JSON document per line
# NOTE(review): the path ends in .json but the file is parsed line by line as
# JSON Lines — confirm the actual format of this file.
with open(path_to_prompts, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if test_mode and i >= 5:  # Only load first 5 entries in test mode
            break
        data.append(json.loads(line))

questions = [d['prompt'] for d in data]

# Reuse cached embeddings only when they can belong to the prompts loaded above:
# never in test mode (the cache path is shared with full runs, while test mode
# loads at most 5 prompts), and only when there is one embedding row per question.
embeddings = None
if not test_mode and os.path.exists(path_to_embeddings):
    with open(path_to_embeddings, 'rb') as f:
        cached_embeddings = np.load(f)
    if len(cached_embeddings) == len(questions):
        embeddings = cached_embeddings

if embeddings is None:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = embedder.encode(questions, show_progress_bar=True)

    # Do not overwrite the full-run cache with the truncated test sample
    if not test_mode:
        with open(path_to_embeddings, 'wb') as f:
            np.save(f, embeddings)

# Project the embeddings onto 5 UMAP components before clustering
umap_reducer = umap.UMAP(n_components=5, random_state=42)
reduced_embeddings = umap_reducer.fit_transform(embeddings)

clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1, metric='euclidean')
cluster_labels = clusterer.fit_predict(reduced_embeddings)

# Number of distinct labels, not counting the noise label (-1)
num_clusters = len(set(cluster_labels) - {-1})
print(f"Number of clusters found: {num_clusters}")

print("I found that the users often ask about the same thing, I don't know what to do with this information, maybe we only use one question per cluster ? We'll see, but first interesting thing to note")
# List the sentences of each small cluster (labels are assumed to run 0..num_clusters-1)
# NOTE(review): the clusterer above uses min_cluster_size=5, so a cluster with
# 4 or fewer members looks impossible and this loop may never print anything —
# confirm the intended size threshold.
for cluster in range(num_clusters):
    # Positions of the questions assigned to this cluster
    cluster_indices = [idx for idx, label in enumerate(cluster_labels) if label == cluster]

    # Anything with more than 4 members is not reported
    if len(cluster_indices) > 4:
        continue

    print(f"Cluster {cluster} (size: {len(cluster_indices)}):")
    for i in cluster_indices:
        print(f" - {questions[i]}")

import matplotlib.pyplot as plt

# Separate UMAP projection to 2 components, used only for the plot
umap_2d = umap.UMAP(n_components=2, random_state=42)
embedding_2d = umap_2d.fit_transform(embeddings)

# Scatter plot of the 2D projection, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(embedding_2d[:, 0], embedding_2d[:, 1], c=cluster_labels, cmap='Spectral', s=50)
fig.colorbar(points, ax=ax)
ax.set_title("Sentence Clusters based on Sentence Embeddings (I haven't found this useful honestly)")
plt.show()

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Bastien Code for inspiration -> Multiturn

def system_prompt(self, final=False) -> str:

        # First sentence options
        # Updated first sentence options
        if self.brief:
            first_sentence_options = [
                "As a doctor in {}, you are in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Keep your explanations short and to the point, and ask pertinent questions.",
                "You are practicing medicine in {}. {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear but succinct, then pose relevant questions.",
                "While working as a physician in {}, {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit yet brief in your reasoning, and ask appropriate questions.",
                "In {}, you find yourself in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Keep your explanations short and to the point, and ask pertinent questions.",
                "Serving as a doctor in {}, {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear but succinct, then pose relevant questions.",
                "As a physician in {}, you are confronted with {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit yet brief in your reasoning, and ask appropriate questions.",
            ]
        elif self.bullet:
            first_sentence_options = [
                "As a doctor in {}, you are in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Be explicit and to the point, present your reasoning and questions in bullet points.",
                "You are practicing medicine in {}. {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear and format your reasoning and questions as bullet points.",
                "While working as a physician in {}, {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit and present your questions and reasoning in bullet points.",
                "In {}, you find yourself in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Make your thinking clear and use bullet points to present your reasoning and questions.",
                "Serving as a doctor in {}, {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear and present your reasoning and questions in bullet points.",
                "As a physician in {}, you are confronted with {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit and format your reasoning and questions as bullet points.",
            ]
        else:
            first_sentence_options = [
                "As a doctor in {}, you are in {} Your task is to diagnose a patient by asking questions and carefully considering your thoughts. Explain your reasoning and ask pertinent questions.",
                "You are practicing medicine in {}. {} Your mission is to determine the patient's diagnosis through inquiry, being thorough in your thought process. Make your thinking explicit, then pose relevant questions.",
                "While working as a physician in {}, {} Your role is to identify the patient's ailment by asking questions and thoroughly analyzing your thoughts. Be explicit about your reasoning, and ask appropriate questions.",
                "In {}, you find yourself in {} Your task is to diagnose a patient by asking questions and carefully considering your thoughts. Explain your reasoning and ask pertinent questions.",
                "Serving as a doctor in {}, {} Your mission is to determine the patient's diagnosis through inquiry, being thorough in your thought process. Make your thinking explicit, then pose relevant questions.",
                "As a physician in {}, you are confronted with {} Your role is to identify the patient's ailment by asking questions and thoroughly analyzing your thoughts. Be explicit about your reasoning, and ask appropriate questions.",
            ]

        first_sentence = random.choice(first_sentence_options).format(self.country, random.choice(setting_doctor[self.setting]))

        # Test request sentence for high resource setting
        if self.setting == "high" and self.infs < self.MAX_INFS:
            test_request_options = [
                "You may also request tests if they are available and can assist in your diagnosis, using the format \"REQUEST TEST: [test]\". For instance, \"REQUEST TEST: Chest_X-Ray\".",
                "If helpful and accessible, you should request tests to aid your diagnosis using the format \"REQUEST TEST: [test]\". Example: \"REQUEST TEST: Chest_X-Ray\".",
                "Feel free to request any available tests that might help in diagnosing, using \"REQUEST TEST: [test]\". For example, \"REQUEST TEST: Chest_X-Ray\".",
            ]
            test_request_sentence = random.choice(test_request_options)
        else:
            test_request_sentence = ""

        # Question limit sentence options
        if final:
            question_limit_options = [
                "You are only allowed to ask {} questions in total before you must make a decision.",
                "You have a limit of {} questions to ask before making a diagnosis.",
                "You can ask up to {} questions before you need to decide.",
                "You may ask a maximum of {} questions before diagnosing.",
                "A total of {} questions are allowed before you must diagnose.",
                "You have {} questions in total to reach your diagnosis.",
                "Only {} questions can be asked before making your decision.",
                "You are permitted to ask up to {} questions before you must decide.",
            ]
            question_limit_sentence = random.choice(question_limit_options).format(self.MAX_INFS)
        else:
            question_limit_options = [
                "You are only allowed to ask {} questions in total before you must make a decision. You have asked {} questions so far.",
                "You have a limit of {} questions to ask before making a diagnosis. So far, you've asked {} questions.",
                "You can ask up to {} questions before you need to decide. Currently, you have asked {} questions.",
                "You may ask a maximum of {} questions before diagnosing. You have already asked {} questions.",
                "A total of {} questions are allowed before you must diagnose. Up to now, you've asked {} questions.",
                "You have {} questions in total to reach your diagnosis. So far, you've used {} questions.",
                "Only {} questions can be asked before making your decision. You have asked {} questions till now.",
                "You are permitted to ask up to {} questions before you must decide. At present, you have asked {} questions.",
            ]
            question_limit_sentence = random.choice(question_limit_options).format(self.MAX_INFS, self.infs)
        

        # Diagnosis instruction options
        diagnosis_instruction_options = [
            "Once you have decided to make a diagnosis, please type \"DIAGNOSIS READY: [diagnosis here]\".",
            "When ready to diagnose, enter \"DIAGNOSIS READY: [diagnosis here]\".",
            "After concluding your diagnosis, submit it using \"DIAGNOSIS READY: [diagnosis here]\".",
            "Upon reaching a diagnosis, please type \"DIAGNOSIS READY: [diagnosis here]\".",
            "Once you are confident in your diagnosis, enter \"DIAGNOSIS READY: [diagnosis here]\".",
            "When you have determined the diagnosis, submit it using \"DIAGNOSIS READY: [diagnosis here]\".",
            "After you have made a diagnosis, indicate it by typing \"DIAGNOSIS READY: [diagnosis here]\".",
            "When you are prepared to diagnose, please provide it using \"DIAGNOSIS READY: [diagnosis here]\".",
        ]
        diagnosis_instruction_sentence = random.choice(diagnosis_instruction_options)

        # Knows diagnosis sentence
        if self.knows_diagnosis and not final:
            knows_diagnosis_options = [
                " You suspect that the patient suffers from {}. This affects the questions you ask the patient.",
                " You have a hunch that the patient may have {}. This should influence your questioning.",
                " You believe the patient might be suffering from {}. Let this guide your questions.",
            ]
            knows_diagnosis_sentence = random.choice(knows_diagnosis_options).format(self.correct_diagnosis)
        else:
            knows_diagnosis_sentence = ""