In [4]:
import json

# JSONL file whose conversation roles get rewritten below.
# The path is relative, so it is resolved against the kernel's current working directory.
file_path = '../results/multiturn_tasks_x_subtopics.jsonl'

# Function to modify the file
def update_jsonl_roles(file_path):
    """Rename the ``"chatbot"`` role to ``"assistant"`` in a JSONL file, in place.

    Every non-blank line of ``file_path`` is parsed as one JSON object. For each
    message in that object's ``"conversation"`` list, a ``"role"`` equal to
    ``"chatbot"`` is replaced by ``"assistant"``; everything else is left as
    parsed. The file is only rewritten after all lines were parsed, so a
    malformed line leaves the original file untouched.

    Args:
        file_path: Path of the JSONL file to rewrite.

    Returns:
        None. The outcome (success or error) is reported with ``print``;
        exceptions are not propagated to the caller.
    """
    try:
        updated_lines = []
        # Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines (e.g. a trailing empty line) carry no record; skip them
                # instead of letting json.loads abort the whole update.
                if not line.strip():
                    continue
                entry = json.loads(line)
                for message in entry.get("conversation", []):
                    # .get(): a message without a "role" key is simply left alone.
                    if message.get("role") == "chatbot":
                        message["role"] = "assistant"
                # json.dumps escapes non-ASCII characters by default (ensure_ascii=True).
                updated_lines.append(json.dumps(entry))

        # Write the updated content back to the same file, one record per line,
        # each terminated by a newline so later appends start on a fresh line.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(''.join(f"{updated}\n" for updated in updated_lines))

        print(f"File '{file_path}' updated successfully!")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        # Deliberate best-effort: report the problem instead of raising in the notebook.
        print(f"An error occurred: {e}")

# Run the function. Note: this overwrites the file at file_path in place.
update_jsonl_roles(file_path)

File '../results/multiturn_tasks_x_subtopics.jsonl' updated successfully!


In [6]:
import json

# JSONL file whose \uXXXX escape sequences get replaced by real characters below.
# The path is relative, so it is resolved against the kernel's current working directory.
file_path = '../results/multiturn_tasks_x_subtopics.jsonl'

# Function to replace Unicode escape sequences with real characters
def replace_unicode_in_jsonl(file_path):
    """Rewrite a JSONL file so non-ASCII text is stored as real UTF-8 characters.

    Every non-blank line of ``file_path`` is parsed as JSON and serialised again
    with ``ensure_ascii=False``, which turns ``\\uXXXX`` escape sequences into the
    characters they stand for. The file is only rewritten after all lines were
    parsed, so a malformed line leaves the original file untouched.

    Args:
        file_path: Path of the JSONL file to rewrite.

    Returns:
        None. The outcome (success or error) is reported with ``print``;
        exceptions are not propagated to the caller.
    """
    try:
        updated_lines = []
        # Read as UTF-8, matching the encoding used for writing below. Relying on
        # the locale default here would mis-decode the file on a second run on
        # platforms whose default encoding is not UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines carry no record; skip them rather than failing in json.loads.
                if not line.strip():
                    continue
                entry = json.loads(line)  # Parse JSON (this decodes \uXXXX escapes)
                # Serialise with real characters instead of ASCII escapes.
                updated_lines.append(json.dumps(entry, ensure_ascii=False))

        # Write the updated content back to the same file, one record per line,
        # each terminated by a newline.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(''.join(f"{updated}\n" for updated in updated_lines))

        print(f"File '{file_path}' updated successfully!")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        # Deliberate best-effort: report the problem instead of raising in the notebook.
        print(f"An error occurred: {e}")

# Run the function. Note: this overwrites the file at file_path in place.
replace_unicode_in_jsonl(file_path)

File '../results/multiturn_tasks_x_subtopics.jsonl' updated successfully!


In [1]:
# Script to convert prompts into embeddings and cluster them
# Script from Bastien

import os
import json
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
import numpy as np
data = []

name_of_prompts = "task_x_subtopics"
path_to_prompts = "../results/parsed_prompts_" + name_of_prompts + ".json"
path_to_embeddings = "../clustering/embeddings_pp_" + name_of_prompts + ".npy"

# Control test mode
test_mode = True  # Set to False to load all data
test_mode_limit = 5  # Number of entries loaded when test_mode is on
# NOTE(review): five prompts may be too few for the UMAP/HDBSCAN settings used
# further down in this cell -- confirm that test mode runs end to end.

# Load prompts. Despite the .json extension the file is parsed line by line,
# one JSON object per line. UTF-8 is requested explicitly so the prompt text
# does not depend on the platform's locale encoding.
with open(path_to_prompts, 'r', encoding='utf-8') as f:
    for line in f:
        if test_mode and len(data) >= test_mode_limit:  # Only load the first few entries in test mode
            break
        if not line.strip():  # Skip blank lines instead of failing in json.loads
            continue
        data.append(json.loads(line))

# The text that gets embedded and clustered is the 'prompt' field of each entry
questions = [d['prompt'] for d in data]


# Test mode embeds only a subset of the prompts, so it gets its own cache file;
# otherwise a test run and a full run would overwrite/reuse each other's embeddings.
embeddings_cache_path = path_to_embeddings
if test_mode:
    cache_root, cache_ext = os.path.splitext(path_to_embeddings)
    embeddings_cache_path = cache_root + "_test" + cache_ext

embeddings = None
if os.path.exists(embeddings_cache_path):
    with open(embeddings_cache_path, 'rb') as f:
        embeddings = np.load(f)
    # A cache with a different number of rows than `questions` is stale: cluster
    # indices computed from it would not line up with the loaded prompts.
    # (This check cannot detect prompts that changed while the count stayed the same.)
    if len(embeddings) != len(questions):
        print(f"Ignoring stale embeddings cache '{embeddings_cache_path}': "
              f"{len(embeddings)} rows for {len(questions)} prompts.")
        embeddings = None

if embeddings is None:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = embedder.encode(questions, show_progress_bar=True)

    # Make sure the cache directory exists before writing into it
    cache_dir = os.path.dirname(embeddings_cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(embeddings_cache_path, 'wb') as f:
        np.save(f, embeddings)

# Reduce the sentence embeddings to 5 dimensions with UMAP before clustering
# (random_state is pinned to 42).
umap_reducer = umap.UMAP(n_components=5, random_state=42)
reduced_embeddings = umap_reducer.fit_transform(embeddings)

# Cluster the reduced embeddings with HDBSCAN. fit_predict yields the labels
# used below, where the label -1 is treated as noise rather than a cluster.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1, metric='euclidean')
cluster_labels = clusterer.fit_predict(reduced_embeddings)

# Count the clusters, leaving the noise label (-1) out of the total when present
distinct_labels = set(cluster_labels)
num_clusters = len(distinct_labels) - int(-1 in distinct_labels)
print(f"Number of clusters found: {num_clusters}")

print("I found that the users often ask about the same thing, I don't know what to do with this information, maybe we only use one question per cluster ? We'll see, but first interesting thing to note")

# Group the positions of the prompts by cluster label in a single pass
positions_by_label = {}
for position, label in enumerate(cluster_labels):
    positions_by_label.setdefault(label, []).append(position)

# List the sentences of each small cluster (labels 0 .. num_clusters - 1)
# NOTE(review): the clusterer earlier in this cell is built with min_cluster_size=5,
# so this "at most 4 members" filter looks like it can never match -- confirm the
# intended threshold.
for cluster in range(num_clusters):
    cluster_indices = positions_by_label.get(cluster, [])

    # Larger clusters are not printed
    if len(cluster_indices) > 4:
        continue

    print(f"Cluster {cluster} (size: {len(cluster_indices)}):")
    for position in cluster_indices:
        print(f" - {questions[position]}")

import matplotlib.pyplot as plt

# Project the original embeddings to two dimensions, for plotting only
umap_2d = umap.UMAP(n_components=2, random_state=42)
embedding_2d = umap_2d.fit_transform(embeddings)

# Scatter plot of the 2D projection, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 8))
scatter_points = ax.scatter(
    embedding_2d[:, 0],
    embedding_2d[:, 1],
    c=cluster_labels,
    cmap='Spectral',
    s=50,
)
fig.colorbar(scatter_points, ax=ax)
ax.set_title("Sentence Clusters based on Sentence Embeddings (I haven't found this useful honestly)")
plt.show()

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Bastien Code for inspiration -> Multiturn

def system_prompt(self, final=False) -> str:

        # First sentence options
        # Updated first sentence options
        if self.brief:
            first_sentence_options = [
                "As a doctor in {}, you are in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Keep your explanations short and to the point, and ask pertinent questions.",
                "You are practicing medicine in {}. {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear but succinct, then pose relevant questions.",
                "While working as a physician in {}, {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit yet brief in your reasoning, and ask appropriate questions.",
                "In {}, you find yourself in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Keep your explanations short and to the point, and ask pertinent questions.",
                "Serving as a doctor in {}, {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear but succinct, then pose relevant questions.",
                "As a physician in {}, you are confronted with {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit yet brief in your reasoning, and ask appropriate questions.",
            ]
        elif self.bullet:
            first_sentence_options = [
                "As a doctor in {}, you are in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Be explicit and to the point, present your reasoning and questions in bullet points.",
                "You are practicing medicine in {}. {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear and format your reasoning and questions as bullet points.",
                "While working as a physician in {}, {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit and present your questions and reasoning in bullet points.",
                "In {}, you find yourself in {} Your task is to diagnose a patient by asking concise questions and providing brief reasoning. Make your thinking clear and use bullet points to present your reasoning and questions.",
                "Serving as a doctor in {}, {} Your mission is to determine the patient's diagnosis through concise inquiry, being efficient in your thought process. Make your thinking clear and present your reasoning and questions in bullet points.",
                "As a physician in {}, you are confronted with {} Your role is to identify the patient's ailment by asking concise questions and efficiently analyzing your thoughts. Be explicit and format your reasoning and questions as bullet points.",
            ]
        else:
            first_sentence_options = [
                "As a doctor in {}, you are in {} Your task is to diagnose a patient by asking questions and carefully considering your thoughts. Explain your reasoning and ask pertinent questions.",
                "You are practicing medicine in {}. {} Your mission is to determine the patient's diagnosis through inquiry, being thorough in your thought process. Make your thinking explicit, then pose relevant questions.",
                "While working as a physician in {}, {} Your role is to identify the patient's ailment by asking questions and thoroughly analyzing your thoughts. Be explicit about your reasoning, and ask appropriate questions.",
                "In {}, you find yourself in {} Your task is to diagnose a patient by asking questions and carefully considering your thoughts. Explain your reasoning and ask pertinent questions.",
                "Serving as a doctor in {}, {} Your mission is to determine the patient's diagnosis through inquiry, being thorough in your thought process. Make your thinking explicit, then pose relevant questions.",
                "As a physician in {}, you are confronted with {} Your role is to identify the patient's ailment by asking questions and thoroughly analyzing your thoughts. Be explicit about your reasoning, and ask appropriate questions.",
            ]

        first_sentence = random.choice(first_sentence_options).format(self.country, random.choice(setting_doctor[self.setting]))

        # Test request sentence for high resource setting
        if self.setting == "high" and self.infs < self.MAX_INFS:
            test_request_options = [
                "You may also request tests if they are available and can assist in your diagnosis, using the format \"REQUEST TEST: [test]\". For instance, \"REQUEST TEST: Chest_X-Ray\".",
                "If helpful and accessible, you should request tests to aid your diagnosis using the format \"REQUEST TEST: [test]\". Example: \"REQUEST TEST: Chest_X-Ray\".",
                "Feel free to request any available tests that might help in diagnosing, using \"REQUEST TEST: [test]\". For example, \"REQUEST TEST: Chest_X-Ray\".",
            ]
            test_request_sentence = random.choice(test_request_options)
        else:
            test_request_sentence = ""

        # Question limit sentence options
        if final:
            question_limit_options = [
                "You are only allowed to ask {} questions in total before you must make a decision.",
                "You have a limit of {} questions to ask before making a diagnosis.",
                "You can ask up to {} questions before you need to decide.",
                "You may ask a maximum of {} questions before diagnosing.",
                "A total of {} questions are allowed before you must diagnose.",
                "You have {} questions in total to reach your diagnosis.",
                "Only {} questions can be asked before making your decision.",
                "You are permitted to ask up to {} questions before you must decide.",
            ]
            question_limit_sentence = random.choice(question_limit_options).format(self.MAX_INFS)
        else:
            question_limit_options = [
                "You are only allowed to ask {} questions in total before you must make a decision. You have asked {} questions so far.",
                "You have a limit of {} questions to ask before making a diagnosis. So far, you've asked {} questions.",
                "You can ask up to {} questions before you need to decide. Currently, you have asked {} questions.",
                "You may ask a maximum of {} questions before diagnosing. You have already asked {} questions.",
                "A total of {} questions are allowed before you must diagnose. Up to now, you've asked {} questions.",
                "You have {} questions in total to reach your diagnosis. So far, you've used {} questions.",
                "Only {} questions can be asked before making your decision. You have asked {} questions till now.",
                "You are permitted to ask up to {} questions before you must decide. At present, you have asked {} questions.",
            ]
            question_limit_sentence = random.choice(question_limit_options).format(self.MAX_INFS, self.infs)
        

        # Diagnosis instruction options
        diagnosis_instruction_options = [
            "Once you have decided to make a diagnosis, please type \"DIAGNOSIS READY: [diagnosis here]\".",
            "When ready to diagnose, enter \"DIAGNOSIS READY: [diagnosis here]\".",
            "After concluding your diagnosis, submit it using \"DIAGNOSIS READY: [diagnosis here]\".",
            "Upon reaching a diagnosis, please type \"DIAGNOSIS READY: [diagnosis here]\".",
            "Once you are confident in your diagnosis, enter \"DIAGNOSIS READY: [diagnosis here]\".",
            "When you have determined the diagnosis, submit it using \"DIAGNOSIS READY: [diagnosis here]\".",
            "After you have made a diagnosis, indicate it by typing \"DIAGNOSIS READY: [diagnosis here]\".",
            "When you are prepared to diagnose, please provide it using \"DIAGNOSIS READY: [diagnosis here]\".",
        ]
        diagnosis_instruction_sentence = random.choice(diagnosis_instruction_options)

        # Knows diagnosis sentence
        if self.knows_diagnosis and not final:
            knows_diagnosis_options = [
                " You suspect that the patient suffers from {}. This affects the questions you ask the patient.",
                " You have a hunch that the patient may have {}. This should influence your questioning.",
                " You believe the patient might be suffering from {}. Let this guide your questions.",
            ]
            knows_diagnosis_sentence = random.choice(knows_diagnosis_options).format(self.correct_diagnosis)
        else:
            knows_diagnosis_sentence = ""