In [7]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd
import tiktoken

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Parse a JSON Lines file into a list of decoded objects.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. an empty line at the end of the file) hold no
            # record; passing them to json.loads would raise.
            if not line:
                continue
            json_list.append(json.loads(line))

    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Serialize ``json_list`` as a single JSON document at ``output_file_path``.

    The file is opened for writing as UTF-8 (replacing any existing content)
    and the document is pretty-printed with a 4-space indent.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        json.dump(json_list, fp=sink, indent=4)

def save_as_jsonl(json_list, output_file_path):
    """Write each element of ``json_list`` as one JSON line in ``output_file_path``.

    The file is opened for writing as UTF-8 (replacing any existing content);
    every record is followed by a newline.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        for record in json_list:
            sink.write(json.dumps(record) + '\n')

def load_json(file_path):
    """Return the decoded content of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Whitespace-only lines (e.g. an empty line at the end of the file)
        # are skipped; json.loads would raise on them.
        data = [json.loads(line) for line in map(str.strip, file) if line]
    return data

def deduplicate_data(data):
    """Return the items of ``data`` with repeated ``'realidx'`` values removed.

    Only the first item seen for each ``'realidx'`` is kept, and the kept
    items stay in their original relative order.
    """
    first_by_idx = {}
    for record in data:
        # setdefault keeps the earliest record; dicts preserve insertion order.
        first_by_idx.setdefault(record['realidx'], record)
    return list(first_by_idx.values())

def calculate_accuracy(data):
    """Compute the fraction of items whose prediction matches the answer.

    Args:
        data: Sequence of dicts. Each is compared on ``'answer_idx'`` versus
            ``'predicted_answer'``.

    Returns:
        Number of matching items divided by ``len(data)``; ``0`` when
        ``data`` is empty.

    Items without a ``'predicted_answer'`` key have their ``'realidx'``
    printed and are counted as incorrect (they still count toward the
    denominator).
    """
    total_predictions = len(data)
    if total_predictions == 0:
        return 0

    correct_predictions = 0
    for item in data:
        if 'predicted_answer' not in item:
            # Surface the unanswered record, then score it as a miss instead
            # of failing with KeyError on the comparison below.
            print(item.get('realidx'))
            continue
        if item['answer_idx'] == item['predicted_answer']:
            correct_predictions += 1
    return correct_predictions / total_predictions

# (prompt price, completion price) per one million tokens for each model the
# cost estimate knows about. NOTE(review): currency is presumably USD - confirm.
_TOKEN_PRICES_PER_MILLION = {
    'gpt-4o-mini': (0.15, 0.6),
    'gpt-4o': (2.5, 10),
    'o3-mini': (1.1, 4.4),
    'o1-mini': (1.1, 4.4),
    'claude-3-5-sonnet': (3.0, 15.0),
    'claude-3-5-haiku': (0.8, 4.0),
    'QwQ-32B-Preview': (1.2, 1.2),
    'DeepSeek-R1': (7, 7),
    'DeepSeek-V3': (1.25, 1.25),
    'Llama-3.3-70B-Instruct-Turbo': (0.88, 0.88),
}


def calculate_cost_from_token_usage(data, model):
    """Return the average per-item cost of ``data``.

    Args:
        data: Sequence of dicts. An item with a ``'cost'`` key contributes
            that value directly; otherwise its cost is derived from
            ``item['token_usage']['prompt_tokens']`` and
            ``item['token_usage']['completion_tokens']`` using the price
            table entry for ``model``.
        model: Model name used to look up token prices.

    Returns:
        Total cost divided by ``len(data)``; ``0`` when ``data`` is empty.
        Items without a ``'cost'`` key contribute nothing when ``model`` is
        not in the price table.
    """
    if len(data) == 0:
        # Avoid dividing by zero; mirrors calculate_accuracy's empty result.
        return 0

    prices = _TOKEN_PRICES_PER_MILLION.get(model)
    total_cost = 0
    for item in data:
        if 'cost' in item:
            # A recorded cost always wins over the token-based estimate.
            total_cost += item['cost']
        elif prices is not None:
            prompt_price, completion_price = prices
            usage = item['token_usage']
            total_cost += (
                usage['prompt_tokens'] * prompt_price / 1000000
                + usage['completion_tokens'] * completion_price / 1000000
            )
    return total_cost / len(data)

def calculate_time_from_data(data):
    """Return the mean of ``item['time_elapsed']`` over ``data``.

    Args:
        data: Sequence of dicts, each with a numeric ``'time_elapsed'`` value.

    Returns:
        The average elapsed time; ``0`` when ``data`` is empty.
    """
    if len(data) == 0:
        # Avoid dividing by zero; mirrors calculate_accuracy's empty result.
        return 0
    return sum(item['time_elapsed'] for item in data) / len(data)

def calculate_token_length(text):
    """Return how many tokens ``text`` encodes to.

    The tokenizer is the one tiktoken resolves for the ``'gpt-4o-mini'``
    model name.
    """
    tokenizer = tiktoken.encoding_for_model('gpt-4o-mini')
    token_ids = tokenizer.encode(text)
    return len(token_ids)


In [18]:
# Benchmarks summarised in the dataset-overview table, in row order.
tasks = ['medqa', 'pubmedqa', 'medmcqa', 'medbullets', 'afrimedqa', 'mmlu', 'mmlu-pro']

# Display name (with LaTeX citation) shown in the first column.
tasks_name_mapping = {
    'medqa': 'MedQA \\cite{jin2021medqa}',
    'pubmedqa': 'PubMedQA \\cite{jin2019pubmedqa}',
    'medmcqa': 'MedMCQA \\cite{pal2022medmcqa}',
    'medbullets': 'MedBullets \\cite{chen2024medbullet}',
    'mmlu': 'MMLU \\cite{hendrycks2020mmlu}',
    'mmlu-pro': 'MMLU-Pro \\cite{wang2024mmlu}',
    'afrimedqa': 'Afrimed-QA \\cite{olatunji2024afrimed}'
}

# Free-text description shown in the last column.
tasks_description = {
    'medqa': 'Multiple choice questions from medical licensing exams',
    'pubmedqa': 'Questions based on PubMed abstracts',
    'medmcqa': 'Questions from AIIMS \\& NEET PG entrance exams',
    'medbullets': 'Questions from Medbullets online medical study platform',
    'afrimedqa': 'Diverse medical questions from African healthcare contexts',
    'mmlu': 'Multitask questions covering medical, biology, and other academic domains',
    'mmlu-pro': 'Multitask questions covering medical, biology, and other academic domains',
}

# Row background per benchmark family; any task not listed here gets the
# general-purpose (orange) colour.
task_row_colors = {
    # Traditional benchmarks
    'medqa': '\\rowcolor{blue!10}',
    'pubmedqa': '\\rowcolor{blue!10}',
    'medmcqa': '\\rowcolor{blue!10}',
    # Recently emerging benchmarks
    'medbullets': '\\rowcolor{green!10}',
    'afrimedqa': '\\rowcolor{green!10}',
}

# Table preamble and header row.
latex_table = r"""
\begin{table*}[h]
\begin{tabular}{lrrrp{5cm}}
\hline
Benchmark & Size & Avg Lens & Options & Description \\
\hline
"""

# One table row per benchmark; the hard splits are pooled for the final row.
test_hard = []
for task in tasks:
    test = load_jsonl(f'../data/{task}/test.jsonl')
    test_hard += load_jsonl(f'../data/{task}/test_hard.jsonl')

    len_test = len(test)
    question_token_counts = [calculate_token_length(item['question']) for item in test]
    avg_question_length = sum(question_token_counts) / len_test

    num_of_options = [len(item['options']) for item in test]
    min_options, max_options = min(num_of_options), max(num_of_options)
    # Show a single number when every question has the same option count.
    if min_options == max_options:
        options_range = min_options
    else:
        options_range = f"{min_options}-{max_options}"

    color = task_row_colors.get(task, '\\rowcolor{orange!10}')

    row_cells = [
        f"{color}{tasks_name_mapping[task]}",
        str(len_test),
        f"{avg_question_length:.1f}",
        str(options_range),
        tasks_description[task],
    ]
    latex_table += " & ".join(row_cells) + " \\\\\n"

# Same statistics for the pooled hard subset.
len_test_hard = len(test_hard)
hard_question_token_counts = [calculate_token_length(item['question']) for item in test_hard]
avg_question_length_hard = sum(hard_question_token_counts) / len_test_hard
num_of_options_hard = [len(item['options']) for item in test_hard]
min_options_hard, max_options_hard = min(num_of_options_hard), max(num_of_options_hard)
if min_options_hard == max_options_hard:
    options_range_hard = min_options_hard
else:
    options_range_hard = f"{min_options_hard}-{max_options_hard}"

latex_table += "\\hline\n"
hard_row_cells = [
    "\\Ours{}",
    str(len_test_hard),
    f"{avg_question_length_hard:.1f}",
    str(options_range_hard),
    "Hard subset across all datasets",
]
latex_table += " & ".join(hard_row_cells) + " \\\\\n"

# Close the tabular and attach the caption with the colour legend.
latex_table += r"""\hline
\end{tabular}
\caption{\textbf{Overview of Medical Question-Answering Datasets.} Survey of knowledge-based QA datasets curated from medical literature, professional journals, and educational resources. \colorbox{blue!10}{Traditional benchmarks}, \colorbox{green!10}{recently emerging benchmarks}, and \colorbox{orange!10}{general purpose benchmarks} are shown with corresponding background colors.}
\end{table*}
"""
print(latex_table)



\begin{table*}[h]
\begin{tabular}{lrrrp{5cm}}
\hline
Benchmark & Size & Avg Lens & Options & Description \\
\hline
\rowcolor{blue!10}MedQA \cite{jin2021medqa} & 1273 & 167.1 & 4 & Multiple choice questions from medical licensing exams \\
\rowcolor{blue!10}PubMedQA \cite{jin2019pubmedqa} & 500 & 316.1 & 3 & Questions based on PubMed abstracts \\
\rowcolor{blue!10}MedMCQA \cite{pal2022medmcqa} & 2816 & 18.7 & 4 & Questions from AIIMS \& NEET PG entrance exams \\
\rowcolor{green!10}MedBullets \cite{chen2024medbullet} & 308 & 213.1 & 5 & Questions from Medbullets online medical study platform \\
\rowcolor{green!10}Afrimed-QA \cite{olatunji2024afrimed} & 174 & 30.0 & 5 & Diverse medical questions from African healthcare contexts \\
\rowcolor{orange!10}MMLU \cite{hendrycks2020mmlu} & 1089 & 55.9 & 4 & Multitask questions covering medical, biology, and other academic domains \\
\rowcolor{orange!10}MMLU-Pro \cite{wang2024mmlu} & 818 & 57.4 & 3-10 & Multitask questions covering medical, biolog

In [19]:
# (method name, citation key, one-line description) for each row of the
# methods-overview table, in row order.
methods = [
    # General-purpose methods
    ('Chain-of-Thought', 'wei2022chain', 'Elicits reasoning in large language models'),
    ('Self-Consistency', 'wang2022self', 'Improves chain of thought reasoning in language models'),
    ('MedPrompt', 'chen2024medprompt', 'Multi-round prompting with ensemble voting for medical question answering'),
    ('Multi-Persona', 'wang2023multipersona', 'Task-solving agent through multi-persona self-collaboration'),
    ('Self-Refine', 'madaan2024selfrefine', 'Iterative refinement with self-feedback'),

    # Domain-specific methods
    ('MedAgents', 'tang2023medagents', 'Collaborative multi-agent framework for zero-shot medical decision making'),
    ('MDAgents', 'kim2024mdagents', 'Dynamic multi-agent collaboration framework for medical reasoning'),

    # Search-based methods
    ('AFlow', 'zhang2024aflow', 'Automating agentic workflow generation'),
    ('SPO', 'xiang2025spo', 'Self-supervised prompt optimization')
]

# Row background per method family; any method not listed here gets the
# search-based (orange) colour.
method_row_colors = {
    # General-purpose methods
    **dict.fromkeys(
        ['Chain-of-Thought', 'Self-Consistency', 'MedPrompt', 'Multi-Persona', 'Self-Refine'],
        '\\rowcolor{blue!10}',
    ),
    # Domain-specific methods
    **dict.fromkeys(['MedAgents', 'MDAgents'], '\\rowcolor{green!10}'),
}

# Table preamble and header row.
latex_table = r"""
\begin{table*}[h]
\begin{tabular}{lp{8cm}}
\hline
Method & Description \\
\hline
"""

# One coloured row per method: "<name> \cite{<key>} & <description> \\".
for method, citation, desc in methods:
    color = method_row_colors.get(method, '\\rowcolor{orange!10}')
    latex_table += color + method + " \\cite{" + citation + "} & " + desc + " \\\\\n"

# Close the tabular and attach the caption with the colour legend.
latex_table += r"""\hline
\end{tabular}
\caption{\textbf{Overview of Methods.} Survey of methods used for medical reasoning and question answering. \colorbox{blue!10}{General-purpose methods}, \colorbox{green!10}{domain-specific methods}, and \colorbox{orange!10}{search-based methods} are shown with corresponding background colors.}
\end{table*}
"""
print(latex_table)



\begin{table*}[h]
\begin{tabular}{lp{8cm}}
\hline
Method & Description \\
\hline
\rowcolor{blue!10}Chain-of-Thought \cite{wei2022chain} & Elicits reasoning in large language models \\
\rowcolor{blue!10}Self-Consistency \cite{wang2022self} & Improves chain of thought reasoning in language models \\
\rowcolor{blue!10}MedPrompt \cite{chen2024medprompt} & Multi-round prompting with ensemble voting for medical question answering \\
\rowcolor{blue!10}Multi-Persona \cite{wang2023multipersona} & Task-solving agent through multi-persona self-collaboration \\
\rowcolor{blue!10}Self-Refine \cite{madaan2024selfrefine} & Iterative refinement with self-feedback \\
\rowcolor{green!10}MedAgents \cite{tang2023medagents} & Collaborative multi-agent framework for zero-shot medical decision making \\
\rowcolor{green!10}MDAgents \cite{kim2024mdagents} & Dynamic multi-agent collaboration framework for medical reasoning \\
\rowcolor{orange!10}AFlow \cite{zhang2024aflow} & Automating agentic workflow generat