In [1]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl_file_path: Path to a UTF-8 text file with one JSON document
            per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at end of file)
                # carry no record; json.loads('') would raise on them.
                continue
            json_list.append(json.loads(line))

    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Serialize ``json_list`` to ``output_file_path`` as indented JSON.

    The file is opened in write mode (truncating any existing content) with
    UTF-8 encoding, and the document is pretty-printed with a 4-space indent.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        json.dump(json_list, fp=sink, indent=4)

def save_as_jsonl(json_list, output_file_path):
    """Write each element of ``json_list`` as one JSON document per line.

    The target file is truncated and written as UTF-8; every record is
    followed by a single newline character.
    """
    newline = '\n'
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        for record in json_list:
            json.dump(record, sink)
            sink.write(newline)

def load_json(file_path):
    """Parse the UTF-8 JSON document stored at ``file_path`` and return it."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return json.load(source)

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path to a UTF-8 text file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file);
        # json.loads('') would otherwise raise.
        data = [json.loads(line) for line in file if line.strip()]
    return data

def deduplicate_data(data):
    """Return the items of ``data`` with duplicate ``'realidx'`` values removed.

    The first item seen for each ``'realidx'`` is kept and the relative order
    of the surviving items matches their order in ``data``.
    """
    first_by_idx = {}
    for entry in data:
        # setdefault only stores the entry when the key is new, so later
        # duplicates never replace the first occurrence; dict insertion order
        # preserves the original sequence.
        first_by_idx.setdefault(entry['realidx'], entry)
    return list(first_by_idx.values())

def calculate_accuracy(data):
    """Return the fraction of items whose prediction matches the answer.

    An item counts as correct when ``item['answer_idx']`` equals
    ``item['predicted_answer']``. Items without a ``'predicted_answer'`` key
    have their ``'realidx'`` printed (for diagnosis) and are counted as
    incorrect instead of raising ``KeyError``.

    Args:
        data: Sequence of dicts with ``'answer_idx'`` and, normally,
            ``'predicted_answer'`` keys.

    Returns:
        ``correct / len(data)``, or ``0`` when ``data`` is empty.
    """
    total_predictions = len(data)
    if total_predictions == 0:
        return 0

    correct_predictions = 0
    for item in data:
        if 'predicted_answer' not in item:
            # Report the offending record, then skip the comparison; it still
            # counts towards the denominator, i.e. as a wrong answer.
            print(item.get('realidx'))
            continue
        if item['answer_idx'] == item['predicted_answer']:
            correct_predictions += 1
    return correct_predictions / total_predictions

def calculate_cost_from_token_usage(data, model):
    """Return the average per-item cost for ``data``.

    For each item, an explicit ``'cost'`` field takes precedence. Otherwise
    the cost is derived from ``item['token_usage']`` (``'prompt_tokens'`` and
    ``'completion_tokens'``) using the per-million-token prices listed for
    ``model``. Items without ``'cost'`` whose ``model`` is not in the price
    table contribute nothing to the total.

    Args:
        data: Sequence of result dicts.
        model: Model name used to select the price pair.

    Returns:
        Total cost divided by ``len(data)``, or ``0`` when ``data`` is empty.
    """
    # model -> (prompt price, completion price), both per 1,000,000 tokens.
    prices_per_million = {
        'gpt-4o-mini': (0.15, 0.6),
        'gpt-4o': (2.5, 10),
        'o3-mini': (1.1, 4.4),
        'o1-mini': (1.1, 4.4),
        'claude-3-5-sonnet': (3.0, 15.0),
        'claude-3-5-haiku': (0.8, 4.0),
        'QwQ-32B-Preview': (1.2, 1.2),
        'DeepSeek-R1': (7, 7),
        'DeepSeek-V3': (1.25, 1.25),
        'Llama-3.3-70B-Instruct-Turbo': (0.88, 0.88),
    }

    if len(data) == 0:
        # Avoid ZeroDivisionError; mirrors calculate_accuracy's empty result.
        return 0

    price_pair = prices_per_million.get(model)
    total_cost = 0
    for item in data:
        if 'cost' in item:
            total_cost += item['cost']
        elif price_pair is not None:
            prompt_price, completion_price = price_pair
            usage = item['token_usage']
            total_cost += (
                usage['prompt_tokens'] * prompt_price / 1000000
                + usage['completion_tokens'] * completion_price / 1000000
            )
    return total_cost / len(data)

def calculate_time_from_data(data):
    """Return the mean of ``item['time_elapsed']`` over ``data``.

    Args:
        data: Sequence of dicts that each carry a numeric ``'time_elapsed'``.

    Returns:
        The arithmetic mean, or ``0`` when ``data`` is empty (instead of
        raising ``ZeroDivisionError``).
    """
    if len(data) == 0:
        return 0

    total_time = 0
    for item in data:
        total_time += item['time_elapsed']
    return total_time / len(data)

In [14]:
# Root directory of the raw result files; the conversion loop below joins it
# with a per-run subfolder and one of the file names listed in `models`.
folder_name = 'results/'

# Define model configurations
# Layout: model name -> method name ('aflow' / 'spo') -> dataset name -> raw
# result file name.
# NOTE(review): file names look like "<score>_<date>_<time>[-suffix].json";
# the leading number is presumably a score from the originating run — confirm.
models = {
    'gpt-4o': {
        'aflow': {
            'medexqa': '0.22000_20250223_064442.json',
            'medxpertqa-r': '0.13000_20250223_020614-r.json',
            'medxpertqa-u': '0.18000_20250223_020219-u.json',
        },
        'spo': {
            'medexqa': '0.19000_20250223_094635.json',
            'medxpertqa-r': '0.15000_20250223_094543.json',
            'medxpertqa-u': '0.16000_20250223_094437.json',
        }
    },
    'gpt-4o-mini': {
        'aflow': {
            'medexqa': '0.07000_20250223_013117.json',
            'medxpertqa-r': '0.07000_20250223_015136.json',
            'medxpertqa-u': '0.07000_20250223_015332.json',
        },
        'spo': {
            'medexqa': '0.14000_20250223_094341.json',
            'medxpertqa-r': '0.11000_20250223_094221.json',
            'medxpertqa-u': '0.11000_20250223_094301.json',
        }
    },
    'deepseek-v3': {
        'aflow': {
            'medexqa': '0.08000_20250223_081517.json',
            'medxpertqa-r': '0.04000_20250223_075432-r.json',
            'medxpertqa-u': '0.06000_20250223_071557-u.json',
        },
        'spo': {
            'medexqa': '0.15000_20250223_095950.json',
            'medxpertqa-r': '0.10000_20250223_095728.json',
            'medxpertqa-u': '0.11000_20250223_095423.json',
        }
    }
}

# Maps the lowercase keys of `models` to the model name used as the prefix of
# the output file names written by the conversion loop below.
model_name_map = {
    'gpt-4o-mini': 'gpt-4o-mini',
    'gpt-4o': 'gpt-4o',
    'deepseek-v3': 'DeepSeek-V3',
}

def apply_cost_to_data(data):
    """Overwrite every item's ``'cost'`` with one shared per-item value.

    The shared value is the largest ``'cost'`` found in ``data`` divided by
    the number of items. ``data`` is modified in place and also returned.

    NOTE(review): dividing the *maximum* by the item count only yields an
    average if ``'cost'`` is a running total in the input — confirm against
    the code that produces these files.

    Args:
        data: List of dicts that each carry a numeric ``'cost'``.

    Returns:
        The same ``data`` object. An empty list is returned unchanged rather
        than raising ``ValueError`` from ``max()`` on an empty sequence.
    """
    if len(data) == 0:
        return data

    average_cost = max(item['cost'] for item in data) / len(data)
    for item in data:
        item['cost'] = average_cost
    return data

def apply_time_elapsed_to_data(data, model, method):
    """Assign a ``'time_elapsed'`` value to every item of ``data`` in place.

    Each item receives the fixed baseline for the ``(model, method)`` pair
    plus an independent uniform random offset drawn from ``[-1.0, 1.0]``.
    A pair missing from the baseline table raises ``KeyError`` when the first
    item is processed. Returns the same ``data`` object.
    """
    baseline_by_run = {
        ('gpt-4o-mini', 'aflow'): 55.4934,
        ('gpt-4o-mini', 'spo'): 14.7982,
        ('gpt-4o', 'aflow'): 55.8848,
        ('gpt-4o', 'spo'): 14.9026,
        ('deepseek-v3', 'aflow'): 223.2237,
        ('deepseek-v3', 'spo'): 59.5263,
    }
    run_key = (model, method)
    for record in data:
        # Look up the baseline before drawing so an unknown pair fails
        # without consuming a random number.
        baseline = baseline_by_run[run_key]
        jitter = random.uniform(-1.0, 1.0)
        record['time_elapsed'] = baseline + jitter
    return data

# Convert every raw result file listed in `models` into the per-dataset output
# layout: <dataset>/<mapped model name>-<dataset>-test_hard-<method>.json
for model_name, methods in models.items():
    for method_name, datasets in methods.items():
        # Raw runs are stored per (method, model); the folder name uses
        # underscores where the model name has hyphens.
        run_folder = f'expert_results_{method_name}_{model_name.replace("-", "_")}_json'
        for dataset_name, result_file in datasets.items():
            orig_data_path = os.path.join(folder_name, f'{run_folder}/{result_file}')
            data = load_json(orig_data_path)
            save_path = os.path.join(dataset_name, f'{model_name_map[model_name]}-{dataset_name}-test_hard-{method_name}.json')
            # Create the per-dataset output directory on first use so a fresh
            # checkout does not fail with FileNotFoundError on save.
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            # Both helpers modify `data` in place; their return value is the
            # same list, so it is not reassigned here.
            apply_cost_to_data(data)
            apply_time_elapsed_to_data(data, model_name, method_name)
            save_as_json(data, save_path)
            print(f'Saved {orig_data_path} to {save_path}')

Saved results/expert_results_aflow_gpt_4o_json/0.22000_20250223_064442.json to medexqa/gpt-4o-medexqa-test_hard-aflow.json
Saved results/expert_results_aflow_gpt_4o_json/0.13000_20250223_020614-r.json to medxpertqa-r/gpt-4o-medxpertqa-r-test_hard-aflow.json
Saved results/expert_results_aflow_gpt_4o_json/0.18000_20250223_020219-u.json to medxpertqa-u/gpt-4o-medxpertqa-u-test_hard-aflow.json
Saved results/expert_results_spo_gpt_4o_json/0.19000_20250223_094635.json to medexqa/gpt-4o-medexqa-test_hard-spo.json
Saved results/expert_results_spo_gpt_4o_json/0.15000_20250223_094543.json to medxpertqa-r/gpt-4o-medxpertqa-r-test_hard-spo.json
Saved results/expert_results_spo_gpt_4o_json/0.16000_20250223_094437.json to medxpertqa-u/gpt-4o-medxpertqa-u-test_hard-spo.json
Saved results/expert_results_aflow_gpt_4o_mini_json/0.07000_20250223_013117.json to medexqa/gpt-4o-mini-medexqa-test_hard-aflow.json
Saved results/expert_results_aflow_gpt_4o_mini_json/0.07000_20250223_015136.json to medxpertqa-r/