In [3]:
import json
import yaml
import os
import numpy as np

# Load the project configuration. The context manager closes the file handle
# as soon as parsing finishes, instead of leaving an open descriptor around
# for the lifetime of the kernel.
with open("/gscratch/balazinska/enhaoz/VOCAL-UDF/configs/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

In [4]:
# For every "<k>_new_udfs" task, read the saved metrics of each
# (run, question) result file and print the mean of every metric.
tasks = [f"{i}_new_udfs" for i in range(4)]
for task in tasks:
    # One accumulator per metric; each receives one value per result file.
    accuracy_list, f1_list, precision_list, recall_list, failed_list = [], [], [], [], []
    for run_id in range(5):
        for question_id in range(10):
            file_name = f"task_{task}_labels_run_{run_id}_question_{question_id}.json"
            with open(os.path.join(config['output_dir'], "clevr", file_name), "r") as f:
                data = json.load(f)
            # Sample record:
            # "accuracy": 0.9329333333333333, "f1": 0.7679889298892989, "precision": 0.8169774288518155, "recall": 0.7245430809399478, "failed": 0
            for key, values in (
                ("accuracy", accuracy_list),
                ("f1", f1_list),
                ("precision", precision_list),
                ("recall", recall_list),
                ("failed", failed_list),
            ):
                values.append(data[key])
            # Only for the task whose name contains '0': show which
            # (run, question) pairs reported a non-zero "failed" value.
            if '0' in task:
                if data["failed"] > 0:
                    print(run_id, question_id)
    for key, values in (
        ("accuracy", accuracy_list),
        ("f1", f1_list),
        ("precision", precision_list),
        ("recall", recall_list),
    ):
        print(f"task_{task}_mean_{key}: {np.mean(values)}")
    # The mean "failed" value is divided by 15000 before printing
    # (presumably the per-question sample count -- TODO confirm).
    failure_fraction = np.mean(failed_list) / 15000
    print(f"task_{task}_prob_failures: {failure_fraction}")
    print()

1 4
task_0_new_udfs_mean_accuracy: 0.8631973333333335
task_0_new_udfs_mean_f1: 0.6192898766954091
task_0_new_udfs_mean_precision: 0.621786854684621
task_0_new_udfs_mean_recall: 0.7265138743239781
task_0_new_udfs_prob_failures: 0.02

task_1_new_udfs_mean_accuracy: 0.8235960000000001
task_1_new_udfs_mean_f1: 0.20628178243492123
task_1_new_udfs_mean_precision: 0.21059127122499313
task_1_new_udfs_mean_recall: 0.24479415607390273
task_1_new_udfs_prob_failures: 0.62

task_2_new_udfs_mean_accuracy: 0.78328
task_2_new_udfs_mean_f1: 0.0
task_2_new_udfs_mean_precision: 0.0
task_2_new_udfs_mean_recall: 0.0
task_2_new_udfs_prob_failures: 1.0

task_3_new_udfs_mean_accuracy: 0.8365533333333333
task_3_new_udfs_mean_f1: 0.0
task_3_new_udfs_mean_precision: 0.0
task_3_new_udfs_mean_recall: 0.0
task_3_new_udfs_prob_failures: 1.0



In [6]:
# For every LLM and every "<k>_new_udfs" task, read the saved metrics of each
# (run, question) result file and print the mean of every metric.
#
# Result files that are missing, unreadable, or incomplete are skipped
# (best-effort), but the number of skipped files is reported so that means
# computed over a partial set of results are not mistaken for complete ones.
tasks = [f"{i}_new_udfs" for i in range(4)]
llm_models = ['gpt-3.5-turbo-instruct', 'gpt-3.5-turbo-1106', 'gpt-4-1106-preview']
metric_keys = ("accuracy", "f1", "precision", "recall", "failed")
for llm_model in llm_models:
    print(f"######## {llm_model} ########")
    # Loop-invariant; resolved outside the try block so that a missing
    # 'output_dir' config key is reported instead of silently skipping
    # every file.
    results_dir = os.path.join(config['output_dir'], "clevr", llm_model)
    for task in tasks:
        accuracy_list = []
        f1_list = []
        precision_list = []
        recall_list = []
        failed_list = []
        num_skipped = 0
        for run_id in range(5):
            for question_id in range(10):
                result_path = os.path.join(results_dir, f"task_{task}_labels_run_{run_id}_question_{question_id}.json")
                try:
                    with open(result_path, "r") as f:
                        data = json.load(f)
                    # Example record:
                    # "accuracy": 0.9329333333333333, "f1": 0.7679889298892989, "precision": 0.8169774288518155, "recall": 0.7245430809399478, "failed": 0
                    # Read every metric before appending anything, so a record
                    # with a missing key cannot leave the lists with different
                    # lengths.
                    accuracy, f1, precision, recall, failed = (data[key] for key in metric_keys)
                except (OSError, ValueError, KeyError):
                    # OSError: file missing/unreadable; ValueError: malformed
                    # JSON (json.JSONDecodeError is a ValueError subclass);
                    # KeyError: record lacks a metric. Unlike a bare `except:`,
                    # this lets KeyboardInterrupt and other errors propagate.
                    num_skipped += 1
                    continue
                accuracy_list.append(accuracy)
                f1_list.append(f1)
                precision_list.append(precision)
                recall_list.append(recall)
                failed_list.append(failed)
                # Only for the task whose name contains '0': show which
                # (run, question) pairs reported a non-zero "failed" value.
                if '0' in task and failed > 0:
                    print(run_id, question_id)
        if num_skipped:
            print(f"task_{task}_skipped_results: {num_skipped} of {num_skipped + len(accuracy_list)}")
        if not accuracy_list:
            # np.mean([]) would emit a RuntimeWarning and print nan for every
            # metric; state plainly that there is nothing to summarise.
            print(f"task_{task}: no results loaded")
            print()
            continue
        print(f"task_{task}_mean_accuracy: {np.mean(accuracy_list)}")
        print(f"task_{task}_mean_f1: {np.mean(f1_list)}")
        print(f"task_{task}_mean_precision: {np.mean(precision_list)}")
        print(f"task_{task}_mean_recall: {np.mean(recall_list)}")
        # The mean "failed" value is divided by 15000 before printing
        # (presumably the per-question sample count -- TODO confirm).
        print(f"task_{task}_prob_failures: {np.mean(failed_list) / 15000}")
        print()

######## gpt-3.5-turbo-instruct ########
task_0_new_udfs_mean_accuracy: 0.8529666666666667
task_0_new_udfs_mean_f1: 0.6662707686740801
task_0_new_udfs_mean_precision: 0.6371257113920208
task_0_new_udfs_mean_recall: 0.7866214453547347
task_0_new_udfs_prob_failures: 0.0

task_1_new_udfs_mean_accuracy: 0.82752
task_1_new_udfs_mean_f1: 0.23229065189453352
task_1_new_udfs_mean_precision: 0.312777916692305
task_1_new_udfs_mean_recall: 0.2402823771954207
task_1_new_udfs_prob_failures: 0.5

task_2_new_udfs_mean_accuracy: 0.8017733333333334
task_2_new_udfs_mean_f1: 0.056562425017995674
task_2_new_udfs_mean_precision: 0.08227082363890181
task_2_new_udfs_mean_recall: 0.043095673369896406
task_2_new_udfs_prob_failures: 0.9

task_3_new_udfs_mean_accuracy: 0.8365533333333334
task_3_new_udfs_mean_f1: 0.0
task_3_new_udfs_mean_precision: 0.0
task_3_new_udfs_mean_recall: 0.0
task_3_new_udfs_prob_failures: 1.0

######## gpt-3.5-turbo-1106 ########
0 8
1 4
3 4
3 7
4 4
task_0_new_udfs_mean_accuracy: 0.8551