In [1]:
import json
import yaml
import random
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import f1_score
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display, HTML
config = yaml.safe_load(open("/gscratch/balazinska/enhaoz/VOCAL-UDF/configs/config.yaml", "r"))

import re
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                  '#f781bf', '#a65628', '#984ea3',
                  '#999999', '#e41a1c', '#dede00']

# NL To DSL

In [6]:
def eval_nl_to_dsl(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name):
    num_correct = 0
    num_correct_98 = 0
    num_total = 0
    failed_scores = []
    for query_class_name in query_class_names:
        for run_id in run_ids:
            for question_id in question_ids:
                num_total += 1
                try:
                    with open(os.path.join(config['log_dir'], "nl_to_dsl", dataset, query_class_name, vocal_udf_config_name, f"qid={question_id}-run={run_id}.log"), "r") as f:
                        lines = f.readlines()
                    f1_score = -1
                    for line in lines:
                        if "F1 score:" in line:
                            f1_score_pattern = r"F1 score: ([0-9.]+)"
                            match = re.search(f1_score_pattern, line)
                            f1_score = float(match.group(1))
                            break
                    if f1_score == -1:
                        print(f"failed task: qid={question_id}-run={run_id}")
                        f1_score = 0
                    if f1_score == 1:
                        num_correct += 1
                    else:
                        failed_scores.append(f1_score)
                        if f1_score < 0.98:
                            print(f"failed task: qid={question_id}-run={run_id}")
                    if f1_score >= 0.98:
                        if f1_score < 1:
                            print(f"correct (1>f1>=0.98) task: qid={question_id}-run={run_id}")
                        num_correct_98 += 1
                except Exception as e:
                    print(e)
    print(f"num_total={num_total}, num_correct={num_correct/num_total}, num_correct_98={num_correct_98/num_total}")
    failed_scores.sort(reverse = True)
    print(f"failed_scores={failed_scores}")

In [7]:
dataset = "clevrer"
query_class_names = [
    "3_new_udfs_labels",
]
question_ids = list(range(30))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v"

eval_nl_to_dsl(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

correct (1>f1>=0.98) task: qid=1-run=0
correct (1>f1>=0.98) task: qid=2-run=0
correct (1>f1>=0.98) task: qid=3-run=0
failed task: qid=5-run=0
correct (1>f1>=0.98) task: qid=6-run=0
failed task: qid=8-run=0
correct (1>f1>=0.98) task: qid=9-run=0
correct (1>f1>=0.98) task: qid=19-run=0
correct (1>f1>=0.98) task: qid=1-run=1
correct (1>f1>=0.98) task: qid=2-run=1
correct (1>f1>=0.98) task: qid=3-run=1
failed task: qid=5-run=1
correct (1>f1>=0.98) task: qid=6-run=1
failed task: qid=8-run=1
correct (1>f1>=0.98) task: qid=9-run=1
correct (1>f1>=0.98) task: qid=1-run=2
correct (1>f1>=0.98) task: qid=2-run=2
correct (1>f1>=0.98) task: qid=3-run=2
failed task: qid=5-run=2
correct (1>f1>=0.98) task: qid=6-run=2
failed task: qid=8-run=2
correct (1>f1>=0.98) task: qid=9-run=2
num_total=90, num_correct=0.7555555555555555, num_correct_98=0.9333333333333333
failed_scores=[0.9994895354772844, 0.9994895354772844, 0.9994895354772844, 0.9993861264579497, 0.9993861264579497, 0.9993861264579497, 0.99916736

In [8]:
dataset = "cityflow"
query_class_names = [
    "unavailable_pred=1-unavailable_attr_pred=1-npred=1-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737",
    "unavailable_pred=1-unavailable_attr_pred=1-npred=2-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737"
]
question_ids = list(range(15))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"

eval_nl_to_dsl(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

correct (1>f1>=0.98) task: qid=1-run=0
failed task: qid=8-run=0
correct (1>f1>=0.98) task: qid=1-run=1
correct (1>f1>=0.98) task: qid=1-run=2
failed task: qid=3-run=0
correct (1>f1>=0.98) task: qid=6-run=0
failed task: qid=3-run=1
correct (1>f1>=0.98) task: qid=6-run=1
failed task: qid=3-run=2
correct (1>f1>=0.98) task: qid=6-run=2
num_total=90, num_correct=0.8888888888888888, num_correct_98=0.9555555555555556
failed_scores=[0.9925925925925926, 0.9925925925925926, 0.9925925925925926, 0.9868766404199476, 0.9868766404199476, 0.9868766404199476, 0.9217391304347825, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636]


In [9]:
dataset = "charades"
query_class_names = [
    "unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2",
    "unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2",
    "unavailable=2-npred=3-nobj_pred=1-nvars=2-depth=2"
]
question_ids = list(range(10))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"

eval_nl_to_dsl(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

correct (1>f1>=0.98) task: qid=2-run=0
correct (1>f1>=0.98) task: qid=2-run=1
correct (1>f1>=0.98) task: qid=2-run=2
failed task: qid=0-run=1
failed task: qid=0-run=2
failed task: qid=2-run=0
correct (1>f1>=0.98) task: qid=4-run=0
correct (1>f1>=0.98) task: qid=7-run=0
correct (1>f1>=0.98) task: qid=2-run=1
correct (1>f1>=0.98) task: qid=4-run=1
correct (1>f1>=0.98) task: qid=7-run=1
correct (1>f1>=0.98) task: qid=2-run=2
correct (1>f1>=0.98) task: qid=4-run=2
correct (1>f1>=0.98) task: qid=7-run=2
num_total=90, num_correct=0.8444444444444444, num_correct_98=0.9666666666666667
failed_scores=[0.9947368421052631, 0.9947368421052631, 0.9909297052154195, 0.9909297052154195, 0.9904153354632589, 0.9904153354632589, 0.9904153354632589, 0.9852302345786272, 0.9850107066381156, 0.9850107066381156, 0.9850107066381156, 0.9563164108618654, 0.23019250253292803, 0.23019250253292803]


# Proposing UDFs

In [2]:
def standardize_udf_name(dataset, udf_name):
    udf_name = udf_name.replace(" ", "").replace("_", "").lower()
    # location_bottom_left, behind_and_near, in_front_of, behind_left_of, far_left_of, behind_and_left_of, near_and_right_of,
    if dataset == "clevrer":
        if udf_name in ["materialmetal"]:
            udf_name = "metal"
        elif udf_name in ["shapecylinder"]:
            udf_name = "cylinder"
        elif udf_name in ["coloryellow"]:
            udf_name = "yellow"
        elif udf_name in ["colorpurple"]:
            udf_name = "purple"
        elif udf_name in ["colorcyan"]:
            udf_name = "cyan"
        elif udf_name in ["colorbrown"]:
            udf_name = "brown"
        elif udf_name in ["locationright"]:
            udf_name = "right"
        elif udf_name in ["locationbottom"]:
            udf_name = "bottom"
        elif udf_name in ["farfrom", "farawayfrom"]:
            udf_name = "far"
        elif udf_name in ["nearof"]:
            udf_name = "near"
        elif udf_name in ["behindof"]:
            udf_name = "behind"
        elif udf_name in ["shapecylindrical"]:
            udf_name = "cylinder"
    elif dataset == "cityflow":
        # right_of, left_of, pickup, in_front_of_white, suv_and_red, white_sedan, moves_in_front_of, color_blue, color_red,
        if udf_name in ["colorred"]:
            udf_name = "red"
        elif udf_name in ["colorblue"]:
            udf_name = "blue"
        elif udf_name in ["rightof"]:
            udf_name = "totherightof"
        elif udf_name in ["leftof"]:
            udf_name = "totheleftof"
        elif udf_name in ["pickup"]:
            udf_name = "pickuptruck"
    elif dataset == "charades":
        # inside, inside_of, eating_from, inside_and_interacting_with, inside_while_drinking_from, inside_while_drinking, drinking_from_inside, beneath_and_wearing, moving_behind
        if udf_name in ["inside", "insideof"]:
            udf_name = "in"
        elif udf_name in ["eatingfrom"]:
            udf_name = "eating"
    return udf_name

In [33]:
def eval_proposing_udfs(dataset, query_class_names, num_new_udf_list, question_ids, run_ids, vocal_udf_config_name):
    # Is the system able to propose UDFs when needed?
    # When does our approach work? When does it fail?
    # What UDFs does the system propose?
    # Same 90 queries, analyze how well the proposed UDFs match with the ground-truth
    # 1. For every proposed UDF, how to define "match"?
    # 2. When does it over-proposes?
    # 3. When does it under-proposes?
    # 4. What's the average number of UDFs proposed?
    FP_list = defaultdict(int) # proposed UDFs
    FN_list = defaultdict(int) # gt UDFs
    num_proposed_udfs = 0
    num_gt_new_udfs = 0
    for num_new_udfs in num_new_udf_list:
        avg_num_proposed_udfs = []
        for query_class_name in query_class_names:
            for run_id in run_ids:
                for question_id in question_ids:
                    proposed_udfs = []
                    try:
                        with open(os.path.join(config['output_dir'], "udf_generation", dataset, query_class_name, f"num_missing_udfs={num_new_udfs}", vocal_udf_config_name, f"qid={question_id}-run={run_id}.json"), "r") as f:
                            data = json.load(f)
                        proposed_udfs.extend(data["on_the_fly_udf_names"])
                        proposed_udfs.extend(data["materialized_df_names"])
                    except Exception as e:
                        print(e)
                    if len(proposed_udfs) > 0 and num_new_udfs == 0:
                        print(f"query_class_name={query_class_name}, question_id={question_id}, run_id={run_id}")
                    num_proposed_udfs += len(proposed_udfs)
                    num_gt_new_udfs += num_new_udfs
                    avg_num_proposed_udfs.append(len(proposed_udfs))
                    input_query_file = os.path.join(config["data_dir"], dataset, f"{query_class_name}.json")
                    input_query = json.load(open(input_query_file, "r"))["questions"][question_id]
                    new_modules = input_query["new_modules"]
                    gt_udfs = new_modules[(len(new_modules) - num_new_udfs):]
                    for proposed_udf in proposed_udfs:
                        if standardize_udf_name(dataset, proposed_udf) not in [gt_udf.replace(" ", "").replace("_", "").lower() for gt_udf in gt_udfs]:
                            FP_list[proposed_udf] += 1

                    for gt_udf in gt_udfs:
                        if gt_udf.replace(" ", "").replace("_", "").lower() not in [standardize_udf_name(dataset, proposed_udf) for proposed_udf in proposed_udfs]:
                            FN_list[gt_udf] += 1
                            if gt_udf == "HOLDING":
                                print(f"FN: query_class_name={query_class_name}, question_id={question_id}, run_id={run_id}")

        avg_num_proposed_udfs = np.mean(avg_num_proposed_udfs)
        print(f"num_new_udfs={num_new_udfs}: {avg_num_proposed_udfs}")

    # FP: over-proposed
    # FN: under-proposed
    print("FP: over-proposed")
    for udf_name, count in sorted(FP_list.items(), key=lambda x: -x[1]):
        print(f"{udf_name}: {count}")
    print(f"#FP={sum(FP_list.values())}")
    print("FN: under-proposed")
    for udf_name, count in sorted(FN_list.items(), key=lambda x: -x[1]):
        print(f"{udf_name}: {count}")
    print(f"#FN={sum(FN_list.values())}")
    print("num_proposed_udfs:", num_proposed_udfs)
    print("num_gt_new_udfs:", num_gt_new_udfs)

In [49]:
dataset = "clevrer"
query_class_names = [
    "3_new_udfs_labels",
]
question_ids = list(range(30))
run_ids = list(range(3))
num_new_udf_list = [0, 1, 2, 3]
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v"

eval_proposing_udfs(dataset, query_class_names, num_new_udf_list, question_ids, run_ids, vocal_udf_config_name)

num_new_udfs=0: 0.0
num_new_udfs=1: 0.6222222222222222
[Errno 2] No such file or directory: '/gscratch/balazinska/enhaoz/VOCAL-UDF/outputs/udf_generation/clevrer/3_new_udfs_labels/num_missing_udfs=2/ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v/qid=26-run=1.json'
num_new_udfs=2: 1.6777777777777778
num_new_udfs=3: 2.7666666666666666
FP: over-proposed
location_bottom_left: 7
behind_and_near: 4
location_right: 4
in_front_of: 3
behind_left_of: 2
far_left_of: 1
behind_and_left_of: 1
near_and_right_of: 1
#FP=23
FN: under-proposed
RIGHTOF: 36
BEHIND: 32
FAR: 12
RIGHT: 10
NEAR: 9
BOTTOM: 7
CYLINDER: 1
#FN=107
num_proposed_udfs: 456
num_gt_new_udfs: 540


In [51]:
dataset = "cityflow"
query_class_names = [
    "unavailable_pred=1-unavailable_attr_pred=1-npred=1-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737",
    "unavailable_pred=1-unavailable_attr_pred=1-npred=2-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737"
]
question_ids = list(range(15))
run_ids = list(range(3))
num_new_udf_list = [0, 1, 2]
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"

eval_proposing_udfs(dataset, query_class_names, num_new_udf_list, question_ids, run_ids, vocal_udf_config_name)

num_new_udfs=0: 0.0
num_new_udfs=1: 1.011111111111111
num_new_udfs=2: 1.9555555555555555
FP: over-proposed
in_front_of_white: 1
suv_and_red: 1
white_sedan: 1
moves_in_front_of: 1
#FP=4
FN: under-proposed
INFRONTOF: 5
BLACK: 1
TOTHERIGHTOF: 1
#FN=7
num_proposed_udfs: 267
num_gt_new_udfs: 270


In [53]:
dataset = "charades"
query_class_names = [
    "unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2",
    "unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2",
    "unavailable=2-npred=3-nobj_pred=1-nvars=2-depth=2"
]
question_ids = list(range(10))
run_ids = list(range(3))
num_new_udf_list = [0, 1, 2]
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"

eval_proposing_udfs(dataset, query_class_names, num_new_udf_list, question_ids, run_ids, vocal_udf_config_name)

query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=2, run_id=2
query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2, question_id=0, run_id=0
query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2, question_id=0, run_id=1
query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2, question_id=0, run_id=2
num_new_udfs=0: 0.044444444444444446
FN: query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=4, run_id=0
FN: query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=6, run_id=0
FN: query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=7, run_id=0
FN: query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=4, run_id=1
FN: query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=6, run_id=1
FN: query_class_name=unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2, question_id=4, run_id=2
FN: query_class_nam

# Selection Strategy

In [3]:
def is_correctly_proposed(dataset, udf_name, gt_udfs):
    udf_name = standardize_udf_name(dataset, udf_name)
    if udf_name not in [gt_udf.replace(" ", "").replace("_", "").lower() for gt_udf in gt_udfs]:
        return False
    else:
        return True

In [7]:
def eval_selection_strategy(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name):
    """ Only evaluate correctly proposed UDFs"""
    num_incorrect_proposal = 0
    # UDF selection stats
    num_correct_selection = 0
    num_correct_selection_80 = 0
    num_incorrect_selection = 0

    # Choosing between UDF types stats (when 'dummy' is the only best UDF type)
    num_correct_udf_type_dummy = 0 # The number of times the selected UDF type is correct
    num_incorrect_udf_type_dummy = 0 # The number of times the selected UDF type is incorrect
    num_llm_decides_program_udf_type_dummy = 0 # The number of times the LLM-selected UDF type is program
    num_llm_decides_model_udf_type_dummy = 0 # The number of times the LLM-selected UDF type is model

    # Choosing between UDF types stats (when the best UDF types are not 'dummy')
    num_correct_udf_type_not_dummy = 0
    num_incorrect_udf_type_not_dummy = 0
    num_llm_decides_correct_udf_type_not_dummy = 0
    num_llm_decides_incorrect_udf_type_not_dummy = 0

    for query_class_name in query_class_names:
        for run_id in run_ids:
            for question_id in question_ids:
                try:
                    input_query_file = os.path.join(config["data_dir"], dataset, f"{query_class_name}.json")
                    input_query = json.load(open(input_query_file, "r"))["questions"][question_id]
                    new_modules = input_query["new_modules"]
                    gt_udfs = new_modules

                    with open(os.path.join(config['output_dir'], "best_udf_type", dataset, query_class_name, vocal_udf_config_name, f"qid={question_id}-run={run_id}.json"), "r") as f:
                        best_udf_type_data = json.load(f)
                    for udf_name, v in best_udf_type_data.items():
                        if not is_correctly_proposed(dataset, udf_name, gt_udfs):
                            num_incorrect_proposal += 1
                            continue
                        if v["best_test_score"] == v["selected_test_score"]:
                            num_correct_selection += 1
                        else:
                            num_incorrect_selection += 1

                        if v["selected_test_score"] >= 0.8 * v["best_test_score"]:
                            num_correct_selection_80 += 1

                        if "dummy" not in v["best_udf_types"] or ("dummy" in v["best_udf_types"] and len(v["best_udf_types"]) > 1):
                            if v["selected_udf_type"] in v["best_udf_types"]:
                                num_correct_udf_type_not_dummy += 1
                            else:
                                num_incorrect_udf_type_not_dummy += 1
                        else:
                            assert len(v["best_udf_types"]) == 1, "assert 1"
                            if v["selected_udf_type"] == "dummy":
                                num_correct_udf_type_dummy += 1
                            else:
                                num_incorrect_udf_type_dummy += 1
                    with open(os.path.join(config['output_dir'], "llm_decides_udf_type", dataset, query_class_name, vocal_udf_config_name, f"qid={question_id}-run={run_id}.json"), "r") as f:
                        llm_decides_udf_type_data = json.load(f)
                    for udf_name, llm_decides_udf_type in llm_decides_udf_type_data.items():
                        if not is_correctly_proposed(dataset, udf_name, gt_udfs):
                            continue
                        if "dummy" not in best_udf_type_data[udf_name]["best_udf_types"] or ("dummy" in best_udf_type_data[udf_name]["best_udf_types"] and len(best_udf_type_data[udf_name]["best_udf_types"]) > 1):
                            if llm_decides_udf_type in best_udf_type_data[udf_name]["best_udf_types"]:
                                num_llm_decides_correct_udf_type_not_dummy += 1
                            else:
                                num_llm_decides_incorrect_udf_type_not_dummy += 1
                        else:
                            if llm_decides_udf_type == "program":
                                num_llm_decides_program_udf_type_dummy += 1
                            elif llm_decides_udf_type == "model":
                                num_llm_decides_model_udf_type_dummy += 1
                            else:
                                raise ValueError(f"llm_decides_udf_type={llm_decides_udf_type}")
                except Exception as e:
                    print(f"Error: {e}, query_class_name={query_class_name}, question_id={question_id}, run_id={run_id}")

    print(f"num_incorrect_proposal={num_incorrect_proposal}")

    print(f"num_correct_selection={num_correct_selection}, num_incorrect_selection={num_incorrect_selection}, ratio={num_correct_selection/(num_correct_selection+num_incorrect_selection)}, num_correct_selection_80={num_correct_selection_80}, ratio={num_correct_selection_80/(num_correct_selection+num_incorrect_selection)}")
    print()

    print("[When the best UDF types are not 'dummy']")
    print(f"num_correct_udf_type_not_dummy={num_correct_udf_type_not_dummy}, num_incorrect_udf_type_not_dummy={num_incorrect_udf_type_not_dummy}, ratio={num_correct_udf_type_not_dummy/(num_correct_udf_type_not_dummy+num_incorrect_udf_type_not_dummy)}")
    print(f"num_llm_decides_correct_udf_type_not_dummy={num_llm_decides_correct_udf_type_not_dummy}, num_llm_decides_incorrect_udf_type_not_dummy={num_llm_decides_incorrect_udf_type_not_dummy}, ratio={num_llm_decides_correct_udf_type_not_dummy/(num_llm_decides_correct_udf_type_not_dummy+num_llm_decides_incorrect_udf_type_not_dummy)}")
    print()

    print("[When 'dummy' is the only best UDF type]")
    print(f"num_correct_udf_type_dummy={num_correct_udf_type_dummy}, num_incorrect_udf_type_dummy={num_incorrect_udf_type_dummy}, ratio={num_correct_udf_type_dummy/(num_correct_udf_type_dummy+num_incorrect_udf_type_dummy)}")
    print(f"num_llm_decides_program_udf_type_dummy={num_llm_decides_program_udf_type_dummy}, num_llm_decides_model_udf_type_dummy={num_llm_decides_model_udf_type_dummy}")

In [8]:
dataset = "clevrer"
query_class_names = [
    "3_new_udfs_labels",
]
question_ids = list(range(30))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v"

eval_selection_strategy(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

num_incorrect_proposal=7
num_correct_selection=141, num_incorrect_selection=101, ratio=0.5826446280991735, num_correct_selection_80=221, ratio=0.9132231404958677

[When the best UDF types are not 'dummy']
num_correct_udf_type_not_dummy=212, num_incorrect_udf_type_not_dummy=21, ratio=0.9098712446351931
num_llm_decides_correct_udf_type_not_dummy=164, num_llm_decides_incorrect_udf_type_not_dummy=69, ratio=0.703862660944206

[When 'dummy' is the only best UDF type]
num_correct_udf_type_dummy=5, num_incorrect_udf_type_dummy=4, ratio=0.5555555555555556
num_llm_decides_program_udf_type_dummy=5, num_llm_decides_model_udf_type_dummy=4


In [9]:
dataset = "cityflow"
query_class_names = [
    "unavailable_pred=1-unavailable_attr_pred=1-npred=1-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737",
    "unavailable_pred=1-unavailable_attr_pred=1-npred=2-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737"
]
question_ids = list(range(15))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"

eval_selection_strategy(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

num_incorrect_proposal=0
num_correct_selection=130, num_incorrect_selection=46, ratio=0.7386363636363636, num_correct_selection_80=148, ratio=0.8409090909090909

[When the best UDF types are not 'dummy']
num_correct_udf_type_not_dummy=144, num_incorrect_udf_type_not_dummy=30, ratio=0.8275862068965517
num_llm_decides_correct_udf_type_not_dummy=114, num_llm_decides_incorrect_udf_type_not_dummy=60, ratio=0.6551724137931034

[When 'dummy' is the only best UDF type]
num_correct_udf_type_dummy=1, num_incorrect_udf_type_dummy=1, ratio=0.5
num_llm_decides_program_udf_type_dummy=1, num_llm_decides_model_udf_type_dummy=1


In [10]:
dataset = "charades"
query_class_names = [
    "unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2",
    "unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2",
    "unavailable=2-npred=3-nobj_pred=1-nvars=2-depth=2"
]
question_ids = list(range(10))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"

eval_selection_strategy(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

num_incorrect_proposal=0
num_correct_selection=110, num_incorrect_selection=54, ratio=0.6707317073170732, num_correct_selection_80=137, ratio=0.8353658536585366

[When the best UDF types are not 'dummy']
num_correct_udf_type_not_dummy=92, num_incorrect_udf_type_not_dummy=31, ratio=0.7479674796747967
num_llm_decides_correct_udf_type_not_dummy=76, num_llm_decides_incorrect_udf_type_not_dummy=47, ratio=0.6178861788617886

[When 'dummy' is the only best UDF type]
num_correct_udf_type_dummy=28, num_incorrect_udf_type_dummy=13, ratio=0.6829268292682927
num_llm_decides_program_udf_type_dummy=7, num_llm_decides_model_udf_type_dummy=34


# UDF type stats

In [24]:
def eval_udf_type_stats(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name):
    num_missing_udfs = 3 if dataset == "clevrer" else 2
    num_dummy, num_program, num_model = 0, 0, 0
    dummy_names = []
    num_files = 0
    for query_class_name in query_class_names:
        for run_id in run_ids:
            for question_id in question_ids:
                try:
                    input_query_file = os.path.join(config["data_dir"], dataset, f"{query_class_name}.json")
                    input_query = json.load(open(input_query_file, "r"))["questions"][question_id]
                    new_modules = input_query["new_modules"]
                    gt_udfs = new_modules
                    with open(os.path.join(config['output_dir'], "udf_generation", dataset, query_class_name, f"num_missing_udfs={num_missing_udfs}", vocal_udf_config_name, f"qid={question_id}-run={run_id}.json"), "r") as f:
                        data = json.load(f)
                    num_files += 1
                    for udf in data["registered_functions"]:
                        if "semantic_interpretation" in udf:
                            udf_name = udf["signature"].split("(")[0]
                            if not is_correctly_proposed(dataset, udf_name, gt_udfs):
                                continue
                            if udf["semantic_interpretation"] == "dummy":
                                num_dummy += 1
                                dummy_names.append(udf_name)
                            elif udf["semantic_interpretation"] == "model":
                                num_model += 1
                            elif udf["function_implementation"] != "":
                                num_program += 1
                            else:
                                raise ValueError(f"Unknown semantic_interpretation: {udf['semantic_interpretation']}")
                except Exception as e:
                    print(f"Error: {e}, query_class_name={query_class_name}, question_id={question_id}, run_id={run_id}")
    print(f"num_program={num_program}, num_model={num_model}, num_dummy={num_dummy}, num_files={num_files}")
    num_all = num_dummy + num_program + num_model
    print(f"program percentage: {num_program / num_all:.2f}, model percentage: {num_model / num_all:.2f}, dummy percentage: {num_dummy / num_all:.2f}")
    print(f"dummy names: {sorted(dummy_names)}")

In [25]:
# Clevrer
print("Clevrer")
dataset = "clevrer"
query_class_names = [
    "3_new_udfs_labels",
]
question_ids = list(range(30))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v"
eval_udf_type_stats(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

# CityFlow
print("CityFlow")
dataset = "cityflow"
query_class_names = [
    "unavailable_pred=1-unavailable_attr_pred=1-npred=1-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737",
    "unavailable_pred=1-unavailable_attr_pred=1-npred=2-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737"
]
question_ids = list(range(15))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"
eval_udf_type_stats(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

# Charades
print("Charades")
dataset = "charades"
query_class_names = [
    "unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2",
    "unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2",
    "unavailable=2-npred=3-nobj_pred=1-nvars=2-depth=2"
]
question_ids = list(range(10))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"
eval_udf_type_stats(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

Clevrer
num_program=172, num_model=55, num_dummy=15, num_files=90
program percentage: 0.71, model percentage: 0.23, dummy percentage: 0.06
dummy names: ['behind', 'behind', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'material_metal', 'right_of', 'right_of', 'right_of']
CityFlow
num_program=100, num_model=64, num_dummy=12, num_files=90
program percentage: 0.57, model percentage: 0.36, dummy percentage: 0.07
dummy names: ['black', 'black', 'black', 'black', 'color_blue', 'in_front_of', 'in_front_of', 'in_front_of', 'left_of', 'left_of', 'left_of', 'sedan']
Charades
num_program=79, num_model=34, num_dummy=51, num_files=90
program percentage: 0.48, model percentage: 0.21, dummy percentage: 0.31
dummy names: ['behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind', 'behind

# Program-based UDF

In [19]:
def eval_program_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name):
    """ Only evaluate correctly proposed UDFs"""
    best_program_types_when_best_is_program = defaultdict(int)
    best_program_types_when_best_is_not_program = defaultdict(int)
    f1_scores_when_best_is_program = []
    f1_scores_when_best_is_not_program = []
    best_f1_scores_when_best_is_program = []
    best_f1_scores_when_best_is_not_program = []

    for query_class_name in query_class_names:
        for run_id in run_ids:
            for question_id in question_ids:
                try:
                    input_query_file = os.path.join(config["data_dir"], dataset, f"{query_class_name}.json")
                    input_query = json.load(open(input_query_file, "r"))["questions"][question_id]
                    new_modules = input_query["new_modules"]
                    gt_udfs = new_modules

                    with open(os.path.join(config['output_dir'], "best_udf_type", dataset, query_class_name, vocal_udf_config_name, f"qid={question_id}-run={run_id}.json"), "r") as f:
                        best_udf_type_data = json.load(f)
                    for udf_name, v in best_udf_type_data.items():
                        if not is_correctly_proposed(dataset, udf_name, gt_udfs):
                            continue
                        candidates = v["candidates"]
                        if "program" in v["best_udf_types"]:
                            for best_udf_id in v["best_udf_ids"]:
                                if candidates[best_udf_id]["udf_type"] == "program":
                                    for p in candidates[best_udf_id]["program_types"]:
                                        best_program_types_when_best_is_program[p] += 1
                                        best_program_types_when_best_is_program["base"] += 1
                            for udf_id, udf_dict in candidates.items():
                                if udf_dict["udf_type"] == "program":
                                    f1_scores_when_best_is_program.append(udf_dict["test_score"])
                            best_f1_scores_when_best_is_program.append(v["best_test_score"])
                        else:
                            best_program_score = -1
                            best_program_types = []
                            for udf_id, udf_dict in candidates.items():
                                if udf_dict["udf_type"] == "program":
                                    f1_scores_when_best_is_not_program.append(udf_dict["test_score"])
                                    if udf_dict["test_score"] > best_program_score:
                                        best_program_score = udf_dict["test_score"]
                                        best_program_types = udf_dict["program_types"]
                                        best_program_types.append("base")
                                    elif udf_dict["test_score"] == best_program_score:
                                        best_program_types.extend(udf_dict["program_types"])
                                        best_program_types.append("base")
                            for p in best_program_types:
                                best_program_types_when_best_is_not_program[p] += 1
                            best_f1_scores_when_best_is_not_program.append(best_program_score)
                except Exception as e:
                    print(f"Error: {e}, query_class_name={query_class_name}, question_id={question_id}, run_id={run_id}")
    print("[when best is program]")
    print("best_program_types")
    for k, v in sorted(best_program_types_when_best_is_program.items(), key=lambda x: -x[1]):
        print(f"{k}: {v}")
    print(f"best_f1_scores: 25 percentile={np.percentile(best_f1_scores_when_best_is_program, 25)}, 50 percentile={np.percentile(best_f1_scores_when_best_is_program, 50)}, 75 percentile={np.percentile(best_f1_scores_when_best_is_program, 75)}")
    print(f"f1_scores: 25 percentile={np.percentile(f1_scores_when_best_is_program, 25)}, 50 percentile={np.percentile(f1_scores_when_best_is_program, 50)}, 75 percentile={np.percentile(f1_scores_when_best_is_program, 75)}")
    print()
    print("[when best is not program]")
    print("best_program_types")
    for k, v in sorted(best_program_types_when_best_is_not_program.items(), key=lambda x: -x[1]):
        print(f"{k}: {v}")
    print(f"best_f1_scores: 25 percentile={np.percentile(best_f1_scores_when_best_is_not_program, 25)}, 50 percentile={np.percentile(best_f1_scores_when_best_is_not_program, 50)}, 75 percentile={np.percentile(best_f1_scores_when_best_is_not_program, 75)}")
    print(f"f1_scores: 25 percentile={np.percentile(f1_scores_when_best_is_not_program, 25)}, 50 percentile={np.percentile(f1_scores_when_best_is_not_program, 50)}, 75 percentile={np.percentile(f1_scores_when_best_is_not_program, 75)}")
    print()
    print()



In [20]:
# Clevrer
print("Clevrer")
dataset = "clevrer"
query_class_names = [
    "3_new_udfs_labels",
]
question_ids = list(range(30))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v"
eval_program_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

# CityFlow
print("CityFlow")
dataset = "cityflow"
query_class_names = [
    "unavailable_pred=1-unavailable_attr_pred=1-npred=1-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737",
    "unavailable_pred=1-unavailable_attr_pred=1-npred=2-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737"
]
question_ids = list(range(15))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"
eval_program_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

# Charades
print("Charades")
dataset = "charades"
query_class_names = [
    "unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2",
    "unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2",
    "unavailable=2-npred=3-nobj_pred=1-nvars=2-depth=2"
]
question_ids = list(range(10))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"
eval_program_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

Clevrer
[when best is program]
best_program_types
base: 147
reuse: 63
parameter: 57
pixel: 27
best_f1_scores: 25 percentile=0.869850132912086, 50 percentile=0.981864192323914, 75 percentile=0.9995992787016631
f1_scores: 25 percentile=0.014322657648697038, 50 percentile=0.44379099751187506, 75 percentile=0.7430984754841368

[when best is not program]
best_program_types
base: 205
reuse: 146
parameter: 104
pixel: 50
best_f1_scores: 25 percentile=0.11410133808082869, 50 percentile=0.5180923542770628, 75 percentile=0.6045581895882516
f1_scores: 25 percentile=0.0, 50 percentile=0.0, 75 percentile=0.1281340276139313


CityFlow
[when best is program]
best_program_types
base: 24
parameter: 15
reuse: 9
best_f1_scores: 25 percentile=0.9595330739299611, 50 percentile=1.0, 75 percentile=1.0
f1_scores: 25 percentile=0.33152901023890785, 50 percentile=0.6793912819190095, 75 percentile=0.8236407532343673

[when best is not program]
best_program_types
base: 241
reuse: 215
parameter: 77
best_f1_scores: 

# Distilled-model UDFs 

In [29]:
def eval_model_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name):
    """ Only evaluate correctly proposed UDFs"""
    f1_scores_when_best_is_model = []
    f1_scores_when_best_is_not_model = []
    dummy_f1_scores_when_best_is_model = []
    dummy_f1_scores_when_best_is_not_model = []

    for query_class_name in query_class_names:
        for run_id in run_ids:
            for question_id in question_ids:
                try:
                    input_query_file = os.path.join(config["data_dir"], dataset, f"{query_class_name}.json")
                    input_query = json.load(open(input_query_file, "r"))["questions"][question_id]
                    new_modules = input_query["new_modules"]
                    gt_udfs = new_modules

                    with open(os.path.join(config['output_dir'], "best_udf_type", dataset, query_class_name, vocal_udf_config_name, f"qid={question_id}-run={run_id}.json"), "r") as f:
                        best_udf_type_data = json.load(f)
                    for udf_name, v in best_udf_type_data.items():
                        if not is_correctly_proposed(dataset, udf_name, gt_udfs):
                            continue
                        candidates = v["candidates"]
                        if "model" in v["best_udf_types"]:
                            f1_scores_when_best_is_model.append(v["best_test_score"])
                            for udf_id, udf_dict in candidates.items():
                                if udf_dict["udf_type"] == "dummy":
                                    dummy_f1_scores_when_best_is_model.append(udf_dict["test_score"])
                                    break
                        else:
                            for udf_id, udf_dict in candidates.items():
                                if udf_dict["udf_type"] == "model":
                                    f1_scores_when_best_is_not_model.append(udf_dict["test_score"])
                                elif udf_dict["udf_type"] == "dummy":
                                    dummy_f1_scores_when_best_is_not_model.append(udf_dict["test_score"])

                except Exception as e:
                    print(f"Error: {e}, query_class_name={query_class_name}, question_id={question_id}, run_id={run_id}")
    print("[when best is model]")
    print(f"f1_scores: 25 percentile={np.percentile(f1_scores_when_best_is_model, 25)}, 50 percentile={np.percentile(f1_scores_when_best_is_model, 50)}, 75 percentile={np.percentile(f1_scores_when_best_is_model, 75)}")
    print(f"dummy_f1_scores: 25 percentile={np.percentile(dummy_f1_scores_when_best_is_model, 25)}, 50 percentile={np.percentile(dummy_f1_scores_when_best_is_model, 50)}, 75 percentile={np.percentile(dummy_f1_scores_when_best_is_model, 75)}")
    print()

    print("[when best is not model]")
    print(f"f1_scores: 25 percentile={np.percentile(f1_scores_when_best_is_not_model, 25)}, 50 percentile={np.percentile(f1_scores_when_best_is_not_model, 50)}, 75 percentile={np.percentile(f1_scores_when_best_is_not_model, 75)}")
    print(f"dummy_f1_scores: 25 percentile={np.percentile(dummy_f1_scores_when_best_is_not_model, 25)}, 50 percentile={np.percentile(dummy_f1_scores_when_best_is_not_model, 50)}, 75 percentile={np.percentile(dummy_f1_scores_when_best_is_not_model, 75)}")
    print()
    print()



In [30]:
# Clevrer
print("Clevrer")
dataset = "clevrer"
query_class_names = [
    "3_new_udfs_labels",
]
question_ids = list(range(30))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=True-pretrained_models=False-ntrain_distill=100-nselection_samples=500-selection=both-labels=user-budget=20-llm_method=gpt4v"
eval_model_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

# CityFlow
print("CityFlow")
dataset = "cityflow"
query_class_names = [
    "unavailable_pred=1-unavailable_attr_pred=1-npred=1-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737",
    "unavailable_pred=1-unavailable_attr_pred=1-npred=2-nattr_pred=2-nvars=3-depth=3-max_duration=15-min_npos=74-max_npos=737"
]
question_ids = list(range(15))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"
eval_model_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

# Charades
print("Charades")
dataset = "charades"
query_class_names = [
    "unavailable=2-npred=4-nobj_pred=1-nvars=3-depth=2",
    "unavailable=2-npred=4-nobj_pred=1-nvars=2-depth=2",
    "unavailable=2-npred=3-nobj_pred=1-nvars=2-depth=2"
]
question_ids = list(range(10))
run_ids = list(range(3))
vocal_udf_config_name = "ninterp=10-nparams=5-kwargs=True-pixels=False-pretrained_models=False-ntrain_distill=500-nselection_samples=500-selection=both-labels=user-budget=50-llm_method=gpt4v"
eval_model_udfs(dataset, query_class_names, question_ids, run_ids, vocal_udf_config_name)

Clevrer
[when best is model]
f1_scores: 25 percentile=0.8032120824727298, 50 percentile=0.8444200424870745, 75 percentile=0.898361855256242
dummy_f1_scores: 25 percentile=0.2278144417381927, 50 percentile=0.5209288566780063, 75 percentile=0.5253999852539999

[when best is not model]
f1_scores: 25 percentile=0.3155300839511366, 50 percentile=0.47207903431838627, 75 percentile=0.6405950814644101
dummy_f1_scores: 25 percentile=0.23352764529235118, 50 percentile=0.6643958050656311, 75 percentile=0.6676437279328492


CityFlow
[when best is model]
f1_scores: 25 percentile=0.5497728333482863, 50 percentile=0.653390668057452, 75 percentile=0.7927347903130537
dummy_f1_scores: 25 percentile=0.27912579590431946, 50 percentile=0.28237718996908273, 75 percentile=0.30966869506423256

[when best is not model]
f1_scores: 25 percentile=0.6681438421324796, 50 percentile=0.694771797129113, 75 percentile=0.7325547845858325
dummy_f1_scores: 25 percentile=0.6627440492110189, 50 percentile=0.667732480682121,

## Labeling quality

In [4]:
def eval_labeling_quality(dataset):
    if dataset == "cityflow":
        udfs = ["suv", "white", "grey", "van", "sedan", "black", "red", "blue", "pickup_truck"]
    elif dataset == "charades":
        # removing "have_it_on_the_back" and "twisting" due to insufficient positives
        udfs = ["holding", "sitting_on", "standing_on", "covered_by", "carrying", "eating", "wiping", "touching", "leaning_on", "wearing", "drinking_from", "lying_on", "writing_on", "above", "in_front_of", "beneath", "behind", "in"]
    else:
        raise ValueError(f"dataset={dataset}")

    gpt4v_results = defaultdict(list)
    for udf_name in udfs:
        for run_id in range(3):
            # random.seed(run_id)
            # np.random.seed(run_id)
            try:
                with open(os.path.join(config["log_dir"], "labeling_quality", dataset, "balanced=True", f"udf_name={udf_name}-n_train_distill=500-llm_method=gpt4v-run_id={run_id}.log"), "r") as f:
                    lines = f.readlines()
                    llm_f1 = -1
                    npos, nneg = 0, 0
                    for line in lines:
                        if "llm_f1: " in line:
                            # 2024-07-21 23:37:06,240 - vocaludf - DEBUG - llm_TP: 63, llm_FP: 47, llm_TN: 203, llm_FN: 187, llm_f1: 0.35
                            pattern = r"llm_TP: (\d+), llm_FP: (\d+), llm_TN: (\d+), llm_FN: (\d+), llm_f1: ([\d.]+)"
                            match = re.search(pattern, line)
                            llm_tp = int(match.group(1))
                            llm_fp = int(match.group(2))
                            llm_tn = int(match.group(3))
                            llm_fn = int(match.group(4))
                            llm_f1 = float(match.group(5))
                            npos = llm_tp + llm_fn
                            nneg = llm_fp + llm_tn
                            # print(f"llm_f1={llm_f1}, npos={npos}, nneg={nneg}")
                            # if npos != nneg:
                            #     print(f"not enough positives: udf_name={udf_name}, run_id={run_id}")
                            break
                    if llm_f1 == -1:
                        print(f"failed task: udf_name={udf_name}, run_id={run_id}")
                        llm_f1 = 0
                    gpt4v_results[udf_name].append(llm_f1)
            except Exception as e:
                print(f"Error: {e}, udf_name={udf_name}, run_id={run_id}")

    print(f"dataset={dataset}")
    for udf_name, llm_f1s in gpt4v_results.items():
        print(f"[gpt4v] udf_name={udf_name}, mean={np.mean(llm_f1s):.3f}, std={np.std(llm_f1s):.3f}")
    print()

eval_labeling_quality("cityflow")
eval_labeling_quality("charades")

dataset=cityflow
[gpt4v] udf_name=suv, mean=0.782, std=0.012
[gpt4v] udf_name=white, mean=0.883, std=0.008
[gpt4v] udf_name=grey, mean=0.762, std=0.014
[gpt4v] udf_name=van, mean=0.872, std=0.012
[gpt4v] udf_name=sedan, mean=0.797, std=0.008
[gpt4v] udf_name=black, mean=0.784, std=0.011
[gpt4v] udf_name=red, mean=0.801, std=0.012
[gpt4v] udf_name=blue, mean=0.874, std=0.007
[gpt4v] udf_name=pickup_truck, mean=0.912, std=0.005

dataset=charades
[gpt4v] udf_name=holding, mean=0.563, std=0.012
[gpt4v] udf_name=sitting_on, mean=0.781, std=0.005
[gpt4v] udf_name=standing_on, mean=0.868, std=0.005
[gpt4v] udf_name=covered_by, mean=0.717, std=0.009
[gpt4v] udf_name=carrying, mean=0.718, std=0.012
[gpt4v] udf_name=eating, mean=0.547, std=0.036
[gpt4v] udf_name=wiping, mean=0.671, std=0.017
[gpt4v] udf_name=touching, mean=0.536, std=0.022
[gpt4v] udf_name=leaning_on, mean=0.756, std=0.020
[gpt4v] udf_name=wearing, mean=0.820, std=0.014
[gpt4v] udf_name=drinking_from, mean=0.574, std=0.016
[gpt4