In [None]:
import json
import random


def get_accuracy(file_name: str):
    """Print the substring-match accuracy of the predictions in a JSONL file.

    Every non-blank line of ``file_name`` is parsed as a JSON object that must
    provide a ``"label"`` and a ``"pred"`` string. A prediction is counted as
    correct when the lower-cased label occurs anywhere inside the lower-cased
    prediction.

    When ``file_name`` contains ``"mistral"`` and the prediction contains
    ``"[/INST]"``, only the segment that follows the first ``"[/INST]"``
    marker (up to the next marker, if any) is scored.

    Args:
        file_name: Path of the JSONL predictions file.

    Returns:
        None. One line is printed: the file name, the accuracy in percent and
        the number of scored items. An empty file is reported as ``0.0``.

    Raises:
        KeyError: If a record lacks ``"label"`` or ``"pred"``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle; blank lines (e.g. a trailing
    # newline) are skipped instead of crashing json.loads.
    with open(file_name) as source:
        data = [json.loads(line) for line in source if line.strip()]

    is_mistral = "mistral" in file_name
    n_correct = 0
    for item in data:
        pred = item["pred"]
        if is_mistral and "[/INST]" in pred:
            pred = pred.split("[/INST]")[1]
        n_correct += item["label"].lower() in pred.lower()

    total = len(data)
    # Guard the empty case explicitly rather than adding an epsilon to the
    # denominator, which distorted every reported percentage.
    accuracy = n_correct / total * 100 if total else 0.0
    print(file_name, accuracy, total)

def get_incorrect(file_name: str, output_file_name: str, output_jsonl_file: "str | None" = None, sample_rate: float = 0.002):
    """Dump the wrongly predicted items of a JSONL predictions file.

    Every non-blank line of ``file_name`` is parsed as a JSON object that must
    provide ``"image"``, ``"label"`` and ``"pred"`` strings. A prediction is
    wrong when the lower-cased label does not occur inside the lower-cased
    prediction. When ``file_name`` contains ``"mistral"`` and the prediction
    contains ``"[/INST]"``, only the segment that follows the first
    ``"[/INST]"`` marker is considered (and written out).

    Args:
        file_name: Path of the JSONL predictions file.
        output_file_name: Text file to (over)write with one
            ``IMAGE PATH: ...      WRONG PREDICTION: ...`` line per wrong item.
        output_jsonl_file: Optional JSONL file to (over)write with a random
            sample of the wrong items as ``{"image", "label"}`` objects. When
            omitted, no sample file is written; this keeps two-argument
            callers working.
        sample_rate: Probability with which each wrong item is kept in
            ``output_jsonl_file``. Defaults to 0.002.

    Returns:
        None.
    """
    # Close the input handle deterministically; tolerate blank lines.
    with open(file_name) as source:
        data = [json.loads(line) for line in source if line.strip()]

    is_mistral = "mistral" in file_name
    jsonl_results = []

    with open(output_file_name, "w") as f:
        for item in data:
            pred = item["pred"]
            if is_mistral and "[/INST]" in pred:
                pred = pred.split("[/INST]")[1]

            if item["label"].lower() not in pred.lower():
                f.write("IMAGE PATH: " + item["image"] + "      WRONG PREDICTION: " + pred + "\n")
                jsonl_results.append({"image": item["image"], "label": item["label"]})

    if output_jsonl_file is None:
        return

    # Keep only a small random subset in the JSONL file.
    with open(output_jsonl_file, "w") as f:
        for result in jsonl_results:
            if random.random() < sample_rate:
                f.write(json.dumps(result) + "\n")


def get_turned_incorrect(file_name_corrupted: str, file_name_original: str, output_file_name: str, output_jsonl_file: "str | None" = None, sample_rate: float = 0.002):
    """Dump the items that are right on the original run but wrong on the corrupted run.

    Both inputs are JSONL files whose non-blank lines are JSON objects with
    ``"image"``, ``"label"`` and ``"pred"`` strings. Records are paired by
    position, and pairing stops at the end of the shorter file.
    NOTE(review): this presumes both files list the same images in the same
    order; confirm against the code that produces them.

    A prediction is correct when the lower-cased label occurs inside the
    lower-cased prediction. For a file whose name contains ``"mistral"``, a
    prediction containing ``"[/INST]"`` is reduced to the segment that follows
    the first ``"[/INST]"`` marker before it is scored.

    Args:
        file_name_corrupted: Predictions on the corrupted inputs.
        file_name_original: Predictions on the original inputs.
        output_file_name: Text file to (over)write with one
            ``IMAGE PATH: ...      WRONG PREDICTION: ...`` line per selected
            item, using the corrupted run's image path and prediction.
        output_jsonl_file: Optional JSONL file to (over)write with a random
            sample of the selected items as ``{"image", "label"}`` objects.
            When omitted, no sample file is written; this keeps
            three-argument callers working.
        sample_rate: Probability with which each selected item is kept in
            ``output_jsonl_file``. Defaults to 0.002.

    Returns:
        None.
    """
    # Close both input handles deterministically; tolerate blank lines.
    with open(file_name_corrupted) as source:
        data_c = [json.loads(line) for line in source if line.strip()]
    with open(file_name_original) as source:
        data_o = [json.loads(line) for line in source if line.strip()]

    strip_c = "mistral" in file_name_corrupted
    strip_o = "mistral" in file_name_original
    jsonl_results = []

    with open(output_file_name, "w") as f:
        for item_c, item_o in zip(data_c, data_o):
            pred_c = item_c["pred"]
            if strip_c and "[/INST]" in pred_c:
                pred_c = pred_c.split("[/INST]")[1]

            pred_o = item_o["pred"]
            if strip_o and "[/INST]" in pred_o:
                pred_o = pred_o.split("[/INST]")[1]

            wrong_when_corrupted = item_c["label"].lower() not in pred_c.lower()
            right_when_original = item_o["label"].lower() in pred_o.lower()
            if wrong_when_corrupted and right_when_original:
                f.write("IMAGE PATH: " + item_c["image"] + "      WRONG PREDICTION: " + pred_c + "\n")
                jsonl_results.append({"image": item_c["image"], "label": item_c["label"]})

    if output_jsonl_file is None:
        return

    # Keep only a small random subset in the JSONL file.
    with open(output_jsonl_file, "w") as f:
        for result in jsonl_results:
            if random.random() < sample_rate:
                f.write(json.dumps(result) + "\n")


: 

# Main Results

In [3]:
# ImageNet-C 0425
imagenet_c_runs = [
    # open world
    "outputs/imagenet_c/llava7b_gaussian_noise.jsonl",
    "outputs/imagenet_c/llava7b_original.jsonl",
    # closed world (50 labels)
    "outputs/imagenet_c/llava7b_gaussian_noise_closed_set.jsonl",
    "outputs/imagenet_c/llava7b_original_closed_set.jsonl",
    # post diffusion 0429
    "outputs/imagenet_c/llava7b_gaussian_noise_closed_set_post_diffusion.jsonl",
]
for results_path in imagenet_c_runs:
    get_accuracy(results_path)

# Disabled: dump of the wrongly predicted closed-set gaussian-noise images.
#get_incorrect("outputs/imagenet_c/llava7b_gaussian_noise_closed_set.jsonl", "../gpt_exp/candidates/gaussian_noise_llava7b_incorrects.txt", "../gpt_exp/candidates/gaussian_noise_llava7b_incorrects.jsonl")


outputs/imagenet_c/llava7b_gaussian_noise.jsonl 7.337999999999854 50000
outputs/imagenet_c/llava7b_original.jsonl 22.823999999999543 50000
outputs/imagenet_c/llava7b_gaussian_noise_closed_set.jsonl 26.281999999999478 50000
outputs/imagenet_c/llava7b_original_closed_set.jsonl 63.30599999999874 50000


In [3]:
# CIFAR10-C 0425
# closed world: the clean baseline first, then each noise corruption
cifar10_c_runs = [
    "outputs/cifar10_c/llava7b_original.jsonl",
    "outputs/cifar10_c/llava7b_gaussian_noise.jsonl",
    "outputs/cifar10_c/llava7b_impulse_noise.jsonl",
    "outputs/cifar10_c/llava7b_shot_noise.jsonl",
]
for results_path in cifar10_c_runs:
    get_accuracy(results_path)

outputs/cifar10_c/llava7b_original.jsonl 88.45999999999114 10000
outputs/cifar10_c/llava7b_gaussian_noise.jsonl 59.249999999994074 10000
outputs/cifar10_c/llava7b_impulse_noise.jsonl 71.69999999999283 10000
outputs/cifar10_c/llava7b_shot_noise.jsonl 60.84999999999391 10000


In [None]:
# ImageNet
# Each entry is (heading to print, prediction files reported under it).
imagenet_sections = [
    ("\nOpen World\n", [
        "outputs/imagenet_llava7b.jsonl",
        "outputs/imagenet_llava13b.jsonl",
        "outputs/imagenet_llavanext7bmistral.jsonl",
        "outputs/imagenet_llavanext7bvicuna.jsonl",
        "outputs/imagenet_blip2.jsonl",
        "outputs/imagenet_iblip7b.jsonl",
        "outputs/imagenet_iblip13b.jsonl",
    ]),
    ("\nClose World\n", [
        "outputs/imagenet_llava7b_1000classes.jsonl",
        "outputs/imagenet_llava13b_1000classes.jsonl",
        "outputs/imagenet_llavanext7bmistral_1000classes.jsonl",
        "outputs/imagenet_llavanext7bvicuna_1000classes.jsonl",
        "outputs/imagenet_blip2_1000classes.jsonl",
        "outputs/imagenet_iblip7b_1000classes.jsonl",
        "outputs/imagenet_iblip13b_1000classes.jsonl",
    ]),
    ("\nCLIP\n", [
        "outputs/imagenet_clipvitl336_1000classes.jsonl",
        "outputs/imagenet_evaclipg_1000classes.jsonl",
    ]),
]
for heading, results_paths in imagenet_sections:
    print(heading)
    for results_path in results_paths:
        get_accuracy(results_path)

In [None]:
# Flowers
# Each entry is (heading to print, prediction files reported under it).
flowers_sections = [
    ("\nOpen World\n", [
        "outputs/flowers_llava7b.jsonl",
        "outputs/flowers_llava13b.jsonl",
        "outputs/flowers_llavanext7bmistral.jsonl",
        "outputs/flowers_llavanext7bvicuna.jsonl",
        "outputs/flowers_blip2.jsonl",
        "outputs/flowers_iblip7b.jsonl",
        "outputs/flowers_iblip13b.jsonl",
    ]),
    ("\nClose World\n", [
        "outputs/flowers_llava7b_102classes.jsonl",
        "outputs/flowers_llava13b_102classes.jsonl",
        "outputs/flowers_llavanext7bmistral_102classes.jsonl",
        "outputs/flowers_llavanext7bvicuna_102classes.jsonl",
        "outputs/flowers_blip2_102classes.jsonl",
        "outputs/flowers_iblip7b_102classes.jsonl",
        "outputs/flowers_iblip13b_102classes.jsonl",
    ]),
    ("\nCLIP\n", [
        "outputs/flowers_clipvitl336_102classes.jsonl",
        "outputs/flowers_evaclipg_102classes.jsonl",
    ]),
]
for heading, results_paths in flowers_sections:
    print(heading)
    for results_path in results_paths:
        get_accuracy(results_path)

In [None]:
# Cars
# Each entry is (heading to print, prediction files reported under it).
cars_sections = [
    ("\nOpen World\n", [
        "outputs/cars_llava7b.jsonl",
        "outputs/cars_llava13b.jsonl",
        "outputs/cars_llavanext7bmistral.jsonl",
        "outputs/cars_llavanext7bvicuna.jsonl",
        "outputs/cars_blip2.jsonl",
        "outputs/cars_iblip7b.jsonl",
        "outputs/cars_iblip13b.jsonl",
    ]),
    ("\nClose World\n", [
        "outputs/cars_llava7b_196classes.jsonl",
        "outputs/cars_llava13b_196classes.jsonl",
        "outputs/cars_llavanext7bmistral_196classes.jsonl",
        "outputs/cars_llavanext7bvicuna_196classes.jsonl",
        "outputs/cars_blip2_196classes.jsonl",
        "outputs/cars_iblip7b_196classes.jsonl",
        "outputs/cars_iblip13b_196classes.jsonl",
    ]),
    ("\nCLIP\n", [
        "outputs/cars_clipvitl336_196classes.jsonl",
        "outputs/cars_evaclipg_196classes.jsonl",
    ]),
]
for heading, results_paths in cars_sections:
    print(heading)
    for results_path in results_paths:
        get_accuracy(results_path)

In [3]:
# Caltech
print("\nOpen World\n")
get_accuracy("outputs/caltech_llava7b.jsonl")
#get_accuracy("outputs/caltech_llava13b.jsonl")
#get_accuracy("outputs/caltech_llavanext7bmistral.jsonl")
#get_accuracy("outputs/caltech_llavanext7bvicuna.jsonl")
#get_accuracy("outputs/caltech_blip2.jsonl")
#get_accuracy("outputs/caltech_iblip7b.jsonl")
#get_accuracy("outputs/caltech_iblip13b.jsonl")
get_accuracy("outputs/caltech_llavaov7b.jsonl")
print("\nClose World\n")
get_accuracy("outputs/caltech_llava7b_100classes.jsonl")
#get_accuracy("outputs/caltech_llava13b_100classes.jsonl")
#get_accuracy("outputs/caltech_llavanext7bmistral_100classes.jsonl")
#get_accuracy("outputs/caltech_llavanext7bvicuna_100classes.jsonl")
#get_accuracy("outputs/caltech_blip2_100classes.jsonl")
#get_accuracy("outputs/caltech_iblip7b_100classes.jsonl")
#get_accuracy("outputs/caltech_iblip13b_100classes.jsonl")
get_accuracy("outputs/caltech_llavaov7b_100classes.jsonl")
#print("\nCLIP\n")
#get_accuracy("outputs/caltech_clipvitl336_100classes.jsonl")
#get_accuracy("outputs/caltech_evaclipg_100classes.jsonl")

print("\nCORRUPTIONS\n")
get_accuracy("outputs/caltech_corrupted_llava7b.jsonl")
get_accuracy("outputs/caltech_corrupted_llavaov7b.jsonl")
get_accuracy("outputs/caltech_corrupted_llava7b_100classes.jsonl")
get_accuracy("outputs/caltech_corrupted_llavaov7b_100classes.jsonl")

print("\nCORRUPTIONS_DIFFPROMPT\n")
get_accuracy("outputs/caltech_corrupted_llava7b_diffprompt.jsonl")
get_accuracy("outputs/caltech_corrupted_llavaov7b_diffprompt.jsonl")
get_accuracy("outputs/caltech_corrupted_llava7b_100classes_diffprompt.jsonl")
get_accuracy("outputs/caltech_corrupted_llavaov7b_100classes_diffprompt.jsonl")

print("\nCUSTOM_PROMPT\n")
get_accuracy("outputs/caltech_corrupted_llava7b_100classes_customprompt.jsonl")
get_accuracy("outputs/caltech_llava7b_100classes_customprompt.jsonl")
get_accuracy("outputs/caltech_corrupted_llavaov7b_100classes_customprompt.jsonl")
get_accuracy("outputs/caltech_llavaov7b_100classes_customprompt.jsonl")

# get list of incorrectly predicted images
# The last argument of each call is the JSONL file that receives a random
# sample of the selected items; it sits next to the .txt dump with the same
# stem. NOTE(review): these .jsonl paths are chosen by convention, confirm.
get_incorrect(
    "outputs/caltech_corrupted_llavaov7b_100classes.jsonl",
    "../image_generation/incorrects/caltech_corrupted_llavaov7b_100classes.txt",
    "../image_generation/incorrects/caltech_corrupted_llavaov7b_100classes.jsonl",
)

# images that were right on the clean run but wrong on the corrupted run
get_turned_incorrect(
    "outputs/caltech_corrupted_llavaov7b_100classes.jsonl",
    "outputs/caltech_llavaov7b_100classes.jsonl",
    "../image_generation/turned_incorrects/caltech_corrupted_llavaov7b_100classes.txt",
    "../image_generation/turned_incorrects/caltech_corrupted_llavaov7b_100classes.jsonl",
)





Open World

outputs/caltech_llava7b.jsonl 47.14846455782794 4331
outputs/caltech_llavaov7b.jsonl 51.39690602631 4331

Close World



outputs/caltech_llava7b_100classes.jsonl 62.01800969751511 4331
outputs/caltech_llavaov7b_100classes.jsonl 91.08750865848738 4331

CORRUPTIONS

outputs/caltech_corrupted_llava7b.jsonl 34.21842530592606 4331
outputs/caltech_corrupted_llavaov7b.jsonl 36.91987993534128 4331
outputs/caltech_corrupted_llava7b_100classes.jsonl 49.15723851303413 4331
outputs/caltech_corrupted_llavaov7b_100classes.jsonl 77.53405679979738 4331

CORRUPTIONS_DIFFPROMPT

outputs/caltech_corrupted_llava7b_diffprompt.jsonl 35.1189101823978 4331
outputs/caltech_corrupted_llavaov7b_diffprompt.jsonl 43.86977603323854 4331
outputs/caltech_corrupted_llava7b_100classes_diffprompt.jsonl 48.85707688754355 4331
outputs/caltech_corrupted_llavaov7b_100classes_diffprompt.jsonl 77.28007388592073 4331

CUSTOM_PROMPT

outputs/caltech_corrupted_llava7b_100classes_customprompt.jsonl 42.41514661739957 4331
outputs/caltech_llava7b_100classes_customprompt.jsonl 54.7448626183203 4331
outputs/caltech_corrupted_llavaov7b_100classes_custom

# Analysis

In [None]:
# Prompt
# One inner list per (dataset, model) pair, holding its four prompt variants.
prompt_variation_groups = [
    [
        "outputs/prompt/imagenet_llava7b_prompt1.jsonl",
        "outputs/prompt/imagenet_llava7b_prompt2.jsonl",
        "outputs/prompt/imagenet_llava7b_prompt3.jsonl",
        "outputs/prompt/imagenet_llava7b_prompt4.jsonl",
    ],
    [
        "outputs/prompt/flowers_llava7b_prompt1.jsonl",
        "outputs/prompt/flowers_llava7b_prompt2.jsonl",
        "outputs/prompt/flowers_llava7b_prompt3.jsonl",
        "outputs/prompt/flowers_llava7b_prompt4.jsonl",
    ],
    [
        "outputs/prompt/cars_llava7b_prompt1.jsonl",
        "outputs/prompt/cars_llava7b_prompt2.jsonl",
        "outputs/prompt/cars_llava7b_prompt3.jsonl",
        "outputs/prompt/cars_llava7b_prompt4.jsonl",
    ],
    [
        "outputs/prompt/caltech_llava7b_prompt1.jsonl",
        "outputs/prompt/caltech_llava7b_prompt2.jsonl",
        "outputs/prompt/caltech_llava7b_prompt3.jsonl",
        "outputs/prompt/caltech_llava7b_prompt4.jsonl",
    ],
    [
        "outputs/prompt/imagenet_blip2_prompt1.jsonl",
        "outputs/prompt/imagenet_blip2_prompt2.jsonl",
        "outputs/prompt/imagenet_blip2_prompt3.jsonl",
        "outputs/prompt/imagenet_blip2_prompt4.jsonl",
    ],
    [
        "outputs/prompt/flowers_blip2_prompt1.jsonl",
        "outputs/prompt/flowers_blip2_prompt2.jsonl",
        "outputs/prompt/flowers_blip2_prompt3.jsonl",
        "outputs/prompt/flowers_blip2_prompt4.jsonl",
    ],
    [
        "outputs/prompt/cars_blip2_prompt1.jsonl",
        "outputs/prompt/cars_blip2_prompt2.jsonl",
        "outputs/prompt/cars_blip2_prompt3.jsonl",
        "outputs/prompt/cars_blip2_prompt4.jsonl",
    ],
    [
        "outputs/prompt/caltech_blip2_prompt1.jsonl",
        "outputs/prompt/caltech_blip2_prompt2.jsonl",
        "outputs/prompt/caltech_blip2_prompt3.jsonl",
        "outputs/prompt/caltech_blip2_prompt4.jsonl",
    ],
]

print("\nPrompt Variation\n")
for group_index, results_paths in enumerate(prompt_variation_groups):
    # A blank line separates consecutive groups (none before the first).
    if group_index > 0:
        print()
    for results_path in results_paths:
        get_accuracy(results_path)

print("\nCoT\n")
# Closed-set Flowers baseline followed by its chain-of-thought counterpart.
for results_path in (
    "outputs/flowers_llava7b_102classes.jsonl",
    "outputs/prompt/flowers_llava7b_102classes_cot.jsonl",
):
    get_accuracy(results_path)

In [None]:
# N Classes
# Each entry is (heading to print, one list of 2/5/20/100-class files per dataset).
n_classes_sections = [
    ("\nLLaVA-1.5-7B\n", [
        [
            "outputs/n_classes/imagenet_llava7b_2classes.jsonl",
            "outputs/n_classes/imagenet_llava7b_5classes.jsonl",
            "outputs/n_classes/imagenet_llava7b_20classes.jsonl",
            "outputs/n_classes/imagenet_llava7b_100classes.jsonl",
        ],
        [
            "outputs/n_classes/flowers_llava7b_2classes.jsonl",
            "outputs/n_classes/flowers_llava7b_5classes.jsonl",
            "outputs/n_classes/flowers_llava7b_20classes.jsonl",
            "outputs/n_classes/flowers_llava7b_100classes.jsonl",
        ],
        [
            "outputs/n_classes/cars_llava7b_2classes.jsonl",
            "outputs/n_classes/cars_llava7b_5classes.jsonl",
            "outputs/n_classes/cars_llava7b_20classes.jsonl",
            "outputs/n_classes/cars_llava7b_100classes.jsonl",
        ],
        [
            "outputs/n_classes/caltech_llava7b_2classes.jsonl",
            "outputs/n_classes/caltech_llava7b_5classes.jsonl",
            "outputs/n_classes/caltech_llava7b_20classes.jsonl",
            "outputs/n_classes/caltech_llava7b_100classes.jsonl",
        ],
    ]),
    ("\nBLIP2\n", [
        [
            "outputs/n_classes/imagenet_blip2_2classes.jsonl",
            "outputs/n_classes/imagenet_blip2_5classes.jsonl",
            "outputs/n_classes/imagenet_blip2_20classes.jsonl",
            "outputs/n_classes/imagenet_blip2_100classes.jsonl",
        ],
        [
            "outputs/n_classes/flowers_blip2_2classes.jsonl",
            "outputs/n_classes/flowers_blip2_5classes.jsonl",
            "outputs/n_classes/flowers_blip2_20classes.jsonl",
            "outputs/n_classes/flowers_blip2_100classes.jsonl",
        ],
        [
            "outputs/n_classes/cars_blip2_2classes.jsonl",
            "outputs/n_classes/cars_blip2_5classes.jsonl",
            "outputs/n_classes/cars_blip2_20classes.jsonl",
            "outputs/n_classes/cars_blip2_100classes.jsonl",
        ],
        [
            "outputs/n_classes/caltech_blip2_2classes.jsonl",
            "outputs/n_classes/caltech_blip2_5classes.jsonl",
            "outputs/n_classes/caltech_blip2_20classes.jsonl",
            "outputs/n_classes/caltech_blip2_100classes.jsonl",
        ],
    ]),
    ("\nCLIP-VIT-L-336\n", [
        [
            "outputs/n_classes/imagenet_clipvitl336_2classes.jsonl",
            "outputs/n_classes/imagenet_clipvitl336_5classes.jsonl",
            "outputs/n_classes/imagenet_clipvitl336_20classes.jsonl",
            "outputs/n_classes/imagenet_clipvitl336_100classes.jsonl",
        ],
        [
            "outputs/n_classes/flowers_clipvitl336_2classes.jsonl",
            "outputs/n_classes/flowers_clipvitl336_5classes.jsonl",
            "outputs/n_classes/flowers_clipvitl336_20classes.jsonl",
            "outputs/n_classes/flowers_clipvitl336_100classes.jsonl",
        ],
        [
            "outputs/n_classes/cars_clipvitl336_2classes.jsonl",
            "outputs/n_classes/cars_clipvitl336_5classes.jsonl",
            "outputs/n_classes/cars_clipvitl336_20classes.jsonl",
            "outputs/n_classes/cars_clipvitl336_100classes.jsonl",
        ],
        [
            "outputs/n_classes/caltech_clipvitl336_2classes.jsonl",
            "outputs/n_classes/caltech_clipvitl336_5classes.jsonl",
            "outputs/n_classes/caltech_clipvitl336_20classes.jsonl",
            "outputs/n_classes/caltech_clipvitl336_100classes.jsonl",
        ],
    ]),
]

for heading, dataset_groups in n_classes_sections:
    print(heading)
    for group_index, results_paths in enumerate(dataset_groups):
        # A blank line separates consecutive datasets (none before the first).
        if group_index > 0:
            print()
        for results_path in results_paths:
            get_accuracy(results_path)

# Label Order

In [None]:
# Fixed label order in the closed-set prompt: LLaVA-7B runs, then BLIP2 runs.
fixed_order_runs = [
    "outputs/prompt/flowers_llava7b_102classes_fixedorder.jsonl",
    "outputs/prompt/cars_llava7b_196classes_fixedorder.jsonl",
    "outputs/prompt/caltech_llava7b_100classes_fixedorder.jsonl",
    "outputs/prompt/flowers_blip2_102classes_fixedorder.jsonl",
    "outputs/prompt/cars_blip2_196classes_fixedorder.jsonl",
    "outputs/prompt/caltech_blip2_100classes_fixedorder.jsonl",
]
for results_path in fixed_order_runs:
    get_accuracy(results_path)

# CoT

In [None]:
# Chain-of-thought on the closed-set prompts: LLaVA-7B runs, then BLIP2 runs.
cot_closed_set_runs = [
    "outputs/prompt/flowers_llava7b_102classes_cot.jsonl",
    "outputs/prompt/cars_llava7b_196classes_cot.jsonl",
    "outputs/prompt/caltech_llava7b_100classes_cot.jsonl",
    "outputs/prompt/flowers_blip2_102classes_cot.jsonl",
    "outputs/prompt/cars_blip2_196classes_cot.jsonl",
    "outputs/prompt/caltech_blip2_100classes_cot.jsonl",
]
for results_path in cot_closed_set_runs:
    get_accuracy(results_path)

In [None]:
# Chain-of-thought runs without a class list, one group per model.
cot_open_world_groups = [
    [
        "outputs/prompt/imagenet_llava7b_cot.jsonl",
        "outputs/prompt/flowers_llava7b_cot.jsonl",
        "outputs/prompt/cars_llava7b_cot.jsonl",
        "outputs/prompt/caltech_llava7b_cot.jsonl",
    ],
    [
        "outputs/prompt/imagenet_blip2_cot2.jsonl",
        "outputs/prompt/flowers_blip2_cot2.jsonl",
        "outputs/prompt/cars_blip2_cot2.jsonl",
        "outputs/prompt/caltech_blip2_cot2.jsonl",
    ],
]
for group_index, results_paths in enumerate(cot_open_world_groups):
    # A blank line separates the two groups.
    if group_index > 0:
        print()
    for results_path in results_paths:
        get_accuracy(results_path)