In [1]:
from model import Icon, Dataset
from llm import GPT, Claude, Llava

import openai
import anthropic
from ollama import Client

from dotenv import load_dotenv
from IPython.core.display import HTML

# import evaluate
# import pandas as pd

# from bert_score import BERTScorer
# from scipy.stats import ttest_rel

# Datasets

### Load Datasets from Saved Pickle File

In [2]:
# Restore the previously saved k-shot split instead of re-splitting.
# Both datasets start out empty-named and are then populated from disk.
# NOTE(review): load_pkl presumably unpickles the file -- only point it at
# pickles produced by this project.
train, test = Dataset(""), Dataset("")
for split_ds, pkl_path in (
    (train, "k-shot_train.pkl"),
    (test, "k-shot_test.pkl"),
):
    split_ds.load_pkl(pkl_path)

In [18]:
# Sanity check: number of icons in the train split, then in the test split.
print(f"{len(train.icons)} {len(test.icons)}")

20 388


### Load Dataset and Do Train/Test Split

In [3]:
# Disabled cell: builds the k-shot split from the full ground-truth pickle,
# requesting n = 20 from split_train_test (the recorded outputs show 20 train
# icons and 388 test icons).
# NOTE(review): presumably kept only for reference now that the split is
# loaded from saved pickles -- confirm before re-enabling, since a fresh split
# may not match the saved one.
# dataset = Dataset("")
# dataset.load_pkl("ground-truth.pkl")
# dataset.name = "all manuals - multi-ground-truth - k-shot"
# train, test = dataset.split_train_test(n = 20)

In [4]:
# len(train.icons)

20

In [5]:
# len(test.icons)

388

In [6]:
# Disabled cell: persists the freshly created split.
# NOTE(review): these file names ("train.pkl" / "test.pkl") differ from the
# "k-shot_train.pkl" / "k-shot_test.pkl" files that the notebook loads --
# confirm which names are intended before re-enabling.
# train.save_pkl("train.pkl")
# test.save_pkl("test.pkl")

# Generate Descriptions

In [5]:
# API keys are stored in environment variables; load_dotenv() is called first
# so the clients below can be constructed without passing keys explicitly.
load_dotenv()

# One client object per provider.
openai_client = openai.OpenAI()
anth_client = anthropic.Anthropic()

# Ollama client; pass host=<ollama_api_endpoint> if necessary.
llava_client = Client()

# Prompt text shared by every model and treatment in this notebook.
#
# Both constants are built from adjacent string literals, which Python joins
# with no separator. Every fragment therefore has to end with its own trailing
# space, otherwise two sentences are glued together in the final prompt.

# Framing text passed as `instructions` to get_captions.
INSTRUCTIONS = (
    "You are an AI visual assistant specialized in interpreting icons displayed on the dashboard of a vehicle. "
    "An icon communicates important information about the vehicle to the driver. "
    "For example, a particular icon may indicate that a seatbelt is not fastened. "
    "You are seeing an image of a single dashboard icon. "
)

# Task text passed as `prompt` to get_captions.
# It ends with "associated text: " -- presumably the icon's context text is
# appended after it when use_context=True; confirm in llm.py.
# NOTE: the fragment ending in "icon's meaning. " previously lacked its
# trailing space, producing "meaning.Format" in the assembled prompt. Captions
# generated before this correction used the glued wording.
PROMPT = (
    "Briefly describe the dashboard icon depicted in the image, focusing on the visual content of the image and meaning of the icon. "
    "Limit your response to 2 sentences. The first sentence should describe the visual content. The second sentence should describe the icon's meaning. "
    "Format your response as a JSON object with the following keys: 'visual_content', 'meaning'. "
    "Respond only with the JSON object. "
    "The image has the following associated text: "
)

## GPT-4o, Claude 3.5, LLaVA

In [6]:
# Wrap each provider client together with the model identifier it should use.
gpt = GPT(
    openai_client,
    "gpt-4o",
)
claude = Claude(
    anth_client,
    "claude-3-5-sonnet-20240620",
)
llava = Llava(
    llava_client,
    "llava:34b",
)

### Treatment #1: k=0,1,3,5 (Image + Context)

In [7]:
# GPT captions for Treatment #1 (image + context), one pass per value of k.
# Single-setting alternative kept from the original cell: `for k in [0]:`
for k in (0, 5, 1, 3):
    treatment_name = f"gpt-4_k-{k}_closest"
    gpt.get_captions(
        train=train,
        test=test,
        k=k,
        treatment=treatment_name,
        instructions=INSTRUCTIONS,
        prompt=PROMPT,
        use_image=True,
        use_context=True,
        max_tokens=1000,
    )
    # The JSON dump sits inside the loop, so it is rewritten after every k.
    # Pickle dump left disabled: test.save_pkl("k-shot.pkl")
    test.save_json("../outputs/temp-k-shot.json")

In [9]:
# Claude captions for Treatment #1 (image + context), k = 1, 3, 5 only.
# rerun=True is passed through to get_captions unchanged.
# NOTE(review): the recorded run of this cell ended with an
# InternalServerError (HTTP 500) from the API, so results for some k may be
# missing from the saved files -- confirm before relying on them.
for k in (1, 3, 5):
    treatment_name = f"claude-3-5_k-{k}_closest"
    claude.get_captions(
        train=train,
        test=test,
        k=k,
        treatment=treatment_name,
        instructions=INSTRUCTIONS,
        prompt=PROMPT,
        use_image=True,
        use_context=True,
        max_tokens=1000,
        rerun=True,
    )
    # Both dumps sit inside the loop: pickle first, then JSON, after every k.
    test.save_pkl("k-shot_test.pkl")
    test.save_json("../outputs/k-shot.json")

InternalServerError: Error code: 500 - {'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal server error'}}


In [9]:
# LLaVA captions for Treatment #1 (image + context), one pass per value of k.
for k in (0, 5, 1, 3):
    treatment_name = f"llava_k-{k}_closest"
    llava.get_captions(
        train=train,
        test=test,
        k=k,
        treatment=treatment_name,
        instructions=INSTRUCTIONS,
        prompt=PROMPT,
        use_image=True,
        use_context=True,
        max_tokens=1000,
    )
    # The JSON dump sits inside the loop, so it is rewritten after every k.
    # Pickle dump left disabled: test.save_pkl("k-shot.pkl")
    test.save_json("../outputs/temp-k-shot.json")

In [3]:
# Persist both splits, each as JSON under ../outputs and as a pickle in the
# working directory. The test split is written first, then the train split.
for split_ds, json_path, pkl_path in (
    (test, "../outputs/k-shot_test.json", "k-shot_test.pkl"),
    (train, "../outputs/k-shot_train.json", "k-shot_train.pkl"),
):
    split_ds.save_json(json_path)
    split_ds.save_pkl(pkl_path)