In [1]:
import json
import glob 
import os
import pandas as pd
from PIL import Image
import base64
import io
from openai import OpenAI
from pydantic import BaseModel
from textwrap import dedent
from tqdm import tqdm, trange


def base64_to_image(base64_str):
    """Decode a base64-encoded image payload and open it with PIL.

    Args:
        base64_str: Base64 text holding the bytes of an encoded image file.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    raw_bytes = base64.b64decode(base64_str)
    # Image.open is given a file-like object, so wrap the bytes in a buffer.
    return Image.open(io.BytesIO(raw_bytes))

# annotation_dir = "/pasteur2/u/suyc/VLMEval/AutoConverter/visualization"
# dataset_dir = "/pasteur2/u/suyc/VLMEval/VLMEvalKit/LMUData/"

# annotations = []
# for filename in glob.glob(annotation_dir + "/*/*.xlsx"):
#     print(filename)
#     basename = os.path.basename(filename)
#     data = pd.read_csv(dataset_dir + basename.replace('_correctness.xlsx', '-50.tsv'), sep='\t').to_dict('records')[:50]
#     annotation = pd.read_excel(filename).to_dict('records')[:50]
#     for item, anno in zip(data, annotation):
#         anno["filename"] = basename
#         anno["image"] = item["image"]
#     annotations += annotation

# # json.dump(annotations, open('annotations_20241109_0042.json', 'w'), indent=4)
# display(len(annotations), annotations[0])
# display(base64_to_image(annotations[0]["image"]))

# Load the cached annotation records. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage
# collector.
with open('annotations_20241109_0042.json') as annotations_file:
    annotations = json.load(annotations_file)

In [2]:
# Structured-output schema handed to the chat completion call as
# `response_format` by judge_multichoice_correctness_with_image.
# NOTE(review): documented with `#` comments rather than a docstring, since a
# class docstring may end up in the JSON schema sent to the model — confirm.
class Judgement(BaseModel):
    # Free-text explanation produced by the judge model.
    reasoning: str
    # Integer score; the judge's system prompt asks for a value on a 1-5 scale.
    correctness: int


# Module-level OpenAI client used by the request helpers in this notebook.
# `timeout=20` is forwarded to the constructor (presumably seconds — confirm
# against the SDK documentation).
client = OpenAI(timeout=20)

def judge_multichoice_correctness_with_image(image_base64: str, question: str, choices: list, correct_choice: str) -> dict:
    """Ask the judge model whether any distractor could also be a correct answer.

    The marked answer is presented to the model as ground truth; the model is
    asked to score, on a 1-5 scale, how clearly the remaining choices are wrong.

    Args:
        image_base64: Base64-encoded image, embedded in a ``data:image/jpeg``
            URL (assumes the payload is acceptable under that MIME type —
            TODO confirm for non-JPEG sources).
        question: Question text shown to the model.
        choices: All answer options; rendered with ``str()`` into the prompt.
        correct_choice: Text of the option marked as correct.

    Returns:
        A dict with the ``Judgement`` fields: ``reasoning`` (str) and
        ``correctness`` (int).

    Raises:
        ValueError: If the response carries no parsed ``Judgement`` (for
            example when the model refuses to answer).
    """
    # Plain string: the text has no placeholders, so no f-string is needed.
    system_prompt = """
    Your task is to evaluate a multiple-choice question (with accompanying image) to determine if any incorrect choices (distractors) could also be considered correct answers.

    CRITICAL: The marked correct answer MUST always be treated as valid and correct, regardless of your own assessment. Never question or evaluate the correct answer - your task is to accept it as an absolute truth and evaluate only whether other choices could also be correct.

    Score the question's correctness using this scale:
    5 - Perfect: All other choices are clearly incorrect
    4 - Good: Other choices are mostly wrong but have minor elements of correctness
    3 - Fair: At least one other choice could be partially correct
    2 - Poor: At least one other choice could be equally correct
    1 - Invalid: Multiple choices are equally valid as the correct answer

    Provide:
    1. Score (1-5)
    2. Brief explanation focusing specifically on any problematic distractor choices

    Remember: Never analyze whether the marked correct answer is right or wrong - it is ALWAYS correct by definition. Focus exclusively on whether other choices could also be valid answers.
    """

    prompt_template = """
    Question: {question}
    Choices: {choices}
    Correct Answer: {correct_choice}
    """
    # Dedent the template BEFORE substituting values: a question or answer that
    # contains its own newlines would otherwise remove the common indentation
    # that dedent() relies on and leave the prompt indented.
    prompt = dedent(prompt_template).format(
        question=question,
        choices=choices,
        correct_choice=correct_choice,
    )

    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": dedent(system_prompt)},
            {"role": "user", "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    }
                }
            ]}
        ],
        response_format=Judgement,
        temperature=0  # Set to 0 for deterministic responses
    )

    message = response.choices[0].message
    if message.parsed is None:
        # Fail with an explicit message instead of an AttributeError on None.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no parsed judgement (refusal: {refusal!r})")
    return message.parsed.dict()



In [3]:
# Try the judge on the first annotation record and show the structured result.
item = annotations[0]
prediction = judge_multichoice_correctness_with_image(
    item["image"],
    item["question"],
    [item[letter] for letter in ("A", "B", "C", "D")],
    item[item["answer"]],
)
display(prediction)

{'reasoning': 'The image shows a variety of clay vessels, which are typically associated with pottery. However, ceramics is a broader category that includes pottery, so it could also be considered partially correct. Glassblowing and sculpture are clearly incorrect as they involve different materials and techniques.',
 'correctness': 3}

In [None]:
# predictions = []
# for item in tqdm(annotations):
#     prediction = judge_multichoice_correctness_with_image(item["image"], item["question"], [item["A"], item["B"], item["C"], item["D"]], item[item["answer"]])
#     predictions.append(prediction)


from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

def process_item(item):
    """Run the judge on a single annotation record.

    Args:
        item: Mapping with ``image``, ``question``, the option texts under
            ``A``-``D``, and ``answer`` naming the key of the correct option.

    Returns:
        Whatever ``judge_multichoice_correctness_with_image`` returns for the
        record.
    """
    image_b64 = item["image"]
    question_text = item["question"]
    option_texts = [item[letter] for letter in "ABCD"]
    # `answer` stores the option key; look up the text of that option.
    marked_answer = item[item["answer"]]
    return judge_multichoice_correctness_with_image(
        image_b64, question_text, option_texts, marked_answer
    )

def parallel_judge(annotations, max_workers=32):
    """Judge every annotation record concurrently, preserving input order.

    Threads are used instead of worker processes: each task is a single
    network request, and a process pool would have to pickle ``process_item``
    (defined interactively in the notebook) plus every base64 image for each
    worker, which fails outright when the start method is not ``fork``.

    Args:
        annotations: Sequence of annotation records accepted by
            ``process_item``.
        max_workers: Maximum number of concurrent requests (default 32).

    Returns:
        A list of judge results in the same order as ``annotations``.
    """
    # Local import keeps this cell runnable regardless of which executor the
    # surrounding cell imports.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order; tqdm only wraps the
        # iterator to show progress.
        results = list(tqdm(executor.map(process_item, annotations), total=len(annotations)))
    return results

# Run the judge over every annotation record; each record is handled by
# process_item, which issues one model request.
predictions = parallel_judge(annotations)

100%|██████████| 450/450 [01:00<00:00,  7.46it/s]


In [137]:
# Persist the judge outputs. The context manager guarantees the file is
# flushed and closed before any later cell reads it back.
with open('predictions_20241109_0251_final.json', 'w') as predictions_file:
    json.dump(predictions, predictions_file, indent=4)

In [138]:
import json

# Reload both artifacts from disk so this cell does not depend on the judge
# having been run in the current kernel session. Context managers close the
# file handles promptly.
with open('predictions_20241109_0251_final.json') as predictions_file:
    predictions = json.load(predictions_file)
with open('annotations_20241109_0042.json') as annotations_file:
    annotations = json.load(annotations_file)

preds = [item["correctness"] for item in predictions]
labels = [item["correctness"] for item in annotations]

# The two lists are compared position by position below.
assert len(preds) == len(labels), "predictions and annotations are misaligned"

# compute accuracy, precision/recall/f1, auroc
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support


# AUROC uses the raw judge scores as the ranking signal.
auroc = roc_auc_score(labels, preds)
# Binarize: only a score of 1 is mapped to class 0; scores 2 and above map to 1.
preds_binary = [1 if pred >= 2 else 0 for pred in preds]

accuracy = accuracy_score(labels, preds_binary)
per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(labels, preds_binary, average=None)
# Index 0 selects the first class in the per-class arrays (presumably label 0,
# the class tallied in the loop below — confirm labels are 0/1).
precision = per_class_precision[0]
recall = per_class_recall[0]
f1 = per_class_f1[0]


print("AUROC:", auroc)
print("Accuracy:", accuracy)
print("Precision", precision)
print("Recall", recall)
print("F1:", f1)

# For each judge score 1-5, print: the score, how many questions with that
# score are labelled 0, and how many questions received that score in total.
for i in range(1, 6):
    selected_labels = [label for label, pred in zip(labels, preds) if pred == i]
    print(i, selected_labels.count(0), len(selected_labels))


AUROC: 0.7651577503429356
Accuracy: 0.8844444444444445
Precision 0.1111111111111111
Recall 0.022222222222222223
F1: 0.037037037037037035
1 1 9
2 8 22
3 22 67
4 1 7
5 13 345


In [139]:
# for pred, anno in zip(predictions, annotations):
#     if pred["correctness"] == 5 and anno["correctness"] == 0:
#         display(anno["question"], [anno["A"], anno["B"], anno["C"], anno["D"]], anno[anno["answer"]]) # , pred["reasoning"])
#         image = base64_to_image(anno["image"])
#         display(image)

In [76]:
# for pred, anno in zip(predictions, annotations):
#     display(pred)
#     display(anno)
#     image = base64_to_image(anno["image"]).resize((384, 384))
#     display(image)
#     input()

# Examine Data

In [23]:
# Read the tab-separated benchmark file and keep it as a list of per-row dicts.
data = (
    pd.read_csv("/pasteur2/u/suyc/VLMEval/VLMEvalKit/LMUData/VMCBench-1000-v2.tsv", sep="\t")
    .to_dict(orient="records")
)

In [84]:
# Inspect one record by list position (0-based) as the cell's displayed value.
data[336]

{'index': 337,
 'question': 'How to cook this dish?',
 'A': "This dish is called 'Lemon Garlic Butter Salmon'. This recipe combines lemon and garlic with butter, offering a rich buttery taste balanced by the zesty lemon.",
 'B': "This dish is called 'Lemon Herb Crusted Mahi Mahi'. It relies on lemon zest and herbs like rosemary and thyme to create a zesty crust. The cooking method involves baking the fish instead of skillet frying.",
 'C': "This dish is called 'Sweet and Sour Chicken'. The sauce combines tangy and sweet flavors, creating a classic sweet and sour experience.",
 'D': 'This dish is called "Ginger Glazed Mahi Mahi". This ginger sauce for mahi mahi is bursting with flavor and combines both sweet and sour taste sensations. This recipe is a snap and so delicious. You\'ll love it! \n\n\nPrep Time:\n5 mins\nCook Time:\n10 mins\nAdditional Time:\n20 mins\nTotal Time:\n35 mins\nServings:\n4\n\n Ingredients\n\n    3 tablespoons honey\n\n    3 tablespoons soy sauce\n\n    3 tablesp

# Refine 

In [166]:
# Structured-output schema handed to the chat completion call as
# `response_format` by refine_question_with_image.
class Question(BaseModel):
    # Refined distractor texts returned by the model.
    distractors: list[str]

# Rebinds the module-level client with the same constructor arguments used
# earlier in the notebook, so this section can run without the earlier cell.
client = OpenAI(timeout=20)

def refine_question_with_image(image_base64: str, question: str, choices: list, correct_choice: str) -> dict:
    """Ask the model to match the distractors' capitalization to the correct answer.

    Args:
        image_base64: Currently unused; kept so the signature parallels the
            judge helper (the image part of the request is disabled).
        question: Currently unused; the prompt only contains the answer and
            the distractors.
        choices: All answer options, including the correct one.
        correct_choice: Text of the option marked as correct.

    Returns:
        A dict with the ``Question`` field ``distractors`` (list of str).

    Raises:
        ValueError: If the response carries no parsed ``Question`` (for
            example when the model refuses to answer).
    """
    # Plain string: the text has no placeholders, so no f-string is needed.
    system_prompt = """
    Your task is to refine the distractors (incorrect options) of a multiple-choice question by matching their capitalization style with the correct answer:

    1. Analyze the capitalization pattern of the correct answer (e.g., "Full Capitals", "first Letter Only", "all lowercase", "camelCase", "Title Case")
    2. Modify each distractor to follow the same capitalization pattern as the correct answer
    - Every word position should match (first word, middle words, last word)
    - Special terms, acronyms, and proper nouns should maintain their standard capitalization
    3. Do not make any other changes to the distractors' content, length, or meaning
    4. Do not modify the correct answer or question itself

    If the distractors already match the correct answer's capitalization pattern, leave them unchanged.

    Please output only the refined distractors.
    """

    # Filter in list order rather than via set difference: set iteration order
    # is arbitrary, which would shuffle the distractors between runs and drop
    # duplicates, making the request (and the returned order) non-reproducible.
    distractors = [choice for choice in choices if choice != correct_choice]

    prompt_template = """
    Correct Answer: ```{correct_choice}```
    Distractors: ```{distractors}```
    """
    # Dedent the template BEFORE substituting values: an answer containing its
    # own newlines would otherwise defeat dedent() and leave the prompt indented.
    prompt = dedent(prompt_template).format(
        correct_choice=correct_choice,
        distractors=distractors,
    )
    # Echo the exact user message that is sent to the model.
    print(prompt)

    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": dedent(system_prompt)},
            {"role": "user", "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                # Image input is intentionally disabled for this text-only task.
                # {
                #     "type": "image_url",
                #     "image_url": {
                #         "url": f"data:image/jpeg;base64,{image_base64}"
                #     }
                # }
            ]}
        ],
        response_format=Question,
        temperature=0  # Set to 0 for deterministic responses
    )

    message = response.choices[0].message
    if message.parsed is None:
        # Fail with an explicit message instead of an AttributeError on None.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no parsed distractors (refusal: {refusal!r})")
    return message.parsed.dict()



In [182]:
# Try the capitalization refinement on a single annotation record.
item = annotations[103]  # candidate indices: 3, 103, 200
prediction = refine_question_with_image(
    item["image"],
    item["question"],
    [item[letter] for letter in ("A", "B", "C", "D")],
    item[item["answer"]],
)
display(prediction)


    Correct Answer: ```justin bieber and kim kardashian```
    Distractors: ```['Elon Musk and Mark Zuckerberg', 'Barack Obama and Evan Williams', 'Taylor Swift and Beyoncé']```
    


{'distractors': ['elon musk and mark zuckerberg',
  'barack obama and evan williams',
  'taylor swift and beyoncé']}