In [1]:
import json
import glob
import os
import pandas as pd
from PIL import Image
import base64
import io
from openai import OpenAI
from pydantic import BaseModel
from textwrap import dedent
from tqdm import tqdm, trange
from copy import deepcopy
import random


# Module-level OpenAI client used by the request helper defined later in this cell.
# `timeout=20` is passed to the client constructor (per-request timeout, in seconds).
client = OpenAI(timeout=20)


def base64_to_image(base64_str):
    """Turn a base64-encoded image payload into a PIL image object.

    Args:
        base64_str: Base64 text holding the bytes of an encoded image file.

    Returns:
        Whatever ``Image.open`` returns for the decoded bytes.
    """
    # Decode first so malformed base64 fails before any image parsing happens.
    decoded = base64.b64decode(base64_str)
    buffer = io.BytesIO(decoded)
    return Image.open(buffer)


# Structured-output schema handed to the chat completion call as `response_format`:
# the model's reply is parsed into a single field holding a list of distractor strings.
# NOTE(review): documented with `#` comments on purpose — a class docstring on a pydantic
# model may be emitted as the schema description and so alter the request; confirm before
# converting this to a docstring.
class Question(BaseModel):
    distractors: list[str]


# NOTE(review): this repeats the identical `client = OpenAI(timeout=20)` assignment made
# earlier in this cell; the rebinding has no additional effect and one of the two can go.
client = OpenAI(timeout=20)


def judge_multichoice_correctness_with_image(
    image_base64: str, question: str, choices: list, correct_choice: str
) -> dict:
    """Ask GPT-4o for three distractors for an image-based question.

    Despite its name, this function does not judge anything: it sends the
    question text, the correct answer and the image to the model and returns
    the wrong-answer options ("distractors") the model proposes.

    Args:
        image_base64: Base64-encoded image; embedded in the request as a
            ``data:image/jpeg;base64,...`` URL.
        question: Question text.
        choices: Existing answer options. Accepted so callers keep working,
            but not used when building the request.
        correct_choice: Text of the correct answer.

    Returns:
        ``{"distractors": [...]}`` holding the strings proposed by the model.

    Raises:
        ValueError: If the response carries no parsed payload (for example
            when the model refuses to answer).
    """
    # Assemble the prompt line by line. An indented triple-quoted literal whose
    # first line sits on the opening-quote line has no common indentation, so
    # `dedent` would leave the "Question"/"Answer" lines with leading spaces.
    prompt = (
        "Generate 3 distractors for the following question with image:\n"
        f"Question: {question}\n"
        f"Answer: {correct_choice}\n"
    )

    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
                    },
                ],
            },
        ],
        response_format=Question,
        temperature=0,  # Greedy decoding, to keep repeated runs as stable as possible
    )

    message = response.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "Model returned no parsed distractors "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )

    # Read the field directly instead of going through the model's `.dict()`
    # method; the returned mapping has the same single key.
    return {"distractors": list(parsed.distractors)}




In [2]:
# Alternative input (disabled): a JSONL file read one JSON object per line.
# data = [json.loads(line) for line in open("/pasteur2/u/yuhuiz/CVPR/AutoConverter/data/MMMU_DEV_VAL_4choices_20241102_2028.jsonl")]
# Read the tab-separated MMMU-500-v2 file and turn it into a list with one dict per row.
data = pd.read_csv("/pasteur2/u/yuhuiz/CVPR/AutoConverter/data/MMMU-500-v2.tsv", sep="\t").to_dict(orient="records")


# Optional fixed-seed subsample of 300 records (disabled).
# import random
# random.seed(134)
# data = random.sample(data, 300)
# Last expression of the cell: displays how many records were loaded.
len(data)

500

In [4]:
from copy import deepcopy

def process_item(item, rng=None):
    """Replace a record's wrong options with model-generated distractors.

    The correct answer text is kept, three distractors are requested from
    ``judge_multichoice_correctness_with_image``, the four options are
    shuffled, and the ``"A"``-``"D"`` and ``"answer"`` fields are rewritten
    to match the new order.

    Args:
        item: Record dict with keys ``"image"``, ``"question"``, ``"A"``,
            ``"B"``, ``"C"``, ``"D"`` and ``"answer"`` (the letter of the
            correct option). The input is not modified.
        rng: Optional object with a ``shuffle`` method (for example a seeded
            ``random.Random``) for reproducible ordering. Defaults to the
            module-level ``random`` generator.

    Returns:
        A copy of ``item`` with the new options and the updated answer letter.

    Raises:
        ValueError: If fewer than three usable distractors come back.
    """
    if rng is None:
        rng = random

    item = deepcopy(item)
    answer = item[item["answer"]]
    generated = judge_multichoice_correctness_with_image(
        item["image"],
        item["question"],
        [item["A"], item["B"], item["C"], item["D"]],
        answer,
    )

    # Drop distractors that repeat the correct answer or each other: either
    # case would leave the question with two indistinguishable options.
    distractors = []
    for candidate in generated["distractors"]:
        if candidate != answer and candidate not in distractors:
            distractors.append(candidate)

    if len(distractors) < 3:
        raise ValueError(
            f"Expected 3 distinct distractors, got {len(distractors)}: {distractors!r}"
        )

    # Any extras beyond the three requested are ignored so the four-way
    # assignment below cannot fail.
    options = distractors[:3] + [answer]
    rng.shuffle(options)
    for letter, option in zip("ABCD", options):
        item[letter] = option
    item["answer"] = "ABCD"[options.index(answer)]
    return item

# Try the pipeline on the first record; this issues one live API request.
# The returned dict still contains the full base64 "image" string, so the
# displayed result is very long.
process_item(data[0])

{'index': 1,
 'question': 'Write the signal f(t) shown in <image 1> using the step functions.',
 'A': '5t^2 [u(t) - u(t - 2)] + 20[u(t - 2) - u(t - 5)] + 15(t - 7)[u(t - 5) - u(t - 7)]',
 'B': '5t^2 [u(t) - u(t - 2)] + 25[u(t - 2) - u(t - 5)] + 10(t - 7)[u(t - 5) - u(t - 7)]',
 'C': '5t^2 [u(t) - u(t - 3)] + 20[u(t - 3) - u(t - 6)] + 15(t - 7)[u(t - 6) - u(t - 8)]',
 'D': '5t^2 [u(t) - u(t - 1)] + 20[u(t - 1) - u(t - 4)] + 15(t - 6)[u(t - 4) - u(t - 6)]',
 'answer': 'A',
 'image': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAEfAhABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APf6KKKKrfarb7X9l8+P7Tt3+TuG7b0zjrirNY3iXxDZeFtButWv5AkUK/KCfvt2Ue5NeefDb4ja/

In [5]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

def parallel_judge(data, max_workers=32):
    """Apply ``process_item`` to every record concurrently, keeping input order.

    A thread pool is used because the per-record work is a network request.
    Threads also avoid pickling each record (including its base64 image) to
    and from worker processes, and do not require notebook-defined functions
    to be importable by a child process.

    Args:
        data: Sequence of record dicts accepted by ``process_item``.
        max_workers: Maximum number of concurrent requests (default 32).

    Returns:
        List of processed records, in the same order as ``data``.
    """
    # Imported locally so this definition does not rely on another cell's imports.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # `executor.map` yields results in submission order; tqdm only wraps
        # the iterator to show progress.
        results = list(tqdm(executor.map(process_item, data), total=len(data)))
    return results

# Generate new distractors for every record in `data` (one API request per record).
outputs = parallel_judge(data)

100%|██████████| 500/500 [00:42<00:00, 11.86it/s]


In [6]:
# Save the rewritten records as JSON Lines: one JSON-serialized record per line.
# Opening with "w" replaces any existing file at this path.
with open("/pasteur2/u/yuhuiz/CVPR/AutoConverter/data/MMMU-500-v2-naive.jsonl", "w") as f:
    for item in outputs:
        f.write(json.dumps(item) + "\n")

In [None]:
# predictions = []
# for item in tqdm(annotations):
#     prediction = judge_multichoice_correctness_with_image(item["image"], item["question"], [item["A"], item["B"], item["C"], item["D"]], item[item["answer"]])
#     predictions.append(prediction)


from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm


def parallel_judge(data, max_workers=32):
    """Apply ``process_item`` to every record concurrently, keeping input order.

    A thread pool is used because the per-record work is a network request.
    Threads also avoid pickling each record (including its base64 image) to
    and from worker processes, and do not require notebook-defined functions
    to be importable by a child process.

    Args:
        data: Sequence of record dicts accepted by ``process_item``.
        max_workers: Maximum number of concurrent requests (default 32).

    Returns:
        List of processed records, in the same order as ``data``.
    """
    # Imported locally so this definition does not rely on another cell's imports.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # `executor.map` yields results in submission order; tqdm only wraps
        # the iterator to show progress.
        results = list(tqdm(executor.map(process_item, data), total=len(data)))
    return results

# NOTE(review): this is the same call as the earlier `outputs = parallel_judge(data)`;
# running it repeats one API request per record. Confirm it is still needed.
predictions = parallel_judge(data)