In [1]:
from openai import OpenAI
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
from pydantic import BaseModel
from enum import Enum
from tqdm import tqdm
import dotenv
import os

# Load variables from a local .env file (if present) into the process
# environment before anything below reads os.environ.
dotenv.load_dotenv()

# Root directory of the dataset. Set IFOOD_DATA_DIR (in the environment or in
# .env) to run this notebook on another machine; the default is the location
# this notebook was originally run from.
IFOOD_DATA_DIR = os.environ.get(
    "IFOOD_DATA_DIR", "/Users/maxence/Documents/ifood-2019-fgvc6"
)

# CSV of validation image names and their numeric labels.
DATA_PATH = os.path.join(IFOOD_DATA_DIR, "val_labels.csv")
# Directory holding the validation images. The trailing separator is kept on
# purpose: image paths are built by plain concatenation (IMAGES_PATH + name).
IMAGES_PATH = os.path.join(IFOOD_DATA_DIR, "val_set", "")

# The 25 food classes evaluated in this notebook. The names are used to select
# rows of the class list, are listed (in this order) in the prompt sent to the
# model, and are the member names of the FoodCategory enum.
CLASSES = [
    "chocolate_mousse",
    "panna_cotta",
    "churro",
    "creme_brulee",
    "dumpling",
    "macaroni_and_cheese",
    "boiled_egg",
    "risotto",
    "gnocchi",
    "macaron",
    "cannoli",
    "linguine",
    "eggs_benedict",
    "burrito",
    "apple_pie",
    "grilled_cheese_sandwich",
    "onion_rings",
    "ice_cream",
    "edamame",
    "fried_rice",
    "filet_mignon",
    "tempura",
    "lasagna",
    "donut",
    "pancake",
]

# Register tqdm's pandas integration; this is what provides the
# `progress_apply` method used for the classification run.
tqdm.pandas()

In [2]:
# class_list.txt sits in the same directory as the labels CSV, so derive its
# location from DATA_PATH instead of repeating the absolute dataset path.
CLASS_LIST_PATH = os.path.join(os.path.dirname(DATA_PATH), "class_list.txt")

# The file is read as space-separated with no header row: column 0 becomes the
# numeric label, column 1 the class name.
classes_df = (
    pd.read_csv(CLASS_LIST_PATH, sep=" ", header=None)
    .rename(columns={0: "labels", 1: "class"})
    .set_index("class")
    # Keep only the classes under evaluation (KeyError if one is missing).
    .loc[CLASSES]
    .reset_index()
    # Final shape: indexed by numeric label, one "class" column, sorted by label.
    .set_index("labels")
    .sort_index()
)
classes_df

Unnamed: 0_level_0,class
labels,Unnamed: 1_level_1
0,macaron
8,dumpling
19,cannoli
29,apple_pie
30,risotto
57,gnocchi
62,creme_brulee
67,tempura
75,linguine
76,edamame


In [3]:
# Load the validation labels, keep only images whose label belongs to one of
# the selected classes, and attach the human-readable class name.
images_df = (
    pd.read_csv(DATA_PATH)
    .rename(columns={"label": "true_labels"})
    # Discard every class we are not evaluating
    .loc[lambda frame: frame["true_labels"].isin(classes_df.index)]
    .reset_index(drop=True)
    # Look up the class name for each numeric label
    .join(classes_df, on="true_labels")
    .rename(columns={"class": "true_class"})
)

print(len(images_df))
images_df.head(n=20)

1263


Unnamed: 0,img_name,true_labels,true_class
0,val_000006.jpg,201,grilled_cheese_sandwich
1,val_000008.jpg,19,cannoli
2,val_000013.jpg,0,macaron
3,val_000015.jpg,201,grilled_cheese_sandwich
4,val_000019.jpg,29,apple_pie
5,val_000020.jpg,155,churro
6,val_000025.jpg,172,chocolate_mousse
7,val_000053.jpg,79,macaroni_and_cheese
8,val_000063.jpg,30,risotto
9,val_000074.jpg,79,macaroni_and_cheese


In [4]:
# One class name per line, in the order given by CLASSES.
classes_text = "\n".join(CLASSES)

# Instruction followed by the list of allowed category names. The leading and
# trailing newlines are part of the prompt text.
PROMPT = (
    "\n"
    "Classify the following image into one of the following categories. "
    "Return the category name.\n"
    + classes_text
    + "\n"
)

print(PROMPT)


Classify the following image into one of the following categories. Return the category name.
chocolate_mousse
panna_cotta
churro
creme_brulee
dumpling
macaroni_and_cheese
boiled_egg
risotto
gnocchi
macaron
cannoli
linguine
eggs_benedict
burrito
apple_pie
grilled_cheese_sandwich
onion_rings
ice_cream
edamame
fried_rice
filet_mignon
tempura
lasagna
donut
pancake



In [5]:
# API client; the key is read from the OPENAI_API_KEY environment variable
# (None is passed through if the variable is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# String-valued enum of the allowed labels: each member's name and value are
# both the class string, so the response schema enumerates the class names.
# (Passing the bare list to the functional API would assign integer values
# 1..N instead.)
FoodCategory = Enum("FoodCategory", {name: name for name in CLASSES}, type=str)


class ClassificationOutput(BaseModel):
    """Structured response schema: exactly one of the known food classes."""

    # Store the enum's plain string value after validation, so code reading
    # `category_name` keeps receiving a `str`.
    model_config = {"use_enum_values": True}

    # Restricting the field to FoodCategory prevents the model from answering
    # with a label outside CLASSES.
    category_name: FoodCategory

def encode_image(image):
    """Encode an image as a base64 JPEG data URL, resized to 512x512.

    Args:
        image: A PIL image (any object exposing `mode`, `convert`, `resize`
            and `save` works).

    Returns:
        A string of the form "data:image/jpeg;base64,<payload>".
    """
    # The JPEG writer rejects modes such as RGBA (alpha) or P (palette), so
    # normalise anything that is not already RGB or greyscale before saving.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    # Fixed 512x512 target; the aspect ratio is not preserved.
    image = image.resize((512, 512))

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    image_str = base64.b64encode(buffer.getvalue()).decode("utf-8")

    return f"data:image/jpeg;base64,{image_str}"

def classify_image(image):
    """Ask the model to classify one image and return the predicted category.

    Sends PROMPT together with the image (encoded by `encode_image`) to
    "gpt-4o-mini" and parses the reply into `ClassificationOutput`.

    Args:
        image: The image to classify, as accepted by `encode_image`.

    Returns:
        The `category_name` of the parsed reply, or None when the reply
        carries no parsed content (for example when the model refuses).
    """
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": encode_image(image),
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=300,
        response_format=ClassificationOutput,
    )

    parsed = response.choices[0].message.parsed
    if parsed is None:
        # No structured payload for this image. Report it as a missing
        # prediction instead of raising AttributeError and aborting the
        # whole batch.
        return None

    return parsed.category_name

def _classify_file(img_name):
    """Open one image from IMAGES_PATH, classify it, and close the file.

    The context manager guarantees the underlying file is released even if
    encoding or the API call raises.
    """
    with Image.open(IMAGES_PATH + img_name) as image:
        return classify_image(image)


# Uncomment to test the function
# images_df = images_df.sample(5)

# One API call per image, with a tqdm progress bar.
images_df["predicted_class"] = images_df["img_name"].progress_apply(_classify_file)

100%|██████████| 1263/1263 [54:46<00:00,  2.60s/it] 


In [6]:
# Map each predicted class name back to its numeric label
labels_df = classes_df.reset_index().set_index("class")
images_df = (
    images_df
    # Drop a column left over from a previous run of this cell; without this,
    # re-running would produce two "predicted_labels" columns.
    .drop(columns=["predicted_labels"], errors="ignore")
    .join(labels_df, on="predicted_class")
    .rename(columns={"labels": "predicted_labels"})
)

RESULTS_PATH = "../data/results/gpt-4o-mini.csv"
# Create the output directory first so the results of the long classification
# run are not lost to a missing-folder error.
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
images_df.to_csv(RESULTS_PATH, index=False)

In [7]:
# Reload the saved predictions so the metric can be recomputed without
# re-running the classification.
results_df = pd.read_csv("../data/results/gpt-4o-mini.csv")

# Fraction of rows whose predicted class equals the true class. A missing
# prediction never compares equal, so it counts as an error. Vectorised mean
# of the boolean mask (yields nan for an empty frame).
accuracy = (results_df["true_class"] == results_df["predicted_class"]).mean()

print(f"Accuracy: {accuracy}")

Accuracy: 0.9501187648456056
