In [None]:
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint together with its matching processor
# (the processor is called below with both images and text).
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

dataset = load_dataset("sasha/dog-food")
# Slice the rows first and then pick the column: only the two requested
# examples are decoded, instead of materialising the whole "image" column
# of the test split and discarding all but two entries.
images = dataset["test"][:2]["image"]
labels = ["dog", "food"]
# padding=True pads the tokenized prompts to a common length so they can be
# stacked into one tensor. For these two labels the result is unchanged, but
# without it any label set with uneven token counts fails at tensor creation.
inputs = processor(images=images, text=labels, return_tensors="pt", padding=True)

print("input_ids :", inputs["input_ids"])
print("attention_mask :", inputs["attention_mask"])
print("pixel_values :", inputs["pixel_values"])
print("image_shape :", inputs["pixel_values"].shape)

In [None]:
import torch

# Put the model in evaluation mode before scoring the sample batch.
model.eval()

# Run the forward pass with gradient tracking disabled; the softmax over
# dim=1 normalises each image's scores across the candidate labels.
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)

print("outputs :", outputs.keys())
print("logits_per_image :", logits_per_image)
print("probs :", probs)

# Report, for every image, the probability assigned to each label.
for image_index, image_probs in enumerate(probs):
    print(f"- Image #{image_index}")
    for class_name, score in zip(labels, image_probs):
        print(f"{class_name} : {score:.4f}")

In [None]:
import evaluate
from torch.utils.data import DataLoader

def _collate_images_and_labels(batch):
    """Split a list of dataset rows into parallel lists of images and labels.

    Args:
        batch: list of row dicts, each holding an "image" and a "label" entry.

    Returns:
        A tuple ``(images, labels)`` of two lists in the same order as ``batch``.
    """
    return (
        [item["image"] for item in batch],
        [item["label"] for item in batch],
    )


test_dataloader = DataLoader(
    dataset["test"],
    batch_size=8,
    collate_fn=_collate_images_and_labels,
)

metric = evaluate.load("accuracy")
predictions, references = [], []
# Class names as stored in the dataset's label feature; the predicted index
# (position in this list) is compared against the dataset's integer labels.
labels_names = dataset["test"].features["label"].names

model.eval()
with torch.no_grad():
    # Batch-local names are used on purpose: reusing `images`, `labels`,
    # `inputs`, `outputs` or `probs` here would overwrite the notebook-level
    # variables defined by earlier cells and break re-running those cells.
    for batch_images, batch_labels in test_dataloader:
        # padding=True lets class names with different token counts be
        # stacked into a single tensor.
        batch_inputs = processor(
            images=batch_images,
            text=labels_names,
            return_tensors="pt",
            padding=True,
        )
        batch_outputs = model(**batch_inputs)

        # Softmax is monotonic, so argmax over the raw logits selects the
        # same class as argmax over the probabilities.
        batch_predictions = batch_outputs.logits_per_image.argmax(dim=1)

        predictions.extend(batch_predictions.cpu().tolist())
        references.extend(batch_labels)

results = metric.compute(predictions=predictions, references=references)
print(f"클래스 목록 : {labels_names}")
print(f"정확도 : {results['accuracy']*100 :.2f}%")