In [6]:
# Load model directly
import os

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification

# Decide on the compute device up front so the model can be moved as soon as it is loaded.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor (used below for both image and text inputs) and the model
# are loaded from the same checkpoint name.
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = AutoModelForZeroShotImageClassification.from_pretrained(
    "openai/clip-vit-base-patch32"
).to(device)

In [23]:
# Sample images used to exercise the CLIP helpers in this notebook.
# NOTE(review): IMAGE_DIR is an absolute path on a local volume; point it at
# your own copy of the dataset before running.
IMAGE_DIR = "/Volumes/Cucumber/Projects/datasets/raw/hcmus-iid/train/images"
SAMPLE_IMAGES = [
    "0d123d7a5c3c0247218f44118e287c23.jpg",
    "0dda1efa3984aa384ff3620f6bc77912.jpg",
    "1b39c7c32abb00be6de96480a829650b.jpeg",
    "2e40286508d7a274fa0aeb9cc587a24b.jpg",
]

# Change the index to try another sample. The last entry is the one that was
# active (uncommented) in the original cell. Kept as a plain str path.
image_path = f"{IMAGE_DIR}/{SAMPLE_IMAGES[-1]}"

In [19]:
def encode_image(image_path):
    """Return the unit-normalised CLIP embedding of one image.

    Args:
        image_path: Either a filesystem path (``str`` or ``os.PathLike``),
            which is opened with PIL and converted to RGB, or an
            already-loaded image object (e.g. a ``PIL.Image``), which is
            handed to the processor unchanged.

    Returns:
        The tensor from ``model.get_image_features`` divided by its norm
        along the last dimension, so the dot product of two embeddings is
        their cosine similarity. The tensor stays on ``device``. (The
        recorded run in this notebook shows shape ``[1, 512]``.)
    """
    if isinstance(image_path, (str, os.PathLike)):
        # Image.open is lazy and keeps the file open. convert() forces the
        # pixel data to be read into a new image, so the handle can be
        # released as soon as the with-block exits.
        with Image.open(image_path) as opened:
            image = opened.convert("RGB")
    else:
        image = image_path  # already an image object; pass through as-is

    # Preprocess into model-ready tensors and move them next to the model.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
        # Scale to unit length for cosine-similarity comparisons.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    return image_features

# Usage: embed the image selected by `image_path` and report the embedding size.
image_embedding = encode_image(image_path)
print(f"Image embedding shape: {image_embedding.shape}")  # recorded run prints torch.Size([1, 512])

Image embedding shape: torch.Size([1, 512])


In [24]:
def classify_image(image_path, class_names, template="a photo of a {}"):
    """Zero-shot classify one image against a set of text labels with CLIP.

    Args:
        image_path: Anything ``PIL.Image.open`` accepts (a path or a file
            object), or an already-loaded ``PIL.Image.Image``, which is used
            as-is.
        class_names: Iterable of candidate labels. It is materialised into a
            list once, so generators are supported.
        template: Format string with one ``{}`` placeholder that turns each
            label into a text prompt.

    Returns:
        Dict mapping each class name to its probability as a Python float,
        in the order the names were given. Probabilities come from a softmax
        over ``logits_per_image`` across the supplied prompts. Duplicate
        names collapse into one key (the last occurrence wins).

    Raises:
        ValueError: If ``class_names`` is empty.
    """
    # The labels are walked twice (prompts, then results), so pin them down.
    class_names = list(class_names)
    if not class_names:
        raise ValueError("class_names must contain at least one label")

    if isinstance(image_path, Image.Image):
        image = image_path  # already loaded by the caller
    else:
        # Image.open is lazy and keeps the file open. convert() reads the
        # pixels into a new image so the handle is released on exit.
        with Image.open(image_path) as opened:
            image = opened.convert("RGB")

    # One text prompt per candidate label.
    text_descriptions = [template.format(cls) for cls in class_names]

    # Tokenise the prompts (padded to a common length) and preprocess the image.
    inputs = processor(
        text=text_descriptions,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        outputs = model(**inputs)
        # Row 0 is the single input image; softmax runs across the prompts.
        probs = outputs.logits_per_image.softmax(dim=1)

    # A single tolist() avoids one tensor-to-float conversion per class.
    return dict(zip(class_names, probs[0].tolist()))

# Usage: score the selected image against a handful of candidate labels.
classes = ["can", "bottle", "pack", "box", "number1"]
predictions = classify_image(image_path, classes)

# Report the labels from most to least probable.
sorted_predictions = sorted(
    predictions.items(),
    key=lambda item: item[1],
    reverse=True,
)
for class_name, prob in sorted_predictions:
    print(f"{class_name}: {prob:.3f}")

bottle: 0.832
can: 0.124
pack: 0.034
number1: 0.009
box: 0.001
