In [10]:
from PIL import Image
import requests
import torch

from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP checkpoint and the processor for the same checkpoint;
# the processor is used below to prepare both the text prompts and the image.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download the sample image. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() reports HTTP errors here instead
# of letting them show up later as a confusing image-decoding failure.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

# Inference only: disable autograd so no graph is recorded for the forward pass
# (the results otherwise carry a grad_fn and keep intermediate buffers alive).
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [11]:
# Display the full result of `model(**inputs)`. The repr contains the logits as
# well as the complete text and image embedding tensors, so the output is long.
outputs

CLIPOutput(loss=None, logits_per_image=tensor([[24.5701, 19.3049]], grad_fn=<TBackward0>), logits_per_text=tensor([[24.5701],
        [19.3049]], grad_fn=<MulBackward0>), text_embeds=tensor([[ 0.0148,  0.0070, -0.0234,  ..., -0.0508, -0.0438,  0.0033],
        [ 0.0087,  0.0258, -0.0387,  ..., -0.0547, -0.0242,  0.0112]],
       grad_fn=<DivBackward0>), image_embeds=tensor([[-9.7876e-03,  1.2770e-02, -2.7419e-02,  1.9676e-03, -5.9324e-03,
         -1.5613e-02, -1.2514e-02, -2.2666e-04,  4.3868e-02, -1.6322e-02,
          2.2630e-02, -3.5160e-02,  4.4751e-03, -1.2946e-02, -3.1524e-02,
         -1.1737e-02, -2.1543e-02, -2.7556e-02,  1.6562e-02,  4.5936e-03,
         -1.2106e-01, -3.0034e-03,  3.9024e-02, -3.0893e-02, -4.3864e-03,
          2.7598e-02,  2.2139e-02, -1.7064e-02,  1.4509e-02, -4.5194e-03,
         -7.1842e-03,  2.3971e-02, -6.8106e-03,  1.6382e-02, -5.3629e-02,
         -4.5561e-04,  2.5840e-02, -2.6581e-02,  1.7667e-02,  3.0216e-02,
         -9.3062e-03, -3.2082e-02,  6.6

In [7]:
# Re-display of the CLIP forward-pass result held in `outputs`
# (loss, logits_per_image, logits_per_text, text_embeds, image_embeds).
outputs

CLIPOutput(loss=None, logits_per_image=tensor([[24.5701, 19.3049]], grad_fn=<TBackward0>), logits_per_text=tensor([[24.5701],
        [19.3049]], grad_fn=<MulBackward0>), text_embeds=tensor([[ 0.0148,  0.0070, -0.0234,  ..., -0.0508, -0.0438,  0.0033],
        [ 0.0087,  0.0258, -0.0387,  ..., -0.0547, -0.0242,  0.0112]],
       grad_fn=<DivBackward0>), image_embeds=tensor([[-9.7876e-03,  1.2770e-02, -2.7419e-02,  1.9676e-03, -5.9324e-03,
         -1.5613e-02, -1.2514e-02, -2.2666e-04,  4.3868e-02, -1.6322e-02,
          2.2630e-02, -3.5160e-02,  4.4751e-03, -1.2946e-02, -3.1524e-02,
         -1.1737e-02, -2.1543e-02, -2.7556e-02,  1.6562e-02,  4.5936e-03,
         -1.2106e-01, -3.0034e-03,  3.9024e-02, -3.0893e-02, -4.3864e-03,
          2.7598e-02,  2.2139e-02, -1.7064e-02,  1.4509e-02, -4.5194e-03,
         -7.1842e-03,  2.3971e-02, -6.8106e-03,  1.6382e-02, -5.3629e-02,
         -4.5561e-04,  2.5840e-02, -2.6581e-02,  1.7667e-02,  3.0216e-02,
         -9.3062e-03, -3.2082e-02,  6.6

In [5]:
# Label probabilities for the image: softmax of `logits_per_image` taken across
# the text prompts that were passed to the processor.
probs

tensor([[0.9949, 0.0051]], grad_fn=<SoftmaxBackward0>)

In [None]:
from PIL import Image
import requests
from transformers import AutoProcessor, AutoTokenizer, CLIPModel

In [None]:
import torch

# Pick the accelerator to run on. Apple's Metal backend (MPS) stays the first
# choice, but fall back to CUDA and then to the CPU so that the later
# `.to(device)` calls also work on machines where MPS is not available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [12]:
# Get the image features
import torch  # imported here so the cell does not depend on the separate `import torch` cell

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Fetch the sample image; fail fast on a stalled connection or an HTTP error
# instead of hanging or handing an error page to the image decoder.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Image-only preprocessing (no text prompts in this cell).
inputs = processor(images=image, return_tensors="pt")

# Inference only: run the image encoder without recording an autograd graph.
with torch.no_grad():
    image_features = model.get_image_features(**inputs)


In [None]:
from tqdm import tqdm

In [None]:
import torch  # imported here so the cell does not depend on the separate `import torch` cell

# Repeat preprocessing + image encoding 1000 times; tqdm reports the iteration
# rate. Autograd is disabled so the measurement covers plain inference and no
# graph is built and discarded on every iteration.
with torch.no_grad():
    for _ in tqdm(range(1000)):
        inputs = processor(images=image, return_tensors="pt")
        image_features = model.get_image_features(**inputs)

In [None]:
import torch  # imported here so the cell does not depend on the separate `import torch` cell

# Preprocess the current `image` and encode it with the CLIP image tower.
inputs = processor(images=image, return_tensors="pt")

# Inference only: no autograd graph is needed for feature extraction.
with torch.no_grad():
    image_features = model.get_image_features(**inputs)

In [None]:
# Show only the image embedding field of the forward-pass result in `outputs`.
outputs.image_embeds

In [None]:
# Show the softmax-normalised image-text scores computed from `logits_per_image`.
probs

In [None]:
from PIL import Image
import requests
import torch
from transformers import AutoProcessor, AutoTokenizer, CLIPModel

# Reload the checkpoint and move its weights to the selected `device`
# (defined in the device-selection cell).
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

# Get the image features
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Fetch the sample image; fail fast on a stalled connection or an HTTP error
# instead of hanging or handing an error page to the image decoder.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# The preprocessed tensors must live on the same device as the model.
inputs = processor(images=image, return_tensors="pt").to(device)

# Inference only: run the image encoder without recording an autograd graph.
with torch.no_grad():
    image_features = model.get_image_features(**inputs)

print(image_features.shape)

In [None]:
import cv2

image_path = "data/image.png"
image = cv2.imread(image_path)

# cv2.imread does not raise when the file is missing or cannot be decoded; it
# returns None, which would otherwise surface as an opaque AttributeError on
# `.shape` below.
if image is None:
    raise OSError(f"cv2.imread could not read an image from {image_path!r}")

# OpenCV decodes colour images in BGR channel order, while the CLIP processor
# that consumes `image` in the following cells expects RGB. Convert here so the
# red and blue channels are not swapped when features are computed.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image.shape

In [None]:
import torch  # imported here so the cell does not depend on the separate `import torch` cell

# Preprocess the array loaded with OpenCV and move the tensors to the model's device.
inputs = processor(images=image, return_tensors="pt").to(device)

# Inference only: run the image encoder without recording an autograd graph.
with torch.no_grad():
    image_features = model.get_image_features(**inputs)

print(image_features.shape)

In [None]:
import torch  # imported here so the cell does not depend on the separate `import torch` cell

# Repeat preprocessing + image encoding 1000 times on a batch made of the same
# image three times; tqdm reports the iteration rate. Autograd is disabled so
# the measurement covers plain inference and no graph is built per iteration.
with torch.no_grad():
    for _ in tqdm(range(1000)):
        inputs = processor(images=[image, image, image], return_tensors="pt").to(device)
        image_features = model.get_image_features(**inputs)