In [1]:
from PIL import Image
import requests
import torchvision
import torch

from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP weights and the preprocessor saved with the same
# checkpoint, so that tokenization and image preprocessing match the model.
checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

# Image that will be scored against the candidate captions.
image = Image.open("generated.png")

# Caption order matters: index 0 is "camouflaged", index 1 is "easy-to-see".
captions = [
    "a photo of a camouflaged crab",
    "a photo of an easy-to-see crab",
]

# Tokenize the captions (padded to a common length) and preprocess the image
# into PyTorch tensors in one call.
inputs = processor(
    text=captions,
    images=image,
    return_tensors="pt",
    padding=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# Track gradients on the preprocessed image tensor so that a later backward
# pass populates its .grad attribute.
inputs["pixel_values"].requires_grad = True

In [8]:
# backward() *accumulates* into .grad, so re-running this cell would add the
# new gradient on top of the previous one. Clear any stale gradient first so
# the cell gives the same result no matter how many times it is executed.
inputs.pixel_values.grad = None

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores
probs = logits_per_image.softmax(dim=1)  # softmax over the captions -> label probabilities

# Probability given to the caption at index 1 for the image at index 0.
target_prob = probs[0][1]
print(target_prob)

# Backpropagate that single probability; tensors with requires_grad=True
# (including inputs.pixel_values, if it was enabled) receive a gradient.
target_prob.backward()

tensor(0.6416, grad_fn=<SelectBackward0>)


In [9]:
# Gradient of the back-propagated probability with respect to the
# preprocessed image tensor (None if no backward pass has reached it yet).
inputs["pixel_values"].grad

tensor([[[[ 1.4080e-04, -3.0572e-04, -7.5578e-04,  ..., -2.8225e-04,
           -1.3286e-04, -6.1910e-06],
          [ 2.4908e-04,  8.9160e-05, -2.7698e-04,  ..., -4.5672e-05,
           -4.6278e-05,  2.0287e-04],
          [ 4.0082e-04, -1.5961e-04,  3.0719e-06,  ...,  4.0979e-05,
           -2.2251e-04,  1.7725e-04],
          ...,
          [-5.6736e-04, -1.7783e-04, -2.2961e-04,  ...,  5.9265e-04,
            3.3190e-04,  2.4395e-04],
          [-1.0303e-04, -3.4927e-05,  6.8434e-05,  ...,  1.4542e-04,
            8.1527e-05, -5.0722e-05],
          [-3.8030e-05,  1.4807e-04,  2.1165e-04,  ...,  3.6302e-04,
            2.2868e-04,  8.5291e-05]],

         [[-1.5988e-04, -2.2908e-04, -7.9835e-04,  ..., -1.4919e-04,
           -5.0156e-05,  1.5115e-04],
          [ 2.2029e-04,  1.2896e-04, -2.2392e-04,  ..., -3.7935e-04,
           -1.0394e-04,  7.9444e-05],
          [-3.6271e-04,  7.9844e-05, -1.8428e-04,  ..., -3.4500e-04,
           -2.5171e-04, -7.5710e-05],
          ...,
     