CLIP
====

**Learning Transferable Visual Models From Natural Language Supervision**

* Paper: https://arxiv.org/abs/2103.00020

![CLIP](../assets/clip-overview.png)

```bash
pip install torch torchvision
pip install transformers
pip install matplotlib
pip install supervision
```

In [1]:
from PIL import Image
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel

# Single source of truth for the checkpoint id, so the model weights and the
# processor (used below for both the image and the text inputs) can never be
# loaded from two different checkpoints by accident.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Run on the GPU when one is available, otherwise fall back to the CPU.
# Every input tensor created later is moved to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

### Image Embedding

In [2]:
image_path = "../samples/plants.jpg"

# Convert to RGB so the processor always receives a three-channel image.
image = Image.open(image_path).convert("RGB")
print(image.size)

# Image-only call: `padding` is a text/tokenizer option and is not needed
# here, so it is intentionally not passed.
inputs = processor(images=image, return_tensors="pt")
# Move every tensor to the device the model lives on.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
print(inputs.keys())
print(inputs["pixel_values"].shape)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    img_embeds = model.get_image_features(
        pixel_values=inputs["pixel_values"]
    )
print(img_embeds.shape)


(1068, 1137)
dict_keys(['pixel_values'])
torch.Size([1, 3, 224, 224])
torch.Size([1, 512])


### Text Embeddings

In [3]:
# Candidate captions to compare against the image.
# NOTE(review): "phot" and "two indoor plant" look like typos. They are kept
# verbatim because editing a prompt changes its embedding and would make the
# recorded outputs of this notebook stale -- fix them and re-run if desired.
texts = [
    "a photo of an indoor plant",
    "a photo of two indoor plant in a golden vase",
    "a phot of a cat sitting on a couch"
]

# padding=True pads the shorter captions so the batch is one rectangular
# tensor; the processor also returns the matching attention_mask.
inputs = processor(
    text=texts, return_tensors="pt", padding=True
)
# Move every tensor to the device the model lives on.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
print(inputs.keys())
print(inputs["input_ids"].shape)

# Inference only, so gradient tracking is disabled. The attention mask is
# forwarded together with the token ids so the padded positions are
# explicitly marked as padding instead of being silently dropped.
with torch.no_grad():
    txt_embeds = model.get_text_features(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
print(txt_embeds.shape)

dict_keys(['input_ids', 'attention_mask'])
torch.Size([3, 12])
torch.Size([3, 512])


### Image-Text Matching

In [4]:
# Bring both embedding tensors over to NumPy for inspection.
img_embeds_array = img_embeds.detach().cpu().numpy()
txt_embeds_array = txt_embeds.detach().cpu().numpy()

# Show the L2 norm of every embedding row; keepdims=True keeps the reduced
# axis, so the norms are printed as a column (one row per embedding).
print("Before normalization:")
for embeds in (img_embeds_array, txt_embeds_array):
    print(np.linalg.norm(embeds, ord=2, axis=-1, keepdims=True))

Before normalization:
[[10.161607]]
[[9.406647]
 [7.691049]
 [8.129713]]


In [5]:
# NOTE: when the tensors already live on the CPU, `.detach().cpu().numpy()`
# returns an array that shares memory with the tensor. Any in-place change
# to these arrays would therefore also overwrite `img_embeds` / `txt_embeds`.
img_embeds_array = img_embeds.detach().cpu().numpy()
txt_embeds_array = txt_embeds.detach().cpu().numpy()

# print L2 norm of the embeddings
print("Before normalization:")
print(np.linalg.norm(img_embeds_array, ord=2, axis=-1))
print(np.linalg.norm(txt_embeds_array, ord=2, axis=-1))

# Normalize each embedding row by its L2 norm. The division is done
# out-of-place (new arrays, same names) instead of with `/=`, so the
# original tensors stay untouched and this cell prints the same
# "before" norms every time it is re-run.
img_embeds_array = img_embeds_array / np.linalg.norm(
    img_embeds_array, axis=-1, ord=2, keepdims=True
)
txt_embeds_array = txt_embeds_array / np.linalg.norm(
    txt_embeds_array, axis=-1, ord=2, keepdims=True
)

print("\nAfter normalization:")
# print L2 norm of the normalized embeddings (every row should now be 1)
print(np.linalg.norm(img_embeds_array, ord=2, axis=-1))
print(np.linalg.norm(txt_embeds_array, ord=2, axis=-1))

Before normalization:
[10.161607]
[9.406647 7.691049 8.129713]

After normalization:
[1.]
[1. 1. 1.]


In [6]:
# Calculate the similarity between the image and every caption.
# Both arrays were L2-normalized in the previous cell, so the inner product
# of two rows is their cosine similarity. The result has one row per image
# and one column per caption (here 1 x 3); a larger value means a closer
# match.
similarity = np.inner(img_embeds_array, txt_embeds_array)
print(similarity)

[[0.28259343 0.31666955 0.11867781]]
