CLIP
====

**Learning Transferable Visual Models From Natural Language Supervision**

* Paper: https://arxiv.org/abs/2103.00020

![CLIP](../assets/clip-overview.png)

```bash
pip install torch torchvision
pip install transformers
pip install matplotlib
pip install supervision
```

In [2]:
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel

# One checkpoint id is used for both the model weights and the processor.
checkpoint_name = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(checkpoint_name)
processor = CLIPProcessor.from_pretrained(checkpoint_name)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

### Image Embedding

In [None]:
image_path = "../samples/plants.jpg"

# Open the sample picture and convert it to RGB mode.
image = Image.open(image_path).convert("RGB")
print(image.size)

# Preprocess the image into PyTorch tensors for the model.
inputs = processor(images=image, return_tensors="pt", padding=True)
print(inputs.keys())
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Gradient tracking is disabled while computing the image features.
with torch.no_grad():
    pixel_values = inputs["pixel_values"]
    img_embeds = model.get_image_features(pixel_values=pixel_values)
print(img_embeds.shape)

(1068, 1137)
torch.Size([1, 512])


### Text Embedding

In [16]:
texts = ["a photo of a plant", "a phot of a cat sitting on a couch"]

# padding=True lets prompts of different lengths share one batch tensor.
inputs = processor(
    text=texts, return_tensors="pt", padding=True
)
inputs = {k: v.to(device) for k, v in inputs.items()}
print(inputs.keys())
print(inputs["input_ids"].shape)

with torch.no_grad():
    # Pass the attention mask produced by the processor together with the
    # token ids, so the padded positions of the batch are marked for the
    # text encoder instead of being dropped.
    txt_embeds = model.get_text_features(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
print(txt_embeds.shape)

dict_keys(['input_ids', 'attention_mask'])
torch.Size([2, 12])
torch.Size([2, 512])
