In [10]:
import torch
import open_clip
from tglcourse.utils import *

print(open_clip.list_pretrained())

[('RN50', 'openai'), ('RN50', 'yfcc15m'), ('RN50', 'cc12m'), ('RN50-quickgelu', 'openai'), ('RN50-quickgelu', 'yfcc15m'), ('RN50-quickgelu', 'cc12m'), ('RN101', 'openai'), ('RN101', 'yfcc15m'), ('RN101-quickgelu', 'openai'), ('RN101-quickgelu', 'yfcc15m'), ('RN50x4', 'openai'), ('RN50x16', 'openai'), ('RN50x64', 'openai'), ('ViT-B-32', 'openai'), ('ViT-B-32', 'laion400m_e31'), ('ViT-B-32', 'laion400m_e32'), ('ViT-B-32', 'laion2b_e16'), ('ViT-B-32', 'laion2b_s34b_b79k'), ('ViT-B-32-quickgelu', 'openai'), ('ViT-B-32-quickgelu', 'laion400m_e31'), ('ViT-B-32-quickgelu', 'laion400m_e32'), ('ViT-B-16', 'openai'), ('ViT-B-16', 'laion400m_e31'), ('ViT-B-16', 'laion400m_e32'), ('ViT-B-16-plus-240', 'laion400m_e31'), ('ViT-B-16-plus-240', 'laion400m_e32'), ('ViT-L-14', 'openai'), ('ViT-L-14', 'laion400m_e31'), ('ViT-L-14', 'laion400m_e32'), ('ViT-L-14', 'laion2b_s32b_b82k'), ('ViT-L-14-336', 'openai'), ('ViT-H-14', 'laion2b_s32b_b79k'), ('ViT-g-14', 'laion2b_s12b_b42k'), ('roberta-ViT-B-32', 'la

In [11]:
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32-quickgelu', pretrained='laion400m_e32')

100%|███████████████████████████████████████| 605M/605M [00:07<00:00, 82.1MiB/s]


##### Example 1

In [None]:
input_image = pil_from_url('https://images.pexels.com/photos/185032/pexels-photo-185032.jpeg?').resize((600, 400))

In [116]:
image_batch = preprocess(input_image).unsqueeze(0)

In [117]:
sentence = "persistent is all you need"

In [118]:
import open_clip

`image_batch` is a batch of 1 color image

In [119]:
image_batch.shape

torch.Size([1, 3, 224, 224])

In [120]:
sentence

'persistent is all you need'

In [46]:
type(model)

open_clip.model.CLIP

Use OpenCLIP to encodes the `image_batch` and `sequence`

In [47]:
text_batch = open_clip.tokenize([sentence])

In [48]:
with torch.no_grad():
    image_features = model.encode_image(image_batch)
    text_features = model.encode_text(text_batch)

In [49]:
image_features.shape, text_features.shape

(torch.Size([1, 512]), torch.Size([1, 512]))

##### Example 1.1

In [121]:
image_batch = preprocess(input_image).unsqueeze(0)

In [122]:
sentence = "persistent is all you need"

In [124]:
text_batch = open_clip.tokenize([sentence])

In [125]:
import open_clip

`image_batch` is a batch of 1 color image, `text_batch` is a batch of 

In [126]:
image_batch.shape, image_batch.shape

(torch.Size([1, 3, 224, 224]), torch.Size([1, 3, 224, 224]))

In [127]:
type(model)

open_clip.model.CLIP

Use OpenCLIP to encodes the `image_batch` and `sequence`

In [128]:
with torch.no_grad():
    image_features = model.encode_image(image_batch)
    text_features = model.encode_text(text_batch)

In [49]:
image_features.shape, text_features.shape

(torch.Size([1, 512]), torch.Size([1, 512]))

##### Example 2

In [12]:
prompts = ["a cat", "spider man walks on the beach", "starship on mar"]

In [13]:
text_batch = open_clip.tokenize(prompts)

In [14]:
text_embedding = model.encode_text(text_batch)

In [15]:
input_image = pil_from_url('https://images.pexels.com/photos/185032/pexels-photo-185032.jpeg?').resize((600, 400))

In [16]:
image_batch = preprocess(input_image).unsqueeze(0)

In [17]:
image_embedding = model.encode_image(image_batch)

In [18]:
text_embedding = text_embedding.detach()
image_embedding = image_embedding.detach()

In [19]:
import torch.nn.functional as F

`image_embedding` is the embedding of a cat

In [20]:
image_embedding.shape

torch.Size([1, 512])

`text_embedding` is a batch of embedding of three sentences
- a cat
- spider man walks on the beach
- starship on mar

In [92]:
text_embedding.shape

torch.Size([3, 512])

Write a function calculate the probabily that each sentence in `text_embedding` has the meaning same as the `image_embedding`

**Hint**: Normalize the embedding first

In [93]:
def probability_scores(image_embedding, text_embedding):
    image_embedding_norm = image_embedding.norm(dim=-1, keepdim=True)
    image_embedding = image_embedding / image_embedding_norm
    
    text_embedding_norm = text_features.norm(dim=-1, keepdim=True)
    text_embedding = text_embedding / text_embedding_norm
    
    similarities = image_embedding @ text_embedding.T
    probs = F.softmax(similarities, dim=-1)
    
    return probs

In [94]:
probability_scores(image_embedding, text_embedding)

tensor([[0.3555, 0.3055, 0.3390]])

### Projection Head

##### Example 1: GELU activation

In [20]:
from torch import nn

In [21]:
gelu = nn.GELU()

In [34]:
x = torch.tensor([-2, -1, 0, 1, 2]).float()

In [35]:
output = gelu(x)

In [36]:
output.mean()

tensor(0.5183)

In [37]:
output.var()

tensor(0.8030)

In [43]:
gelu(torch.tensor([-20, -1000, 0, 122, 2099]).float())

tensor([   0.,    0.,    0.,  122., 2099.])

### Cosine Similarity

### Contrastive loss

##### Example 1

In [21]:
import torch

In [22]:
text_embeddings = torch.randn(10, 128)
image_embeddings = torch.randn(10, 128)

In [23]:
temperature = 1.0

In [24]:
import torch.nn.functional as F

In [25]:
temperature

1.0

In [26]:
text_embeddings.shape, image_embeddings.shape

(torch.Size([10, 128]), torch.Size([10, 128]))

Given `text_embeddings` is the embedding of text and `image_embeddings` is the embedding of images

Calculate the target distribution for both images and texts in CLIP

In [27]:
images_similarity = image_embeddings @ image_embeddings.T

In [28]:
texts_similarity = text_embeddings @ text_embeddings.T

In [29]:
targets = F.softmax(
    (images_similarity + texts_similarity) / 2 * temperature,
    dim=-1
)

In [30]:
targets.shape

torch.Size([10, 10])

##### Example 2

In [70]:
logits = (text_embeddings @ image_embeddings.T) / temperature

##### Example 3

In [101]:
batch_size = 4
dim = 150

In [102]:
embeddings = torch.randn(batch_size, dim)

In [103]:
out = embeddings @ embeddings.T

In [104]:
out

tensor([[139.4657,  -8.0346,   8.9255,  -1.7976],
        [ -8.0346, 134.4699, -10.2236,  19.8583],
        [  8.9255, -10.2236, 106.3404,  14.3997],
        [ -1.7976,  19.8583,  14.3997, 137.2661]])

In [105]:
print(F.softmax(out, dim=-1))

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.9326e-43, 0.0000e+00, 1.0000e+00, 1.1767e-40],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00]])


##### Training

##### Example 1

Write a pseudocode for training CLIP. Given
- `images` is a list of all tensor images
- `texts` is a list of all texts

**Hint:** No need batch training, just do a simple loop

In [None]:
for epoch in n_epochs:
    image_features = text_encoder(text)
    text_features = text_encoder(text)
    
    image_embeddings = image_projection(image_features)
    text_embeddings = text_projection(text_features)
    
    logits = (text_embeddings @ image_embeddings.T) / temperature