## CLIP - Image Embedding

#### 1. Download model from Hugging Face

In [2]:
import torch
import sys, os, shutil
from pathlib import Path
from transformers import CLIPProcessor, CLIPModel
sys.path.append(str(Path('impl.ipynb').resolve().parents[3]))
from prep.params import SQLITE_PATH, CLIP_DIR

# Scratch directory used as the Hugging Face download cache for this cell only.
# It is deleted at the end so the raw hub cache does not linger next to the notebook.
TEMP_DIR = "./.clip_model"
# Define the specific CLIP model ID from Hugging Face
MODEL_ID = "openai/clip-vit-base-patch32"

# Create the scratch directory (no error if it is already there)
os.makedirs(TEMP_DIR, exist_ok=True)

try:
    # Download the model and processor into TEMP_DIR instead of the global cache.
    # Without cache_dir the scratch directory would never be used.
    processor = CLIPProcessor.from_pretrained(MODEL_ID, cache_dir=TEMP_DIR)
    model = CLIPModel.from_pretrained(MODEL_ID, cache_dir=TEMP_DIR)

    # Export both artifacts to the project-level CLIP directory
    processor.save_pretrained("../../"+CLIP_DIR)
    model.save_pretrained("../../"+CLIP_DIR)
finally:
    # Clean up the scratch cache even when the download or export fails
    shutil.rmtree(TEMP_DIR, ignore_errors=True)

#### 2. Load model from local folder

In [1]:
import torch
import sys, os, shutil
from pathlib import Path
from transformers import CLIPProcessor, CLIPModel
sys.path.append(str(Path('impl.ipynb').resolve().parents[3]))
from prep.params import CLIP_DIR

# Pick the compute backend in priority order: CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Both the processor and the model are read from the same exported folder
local_clip_path = "../../"+CLIP_DIR
processor = CLIPProcessor.from_pretrained(local_clip_path)
model = CLIPModel.from_pretrained(local_clip_path)

# Move the weights to the selected device, then switch to evaluation mode.
# The trailing expression is what the notebook renders as the cell output.
model.to(device)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

#### 3. Embeddings

In [None]:
import requests
from PIL import Image

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait and surface HTTP errors here, so a failed download is reported
# as a network problem rather than as an unreadable image inside PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Candidate captions that will be embedded alongside the image
text_inputs = ["a photo of 2 cats", "a picture of a pink couch", "a photo of 2 remote controls"]

# Tokenize the captions and preprocess the image into one batch on the model's device
inputs = processor(
    text=text_inputs,
    images=image,
    return_tensors="pt",
    padding=True,
).to(device)

# Inference only: no gradients are needed
with torch.no_grad():
    outputs = model(**inputs)


# Image embedding
image_embedding = outputs.image_embeds
print(f"\nImage embedding shape: {image_embedding.shape}")
print(f"Image embedding (first 5 values): {image_embedding[0, :5].tolist()}")


# Text embeddings
# This tensor contains the embeddings for each of the input text phrases.
text_embeddings = outputs.text_embeds
print(f"\nText embeddings shape: {text_embeddings.shape}")
# Iterate over every phrase so the cell works for any number of captions
for phrase, phrase_embedding in zip(text_inputs, text_embeddings):
    print(f"Text embedding for '{phrase}' (first 5 values): {phrase_embedding[:5].tolist()}")





Image embedding shape: torch.Size([1, 512])
Image embedding (first 5 values): [-0.00978781282901764, 0.01276974193751812, -0.02741880528628826, 0.001967571210116148, -0.00593261793255806]

Text embeddings shape: torch.Size([3, 512])
Text embedding for 'a photo of 2 cats' (first 5 values): [0.031352195888757706, 0.0010832290863618255, -0.06258819997310638, -0.037271205335855484, 0.008031794801354408]
Text embedding for 'a picture of a pink couch' (first 5 values): [0.0009097328293137252, -0.0035542109981179237, -0.020510952919721603, -0.05030852556228638, -0.02737904153764248]
Text embedding for 'a photo of 2 remote controls' (first 5 values): [-0.003358860034495592, 0.002799508860334754, -0.08756726235151291, 0.007565741427242756, 0.01846543326973915]

Image-Text similarity logits (before softmax): tensor([[27.5308, 23.8256, 27.3588]], device='mps:0')
Image-Text similarity probabilities (softmax): tensor([[0.5357, 0.0132, 0.4511]], device='mps:0')
Probability for 'a photo of 2 cats': 

In [None]:
# --- Image/text similarity from the same forward pass ---
# Raw image-to-text similarity scores, one column per text prompt
logits_per_image = outputs.logits_per_image
# Softmax over the text axis converts the scores into label probabilities
probs = logits_per_image.softmax(dim=1)

print(f"\nImage-Text similarity logits (before softmax): {logits_per_image}")
print(f"Image-Text similarity probabilities (softmax): {probs}")

# Only one image was supplied, so row 0 holds the probability of each prompt
for idx, prompt_prob in enumerate(probs[0].tolist()):
    print(f"Probability for '{text_inputs[idx]}': {prompt_prob:.4f}")

#### 4. Encode with the packaged `ClipEncoder` class

In [3]:
import requests, sys
from PIL import Image
from pathlib import Path
sys.path.append(str(Path('impl.ipynb').resolve().parents[3]))
from FrameEmb import ClipEncoder

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait and surface HTTP errors here, so a failed download is reported
# as a network problem rather than as an unreadable image inside PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Image to base64
import base64
from io import BytesIO
def image_to_base64(image):
    """Encode an image as a base64 string of its JPEG bytes.

    Args:
        image: A PIL-style image exposing ``mode``, ``convert`` and ``save``.

    Returns:
        The JPEG-encoded image as a base64 ``str``.
    """
    # JPEG cannot store alpha or palette data; saving such modes raises OSError,
    # so convert them to RGB first.
    if image.mode in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

image_base64 = image_to_base64(image)

CE = ClipEncoder()
# Encode the base64 JPEG with the packaged encoder and preview the first 10 values
result = CE.image_encode(image_base64)
print(result[:10])

# Use a dedicated name for the single prompt so the `text_inputs` list defined in
# the embeddings section is not overwritten with a plain string.
text_query = "a photo of 2 cats"
result = CE.text_encode(text_query)
print(result[:10])

[-0.08948638  0.15628737 -0.35676813 -0.19910409 -0.05353561 -0.23494473
 -0.36256215  0.13674879  0.245684    0.01422149]
[ 0.3023144   0.01044509 -0.60350806 -0.35938838  0.07744727  0.21590227
 -0.24014613 -0.69491875 -0.38131976  0.04199659]
