# Interacting with CLIP

This is a self-contained notebook that shows how to download and run CLIP models, calculate the similarity between arbitrary image and text inputs, and perform zero-shot image classification.

In [2]:
import numpy as np
import pandas as pd
from PIL import Image
import IPython.display
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Print the installed transformers version (useful when reproducing this notebook).
import transformers
print(transformers.__version__)

4.38.1


In [None]:
# Print the installed torch version (useful when reproducing this notebook).
print(torch.__version__)

In [None]:
# Print the installed Pillow (PIL) version (useful when reproducing this notebook).
import PIL
print(PIL.__version__)

# Load data

In [None]:
# Load the caption table. The cells below read its `image_name`,
# `filename` and `description` columns.
df_caption = pd.read_csv("data/description.csv")

# Fail early, with a readable message, if any of those columns is missing
# instead of hitting a KeyError halfway through the embedding loop.
required_columns = {"image_name", "filename", "description"}
missing_columns = required_columns - set(df_caption.columns)
assert not missing_columns, f"description.csv is missing columns: {sorted(missing_columns)}"

df_caption

In [None]:
# Description of the first row whose image_name is "chelsea".
# A single `.loc[mask, column]` selection replaces the chained
# filter -> reset_index -> [column] -> [0] lookup.
df_caption.loc[df_caption["image_name"] == "chelsea", "description"].iloc[0]

In [None]:
# Image file names, in the same order as the rows of df_caption.
file_list = df_caption.loc[:, "filename"].tolist()
file_list

In [None]:
# os.path.splitext returns a (root, extension) pair; the root is the file
# name with its extension removed.
root, _extension = os.path.splitext("page.png")
root

# Load the pretrained CLIP model

In [None]:
# Checkpoint identifier passed to every `from_pretrained` call below,
# and the device the model is placed on.
model_ID = "openai/clip-vit-base-patch32"
device = "cpu"

# Tokenizer (used for text inputs) and processor (used for image inputs)
tokenizer = CLIPTokenizer.from_pretrained(model_ID)
processor = CLIPProcessor.from_pretrained(model_ID)

# Load the pretrained model, then move it onto the chosen device
model = CLIPModel.from_pretrained(model_ID)
model = model.to(device)

# Create functions to generate text and image embeddings

In [None]:
def single_text_embedding(text):
    """Embed text with the CLIP text encoder and return it as a NumPy array.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The string to embed. A list of strings is tokenized as one
            padded batch.

    Returns:
        The output of ``model.get_text_features`` as a NumPy array on the CPU.
    """
    # truncation=True cuts over-long text down to the tokenizer's maximum
    # length instead of failing inside the model; padding=True only has an
    # effect when several texts of different lengths are passed together.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)  # keep the inputs on the same device as the model

    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        text_embedding = model.get_text_features(**inputs)

    # convert the embeddings to numpy array
    return text_embedding.detach().cpu().numpy()

def single_image_embedding(my_image):
    """Embed an image with the CLIP image encoder and return it as a NumPy array.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        my_image: The image passed to ``processor`` as ``images``
            (the notebook passes RGB ``PIL.Image`` objects).

    Returns:
        The output of ``model.get_image_features`` as a NumPy array on the CPU.
    """
    # Preprocess the image and keep only the pixel tensor the model needs,
    # placed on the same device as the model.
    pixel_values = processor(
        text=None,
        images=my_image,
        return_tensors="pt",
    )["pixel_values"].to(device)

    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        image_embedding = model.get_image_features(pixel_values)

    # convert the embeddings to numpy array
    return image_embedding.detach().cpu().numpy()

# Apply the functions to images and their descriptions

In [None]:
dir_path = "data/image/"

# Description of the first row for each image name, built once so the loop
# does not re-filter the whole frame for every file.
description_by_name = (
    df_caption.drop_duplicates(subset="image_name")
    .set_index("image_name")["description"]
)

text_embs = []
img_embs = []
texts = []
original_images = []
for filename in file_list:
    # Descriptions are keyed by the file name without its extension.
    name = os.path.splitext(filename)[0]
    description = description_by_name[name]
    text_embs.append(single_text_embedding(description))

    # convert() returns a new, fully loaded image, so the file handle can be
    # closed as soon as the `with` block ends.
    with Image.open(os.path.join(dir_path, filename)) as image_file:
        image = image_file.convert("RGB")
    img_embs.append(single_image_embedding(image))

    texts.append(description)
    original_images.append(image)

# Stack the per-item embeddings into one 2-D array per modality
text_embs_np = np.vstack(text_embs)
img_embs_np = np.vstack(img_embs)

# Compare cosine similarity between text features and image features

In [None]:
img_embs = torch.from_numpy(img_embs_np)
text_embs = torch.from_numpy(text_embs_np)

# torch.from_numpy shares memory with the source array, so normalise out of
# place: an in-place `/=` would also rescale img_embs_np and text_embs_np.
img_embs = img_embs / img_embs.norm(dim=-1, keepdim=True)
text_embs = text_embs / text_embs.norm(dim=-1, keepdim=True)

# Dot products of unit vectors are cosine similarities.
# Rows correspond to texts, columns to images.
similarity = text_embs.cpu().numpy() @ img_embs.cpu().numpy().T

In [None]:
# Display the text-by-image similarity matrix: one row per text, one column per image.
similarity

In [None]:
# Size the axes from the data that is actually plotted.
count = len(texts)

fig, ax = plt.subplots(figsize=(20, 16))

# Heat map of the similarity matrix; the colour scale is clipped to [0.1, 0.3].
ax.imshow(similarity, vmin=0.1, vmax=0.3)
ax.set_yticks(range(count))
ax.set_yticklabels(texts, fontsize=18)
ax.set_xticks([])

# Draw each image as a thumbnail in the strip above its column
# (y runs from -1.6 to -0.6 and the y-axis is inverted below).
for i, thumbnail in enumerate(original_images):
    ax.imshow(thumbnail, extent=(i - 0.5, i + 0.5, -1.6, -0.6), origin="lower")

# Write the similarity value into every cell
for x in range(similarity.shape[1]):
    for y in range(similarity.shape[0]):
        ax.text(x, y, f"{similarity[y, x]:.2f}", ha="center", va="center", size=12)

for side in ["left", "top", "right", "bottom"]:
    ax.spines[side].set_visible(False)

# Setting the limits last overrides the limits changed by the imshow calls;
# the reversed y-limits put the first text at the top, below the thumbnails.
ax.set_xlim([-0.5, count - 0.5])
ax.set_ylim([count + 0.5, -2])

ax.set_title("Cosine similarity between text and image features", size=20);

# Do text-image search using FAISS

In [None]:
import faiss

In [None]:
def find_image(qr_sentence, img_embs_np):
    """Return the image whose embedding is most similar to a query sentence.

    The query is embedded with ``single_text_embedding``; the query and the
    image embeddings are L2-normalised and compared by inner product in a
    flat FAISS index, i.e. by cosine similarity.

    Uses the module-level ``file_list`` and ``dir_path``: row ``i`` of
    ``img_embs_np`` must belong to ``file_list[i]``.

    Args:
        qr_sentence: Query text.
        img_embs_np: 2-D NumPy array of image embeddings, one row per image.
            It is not modified.

    Returns:
        The best-matching image, opened from ``dir_path`` and converted to RGB.

    Raises:
        ValueError: If ``img_embs_np`` contains no rows.
    """
    # With an empty index FAISS reports -1, which would silently select
    # file_list[-1]; refuse instead.
    if len(img_embs_np) == 0:
        raise ValueError("img_embs_np is empty: there are no images to search")

    # generate vector for query sentence
    qr_sent_vec = single_text_embedding(qr_sentence)

    n_dim = img_embs_np.shape[1]
    # FAISS needs float32 input and normalize_L2 works in place, so operate
    # on a copy and leave the caller's array untouched.
    x = np.vstack(img_embs_np).astype(np.float32)
    q = np.ascontiguousarray(qr_sent_vec.reshape(1, -1), dtype=np.float32)

    # add all image embeddings to an inner-product index
    index = faiss.index_factory(n_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
    faiss.normalize_L2(x)
    index.add(x)

    # Only the best match is used, so request a single neighbour instead of
    # ranking the whole index.
    faiss.normalize_L2(q)
    _scores, idx = index.search(q, k=1)
    img_idx = int(idx[0][0])

    # convert() returns a new, fully loaded image, so the file handle can be
    # closed before returning.
    with Image.open(os.path.join(dir_path, file_list[img_idx])) as image_file:
        return image_file.convert("RGB")

In [None]:
qr_sentence = "there is a cat outside window"
response_img = find_image(qr_sentence, img_embs_np)
plt.imshow(response_img)
# Title the figure with the query so it stands alone; `;` hides the repr.
plt.title(qr_sentence);

In [None]:
qr_sentence = "there is a dog watching you"
response_img = find_image(qr_sentence, img_embs_np)
plt.imshow(response_img)
# Title the figure with the query so it stands alone; `;` hides the repr.
plt.title(qr_sentence);