In [17]:
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import fiftyone as fo
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process before any tokenizer is used.
# NOTE(review): presumably meant to silence the HuggingFace `tokenizers` fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [18]:
# One checkpoint id feeds both the model and its processor so the two stay in sync.
clip_checkpoint = "openai/clip-vit-large-patch14-336"

# Use the GPU when torch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = CLIPProcessor.from_pretrained(clip_checkpoint)
model = CLIPModel.from_pretrained(clip_checkpoint)
model = model.to(device)

In [19]:
# Load the precomputed embedding matrix stored in L01_V001.npy.
# NOTE(review): presumably one row per keyframe of video L01_V001, produced by the
# same CLIP checkpoint used for the text query — confirm against the export script.
image_embeddings = np.load("L01_V001.npy")
# Display the array shape as the cell output (the recorded run shows (272, 768)).
image_embeddings.shape

(272, 768)

In [20]:
# Free-text search query; it is tokenized and embedded by the following cells.
text_query = "white car"

In [21]:
# Turn the query string into PyTorch tensors with the CLIP processor
# (padding and truncation enabled), then move them to the model's device.
encoded_query = processor(
    text=text_query,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
inputs = encoded_query.to(device)

In [22]:
# Run the text tower with gradient tracking disabled; only the forward pass is needed.
with torch.no_grad():
    text_features = model.get_text_features(**inputs)

# Bring the result back to host memory as a NumPy array for the similarity step.
text_embedding = text_features.cpu().numpy()

In [23]:
# Score every image embedding against the text query and keep the best k.
#
# A plain dot product is a cosine similarity only when both operands have unit
# length. The text features are not guaranteed to be normalized (and the stored
# image rows may not be either), so L2-normalize both sides first. If the rows
# are already unit-length this leaves the ranking unchanged.
norm_floor = 1e-12  # keeps an all-zero row from causing a division by zero
image_norms = np.linalg.norm(image_embeddings, axis=1, keepdims=True)
text_norms = np.linalg.norm(text_embedding, axis=1, keepdims=True)
image_unit = image_embeddings / np.maximum(image_norms, norm_floor)
text_unit = text_embedding / np.maximum(text_norms, norm_floor)

cosine_similarities = np.dot(image_unit, text_unit.T).flatten()

k = 100
# argsort is ascending: take the last k entries and reverse them so the best
# match comes first. Fewer than k indices are returned when fewer images exist.
top_k_indices = np.argsort(cosine_similarities)[-k:][::-1]

# Report the number of indices actually printed rather than a hard-coded count.
print(f"Top {len(top_k_indices)} matching image indices:", top_k_indices)

Top 5 matching image indices: [ 16 257   5 129  70 266 130   3 135 121 125  68 255 127 153 126  83  23
 243  78  90 173   9   4 252 251  10  75 109  37 172  99 256 101 171 268
 202  92  85  28  95  48 142  27 162  38  21 170 141  25 271  69  79  86
  36  45  19  74  17  24  29  26   8 244 103  46  14  44 157 102   1  71
  84 176  88 152 270 269  20 131 140 267  18   0 209  11  40 211 189  39
 260  32 197 169 155 136 262  81 195 120]


In [24]:
# Map each ranked embedding index to its keyframe image and browse the results
# in the FiftyOne App, preserving rank order (best match first).

# Directory holding the keyframes, resolved relative to the current working
# directory and normalized so the paths carry no ".." segments.
keyframe_dir = os.path.normpath(
    os.path.join(
        os.getcwd(), os.pardir, os.pardir,
        "data", "batch1", "keyframes", "keyframes_L01", "L01_V001",
    )
)

# Keyframe files are numbered from 001 while embedding rows are 0-based, hence the +1.
# NOTE(review): assumes row i of the embedding matrix corresponds to file (i + 1):03d.jpg — confirm.
images_path = [
    os.path.join(keyframe_dir, f"{int(frame_index) + 1:03d}.jpg")
    for frame_index in top_k_indices
]

# Surface path problems here instead of as broken thumbnails in the App.
missing_files = [path for path in images_path if not os.path.isfile(path)]
if missing_files:
    print(
        f"Warning: {len(missing_files)} of {len(images_path)} keyframe files "
        f"were not found, e.g. {missing_files[0]}"
    )

# Recreate the dataset from scratch so re-running the cell does not fail on
# an existing dataset of the same name.
if fo.dataset_exists("my_dataset"):
    fo.delete_dataset("my_dataset")
dataset = fo.Dataset("my_dataset")

# Insert all samples in one batched call instead of one insert per sample.
dataset.add_samples([fo.Sample(filepath=path) for path in images_path])

# auto=False prevents the App from opening in the cell output; open it in a browser tab instead.
session = fo.launch_app(dataset, auto=False)
session.open_tab()

Session launched. Run `session.show()` to open the App in a cell output.


<IPython.core.display.Javascript object>