In [1]:
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import fiftyone as fo
import os
import glob

In [2]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model weights and the processor are loaded from the same checkpoint id;
# only the model is moved to the selected device.
clip_checkpoint = "openai/clip-vit-large-patch14-336"
model = CLIPModel.from_pretrained(clip_checkpoint).to(device)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)



In [17]:
# Free-text prompt; it is passed to the CLIP processor below to build the
# query embedding that keyframes are ranked against.
text_query = "real dog"

In [18]:
# Tokenise the query (padded/truncated as needed) and move the resulting
# tensors to the same device as the model.
inputs = processor(
    text=text_query,
    return_tensors="pt",
    padding=True,
    truncation=True,
).to(device)

# Inference only: no gradients are needed for the text features.
with torch.no_grad():
    text_features = model.get_text_features(**inputs)

# Bring the features back to host memory as a NumPy array for the
# NumPy-based similarity search.
text_embedding = text_features.cpu().numpy()

In [19]:
# Locate the per-video CLIP feature files of the L01 videos in every batch
# directory under ../../data (relative to the current working directory).
path_pattern = os.path.join(os.getcwd(), '..', '..', 'data',
                            'batch*', 'clip-features-14', 'L01_*.npy')
paths = glob.glob(path_pattern)

# Pair each video name (file name without directory or extension) with the
# file it was found at, ordered by video name.
feature_files = sorted(
    (os.path.splitext(os.path.basename(path))[0], path) for path in paths
)
videos = [video for video, _ in feature_files]

# Mapping: video name -> {row index in the feature file -> embedding}.
# Assumes each .npy file stores one row per keyframe -- TODO confirm.
video_keyframe_embedding = {}

for video, path_to_clip in feature_files:
    # Load from the path the glob actually matched. Rebuilding the path with
    # a hard-coded 'batch1' silently produced empty entries for every video
    # that lives in another batch directory.
    # NOTE(review): if the same video name exists in several batches, the
    # entry loaded last replaces the earlier one.
    keyframe_features = np.load(path_to_clip)
    video_keyframe_embedding[video] = dict(enumerate(keyframe_features))


In [6]:
def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The value is ``np.dot(a, b) / (norm(a) * norm(b))`` and has the same
    shape as ``np.dot(a, b)`` (a scalar for two 1-D vectors, shape ``(1,)``
    for a ``(1, D)`` query against a ``(D,)`` vector).

    Args:
        a: First vector (array-like).
        b: Second vector (array-like), compatible with ``a`` under ``np.dot``.

    Returns:
        The cosine similarity. When either input has zero norm the cosine is
        undefined; 0.0 (in the shape of the dot product) is returned instead
        of dividing by zero, so callers that sort by similarity never see nan.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Keeps the shape of the dot product while forcing a float zero.
        return dot_product * 0.0
    return dot_product / norm_product

In [7]:
def find_top_matches(query_embedding, video_embeddings, top_n=100):
    """Rank every keyframe against the query and keep the best ``top_n``.

    Args:
        query_embedding: Embedding of the query, passed as the first argument
            to ``cosine_similarity``.
        video_embeddings: Mapping of video name to a mapping of keyframe
            index to that keyframe's embedding.
        top_n: Maximum number of matches to return.

    Returns:
        A list of ``(video, index, similarity)`` tuples ordered from most to
        least similar, at most ``top_n`` long.
    """
    # Score each (video, keyframe) pair in the mappings' iteration order.
    scored = [
        (video_name, frame_index, cosine_similarity(query_embedding, frame_embedding))
        for video_name, frames in video_embeddings.items()
        for frame_index, frame_embedding in frames.items()
    ]

    # Stable sort, highest similarity first; ties keep their scoring order.
    scored.sort(key=lambda match: match[2], reverse=True)
    return scored[:top_n]

In [20]:
# Rank all loaded keyframes against the text query; with the default top_n
# this keeps at most 100 (video, index, similarity) tuples, best first.
top_matches = find_top_matches(text_embedding, video_keyframe_embedding)

In [21]:
# Keyframe index (0-based position in its video's mapping) of the best match.
top_matches[0][1]

9

In [22]:
# Resolve every match to the keyframe image it refers to.
images_path = []
for video, keyframe_index, _similarity in top_matches:
    # Embedding indices are 0-based, the image file names are built from
    # index + 1 -- assumes keyframe files start at 001; TODO confirm.
    # (Named frame_number rather than `id` so the builtin id() is not
    # shadowed for the rest of the notebook.)
    frame_number = keyframe_index + 1
    # NOTE(review): batch1 / keyframes_L01 are hard-coded here; matches from
    # any other batch or video group would need a different directory.
    path = os.path.join(os.getcwd(), "..", "..", "data", "batch1",
                        "keyframes", "keyframes_L01", video,
                        f"{frame_number:03d}.jpg")
    images_path.append(path)

# Start from a fresh dataset so re-running the cell does not fail on an
# already existing "my_dataset".
if fo.dataset_exists("my_dataset"):
    fo.delete_dataset("my_dataset")
dataset = fo.Dataset("my_dataset")

# Add all samples in one batch instead of one call per image.
dataset.add_samples([fo.Sample(filepath=path) for path in images_path])

# auto=False: the App is not opened automatically; open it in a browser tab.
session = fo.launch_app(dataset, auto=False)
session.open_tab()

Session launched. Run `session.show()` to open the App in a cell output.


<IPython.core.display.Javascript object>