# Video Search with CLIP4CLIP

In this demo, we'll build a video search system using CLIP4CLIP, a model that can understand both video content and text descriptions. CLIP4CLIP extends the original CLIP model to work with video sequences by processing multiple frames and creating video embeddings.

## What you'll learn:
- Convert videos to embeddings using CLIP4CLIP
- Search videos using text queries
- Compare video similarities

## Import Libraries and Setup

First, let's import all the necessary libraries for video processing and the CLIP4CLIP model:

In [None]:
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import CLIPVisionModelWithProjection, CLIPTokenizer, CLIPTextModelWithProjection
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
from typing import List
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

## Video Processing Function

This function converts a video file into a tensor of processed frames that CLIP4CLIP can understand:

In [2]:
def preprocess_frame(size, n_px):
    """Resize, center-crop, tensorize and normalize a single frame.

    Args:
        size: Value passed to both ``Resize`` (bicubic interpolation) and
            ``CenterCrop``.
        n_px: The frame to transform. Despite its name this is the image
            itself, not a pixel count; the caller in this file passes a
            PIL image.

    Returns:
        The result of running the frame through the transform pipeline
        (``ToTensor`` followed by per-channel ``Normalize``).
    """
    # Per-channel statistics used for normalization (RGB order).
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    steps = [
        Resize(size, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(size),
        ToTensor(),
        Normalize(channel_mean, channel_std),
    ]
    pipeline = Compose(steps)
    return pipeline(n_px)


def video2frames(video_path, frame_rate=1.0, size=224):
    """Convert a video file into a tensor of preprocessed frames.

    Frames are sampled at ``frame_rate`` frames per second of video,
    converted from BGR to RGB and passed through ``preprocess_frame``.

    Args:
        video_path: Path of the video, opened with OpenCV's FFMPEG backend.
        frame_rate: Number of frames to sample per second of video.
            Must be positive.
        size: Side length, in pixels, of the square frames produced.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(num_frames, 3, size, size)``
        containing only the frames that were actually decoded.

    Raises:
        ValueError: If ``frame_rate`` is not positive.
        IOError: If the reported FPS is below 1, or if not a single frame
            could be decoded.
    """
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be positive, got {frame_rate}")

    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        if fps < 1:
            raise IOError(f"ERROR: problem reading video file: {video_path}. No frames found.")

        # Duration in whole seconds, rounded up so a trailing partial
        # second is still sampled.
        total_duration = (frame_count + fps - 1) // fps
        interval = fps / frame_rate
        frames_idx = np.floor(np.arange(0, total_duration * fps, interval))

        images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)
        frames_read = 0

        for idx in frames_idx:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                # The rounded-up duration can point past the last frame;
                # stop at the first failed read.
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            images[frames_read] = preprocess_frame(size, Image.fromarray(frame))
            frames_read += 1
    finally:
        # Always free the capture handle, including on the error paths.
        cap.release()

    if frames_read == 0:
        # Previously this case returned a single all-zero (black) frame.
        raise IOError(
            f"ERROR: problem reading video file: {video_path}. No frames could be decoded."
        )

    # Drop the pre-allocated slots that were never filled, so no black
    # frames end up in the result.
    return torch.tensor(images[:frames_read])

## Load CLIP4CLIP Models

We load the CLIP4CLIP checkpoint twice: once as a vision encoder (applied to the video frames) and once as a text encoder. Both are loaded from the same checkpoint, because CLIP4CLIP maps text and videos into the same embedding space:

In [3]:
model_name = "Searchium-ai/clip4clip-webvid150k"

# Vision encoder (with projection head), switched to evaluation mode.
vision_model = CLIPVisionModelWithProjection.from_pretrained(model_name).eval()

# Text encoder (with projection head) from the same checkpoint, also in
# evaluation mode.
text_model = CLIPTextModelWithProjection.from_pretrained(model_name).eval()

# Tokenizer that prepares text queries for the text encoder.
tokenizer = CLIPTokenizer.from_pretrained(model_name)

print("Models loaded successfully!")

Models loaded successfully!


## Video Embedding Function

Create a function that converts video frames into a single embedding vector:

In [4]:
def get_video_embedding(video_frames):
    """Convert a batch of video frames into one normalized embedding vector.

    Args:
        video_frames: Tensor of preprocessed frames passed directly to
            ``vision_model``; the first dimension indexes the frames.

    Returns:
        The mean of the L2-normalized per-frame embeddings, itself
        L2-normalized.

    Raises:
        ValueError: If ``video_frames`` contains no frames. Averaging over
            zero frames would otherwise yield a NaN embedding.
    """
    if video_frames.shape[0] == 0:
        raise ValueError("Cannot compute a video embedding from zero frames.")

    with torch.no_grad():
        visual_output = vision_model(video_frames)

        # Normalize each frame embedding first so every frame contributes
        # with equal weight to the average.
        frame_embeddings = visual_output["image_embeds"]
        frame_embeddings = frame_embeddings / frame_embeddings.norm(dim=-1, keepdim=True)

        # Average over the frame dimension to get a single video embedding.
        video_embedding = torch.mean(frame_embeddings, dim=0)

        # Re-normalize: the mean of unit vectors is generally not unit length.
        video_embedding = video_embedding / video_embedding.norm(dim=-1, keepdim=True)

    return video_embedding

## Text Embedding Function

Create a function that converts text queries into embedding vectors:

In [5]:
def get_text_embedding(text_query):
    """Convert a text query into a normalized embedding vector.

    Args:
        text_query: The query string to embed.

    Returns:
        The L2-normalized text embedding with the batch dimension removed.
    """
    # truncation=True caps the query at the tokenizer's configured maximum
    # length instead of sending an arbitrarily long sequence to the text
    # model. Queries within the limit are tokenized exactly as before.
    # NOTE(review): presumably the text model rejects sequences longer than
    # its position-embedding table -- confirm against the checkpoint config.
    inputs = tokenizer(
        [text_query],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        text_output = text_model(**inputs)
        text_embedding = text_output.text_embeds

        # Normalize so that a dot product with other unit-length embeddings
        # is a cosine similarity.
        text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)

    # A single query was tokenized, so drop the leading batch dimension.
    return text_embedding.squeeze(0)

## Load Sample Videos

Create a list of video file paths:

In [8]:
import glob
from IPython.display import display, Video

# glob returns matches in arbitrary order, so sort to keep the path list
# (and anything indexed against it) reproducible across runs. Keep only
# regular files so sub-directories are not treated as videos.
video_paths = sorted(
    path for path in glob.glob("videos/*") if os.path.isfile(path)
)

print(f"Found {len(video_paths)} video files")

for video_path in video_paths:
    display(Video(video_path, width=224))

Found 3 video files


## Create Video Embeddings Database

Let's process all videos and create embeddings for searching:

In [None]:
video_embeddings = []
# Paths of the videos that were embedded successfully, in the same order as
# the rows of the embeddings tensor.
embedded_paths = []

print("Processing videos and creating embeddings...")

for video_path in tqdm(video_paths):
    try:
        # Convert video to frames
        frames = video2frames(video_path)

        # Get video embedding
        embedding = get_video_embedding(frames)
    except Exception as e:
        # Best effort: report the failure and keep going with the rest.
        print(f"Error processing {video_path}: {e}")
        continue

    video_embeddings.append(embedding)
    embedded_paths.append(video_path)

if not video_embeddings:
    raise RuntimeError("No videos could be processed; cannot build the embeddings database.")

# Skipped videos would otherwise leave video_paths longer than the
# embeddings tensor, so row i would no longer correspond to video_paths[i].
# Rebind video_paths to the embedded videos only to keep them aligned.
video_paths = embedded_paths

# Stack all embeddings into a tensor
video_embeddings_tensor = torch.stack(video_embeddings)

print(f"Created embeddings for {len(video_embeddings)} videos")
print(f"Embeddings shape: {video_embeddings_tensor.shape}")

Processing videos and creating embeddings...


100%|██████████| 3/3 [00:01<00:00,  1.54it/s]

Created embeddings for 3 videos
Embeddings shape: torch.Size([3, 512])





## Video Search Function

Let's implement the search function that finds videos most similar to a text query:

In [10]:
def search_videos(query_text, video_embeddings_tensor, video_paths, top_k=3):
    """Search for the videos most similar to a text query.

    Args:
        query_text: The text query.
        video_embeddings_tensor: 2-D tensor with one video embedding per row.
        video_paths: Sequence of video paths; entry ``i`` must correspond to
            row ``i`` of ``video_embeddings_tensor``.
        top_k: Maximum number of results to return. Values below zero are
            treated as zero.

    Returns:
        A list of ``(video_path, similarity)`` tuples, most similar first.
        The results are also printed.

    Raises:
        ValueError: If ``video_paths`` and ``video_embeddings_tensor`` do
            not contain the same number of videos.
    """
    num_embeddings = video_embeddings_tensor.shape[0]
    if len(video_paths) != num_embeddings:
        # A mismatch would silently map scores to the wrong files.
        raise ValueError(
            f"video_paths has {len(video_paths)} entries but "
            f"video_embeddings_tensor has {num_embeddings} rows"
        )

    text_embedding = get_text_embedding(query_text)

    # The dot product is used as the similarity score; for unit-length
    # embeddings it equals the cosine similarity.
    similarities = torch.matmul(text_embedding, video_embeddings_tensor.T)

    # Clamp top_k so a negative value cannot slice from the end.
    ranked = similarities.argsort(descending=True)[:max(top_k, 0)]
    # Plain Python ints/floats: safe for indexing any sequence and for
    # returning to the caller.
    top_indices = ranked.tolist()
    top_scores = similarities[ranked].tolist()

    print(f"Search results for: '{query_text}'")
    print("-" * 50)

    results = []
    for i, (idx, score) in enumerate(zip(top_indices, top_scores)):
        video_path = video_paths[idx]
        results.append((video_path, score))
        print(f"{i+1}. {video_path} (similarity: {score:.3f})")

    return results

## Try Video Search

Now let's test our video search system with different text queries:

In [11]:
# TODO: Try different search queries
queries = [
    "animal playing",
    "I love motor sports!",
    "airport life",
]

# Run every query against the video index; each result block is preceded
# by a separator line and followed by a blank line.
for query in queries:
    print(f"\n{'=' * 60}")
    results = search_videos(
        query,
        video_embeddings_tensor,
        video_paths,
        top_k=3,
    )
    print()


Search results for: 'animal playing'
--------------------------------------------------
1. videos/dog.mp4 (similarity: 0.233)
2. videos/cross.mp4 (similarity: 0.163)
3. videos/bags.mp4 (similarity: 0.128)


Search results for: 'I love motor sports!'
--------------------------------------------------
1. videos/cross.mp4 (similarity: 0.220)
2. videos/dog.mp4 (similarity: 0.206)
3. videos/bags.mp4 (similarity: 0.159)


Search results for: 'airport life'
--------------------------------------------------
1. videos/bags.mp4 (similarity: 0.300)
2. videos/cross.mp4 (similarity: 0.160)
3. videos/dog.mp4 (similarity: 0.150)



## Final remarks

When implementing a video search system, there are many choices to be made regarding the embeddings (embedding model, frame sampling strategy, similarity metric, ...). Moreover, it is common for real systems to include hybrid searches (metadata + embeddings), searches over multiple embeddings, and additional steps such as re-ranking.