# GRADIO IMAGE RETRIEVAL APP (CLIP + VIDEO)

Task Description:
In this project, your goal is to build a system that performs image retrieval from a video based on a natural language query. You must use a Vision Transformer (ViT) or a multi-modal language model like CLIP to accomplish this.
Your system should accept a text input from the user (such as "a man riding a bicycle" or "a dog playing with a ball") and return the most relevant frame(s) from the input video that match the query.


# Imports and Loads

In [1]:
!pip install yt-dlp

Collecting yt-dlp
  Downloading yt_dlp-2025.12.8-py3-none-any.whl.metadata (180 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 180.3/180.3 kB 5.6 MB/s eta 0:00:00
Downloading yt_dlp-2025.12.8-py3-none-any.whl (3.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 40.3 MB/s eta 0:00:00
Installing collected packages: yt-dlp
Successfully installed yt-dlp-2025.12.8


In [9]:
import gradio as gr
import cv2
import os
import torch
import yt_dlp
import tempfile
import numpy as np
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, MarianMTModel, MarianTokenizer

# LOAD CLIP MODEL (Arabic queries supported via translation to English)

In [10]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# CLIP ViT-L/14 image/text encoder and its matching pre-processor.
# NOTE(review): this OpenAI checkpoint is presumably English-centric rather
# than multilingual -- Arabic queries are handled by translating them to
# English with the Marian model loaded below before they reach CLIP.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Load Arabic to English translation model (tokenizer + seq2seq model on the
# same device as CLIP).
translator_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
translator_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-ar-en").to(device)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# TRANSLATION FUNCTION

In [11]:
def is_arabic(text):
    """Return True if any character of *text* lies in U+0600..U+06FF."""
    for ch in text:
        # U+0600..U+06FF is the basic Arabic Unicode block.
        if '\u0600' <= ch <= '\u06FF':
            return True
    return False


def translate_arabic_to_english(text):
    """Return an English rendering of *text* when it contains Arabic.

    Text without Arabic characters is handed back unchanged. Translation is
    best-effort: if the tokenizer or the model raises, the failure is printed
    and the original text is returned instead.
    """
    if is_arabic(text):
        try:
            encoded = translator_tokenizer(text, return_tensors="pt", padding=True).to(device)
            generated = translator_model.generate(**encoded)
            # Only one input sentence is passed in, so take the first sequence.
            translation = translator_tokenizer.decode(generated[0], skip_special_tokens=True)
            print(f"Translated '{text}' → '{translation}'")
            return translation
        except Exception as e:
            print(f"Translation failed: {e}, using original text")
    return text

# DOWNLOAD YOUTUBE VIDEO

In [12]:
def download_youtube(url):
    """Download one video with yt-dlp into a fresh temporary directory.

    Args:
        url: Address of the video to fetch.

    Returns:
        Path of the downloaded file. It lives inside a newly created
        ``yt_``-prefixed temporary directory; the caller owns that directory
        and should delete it once the file is no longer needed.

    Raises:
        Exception: If yt-dlp fails for any reason. The original error is
            chained as ``__cause__`` and the temporary directory is removed.
    """
    import shutil  # local import: only needed for cleanup on failure

    tmpdir = tempfile.mkdtemp(prefix="yt_")
    opts = {
        "format": "best[ext=mp4]/best",
        "outtmpl": os.path.join(tmpdir, "video.%(ext)s"),
        "quiet": False,
        "no_warnings": False,
        # A watch URL that also carries a playlist id would otherwise make
        # yt-dlp fetch the whole playlist; we only ever want the one video.
        "noplaylist": True,
    }

    try:
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info)
    except Exception as e:
        # Do not leave a half-filled temp directory behind on failure.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise Exception(f"Failed to download video from YouTube: {str(e)}") from e

# EXTRACT FRAMES




In [13]:
def extract_frames(video_path, step_seconds=1):
    """Sample frames from a video at a fixed time interval.

    Args:
        video_path: Path of the video file to read.
        step_seconds: Time between two sampled frames, in seconds. Must be
            positive. Intervals shorter than one frame sample every frame.

    Returns:
        List of ``PIL.Image`` objects in RGB order, one per sampled frame,
        starting with the first frame of the video.

    Raises:
        ValueError: If ``step_seconds`` is not positive.
        Exception: If the video cannot be opened or yields no frames.
    """
    if step_seconds <= 0:
        raise ValueError(f"step_seconds must be positive, got {step_seconds!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise Exception(f"Failed to open video: {video_path}")

        # Some containers report 0, NaN or otherwise unusable FPS values;
        # fall back to 25 fps in every such case.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not 0 < fps < float("inf"):
            fps = 25

        # Never let the stride reach 0 -- ``idx % 0`` would raise
        # ZeroDivisionError when fps * step_seconds < 1.
        step = max(1, int(fps * step_seconds))

        frames = []
        idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if idx % step == 0:
                # OpenCV decodes to BGR; PIL (and CLIP's processor) expect RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))

            idx += 1
    finally:
        # Release the capture handle even if decoding/conversion fails.
        cap.release()

    if not frames:
        raise Exception("No frames extracted from video")

    return frames

# EMBEDDING FUNCTIONS

In [14]:
def embed_images(frames, batch_size=16):
    """Encode frames with the CLIP image tower and L2-normalise the result.

    Args:
        frames: Non-empty sequence of images accepted by the CLIP processor
            (the rest of this file passes ``PIL.Image`` objects).
        batch_size: Number of frames sent through the model per forward pass.
            Defaults to 16, the value that was previously hard-coded.

    Returns:
        NumPy array built by stacking the per-batch feature matrices
        row-wise, so rows follow the order of ``frames``. Every row is
        divided by its own norm, which lets a dot product with a normalised
        text embedding act as cosine similarity.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``frames`` is empty.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")
    if len(frames) == 0:
        # np.vstack([]) would fail with a far less helpful message.
        raise ValueError("embed_images() needs at least one frame")

    image_embs = []
    with torch.no_grad():
        for start in range(0, len(frames), batch_size):
            batch = frames[start:start + batch_size]
            inputs = processor(images=batch, return_tensors="pt").to(device)
            feats = model.get_image_features(**inputs)
            feats = feats / feats.norm(dim=-1, keepdim=True)
            # Move each batch to host memory right away to keep GPU usage flat.
            image_embs.append(feats.cpu().numpy())
    return np.vstack(image_embs)


def embed_text(text):
    """Encode a text query with the CLIP text tower.

    Arabic input is first translated to English (best-effort) so that the
    query is in the language passed to CLIP elsewhere in this file.

    Args:
        text: The user's query string.

    Returns:
        1-D NumPy array: the L2-normalised text embedding of the query.
    """
    # Translate Arabic to English if needed
    translated_text = translate_arabic_to_english(text)

    # truncation=True clips over-long queries to the tokenizer's maximum
    # length instead of letting the model fail on too many tokens.
    # NOTE(review): assumes the checkpoint's tokenizer defines that maximum
    # (77 tokens for the OpenAI CLIP checkpoints) -- confirm.
    inputs = processor(
        text=[translated_text],
        return_tensors="pt",
        truncation=True,
    ).to(device)
    with torch.no_grad():
        feats = model.get_text_features(**inputs)
        feats = feats / feats.norm(dim=-1, keepdim=True)
    # A single query was encoded, so return its one row.
    return feats.cpu().numpy()[0]

# SEARCH FUNCTION

In [15]:
def search(video_file, youtube_url, query, top_k):
    """Find the frames of a video that best match a text query.

    The YouTube URL takes precedence over the uploaded file when both are
    given. Frames are sampled from the video, embedded with CLIP, and ranked
    by cosine similarity against the embedded query.

    Args:
        video_file: Path of an uploaded video, or a falsy value if none.
        youtube_url: URL to download the video from, or empty/None.
        query: Natural-language search text (Arabic is translated first).
        top_k: How many frames to return. Coerced to an integer of at least 1,
            because UI sliders may deliver the value as a float.

    Returns:
        ``(results, status)``. ``results`` is a list of
        ``(PIL.Image, "Score: x.xxx")`` tuples ordered best-first, or ``None``
        on a validation problem or error. ``status`` is a human-readable
        message. Exceptions are reported through ``status``, never raised.
    """
    import shutil  # local import: only used to clean up downloaded videos

    if not query or query.strip() == "":
        return None, "⚠️ Please enter a search query"

    download_dir = None  # temp directory to delete once frames are in memory
    try:
        # Determine the input source
        if youtube_url and youtube_url.strip():
            print("⏳ Downloading video from YouTube...")
            video_path = download_youtube(youtube_url.strip())
            download_dir = os.path.dirname(video_path)
        elif video_file:
            video_path = video_file
        else:
            return None, "⚠️ Please upload a video or provide a YouTube URL"

        # 1) Extract frames
        print(f"⏳ Extracting frames from: {video_path}")
        frames = extract_frames(video_path)
        print(f"✅ Extracted {len(frames)} frames")

        # 2) CLIP embeddings
        print("⏳ Computing similarities...")
        img_embs = embed_images(frames)
        txt_emb = embed_text(query)

        # 3) Cosine similarity (both sides are already L2-normalised)
        sims = img_embs @ txt_emb
        # A float top_k would make the slice below raise TypeError.
        k = max(1, int(top_k))
        best_idx = sims.argsort()[::-1][:k]

        # 4) Return top images
        results = [(frames[i], f"Score: {sims[i]:.3f}") for i in best_idx]
        status = f"✅ Found {len(results)} matching frames out of {len(frames)} total frames"

        return results, status

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg

    finally:
        # The sampled frames are held in memory as PIL images, so the
        # downloaded file is no longer needed. Only remove directories that
        # carry the downloader's "yt_" prefix, never a user-supplied path.
        if download_dir and os.path.basename(download_dir).startswith("yt_"):
            shutil.rmtree(download_dir, ignore_errors=True)

# GRADIO UI

In [16]:
# Build the Gradio interface. Components are created in display order inside
# the Blocks context, so the statement order below defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 Image Retrieval from Video using CLIP")
    gr.Markdown("Upload a video from your device **or** provide a YouTube URL")

    # Two alternative video sources side by side; search() prefers the URL
    # when both are filled in.
    with gr.Row():
        video_input = gr.Video(label="📁 Upload Video")
        youtube_input = gr.Textbox(
            label="🔗 Or YouTube URL",
            placeholder="https://www.youtube.com/watch?v=..."
        )

    # Free-text query; the placeholder shows English and Arabic examples.
    query = gr.Textbox(
        label="🔍 Search Query",
        placeholder="e.g., girl singing, car driving, sunset, تفاح, بنت بتغني..."
    )
    # Number of frames to return: 1 to 10 in steps of 1, default 3.
    top_k = gr.Slider(1, 10, value=3, step=1, label="Number of Results (Top-K)")

    btn = gr.Button("🚀 Search", variant="primary")

    # Outputs: a non-interactive status line and a gallery for the frames.
    status = gr.Textbox(label="📊 Status", interactive=False)
    gallery = gr.Gallery(label="🖼️ Retrieved Frames", columns=3, rows=1)

    # search() returns (results, status); the order of `outputs` must match.
    btn.click(
        fn=search,
        inputs=[video_input, youtube_input, query, top_k],
        outputs=[gallery, status]
    )

# Start the app with Gradio's default launch settings.
# NOTE(review): the recorded notebook output shows Gradio switching on a
# public share link automatically in this hosted-notebook environment.
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://41265a539bdce7a1c3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


