# Multimodal Search: Text + Images with CLIP

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/multimodal_search.ipynb)

Build a cross-modal search system where you can search images with text and text with images using CLIP embeddings stored in Qdrant.

In [None]:
!pip install -q qdrant-client transformers pillow torch

In [None]:
import numpy as np
import torch
from PIL import Image
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from transformers import CLIPModel, CLIPProcessor

In [None]:
# The model and its processor must come from the same checkpoint, so name it once.
clip_checkpoint = "openai/clip-vit-base-patch32"

# The processor prepares raw text and images as model inputs; the model embeds them.
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
model = CLIPModel.from_pretrained(clip_checkpoint)

# ":memory:" selects the client's local mode (no server needed) -- data is not persisted.
client = QdrantClient(":memory:")

In [None]:
# Both text and images live in the same vector space with CLIP, so a single
# collection with one vector configuration holds both kinds of points.
#
# The existence check keeps this cell re-runnable: calling create_collection
# for a collection that already exists raises an error.
if not client.collection_exists("multimodal"):
    client.create_collection(
        collection_name="multimodal",
        vectors_config=VectorParams(
            # Read the embedding width from the loaded model instead of
            # hard-coding it (512 for clip-vit-base-patch32), so the
            # collection stays consistent if the checkpoint is swapped.
            size=model.config.projection_dim,
            distance=Distance.COSINE,
        ),
    )

In [None]:
# Index some images (synthetic colored squares for demo)
image_items = [
    {"name": "sunset", "color": (255, 100, 50)},
    {"name": "ocean", "color": (30, 100, 200)},
    {"name": "forest", "color": (34, 139, 34)},
    {"name": "snow", "color": (240, 240, 255)},
    {"name": "night", "color": (20, 20, 40)},
]

# Index some text documents too
text_items = [
    "A beautiful sunset over the mountains with orange and red hues",
    "Deep blue ocean waves crashing on a sandy beach",
    "Dense green forest with tall pine trees and moss",
    "Fresh white snow covering a quiet village in winter",
    "Starry night sky with the milky way visible",
]

points = []

# Pure inference: disable autograd so no gradient graph is built for the
# forward passes (the embeddings are only converted to plain lists).
with torch.no_grad():
    # Embed images; point IDs are 0 .. len(image_items) - 1
    for idx, item in enumerate(image_items):
        img = Image.new("RGB", (224, 224), item["color"])
        inputs = processor(images=img, return_tensors="pt")
        # Batch of one -> take row 0 and convert to a list of Python floats
        emb = model.get_image_features(**inputs)[0].tolist()
        points.append(PointStruct(
            id=idx, vector=emb,
            payload={"type": "image", "name": item["name"], "color": list(item["color"])},
        ))

    # Embed text; IDs continue right after the image IDs so they never collide
    for idx, text in enumerate(text_items, start=len(image_items)):
        # truncation=True clips over-long documents to the tokenizer's maximum
        # length instead of letting them fail inside the text encoder
        inputs = processor(text=[text], return_tensors="pt", truncation=True)
        emb = model.get_text_features(**inputs)[0].tolist()
        points.append(PointStruct(
            id=idx, vector=emb,
            payload={"type": "text", "content": text},
        ))

# Upsert overwrites points with the same ID, so re-running this cell is safe
client.upsert(collection_name="multimodal", points=points)
print(f"Indexed {len(image_items)} images + {len(text_items)} texts")

In [None]:
# Text-to-anything search (finds both matching images and text)
query_text = "warm colors like fire"

# Queries are free-form input: truncation=True clips an over-long query to the
# tokenizer's maximum length instead of letting it fail inside the text encoder.
inputs = processor(text=[query_text], return_tensors="pt", truncation=True)

# Inference only -- no gradient graph is needed for a query embedding.
with torch.no_grad():
    query_vec = model.get_text_features(**inputs)[0].tolist()

# No payload filter: the nearest neighbours may be image points, text points, or both.
response = client.query_points(
    collection_name="multimodal",
    query=query_vec,
    limit=5,
)
results = response.points

print(f"Query: '{query_text}'")
for r in results:
    # Image points carry a "name"; text points carry their "content"
    if r.payload["type"] == "image":
        label = f"[IMAGE] {r.payload['name']}"
    else:
        label = f"[TEXT] {r.payload['content'][:60]}"
    print(f"  Score: {r.score:.4f} | {label}")

In [None]:
# Image-to-anything search
query_img = Image.new("RGB", (224, 224), (0, 50, 150))  # dark blue
inputs = processor(images=query_img, return_tensors="pt")

# Inference only -- no gradient graph is needed for a query embedding.
with torch.no_grad():
    query_vec = model.get_image_features(**inputs)[0].tolist()

# Same collection and same vector space as the text query: an image embedding
# can retrieve both image points and text points.
response = client.query_points(
    collection_name="multimodal",
    query=query_vec,
    limit=5,
)
results = response.points

print("Query: [dark blue image]")
for r in results:
    # Image points carry a "name"; text points carry their "content"
    if r.payload["type"] == "image":
        label = f"[IMAGE] {r.payload['name']}"
    else:
        label = f"[TEXT] {r.payload['content'][:60]}"
    print(f"  Score: {r.score:.4f} | {label}")