### Example code
https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_clip.ipynb

In [5]:
import torch
from PIL import Image
import open_clip

# Run on the GPU when one is available; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model = model.to(device)
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# The inputs must live on the same device as the model.
image = preprocess(Image.open("../data/CLIP.png")).unsqueeze(0).to(device)
text = tokenizer(["a diagram", "a dog", "a cat"]).to(device)

# Mixed precision is only requested when actually running on CUDA. Hard-coding
# autocast("cuda") while everything sits on the CPU does nothing useful and
# makes PyTorch warn on machines without a GPU.
with torch.no_grad(), torch.autocast(device_type=device, enabled=(device == "cuda")):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # L2-normalise both sides so the matrix product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Scale the similarities by 100 and turn them into a distribution over the prompts.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

Label probs: tensor([[9.9950e-01, 4.1210e-04, 8.5325e-05]])




If the model uses timm image encoders (convnext, siglip, eva, etc.), ensure the latest timm is installed. Upgrade timm if you see 'Unknown model' errors for the image encoder.

If the model uses `transformers` tokenizers, ensure `transformers` is installed.

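See also this [Clip Colab](https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_clip.ipynb).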

To compute billions of embeddings efficiently, you can use clip-retrieval, which has OpenCLIP support.

In [7]:
import open_clip

# Display the (model name, pretrained tag) pairs known to open_clip. A pair such as
# ('ViT-B-32', 'laion2b_s34b_b79k') supplies the two values passed to
# create_model_and_transforms (model name and `pretrained`).
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_s13m_b4k'),
 ('ViT-

In [8]:
# `pretrained` also accepts local paths (a pretrained tag is used here).
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
# A freshly created model is in train mode again, so switch it back to eval mode
# before the inference cells below use it. Assigning the result keeps the module
# repr from being echoed as the cell output.
model = model.eval()

In [12]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model's weights onto the chosen device.
model = model.to(device)

print(device)

cpu


In [13]:
def get_image_embedding(image_path: str) -> torch.Tensor:
    """Encode one image file into an L2-normalised embedding vector.

    Relies on the notebook-level ``model``, ``preprocess`` and ``device``
    objects created in earlier cells.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The image embedding as a CPU tensor with the batch axis removed.
        It is scaled to unit L2 norm, so the dot product of two embeddings
        equals their cosine similarity.
    """
    # Open inside a context manager so the file handle is released even if
    # decoding fails. convert() returns a loaded copy, so the result stays
    # usable after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Preprocess (resize, normalise, ...) and add the batch axis the encoder expects.
    image_tensor = preprocess(image).unsqueeze(0).to(device)

    # Extract the embedding without tracking gradients.
    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        # Normalise to unit length (needed for cosine-similarity comparisons).
        image_features /= image_features.norm(dim=-1, keepdim=True)

    # Drop only the batch axis added above; an explicit dim never collapses other axes.
    return image_features.cpu().squeeze(0)


In [16]:
# Demo: embed one sample image. The guard passes when this cell is run in the
# notebook (the recorded output below shows the prints executing).
if __name__ == "__main__":
    # Embed a single image with the helper defined in the previous cell.
    embedding = get_image_embedding("../data/images/dataset/bear.jpg")
    
    # Sanity check: print the vector's shape and its first five components
    # (the Korean label in the second message means "first 5").
    print("Embedding shape:", embedding.shape)
    print("Embedding (앞 5개):", embedding[:5])

Embedding shape: torch.Size([512])
Embedding (앞 5개): tensor([-0.0314,  0.0639, -0.1324, -0.0225,  0.0143])


In [17]:
import torch
import torch.nn.functional as F

def compute_cosine_similarity(embedding1: torch.Tensor, embedding2: torch.Tensor) -> float:
    """Return the cosine similarity of two embedding vectors as a Python float.

    Args:
        embedding1: First embedding vector.
        embedding2: Second embedding vector.

    Returns:
        The cosine similarity of the two vectors; values closer to 1 mean
        the embeddings are more alike.
    """
    # Give each vector a leading batch axis so the reduction runs over axis 1.
    batched_first = embedding1[None]
    batched_second = embedding2[None]
    score = F.cosine_similarity(batched_first, batched_second, dim=1)
    # The result holds exactly one element; unwrap it into a plain float.
    return score.item()

In [20]:
# Demo: compare two sample images. The guard passes when this cell is run in
# the notebook (the recorded output below shows the print executing).
if __name__ == "__main__":
    # Embed the two example images that will be compared.
    emb1 = get_image_embedding("../data/images/dataset/ex1.jpeg")
    emb2 = get_image_embedding("../data/images/dataset/ex2.jpeg")

    # Score how alike the two embeddings are and print it with four decimal places.
    similarity = compute_cosine_similarity(emb1, emb2)
    print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.8189
