# Extracting Image Embeddings using CLIP

In [1]:
# Make the project root the working directory so `src.*` modules can be imported
import os

# Only step up when the `src` package directory is not already visible from the
# current directory. This keeps the cell idempotent: re-running it (or starting
# the kernel at the project root) no longer climbs above the root.
if not os.path.isdir("src"):
    os.chdir("../")

# Current working directory should now be project root
print("Current working directory:", os.getcwd())

Current working directory: /home/klass/gpt2-image-captioning


In [2]:
# Imports
import torch

from src.embeddings.clip import extract_clip_embeddings, load_clip_model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Compute device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [4]:
# Load the CLIP model and its processor onto the selected device
clip_components = load_clip_model(device=DEVICE)
clip_model, clip_processor = clip_components

Loading CLIP model 'openai/clip-vit-base-patch32' on device: cuda...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Dataset root and the directory that receives the extracted embeddings.
# Both keep a trailing slash because later cells build paths by concatenation.
DATA_DIR = "coco_data/"
OUTPUT_DIR = DATA_DIR + "embeddings/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Settings forwarded to extract_clip_embeddings as batch_size / num_workers
BATCH_SIZE = 64
# Must be 0 on Windows; picked automatically so the notebook runs unmodified there
NUM_WORKERS = 0 if os.name == "nt" else 4

In [6]:
# Train split: read images from train2017/ and write the embeddings file
train_image_dir = f"{DATA_DIR}train2017/"
train_embeddings_path = f"{OUTPUT_DIR}train_clip_embeddings.pt"

extract_clip_embeddings(
    image_dir=train_image_dir,
    output_path=train_embeddings_path,
    clip_model=clip_model,
    clip_processor=clip_processor,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

Starting CLIP embedding extraction for 118287 images...


CLIP Embedding Extraction: 100%|██████████| 1849/1849 [57:33<00:00,  1.87s/it] 


Saving 118287 embeddings to coco_data/embeddings/train_clip_embeddings.pt...


In [7]:
# Validation split: read images from val2017/ and write the embeddings file
val_image_dir = f"{DATA_DIR}val2017/"
val_embeddings_path = f"{OUTPUT_DIR}val_clip_embeddings.pt"

extract_clip_embeddings(
    image_dir=val_image_dir,
    output_path=val_embeddings_path,
    clip_model=clip_model,
    clip_processor=clip_processor,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

Starting CLIP embedding extraction for 5000 images...


CLIP Embedding Extraction: 100%|██████████| 79/79 [02:09<00:00,  1.64s/it]

Saving 5000 embeddings to coco_data/embeddings/val_clip_embeddings.pt...





In [None]:
# Test split: the val2014/ images serve as the test set
# NOTE(review): this cell has no execution count and its saved output reports
# 40670 images, which may predate the switch to val2014/ - re-run to confirm.
test_image_dir = f"{DATA_DIR}val2014/"
test_embeddings_path = f"{OUTPUT_DIR}test_clip_embeddings.pt"

extract_clip_embeddings(
    image_dir=test_image_dir,
    output_path=test_embeddings_path,
    clip_model=clip_model,
    clip_processor=clip_processor,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

Starting CLIP embedding extraction for 40670 images...


CLIP Embedding Extraction: 100%|██████████| 636/636 [14:06<00:00,  1.33s/it]

Saving 40670 embeddings to coco_data/embeddings/test_clip_embeddings.pt...



