# CLIP for image processing

In [1]:
# One-time setup: uncomment to install CLIP and its dependencies into this kernel's environment.
# %pip install ftfy regex tqdm
# %pip install git+https://github.com/openai/CLIP.git

In [1]:
import numpy as np
import torch
from pkg_resources import packaging

# Record the PyTorch build in the notebook output so the run can be reproduced.
print(f"Torch version: {torch.__version__}")

Torch version: 1.10.0+cu113


In [2]:
import clip

# List the model names this installed clip package can load by name
# (one of them is passed to clip.load in the next cell).
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [3]:
# Load the pretrained CLIP model together with its matching image transform.
# Alternative backbone: clip.load("RN101")
model, preprocess = clip.load("ViT-B/32")

# Move the weights to the GPU, then switch to inference mode.
model.cuda()
model.eval()

# Attributes read off the loaded model, reported below.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total parameter count = sum of element counts over every parameter tensor.
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Input resolution: {input_resolution}")
print(f"Context length: {context_length}")
print(f"Vocab size: {vocab_size}")

100%|███████████████████████████████████████| 278M/278M [00:33<00:00, 8.83MiB/s]


Model parameters: 119,688,033
Input resolution: 224
Context length: 77
Vocab size: 49408


In [4]:
# Display the image transform that clip.load returned alongside the model.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7f104e246d30>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [5]:
# Tokenize a sample string to inspect the token-id tensor the tokenizer produces.
clip.tokenize("Hello World!")

tensor([[49406,  3306,  1002,   256, 49407,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)

In [6]:
# Dataset locations. dataset_root keeps its trailing slash, so the derived
# paths are formed by plain string concatenation.
dataset_root = "dataset/amazon_dresses/"
image_dir = f"{dataset_root}images_resized"          # directory of "<name>.jpg" images
docnos_file = f"{dataset_root}dresses_docnos.json"   # JSON file with the image names to process

In [7]:
import json

# Read the collection of image names from the docnos JSON file.
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open until garbage collection.
with open(docnos_file, 'r') as docnos_fh:
    splits = json.load(docnos_fh)

In [8]:
from PIL import Image
import os
from tqdm import tqdm

# Encode every image named in `splits` with the CLIP image encoder.
# Each iteration appends one (1, D) float feature tensor to `image_features`;
# the order of the list follows the iteration order of `splits`.
image_features = []
with torch.no_grad():  # inference only: no autograd graph is needed for any step
    for filename in tqdm(splits):
        # Open through a context manager so the file handle is released
        # deterministically; convert() returns an independent in-memory copy.
        with Image.open(os.path.join(image_dir, filename + '.jpg')) as image_file:
            image = image_file.convert("RGB")
        # Apply the CLIP transform, add a batch dimension, and move to the GPU.
        image_input = preprocess(image).unsqueeze(0).cuda()
        image_feature = model.encode_image(image_input).float()
        image_features.append(image_feature)

100%|██████████| 18501/18501 [07:15<00:00, 42.52it/s]


In [9]:
# Stack the per-image feature tensors into a single tensor along dim 0.
# This cell rebinds `image_features` from a list to a tensor, so the guard makes
# a second execution a no-op instead of calling torch.cat on a tensor.
if isinstance(image_features, (list, tuple)):
    image_features = torch.cat(image_features, dim=0)
image_features.shape

torch.Size([18501, 512])

In [10]:
# Bundle the stacked image features under the key "all" and pickle them to disk.
embeddings_clip = {"all":image_features}

from six.moves import cPickle

# Create the output directory first so the write cannot fail with
# FileNotFoundError after the long feature-extraction run.
output_dir = 'image_features'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the tensor is pickled on whatever device it currently lives on;
# if that is the GPU, loading this file presumably needs a CUDA-capable
# environment -- consider image_features.cpu() if consumers run on CPU. Confirm.
with open(os.path.join(output_dir, 'clip_embedding.p'), 'wb') as f:
    cPickle.dump(embeddings_clip, f)