In [1]:
import pickle
import numpy as np
import torch
from tqdm import tqdm

## Load the `laion/CLIP-ViT-H-14-laion2B-s32B-b79K` model

In [5]:
"""
    Use the `transformers` library to instantiate and get inference scores for different variants of the `clip-vit` family of zero-shot-classification models
@ref https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images/
"""
from PIL import Image
from transformers import AutoProcessor, CLIPModel, AutoTokenizer

# Model variant: checkpoint name passed to every `from_pretrained` call below,
# so model, processor and tokenizer all come from the same checkpoint.
model_variant = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"

# Load model, tokeniser, and processor.
# `model` and `processor` are module-level names that the helper functions in
# this notebook read directly; `tokenizer` is loaded here but is not referenced
# by any of the cells visible in this notebook.
model = CLIPModel.from_pretrained(model_variant)
processor = AutoProcessor.from_pretrained(model_variant)
tokenizer = AutoTokenizer.from_pretrained(model_variant)

# Get image/text similarity softmax output
def image_text_relevance(image_path:str, text_choices:list[str]):
    """Score how well each candidate text matches an image.

    Args:
        image_path: Path of the image file to open with PIL.
        text_choices: Candidate captions/labels to compare against the image.

    Returns:
        The softmax over ``logits_per_image`` along ``dim=1``, i.e. one
        probability per entry of ``text_choices``. The tensor lives on the
        model's device and carries no autograd history.

    Notes:
        Reads the module-level ``processor`` and ``model``.
    """
    # Use a context manager so the underlying file handle is released as soon
    # as the processor has converted the pixels into tensors.
    with Image.open(image_path) as img:
        inputs = processor(
            text = text_choices,
            images = img,
            return_tensors = "pt",
            padding = True
        )

    # The model may have been moved (e.g. to a GPU) after loading; the inputs
    # must live on the same device or the forward pass fails.
    inputs = inputs.to(model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    return logits_per_image.softmax(dim=1)  # normalise into label probabilities

# Get image feature vector
def image_features(image_path:str):
    """Compute the CLIP image embedding for a single image file.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The tensor returned by ``model.get_image_features`` for this image,
        located on the model's device and without autograd history.
        (Presumably shape ``(1, 1024)`` for this checkpoint - TODO confirm.)

    Notes:
        Reads the module-level ``processor`` and ``model``.
    """
    # Use a context manager so the underlying file handle is released as soon
    # as the processor has converted the pixels into tensors.
    with Image.open(image_path) as img:
        # Only image preprocessing is needed here; the text-oriented `padding`
        # flag and the `device` kwarg are dropped because the batch is moved
        # to the model's device explicitly below.
        inputs = processor(
            images = img,
            return_tensors = "pt"
        )

    # Follow the model wherever it currently lives instead of hard-coding a
    # CUDA device, so the helper also works on CPU or on another GPU.
    inputs = inputs.to(model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        features = model.get_image_features(**inputs)
    return features

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [6]:
# Move the model's parameters to CUDA device 0 (requires a CUDA-capable GPU).
# `.to()` is the last expression of the cell, so the returned module's repr is
# what gets displayed as the cell output.
model.to("cuda:0")

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 1024)
      (position_embedding): Embedding(77, 1024)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-23): 24 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), e

## Load the input `.png` files

In [7]:
import glob

# `glob.glob` returns matches in arbitrary, filesystem-dependent order; sorting
# makes the processing order (and the key order of anything built from this
# list) reproducible between runs.
all_png_files = sorted(
    glob.glob("/kaggle/input/pointnet-merged/original-image-individual/original-image-individual/rendered-original/*.png")
)
print(f"Total number of examples: {len(all_png_files)}")

Total number of examples: 2468


## Method to get 1024-dim features 

In [8]:
def global_features(img_path):
    """Return the model's feature tensor for the image at ``img_path``.

    Thin convenience wrapper: the work is delegated to ``image_features``
    and its result is handed back unchanged.
    """
    return image_features(img_path)

## Generate embeddings for ModelNet

In [9]:
from tqdm import tqdm
import numpy as np

# Maps a file identifier (file name without the "_image.png" suffix) to the
# CPU copy of that image's feature tensor.
all_model_outputs = {}

# Feature extraction never needs gradients; disabling autograd avoids building
# a computation graph for every image.
with torch.no_grad():
    for full_file_name in tqdm(all_png_files):
        # Get file name identifier
        filtered_file_name = full_file_name.split("/")[-1].replace("_image.png","")

        # Get output tensor and move it off the accelerator so results
        # accumulate in host memory
        feat = global_features(full_file_name)
        feat_cpu = feat.detach().cpu()

        # Save model outputs (the first file seen for an identifier wins)
        if filtered_file_name not in all_model_outputs:
            all_model_outputs[filtered_file_name] = feat_cpu

100%|██████████| 2468/2468 [04:39<00:00,  8.83it/s]


## Save the embeddings

In [11]:
# Sanity check: every input file must have produced its own entry. A mismatch
# means at least two files were reduced to the same identifier.
assert len(all_model_outputs) == len(all_png_files), (
    f"Expected {len(all_png_files)} embeddings but collected {len(all_model_outputs)}; "
    "some files may map to the same identifier"
)

In [12]:
import pickle

# Serialise the identifier -> feature mapping into a binary pickle file in the
# current working directory.
with open("clip_image_global_features_full.pkl", "wb") as pkl_file:
    pickle.dump(all_model_outputs, pkl_file)