# Zero-shot classification Evaluation

In [155]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_0.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_12.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_11.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_16.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_8.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_19.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_22.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_20.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_34.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_26.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_10.npy
/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-set-np/testmin_27.npy
/kaggl

## Candidate choices
- https://huggingface.co/openai/clip-vit-base-patch32 
- https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
- https://huggingface.co/openai/clip-vit-large-patch14
- https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K

In [156]:
sample_image_path = "/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-img/testmin_10_image.png"

In [157]:
"""
    Use the `transformers` library to instantiate and get inference scores for different variants of the `clip-vit` family of zero-shot-classification models
@ref https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images/
"""
from PIL import Image
from transformers import AutoProcessor, CLIPModel, AutoTokenizer

# Model variant
model_variant = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"

# Load model, tokeniser, and processor
model = CLIPModel.from_pretrained(model_variant)
processor = AutoProcessor.from_pretrained(model_variant)
tokenizer = AutoTokenizer.from_pretrained(model_variant)

# Get image/text similarity softmax output
def image_text_relevance(image_path:str, text_choices:list[str]):
    global processor, model
    img = Image.open(image_path)
    inputs = processor(
        text = text_choices,
        images = img,
        return_tensors = "pt",
        padding = True
    )

    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
    return probs

# Get image feature vector
def image_features(image_path:str):
    global model, processor
    img = Image.open(image_path)
    inputs = processor(
        images = image,
        return_tensors = "pt",
        padding = True
    )
    image_features = model.get_image_features(**inputs) # image features
    return image_features

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [158]:
X = image_text_relevance(sample_image_path, ["A model of a cup", "A model of a ball", "A model of a train"])

In [159]:
X

tensor([[9.9977e-01, 2.3415e-04, 1.4951e-09]], grad_fn=<SoftmaxBackward0>)

## Construct text prompts

In [163]:
label_to_names = {0: 'airplane',
       1: 'bathtub',
       2: 'bed',
       3: 'bench',
       4: 'bookshelf',
       5: 'bottle',
       6: 'bowl',
       7: 'car',
       8: 'chair',
       9: 'cone',
       10: 'cup',
       11: 'curtain',
       12: 'desk',
       13: 'door',
       14: 'dresser',
       15: 'flower_pot',
       16: 'glass_box',
       17: 'guitar',
       18: 'keyboard',
       19: 'lamp',
       20: 'laptop',
       21: 'mantel',
       22: 'monitor',
       23: 'night_stand',
       24: 'person',
       25: 'piano',
       26: 'plant',
       27: 'radio',
       28: 'range_hood',
       29: 'sink',
       30: 'sofa',
       31: 'stairs',
       32: 'stool',
       33: 'table',
       34: 'tent',
       35: 'toilet',
       36: 'tv_stand',
       37: 'vase',
       38: 'wardrobe',
       39: 'xbox'
}

In [164]:
all_text_prompts = []

for i in range(40):
    base_prompt = label_to_names[i]
    full_prompt = f"A model of {base_prompt}"
    all_text_prompts.append(full_prompt)

## Metrics

In [165]:
def top_1_individual(img_file_path, text_prompts):
    # Find the file index from the full file path
    def find_index_from_filename(img_file_path):
        # sample_image_path = "/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-img/testmin_10_image.png"
        base_file_path = img_file_path.split("/")[-1].replace(".png","")
        index = int(base_file_path.split("_")[1])
        return index
    
    # Find the argmax in the output of the softmax function
    def find_index_softmax(img_file_path, text_prompts):
        # Get the model outputs
        X = image_text_relevance(img_file_path, text_prompts)
        # Get the argmax
        y = torch.argmax(X)
        index = y.detach().cpu()
        return index
    
    # Comparison cell
    index_1 = find_index_from_filename(img_file_path)
    index_2 = find_index_softmax(img_file_path, text_prompts)
    
    if index_1 is None or index_2 is None:
        return 0
    
    if int(index_1) == int(index_2):
        return 1
    else:
        return 0    

In [166]:
import glob
from tqdm import tqdm
all_input_images_test = (glob.glob("/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-img/*.png"))

total_score = 0
for img in tqdm(all_input_images_test):
    total_score += top_1_individual(img, all_text_prompts)
    
print(total_score)

100%|██████████| 40/40 [02:00<00:00,  3.00s/it]

12





In [167]:
total_score/40

0.3

In [168]:
def top_k_individual(img_file_path, text_prompts, k):
    # Find the file index from the full file path
    def find_index_from_filename(img_file_path):
        # sample_image_path = "/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-img/testmin_10_image.png"
        base_file_path = img_file_path.split("/")[-1].replace(".png","")
        index = int(base_file_path.split("_")[1])
        return index
    
    # Find the k argmax indices in the output of the softmax function
    def find_index_softmax(img_file_path, text_prompts):
        # Get the model outputs
        X = image_text_relevance(img_file_path, text_prompts)
        # Get the argmax
        y = torch.topk(X, int(k))
        z = np.array(y.indices.detach().cpu())[0]
        return z.tolist()
    
    # Comparison cell
    index_1 = find_index_from_filename(img_file_path)
    index_2 = find_index_softmax(img_file_path, text_prompts)
    
    if index_1 is None or index_2 is None:
        return 0
    
    if int(index_1) in index_2:
        return 1
    else:
        return 0    

In [171]:
import glob
from tqdm import tqdm
all_input_images_test = (glob.glob("/kaggle/input/modelnet-minimal/minimal-test-set/minimal-test-img/*.png"))

total_score = 0
for img in tqdm(all_input_images_test):
    total_score += top_k_individual(img, all_text_prompts, 3)
    
print(total_score)

100%|██████████| 40/40 [02:05<00:00,  3.13s/it]

16





In [172]:
total_score/40

0.4