In [2]:
import torch
from PIL import Image
import open_clip
import ipyplot
import matplotlib.pyplot as plt

from matplotlib.pyplot import figure, imshow, axis
from matplotlib.image import imread

# Print tensors with three decimal places and without scientific notation so
# the probability / similarity outputs below are easy to compare by eye.
torch.set_printoptions(precision=3, sci_mode=False)

In [3]:
def showImagesHorizontally(list_of_files):
    """Draw the given image files next to each other in a single-row figure.

    Args:
        list_of_files: Sequence of image file paths; each one is read with
            ``matplotlib.image.imread``.

    Returns:
        None. A new matplotlib figure is created with one axis-less subplot
        per file.
    """
    fig = figure()
    column_count = len(list_of_files)
    for position, path in enumerate(list_of_files, start=1):
        # Draw on the subplot explicitly instead of relying on pyplot's
        # implicit "current axes" state.
        ax = fig.add_subplot(1, column_count, position)
        ax.imshow(imread(path))
        ax.axis('off')

# Load Model

In [4]:
# Show the model names open_clip knows about (used to pick the names below).
open_clip.list_models()

['coca_base',
 'coca_roberta-ViT-B-32',
 'coca_ViT-B-32',
 'coca_ViT-L-14',
 'convnext_base',
 'convnext_base_w',
 'convnext_base_w_320',
 'convnext_large',
 'convnext_large_d',
 'convnext_large_d_320',
 'convnext_small',
 'convnext_tiny',
 'convnext_xlarge',
 'convnext_xxlarge',
 'convnext_xxlarge_320',
 'EVA01-g-14',
 'EVA01-g-14-plus',
 'EVA02-B-16',
 'EVA02-E-14',
 'EVA02-E-14-plus',
 'EVA02-L-14',
 'EVA02-L-14-336',
 'mt5-base-ViT-B-32',
 'mt5-xl-ViT-H-14',
 'nllb-clip-base',
 'nllb-clip-base-siglip',
 'nllb-clip-large',
 'nllb-clip-large-siglip',
 'RN50',
 'RN50-quickgelu',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'RN101',
 'RN101-quickgelu',
 'roberta-ViT-B-32',
 'swin_base_patch4_window7_224',
 'ViT-B-16',
 'ViT-B-16-plus',
 'ViT-B-16-plus-240',
 'ViT-B-16-quickgelu',
 'ViT-B-16-SigLIP',
 'ViT-B-16-SigLIP-256',
 'ViT-B-16-SigLIP-384',
 'ViT-B-16-SigLIP-512',
 'ViT-B-16-SigLIP-i18n-256',
 'ViT-B-32',
 'ViT-B-32-256',
 'ViT-B-32-plus-256',
 'ViT-B-32-quickgelu',
 'ViT-bigG-14',
 'ViT-

In [5]:
# Smaller CLIP model: ViT-B-32 with the LAION-2B checkpoint.
# create_model_and_transforms returns three values; the third one is kept as
# the preprocessing transform and the second one is discarded.
model_b, _, preprocess_b = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
# The notebook only runs inference, so put the model in eval mode explicitly.
model_b.eval()
tokenizer_b = open_clip.get_tokenizer('ViT-B-32')

In [6]:
# Larger CLIP model: ViT-L-14-336 with the OpenAI checkpoint.
# create_model_and_transforms returns three values; the third one is kept as
# the preprocessing transform and the second one is discarded.
model_l, _, preprocess_l = open_clip.create_model_and_transforms('ViT-L-14-336', pretrained='openai')
# The notebook only runs inference, so put the model in eval mode explicitly.
model_l.eval()
# Request the tokenizer that belongs to this model rather than the ViT-B-32 one.
tokenizer_l = open_clip.get_tokenizer('ViT-L-14-336')

In [7]:
# Candidate captions that every image is scored against; tokenised once per
# model tokenizer.
class_prompts = ["a diagram", "a lobster", "a cat"]
text_b, text_l = tokenizer_b(class_prompts), tokenizer_l(class_prompts)

# Show Images

<div style="display: flex; flex-direction: row;">
    <img src="./experiment_data/lobster_grayscale_0.png" style="width: 200px; height: auto; margin-right: 10px;">
    <img src="./experiment_data/lobster_blue_0.png" style="width: 200px; height: auto; margin-right: 10px;">
    <img src="./experiment_data/lobster_red_0.png" style="width: 200px; height: auto; margin-right: 10px;">
</div>

In [8]:
def _lobster_variants(view):
    """Return the grayscale, blue and red image paths for one view index."""
    return [
        f"./experiment_data/lobster_{colour}_{view}.png"
        for colour in ("grayscale", "blue", "red")
    ]


# One list per view index (0, 1, 2), each holding the three colour variants
# in the order grayscale, blue, red.
lobster_0_files, lobster_1_files, lobster_2_files = (
    _lobster_variants(view) for view in range(3)
)

# showImagesHorizontally(lobster_0_files)

# Lobster (Top)

In [9]:
# Preview the grayscale, blue and red variants listed in lobster_0_files.
ipyplot.plot_images(lobster_0_files, max_images=3, img_width=250)

# Lobster (Front)

In [10]:
# Preview the grayscale, blue and red variants listed in lobster_1_files.
ipyplot.plot_images(lobster_1_files, max_images=3, img_width=250)

# Lobster (Bottom)

In [11]:
# Preview the grayscale, blue and red variants listed in lobster_2_files.
ipyplot.plot_images(lobster_2_files, max_images=3, img_width=250)

In [12]:
# Score each image in lobster_0_files against the text prompts with model_b.

# The prompts are identical for every image, so encode and L2-normalise them
# once instead of once per loop iteration.
with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model_b.encode_text(text_b)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for filepath in lobster_0_files:
    # Release the file handle as soon as the image has been preprocessed.
    with Image.open(filepath) as pil_image:
        image = preprocess_b(pil_image).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model_b.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Both feature sets are unit-length, so the dot product already is the
        # cosine similarity; no further division by the norms is needed.
        cos_sim = image_features @ text_features.T
        # Scale by 100 before the softmax to turn similarities into label
        # probabilities.
        text_probs = (100.0 * cos_sim).softmax(dim=-1)

    print("Image: ", filepath.split("/")[-1])
    print(f"Label probs: {text_probs}")
    print(f"Cosine similarity: {cos_sim}")
    print()

Image:  lobster_grayscale_0.png
Label probs: tensor([[    0.015,     0.985,     0.000]])
Cosine similarity: tensor([[0.226, 0.268, 0.169]])

Image:  lobster_blue_0.png
Label probs: tensor([[    0.003,     0.997,     0.000]])
Cosine similarity: tensor([[0.232, 0.292, 0.157]])

Image:  lobster_red_0.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.235, 0.319, 0.164]])



In [13]:
# Score each image in lobster_1_files against the text prompts with model_b.

# The prompts are identical for every image, so encode and L2-normalise them
# once instead of once per loop iteration.
with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model_b.encode_text(text_b)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for filepath in lobster_1_files:
    # Release the file handle as soon as the image has been preprocessed.
    with Image.open(filepath) as pil_image:
        image = preprocess_b(pil_image).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model_b.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Both feature sets are unit-length, so the dot product already is the
        # cosine similarity; no further division by the norms is needed.
        cos_sim = image_features @ text_features.T
        # Scale by 100 before the softmax to turn similarities into label
        # probabilities.
        text_probs = (100.0 * cos_sim).softmax(dim=-1)

    print("Image: ", filepath.split("/")[-1])
    print(f"Label probs: {text_probs}")
    print(f"Cosine similarity: {cos_sim}")
    print()

Image:  lobster_grayscale_1.png
Label probs: tensor([[    0.002,     0.998,     0.000]])
Cosine similarity: tensor([[0.216, 0.280, 0.151]])

Image:  lobster_blue_1.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.197, 0.288, 0.132]])

Image:  lobster_red_1.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.196, 0.322, 0.136]])



In [14]:
# Score each image in lobster_2_files against the text prompts with model_b.

# The prompts are identical for every image, so encode and L2-normalise them
# once instead of once per loop iteration.
with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model_b.encode_text(text_b)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for filepath in lobster_2_files:
    # Release the file handle as soon as the image has been preprocessed.
    with Image.open(filepath) as pil_image:
        image = preprocess_b(pil_image).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model_b.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Both feature sets are unit-length, so the dot product already is the
        # cosine similarity; no further division by the norms is needed.
        cos_sim = image_features @ text_features.T
        # Scale by 100 before the softmax to turn similarities into label
        # probabilities.
        text_probs = (100.0 * cos_sim).softmax(dim=-1)

    print("Image: ", filepath.split("/")[-1])
    print(f"Label probs: {text_probs}")
    print(f"Cosine similarity: {cos_sim}")
    print()

Image:  lobster_grayscale_2.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.207, 0.313, 0.177]])

Image:  lobster_blue_2.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.180, 0.317, 0.137]])

Image:  lobster_red_2.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.198, 0.328, 0.148]])



# CLIP: ViT-L-14-336 (Larger model)

In [15]:
# Score each image in lobster_0_files against the text prompts with model_l.

# Use the tokens produced by this model's own tokenizer (text_l), and encode
# and L2-normalise them once since they are the same for every image.
with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model_l.encode_text(text_l)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for filepath in lobster_0_files:
    # Release the file handle as soon as the image has been preprocessed.
    with Image.open(filepath) as pil_image:
        image = preprocess_l(pil_image).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model_l.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Both feature sets are unit-length, so the dot product already is the
        # cosine similarity; no further division by the norms is needed.
        cos_sim = image_features @ text_features.T
        # Scale by 100 before the softmax to turn similarities into label
        # probabilities.
        text_probs = (100.0 * cos_sim).softmax(dim=-1)

    print("Image: ", filepath.split("/")[-1])
    print(f"Label probs: {text_probs}")
    print(f"Cosine similarity: {cos_sim}")
    print()

Image:  lobster_grayscale_0.png
Label probs: tensor([[0.020, 0.968, 0.011]])
Cosine similarity: tensor([[0.168, 0.207, 0.162]])

Image:  lobster_blue_0.png
Label probs: tensor([[0.015, 0.971, 0.014]])
Cosine similarity: tensor([[0.153, 0.195, 0.152]])

Image:  lobster_red_0.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.149, 0.237, 0.154]])



In [16]:
# Score each image in lobster_1_files against the text prompts with model_l.

# Use the tokens produced by this model's own tokenizer (text_l), and encode
# and L2-normalise them once since they are the same for every image.
with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model_l.encode_text(text_l)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for filepath in lobster_1_files:
    # Release the file handle as soon as the image has been preprocessed.
    with Image.open(filepath) as pil_image:
        image = preprocess_l(pil_image).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model_l.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Both feature sets are unit-length, so the dot product already is the
        # cosine similarity; no further division by the norms is needed.
        cos_sim = image_features @ text_features.T
        # Scale by 100 before the softmax to turn similarities into label
        # probabilities.
        text_probs = (100.0 * cos_sim).softmax(dim=-1)

    print("Image: ", filepath.split("/")[-1])
    print(f"Label probs: {text_probs}")
    print(f"Cosine similarity: {cos_sim}")
    print()

Image:  lobster_grayscale_1.png
Label probs: tensor([[0.008, 0.989, 0.002]])
Cosine similarity: tensor([[0.160, 0.208, 0.148]])

Image:  lobster_blue_1.png
Label probs: tensor([[0.003, 0.996, 0.002]])
Cosine similarity: tensor([[0.156, 0.215, 0.151]])

Image:  lobster_red_1.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.143, 0.226, 0.140]])



In [17]:
# Score each image in lobster_2_files against the text prompts with model_l.

# Use the tokens produced by this model's own tokenizer (text_l), and encode
# and L2-normalise them once since they are the same for every image.
with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model_l.encode_text(text_l)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for filepath in lobster_2_files:
    # Release the file handle as soon as the image has been preprocessed.
    with Image.open(filepath) as pil_image:
        image = preprocess_l(pil_image).unsqueeze(0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model_l.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Both feature sets are unit-length, so the dot product already is the
        # cosine similarity; no further division by the norms is needed.
        cos_sim = image_features @ text_features.T
        # Scale by 100 before the softmax to turn similarities into label
        # probabilities.
        text_probs = (100.0 * cos_sim).softmax(dim=-1)

    print("Image: ", filepath.split("/")[-1])
    print(f"Label probs: {text_probs}")
    print(f"Cosine similarity: {cos_sim}")
    print()

Image:  lobster_grayscale_2.png
Label probs: tensor([[    0.000,     0.999,     0.000]])
Cosine similarity: tensor([[0.161, 0.239, 0.156]])

Image:  lobster_blue_2.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.139, 0.237, 0.142]])

Image:  lobster_red_2.png
Label probs: tensor([[    0.000,     1.000,     0.000]])
Cosine similarity: tensor([[0.141, 0.250, 0.152]])

