In [1]:
import os

import pandas as pd
import numpy as np
import torch
import clip
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

In [2]:
# Load the model
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# clip.load hands back the network together with the image transform that is
# applied to every PIL image before it is fed to the network.
model, preprocess = clip.load("ViT-B/32", device=device)

In [2]:
data_root = "../../data/CUB_200_2011"
cub_dir = os.path.join(data_root, "CUB_200_2011")


def _read_cub_table(*relative_path, names):
    """Read one header-less, space-separated CUB metadata file.

    Args:
        *relative_path: Path components below ``cub_dir``.
        names: Column names to assign, in file order.

    Returns:
        A DataFrame with one row per line of the file.
    """
    return pd.read_csv(os.path.join(cub_dir, *relative_path), sep=" ", names=names)


# Load image data
images = _read_cub_table("images.txt", names=["image_id", "filepath"])
image_class_labels = _read_cub_table("image_class_labels.txt", names=["image_id", "class_id"])
train_test_split = _read_cub_table("train_test_split.txt", names=["image_id", "is_training_image"])
classes = _read_cub_table("classes.txt", names=["class_id", "class_name"])

# One row per image: file path, class id, split flag and raw class name.
data = (
    images
    .merge(image_class_labels, on="image_id")
    .merge(train_test_split, on="image_id")
    .merge(classes, on="class_id")
)
# Keep only the rows whose is_training_image flag is 0. The explicit .copy()
# makes `data` an independent frame, so the column assignment below does not
# write into a slice of the merged table (pandas' SettingWithCopyWarning).
data = data[data.is_training_image == 0].copy()
# A raw name of the form "<prefix>.<Some_Name>" becomes "some name".
data["class_name"] = (
    data.class_name.str.split(".").str[1].str.lower().str.replace("_", " ", regex=False)
)

# Load attribute data
image_attribute_labels = _read_cub_table(
    "attributes", "image_attribute_labels.txt",
    names=["image_id", "attribute_id", "is_present", "certainty_id", "time"],
)
attributes = _read_cub_table(
    "attributes", "attributes.txt",
    names=["attribute_id", "attribute_name"],
)
# Each attribute name is "<template>::<label>"; split it and turn underscores
# into spaces so both parts can be dropped straight into a text prompt.
attributes_info = [attr.split("::") for attr in attributes.attribute_name]
attributes_info = np.array(
    [[attr.replace("_", " "), label.replace("_", " ")] for attr, label in attributes_info]
)
attributes["attribute_template"] = attributes_info[:, 0]
attributes["attribute_label"] = attributes_info[:, 1]
# From here on `attributes` holds one row per (image, attribute) annotation,
# enriched with the template/label text of the attribute.
attributes = image_attribute_labels.merge(attributes, on="attribute_id")
unique_attributes = attributes.attribute_template.unique()

# CUB Concept Classification

In [221]:
# Zero-shot concept classification - 312 retrieval task
#
# For every evaluated image and every attribute group (attribute_template) with
# at least one label marked is_present, CLIP scores one prompt per candidate
# label of that group. The softmax score and rank of the annotated label are
# recorded next to the top-1 prediction.
topk = 1

# Number of images to evaluate. Kept at 1 for a quick smoke test; set to
# len(data) to evaluate every row of `data`.
num_eval_images = 1

image_id_list, class_id_list, attribute_list, certainty_list = [], [], [], []
label_value_list, label_rank_list, pred_list, label_list = [], [], [], []

# Normalised text features keyed by the exact tuple of prompts, so a prompt set
# is encoded only once even when it recurs for later images.
text_feature_cache = {}

for row_id in tqdm(range(min(num_eval_images, len(data)))):
    # Prepare image inputs
    image_id, class_id, image_name = data.iloc[row_id][["image_id", "class_id", "filepath"]]
    image_path = os.path.join(data_root, "CUB_200_2011", "images", image_name)
    # The context manager closes the file once preprocessing has read the pixels.
    with Image.open(image_path) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)
    image_attributes = attributes[attributes.image_id == image_id]

    # The image embedding does not depend on the attribute group, so it is
    # computed (and normalised) once per image rather than once per group.
    with torch.no_grad():
        image_features = model.encode_image(image_input)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    num_presented_attributes = 0
    for uni_attr in unique_attributes:
        curr_attr_df = image_attributes[image_attributes.attribute_template == uni_attr]

        # Only evaluate when an attribute is presented in the image
        if not curr_attr_df.is_present.any():
            continue
        num_presented_attributes += 1

        # Position, within this group, of the first label marked present.
        # NOTE(review): when several labels of a group are present only the
        # first one is scored - confirm this is the intended protocol.
        curr_attr_label = int(np.where(curr_attr_df.is_present)[0][0])

        # Prepare text inputs: one prompt per candidate label of the group
        text_inputs_raw = [
            "a photo of bird whose {} is {}".format(attr.replace("has ", ""), label)
            for attr, label in zip(curr_attr_df["attribute_template"], curr_attr_df["attribute_label"])
        ]

        # Multiclass concept classification
        # Calculate (or reuse) the normalised text features for these prompts
        cache_key = tuple(text_inputs_raw)
        text_features = text_feature_cache.get(cache_key)
        if text_features is None:
            text_inputs = clip.tokenize(text_inputs_raw).to(device)
            with torch.no_grad():
                text_features = model.encode_text(text_inputs)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            text_feature_cache[cache_key] = text_features

        # Rank all candidate labels by similarity to the image
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].sort(descending=True)
        pred = indices[0].item()
        label_value = similarity[0][curr_attr_label].item()
        label_rank = (indices == curr_attr_label).nonzero()[0][0].item() + 1  # 1-based rank
        certainty = curr_attr_df.certainty_id.mean()

        image_id_list.append(image_id)
        class_id_list.append(class_id)
        attribute_list.append(uni_attr)
        label_value_list.append(label_value)
        label_rank_list.append(label_rank)
        pred_list.append(pred)
        label_list.append(curr_attr_label)
        certainty_list.append(certainty)

100%|███████████████████████████████████████| 1/1 [00:12<00:00, 12.05s/it]


In [219]:
# Tabulate the per-(image, attribute group) results collected by the loop:
# one row per evaluated attribute group, columns in the order listed here.
concept_columns = [
    ("image_id", image_id_list),
    ("class_id", class_id_list),
    ("attribute", attribute_list),
    ("pred", pred_list),
    ("label", label_list),
    ("label_value", label_value_list),
    ("label_rank", label_rank_list),
    ("certainty_id", certainty_list),
]
pd.DataFrame(dict(concept_columns))

Unnamed: 0,image_id,class_id,attribute,pred,label,label_value,label_rank,certainty_id
0,1,1,has bill shape,4,4,0.766961,1,3.0
1,1,1,has head pattern,8,3,0.121434,2,3.0
2,1,1,has throat color,7,14,0.053571,7,4.0
3,1,1,has eye color,4,1,0.125507,2,4.0
4,1,1,has bill length,0,1,0.271769,3,4.0
5,1,1,has forehead color,12,12,0.152225,1,4.0
6,1,1,has nape color,12,14,0.055571,8,4.0
7,1,1,has size,3,0,0.171008,4,2.0
8,1,1,has shape,5,2,0.098761,3,2.0
9,1,1,has primary color,5,14,0.037261,11,3.0


# CUB Concept classification - to-be-removed

In [None]:
# Zero-shot concept classification - 312 retrieval task
#
# Binary variant: for every attribute row of an image, CLIP chooses between a
# negative ("is not ...") and a positive ("is ...") prompt. The chosen index
# (0 = negative, 1 = positive) is stored as the prediction and the is_present
# annotation as the label.
topk = 1

# Number of images to evaluate. Kept small for a quick smoke test; set to
# len(data) to evaluate every row of `data`.
num_eval_images = 2

image_id_list, class_id_list, attribute_id_list, certainty_id_list = [], [], [], []
value_list, pred_list, label_list = [], [], []

for row_id in tqdm(range(min(num_eval_images, len(data)))):
    # Prepare image inputs
    image_id, class_id, image_name = data.iloc[row_id][["image_id", "class_id", "filepath"]]
    image_path = os.path.join(data_root, "CUB_200_2011", "images", image_name)
    # The context manager closes the file once preprocessing has read the pixels.
    with Image.open(image_path) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)
    image_attributes = attributes[attributes.image_id == image_id]

    # Prepare text inputs: one [negative, positive] prompt pair per attribute
    # row of THIS image, so prompts line up one-to-one with the labels below.
    text_inputs_raw = [
        [
            "a photo of bird whose {} is not {}".format(attr.replace("has ", ""), label),
            "a photo of bird whose {} is {}".format(attr.replace("has ", ""), label),
        ]
        for attr, label in zip(image_attributes["attribute_template"], image_attributes["attribute_label"])
    ]

    # The image embedding is the same for every prompt pair: encode it once.
    with torch.no_grad():
        image_features = model.encode_image(image_input)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    curr_value, curr_pred = [], []
    for text in text_inputs_raw:
        # Binary concept classification
        # Tokens must live on the same device as the model.
        text_inputs = clip.tokenize(text).to(device)
        with torch.no_grad():
            text_features = model.encode_text(text_inputs)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Pick the more similar of the two prompts for the image
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        value, pred = similarity[0].topk(topk)
        curr_value.append(value.item())
        curr_pred.append(pred.item())

    # One entry per image; each entry holds one value per attribute row.
    image_id_list.append([image_id] * len(text_inputs_raw))
    class_id_list.append([class_id] * len(text_inputs_raw))
    attribute_id_list.append(image_attributes.attribute_id.to_numpy())
    value_list.append(curr_value)
    pred_list.append(curr_pred)
    label_list.append(image_attributes.is_present.to_numpy())
    certainty_id_list.append(image_attributes.certainty_id.to_numpy())

In [None]:
# Flatten the per-image result lists into one long table with a row per
# (image, attribute) pair; columns keep the order given here.
binary_columns = {
    "image_id": image_id_list,
    "class_id": class_id_list,
    "attribute_id": attribute_id_list,
    "value": value_list,
    "pred": pred_list,
    "label": label_list,
    "certainty_id": certainty_id_list,
}
pd.DataFrame(
    {name: np.array(per_image).flatten() for name, per_image in binary_columns.items()}
)

# CUB Class Classification

In [225]:
# Zero-shot bird species classification
#
# Every evaluated image is compared against one prompt per class; the class of
# the most similar prompt is the prediction.
topk = 1

# Number of images to evaluate. Kept small for a quick smoke test; set to
# len(data) to evaluate every row of `data`.
num_eval_images = 2

label = data.class_id
image_id_list, value_list, pred_list = [], [], []

# One row per distinct class, in order of first appearance in `data`. Keeping
# class_id next to class_name lets a prompt index be translated back to its
# class_id explicitly instead of assuming the two are offset by one.
class_table = data[["class_id", "class_name"]].drop_duplicates().reset_index(drop=True)

# Prepare text inputs
text_inputs_raw = [f"the bird is {class_name}" for class_name in class_table.class_name]
text_inputs = clip.tokenize(text_inputs_raw).to(device)

# The class prompts are identical for every image: encode and normalise once.
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

for row_id in tqdm(range(min(num_eval_images, len(data)))):
    # Prepare image inputs
    image_id, image_name = data.iloc[row_id][["image_id", "filepath"]]
    image_path = os.path.join(data_root, "CUB_200_2011", "images", image_name)
    # The context manager closes the file once preprocessing has read the pixels.
    with Image.open(image_path) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)

    # Calculate image features
    with torch.no_grad():
        image_features = model.encode_image(image_input)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    # Pick k most similar labels for the image
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    value, pred = similarity[0].topk(topk)

    image_id_list.append(image_id)
    value_list.append(value.item())
    pred_list.append(pred.item())

# Translate prompt indices into class_id values so predictions are directly
# comparable with `label`.
prompt_class_ids = class_table.class_id.to_numpy()
pred_list = prompt_class_ids[np.array(pred_list, dtype=int)]

100%|███████████████████████████████████████| 1/1 [00:14<00:00, 14.80s/it]


In [231]:
# Assemble the species-classification results, one row per evaluated image.
# The loop walks `data` positionally from row 0, so the matching ground-truth
# labels are the first len(image_id_list) entries of `label`; slicing by that
# count keeps the columns aligned whatever the loop limit was.
num_evaluated = len(image_id_list)
df = pd.DataFrame(
    {
        "image_id": np.array(image_id_list).flatten(),
        "value": np.array(value_list).flatten(),
        "pred": np.array(pred_list).flatten(),
        "label": label.iloc[:num_evaluated].to_numpy(),
    }
)
df

Unnamed: 0,image_id,value,pred,label
0,1,0.64262,1,1
