##### Notes

# Data Exploration

### Read & Split the Dataset

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("Cats&Dogs_Dataset.csv")

# splitting the dataset into training and testing 80:20
from sklearn.model_selection import train_test_split

# Number of training rows kept per class after the split.
# Raise this value to work with more training images.
N_TRAIN_PER_CLASS = 2

# Select rows by the class name embedded in the "image" value. The match is a
# literal substring (regex=False), and rows with a missing value are excluded
# (na=False) instead of producing a NaN entry in the boolean mask.
df_cat = df[df["image"].str.contains("cat", regex=False, na=False)]
df_dog = df[df["image"].str.contains("dog", regex=False, na=False)]

# Split each class 80:20 on its own; random_state pins the shuffle so the
# split is reproducible.
train_cat, test_cat = train_test_split(df_cat, test_size=0.2, random_state=1)
train_dog, test_dog = train_test_split(df_dog, test_size=0.2, random_state=1)

# Keep only the first N_TRAIN_PER_CLASS rows of each training split.
train_cat = train_cat.head(N_TRAIN_PER_CLASS)
train_dog = train_dog.head(N_TRAIN_PER_CLASS)

### Load CLIP

In [None]:
import torch
import clip 

# Show the model names offered by the installed clip package as the cell output
# (presumably the names accepted by clip.load, e.g. "ViT-B/32" used below).
clip.available_models()

In [None]:
# Load model once at the beginning
# model, preprocess = clip.load("ViT-B/32", device="cpu") 
# model.eval() # set the model to evaluation mode (this does not move it to the GPU)

In [None]:
# Load the pretrained CLIP model together with its matching image
# preprocessing pipeline.
#
# The earlier `model = torch.load('model.pt')` statement was removed:
#   * its result was overwritten by the clip.load call on the next line,
#   * no cell writes 'model.pt' (the torch.save line was commented out), so
#     the load failed on a fresh kernel,
#   * torch.load relies on pickle, which is unsafe for untrusted files.
model, preprocess = clip.load("ViT-B/32", device="cpu")

## Setting up input images

In [None]:
import os 
import skimage as sk #image processing library
import matplotlib.pyplot as plt
from PIL import Image #image processing library

# Text labels used as plot titles and as CLIP text prompts:
# index 0 is the cat label, index 1 is the dog label.
# A list (not a set) keeps the order fixed and allows indexing by position.
descriptions = [
    "C",
    "D",
]

In [None]:
original_images = []
processed_images = []
descriptions = ["C", "D"]  # index 0 titles cat pictures, index 1 titles dog pictures
plt.figure(figsize=(16, 16))

# Walk over four sample pictures: even positions are cats, odd positions are dogs.
for idx in range(4):
    is_cat = idx % 2 == 0
    stem = ('cat.' if is_cat else 'dog.') + str(idx)

    # Keep both the untouched picture and its CLIP-preprocessed tensor.
    picture = Image.open('./Cats&Dogs_Pics/' + stem + '.jpg')
    original_images.append(picture)
    processed_images.append(preprocess(picture))

    # One panel per picture, titled with the label of its class.
    plt.subplot(1, 4, idx + 1)
    plt.imshow(picture)
    plt.title(descriptions[0] if is_cat else descriptions[1])
    plt.axis("off")

plt.tight_layout()
plt.show()


In [None]:
from PIL import Image

# Building Features
IMAGE_DIR = './Cats&Dogs_Pics/'


def _load_image_batch(image_names):
    """Preprocess every image named in `image_names` and stack the results.

    Each name is appended to IMAGE_DIR, opened with PIL, passed through the
    `preprocess` callable returned by clip.load, and the resulting tensors are
    stacked along a new first dimension.
    """
    batch = []
    for name in image_names:
        # The context manager closes the file once the tensor has been built.
        with Image.open(IMAGE_DIR + name) as img:
            batch.append(preprocess(img))
    return torch.stack(batch)


image_input_train_cat = _load_image_batch(train_cat['image'])
image_input_train_dog = _load_image_batch(train_dog['image'])

# One text prompt per training image. The label has to be wrapped in a list
# before repeating it: `"C" * n` builds the single string "CC..." rather than
# n separate prompts.
text_tokens_train_cat = clip.tokenize([descriptions[0]] * len(train_cat))
text_tokens_train_dog = clip.tokenize([descriptions[1]] * len(train_dog))

# Encode text and images with gradient tracking disabled: this is inference
# only, so no autograd graph is needed for either encoder.
with torch.no_grad():
    text_features_train_cat = model.encode_text(text_tokens_train_cat).float()
    text_features_train_dog = model.encode_text(text_tokens_train_dog).float()
    image_features_train_cat = model.encode_image(image_input_train_cat).float()
    image_features_train_dog = model.encode_image(image_input_train_dog).float()


In [None]:
def _unit_rows(features):
    """Return a detached copy of `features` with each row scaled to unit L2 norm."""
    features = features.detach()
    return features / features.norm(dim=-1, keepdim=True)


def _hide_frame():
    """Remove ticks and the surrounding spines from the current axes."""
    plt.xticks([])
    plt.yticks([])
    for side in ["left", "top", "right", "bottom"]:
        plt.gca().spines[side].set_visible(False)


def _plot_image_strip(images, title):
    """Draw `images` side by side (one unit wide each) on the current axes."""
    for i, image in enumerate(images):
        plt.imshow(image, extent=(i - 0.5, i + 0.5, -0.6, 0.6), origin="upper")
    if len(images) > 0:
        # Set the view explicitly so every tile is shown, not only the one
        # that was drawn last.
        plt.xlim(-0.5, len(images) - 0.5)
        plt.ylim(-0.6, 0.6)
    plt.title(title, size=20)
    _hide_frame()


def _plot_similarity_matrix(matrix, title):
    """Draw `matrix` as a heat map on the current axes and print each value in its cell."""
    plt.imshow(matrix, vmin=0.1, vmax=0.3, cmap='viridis', origin="upper")
    plt.title(title, size=20)
    for x in range(matrix.shape[1]):
        for y in range(matrix.shape[0]):
            plt.text(x, y, f"{matrix[y, x]:.2f}", ha="center", va="center", size=12)
    _hide_frame()


def calculate_and_visualize_combined_cosine_similarity(image_features_cat, text_features_cat, image_features_dog, text_features_dog, descriptions, original_images):
    """Plot text-to-image cosine similarities for the cat and the dog features.

    The figure is a 2x2 grid: the top row shows the cat images and the cat
    similarity matrix, the bottom row shows the same for dogs. In each matrix
    the rows are text features and the columns are image features.

    Parameters
    ----------
    image_features_cat, image_features_dog : torch.Tensor
        Image feature rows, one per image.
    text_features_cat, text_features_dog : torch.Tensor
        Text feature rows, one per prompt.
    descriptions : sequence of str
        Kept for backward compatibility; not used by the plot.
    original_images : sequence
        Images to display, cat images first and dog images directly after
        them, in the same order as the rows of the image features.

    Returns
    -------
    None
        The figure is shown with plt.show().
    """
    # Normalise on detached copies so the caller's tensors are left untouched.
    image_features_cat = _unit_rows(image_features_cat)
    text_features_cat = _unit_rows(text_features_cat)
    image_features_dog = _unit_rows(image_features_dog)
    text_features_dog = _unit_rows(text_features_dog)

    # With unit-length rows the dot product is the cosine similarity.
    similarity_matrix_cat = text_features_cat.numpy() @ image_features_cat.numpy().T
    similarity_matrix_dog = text_features_dog.numpy() @ image_features_dog.numpy().T

    # The number of images per class comes from the features themselves, not
    # from the length of the label string.
    count_cat = image_features_cat.shape[0]
    count_dog = image_features_dog.shape[0]

    plt.figure(figsize=(20, 14))

    plt.subplot(2, 2, 1)
    _plot_image_strip(original_images[:count_cat], "Original Images - Cats")

    plt.subplot(2, 2, 2)
    _plot_similarity_matrix(similarity_matrix_cat, "Cosine Similarity Matrix - Cats")

    plt.subplot(2, 2, 3)
    _plot_image_strip(original_images[count_cat:count_cat + count_dog], "Original Images - Dogs")

    plt.subplot(2, 2, 4)
    _plot_similarity_matrix(similarity_matrix_dog, "Cosine Similarity Matrix - Dogs")

    plt.tight_layout()
    plt.show()

# Visualize Combined Cosine Similarity Matrix for Cats and Dogs
# The features were computed from the train_cat / train_dog images, so those
# same images are displayed (cats first, then dogs).
train_images = [
    Image.open('./Cats&Dogs_Pics/' + img_path)
    for img_path in list(train_cat['image']) + list(train_dog['image'])
]
calculate_and_visualize_combined_cosine_similarity(image_features_train_cat, text_features_train_cat, image_features_train_dog, text_features_train_dog, descriptions, train_images)


In [None]:
# Example Evaluation (You need to modify this based on your specific labels)
# Zero-shot classification of the images encoded above: every image gets the
# label whose text prompt it is most similar to (0 = cat, 1 = dog).
# NOTE(review): this scores the training subset because those are the only
# features computed in this notebook; encode test_cat / test_dog the same way
# for a held-out evaluation.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# One text feature row per class prompt, in the order of `descriptions`.
with torch.no_grad():
    class_text_features = model.encode_text(clip.tokenize(descriptions)).float()
class_text_features = class_text_features / class_text_features.norm(dim=-1, keepdim=True)

# Stack cat rows first, then dog rows, and normalise copies of them.
eval_image_features = torch.cat([image_features_train_cat, image_features_train_dog]).detach()
eval_image_features = eval_image_features / eval_image_features.norm(dim=-1, keepdim=True)

# Rows are images, columns are classes; the best-matching column is the prediction.
similarity = (eval_image_features @ class_text_features.T).numpy()
predictions = similarity.argmax(axis=1)

# Ground truth follows the stacking order above: cats (0) first, then dogs (1).
ground_truth = np.concatenate([
    np.zeros(len(image_features_train_cat), dtype=int),
    np.ones(len(image_features_train_dog), dtype=int),
])

# labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted;
# zero_division=0 avoids warnings when a score's denominator is zero.
conf_matrix = confusion_matrix(ground_truth, predictions, labels=[0, 1])
accuracy = accuracy_score(ground_truth, predictions)
precision = precision_score(ground_truth, predictions, zero_division=0)
recall = recall_score(ground_truth, predictions, zero_division=0)
f1 = f1_score(ground_truth, predictions, zero_division=0)

print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


## Notes

To generate text descriptions for images using the CLIP model, you would typically follow these steps:

1. Load the image and preprocess it to the format expected by the CLIP model.
2. Use `model.encode_image(image:Tensor)` to convert the image into a feature vector.
3. Generate a set of candidate descriptions (these could be completely random, or they could be based on some prior knowledge).
4. Use `clip.tokenize` to convert these descriptions into the format expected by the CLIP model.
5. Use `model.encode_text` to convert these tokenized descriptions into feature vectors.
6. Compare the image feature vector to the description feature vectors to find the best match. The description corresponding to the closest-matching feature vector is the model's generated description of the image.

The commented-out code cell at the end of this section is a Python snippet that demonstrates these steps.


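The code cell below assumes that you have the CLIP model and the necessary libraries (torch, torchvision, PIL) installed. If not, you can install them with pip, for example: `pip install torch torchvision pillow git+https://github.com/openai/CLIP.git`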


In [None]:
# import torch
# from PIL import Image
# from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
# from clip import clip, tokenize

# # Load the image
# image_path = "path_to_your_image.jpg"
# image = Image.open(image_path)

# # Preprocess the image
# preprocess = Compose([
#     Resize(256), 
#     CenterCrop(224), 
#     ToTensor(), 
#     Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
# ])
# image = preprocess(image)
# image = image.unsqueeze(0)  # add batch dimension

# # Load the CLIP model
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model, transform = clip.load("ViT-B/32", device=device)

# # Encode the image to get a feature vector
# image_features = model.encode_image(image.to(device))

# # Generate some candidate descriptions
# descriptions = ["A cat on a sofa", "A dog in a park", "A group of people at the beach"]

# # Tokenize and encode the descriptions to get feature vectors
# description_tokens = clip.tokenize(descriptions).to(device)
# description_features = model.encode_text(description_tokens)

# # Compare the image feature vector to the description feature vectors
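# # The dot product between two L2-normalized vectors is their cosine similarity.
# # The features here are not normalized; divide each by its norm first for a true cosine score.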
# similarities = (image_features @ description_features.T).softmax(dim=-1)

# # Get the index of the most similar description
# best_match_index = similarities.argmax(dim=-1).item()

# # Print the best-matching description
# print(f"Generated description: {descriptions[best_match_index]}")