https://www.pinecone.io/learn/clip-image-search/

**Configs**

In [1]:
max_context_length = 77
clip_model_vers = "openai/clip-vit-base-patch32"
kscore = [1, 5, 10, 20, 50, 100]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
!pip -qqq install transformers torch datasets
!pip3 -qqq install gdcm
!pip3 -qqq install pydicom
!pip -qqq install faiss-gpu
!pip -qqq install pinecone-client

In [None]:
import os
import faiss
import torch
import skimage
import requests
import pinecone
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
import IPython.display
import matplotlib.pyplot as plt
from datasets import load_dataset
from collections import OrderedDict
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

In [None]:
!unzip "drive/MyDrive/Text_Essential/sbu_downloaded_shuffled.zip"

**Load Captions and construct Caption_Dataframe**

In [6]:
caption_file_path = "drive/MyDrive/Text_Essential/Dataset_Captions.csv"
df = pd.read_csv(caption_file_path)

In [7]:
df

Unnamed: 0,title,image_id
0,Spider rock dominates the landscape at canyon...,78
1,Taken in scotiabank theater where they have s...,102
2,Old Pepsi machine beside a building in Rockfo...,185
3,boat ride in the lake,423
4,"The girl in the red is my niece, Emily. I don...",785
...,...,...
7995,"Looking through the glass in the train, while...",999025
7996,Mean boy in black aint' no queer,999029
7997,frog bin and wildflowers by liverpool footbal...,999378
7998,Must get the pink flowers growing in front of...,999560


**Load Image and construct Image_Dataframe**

In [8]:
images_id = []
image_path = []
for images in os.listdir("sbu_downloaded"):
  images_id.append(images.replace(".jpg", ""))
  image_path.append(f"sbu_downloaded/{images}")

In [9]:
_cols = {"image_id": images_id,
        "image_path": image_path}

image_df = pd.DataFrame(_cols)

In [10]:
image_df

Unnamed: 0,image_id,image_path
0,127688,sbu_downloaded/127688.jpg
1,578367,sbu_downloaded/578367.jpg
2,23619,sbu_downloaded/23619.jpg
3,906538,sbu_downloaded/906538.jpg
4,61144,sbu_downloaded/61144.jpg
...,...,...
7995,808897,sbu_downloaded/808897.jpg
7996,482671,sbu_downloaded/482671.jpg
7997,536489,sbu_downloaded/536489.jpg
7998,665963,sbu_downloaded/665963.jpg


**Perform NER to Title, becoming "NER-Title"**

In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_trf
!pip install spacy-transformers
!pip install keyphrase_vectorizers
!pip install keybert

In [12]:
import spacy
from spacy import displacy
import spacy_transformers

# Load English model
nlp = spacy.load("en_core_web_trf")

In [13]:
def identifyNERAndCombine(sentence):
  sentence = sentence.lower()
  ner_dict = {}
  doc = nlp(sentence)
  for ent in doc.ents:
      if ent.text not in ner_dict:
          ner_dict[ent.text] = ent.label_
  for word, ner in ner_dict.items():
    sentence = sentence.replace(word, f"{word} ({ner})")
  return sentence

In [14]:
# Apply the conversion function to each caption in the DataFrame
df['ner_title'] = df['title'].apply(lambda x: identifyNERAndCombine(x))

**Load CLIP model**

In [15]:
def get_model_info(model_ID, device):
  # Save the model to device
	model = CLIPModel.from_pretrained(model_ID).to(device)
 	# Get the processor
	processor = CLIPProcessor.from_pretrained(model_ID)
  # Get the tokenizer
	tokenizer = CLIPTokenizer.from_pretrained(model_ID)
  # Return model, processor & tokenizer
	return model, processor, tokenizer

# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Define the model ID
model_ID = clip_model_vers
# Get model, processor & tokenizer
model, processor, tokenizer = get_model_info(model_ID, device)

**Text-Embedding: Convert "Title" & "NER-Title" to 2 different Text-Embeddings**

In [16]:
def get_single_text_embedding(text_list):
  inputs = tokenizer(text_list, return_tensors = "pt", padding="max_length", truncation=True, max_length=max_context_length)
  text_embeddings = model.get_text_features(**inputs.to(device))
  # convert the embeddings to numpy array
  embedding_as_np = text_embeddings.cpu().detach().numpy()
  embedding_as_np = np.expand_dims(embedding_as_np, axis=1)
  return embedding_as_np

def get_all_text_embeddings(df):
  df['title_embeddings'], df['ner_title_embeddings'] = zip(*df.apply(lambda x: get_single_text_embedding([x.title, x.ner_title]), axis=1))
  return df

df = get_all_text_embeddings(df)

In [17]:
df

Unnamed: 0,title,image_id,ner_title,title_embeddings,ner_title_embeddings
0,Spider rock dominates the landscape at canyon...,78,spider rock dominates the landscape at canyon...,"[[-0.19251586, 0.038147885, 0.071399026, -0.13...","[[-0.13344195, 0.06701599, 0.02025596, -0.0666..."
1,Taken in scotiabank theater where they have s...,102,taken in scotiabank theater (FAC) where they ...,"[[0.16504101, -0.11080879, -0.24759056, -0.144...","[[0.26642585, -0.19346175, -0.14683522, -0.148..."
2,Old Pepsi machine beside a building in Rockfo...,185,old pepsi (ORG) machine beside a building in ...,"[[0.39722258, 0.13342771, 0.3347901, 0.1302333...","[[0.2855332, 0.20317626, 0.31337717, 0.1598594..."
3,boat ride in the lake,423,boat ride in the lake,"[[0.25573865, 0.04942262, 0.24811772, -0.01462...","[[0.25573865, 0.04942262, 0.24811772, -0.01462..."
4,"The girl in the red is my niece, Emily. I don...",785,"the girl in the red is my niece, emily (PERSO...","[[0.063320726, -0.21294405, -0.48721588, 0.160...","[[0.03422323, -0.15437979, -0.4250669, 0.18321..."
...,...,...,...,...,...
7995,"Looking through the glass in the train, while...",999025,"looking through the glass in the train, while...","[[0.09465354, 0.4204727, -0.03252518, 0.325041...","[[0.09465354, 0.4204727, -0.03252518, 0.325041..."
7996,Mean boy in black aint' no queer,999029,mean boy in black aint' no queer,"[[0.3465486, 0.20587496, 0.15962212, 0.0173307...","[[0.3465486, 0.20587496, 0.15962212, 0.0173307..."
7997,frog bin and wildflowers by liverpool footbal...,999378,frog bin and wildflowers by liverpool footbal...,"[[-0.3164907, -0.44918257, 0.11903918, -0.3421...","[[-0.3179527, -0.3701128, 0.1833227, -0.394836..."
7998,Must get the pink flowers growing in front of...,999560,must get the pink flowers growing in front of...,"[[-0.20206104, -0.3833797, 0.13510478, -0.5714...","[[-0.20206104, -0.3833797, 0.13510478, -0.5714..."


**Image-Embedding**

In [18]:
# Convert Image to RBG File
import cv2

# Function to convert an image from BGR to RGB
def convert_to_rgb(image_path):
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return image_rgb

# # Apply the conversion function to each row in the DataFrame
image_df['image_rgb'] = image_df['image_path'].apply(lambda x: convert_to_rgb(x))

In [19]:
# Convert RGB_File to Image_Embedding
def get_single_image_embedding(my_image):
  image = processor(
    text = None,
		images = my_image,
		return_tensors="pt"
  )["pixel_values"].to(device)
  embedding = model.get_image_features(image)
  # convert the embeddings to numpy array
  embedding_as_np = embedding.cpu().detach().numpy()
  return embedding_as_np

def get_all_images_embedding(df, img_column):
  df["img_embeddings"] = df[str(img_column)].apply(get_single_image_embedding)
  return df

image_df = get_all_images_embedding(image_df, "image_rgb")

**Matching Text To Image (Retrieve most similar Top-100 image_id)**

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_N_images_id(image_id, cos_sim, top_K):
  # Map image_id to the cos_sim score
  image_id_cos_sim_dict = dict(zip(map(int, image_id), cos_sim))
  # Sort the image_id desc according to their cos_sim
  most_similar_images = sorted(image_id_cos_sim_dict.items(), key=lambda x: x[1], reverse=True)[:top_K]
  # Extract the image_id from the top N items
  most_similar_images_ids = [img[0] for img in most_similar_images]
  return most_similar_images_ids


# Text to image Search
def get_similar_images(query_embed, new_col, image_df, top_K=100):
  _text_embeddings = np.array(query_embed.tolist())  # Shape: (8500, 1, 512)
  _image_embeddings = np.array(image_df["img_embeddings"].tolist())  # Shape: (8500, 1, 512)
  # Reduce the dimension of embeddings to Shape: (8500, 512)
  _text_embeddings = _text_embeddings.reshape(_text_embeddings.shape[0], _text_embeddings.shape[2]) # Shape: (8500, 512)
  _image_embeddings = _image_embeddings.reshape(_image_embeddings.shape[0], _image_embeddings.shape[2]) # Shape: (8500, 512)

  # Compute Cosine Similarity
  cosine_sim_matrix = cosine_similarity(_text_embeddings, _image_embeddings)
  cosine_sim_matrix_df = pd.DataFrame({'cos_sim': cosine_sim_matrix.tolist()})
  image_id = image_df["image_id"].tolist()

  # Extract the similar image_ids
  df[new_col] = cosine_sim_matrix_df['cos_sim'].apply(lambda x: get_top_N_images_id(image_id, x, top_K))
  return df


In [21]:
df = get_similar_images(df['title_embeddings'], "title_image_ids", image_df, top_K=100)
df = get_similar_images(df['ner_title_embeddings'], "ner_title_image_ids", image_df, top_K=100)

In [22]:
df

Unnamed: 0,title,image_id,ner_title,title_embeddings,ner_title_embeddings,title_image_ids,ner_title_image_ids
0,Spider rock dominates the landscape at canyon...,78,spider rock dominates the landscape at canyon...,"[[-0.19251586, 0.038147885, 0.071399026, -0.13...","[[-0.13344195, 0.06701599, 0.02025596, -0.0666...","[78, 739003, 55070, 125145, 393586, 922595, 95...","[78, 596499, 739003, 725466, 860776, 393586, 8..."
1,Taken in scotiabank theater where they have s...,102,taken in scotiabank theater (FAC) where they ...,"[[0.16504101, -0.11080879, -0.24759056, -0.144...","[[0.26642585, -0.19346175, -0.14683522, -0.148...","[102, 589025, 903820, 581492, 179232, 816761, ...","[102, 903820, 589025, 179232, 581492, 210044, ..."
2,Old Pepsi machine beside a building in Rockfo...,185,old pepsi (ORG) machine beside a building in ...,"[[0.39722258, 0.13342771, 0.3347901, 0.1302333...","[[0.2855332, 0.20317626, 0.31337717, 0.1598594...","[185, 608419, 594452, 962546, 66927, 396971, 3...","[185, 594452, 608419, 962546, 638302, 396971, ..."
3,boat ride in the lake,423,boat ride in the lake,"[[0.25573865, 0.04942262, 0.24811772, -0.01462...","[[0.25573865, 0.04942262, 0.24811772, -0.01462...","[121640, 628723, 311235, 417466, 136767, 31367...","[121640, 628723, 311235, 417466, 136767, 31367..."
4,"The girl in the red is my niece, Emily. I don...",785,"the girl in the red is my niece, emily (PERSO...","[[0.063320726, -0.21294405, -0.48721588, 0.160...","[[0.03422323, -0.15437979, -0.4250669, 0.18321...","[785, 250259, 378839, 577745, 136045, 546585, ...","[785, 250259, 397061, 837837, 378839, 3603, 62..."
...,...,...,...,...,...,...,...
7995,"Looking through the glass in the train, while...",999025,"looking through the glass in the train, while...","[[0.09465354, 0.4204727, -0.03252518, 0.325041...","[[0.09465354, 0.4204727, -0.03252518, 0.325041...","[202101, 528532, 791941, 571340, 171387, 76548...","[202101, 528532, 791941, 571340, 171387, 76548..."
7996,Mean boy in black aint' no queer,999029,mean boy in black aint' no queer,"[[0.3465486, 0.20587496, 0.15962212, 0.0173307...","[[0.3465486, 0.20587496, 0.15962212, 0.0173307...","[999029, 31121, 895156, 919038, 700477, 4898, ...","[999029, 31121, 895156, 919038, 700477, 4898, ..."
7997,frog bin and wildflowers by liverpool footbal...,999378,frog bin and wildflowers by liverpool footbal...,"[[-0.3164907, -0.44918257, 0.11903918, -0.3421...","[[-0.3179527, -0.3701128, 0.1833227, -0.394836...","[297712, 999378, 884424, 510014, 177024, 17538...","[999378, 297712, 884424, 177024, 832531, 97507..."
7998,Must get the pink flowers growing in front of...,999560,must get the pink flowers growing in front of...,"[[-0.20206104, -0.3833797, 0.13510478, -0.5714...","[[-0.20206104, -0.3833797, 0.13510478, -0.5714...","[824663, 867074, 590675, 214938, 816569, 31446...","[824663, 867074, 590675, 214938, 816569, 31446..."


**Evaluation: Precision@K scores**
- 1, 5, 10, 20, 50, 100

In [43]:
def computePrecisionK(kscore, id, similar_image_ids):
  # id: Grouth Truth
  precision = []
  for k in kscore:
    _similar_image_ids = similar_image_ids[:k]
    res = 0
    if id in _similar_image_ids:
      res = 1
    precision.append(res)
  return precision

In [44]:
precisionK = []
for _tc in ["title_image_ids", "ner_title_image_ids"]:
  # Compute each row, and see whether the ground truth appear in first_K
  # Result: [(is ground truth appear in top1, top5, top10, top20, top50, top100)]
  # Then combine each row result and sum thier appearance in [top1, top5, top10, top20, top50, top100]
  col_sum = [sum(col) for col in zip(*df.apply(lambda x: computePrecisionK(kscore, x["image_id"], x[f"{_tc}"]), axis=1).tolist())]
  precisionK.append(col_sum)

In [45]:
# Convert into Numpy Array
precisionK = np.array(precisionK)
# Convert into Precision@K Percentages
precisionK = precisionK / len(df)
# Put the results into Dataframe
kscore_df = pd.DataFrame(precisionK, columns=kscore)
# Append target_column before the results
kscore_df.insert(0, "target", ["title", "ner-title"], True)

In [46]:
kscore_df

Unnamed: 0,target,1,5,10,20,50,100
0,title,0.321375,0.549,0.63,0.7085,0.8045,0.862375
1,ner-title,0.3075,0.527875,0.614375,0.693875,0.794625,0.853125
