In [1]:
# Import required modules
import csv
import cv2
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Function that reads image dataset
def build_image_index(csv_file):
    image_index = {}
    ids = []
    with open(csv_file, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            image_id, path, label = row
            image_index[image_id] = {'path': path, 'label': label}
            ids.append(image_id)
    return image_index, ids


In [4]:
# Function to preprocess text
def preprocess_text(text):

    # Step 1: Normalize to lowercase
    text = text.lower()

    # Step 2: Tokenize
    tokens = word_tokenize(text)

    # Step 3: Remove punctuation from each word
    punc = str.maketrans('', '', string.punctuation)
    non_punc = [w.translate(punc) for w in tokens]

    # Step 4: Remove non-alphabetic tokens
    words = [word for word in non_punc if word.isalpha()]

    # Step 5: Remove stop words from tokens
    stop_words = set(stopwords.words('english'))

    words = [w for w in words if not w in stop_words]

    # Step 6: Stemming of tokens
    stemmer = PorterStemmer()
    stemmed_text = [stemmer.stem(word) for word in words]

    return ' '.join(map(str, stemmed_text))

In [5]:
# Function to preprocess image, like resize to target image, desaturate, etc
def preprocess_image(image_path, target_size=(256, 256)):
    # Read the image from the given path
    image = cv2.imread(image_path)

    # Resize the image to the target size
    image = cv2.resize(image, target_size)
    return image

In [6]:
# Function that returns text surrogate
def extract_text_surrogate(text):
    # Use a text feature extractor (e.g., Bag of Words, TF-IDF) to convert the text into a vector representation (surrogate)
    vectorizer = CountVectorizer()
    text_matrix = vectorizer.fit_transform([text])
    surrogate = text_matrix.toarray()
    return surrogate

In [7]:
# Function that returns text surrogate
def extract_text_surrogate(text):
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
    text_surrogate = vectorizer.fit_transform([text])
    return text_surrogate

In [8]:
# Function that search images for a given text
def search_images_by_text(text_query, image_index, image_ids):
  # Preprocess the text query
  processed_text_query = preprocess_text(text_query)

  # Extract the text labels for all images
  image_labels = [image_index[image_id]['label'] for image_id in image_ids]

  # Initialize the TfidfVectorizer
  vectorizer = TfidfVectorizer(tokenizer=preprocess_text)

  # Fit and transform the image labels to get their text surrogates
  text_surrogates = vectorizer.fit_transform(image_labels)

  # Transform the text query to get its text surrogate
  text_query_surrogate = vectorizer.transform([text_query])

  similarities = {}
  for i, image_id in enumerate(image_ids):
    # Calculate the similarity between the query's text surrogate and the image's text surrogate
    similarity = cosine_similarity(text_query_surrogate, text_surrogates[i])
    similarities[image_id] = similarity[0][0]

  sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
  top_similarities = sorted_similarities[:5]

  return top_similarities

In [10]:
# Read the CSV file containing the labeled image data
csv_file = '/content/drive/MyDrive/image_search/reverse_image_search.csv'
image_index, ids = build_image_index(csv_file)

# Perform a text-based image search
text_query = "fish"
search_results = search_images_by_text(text_query, image_index, ids)

# Print the search results
print("Top 5 Similar Images:")
print("+------------+--------------+-------------------+")
print("|  Image ID  | Similarity  |     Image Path    |")
print("+------------+--------------+-------------------+")
for image_id, similarity in search_results:
  image_path = image_index[image_id]['path']
  print(f"|  {image_id}  |   {similarity:.4f}    |   {image_path}   |")

Top 5 Similar Images:
+------------+--------------+-------------------+
|  Image ID  | Similarity  |     Image Path    |
+------------+--------------+-------------------+
|  640  |   0.7200    |   /content/drive/MyDrive/image_search/train/goldfish/n01443537_1415.JPEG   |
|  641  |   0.7200    |   /content/drive/MyDrive/image_search/train/goldfish/n01443537_2637.JPEG   |
|  642  |   0.7200    |   /content/drive/MyDrive/image_search/train/goldfish/n01443537_19638.JPEG   |
|  643  |   0.7200    |   /content/drive/MyDrive/image_search/train/goldfish/n01443537_2819.JPEG   |
|  644  |   0.7200    |   /content/drive/MyDrive/image_search/train/goldfish/n01443537_13189.JPEG   |
