# Find similar pictures in a directory
This notebook shows how to find similar pictures in a directory using a pre-trained deep learning model that transforms
images into embeddings. The embeddings are then used to find similar images in the directory.

In [1]:
import os
import sys

# Add src directory to the Python path
sys.path.append(os.path.abspath('../src'))

from PIL import Image
import pandas as pd

from ImageEmbedder import ImageEmbedder
from EmbeddingRetriever import EmbeddingRetriever
import matplotlib.pyplot as plt
from IPython.display import display
from PIL import Image

# Create image embeddings

In [None]:
# Folder holding the images to process (machine-specific path; adjust before running).
path = "C:/Users/User/Desktop/toprocess"
# Embedder instance from the project's ImageEmbedder module; the cells below call
# its get_embedding() and save_embeddings() methods.
ie = ImageEmbedder()

In [3]:
# List all image files in the folder based on allowed extensions
def list_images_with_full_path(folder_path, allowed_extensions=('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')):
    """
    Return the absolute paths of the image files directly inside a folder.

    Args:
        folder_path (str): Folder to scan (not recursive).
        allowed_extensions (iterable of str): Extensions to accept. Matching is
            case-insensitive and the leading dot is optional ('.jpg' and 'JPG'
            are equivalent).

    Returns:
        list: Absolute paths of the matching files, in os.listdir() order.
    """
    # Normalise once: lower-case, and add the leading dot when it is missing so
    # the values compare directly against os.path.splitext() output. An empty
    # string is kept as-is (it selects files without an extension).
    allowed = {
        ext.lower() if ext.startswith('.') or not ext else '.' + ext.lower()
        for ext in allowed_extensions
    }
    base_dir = os.path.abspath(folder_path)

    image_files = []
    for entry in os.listdir(folder_path):
        if os.path.splitext(entry)[1].lower() not in allowed:
            continue
        full_path = os.path.join(base_dir, entry)
        # Skip directories that merely carry an image-like name.
        if os.path.isfile(full_path):
            image_files.append(full_path)
    return image_files

# Absolute paths of every file in `path` whose extension is in the default allowed list.
img_paths = list_images_with_full_path(path)

In [None]:
# Create and store all embeddings.
# The return value of get_embedding() is discarded here; presumably the embedder
# keeps each result internally so save_embeddings() can write them out —
# TODO confirm against ImageEmbedder.
for img_path in img_paths:
    ie.get_embedding(img_path)
# Write the collected embeddings to the parquet file read by the next section.
ie.save_embeddings("../data/embeddings.parquet")

# Find similar images

In [5]:
# Load the embeddings saved by the previous section.
embeddings_df = pd.read_parquet("../data/embeddings.parquet")
# Create the retriever from the loaded embeddings DataFrame.
retriever = EmbeddingRetriever(embeddings_df)
# Calculate the similar images; output_file is presumably where the result is
# persisted — TODO confirm against EmbeddingRetriever.update_similar_images.
df_similar_images = retriever.update_similar_images(output_file='../data/similar_images.parquet')
# Last expression of the cell, so the notebook renders the result.
df_similar_images

In [7]:
def list_images_in_folder(folder_path: str, filter_extensions: list = None) -> list:
    """
    List all image files in the specified folder.

    Only regular files directly inside the folder are considered (no
    recursion). A file matches when its extension, as reported by
    os.path.splitext(), is one of the requested extensions; the comparison is
    case-insensitive.

    Args:
        folder_path (str): The path to the folder where images are located.
        filter_extensions (list): A list of file extensions to filter
            (e.g., ['jpg', 'jpeg', 'png']). A leading dot is optional, so
            '.jpg' and 'jpg' are equivalent. Defaults to common image formats.

    Returns:
        list: A list of image file paths, each built as
            os.path.join(folder_path, filename).
    """
    if filter_extensions is None:
        filter_extensions = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff']

    # Normalise to lowercase without the leading dot; a set gives O(1) lookups.
    wanted = {ext.lower().lstrip('.') for ext in filter_extensions}

    image_files = []
    for filename in os.listdir(folder_path):
        suffix = os.path.splitext(filename)[1]
        # Files without an extension (e.g. one literally named "png") are not
        # images; splitting on '.' would have mistaken the whole name for one.
        if not suffix or suffix[1:].lower() not in wanted:
            continue
        candidate = os.path.join(folder_path, filename)
        # Ignore directories that happen to carry an image-like name.
        if os.path.isfile(candidate):
            image_files.append(candidate)

    return image_files

def display_similar_images(path):
    """
    Show an image followed by a single row of the images most similar to it.

    The query image is printed and displayed first. The matches returned by the
    module-level ``retriever`` are then drawn side by side, each titled with its
    similarity score. Every image is shrunk to fit within a 200x200 pixel box
    before being shown.

    Args:
        path (str): The path to the original (query) image.
    """
    def _load_thumbnail(image_file, limit=200):
        # thumbnail() resizes in place so neither side exceeds `limit`.
        thumb = Image.open(image_file)
        thumb.thumbnail((limit, limit))
        return thumb

    # Query image first
    print(f"Original Image: {path}")
    display(_load_thumbnail(path))

    # Look up the matches for this image
    matches = retriever.find_similar_embeddings(path)
    match_count = len(matches)
    if match_count == 0:
        print("No similar images found.")
        return

    # One subplot per match, all on one row; width grows with the match count
    _, axis_row = plt.subplots(1, match_count, figsize=(match_count * 3, 3))
    # With a single subplot matplotlib hands back a bare Axes, so wrap it to iterate
    axis_row = [axis_row] if match_count == 1 else axis_row

    for axis, match in zip(axis_row, matches):
        # Each match is a (file path, similarity, embedding) triple
        match_path, score, _ = match
        axis.imshow(_load_thumbnail(match_path))
        axis.set_title(f"Sim: {score:.2f}")
        axis.axis('off')  # no ticks or frame around the picture

    plt.show()

In [None]:
# Find images similar to a specific image.
# "image_path.jpg" is a placeholder; replace it with the path of the image to query.
image_path = "image_path.jpg"
display_similar_images(image_path)

In [None]:
# Show the similar images for every image in the folder.
# NOTE(review): these paths are built by os.path.join() on a forward-slash folder
# string, while the embeddings were created from os.path.abspath() paths. If the
# retriever matches on the exact path string, confirm both forms agree.
all_images = list_images_in_folder("C:/Users/User/Desktop/toprocess/")
for img in all_images:
    display_similar_images(img)
    # Blank spacer between the output of consecutive images.
    print("\n   \n")