In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from PIL import Image, ExifTags
import cv2
from scipy.spatial import distance
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise
import time
import sqlite3
import pickle
import torch
from torchvision import models, transforms
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap
import joblib
import plotly.express as px

# Retrival Pipeline

In [2]:
def find_image_files(root_dir, extensions=(".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif")):
    """
    Load all image paths and get the total number of images,
    we can't use tqdm here, because we have to determine the number of images first.
    """
    image_files = []

    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(extensions):
                image_files.append(os.path.join(subdir, file))
    return image_files

In [3]:
# Measurement functions


def image_rgb_calculation(image):
    # OpenCV uses BGR color space, we have to convert it to the RGB color space
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Calculate the histogram
    hist = cv2.calcHist([rgb_image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])

    # Normalize the histogram so that histograms of different images (different sizes, resolutions) are comparable
    hist = cv2.normalize(hist, hist)
    hist = hist.flatten()
    return hist


def image_hsv_calculation(image):
    # OpenCV uses BGR color space, we have to convert it to the RGB color space
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Calculate the histogram (bin-sizes = 8, values from 0-255)
    hist = cv2.calcHist([hsv_image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])

    # Normalize the histogram so that histograms of different images (different sizes, resolutions) are comparable
    hist = cv2.normalize(hist, hist)
    hist = hist.flatten()
    return hist


def load_embedding_model():
    global model, preprocess
    # Load the efficientnet_v2_s model
    model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.DEFAULT)

    # Remove last layer (classificator), because we only need the features
    model = torch.nn.Sequential(*list(model.children())[:-1])
    model.eval()

    # Define preprocessing transformations
    preprocess = transforms.Compose(
        [
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )


def model_embeddings_calculation(image):
    global model, preprocess
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)

    with torch.no_grad():
        features = model(input_batch)
    features = torch.flatten(features, 1)

    return features.numpy().flatten()

In [4]:
def extract_image_details(image_id, path, resize_size):
    try:
        image = cv2.imread(path)
        if image is not None:
            image = cv2.resize(image, resize_size)
            rgb_histogram = image_rgb_calculation(image)
            hsv_histogram = image_hsv_calculation(image)
            model_embedding = model_embeddings_calculation(image)

            ### several more informations ###

            # Convert to RGB
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Calculate average color and brightness
            avg_color = np.mean(image_rgb, axis=(0, 1)).tolist()
            avg_brightness = np.mean(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

            # Convert to HSV and calculate average HSV
            image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
            avg_hsv = np.mean(image_hsv, axis=(0, 1)).tolist()

            # File details
            file_size = os.path.getsize(path)
            file_type = os.path.splitext(path)[1]

            with Image.open(path) as img:
                # Resolution and DPI
                resolution = img.size  # (width, height)
                dpi = img.info.get("dpi", (0, 0))

                # Extract metadata
                try:
                    exif_data = img._getexif()
                    metadata = {ExifTags.TAGS.get(k, k): v for k, v in exif_data.items()} if exif_data else {}
                except AttributeError:
                    metadata = {}

            #################################

            return {
                "ID": image_id,
                "Path": path,
                "RGB_Histogram": rgb_histogram,
                "HSV_Histogram": hsv_histogram,
                "Model_Embedding": model_embedding,
                "Average_Color": avg_color,
                "Brightness": avg_brightness,
                "Average_HSV": avg_hsv,
                "Resolution": resolution,
                "DPI": dpi,
                "File_Size": file_size,
                "File_Type": file_type,
                "Metadata": metadata,
            }

        else:
            print(f"Image at path {path} is None.")
            return None

    except Exception as e:
        print(f"Failed processing {path}: {e}")
        return None


def load_checkpoint():
    if not os.path.exists("checkpoint.pkl"):
        return 0, [], [], [], [], []

    with open("checkpoint.pkl", "rb") as f:
        batch_index, paths, rgb_hists, hsv_hists, embeddings, other_data = pickle.load(f)
        print(f"Loaded checkpoint.\nStarting from path with ID: {batch_index + 1}")

    return batch_index, paths, rgb_hists, hsv_hists, embeddings, other_data


def image_batch_generator(image_files, batch_size, resize_size, start_index=0, show_progress=True):
    total_batches = (
        len(image_files) - start_index + batch_size - 1
    ) // batch_size  # - start_index to display remaining batches correctly
    progress_bar = tqdm(total=total_batches, desc="Processing images") if show_progress else None

    current_id = start_index + 1

    for index in range(start_index, len(image_files), batch_size):
        batch = image_files[index : index + batch_size]
        details_list = []

        for i, path in enumerate(batch):
            features = extract_image_details(current_id, path, resize_size)
            if features is not None:
                details_list.append(features)
                current_id += 1  # Increase only if an image has been successfully read

        df = pd.DataFrame(details_list)
        yield df, index + batch_size
        if show_progress:
            progress_bar.update(1)

    if show_progress:
        progress_bar.close()


def main_load_images(batch_size, desired_size):
    # Load checkpoint
    start_index, paths, rgb_hists, hsv_hists, embeddings, other_data = load_checkpoint()

    image_paths = find_image_files(r"C:\Users\timsa\Desktop\Daten_Joschua\data\image_data\extra_collection\city")

    load_embedding_model()

    for df, batch_index in image_batch_generator(
        image_paths, batch_size, desired_size, start_index=start_index, show_progress=True
    ):
        paths.extend(df[["ID", "Path"]].values.tolist())
        rgb_hists.extend(df[["ID", "RGB_Histogram"]].values.tolist())
        hsv_hists.extend(df[["ID", "HSV_Histogram"]].values.tolist())
        embeddings.extend(df[["ID", "Model_Embedding"]].values.tolist())
        other_data.extend(
            df.drop(columns=["Path", "RGB_Histogram", "HSV_Histogram", "Model_Embedding"]).values.tolist()
        )

        # Save checkpoint, overrides old one and appends new data
        with open("checkpoint.pkl", "wb") as f:
            pickle.dump((batch_index, paths, rgb_hists, hsv_hists, embeddings, other_data), f)

    # Save results
    df_paths = pd.DataFrame(paths, columns=["ID", "Path"])
    df_rgb = pd.DataFrame(rgb_hists, columns=["ID", "Histogram"])
    df_hsv = pd.DataFrame(hsv_hists, columns=["ID", "Histogram"])
    df_embeddings = pd.DataFrame(embeddings, columns=["ID", "Embedding"])
    df_other_data = pd.DataFrame(
        other_data,
        columns=[
            "ID",
            "Average_Color",
            "Brightness",
            "Average_HSV",
            "Resolution",
            "DPI",
            "File_Size",
            "File_Type",
            "Metadata",
        ],
    )

    df_paths.to_pickle("Path.pkl")
    df_rgb.to_pickle("RGB_Hist.pkl")
    df_hsv.to_pickle("HSV_Hist.pkl")
    df_embeddings.to_pickle("Embedding.pkl")
    df_other_data.to_pickle("Other_data.pkl")

    # Checkpoint can be removed after program was successful
    if os.path.exists("checkpoint.pkl"):
        os.remove("checkpoint.pkl")

In [None]:
%%time
# Execute
batch_size = 1000
desired_size = (224, 224)
main_load_images(batch_size, desired_size)

In [5]:
def create_database():
    # Check if the folder 'databases' does not exist / create it
    if not os.path.exists("database"):
        os.makedirs("database")

    conn = sqlite3.connect("database/bd_database.db")
    curs = conn.cursor()

    curs.execute(
        """CREATE TABLE IF NOT EXISTS image_paths 
                    (ID INTEGER PRIMARY KEY,
                    Path text);"""
    )
    conn.commit()

In [6]:
def save_to_db(df, conn):
    curs = conn.cursor()
    for file_path in df["Path"]:
        curs.execute("""INSERT OR IGNORE INTO image_paths (Path) VALUES (?);""", (file_path,))
    conn.commit()

In [8]:
create_database()

In [10]:
path_df = pd.read_pickle("Path.pkl")
connection = conn = sqlite3.connect("database/bd_database.db")

# Change drive letter (if necessary)
# It could be (depending on the respective windows configuration)
# that the saved path is not the actual path on another windows system
old_drive_letter = "D"
new_drive_letter = "F"
path_df["Path"] = path_df["Path"].apply(lambda path: path.replace(f"{old_drive_letter}:", f"{new_drive_letter}:"))

save_to_db(path_df, connection)

# Testing Similarities

In [14]:
def load_pickles():
    rgb_df = pd.read_pickle("RGB_Hist.pkl")
    hsv_df = pd.read_pickle("HSV_Hist.pkl")
    embedding_df = pd.read_pickle("Embedding.pkl")
    path_df = pd.read_pickle("Path.pkl")
    other_data_df = pd.read_pickle("Other_data.pkl")

    return rgb_df, hsv_df, embedding_df, path_df, other_data_df


def find_similar_ids(measurement, similarity, df_input, best_n):
    similarity_functions = {"euclidean": "euclidean", "manhattan": "cityblock", "cosine": "cosine"}

    histogram_columns = {"RGB": "RGB_Histogram", "HSV": "HSV_Histogram", "Embedding": "Model_Embedding"}

    dataframes = {"RGB": rgb_df, "HSV": hsv_df, "Embedding": embedding_df}

    # Create new input-df with the needed column
    df_input_selected = df_input[["ID", histogram_columns[measurement]]]

    # Select needed comparison-df
    target_df = dataframes[measurement]

    # Select comparison function
    similarity_function = similarity_functions[similarity]

    similarity_ids = calculate_mean_similarity(df_input_selected, target_df, similarity_function, best_n)
    return similarity_ids

In [159]:
# Plot images
def get_result_paths(curs, similarity_results):
    result_paths = []
    for image_id in similarity_results:
        curs.execute(
            """SELECT path
                        FROM image_paths
                        WHERE ID == (?);""",
            (image_id,),
        )
        results = curs.fetchall()

        # If there are results save only the path (otherwise the output would be lists with tuples)
        if results:
            result_paths.append(results[0][0])
    return result_paths


def print_images(input_images, result_paths, similarities, best_n):
    input_images_number = len(input_images)
    max_images = max(input_images_number, best_n)
    figsize = (20, 5)

    # Subplots to plot input(s) and results
    # First row
    fig, axes = plt.subplots(1, max_images, figsize=figsize)

    for i in range(input_images_number):
        image = Image.open(input_images.iloc[i, 1])
        axes[i].imshow(image)
        axes[i].set_title(f"Input: {i+1}")
        axes[i].axis("off")

    # Fill remaining axes with empty plots if needed
    for i in range(input_images_number, max_images):
        axes[i].axis("off")

    # Second row
    fig, axes = plt.subplots(1, max_images, figsize=figsize)

    for i in range(best_n):
        image = Image.open(result_paths[i])
        axes[i].imshow(image)
        axes[i].set_title(f"Result ID: {similarities[i]}")
        axes[i].axis("off")

    # Fill remaining axes with empty plots if needed
    for i in range(best_n, max_images):
        axes[i].axis("off")

    plt.show()

In [59]:
# Calculate mean if more than one input image or just the similarity
# Creates a new df with the ID, similarity value per input image (and the mean)


def calculate_mean_similarity(df_input_measurements, df_comparison_data, similarity_function, best_n):
    input_ids = df_input_measurements["ID"].values
    comparison_ids = df_comparison_data["ID"].values

    # Extract the histogram columns by dropping the 'ID' column
    input_histogram_column = df_input_measurements.drop(columns=["ID"]).columns[0]
    comparison_histogram_column = df_comparison_data.drop(columns=["ID"]).columns[0]

    # Convert histogram columns to numeric arrays
    input_features = np.vstack(df_input_measurements[input_histogram_column].values)
    comparison_features = np.vstack(df_comparison_data[comparison_histogram_column].values)

    # Calculate similarity for each pair (rows)
    similarity_matrix = cdist(comparison_features, input_features, metric=similarity_function)

    similarity_results = pd.DataFrame(similarity_matrix, columns=input_ids)

    # Add ID column and order columns
    similarity_results["ID"] = df_comparison_data["ID"]
    similarity_results = similarity_results[["ID"] + list(input_ids)]

    # print(similarity_results)

    # Calculate mean
    if len(input_ids) > 1:
        similarity_results["Mean"] = similarity_results.drop(columns=["ID"]).mean(axis=1)
        sorted_results = similarity_results.sort_values(by="Mean", ascending=True)
        # print(sorted_results)
    else:
        sorted_results = similarity_results.sort_values(by=input_ids[0], ascending=True)
        # print(sorted_results)

    # print(sorted_results)

    # Return just a list with the best n similarities
    best_ids = sorted_results.head(best_n)["ID"].tolist()
    return best_ids

In [172]:
# same image is: cheng-feng-psdV2Rl-GvU-unsplash.jpg


def main_finding_similarities(input_images_number, measurement, similarity, best_n):
    global rgb_df, hsv_df, embedding_df, path_df, other_data_df

    # Load pickles (doing this outside is better for perform the main more than one time)
    # rgb_df, hsv_df, embedding_df, path_df, other_data_df = load_pickles()

    specific_image_path = [r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_1.jpg"]

    all_image_paths = [
        r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_1.jpg",
        r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_2.jpg",
        r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_3.jpg",
        r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_4.jpg",
        r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_5.jpg",
        r"C:\Users\timsa\Desktop\sample_pictures\testing\test_image_6.jpg",
    ]

    # Decide which image(s)
    if input_images_number == 1:
        input_images = specific_image_path
    else:
        input_images = all_image_paths[:input_images_number]

    # print(input_images)
    resize_size = (224, 224)
    max_id = path_df["ID"].max()

    load_embedding_model()

    current_id = max_id + 1  # Start ID from the maximum existing ID + 1
    details_list = []

    for i, path in enumerate(input_images):
        features = extract_image_details(current_id, path, resize_size)
        if features is not None:
            details_list.append(features)
            current_id += 1  # Increase only if an image has been successfully read

    df_input = pd.DataFrame(details_list)
    # print(df_input)

    id_list = find_similar_ids(measurement, similarity, df_input, best_n)
    # print(id_list)

    conn = sqlite3.connect("database/bd_database.db")
    curs = conn.cursor()

    result_paths_list = get_result_paths(curs, id_list)
    # print(result_paths_list)

    print_images(df_input, result_paths_list, id_list, best_n)

In [11]:
%%time
global rgb_df, hsv_df, embedding_df, path_df, other_data_df

# Load pickles
rgb_df, hsv_df, embedding_df, path_df, other_data_df = load_pickles()

CPU times: total: 2.31 s
Wall time: 7.68 s


In [None]:
%%time
main_finding_similarities(1, "RGB", "euclidean", 5)

# ___________________________________________________

In [10]:
# Required because removing entries with 'none' (during extraction) sometimes causes IDs to be missing
def correct_data():
    rgb_df, hsv_df, embedding_df, path_df, other_data_df = load_pickles()

    dataframes = [
        (rgb_df, "RGB_Hist.pkl"),
        (hsv_df, "HSV_Hist.pkl"),
        (embedding_df, "Embedding.pkl"),
        (path_df, "Path.pkl"),
        (other_data_df, "Other_data.pkl"),
    ]

    mismatch_found = False

    # Corrects mismatches from the mismatch-position until all mismatches are gone
    for df, filename in dataframes:
        corrected = False
        while True:
            # IDs are index + 1 always
            expected_ids = df.index + 1

            mismatch_index = (df["ID"] != expected_ids).idxmax()

            # Decide if there was a mismatch
            if df.loc[mismatch_index, "ID"] == expected_ids[mismatch_index]:
                if corrected:
                    print(f"Corrected: {filename}")

                    # Save only if corrections were made
                    with open(filename, "wb") as f:
                        pickle.dump(df, f)
                    print(f"Overwritten: {filename}\n")
                    first_mismatch_found = True
                else:
                    print(f"No mismatch: {filename}")

                break

            # Reduce all IDs - 1 beginning from the mismatch
            df.loc[mismatch_index:, "ID"] -= 1
            corrected = True

        # If no mismatch was found in the first file, exit the loop
        if not corrected and not mismatch_found:
            print(f"No mismatch found in {filename}. Skipping remaining files.")
            break

In [19]:
%%time
correct_data()

Corrected: RGB_Hist.pkl
Overwritten: RGB_Hist.pkl

Corrected: HSV_Hist.pkl
Overwritten: HSV_Hist.pkl

Corrected: Embedding.pkl
Overwritten: Embedding.pkl

Corrected: Path.pkl
Overwritten: Path.pkl

Corrected: Other_data.pkl
Overwritten: Other_data.pkl

CPU times: total: 2min 37s
Wall time: 3min 2s


In [20]:
right_data = pd.read_pickle("Embedding.pkl")
right_data

# ___________________________________________________

In [34]:
def reduce_dimensionality(df_to_reduce, algorithm, dimensions=2):
    if algorithm == "tsne":
        tsne = TSNE(n_components=dimensions)
        reduced_data = tsne.fit_transform(df_to_reduce)

    elif algorithm == "umap":
        umap_model = umap.UMAP(n_components=dimensions)
        reduced_data = umap_model.fit_transform(df_to_reduce)

    else:
        pca = PCA(n_components=dimensions)
        reduced_data = pca.fit_transform(df_to_reduce)

    return reduced_data


def save_dimensionality_results(df, algorithm_name):
    np.save(f"{algorithm_name}_results.npy", df)


def create_clusters(
    df_to_reduce,
    cluster_amount=100,
):
    kmeans = KMeans(n_clusters=cluster_amount)
    labels = kmeans.fit_predict(df_to_reduce)
    return labels


def plot_dimensionality_reduction(algorithm_data, labels=None, output_file=None):
    # Create empty lables to plot if labels=None
    if labels is None:
        labels = np.array([""] * len(algorithm_data))

    num_dims = algorithm_data.shape[1]

    # Load dataframe to get IDs
    df_embedding_ids = pd.read_pickle("Embedding.pkl")
    ids = df_embedding_ids["ID"].values

    # Decide 2D or 3D plot according to shape of algorithm_data (results)
    if num_dims == 2:
        fig = px.scatter(
            x=algorithm_data[:, 0],
            y=algorithm_data[:, 1],
            color=labels.astype(str),
            hover_name=ids,
            labels={"x": "Dim1", "y": "Dim2"},
            title="2D Plot",
        )

        if output_file is not None:
            fig.write_html(output_file)

        fig.show()

    else:
        fig = px.scatter_3d(
            x=algorithm_data[:, 0],
            y=algorithm_data[:, 1],
            z=algorithm_data[:, 2],
            color=labels.astype(str),
            hover_name=ids,
            labels={"x": "Dim1", "y": "Dim2", "z": "Dim3"},
            title="3D Plot",
        )
        fig.update_traces(marker=dict(size=3, opacity=0.6))
        fig.update_layout(margin=dict(l=0, r=0, b=0, t=40))

        if output_file is not None:
            fig.write_html(output_file)

        fig.show()


def plot_selected_images(id_list):
    conn = sqlite3.connect("database/bd_database.db")
    curs = conn.cursor()

    result_paths_list = get_result_paths(curs, id_list)
    fig, axes = plt.subplots(1, len(id_list), figsize=(20, 5))

    for i in range(len(id_list)):
        if i < len(id_list):
            image = Image.open(result_paths_list[i])
            axes[i].imshow(image)
            axes[i].set_title(f"ID: {id_list[i]}")
        axes[i].axis("off")

    # plt.tight_layout()
    plt.show()

In [None]:
# Reducing and plotting
# df_from_pickle = pd.read_pickle("Embedding.pkl")
# df_to_reduce = np.vstack(df_from_pickle['Embedding'].values)
# print(len(df_to_reduce))
# print(len(df_to_reduce[0]))

In [None]:
# %%time
# reduced_data = reduce_dimensionality(df_to_reduce, "tsne", 2)
# save_dimensionality_results(reduced_data, "tsne")

In [None]:
# %%time
# clusters = create_clusters(df_to_reduce, 100)
# save_dimensionality_results(clusters, "kmeans")

In [None]:
# # Create clusters based on tsne-results
# %%time
# clusters = create_clusters(dimensions_tsne, 100)
# save_dimensionality_results(clusters, "tsne_kmeans")

In [None]:
%%time
# Loading and plotting results
dimensions_tsne = np.load("tsne_results.npy")
lables = np.load("kmeans_results_100.npy")
plot_dimensionality_reduction(dimensions_tsne, labels=lables, output_file="plot_tsne_2d.html")

In [None]:
%%time
images_to_plot = [435873, 23657, 436051, 42668, 431471, 435909]
plot_selected_images(images_to_plot)