In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from PIL import Image, ExifTags
import uuid
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from scipy.spatial import distance 
from sklearn.metrics import pairwise_distances
from numba import jit, prange
from concurrent.futures import ProcessPoolExecutor

from scipy.spatial import distance
from sklearn.metrics import pairwise
import time
import sqlite3
import pickle

import torch
from torchvision import models, transforms

# Retrival Pipeline

In [2]:
'''
Load all image paths and get the total number of images,
we can't use tqdm here, because we have to determine the number of images first.
'''

def find_image_files(root_dir, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')):
    image_files = []
    
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(extensions):
                image_files.append(os.path.join(subdir, file))
    return image_files


# here was: image_paths = find_image_files(r"")
# df_image_paths = pd.DataFrame({'ID': range(1, len(image_paths) + 1), 'Path': image_paths})
# df_image_paths = pd.DataFrame(image_paths, columns=["Path"])
# df_image_paths.to_pickle('Path.pkl')


# Show first 10 paths
# print(f"Number found images: {len(image_paths)}")
# if len(image_paths) > 10:
#     print("Some paths:")
#     for pic_path in image_paths[:10]:
#         print(pic_path)
# else:
#     print("Found paths:", image_paths)

In [3]:
# Measurement functions

def image_rgb_calculation(image):
    # OpenCV uses BGR color space, we have to convert it to the RGB color space
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Calculate the histogram
    hist = cv2.calcHist([rgb_image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    
    # Normalize the histogram so that histograms of different images (different sizes, resolutions) are comparable
    hist = cv2.normalize(hist, hist)
    hist = hist.flatten()
    return hist


def image_hsv_calculation(image):
    # OpenCV uses BGR color space, we have to convert it to the RGB color space
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    
    # Calculate the histogram (bin-sizes = 8, values from 0-255)
    hist = cv2.calcHist([hsv_image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    
    # Normalize the histogram so that histograms of different images (different sizes, resolutions) are comparable
    hist = cv2.normalize(hist, hist)
    hist = hist.flatten()
    return hist


# Load the efficientnet_v2_s model
model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.DEFAULT)

# Remove last layer (classificator), because we only need the features
model = torch.nn.Sequential(*list(model.children())[:-1])
model.eval()

# Define preprocessing transformations
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def model_embeddings_calculation(image):
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)
    
    with torch.no_grad():
        features = model(input_batch)
    features = torch.flatten(features, 1)
    
    return features.numpy().flatten()

In [10]:
%%time
# Different Version 2 (more data)


def extract_image_details(image_id, path, resize_size):
    try:
        image = cv2.imread(path)
        if image is not None:
            image = cv2.resize(image, resize_size)
            rgb_histogram = image_rgb_calculation(image)
            hsv_histogram = image_hsv_calculation(image)
            model_embedding = model_embeddings_calculation(image)
            
            ### several more informations ###
            
            # Convert to RGB
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Calculate average color and brightness
            avg_color = np.mean(image_rgb, axis=(0, 1)).tolist()
            avg_brightness = np.mean(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

            # Convert to HSV and calculate average HSV
            image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
            avg_hsv = np.mean(image_hsv, axis=(0, 1)).tolist()

            # File details
            file_size = os.path.getsize(path)
            file_type = os.path.splitext(path)[1]

            with Image.open(path) as img:
                # Resolution and DPI
                resolution = img.size  # (width, height)
                dpi = img.info.get('dpi', (0, 0))
                
                # Extract metadata
                try:
                    exif_data = img._getexif()
                    metadata = {ExifTags.TAGS.get(k, k): v for k, v in exif_data.items()} if exif_data else {}
                except AttributeError:
                    metadata = {}
            
            
            #################################
            
            
            return {
                'ID': image_id,
                'Path': path,
                'RGB_Histogram': rgb_histogram,
                'HSV_Histogram': hsv_histogram,
                'Model_Embedding': model_embedding,
                'Average_Color': avg_color,
                'Brightness': avg_brightness,
                'Average_HSV': avg_hsv,
                'Resolution': resolution,
                'DPI': dpi,
                'File_Size': file_size,
                'File_Type': file_type,
                'Metadata': metadata
            }
        
        else:
            print(f"Image at path {path} is None.")
            return None
        
    except Exception as e:
        print(f"Failed processing {path}: {e}")
        return None

    
def load_checkpoint():
    if not os.path.exists('checkpoint.pkl'):
        return 0, [], [], [], [], []
    
    with open('checkpoint.pkl', 'rb') as f:
        batch_index, paths, rgb_hists, hsv_hists, embeddings, other_data = pickle.load(f)
        print(f"Loaded checkpoint.\nStarting from path with ID: {batch_index + 1}")

    return batch_index, paths, rgb_hists, hsv_hists, embeddings, other_data
    
    

def image_batch_generator(image_files, batch_size, resize_size, start_index = 0, show_progress=True):
    total_batches = (len(image_files) - start_index + batch_size - 1) // batch_size # - start_index to display remaining batches correctly
    progress_bar = tqdm(total=total_batches, desc="Processing images") if show_progress else None

    for index in range(start_index, len(image_files), batch_size):
        batch = image_files[index:index + batch_size]
        details_list = [extract_image_details(index + i + 1, path, resize_size) for i, path in enumerate(batch)]
        details_list = [features for features in details_list if features is not None]
        df = pd.DataFrame(details_list)
        yield df, index + batch_size
        if show_progress:
            progress_bar.update(1)

    if show_progress:
        progress_bar.close()

        
        
# Testing: Example
batch_size = 100
desired_size = (60, 60)


# Load checkpoint
start_index, paths, rgb_hists, hsv_hists, embeddings, other_data = load_checkpoint()

image_paths = find_image_files(r"C:\Users\timsa\Desktop\Daten_Joschua\data\image_data\extra_collection\beach")

for df, batch_index in image_batch_generator(image_paths, batch_size, desired_size, start_index=start_index, show_progress=True):
    paths.extend(df[['ID', 'Path']].values.tolist())
    rgb_hists.extend(df[['ID', 'RGB_Histogram']].values.tolist())
    hsv_hists.extend(df[['ID', 'HSV_Histogram']].values.tolist())
    embeddings.extend(df[['ID', 'Model_Embedding']].values.tolist())
    other_data.extend(df.drop(columns=['Path', 'RGB_Histogram', 'HSV_Histogram', 'Model_Embedding']).values.tolist())
    
    # Save checkpoint, overrides old one and appends new data
    with open('checkpoint.pkl', 'wb') as f:
        pickle.dump((batch_index, paths, rgb_hists, hsv_hists, embeddings, other_data), f)
    

# Save results
df_paths = pd.DataFrame(paths, columns=["ID", "Path"])
df_rgb = pd.DataFrame(rgb_hists, columns=["ID", "Histogram"])
df_hsv = pd.DataFrame(hsv_hists, columns=["ID", "Histogram"])
df_embeddings = pd.DataFrame(embeddings, columns=["ID", "Embedding"])
df_other_data = pd.DataFrame(other_data, columns=["ID", "Average_Color",
                                                 "Brightness", "Average_HSV",
                                                 "Resolution","DPI",
                                                 "File_Size", "File_Type",
                                                 "Metadata"
                                                ])

df_paths.to_pickle('Path.pkl')
df_rgb.to_pickle('RGB_Hist.pkl')
df_hsv.to_pickle('HSV_Hist.pkl')
df_embeddings.to_pickle('Embeddings.pkl')
df_other_data.to_pickle('Other_data.pkl')

# Checkpoint can be removed after program was successful
if os.path.exists('checkpoint.pkl'):
    os.remove('checkpoint.pkl')

Loaded checkpoint.
Starting from path with ID: 101


Processing images:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: total: 2min
Wall time: 24.8 s


In [5]:
len(df_rgb)

181

In [6]:
print(df_embeddings.tail())

      ID                                          Embedding
176  177  [-0.18032971, 0.3313287, -0.2374162, -0.217798...
177  178  [-0.16402672, 1.7069136, -0.18570574, 0.212251...
178  179  [-0.1656731, 0.4096732, -0.1702146, 0.02825574...
179  180  [-0.16269362, 1.903489, -0.23513995, 0.0794495...
180  181  [-0.02571875, 0.30269966, -0.14092457, 0.06659...


In [7]:
first_histogram_length = len(df_embeddings.iloc[0]['Embedding'])
first_histogram_length

1280

In [8]:
print(df_rgb.tail())
print(df_hsv.tail())
print(df_paths.tail())
print(df_other_data.tail())

      ID                                          Histogram
176  177  [0.010170917, 0.44461438, 0.01598287, 0.0, 0.0...
177  178  [0.04859704, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
178  179  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
179  180  [0.2950442, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
180  181  [0.1805076, 0.029868163, 0.0, 0.0, 0.0, 0.0, 0...
      ID                                          Histogram
176  177  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
177  178  [0.0, 0.0034040704, 0.0011346901, 0.010212211,...
178  179  [0.0, 0.0, 0.0, 0.0006082462, 0.0, 0.001824738...
179  180  [0.0, 0.0029820336, 0.0, 0.0, 0.0, 0.0, 0.0, 0...
180  181  [0.0, 0.0051383753, 0.017984314, 0.030830253, ...
      ID                                               Path
176  177  C:\Users\timsa\Desktop\Daten_Joschua\data\imag...
177  178  C:\Users\timsa\Desktop\Daten_Joschua\data\imag...
178  179  C:\Users\timsa\Desktop\Daten_Joschua\data\imag...
179  180  C:\Users\timsa\Desktop\Daten_J

In [7]:
# Create database

# Check if the folder 'databases' does not exist / create it
if not os.path.exists('database'):
    os.makedirs('database')
    
conn = sqlite3.connect("database/bd_database.db")
curs = conn.cursor()

curs.execute("""CREATE TABLE IF NOT EXISTS image_paths 
                (ID INTEGER PRIMARY KEY,
                Path text);""")
conn.commit()

In [8]:
# Save data to database
path_pickle_df = pd.read_pickle("Path.pkl")

for file_path in path_pickle_df['Path']:
    curs.execute('''INSERT OR IGNORE INTO image_paths (Path) VALUES (?);''', (file_path,))
    # print(f"Inserted path: {file_path}")
    
conn.commit()

In [9]:
curs.execute("""SELECT *
                FROM image_paths;""")
results = curs.fetchall()

df = pd.DataFrame(results, columns=['ID', 'Path'])

#conn.close()

df

Unnamed: 0,ID,Path
0,1,C:\Users\timsa\Desktop\sample_pictures\0031.png
1,2,C:\Users\timsa\Desktop\sample_pictures\adam-bi...
2,3,C:\Users\timsa\Desktop\sample_pictures\adrian-...
3,4,C:\Users\timsa\Desktop\sample_pictures\folder_...
4,5,C:\Users\timsa\Desktop\sample_pictures\folder_...
5,6,C:\Users\timsa\Desktop\sample_pictures\folder_...
6,7,C:\Users\timsa\Desktop\sample_pictures\folder_...
7,8,C:\Users\timsa\Desktop\sample_pictures\folder_...
8,9,C:\Users\timsa\Desktop\sample_pictures\folder_...
9,10,C:\Users\timsa\Desktop\sample_pictures\folder_...


In [11]:
# Similarity functions

def euclidean_distance():
    pass

def manhattan_distance():
    pass

def cosine_similarity():
    pass

def jaccard_similarity():
    pass

def hamming_distance():
    pass


# ___________________________________________________

In [70]:
# Parts for measurment - dimensionality reduction
# needs 1-D-vector


def extract_image_details(image_path):
    try:
        with Image.open(image_path) as img:
            img = img.convert('RGB')  # Convert to RGB

            # Resize image
            img_cv = cv2.imread(image_path)
            resized = cv2.resize(img_cv, desired_size)
            img_as_1d = np.array(resized).flatten() #this could be parallelized

            # UUID
            unique_id = str(uuid.uuid4())

        return {
            'ID': unique_id,
            'File_Path': image_path,
            'Resized_Image_Vector': img_as_1d
        }
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

In [71]:
def process_large_image_dataset(generator, total_images, batch_size, n_components):
    """
    Processes a large image dataset using IncrementalPCA in batches.

    Parameters:
    generator (generator): Generator yielding batches of image data.
    total_images (int): Total number of images in the dataset.
    batch_size (int): Number of images to process in each batch.
    n_components (int): Number of principal components to keep.

    Returns:
    IncrementalPCA: The fitted IncrementalPCA model.
    """
    ipca = IncrementalPCA(n_components=n_components)
    
    for _ in range(0, total_images, batch_size):
        batch_df, image_vectors = next(generator)
        ipca.partial_fit(image_vectors)
    
    return ipca


def transform_image_features(batch_df, image_vectors, ipca):
    """
    Transforms image features using PCA.

    Parameters:
    batch_df (pd.DataFrame): The DataFrame containing batch image details.
    image_vectors (np.array): The array of image vectors.
    ipca (IncrementalPCA): The pre-fitted IncrementalPCA.

    Returns:
    pd.DataFrame: The updated DataFrame with PCA vectors added.
    """
    pca_vectors = ipca.transform(image_vectors)
    batch_df['PCA_Vectors'] = list(pca_vectors)
    return batch_df


In [72]:
def image_batch_generator(image_files, batch_size, show_progress=True):
    total_batches = (len(image_files) + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc="Processing images") if show_progress else None

    for index in range(0, len(image_files), batch_size):
        batch = image_files[index:index + batch_size]
        details_list = [extract_image_details(image) for image in batch]
        details_list = [features for features in details_list if features is not None]
        image_vectors = np.array([features['Resized_Image_Vector'] for features in details_list])
        df = pd.DataFrame(details_list)
        yield df, image_vectors
        if show_progress:
            progress_bar.update(1)

    if show_progress:
        progress_bar.close()

In [74]:
# Execution of code

total_images = len(image_paths)
batch_size = 100
desired_size = (60, 60)   # size which the images will be resized to
n_components = 10 # the amount of features our iPCA will keep for every image

generator = image_batch_generator(image_paths, batch_size, show_progress=True)   # gives back batch_df + image as 1d
ipca = process_large_image_dataset(generator, total_images, batch_size, n_components) # needed to fit the iPCA 

# After fitting IPCA, transform all data
for batch_df, image_vectors in image_batch_generator(image_paths, batch_size, show_progress=False):
    transformed_batch_df = transform_image_features(batch_df, image_vectors, ipca)

    
    transformed_batch_df.head()   


Processing images:   0%|          | 0/1 [00:00<?, ?it/s]

Error processing C:\Users\timsa\Desktop\sample_pictures\test_2.bmp: cannot identify image file 'C:\\Users\\timsa\\Desktop\\sample_pictures\\test_2.bmp'
Error processing C:\Users\timsa\Desktop\sample_pictures\test_2.bmp: cannot identify image file 'C:\\Users\\timsa\\Desktop\\sample_pictures\\test_2.bmp'


In [75]:
transformed_batch_df.head()

Unnamed: 0,ID,File_Path,Resized_Image_Vector,PCA_Vectors
0,34b0bb62-1f7b-45a8-8fc4-eaedddb5757f,C:\Users\timsa\Desktop\sample_pictures\0031.png,"[195, 187, 177, 188, 167, 162, 144, 123, 139, ...","[496.6541985933603, -888.0665161932402, -3756...."
1,8c0dfbb5-bba4-409d-8b57-849ce8bde435,C:\Users\timsa\Desktop\sample_pictures\adam-bi...,"[222, 212, 208, 224, 213, 207, 225, 216, 209, ...","[12632.571816986021, -259.16952814348303, 2069..."
2,e07de2bc-620f-4a07-b2b9-653b6c86f0b6,C:\Users\timsa\Desktop\sample_pictures\adrian-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-11009.597883374357, -501.0693401087662, -502..."
3,d332eb0a-e650-4c85-a5ae-cec91dfc3c8c,C:\Users\timsa\Desktop\sample_pictures\folder_...,"[98, 98, 94, 41, 48, 38, 20, 32, 32, 14, 26, 2...","[-3963.8594334778113, 2623.8053094494276, 1098..."
4,47e15f53-ee2d-42ac-81c7-a4e302982c86,C:\Users\timsa\Desktop\sample_pictures\folder_...,"[55, 51, 61, 36, 27, 30, 110, 51, 32, 161, 113...","[-4741.879883035555, -2071.263536722551, 1369...."


In [76]:
transformed_batch_df["PCA_Vectors"].describe()

count                                                    16
unique                                                   16
top       [496.6541985933603, -888.0665161932402, -3756....
freq                                                      1
Name: PCA_Vectors, dtype: object

In [77]:
transformed_batch_df["PCA_Vectors"][1]

array([12632.57181699,  -259.16952814,  2069.72584842,   444.01229774,
         481.85474691,   -62.15674047,   918.43916599,   427.64807317,
         293.81656367,   -74.17618535])