In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from PIL import Image, ExifTags
import uuid
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from scipy.spatial import distance 
from sklearn.metrics import pairwise_distances
from numba import jit, prange
from concurrent.futures import ProcessPoolExecutor

from scipy.spatial import distance
from sklearn.metrics import pairwise
import time
import sqlite3

# Retrieval Pipeline

In [6]:
%%time
'''
Load all image paths and get the total number of images,
we can't use tqdm here, because we have to determine the number of images first.
'''

def find_image_files(root_dir, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')):
    """
    Recursively collects the paths of all image files below a directory.

    Parameters:
    root_dir (str): Directory whose tree is walked with os.walk.
    extensions (tuple): Lower-case file name suffixes that count as images;
        the comparison is done against the lower-cased file name.

    Returns:
    list: One joined path (directory + file name) per matching file, in the
        order in which os.walk reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith(extensions)
    ]


# Discover every image below the sample folder and persist the list so that
# later cells can reload it from disk.
image_paths = find_image_files(r"C:\Users\timsa\Desktop\sample_pictures")
df_image_paths = pd.DataFrame(image_paths, columns=["Path"])
df_image_paths.to_pickle('Path.pkl')

# Report the number of hits; with more than ten hits only the first ten paths
# are listed, each followed by two blank lines.
print(f"Number found images: {len(image_paths)}")
if len(image_paths) <= 10:
    print("Found paths:", image_paths)
else:
    print("Some paths:\n")
    for pic_path in image_paths[:10]:
        print(f"{pic_path}\n\n")

Number found images: 17
Some paths:

C:\Users\timsa\Desktop\sample_pictures\0031.png


C:\Users\timsa\Desktop\sample_pictures\adam-birkett-6cXZnFCd2KQ-unsplash.jpg


C:\Users\timsa\Desktop\sample_pictures\adrian-regeci-SAS0lq2QGLs-unsplash.jpg


C:\Users\timsa\Desktop\sample_pictures\test_2.bmp


C:\Users\timsa\Desktop\sample_pictures\folder_1\0020.png


C:\Users\timsa\Desktop\sample_pictures\folder_1\folder_1_1\0001.png


C:\Users\timsa\Desktop\sample_pictures\folder_1\folder_1_1\0002.png


C:\Users\timsa\Desktop\sample_pictures\folder_1\folder_1_1\0003.png


C:\Users\timsa\Desktop\sample_pictures\folder_1\folder_1_2\0004.png


C:\Users\timsa\Desktop\sample_pictures\folder_1\folder_1_2\0005.png


CPU times: total: 0 ns
Wall time: 3.09 ms


In [3]:
# Create database

# Create the 'database' folder if it is missing. exist_ok=True replaces the
# separate existence check and cannot fail when the folder appears in between.
os.makedirs('database', exist_ok=True)

conn = sqlite3.connect("database/bd_database.db")
curs = conn.cursor()

# path is declared UNIQUE so that "INSERT OR IGNORE" really skips paths that
# are already stored; without a constraint on path every re-run of the insert
# cell would add the same paths again under new ids.
# NOTE: CREATE TABLE IF NOT EXISTS leaves an already existing table untouched,
# so a database file created before this change keeps its old schema.
curs.execute("""CREATE TABLE IF NOT EXISTS image_paths
                (id INTEGER PRIMARY KEY,
                path TEXT UNIQUE);""")
conn.commit()

In [14]:
# Save data to database
# NOTE(review): Path.pkl is the pickle written by the path-discovery cell of
# this notebook; never unpickle files that come from an untrusted source.
path_pickle_df = pd.read_pickle("Path.pkl")

# "INSERT OR IGNORE" only skips rows that violate a constraint, so it cannot
# be relied on to prevent duplicates unless path is constrained. The explicit
# NOT EXISTS guard inserts a path only when it is not stored yet and therefore
# keeps this cell safe to re-run whether or not the table declares path UNIQUE.
for file_path in path_pickle_df['Path']:
    curs.execute('''INSERT INTO image_paths (path)
                    SELECT ?
                    WHERE NOT EXISTS (SELECT 1 FROM image_paths WHERE path = ?);''',
                 (file_path, file_path))

conn.commit()

In [15]:
import pandas as pd
# Read the complete image_paths table back and show it as a DataFrame.
# The connection is deliberately left open for the cells that follow.
results = curs.execute("""SELECT *
                FROM image_paths;""").fetchall()

df = pd.DataFrame(results, columns=['id', 'path'])

#conn.close()

df

Unnamed: 0,id,path
0,1,C:\Users\timsa\Desktop\sample_pictures\0031.png
1,2,C:\Users\timsa\Desktop\sample_pictures\adam-bi...
2,3,C:\Users\timsa\Desktop\sample_pictures\adrian-...
3,4,C:\Users\timsa\Desktop\sample_pictures\test_2.bmp
4,5,C:\Users\timsa\Desktop\sample_pictures\folder_...
5,6,C:\Users\timsa\Desktop\sample_pictures\folder_...
6,7,C:\Users\timsa\Desktop\sample_pictures\folder_...
7,8,C:\Users\timsa\Desktop\sample_pictures\folder_...
8,9,C:\Users\timsa\Desktop\sample_pictures\folder_...
9,10,C:\Users\timsa\Desktop\sample_pictures\folder_...


In [3]:
# normalizing the images to the same dimensions,
# image as vector
# calculating distances: image with itself and with others
# image embeddings?
# if pictures are embedded, the Hamming distance might come in handy
# 


'''
The JIT decorator (currently commented out) is meant to parallelize the calculations.
Image details are extracted and saved to a DataFrame.
The image vectors are processed separately because the StandardScaler, which is crucial for PCA, needs a batch of images.
________
PCA_COMPONENTS: we need to find a balance between reduced dimensionality and preserved variation in the data - still to be tested (TODO)
'''




#@jit(parallel=True)
def calculate_avg_color_brightness(pixels):
    """
    Calculates the average color, brightness and average HSV value of an image.

    Parameters:
    pixels (list or np.ndarray): Integer RGB triples, one per pixel, e.g. the
        list of tuples produced by PIL's getdata() for an 'RGB' image.

    Returns:
    tuple: (avg_color, avg_brightness, avg_hsv) where
        avg_color is a 3-tuple of ints (floor of the per-channel mean),
        avg_brightness is an int (floor of the mean over all channel values),
        avg_hsv is a 3-tuple of floats (mean of the pixels after OpenCV's
        RGB -> HSV conversion).

    Raises:
    ValueError: If pixels is empty.
    """
    rgb = np.asarray(pixels).reshape(-1, 3)
    n_pixels = rgb.shape[0]
    if n_pixels == 0:
        raise ValueError("pixels must contain at least one RGB pixel")

    # Sum per channel in one vectorised pass instead of a Python-level loop over
    # every pixel. The accumulator is forced to int64: large images exceed the
    # int32 range, which is numpy's default integer on some platforms.
    channel_sums = rgb.sum(axis=0, dtype=np.int64)
    avg_color = tuple(int(channel_total) // n_pixels for channel_total in channel_sums)
    avg_brightness = int(channel_sums.sum()) // (3 * n_pixels)

    # Convert pixels to HSV and calculate average HSV. OpenCV expects an image
    # shaped array of uint8, hence the (N, 1, 3) reshape.
    hsv_pixels = cv2.cvtColor(rgb.reshape(-1, 1, 3).astype(np.uint8), cv2.COLOR_RGB2HSV).reshape(-1, 3)
    avg_hsv = tuple(np.mean(hsv_pixels, axis=0))

    return avg_color, avg_brightness, avg_hsv

def extract_image_details(image_path, size=None):
    """
    Extracts details from an image: a resized and flattened pixel vector,
    average color, brightness, average HSV, resolution, DPI, EXIF metadata,
    file size, file type and a freshly generated unique ID.

    Parameters:
    image_path (str): The file path of the image.
    size (tuple, optional): Target size handed to cv2.resize. When omitted the
        notebook-level variable ``desired_size`` is used, so existing callers
        keep their behavior.

    Returns:
    dict: A dictionary containing image details and the resized image vector,
        or None when the image could not be processed (the error is printed).
    """
    try:
        with Image.open(image_path) as source:
            # EXIF has to be read from the object returned by Image.open():
            # convert() returns a plain Image without _getexif(), which used to
            # leave the metadata empty for every file.
            try:
                exif_data = source._getexif()
            except Exception:
                # Metadata is best effort: formats without EXIF support or a
                # damaged EXIF block must not cause the whole image to be skipped.
                exif_data = None
            metadata = {ExifTags.TAGS[k]: v for k, v in exif_data.items() if k in ExifTags.TAGS} if exif_data else {}

            img = source.convert('RGB')  # Convert to RGB

            # Resize image
            # NOTE(review): cv2.imread decodes the file a second time and, per the
            # OpenCV docs, yields channels in BGR order while the PIL pixels below
            # are RGB; the vector is kept in OpenCV's order to leave results unchanged.
            img_cv = cv2.imread(image_path)
            if img_cv is None:
                # imread signals failure by returning None instead of raising
                raise ValueError("OpenCV could not read the image")
            target_size = desired_size if size is None else size
            resized = cv2.resize(img_cv, target_size)
            img_as_1d = np.asarray(resized).flatten()

            # Color values and brightness
            pixels = list(img.getdata())
            avg_color, avg_brightness, avg_hsv = calculate_avg_color_brightness(pixels)

            # Resolution and DPI; (0, 0) marks a file without DPI information
            resolution = img.size  # (width, height)
            dpi = img.info.get('dpi', (0, 0))

            # File details
            file_size = os.path.getsize(image_path)
            file_type = os.path.splitext(image_path)[1]

            # UUID (random, so it differs on every call for the same file)
            unique_id = str(uuid.uuid4())

        return {
            'ID': unique_id,
            'File_Path': image_path,
            'Average_Color': avg_color,
            'Brightness': avg_brightness,
            'Average_HSV': avg_hsv,
            'Resolution': resolution,
            'DPI': dpi,
            'File_Size': file_size,
            'File_Type': file_type,
            'Metadata': metadata,
            'Resized_Image_Vector': img_as_1d
        }
    except Exception as e:
        # Deliberate best effort: a broken file is reported and skipped so that
        # one bad image does not abort the whole batch.
        print(f"Error processing {image_path}: {e}")
        return None

In [4]:
def process_large_image_dataset(generator, total_images, batch_size, n_components):
    """
    Fits an IncrementalPCA on a large image dataset, one batch at a time.

    The generator is consumed until it is exhausted. This lets the generator
    run its own clean-up (for example finishing and closing a progress bar) and
    avoids a StopIteration when it yields fewer batches than
    total_images / batch_size would suggest.

    Parameters:
    generator (iterable): Yields (batch_df, image_vectors) pairs; only
        image_vectors is used here.
    total_images (int): Total number of images in the dataset. Kept for
        backward compatibility; the number of batches is taken from the
        generator itself.
    batch_size (int): Number of images per batch. Kept for backward
        compatibility, see total_images.
    n_components (int): Number of principal components to keep.

    Returns:
    IncrementalPCA: The fitted IncrementalPCA model.
    """
    ipca = IncrementalPCA(n_components=n_components)

    for _batch_df, image_vectors in generator:
        # A batch in which every image failed to load carries no vectors and
        # would make partial_fit raise; there is nothing to learn from it.
        if len(image_vectors) == 0:
            continue
        ipca.partial_fit(image_vectors)

    return ipca


def transform_image_features(batch_df, image_vectors, ipca):
    """
    Projects a batch of image vectors with a fitted PCA model and attaches the
    result to the batch DataFrame.

    Parameters:
    batch_df (pd.DataFrame): Image details of the batch; modified in place.
    image_vectors (np.array): Image vectors of the batch, passed to ipca.transform.
    ipca (IncrementalPCA): The pre-fitted model (anything offering transform()).

    Returns:
    pd.DataFrame: The same batch_df object, now carrying a 'PCA_Vectors'
        column with one projected vector per row.
    """
    projected = ipca.transform(image_vectors)
    # Store one projected vector per row instead of a 2-D block.
    batch_df['PCA_Vectors'] = [row for row in projected]
    return batch_df

def compute_and_store_pairwise_distances(df):
    """
    Adds the Euclidean distances between all PCA vectors of a DataFrame.

    Parameters:
    df (pd.DataFrame): Image details with a 'PCA_Vectors' column; modified in place.

    Returns:
    pd.DataFrame: The same df object with a 'Pairwise_Distances' column. Each
        row holds a list with the distances from that row's vector to the
        vector of every row in df (including itself).
    """
    # Stack the per-row vectors into one 2-D matrix, one row per image.
    stacked = np.vstack(df['PCA_Vectors'].to_numpy())
    df['Pairwise_Distances'] = pairwise_distances(stacked, metric='euclidean').tolist()
    return df

In [5]:
def image_batch_generator(image_files, batch_size, show_progress=True):
    """
    Yields the extracted details of the given images batch by batch.

    Parameters:
    image_files (list): Paths of the images to process.
    batch_size (int): Number of image paths per batch.
    show_progress (bool): Whether to show a tqdm progress bar counting batches.

    Yields:
    tuple: (df, image_vectors) where df is a DataFrame with one row per image
        that could be processed and image_vectors is an array built from the
        'Resized_Image_Vector' entries of those rows. Images for which
        extract_image_details returns None are left out, so a batch can hold
        fewer rows than batch_size (or none at all).
    """
    total_batches = (len(image_files) + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc="Processing images") if show_progress else None

    try:
        for index in range(0, len(image_files), batch_size):
            batch = image_files[index:index + batch_size]
            details_list = [
                details
                for details in (extract_image_details(image) for image in batch)
                if details is not None
            ]
            image_vectors = np.array([details['Resized_Image_Vector'] for details in details_list])
            df = pd.DataFrame(details_list)
            # Count the batch before handing it out: a consumer that pulls
            # exactly total_batches items with next() never resumes the
            # generator after the last yield, which left the bar at 0/1.
            if progress_bar is not None:
                progress_bar.update(1)
            yield df, image_vectors
    finally:
        # Also runs when the generator is closed or garbage collected early,
        # so the bar is always released.
        if progress_bar is not None:
            progress_bar.close()

In [7]:
# Execution of code
# Two passes over all images: the first pass fits the IncrementalPCA, the second
# pass projects every batch with the fitted model and adds within-batch distances.

total_images = len(image_paths)
batch_size = 100
desired_size = (60, 60)   # target size for cv2.resize; read as a global by extract_image_details
n_components = 10 # the number of features our iPCA will keep for every image

generator = image_batch_generator(image_paths, batch_size, show_progress=True)   # yields (batch_df, image vectors as 1-D arrays)
ipca = process_large_image_dataset(generator, total_images, batch_size, n_components) # needed to fit the iPCA 

# After fitting IPCA, transform all data.
# A fresh generator is created, so every image is loaded and analysed a second time.
for batch_df, image_vectors in image_batch_generator(image_paths, batch_size, show_progress=False):
    transformed_batch_df = transform_image_features(batch_df, image_vectors, ipca)
    # distances are computed between the images of this batch only
    transformed_batch_df = compute_and_store_pairwise_distances(transformed_batch_df)

    
    # NOTE(review): the value of head() is discarded inside the loop, and
    # transformed_batch_df is overwritten on every iteration, so only the last
    # batch is available after the loop.
    transformed_batch_df.head()   

'''
This produces the metadata batch by batch; each batch overwrites the previous one.
You need to extract the details from ___transformed_batch_df___ into SQLite.
In summary, the details extracted so far are:

ID
File_Path
Average_Color
Brightness
Average_HSV
Resolution
DPI
File_Size
File_Type
Metadata
Resized_Image_Vector
PCA_Vectors
Pairwise_Distances

NOTE "Metadata" might be empty
'''

Processing images:   0%|          | 0/1 [00:00<?, ?it/s]

Error processing C:\Users\timsa\Desktop\sample_pictures\test_2.bmp: cannot identify image file 'C:\\Users\\timsa\\Desktop\\sample_pictures\\test_2.bmp'
Error processing C:\Users\timsa\Desktop\sample_pictures\test_2.bmp: cannot identify image file 'C:\\Users\\timsa\\Desktop\\sample_pictures\\test_2.bmp'


'\nThis outputs the metadata batchwise and overwrites it.\nYou need extract the details from ___transformed_batch_df___ into SQLite.\nIn summary, extracted details by now are:\n\nID\nFile_Path\nAverage_Color\nBrightness\nResolution\nDPI\nFile_Size\nFile_Type\nMetadata\nResized_Image_Vector\nPCA_Vectors\nPairwise_Distances\n\nNOTE "Metadata" might be empty\n'

In [8]:
# Preview the first rows of transformed_batch_df, i.e. of the batch that was
# processed last in the transformation loop.
transformed_batch_df.head()

Unnamed: 0,ID,File_Path,Average_Color,Brightness,Average_HSV,Resolution,DPI,File_Size,File_Type,Metadata,Resized_Image_Vector,PCA_Vectors,Pairwise_Distances
0,7ebad40f-b5e5-4f95-b062-6732a09c6c4f,C:\Users\timsa\Desktop\sample_pictures\0031.png,"(128, 114, 94)",112,"(41.015004482619005, 86.44355840418763, 135.94...","(2040, 1356)","(0, 0)",4833865,.png,{},"[195, 187, 177, 188, 167, 162, 144, 123, 139, ...","[496.6541985933603, -888.0665161932402, -3756....","[0.0, 14485.56864333656, 13326.193082185568, 8..."
1,e56f4993-10ec-4bf5-8755-45ea56762511,C:\Users\timsa\Desktop\sample_pictures\adam-bi...,"(233, 234, 237)",235,"(109.27585941666666, 4.631473, 237.66863866666...","(4000, 6000)","(72, 72)",513161,.jpg,{},"[222, 212, 208, 224, 213, 207, 225, 216, 209, ...","[12632.571816986021, -259.16952814348303, 2069...","[14485.56864333656, 0.0, 23876.54477753193, 17..."
2,67d8e9f8-638b-4e7f-ac5e-35c6ef6f8654,C:\Users\timsa\Desktop\sample_pictures\adrian-...,"(6, 6, 6)",6,"(9.852003602358767, 6.238003348969191, 6.46427...","(3213, 5714)","(72, 72)",272983,.jpg,{},"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-11009.597883374357, -501.0693401087662, -502...","[13326.193082185568, 23876.54477753193, 0.0, 8..."
3,e517d62a-2a7d-4819-8527-ac2d93fffe70,C:\Users\timsa\Desktop\sample_pictures\folder_...,"(78, 78, 70)",75,"(47.099725673315305, 66.7132120520622, 83.9589...","(2040, 1392)","(0, 0)",3795639,.png,{},"[98, 98, 94, 41, 48, 38, 20, 32, 32, 14, 26, 2...","[-3963.8594334778113, 2623.8053094494276, 1098...","[8896.002320024618, 17006.6524923782, 8122.580..."
4,fbfcf3c4-fd61-4bc3-a51f-5c39c44152a0,C:\Users\timsa\Desktop\sample_pictures\folder_...,"(91, 55, 58)",68,"(75.47255355846042, 130.91772352103234, 92.193...","(2040, 1404)","(0, 0)",4953978,.png,{},"[55, 51, 61, 36, 27, 30, 110, 51, 32, 161, 113...","[-4741.879883035555, -2071.263536722551, 1369....","[9830.332968496801, 17635.260606687807, 7388.9..."


In [None]:
# Summarise the PCA_Vectors column; it holds one array per row (object dtype),
# so describe() reports count/unique/top/freq rather than numeric statistics.
transformed_batch_df["PCA_Vectors"].describe()

count                                                   100
unique                                                  100
top       [-4512.473313312545, -1471.4342959269018, -138...
freq                                                      1
Name: PCA_Vectors, dtype: object