In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from PIL import Image, ExifTags
import uuid
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from scipy.spatial import distance 
from sklearn.metrics import pairwise_distances
from numba import jit, prange
from concurrent.futures import ProcessPoolExecutor

from scipy.spatial import distance
from sklearn.metrics import pairwise

In [2]:
"""
Es sind insgesamt 444670 Bilder, deshalb kann max_files gesetzt werden und tqdm läuft akkurat.
Eine andere Möglichkeit die tqdm bar anzupassen gibt es nicht, weil die Bilder dynamisch geladen werden.

"""

def find_image_files(root_dir, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'), max_files=500):
    """
    Recursively collect image file paths below ``root_dir``.

    Parameters:
    root_dir (str): Directory that is walked recursively.
    extensions (str or iterable of str): File suffixes to accept. Matching is
        case-insensitive.
    max_files (int or None): Upper bound on the number of returned paths.
        ``None`` disables the limit; a value <= 0 returns an empty list.

    Returns:
    list: Paths (``os.path.join(subdir, file)``) of the matching files, in a
        deterministic (sorted) traversal order.
    """
    # str.endswith only accepts a str or a tuple, so normalise whatever was passed in.
    # Lower-casing the suffixes keeps them comparable with the lower-cased file name.
    if isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    image_files = []
    if max_files is not None and max_files <= 0:
        return image_files

    # The bar is created before walking because the number of files is not known
    # in advance; max_files serves as the bar's total.
    pbar = tqdm(total=max_files, desc='Durchsuche Verzeichnisse')
    try:
        for subdir, dirs, files in os.walk(root_dir):
            # Sorting in place makes os.walk descend in a stable order, so the
            # subset selected under max_files does not depend on the filesystem.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(suffixes):
                    continue
                image_files.append(os.path.join(subdir, file))
                pbar.update(1)
                if max_files is not None and len(image_files) >= max_files:
                    return image_files  # stop searching once the limit is reached
    finally:
        # Close the bar on every exit path, including exceptions raised while walking.
        pbar.close()
    return image_files

# Collect image paths below the data directory (absolute, machine-specific path).
image_paths = find_image_files("D:\\data\\image_data")

# Report how many images were found and show at most the first 10 paths as a sanity check.
print(f"Anzahl gefundener Bilder: {len(image_paths)}")
if len(image_paths) <= 10:
    print("Gefundene Bildpfade:", image_paths)
else:
    print("Einige der gefundenen Bildpfade:", image_paths[:10])

Durchsuche Verzeichnisse:   0%|          | 0/500 [00:00<?, ?it/s]

Anzahl gefundener Bilder: 500
Einige der gefundenen Bildpfade: ['D:\\data\\image_data\\coco2017_train\\train2017\\000000000034.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000049.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000071.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000078.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000081.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000089.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000208.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000241.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000247.jpg', 'D:\\data\\image_data\\coco2017_train\\train2017\\000000000283.jpg']


In [3]:
# normalizing the images to the same dimensions,
# image as vector
# calculating distances between each image and itself and the others
# image embeddings?
# if the pictures are embedded, Hamming distance might come in handy
#


'''
The JIT decorator is meant to parallelize the calculations (the @jit line below is currently commented out)
Extracting image details and saving them to a DataFrame
Processing the image vectors separately because the StandardScaler, which is crucial for PCA, needs a batch of images
________
PCA_COMPONENTS needs to strike a balance between reduced dimensionality and preserved variation in the data - test with
'''




#@jit(parallel=True)
def calculate_avg_color_brightness(pixels):
    """
    Compute the per-channel average color and the overall brightness.

    Parameters:
    pixels (list): Sequence of per-pixel channel tuples, e.g. ``(R, G, B)``.

    Returns:
    tuple: ``(avg_color, avg_brightness)``. ``avg_color`` holds one
        floor-divided mean per channel; ``avg_brightness`` is the sum of all
        channel values floor-divided by ``3 * len(pixels)``.
    """
    pixel_count = len(pixels)

    # Transpose the pixel list into one sequence per channel and average each.
    channel_means = []
    for channel_values in zip(*pixels):
        channel_means.append(sum(channel_values) // pixel_count)
    avg_color = tuple(channel_means)

    # Brightness uses every channel value of every pixel; the divisor assumes 3 channels.
    grand_total = sum(map(sum, pixels))
    avg_brightness = grand_total // (3 * pixel_count)

    return avg_color, avg_brightness

def extract_image_details(image_path, target_size=None):
    """
    Extracts details from an image: a resized pixel vector, average color,
    brightness, resolution, DPI, EXIF metadata, file size/type and a fresh UUID.

    Parameters:
    image_path (str): The file path of the image.
    target_size (tuple or None): Size passed to ``cv2.resize``. When ``None``,
        the module-level ``desired_size`` is used, so that variable must be
        defined before the function is called.

    Returns:
    dict or None: A dictionary with the keys ``ID``, ``File_Path``,
        ``Average_Color``, ``Brightness``, ``Resolution``, ``DPI``,
        ``File_Size``, ``File_Type``, ``Metadata`` and ``Resized_Image_Vector``.
        ``None`` is returned (after printing the error) if anything fails.
    """
    try:
        # Fall back to the notebook-level setting so existing callers keep working.
        size = desired_size if target_size is None else target_size

        with Image.open(image_path) as source_img:
            # Read EXIF from the opened file object. The original code read it
            # after convert('RGB'); the converted image has no _getexif, so the
            # metadata was always empty.
            read_exif = getattr(source_img, '_getexif', None)
            exif_data = read_exif() if callable(read_exif) else None
            if exif_data:
                metadata = {ExifTags.TAGS[k]: v for k, v in exif_data.items() if k in ExifTags.TAGS}
            else:
                metadata = {}

            # Resolution and DPI
            resolution = source_img.size  # (width, height)
            dpi = source_img.info.get('dpi', (0, 0))

            # Resized pixel vector; the file is decoded a second time with OpenCV.
            img_cv = cv2.imread(image_path)
            if img_cv is None:
                # cv2.imread reports failure by returning None instead of raising.
                raise ValueError("cv2.imread could not read the image")
            img_as_1d = cv2.resize(img_cv, size).flatten()

            # Color values and brightness are computed on the full-size RGB image.
            rgb_img = source_img.convert('RGB')
            pixels = list(rgb_img.getdata())
            avg_color, avg_brightness = calculate_avg_color_brightness(pixels)

            # File details
            file_size = os.path.getsize(image_path)
            file_type = os.path.splitext(image_path)[1]

            # UUID (random, so it differs on every call for the same file)
            unique_id = str(uuid.uuid4())

        return {
            'ID': unique_id,
            'File_Path': image_path,
            'Average_Color': avg_color,
            'Brightness': avg_brightness,
            'Resolution': resolution,
            'DPI': dpi,
            'File_Size': file_size,
            'File_Type': file_type,
            'Metadata': metadata,
            'Resized_Image_Vector': img_as_1d
        }
    except Exception as e:
        # Best effort: a broken file must not abort the whole batch.
        print(f"Error processing {image_path}: {e}")
        return None

In [4]:
def process_large_image_dataset(generator, total_images, batch_size, n_components):
    """
    Fits an IncrementalPCA on a large image dataset, one batch at a time.

    Parameters:
    generator (generator): Yields ``(batch_df, image_vectors)`` tuples.
    total_images (int): Total number of images in the dataset.
    batch_size (int): Number of images per batch.
    n_components (int): Number of principal components to keep.

    Returns:
    IncrementalPCA: The model after all usable batches were passed to
        ``partial_fit``. It is returned unfitted when no batch was usable.

    Notes:
    At most ``len(range(0, total_images, batch_size))`` batches are fitted.
    A generator that ends early simply stops the fit instead of leaking
    ``StopIteration``. Empty batches are skipped. Batches with fewer rows than
    ``n_components`` are held back and merged into the next batch.
    """
    ipca = IncrementalPCA(n_components=n_components)

    max_batches = len(range(0, total_images, batch_size))
    if max_batches == 0:
        return ipca

    min_rows = n_components or 1
    pending = None  # rows held back because their batch was too small to fit alone

    # Iterating (rather than calling next() a fixed number of times) lets a
    # generator with exactly max_batches batches run to completion and finalize
    # itself. A longer generator is asked for one extra batch, which is discarded.
    for batch_index, (_, image_vectors) in enumerate(generator):
        if batch_index >= max_batches:
            break

        image_vectors = np.asarray(image_vectors)
        if image_vectors.ndim != 2 or image_vectors.shape[0] == 0:
            continue  # nothing usable in this batch

        if pending is not None:
            image_vectors = np.vstack([pending, image_vectors])
            pending = None

        if image_vectors.shape[0] < min_rows:
            pending = image_vectors
            continue

        ipca.partial_fit(image_vectors)

    if pending is not None:
        print(f"Skipped {pending.shape[0]} image vectors: fewer than n_components={n_components} rows were left to fit")

    return ipca


def transform_image_features(batch_df, image_vectors, ipca):
    """
    Projects a batch of image vectors with a fitted PCA and stores the result.

    Parameters:
    batch_df (pd.DataFrame): Details of the images in the batch; modified in place.
    image_vectors (np.array): Image vectors of the batch, passed to ``ipca.transform``.
    ipca (IncrementalPCA): The pre-fitted model.

    Returns:
    pd.DataFrame: The same ``batch_df`` object, now carrying a ``PCA_Vectors``
        column with one projected vector per row.
    """
    projected = ipca.transform(image_vectors)
    # Split the projected matrix into one entry per DataFrame row.
    batch_df['PCA_Vectors'] = [component_row for component_row in projected]
    return batch_df

def compute_and_store_pairwise_distances(df):
    """
    Adds the Euclidean distances between all PCA vectors of a DataFrame.

    Parameters:
    df (pd.DataFrame): Image details including a ``PCA_Vectors`` column;
        modified in place.

    Returns:
    pd.DataFrame: The same ``df`` object with a ``Pairwise_Distances`` column.
        Each row holds the list of distances from that row's vector to every
        vector in ``df`` (itself included).
    """
    # Stack the per-row vectors into one 2-D matrix, one row per DataFrame row.
    stacked = np.vstack(list(df['PCA_Vectors']))

    distance_matrix = pairwise_distances(stacked, metric='euclidean')

    # Row i of the matrix becomes the plain-list entry of DataFrame row i.
    df['Pairwise_Distances'] = [matrix_row.tolist() for matrix_row in distance_matrix]
    return df

In [5]:
def image_batch_generator(image_files, batch_size, show_progress=True):
    """
    Yields the extracted details of ``image_files`` in batches.

    Parameters:
    image_files (list): Image file paths.
    batch_size (int): Number of paths handed to ``extract_image_details`` per batch.
    show_progress (bool): Whether to display a tqdm bar counting finished batches.

    Yields:
    tuple: ``(df, image_vectors)``. ``df`` is a DataFrame with one row per
        successfully processed image. ``image_vectors`` is an array built from
        the ``Resized_Image_Vector`` entries of those rows. Images for which
        ``extract_image_details`` returned ``None`` are dropped, so a batch
        may be smaller than ``batch_size`` or even empty.
    """
    total_batches = (len(image_files) + batch_size - 1) // batch_size
    progress_bar = tqdm(total=total_batches, desc="Processing images") if show_progress else None

    try:
        for index in range(0, len(image_files), batch_size):
            batch = image_files[index:index + batch_size]
            details_list = [details for details in map(extract_image_details, batch) if details is not None]
            image_vectors = np.array([details['Resized_Image_Vector'] for details in details_list])
            df = pd.DataFrame(details_list)
            # Count the batch before handing it out: a consumer that never asks
            # for another batch would otherwise leave the last one uncounted.
            if progress_bar is not None:
                progress_bar.update(1)
            yield df, image_vectors
    finally:
        # Also runs when the consumer stops early and the generator is closed.
        if progress_bar is not None:
            progress_bar.close()

In [6]:
# Execution of code

total_images = len(image_paths)
batch_size = 100
desired_size = (60, 60)   # size which the images will be resized to (read as a global by extract_image_details)
n_components = 10 # the amount of features our iPCA will keep for every image

# Pass 1: stream every batch through the iPCA once so it can be fitted.
generator = image_batch_generator(image_paths, batch_size, show_progress=True)   # gives back batch_df + image as 1d
ipca = process_large_image_dataset(generator, total_images, batch_size, n_components) # needed to fit the iPCA
generator.close()  # release the fit-pass generator in case it was not run to completion

# Pass 2: after fitting the iPCA, extract the images again and transform every batch.
# transformed_batch_df is rebound on each iteration, so only the last batch
# survives the loop; the pairwise distances are computed within one batch only.
for batch_df, image_vectors in image_batch_generator(image_paths, batch_size, show_progress=False):
    transformed_batch_df = transform_image_features(batch_df, image_vectors, ipca)
    transformed_batch_df = compute_and_store_pairwise_distances(transformed_batch_df)

'''
This outputs the metadata batch by batch; each batch overwrites the previous one.
You need to extract the details from ___transformed_batch_df___ into SQLite.
In summary, extracted details by now are:

ID
File_Path
Average_Color
Brightness
Resolution
DPI
File_Size
File_Type
Metadata
Resized_Image_Vector
PCA_Vectors
Pairwise_Distances

NOTE "Metadata" might be empty
'''

Processing images:   0%|          | 0/5 [00:00<?, ?it/s]

'\nThis outputs the metadata batchwise and overwrites it.\nYou need extract the details from ___transformed_batch_df___ into SQLite.\nIn summary, extracted details by now are:\n\nID\nFile_Path\nAverage_Color\nBrightness\nResolution\nDPI\nFile_Size\nFile_Type\nMetadata\nResized_Image_Vector\nPCA_Vectors\nPairwise_Distances\n\nNOTE "Metadata" might be empty\n'

In [None]:
# Preview the first rows of transformed_batch_df, i.e. of the last batch the
# transform loop assigned to it.
transformed_batch_df.head()

Unnamed: 0,ID,File_Path,Average_Color,Brightness,Resolution,DPI,File_Size,File_Type,Metadata,Resized_Image_Vector,PCA_Vectors,Pairwise_Distances
0,f3987f73-d46e-4d0d-9945-ca03d710bca6,D:\data\image_data\coco2017_train\train2017\00...,"(71, 67, 64)",67,"(640, 562)","(72, 72)",111422,.jpg,{},"[18, 18, 56, 26, 20, 70, 83, 85, 109, 110, 109...","[-4512.473313312545, -1471.4342959269018, -138...","[0.0, 6635.948449420372, 5687.580701034129, 32..."
1,427f5baf-a43f-434a-9db2-02809a268d59,D:\data\image_data\coco2017_train\train2017\00...,"(139, 121, 66)",109,"(640, 480)","(72, 72)",95569,.jpg,{},"[96, 184, 224, 113, 193, 224, 132, 206, 228, 1...","[-835.2963255805532, -942.5905031221771, 2307....","[6635.948449420372, 0.0, 5259.761316618222, 58..."
2,0d74e760-1cb3-49b2-a48e-3a177cbf355a,D:\data\image_data\coco2017_train\train2017\00...,"(103, 109, 85)",99,"(427, 640)","(72, 72)",268466,.jpg,{},"[197, 223, 217, 199, 183, 171, 195, 203, 198, ...","[-501.49420655028104, -4558.463264780337, -266...","[5687.580701034129, 5259.761316618222, 0.0, 55..."
3,75020130-1650-4e54-b2d9-9669bd7eadd5,D:\data\image_data\coco2017_train\train2017\00...,"(89, 88, 88)",88,"(640, 480)","(72, 72)",161052,.jpg,{},"[26, 25, 25, 52, 60, 65, 48, 61, 58, 186, 175,...","[-2304.2186613479794, -42.842444229631994, -14...","[3214.4082551706397, 5820.707502516365, 5526.4..."
4,8a22ede1-051c-4b9a-9e7d-4dcac3b7bf50,D:\data\image_data\coco2017_train\train2017\00...,"(171, 171, 167)",169,"(640, 411)","(72, 72)",97074,.jpg,{},"[159, 159, 159, 156, 156, 156, 157, 157, 157, ...","[5591.430876170691, 1999.325915828613, 196.939...","[11004.13469208075, 8007.076008069452, 9258.61..."


In [None]:
# Summarise the PCA_Vectors column; it has object dtype, so describe() reports
# count / unique / top / freq rather than numeric statistics.
transformed_batch_df["PCA_Vectors"].describe()

count                                                   100
unique                                                  100
top       [-4512.473313312545, -1471.4342959269018, -138...
freq                                                      1
Name: PCA_Vectors, dtype: object