## Dataset Information

This dataset comes from Hugging Face: it is the celebrity-1000 dataset, which contains images of the top 1000 celebrities and can be used for Convolutional Neural Network tasks. It contains a total of 18,184 images at a 256x256 resolution. The dataset is available here: https://huggingface.co/datasets/tonyassi/celebrity-1000

## Changelog
### 23/03/2025
- Figured out a way to view images with PIL by reading the raw bytes.
- The current method is far too slow and inefficient; looking into vectorised approaches.
- Image conversion is using up all system memory and freezing the entire OS; looking for alternative approaches.

### 24/03/2025
- Discovered that the memory issues are stemming from BytesIO. This keeps the images in memory in an 'efficient' way, rather than on disk, to be quicker. However, since there are just so many images, even this so-called efficient storage is not enough. This is what is using up all the memory. A new approach will be to process N images, save them to disk, clear memory and resume.

In [2]:
import io
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

# Read the whole Parquet file into a pandas DataFrame in one go.
# decode_images_batch expects a frame with an 'image' column (records holding
# the encoded data under 'bytes') and a 'label' column -- presumably this one.
df = pq.read_table('data/data.parquet').to_pandas()

# Legacy single-record decoder: extremely slow and inefficient, do not use.
def decode_image(image):
    """Decode one image record into a scaled NumPy array.

    Args:
        image: Mapping whose ``'bytes'`` entry holds the encoded image data.

    Returns:
        The image resized to 256x256 as an array divided by 255 (which maps
        8-bit pixel values into the [0, 1] range).
    """
    buffer = io.BytesIO(image['bytes'])
    resized = Image.open(buffer).resize((256, 256))  # force a consistent size
    return np.array(resized) / 255.0  # scale pixel values by 1/255

def decode_single_image(image_bytes):
    """Decode a single image from bytes.

    The image is converted to RGB before resizing so that every decoded
    array has the same ``(256, 256, 3)`` shape whatever the source mode
    (grayscale, palette, RGBA, ...).  Without the conversion, inputs of
    mixed modes produce arrays of different shapes that cannot be stacked
    into a single batch array.

    Args:
        image_bytes: Raw encoded image data in any format Pillow can open.

    Returns:
        A float array of shape ``(256, 256, 3)`` with values in [0, 1].
    """
    # The context manager releases Pillow's handle on the buffer as soon as
    # the pixel data has been converted into a standalone RGB image.
    with Image.open(io.BytesIO(image_bytes)) as image:
        rgb = image.convert("RGB").resize((256, 256))
    return np.asarray(rgb) / 255.0  # 8-bit channels -> [0, 1]

def decode_images_batch(df, batch_size=32, use_parallel=True, max_workers=None):
    """Yield decoded images and their labels from ``df`` one batch at a time.

    Args:
        df: DataFrame with an ``'image'`` column of records holding the
            encoded data under ``'bytes'`` and a ``'label'`` column.
        batch_size: Number of rows decoded per yielded batch.
        use_parallel: Decode each batch with a thread pool when true,
            sequentially otherwise.
        max_workers: Thread-pool size; defaults to the machine's CPU count.

    Yields:
        ``(images, labels)`` tuples: the decoded images of the batch stacked
        with ``np.array`` and the matching labels as a NumPy array.
    """
    if max_workers is None:
        max_workers = multiprocessing.cpu_count()

    total_images = len(df)
    print(f"Processing {total_images} images in batches of {batch_size}")

    # One pool for the whole run instead of creating and joining a fresh set
    # of threads for every batch.
    executor = ThreadPoolExecutor(max_workers=max_workers) if use_parallel else None
    try:
        for start in range(0, total_images, batch_size):
            # iloc clamps the slice end, so the last batch may be shorter.
            batch_df = df.iloc[start:start + batch_size]
            batch_labels = batch_df['label'].to_numpy()

            # Read the column directly; iterrows() would build a full Series
            # per row just to fetch this one field.
            image_bytes_list = [record['bytes'] for record in batch_df['image']]

            if executor is not None:
                batch_images = list(executor.map(decode_single_image, image_bytes_list))
            else:
                batch_images = [decode_single_image(raw) for raw in image_bytes_list]

            yield np.array(batch_images), batch_labels
    finally:
        # Runs on exhaustion, on error, and when the generator is closed early.
        if executor is not None:
            executor.shutdown()


The above method is far too memory-inefficient. Trying an alternative below.

In [3]:
def process_parquet_images(parquet_file, target_size=(256, 256)):
    """
    Load every image stored in a Parquet file into memory as NumPy arrays.

    Args:
        parquet_file: Path to the Parquet file.
        target_size:  Tuple (width, height) for resizing.

    Returns:
        A list with one NumPy array per table row, each resized to
        ``target_size`` and divided by 255.  The whole list is held in
        memory at once.
    """

    def _to_array(encoded):
        # BytesIO lets Pillow decode straight from memory, no temp file.
        picture = Image.open(io.BytesIO(encoded)).resize(target_size)
        return np.array(picture) / 255.0  # scale pixel values by 1/255

    table = pq.read_table(parquet_file)
    image_column = table['image']
    return [
        _to_array(image_column[row]['bytes'].as_py())
        for row in range(len(table))
    ]

# file_path = "data/data.parquet"
# images = process_parquet_images(file_path)
                                    
# print(images[0].shape)  # Check the shape of a processed image
# print(images[0].min(),images[0].max()) #verify normalization



Trying a new method: process N images, save them to disk, then free the memory using explicit garbage collection before moving on.

In [5]:
import gc # garbage collection
import os

def process_and_save_images(parquet_file, output_dir, target_size=(256, 256), batch_size=100):
    """
    Processes images from a Parquet file in batches, saving them to disk.

    Each image is written to ``output_dir`` as ``image_<row index>.npy`` as
    soon as it has been decoded.  The file name therefore always matches the
    image's row in the Parquet file, even when earlier rows in the same batch
    failed to decode, and only one decoded image is held at a time.

    Args:
        parquet_file: Path to the Parquet file.
        output_dir: Directory to save processed images (created if missing).
        target_size: Tuple (width, height) for resizing.
        batch_size: Number of images per batch; must be at least 1.  Progress
            is reported and garbage collection is forced after each batch.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Validate before touching the filesystem: 0 would divide by zero below
    # and a negative value would silently process nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    table = pq.read_table(parquet_file)
    image_column = table['image']
    num_images = len(table)
    num_batches = (num_images + batch_size - 1) // batch_size  # ceiling division

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min(start_index + batch_size, num_images)

        for i in range(start_index, end_index):
            try:
                image_bytes = image_column[i]['bytes'].as_py()
                image = Image.open(io.BytesIO(image_bytes))
                image = image.resize(target_size)
                image_array = np.array(image) / 255.0  # Normalize
            except Exception as e:
                # Best effort: report the bad row and carry on with the rest.
                print(f"Error processing image {i}: {e}")
                continue

            # Name the file after the row index itself (not a position in a
            # list of successes) so skipped rows never shift later names.
            filename = os.path.join(output_dir, f"image_{i:05d}.npy")  # e.g., image_00000.npy
            np.save(filename, image_array)

        # Reclaim whatever the batch left behind before starting the next one.
        gc.collect()

        print(f"Processed and saved batch {batch_num + 1}/{num_batches}")

    print("Image processing complete.")


# Example usage: convert every image in the Parquet file into its own .npy
# file under output_dir.
parquet_file = "data/data.parquet"  # Replace with your file path
output_dir = "data/images"  # created by process_and_save_images if it does not exist
# With the 18,184 images of this dataset, a batch size of 5000 gives 4 batches.
process_and_save_images(parquet_file, output_dir, batch_size=5000) # Batch size of 5000 fits inside memory comfortably

Processed and saved batch 1/4
Processed and saved batch 2/4
Processed and saved batch 3/4
Processed and saved batch 4/4
Image processing complete.
