## Image Processing

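The code organizes an image dataset into training and test sets. Three categories are planned (human, dalle, and stable), but the stable category is currently commented out in the code, so only human and dalle images are processed. A random sample of each category is selected for training and the remaining images are used for testing. Each image is loaded, resized (WebP images are additionally cropped), converted to RGB, and turned into a NumPy array scaled to the range [0, 1]. Training and test batches are then created and saved as pickle files for later use.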

In [1]:
import pickle as pkl
from PIL import Image
import numpy as np
import os
import webp
import random
from tqdm import tqdm

def get_random_samples(file_list, sample_size):
    """Draw ``sample_size`` distinct entries from ``file_list``.

    Sampling is done without replacement via ``random.sample``, so the
    input list is left untouched and a ``ValueError`` is raised when
    ``sample_size`` exceeds ``len(file_list)``.
    """
    chosen = random.sample(file_list, k=sample_size)
    return chosen

def get_remaining_samples(file_list, selected_samples):
    """Return the entries of ``file_list`` that are not in ``selected_samples``.

    The original order of ``file_list`` is preserved, including duplicates.

    Args:
        file_list: All candidate samples.
        selected_samples: Samples that were already picked (e.g. for training).

    Returns:
        A new list with every entry of ``file_list`` that was not selected.
    """
    try:
        # Build the lookup set once so each membership test is O(1) instead
        # of rescanning the selected list for every candidate.
        selected = set(selected_samples)
        return [sample for sample in file_list if sample not in selected]
    except TypeError:
        # Unhashable entries cannot go through a set; fall back to the
        # plain (slower) list membership test.
        return [sample for sample in file_list if sample not in selected_samples]

def preprocess_image(image_path, target_size):
    """Load an image, resize it, and return it as a scaled RGB array.

    Args:
        image_path: Path of the image file to open with PIL.
        target_size: Size passed unchanged to ``Image.resize``
            (PIL expects ``(width, height)``).

    Returns:
        A NumPy array of the resized image converted to RGB, with pixel
        values divided by 255.
    """
    # Open through a context manager so the underlying file handle is
    # released deterministically; the resized copy no longer depends on it.
    with Image.open(image_path) as source:
        resized = source.resize(target_size)
    return np.array(resized.convert("RGB")) / 255.

def preprocess_webp_image(image_path, target_size, crop_size):
    """Decode a WebP file, resize and crop it, and return a scaled array.

    The image is decoded in RGB mode, resized to ``target_size`` and then
    cropped with ``crop_size``, which is forwarded unchanged to ``.crop()``
    (the caller in this file passes a 4-tuple box). Pixel values are
    divided by 255 before being returned as a NumPy array.
    """
    decoded = webp.load_image(image_path, 'RGB')
    resized = decoded.resize(target_size)
    cropped = resized.crop(crop_size)
    pixels = np.array(cropped)
    return pixels / 255.

def save_pickle_file(data, file_path):
    """Serialize ``data`` with pickle and write it to ``file_path``.

    The destination is opened in binary write mode, so an existing file at
    that path is overwritten.
    """
    with open(file_path, mode='wb') as sink:
        pickler = pkl.Pickler(sink)
        pickler.dump(data)

def create_batches(file_paths, folder, batch_size, label=(1, 0, 0), target_size=(255, 245)):
    """Load the listed images from ``folder`` and stack them with their labels.

    Args:
        file_paths: Image file names, relative to ``folder``.
        folder: Directory that contains the images.
        batch_size: Not used by this function; kept so existing callers keep
            working. Callers pass an already-sliced list of file names.
        label: One-hot label assigned to every image. Defaults to
            ``(1, 0, 0)``, the value that was previously hard-coded.
        target_size: Size handed to ``preprocess_image`` (PIL expects
            ``(width, height)``). Defaults to the previously hard-coded
            ``(255, 245)``.

    Returns:
        A ``(data, labels)`` tuple. ``data`` stacks the preprocessed images
        along a new first axis and ``labels`` holds one copy of ``label`` per
        image. For an empty ``file_paths`` both arrays have zero rows, so the
        result can still be passed to ``np.vstack``.
    """
    images = [
        preprocess_image(os.path.join(folder, name), target_size)
        for name in file_paths
    ]

    if images:
        data = np.stack(images)
    else:
        # np.stack raises ValueError on an empty list; return an empty array
        # shaped like a batch of zero (height, width, RGB) images instead.
        width, height = target_size
        data = np.empty((0, height, width, 3))

    # One label row per image; yields shape (0, len(label)) for empty input.
    labels = np.tile(np.asarray(label), (len(images), 1))
    return data, labels

def create_webp_batches(file_paths, folder, batch_size, label=(0, 1, 0),
                        target_size=(256, 256), crop_size=(0, 0, 255, 245)):
    """Load the listed WebP images from ``folder`` and stack them with labels.

    Args:
        file_paths: WebP file names, relative to ``folder``.
        folder: Directory that contains the images.
        batch_size: Not used by this function; kept so existing callers keep
            working. Callers pass an already-sliced list of file names.
        label: One-hot label assigned to every image. Defaults to
            ``(0, 1, 0)``, the value that was previously hard-coded.
        target_size: Resize target handed to ``preprocess_webp_image``.
            Defaults to the previously hard-coded ``(256, 256)``.
        crop_size: Crop box ``(left, upper, right, lower)`` handed to
            ``preprocess_webp_image``. Defaults to the previously hard-coded
            ``(0, 0, 255, 245)``.

    Returns:
        A ``(data, labels)`` tuple. ``data`` stacks the preprocessed images
        along a new first axis and ``labels`` holds one copy of ``label`` per
        image. For an empty ``file_paths`` both arrays have zero rows, so the
        result can still be passed to ``np.vstack``.
    """
    images = [
        preprocess_webp_image(os.path.join(folder, name), target_size, crop_size)
        for name in file_paths
    ]

    if images:
        data = np.stack(images)
    else:
        # np.stack raises ValueError on an empty list; return an empty array
        # shaped like a batch of zero cropped (height, width, RGB) images.
        left, upper, right, lower = crop_size
        data = np.empty((0, lower - upper, right - left, 3))

    # One label row per image; yields shape (0, len(label)) for empty input.
    labels = np.tile(np.asarray(label), (len(images), 1))
    return data, labels

def _stack_categories(human_files, dalle_files, human_path, dalle_path, batch_size):
    """Load both categories and merge them into one ``{'data', 'labels'}`` dict.

    A category with no files is skipped so the loaders are never asked to
    stack zero images. Returns ``None`` when both lists are empty.
    """
    loaded = []
    if human_files:
        loaded.append(create_batches(human_files, human_path, batch_size))
    if dalle_files:
        loaded.append(create_webp_batches(dalle_files, dalle_path, batch_size))
    if not loaded:
        return None

    data_parts, label_parts = zip(*loaded)
    return {'data': np.vstack(data_parts), 'labels': np.vstack(label_parts)}


def organize_data(human_path, dalle_path, stable_path, folder, train_size=400, batch_size=600):
    """Split the human and DALL-E images into train/test sets and pickle them.

    For each category ``train_size`` file names are sampled at random for
    training and the rest are used for testing. The chosen file names are
    saved under ``<folder>/file_names``, the training images are written in
    batches to ``<folder>/train_batches/batch_<i>.pickle`` and all test
    images to ``<folder>/test_batches/test_batch.pickle``.

    Args:
        human_path: Directory with the human-made images.
        dalle_path: Directory with the DALL-E (WebP) images.
        stable_path: Currently unused; the stable category is not processed
            yet. Kept so existing callers keep working.
        folder: Output directory. Sub-directories are created with
            ``os.path.join``, so a trailing separator is optional.
        train_size: Number of training samples drawn per category.
        batch_size: Maximum number of images per category in one training
            batch file.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if a category holds
            fewer than ``train_size`` files (raised by the sampling helper).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Create the output folders. Joining the parts avoids producing
    # "<folder>file_names" when ``folder`` has no trailing separator.
    names_dir = os.path.join(folder, 'file_names')
    train_dir = os.path.join(folder, 'train_batches')
    test_dir = os.path.join(folder, 'test_batches')
    for directory in (names_dir, train_dir, test_dir):
        os.makedirs(directory, exist_ok=True)

    # Randomly select training samples for each category; the rest is test data.
    human_files = os.listdir(human_path)
    human_train = get_random_samples(human_files, train_size)
    human_test = get_remaining_samples(human_files, human_train)

    dalle_files = os.listdir(dalle_path)
    dalle_train = get_random_samples(dalle_files, train_size)
    dalle_test = get_remaining_samples(dalle_files, dalle_train)

    # Save the train and test sample names so the split can be reproduced.
    save_pickle_file([human_train, dalle_train], os.path.join(names_dir, 'train.pickle'))
    save_pickle_file([human_test, dalle_test], os.path.join(names_dir, 'test.pickle'))

    # Create training batches. The batch count follows from the number of
    # training samples, so no empty slices are produced (a fixed count larger
    # than the data made the image stacking fail on the first empty batch).
    for batch, start in enumerate(tqdm(range(0, train_size, batch_size))):
        stop = start + batch_size
        batch_data = _stack_categories(
            human_train[start:stop],
            dalle_train[start:stop],
            human_path,
            dalle_path,
            batch_size,
        )
        if batch_data is not None:
            save_pickle_file(batch_data, os.path.join(train_dir, f"batch_{batch}.pickle"))

    # Repeat the process for the test data, stored as one single batch.
    test_batch = _stack_categories(human_test, dalle_test, human_path, dalle_path, batch_size)
    if test_batch is not None:
        save_pickle_file(test_batch, os.path.join(test_dir, 'test_batch.pickle'))


# Run the pipeline only when this file is executed directly (as a script or
# a notebook cell), not as a side effect of importing it from another module.
if __name__ == "__main__":
    organize_data(
        "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Human/",
        "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/DALLE",
        "",  # stable_path: passed empty, the stable category is not processed
        "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/batches",
    )