## Image Processing

The code organizes the dataset of images into training and test sets for three categories: human, dalle, and stable. Random samples are selected from each category for training and the remaining samples are used for testing. The images are preprocessed, resized, and converted into numpy arrays. Training and test batches are created and saved as pickle files for later use.

In [27]:
import pickle as pkl
from PIL import Image
import numpy as np
import os
import webp
import random
from tqdm import tqdm

def get_random_samples(file_list, sample_size):
    """Randomly selects samples from the given file list."""
    return random.sample(file_list, sample_size)

def get_remaining_samples(file_list, selected_samples):
    """Returns the samples not selected from the file list."""
    return [sample for sample in file_list if sample not in selected_samples]

def preprocess_image(image_path, target_size):
    """Loads and preprocesses the image from the given path."""
    img = Image.open(image_path).resize(target_size)
    return np.array(img.convert("RGB")) / 255.

def preprocess_webp_image(image_path, target_size, crop_size):
    """Loads and preprocesses the WebP image from the given path."""
    img = webp.load_image(image_path, 'RGB').resize(target_size).crop(crop_size)
    return np.array(img) / 255.

def save_pickle_file(data, file_path):
    """Saves the data as a pickle file at the specified path."""
    with open(file_path, 'wb') as f:
        pkl.dump(data, f)

def create_batches(file_paths, folder, batch_size):
    """Creates batches of images and labels from the given file paths."""
    data = []
    labels = []
    for path in file_paths:
        img = preprocess_image(os.path.join(folder, path), (255, 245))
        data.append(img)
        labels.append([1, 0, 0])  # Change label based on category

    return np.stack(data), np.array(labels)

def create_webp_batches(file_paths, folder, batch_size):
    """Creates batches of WebP images and labels from the given file paths."""
    data = []
    labels = []
    for path in file_paths:
        img = preprocess_webp_image(os.path.join(folder, path), (256, 256), (0, 0, 255, 245))
        data.append(img)
        labels.append([0, 1, 0])  # Change label based on category

    return np.stack(data), np.array(labels)

def organize_data(human_path, ai_path, folder):
    # Image locations
    print("Initializing paths...")
    human_path = human_path
    ai_path = ai_path

    # Create a new folder to save information
    if not os.path.exists(folder + 'file_names/'):
        os.makedirs(folder + 'file_names/')
        print(f"Created folder: {folder}file_names/")
    if not os.path.exists(folder + 'train_batches/'):
        os.makedirs(folder + 'train_batches/')
        print(f"Created folder: {folder}train_batches/")
    if not os.path.exists(folder + 'test_batches/'):
        os.makedirs(folder + 'test_batches/')
        print(f"Created folder: {folder}test_batches/")

    # Randomly select test and training samples for each category
    human_files = os.listdir(human_path)
    print(f"Found {len(human_files)} human files.")
    human_train = get_random_samples(human_files, 200)
    print(f"Selected {len(human_train)} human train samples.")
    human_test = get_remaining_samples(human_files, human_train)
    print(f"Remaining human test samples: {len(human_test)}")

    ai_files = os.listdir(ai_path)
    print(f"Found {len(ai_files)} AI files.")
    ai_train = get_random_samples(ai_files, 150)
    print(f"Selected {len(ai_train)} AI train samples.")
    ai_test = get_remaining_samples(ai_files, ai_train)
    print(f"Remaining AI test samples: {len(ai_test)}")

    # Save the train and test sample names as pickle files
    save_pickle_file([human_train, ai_train], folder + 'file_names/train.pickle')
    print("Saved train sample names to train.pickle.")
    save_pickle_file([human_test, ai_test], folder + 'file_names/test.pickle')
    print("Saved test sample names to test.pickle.")

    # Create training batches
    num_batches = 10
    batch_size = 15
    print(f"Creating {num_batches} batches with batch size of {batch_size}...")
    for batch in tqdm(range(num_batches)):
        human_train_batch = human_train[batch * batch_size:(batch + 1) * batch_size]
        ai_train_batch = ai_train[batch * batch_size:(batch + 1) * batch_size]

        # Process and save batches
        print(f"Processing batch {batch + 1}/{num_batches}...")
        human_data, human_labels = create_batches(human_train_batch, human_path, batch_size)
        print(f"Processed human batch {batch + 1}: {human_data.shape} images.")
        
        ai_data, ai_labels = create_batches(ai_train_batch, ai_path, batch_size)
        print(f"Processed AI batch {batch + 1}: {ai_data.shape} images.")

        # Stack and save batch data
        data_batch = np.vstack((human_data, ai_data))
        label_batch = np.vstack((human_labels, ai_labels))
        batch_data = {'data': data_batch, 'labels': label_batch}

        save_pickle_file(batch_data, f"{folder}train_batches/batch_{batch}.pickle")
        print(f"Saved batch {batch} to {folder}train_batches/batch_{batch}.pickle")


    # Repeat the process for test data
    human_test_data, human_test_labels = create_batches(human_test, human_path, batch_size)
    ai_test_data, ai_test_labels = create_batches(ai_test, ai_path, batch_size)

    # Stack test data and save
    test_data = np.vstack((human_test_data, ai_test_data))
    test_labels = np.vstack((human_test_labels, ai_test_labels))
    test_batch = {'data': test_data, 'labels': test_labels}
    save_pickle_file(test_batch, f"{folder}test_batches/test_batch.pickle")


organize_data("/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/human",
              "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/ai_generated",
              "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/")

Initializing paths...
Found 213 human files.
Selected 200 human train samples.
Remaining human test samples: 13
Found 187 AI files.
Selected 150 AI train samples.
Remaining AI test samples: 37
Saved train sample names to train.pickle.
Saved test sample names to test.pickle.
Creating 10 batches with batch size of 15...


 20%|██        | 2/10 [00:00<00:00, 10.74it/s]

Processing batch 1/10...
Processed human batch 1: (15, 245, 255, 3) images.
Processed AI batch 1: (15, 245, 255, 3) images.
Saved batch 0 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_0.pickle
Processing batch 2/10...
Processed human batch 2: (15, 245, 255, 3) images.
Processed AI batch 2: (15, 245, 255, 3) images.
Saved batch 1 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_1.pickle
Processing batch 3/10...


 30%|███       | 3/10 [00:00<00:00, 10.36it/s]

Processed human batch 3: (15, 245, 255, 3) images.
Processed AI batch 3: (15, 245, 255, 3) images.
Saved batch 2 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_2.pickle
Processing batch 4/10...
Processed human batch 4: (15, 245, 255, 3) images.





UnidentifiedImageError: cannot identify image file '/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/ai_generated/.DS_Store'