## Image Processing

The code organizes the dataset of images into training and test sets for three categories: human, dalle, and stable. Random samples are selected from each category for training and the remaining samples are used for testing. The images are preprocessed, resized, and converted into numpy arrays. Training and test batches are created and saved as pickle files for later use.

In [3]:
import pickle as pkl
from PIL import Image
import numpy as np
import os
import random
from tqdm import tqdm

def get_random_samples(file_list, sample_size):
    return random.sample(file_list, sample_size)

def get_remaining_samples(file_list, selected_samples):
    return [sample for sample in file_list if sample not in selected_samples]

def preprocess_image(image_path, target_size):
    img = Image.open(image_path).resize(target_size)
    return np.array(img.convert("RGB")) / 255.

def save_pickle_file(data, file_path):
    with open(file_path, 'wb') as f:
        pkl.dump(data, f)

def create_batches_with_labels(file_paths, folder, label):
    data = []
    labels = []
    valid_extensions = ['.jpg', '.jpeg', '.png', '.webp']
    
    for path in file_paths:
        if os.path.splitext(path)[1].lower() in valid_extensions:
            img = preprocess_image(os.path.join(folder, path), (256, 256))
            data.append(img)
            labels.append(label)  
        else:
            print(f"Skipped non-image file: {path}")

    return np.stack(data), np.array(labels)

def organize_data(human_path, ai_path, folder):
    human_path = human_path
    ai_path = ai_path

    if not os.path.exists(folder + 'file_names/'):
        os.makedirs(folder + 'file_names/')
    if not os.path.exists(folder + 'train_batches/'):
        os.makedirs(folder + 'train_batches/')
    if not os.path.exists(folder + 'test_batches/'):
        os.makedirs(folder + 'test_batches/')

    human_files = os.listdir(human_path)
    human_train = get_random_samples(human_files, 7000)
    human_test = get_remaining_samples(human_files, human_train)

    ai_files = os.listdir(ai_path)
    ai_train = get_random_samples(ai_files, 7000)
    ai_test = get_remaining_samples(ai_files, ai_train)

    save_pickle_file([human_train, ai_train], folder + 'file_names/train.pickle')
    save_pickle_file([human_test, ai_test], folder + 'file_names/test.pickle')

    num_batches = 20
    batch_size = 350
    
    human_label = [0, 1]
    ai_label = [1, 0]
    
    for batch in tqdm(range(num_batches)):
        human_train_batch = human_train[batch * batch_size:(batch + 1) * batch_size]
        ai_train_batch = ai_train[batch * batch_size:(batch + 1) * batch_size]

        human_data, human_labels = create_batches_with_labels(human_train_batch, human_path, human_label)
        
        ai_data, ai_labels = create_batches_with_labels(ai_train_batch, ai_path, ai_label)

        data_batch = {'data': np.vstack((human_data, ai_data)), 
                      'labels': np.vstack((human_labels, ai_labels))}

        if data_batch['data'].size > 0 and data_batch['labels'].size > 0:
            save_pickle_file(data_batch, f"{folder}train_batches/batch_{batch}.pickle")
        else:
            print(f"Warning: Batch {batch} is empty and will not be saved.")

    # Repeat the process for test data
    human_test_data, human_test_labels = create_batches_with_labels(human_test, human_path, human_label)
    ai_test_data, ai_test_labels = create_batches_with_labels(ai_test, ai_path, ai_label)

    # Stack test data and save
    test_data = np.vstack((human_test_data, ai_test_data))
    test_labels = np.vstack((human_test_labels, ai_test_labels))
    test_batch = {'data': test_data, 'labels': test_labels}
    save_pickle_file(test_batch, f"{folder}test_batches/test_batch.pickle")


organize_data("/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/new_human",
              "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/new_ai",
              "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/")


 40%|████      | 8/20 [01:48<02:44, 13.71s/it]

Skipped non-image file: .DS_Store


 90%|█████████ | 18/20 [04:06<00:27, 13.76s/it]

Skipped non-image file: .DS_Store


100%|██████████| 20/20 [04:34<00:00, 13.71s/it]
