## Image Processing

The code organizes the dataset of images into training and test sets for three categories: human, dalle, and stable. Random samples are selected from each category for training and the remaining samples are used for testing. The images are preprocessed, resized, and converted into numpy arrays. Training and test batches are created and saved as pickle files for later use.

In [2]:
import pickle as pkl
from PIL import Image
import numpy as np
import os
import webp
import random
from tqdm import tqdm

# lấy ngẫu nhiên file theo số lượng file trong list 
def get_random_samples(file_list, sample_size):
    """Randomly selects samples from the given file list."""
    return random.sample(file_list, sample_size)

def get_remaining_samples(file_list, selected_samples):
    """Returns the samples not selected from the file list."""
    return [sample for sample in file_list if sample not in selected_samples]

# load 1 ảnh từ đường dẫn lên, thay đổi kích thước của nó và chuẩn hoá giá trị pixel
# /255 : Chuẩn hóa giá trị pixel về khoảng [0, 1], mảng ảnh là 1 mảng 3 chiều [233,145,66] -> [0.93;0,53;0.21]
def preprocess_image(image_path, target_size):
    """Loads and preprocesses the image from the given path."""
    img = Image.open(image_path).resize(target_size)
    return np.array(img.convert("RGB")) / 255.

# .crop(crop_size) : Cắt ảnh thành kích thước xác định bởi crop_size, có dạng (left, upper, right, lower)
def preprocess_webp_image(image_path, target_size, crop_size):
    """Loads and preprocesses the WebP image from the given path."""
    # print(image_path)
    img = Image.open(image_path).convert('RGB')
    img = img.resize(target_size)
    img = img.crop(crop_size)
    return np.array(img) / 255.

def save_pickle_file(data, file_path):
    """Saves the data as a pickle file at the specified path."""
    with open(file_path, 'wb') as f:
        pkl.dump(data, f)

def create_batches(file_paths, folder, batch_size):
    """Creates batches of images and labels from the given file paths."""
    data = []
    labels = []
    valid_extensions = ['.jpg', '.jpeg', '.png', '.webp']
    
    for path in file_paths:
        if os.path.splitext(path)[1].lower() in valid_extensions:
            img = preprocess_image(os.path.join(folder, path), (255, 245))
            data.append(img)
            labels.append([0, 1, 0])  # Change label based on category
        else:
            print(f"Skipped non-image file: {path}")

    return np.stack(data), np.array(labels)

def create_webp_batches(file_paths, folder, batch_size):
    """Creates batches of WebP images and labels from the given file paths."""
    data = []
    labels = []
    valid_extensions = ['.jpg', '.jpeg', '.png', '.webp']
    
    for path in file_paths:
        if os.path.splitext(path)[1].lower() in valid_extensions:
            img = preprocess_webp_image(os.path.join(folder, path), (256, 256), (0, 0, 255, 245))
            data.append(img)
            labels.append([1, 0, 0])  # Change label based on category

    return np.stack(data), np.array(labels)

def create_batches_with_labels(file_paths, folder, label):
    data = []
    labels = []
    valid_extensions = ['.jpg', '.jpeg', '.png', '.webp']
    
    for path in file_paths:
        if os.path.splitext(path)[1].lower() in valid_extensions:
            img = preprocess_image(os.path.join(folder, path), (256, 256))
            data.append(img)
            labels.append(label)  # Change label based on category
        else:
            print(f"Skipped non-image file: {path}")

    return np.stack(data), np.array(labels)

def organize_data(human_path, ai_path, folder):
    # Image locations
    print("Initializing paths...")
    human_path = human_path
    ai_path = ai_path

    # Create a new folder to save information
    if not os.path.exists(folder + 'file_names/'):
        os.makedirs(folder + 'file_names/')
        print(f"Created folder: {folder}file_names/")
    if not os.path.exists(folder + 'train_batches/'):
        os.makedirs(folder + 'train_batches/')
        print(f"Created folder: {folder}train_batches/")
    if not os.path.exists(folder + 'test_batches/'):
        os.makedirs(folder + 'test_batches/')
        print(f"Created folder: {folder}test_batches/")

    # Randomly select test and training samples for each category
    human_files = os.listdir(human_path)
    print(f"Found {len(human_files)} human files.")
    human_train = get_random_samples(human_files, 8400)
    print(f"Selected {len(human_train)} human train samples.")
    human_test = get_remaining_samples(human_files, human_train)
    print(f"Remaining human test samples: {len(human_test)}")

    ai_files = os.listdir(ai_path)
    print(f"Found {len(ai_files)} AI files.")
    ai_train = get_random_samples(ai_files, 8400)
    print(f"Selected {len(ai_train)} AI train samples.")
    ai_test = get_remaining_samples(ai_files, ai_train)
    print(f"Remaining AI test samples: {len(ai_test)}")

    # Save the train and test sample names as pickle files
    save_pickle_file([human_train, ai_train], folder + 'file_names/train.pickle')
    print("Saved train sample names to train.pickle.")
    save_pickle_file([human_test, ai_test], folder + 'file_names/test.pickle')
    print("Saved test sample names to test.pickle.")

    # Create training batches
    num_batches = 14
    batch_size = 600
    print(f"Creating {num_batches} batches with batch size of {batch_size}...")
    
    human_label = [0, 1, 0]
    ai_label = [1, 0, 0]
    
    # tqdm dùng để hiển thị ra thanh progress màu đỏ
    
    for batch in tqdm(range(num_batches)):
        # truy cập đến dữ liệu trong khoảng batch*batch_size tới (batch+1)*batch_size
        human_train_batch = human_train[batch * batch_size:(batch + 1) * batch_size]
        ai_train_batch = ai_train[batch * batch_size:(batch + 1) * batch_size]

        # Process and save batches
        # print(f"Processing batch {batch + 1}/{num_batches}...")
        human_data, human_labels = create_batches_with_labels(human_train_batch, human_path, human_label)
        # print(f"Processed human batch {batch + 1}: {human_data.shape} images.")
        
        ai_data, ai_labels = create_batches_with_labels(ai_train_batch, ai_path, ai_label)
        # print(f"Processed AI batch {batch + 1}: {ai_data.shape} images.")

        # Stack and save batch data
        data_batch = {'data': np.vstack((human_data, ai_data)), 
                      'labels': np.vstack((human_labels, ai_labels))}

        # print(f'data batch {data_batch}')
        if data_batch['data'].size > 0 and data_batch['labels'].size > 0:
            save_pickle_file(data_batch, f"{folder}train_batches/batch_{batch}.pickle")
            print(f"Saved batch {batch} to {folder}train_batches/batch_{batch}.pickle")
        else:
            print(f"Warning: Batch {batch} is empty and will not be saved.")

    # Repeat the process for test data
    human_test_data, human_test_labels = create_batches_with_labels(human_test, human_path, human_label)
    ai_test_data, ai_test_labels = create_batches_with_labels(ai_test, ai_path, ai_label)

    # Stack test data and save
    test_data = np.vstack((human_test_data, ai_test_data))
    test_labels = np.vstack((human_test_labels, ai_test_labels))
    test_batch = {'data': test_data, 'labels': test_labels}
    save_pickle_file(test_batch, f"{folder}test_batches/test_batch.pickle")


organize_data("/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/human",
              "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Dataset/ai_generated",
              "/Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/")


Initializing paths...
Found 10001 human files.
Selected 8400 human train samples.
Remaining human test samples: 1601
Found 10000 AI files.
Selected 8400 AI train samples.
Remaining AI test samples: 1600
Saved train sample names to train.pickle.
Saved test sample names to test.pickle.
Creating 14 batches with batch size of 600...


  7%|▋         | 1/14 [00:14<03:07, 14.42s/it]

Saved batch 0 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_0.pickle


 14%|█▍        | 2/14 [00:26<02:39, 13.26s/it]

Saved batch 1 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_1.pickle


 21%|██▏       | 3/14 [00:38<02:18, 12.56s/it]

Saved batch 2 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_2.pickle


 29%|██▊       | 4/14 [00:50<02:02, 12.28s/it]

Saved batch 3 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_3.pickle


 36%|███▌      | 5/14 [01:01<01:48, 12.01s/it]

Saved batch 4 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_4.pickle
Skipped non-image file: .DS_Store


 43%|████▎     | 6/14 [01:13<01:34, 11.82s/it]

Saved batch 5 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_5.pickle


 50%|█████     | 7/14 [01:24<01:21, 11.64s/it]

Saved batch 6 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_6.pickle


 57%|█████▋    | 8/14 [01:36<01:09, 11.54s/it]

Saved batch 7 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_7.pickle


 64%|██████▍   | 9/14 [01:47<00:57, 11.48s/it]

Saved batch 8 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_8.pickle


 71%|███████▏  | 10/14 [01:58<00:45, 11.39s/it]

Saved batch 9 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_9.pickle


 79%|███████▊  | 11/14 [02:11<00:35, 11.72s/it]

Saved batch 10 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_10.pickle


 86%|████████▌ | 12/14 [02:22<00:23, 11.65s/it]

Saved batch 11 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_11.pickle


 93%|█████████▎| 13/14 [02:34<00:11, 11.65s/it]

Saved batch 12 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_12.pickle


100%|██████████| 14/14 [02:45<00:00, 11.82s/it]

Saved batch 13 to /Users/sakai/VIET_Working/STUDY_WORK/Ky5/Python/Image_Classifier/train_batches/batch_13.pickle





Skipped non-image file: .DS_Store
