In [None]:
import os
import glob
import random
import shutil

from PIL import Image

In [None]:
# Root folder that receives the crops. slide_crop creates one subfolder per
# class beneath it, named after the directory that holds each source image.
output_base_dir = 'OUTPUT DIRECTORY'
# Placeholder pattern: replace it before running. A pattern of the form
# PARENT DIRECTORY/*/* matches <class name>/<file> under the parent folder.
path_list = glob.glob('PARENT DIRECTORY') # every path matched here is cropped below

def slide_crop(image_path, output, crop_height=224, crop_width=224, step=100):
    """
    Cut a single image into fixed-size crops using a sliding window.

    The window starts at the top-left corner and advances ``step`` pixels at a
    time, across each row and then down the image. Only windows that fit
    entirely inside the image are kept, so a strip along the right/bottom edge
    stays uncovered when the remaining distance is not a multiple of ``step``.

    Each crop is saved as ``<output>/<class_name>/<base>_<top>_<left><ext>``,
    where ``class_name`` is the name of the directory containing
    ``image_path`` and ``<top>``/``<left>`` are the crop's pixel offsets.

    Any failure (unreadable file, invalid arguments, save error) is reported
    with ``print`` instead of being raised, so one bad file does not stop a
    batch loop.

    Args:
        image_path (str): Path to one image file (not a directory).
        output (str): Root output directory; the class subfolder is created in it.
        crop_height (int): Cropped image height in pixels; must be positive.
        crop_width (int): Cropped image width in pixels; must be positive.
        step (int): Stride in pixels between consecutive crops; must be positive.

    Returns:
        None
    """
    try:
        # A negative step would make both ranges empty and silently produce
        # nothing, so reject it up front (reported through the handler below).
        if crop_height <= 0 or crop_width <= 0 or step <= 0:
            raise ValueError(
                "crop_height, crop_width and step must all be positive "
                f"(got {crop_height}, {crop_width}, {step})"
            )

        base_name, ext = os.path.splitext(os.path.basename(image_path))
        class_name = os.path.basename(os.path.dirname(image_path))
        output_dir = os.path.join(output, class_name)

        crops_written = 0
        # The context manager closes the underlying file once all crops are
        # saved, so a long batch does not accumulate open file handles.
        with Image.open(image_path) as img:
            width, height = img.size
            os.makedirs(output_dir, exist_ok=True)

            for upper in range(0, height - crop_height + 1, step):
                for left in range(0, width - crop_width + 1, step):
                    box = (left, upper, left + crop_width, upper + crop_height)
                    save_path = os.path.join(
                        output_dir, f"{base_name}_{upper}_{left}{ext}"
                    )
                    img.crop(box).save(save_path)
                    crops_written += 1

        if crops_written:
            print(f"Successfully cropped {image_path}")
        else:
            # With valid arguments this only happens when the window does not fit.
            print(
                f"No crops written for {image_path}: image is {width}x{height}, "
                f"smaller than the {crop_width}x{crop_height} window"
            )
    except Exception as e:
        # Deliberate best effort: report the failure and let the caller continue.
        print(f"Error processing {image_path}: {e}")

# Tile every discovered image into 224x224 crops using a 50-pixel stride.
for source_image in path_list:
    slide_crop(
        source_image,
        output=output_base_dir,
        crop_height=224,
        crop_width=224,
        step=50,
    )

In [None]:
def split(
    input_dir,
    output_train_dir,
    output_test_dir,
    train_ratio=0.7,
    seed=None
):
    """
    Splits a dataset into train and test folders while keeping class structure.

    Every sub-directory of ``input_dir`` is treated as one class. The regular
    files inside it are shuffled and the first ``int(n * train_ratio)`` of them
    are copied to ``output_train_dir/<class>``; the rest go to
    ``output_test_dir/<class>``. Source files are copied, never moved.

    Class names and file names are sorted before shuffling, so a given ``seed``
    always yields the same split regardless of the order in which the
    filesystem lists entries.

    Note: files are assigned one by one. Existing files in the output folders
    are left in place, so re-running with a different seed into the same
    folders can leave the same file in both train and test.

    Args:
        input_dir (str): Path to dataset root directory containing class subfolders.
        output_train_dir (str): Path to output train directory.
        output_test_dir (str): Path to output test directory.
        train_ratio (float): Fraction of images to use for training (0 <= train_ratio <= 1).
        seed (int): Random seed for reproducibility. When None, the global
            ``random`` generator is used in whatever state it is currently in.

    Raises:
        ValueError: If ``train_ratio`` is outside the range [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # A private generator keeps a seeded run from reseeding the global `random`
    # state as a side effect; without a seed, fall back to the module itself.
    rng = random.Random(seed) if seed is not None else random

    # Ensure output directories exist
    os.makedirs(output_train_dir, exist_ok=True)
    os.makedirs(output_test_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting fixes both the
    # class visiting order and the pre-shuffle file order.
    for class_name in sorted(os.listdir(input_dir)):
        class_path = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_path):
            continue  # skip non-directory files

        # Only regular files are copied; a nested folder would make
        # shutil.copy fail and abort the whole split.
        images = sorted(
            name
            for name in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, name))
        )
        rng.shuffle(images)

        split_index = int(len(images) * train_ratio)
        destinations = (
            (os.path.join(output_train_dir, class_name), images[:split_index]),
            (os.path.join(output_test_dir, class_name), images[split_index:]),
        )

        # Class folders are created in both outputs even when one side is empty.
        for dest_dir, names in destinations:
            os.makedirs(dest_dir, exist_ok=True)
            for name in names:
                shutil.copy(os.path.join(class_path, name), os.path.join(dest_dir, name))

    print(f"Split completed: {output_train_dir} (train) and {output_test_dir} (test)")


In [None]:
# Fixed seed so the shuffle behind the train/test assignment is repeatable.
SPLIT_SEED = 42

# Uses split's default train_ratio (0.7). Files are copied, so the processed
# dataset is left untouched.
split(input_dir='/global/scratch/users/eliothuang/final_processed_dataset/',
      output_train_dir='/global/scratch/users/eliothuang/final_train_dataset/',
      output_test_dir='/global/scratch/users/eliothuang/final_test_dataset/',
      seed=SPLIT_SEED)