In [2]:
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Configuration: every tunable value used by the cells below lives here
random_seed = 42        # passed as random_state to each train_test_split call
original_dir = "raw"    # source folder; one sub-folder per class is read from it
output_dir = "data"     # destination root; files land in <output_dir>/<split>/<class>

In [4]:
# Make sure every <split>/<class> folder exists before any file is copied.
# exist_ok=True keeps this cell safe to re-run.
split_class_pairs = [
    (split, class_name)
    for split in ('train', 'val', 'test')
    for class_name in ('cat', 'dog')
]
for split, class_name in split_class_pairs:
    target_dir = os.path.join(output_dir, split, class_name)
    os.makedirs(target_dir, exist_ok=True)
    print(f"Create directory: {output_dir}/{split}/{class_name}")

Create directory: data/train/cat
Create directory: data/train/dog
Create directory: data/val/cat
Create directory: data/val/dog
Create directory: data/test/cat
Create directory: data/test/dog


In [5]:
# Split train/val/test 80/10/10
def split_class(class_name):
    """Split one class's images 80/10/10 and copy them into the output tree.

    Image files (.png, .jpg, .jpeg, matched case-insensitively) found directly
    inside ``original_dir/class_name`` are divided into train, validation and
    test subsets and copied with ``shutil.copy2`` to
    ``output_dir/<split>/class_name``. The destination folders are not created
    here, so they must already exist. A short summary is printed.

    Args:
        class_name: Name of the class sub-folder to process (e.g. 'cat').
    """
    class_path = os.path.join(original_dir, class_name)

    # os.listdir yields entries in arbitrary order. Sorting first makes the
    # seeded split pick the same files on every machine and every run.
    images = sorted(
        f for f in os.listdir(class_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    print(f"\nFound {len(images)} {class_name} images")

    # Hold out 20% of the files, then cut the hold-out in half: 10% val, 10% test
    train, holdout = train_test_split(images, test_size=0.2, random_state=random_seed)
    val, test = train_test_split(holdout, test_size=0.5, random_state=random_seed)

    # Helper function to copy files
    def copy_files(files, split_name):
        """Copy each named file from the class folder into the given split folder."""
        for f in files:
            src = os.path.join(class_path, f)
            dst = os.path.join(output_dir, split_name, class_name, f)
            shutil.copy2(src, dst)

    copy_files(train, 'train')
    copy_files(val, 'val')
    copy_files(test, 'test')

    print(f"Split {class_name} images:")
    print(f" Training {len(train)}")
    print(f" Validation {len(val)}")
    print(f" Test {len(test)}")

In [6]:
# Process both classes
for class_name in ('cat', 'dog'):
    split_class(class_name)


Found 500 cat images
Split cat images:
 Training 400
 Validation 50
 Test 50

Found 500 dog images
Split dog images:
 Training 400
 Validation 50
 Test 50


In [7]:
# Verification: count the entries that actually landed in each output folder
print(f"\nFinal counts:")
for split in ('train', 'val', 'test'):
    for class_name in ('cat', 'dog'):
        split_class_dir = os.path.join(output_dir, split, class_name)
        count = len(os.listdir(split_class_dir))
        print(f"{split}/{class_name}: {count} images")


Final counts:
train/cat: 400 images
train/dog: 400 images
val/cat: 50 images
val/dog: 50 images
test/cat: 50 images
test/dog: 50 images
