In [54]:
# organize_dataset.py
import os
import shutil
import numpy as np
from pathlib import Path

# terminal commands:
# Count only files (ignore subfolders):  find . -maxdepth 1 -type f | wc -l
# Count only folders:  find . -maxdepth 1 -type d | wc -l
# Count files recursively:   find . -maxdepth 1 -type d | wc -l
# Count all items in folder:  ls -1 | wc -l

orig_dir = Path("images/CatDog")
raw_dir = Path('images/raw/train')
train_cat = Path('images/train/cat')
#print("train_cat:" + str(train_cat))
train_non_cat = Path('images/train/non_cat')
val_cat = Path('images/val/cat')
val_non_cat = Path('images/val/non_cat')
test_cat = Path('images/test/cat')
test_non_cat = Path('images/test/non_cat')

for filename in os.listdir(orig_dir):
    src_file = os.path.join(orig_dir, filename)
    dst_file = os.path.join(raw_dir, filename)
    if os.path.isfile(src_file):   # only move files, not subfolders
        shutil.copy(src_file, dst_file)

In [55]:
def delete_files_in_folders(folder_paths):
    """
    Deletes all files in the specified folders, leaving directories intact.

    Args:
        folder_paths (list): List of paths to folders (e.g., ['A', 'B', 'C']).
    """
    for folder in folder_paths:
        # Check if the folder exists
        if not os.path.exists(folder):
            print(f"Folder {folder} does not exist.")
            continue

        # Iterate through all items in the folder
        for item in os.listdir(folder):
            item_path = os.path.join(folder, item)
            # Check if the item is a file (not a directory)
            if os.path.isfile(item_path):
                try:
                    os.remove(item_path)
                    print(f"Deleted file: {item_path}")
                except OSError as e:
                    print(f"Error deleting {item_path}: {e}")
            else:
                print(f"Skipped {item_path} (not a file).")

for d in [train_cat, train_non_cat, val_cat, val_non_cat, test_cat, test_non_cat]:
    d.mkdir(parents=True, exist_ok=True)




folders = [train_cat,train_non_cat,val_cat,val_non_cat,test_cat,test_non_cat]
delete_files_in_folders(folders)


Deleted file: images/train/cat/cat.4369.jpg
Deleted file: images/train/cat/cat.7106.jpg
Deleted file: images/train/cat/cat.8209.jpg
Deleted file: images/train/cat/cat.2526.jpg
Deleted file: images/train/cat/cat.10390.jpg
Deleted file: images/train/cat/cat.2283.jpg
Deleted file: images/train/cat/cat.205.jpg
Deleted file: images/train/cat/cat.8619.jpg
Deleted file: images/train/cat/cat.12418.jpg
Deleted file: images/train/cat/cat.11111.jpg
Deleted file: images/train/cat/cat.4989.jpg
Deleted file: images/train/cat/cat.6146.jpg
Deleted file: images/train/cat/cat.1629.jpg
Deleted file: images/train/cat/cat.365.jpg
Deleted file: images/train/cat/cat.5103.jpg
Deleted file: images/train/cat/cat.9922.jpg
Deleted file: images/train/cat/cat.6387.jpg
Deleted file: images/train/cat/cat.3598.jpg
Deleted file: images/train/cat/cat.31.jpg
Deleted file: images/train/cat/cat.12196.jpg
Deleted file: images/train/cat/cat.4552.jpg
Deleted file: images/train/cat/cat.2123.jpg
Deleted file: images/train/cat/c

In [56]:
overall_pct = 0.070
train_ratio, val_ratio, test_ratio = 0.7*overall_pct, 0.15*overall_pct, 0.15*overall_pct
print("train_ratio:" + str(train_ratio))
print("val_ratio:" + str(val_ratio))
print("test_ratio:" + str(test_ratio))

cat_files = list(raw_dir.glob('cat.*.jpg'))

print("cat files:" + str(cat_files))
np.random.shuffle(cat_files)

n_cat = len(cat_files)
print("n_cat:" + str(n_cat))

n_train = int(n_cat * train_ratio)
print("n_train:" + str(n_train))

n_val = int(n_cat * val_ratio)
print("n_val:" + str(n_val))

n_test = int(n_cat * test_ratio)
print("n_test:" + str(n_test))


for i, img in enumerate(cat_files):
    if i < n_train:
        shutil.copy(img, train_cat / img.name)
    elif i < n_train + n_val:
        shutil.copy(img, val_cat / img.name)
    elif i < (n_train + n_val + n_test):
        shutil.copy(img, test_cat / img.name)

dog_files = list(raw_dir.glob('dog.*.jpg'))
np.random.shuffle(dog_files)
n_dog = len(dog_files)
n_train = int(n_dog * train_ratio)
n_val = int(n_dog * val_ratio)

for i, img in enumerate(dog_files):
    if i < n_train:
        shutil.copy(img, train_non_cat / img.name)
    elif i < n_train + n_val:
        shutil.copy(img, val_non_cat / img.name)
    else:
        shutil.copy(img, test_non_cat / img.name)

print(f"Moved {n_cat} cat images: {n_train} train, {n_val} val, {n_cat - n_train - n_val} test")
print(f"Moved {n_dog} dog images: {n_train} train, {n_val} val, {n_dog - n_train - n_val} test")

train_ratio:0.049
val_ratio:0.0105
test_ratio:0.0105
cat files:[PosixPath('images/raw/train/cat.5077.jpg'), PosixPath('images/raw/train/cat.2718.jpg'), PosixPath('images/raw/train/cat.10151.jpg'), PosixPath('images/raw/train/cat.3406.jpg'), PosixPath('images/raw/train/cat.4369.jpg'), PosixPath('images/raw/train/cat.7660.jpg'), PosixPath('images/raw/train/cat.8553.jpg'), PosixPath('images/raw/train/cat.9895.jpg'), PosixPath('images/raw/train/cat.1211.jpg'), PosixPath('images/raw/train/cat.6218.jpg'), PosixPath('images/raw/train/cat.1577.jpg'), PosixPath('images/raw/train/cat.12020.jpg'), PosixPath('images/raw/train/cat.7106.jpg'), PosixPath('images/raw/train/cat.8235.jpg'), PosixPath('images/raw/train/cat.952.jpg'), PosixPath('images/raw/train/cat.3360.jpg'), PosixPath('images/raw/train/cat.11529.jpg'), PosixPath('images/raw/train/cat.10637.jpg'), PosixPath('images/raw/train/cat.5711.jpg'), PosixPath('images/raw/train/cat.946.jpg'), PosixPath('images/raw/train/cat.3374.jpg'), PosixPath(