In [1]:
import os
import shutil
import random

def split_recursive_dataset(source_dir, output_dir, train_ratio=0.7, test_ratio=0.1, val_ratio=0.2, *, seed=None):
    """Randomly split an image tree into ``train``/``test``/``val`` copies.

    Every image found under ``source_dir`` (searched recursively) is copied
    into exactly one of ``output_dir/train``, ``output_dir/test`` or
    ``output_dir/val``. The sub-directory of the image relative to
    ``source_dir`` is recreated inside the chosen split directory.

    The shuffle is global over all images (it is not stratified per
    sub-directory). Files already present in ``output_dir`` from an earlier
    run are not removed, so re-running with a different shuffle can leave the
    same image in more than one split.

    Args:
        source_dir: Root directory that is searched for images.
        output_dir: Directory in which the three split directories are created.
        train_ratio: Fraction of the images copied to ``train``.
        test_ratio: Fraction of the images copied to ``test``.
        val_ratio: Fraction of the images copied to ``val``; in practice
            ``val`` receives every image left over after the other two.
        seed: Optional seed for a private random generator so the split can
            be reproduced. With ``None`` the module-level ``random`` state is
            used.

    Raises:
        ValueError: If a ratio is outside ``[0, 1]`` or the ratios do not add
            up to 1.0.
    """
    # Validate with explicit exceptions: ``assert`` is removed under ``-O``.
    ratios = (train_ratio, test_ratio, val_ratio)
    if any(not 0 <= ratio <= 1 for ratio in ratios):
        raise ValueError("Setiap rasio harus berada di antara 0 dan 1")
    if not abs(sum(ratios) - 1.0) < 1e-10:
        raise ValueError("Rasio pembagian harus total 1.0")

    # Create the output directories.
    os.makedirs(output_dir, exist_ok=True)
    train_dir = os.path.join(output_dir, 'train')
    test_dir = os.path.join(output_dir, 'test')
    val_dir = os.path.join(output_dir, 'val')
    split_dirs = (train_dir, test_dir, val_dir)
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    # Recognised image extensions (compared in lower case).
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'}

    # When output_dir lies inside source_dir, the split directories must not
    # be walked, otherwise copies from a previous run are split again.
    excluded_dirs = {os.path.abspath(split_dir) for split_dir in split_dirs}

    # Collect every image in the directory tree.
    all_images = []
    for root, dirs, files in os.walk(source_dir):
        # In-place pruning stops os.walk from descending into these folders.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) not in excluded_dirs
        ]
        relative_path = os.path.relpath(root, source_dir)
        for file in files:
            if os.path.splitext(file)[1].lower() in image_extensions:
                all_images.append({
                    'full_path': os.path.join(root, file),
                    'relative_path': relative_path,
                    'filename': file
                })

    # os.walk order depends on the file system; sort first so that a given
    # seed always produces the same split.
    all_images.sort(key=lambda image: image['full_path'])
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(all_images)

    # Number of images per subset. The small epsilon keeps products such as
    # 100 * 0.29 == 28.999999999999996 from being truncated to 28.
    total_images = len(all_images)
    epsilon = 1e-9
    train_count = min(int(total_images * train_ratio + epsilon), total_images)
    test_count = min(int(total_images * test_ratio + epsilon), total_images - train_count)

    # Slice the shuffled list; validation takes whatever remains.
    train_images = all_images[:train_count]
    test_images = all_images[train_count:train_count + test_count]
    val_images = all_images[train_count + test_count:]

    def copy_images(image_list, destination):
        """Copy images into ``destination``, keeping their relative folders."""
        for image in image_list:
            dest_subdir = os.path.join(destination, image['relative_path'])
            os.makedirs(dest_subdir, exist_ok=True)
            # copy2 also preserves file metadata such as timestamps.
            shutil.copy2(image['full_path'], os.path.join(dest_subdir, image['filename']))

    # Copy each subset into its own directory.
    copy_images(train_images, train_dir)
    copy_images(test_images, test_dir)
    copy_images(val_images, val_dir)

    # Print a summary.
    print(f"Total gambar: {total_images}")
    print(f"Train: {len(train_images)} ({train_ratio*100}%)")
    print(f"Test: {len(test_images)} ({test_ratio*100}%)")
    print(f"Validasi: {len(val_images)} ({val_ratio*100}%)")

# Example usage: split the resized dataset into train/test/val copies.
source_directory = '/Users/nero555/Documents/Dev/CV/kedelai-DS/dataset/resize'
output_directory = '/Users/nero555/Documents/Dev/CV/kedelai-DS/dataset/kedelay4'

# Only run the copy job when executed directly (a notebook kernel also runs
# as "__main__"), so importing this file has no file-system side effects.
if __name__ == "__main__":
    split_recursive_dataset(source_directory, output_directory)

Total gambar: 770
Train: 539 (70.0%)
Test: 154 (20.0%)
Validasi: 77 (10.0%)
