In [1]:
from pathlib import Path
import random
import shutil
import os

# Locate the project root: when the kernel was started inside the "notebooks"
# folder, step up one level; otherwise treat the current working directory as
# the project root.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR

# Input and output locations, both relative to the project root.
RAW_ROOT = PROJECT_ROOT / "data" / "raw"
# NOTE(review): the folder is named "sample_1pct" while SAMPLE_RATIO below is
# 10% -- confirm which of the two is intended before relying on this dataset.
DEST_ROOT = PROJECT_ROOT / "data" / "processed" / "sample_1pct"

SAMPLE_RATIO = 0.1   # fraction of each source's images to copy (0.1 == 10%)
MIN_SAMPLE = 1       # lower bound on the sample size of every source that has images
ALLOWED_EXT = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}  # file suffixes treated as images
SEED = 42

random.seed(SEED)

# Validate the input folder *before* creating the output folder, so that a
# wrongly detected project root does not leave stray directories behind.
assert RAW_ROOT.is_dir(), f"Missing folder: {RAW_ROOT}"
DEST_ROOT.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Sampling from {RAW_ROOT.resolve()} to {DEST_ROOT.resolve()}")

Project root: c:\Users\Admin\Desktop\BTL-Deep-Learning
Sampling from C:\Users\Admin\Desktop\BTL-Deep-Learning\data\raw to C:\Users\Admin\Desktop\BTL-Deep-Learning\data\processed\sample_1pct


In [2]:
# Collect ALL images from each source, then sample a SAMPLE_RATIO fraction of each source's total
def collect_all_images_by_source(raw_root=None, allowed_ext=None):
    """Collect every image file under each source folder of the raw data root.

    Each immediate sub-directory of ``raw_root`` (e.g. ``celeba``,
    ``fairfacegen``) is treated as one source and searched recursively.
    ``__pycache__`` directories at the top level are ignored.

    Args:
        raw_root: Directory whose sub-directories are the sources. Defaults to
            the module-level ``RAW_ROOT``.
        allowed_ext: Iterable of file suffixes (including the leading dot) that
            count as images; compared case-insensitively. Defaults to the
            module-level ``ALLOWED_EXT``.

    Returns:
        dict mapping source folder name -> sorted list of image file paths.
        Sources that contain no image are omitted. Keys are inserted in sorted
        order of the source directories.
    """
    if raw_root is None:
        raw_root = RAW_ROOT
    if allowed_ext is None:
        allowed_ext = ALLOWED_EXT
    wanted = {ext.lower() for ext in allowed_ext}

    sources = {}
    # Sort both the source folders and the files: directory listing order is
    # filesystem-dependent, and a seeded sampler only reproduces its draw when
    # the population is in the same order on every run.
    for source_dir in sorted(raw_root.iterdir()):
        if not source_dir.is_dir() or source_dir.name == "__pycache__":
            continue
        # Single walk of the tree; the suffix test runs first so is_file()
        # (a filesystem call) is only paid for image-looking names.
        images = sorted(
            path
            for path in source_dir.rglob("*")
            if path.suffix.lower() in wanted and path.is_file()
        )
        if images:
            sources[source_dir.name] = images
    return sources

def copy_sample_images():
    """Copy a reproducible random sample of every source's images to DEST_ROOT.

    For each source returned by ``collect_all_images_by_source()``, a fraction
    ``SAMPLE_RATIO`` of its images is drawn -- at least ``MIN_SAMPLE``, and
    never more than the source contains. Each drawn file is copied under
    ``DEST_ROOT`` at the same path it has relative to ``RAW_ROOT``. Files that
    already exist at the destination are left untouched and reported as
    already present.

    The draw uses a private generator seeded from ``SEED`` and the source name,
    applied to a sorted population. As long as the raw data is unchanged,
    calling the function again therefore selects the same files instead of
    adding a fresh batch to the destination.

    Returns:
        None. Progress and totals are printed.
    """
    total_copied = 0
    sources = collect_all_images_by_source()

    for source_name in sorted(sources):
        # Sort so the seeded draw does not depend on directory listing order.
        population = sorted(sources[source_name])
        if not population:
            continue
        print(f"\n{source_name}: {len(population)} images total")

        # SAMPLE_RATIO of the source, floored at MIN_SAMPLE and capped at the
        # population size (random sampling raises ValueError beyond that).
        sample_size = min(
            len(population),
            max(MIN_SAMPLE, int(len(population) * SAMPLE_RATIO)),
        )
        # One generator per source: the draw is independent of the global
        # random state and of which other sources happen to be present.
        rng = random.Random(f"{SEED}:{source_name}")
        sampled_images = rng.sample(population, sample_size)

        print(f"  Sampling {sample_size} images ({sample_size/len(population)*100:.2f}%)")

        copied = 0
        for src in sampled_images:
            # Preserve the directory layout relative to the raw root.
            dest_file = DEST_ROOT / src.relative_to(RAW_ROOT)
            if dest_file.exists():
                continue
            dest_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dest_file)
            copied += 1

        total_copied += copied
        # Report what was actually written, not merely what was drawn.
        print(f"  ✓ Copied {copied} images ({sample_size - copied} already present)")

    print(f"\n{'='*50}")
    print(f"Done! Total copied: {total_copied} images")

# Run the sampling-and-copy step using the configuration constants defined in this notebook.
copy_sample_images()


celeba: 202599 images total
  Sampling 20259 images (10.00%)
  ✓ Copied 20259 images

fairfacegen: 354097 images total
  Sampling 35409 images (10.00%)
  ✓ Copied 35409 images

person_face_dataset: 10000 images total
  Sampling 1000 images (10.00%)
  ✓ Copied 1000 images

stable_diffusion_faces: 9000 images total
  Sampling 900 images (10.00%)
  ✓ Copied 900 images

Done! Total copied: 57568 images
