# In [1]:
import os
import random
import shutil
from pathlib import Path

# --------- USER SETTINGS ---------
# Source directory containing all UTK images. It must be an extracted folder
# (not a .zip archive); it is searched recursively, including subfolders.
src_dir = r"D:\utkcropped"

# Directory whose images are excluded from sampling. Matching is done on the
# base filename only, so the folder layout does not need to mirror src_dir.
# If this directory does not exist, a warning is printed and nothing is excluded.
exclude_dir = r"D:\utk_gender_balanced_6000"

# Output directory the sampled images are copied into (created if missing).
out_dir = r"D:\test"

# Maximum number of images to sample; fewer are copied when fewer remain
# after exclusion.
n_sample = 1000
# ---------------------------------

def list_images(root, exts=None):
    """Recursively collect the paths of image files under *root*.

    Args:
        root: Directory to walk, including all of its subdirectories.
        exts: Optional extension (or iterable of extensions) to accept. Each
            must include the leading dot (e.g. ``".jpg"``); matching is
            case-insensitive. Defaults to common raster image formats.

    Returns:
        A sorted list of file paths (``os.path.join(dirpath, filename)``).
        The list is empty when *root* contains no matching files or cannot
        be walked.
    """
    if exts is None:
        exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp")
    elif isinstance(exts, str):
        # A bare string would otherwise be iterated character by character.
        exts = (exts,)
    wanted = {ext.lower() for ext in exts}

    # Sorting makes the result independent of filesystem enumeration order,
    # so callers that seed their RNG get reproducible samples.
    return sorted(
        os.path.join(dirpath, fn)
        for dirpath, _, filenames in os.walk(root)
        for fn in filenames
        if Path(fn).suffix.lower() in wanted
    )

def main():
    """Copy a random sample of source images into the output directory.

    Reads the module-level settings ``src_dir``, ``exclude_dir``, ``out_dir``
    and ``n_sample``. Images whose base filename also appears under
    ``exclude_dir`` are skipped; up to ``n_sample`` of the remaining images
    are chosen at random (unseeded) and copied with their metadata.

    Raises:
        FileNotFoundError: If ``src_dir`` is not an existing directory.
    """
    # Validate source and exclude directories
    if not os.path.isdir(src_dir):
        raise FileNotFoundError(f"Source directory not found: {src_dir}\n"
                                "If you have a .zip file, extract it first and point src_dir to the extracted folder.")

    if os.path.isdir(exclude_dir):
        # normcase lowercases names on Windows (where filenames are
        # case-insensitive) and is a no-op elsewhere.
        exclude_basenames = {
            os.path.normcase(os.path.basename(p)) for p in list_images(exclude_dir)
        }
    else:
        print(f"Warning: Exclude directory not found: {exclude_dir}\nProceeding with no exclusions.")
        exclude_basenames = set()

    # Keep only source images whose filename is not in the exclude set
    remaining = [
        p for p in list_images(src_dir)
        if os.path.normcase(os.path.basename(p)) not in exclude_basenames
    ]

    if not remaining:
        print("No images available after exclusion.")
        return

    # Sample; make a shortfall visible instead of silently copying fewer files
    k = min(n_sample, len(remaining))
    if k < n_sample:
        print(f"Warning: only {len(remaining)} images available after exclusion; "
              f"requested {n_sample}.")
    sampled = random.sample(remaining, k)

    # Prepare output directory
    os.makedirs(out_dir, exist_ok=True)

    # Copy sampled images; names that collide (same basename from different
    # subfolders, or files already in out_dir) get a numeric suffix.
    used_names = set()
    for src_path in sampled:
        target_name = _unique_name(os.path.basename(src_path), used_names, out_dir)
        shutil.copy2(src_path, os.path.join(out_dir, target_name))
        used_names.add(target_name)

    print(f"Copied {k} images to: {out_dir}")


def _unique_name(base, taken, dest_dir):
    """Return *base*, or a ``name_N.ext`` variant, unused in *taken* and *dest_dir*."""
    stem, ext = os.path.splitext(base)
    candidate = base
    counter = 1
    while candidate in taken or os.path.exists(os.path.join(dest_dir, candidate)):
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    return candidate

# Run the sampling job only when this module is executed directly,
# not when it is imported by another module.
if __name__ == "__main__":
    main()

# Output: Copied 1000 images to: D:\test
