# Build a 10K-image test dataset
This notebook samples a fixed number of images from the full dataset at `./workspace/d23/d23/` and copies them into a dedicated test split folder.

## Workflow
1. Configure source and destination paths plus sampling options.
2. Index all image files recursively under the source root.
3. Randomly sample 10,000 unique paths (reproducible with a fixed seed).
4. Copy the sampled files into a new `test_dataset` folder while preserving their subdirectory structure.
5. Validate the copy by recounting files in the test split.

In [1]:
from pathlib import Path
import random
import shutil
import os

# --- Configuration ---
# Source tree to sample from, and the folder that receives the sampled test split.
dataset_root = Path("./workspace/d23/d23")
test_root = Path("./workspace/d23/test_dataset")

# Sampling options.
sample_count = 10_000
random_seed = 1337

# False: abort if the destination already exists. True: delete it and rebuild from scratch.
overwrite_existing = False

# Suffixes that mark a file as an image.
image_extensions = {".webp", ".gif", ".bmp", ".jpeg", ".jpg", ".png"}

# is_dir() is already False for a missing path, so one check covers both conditions.
assert dataset_root.is_dir(), f"Dataset root {dataset_root} is missing"

if test_root.exists():
    # Refuse to touch an existing split unless the user explicitly opted in.
    if not overwrite_existing:
        raise FileExistsError(f"Destination {test_root} already exists. Set overwrite_existing=True to rebuild it.")
    shutil.rmtree(test_root)

test_root.mkdir(parents=True, exist_ok=True)
print(
    f"Dataset root: {dataset_root.resolve()}",
    f"Test split destination: {test_root.resolve()}",
    f"Target sample size: {sample_count}",
    sep="\n",
)

Dataset root: /home/kazanplova/projects/latent_vae_upscale_train/workspace/d23/d23
Test split destination: /home/kazanplova/projects/latent_vae_upscale_train/workspace/d23/test_dataset
Target sample size: 10000


In [2]:
# Index all image files under the dataset root.
# os.walk yields directories and file names in arbitrary, filesystem-dependent
# order. The index is therefore sorted so that anything selecting from it by
# position (e.g. a seeded random sample) sees the same list on every run and
# on every machine holding the same dataset.
all_images = sorted(
    Path(dirpath) / name
    for dirpath, _, filenames in os.walk(dataset_root)
    for name in filenames
    if Path(name).suffix.lower() in image_extensions  # case-insensitive suffix match
)

total_images = len(all_images)
print(f"Total images discovered: {total_images}")
# Fail early: sampling without replacement needs at least sample_count candidates.
if total_images < sample_count:
    raise ValueError(f"Requested {sample_count} samples but only found {total_images} eligible images.")

Total images discovered: 154598


In [3]:
from tqdm.auto import tqdm
# Draw the test split without replacement from a dedicated, seeded RNG
# (kept separate from the global `random` state).
rng = random.Random(random_seed)
sampled_paths = rng.sample(all_images, k=sample_count)
print(f"Sampled {len(sampled_paths)} images.")

# Mirror each sampled file under test_root, keeping its path relative to the
# dataset root so the subdirectory layout is preserved.
for src_path in tqdm(sampled_paths):
    relative_path = src_path.relative_to(dataset_root)
    dest_path = test_root.joinpath(relative_path)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    # copy2 copies the file contents and attempts to preserve file metadata.
    shutil.copy2(src_path, dest_path)

print("Finished copying sampled files.")

  from .autonotebook import tqdm as notebook_tqdm


Sampled 10000 images.


100%|██████████| 10000/10000 [00:25<00:00, 390.21it/s]

Finished copying sampled files.





In [4]:
# Verify the resulting test split.
# Only regular files are counted, so a directory whose name happens to end in
# an image extension cannot inflate the total. The listing is sorted because
# rglob does not document any ordering, which keeps the preview stable.
copied_images = sorted(
    p for p in test_root.rglob('*')
    if p.is_file() and p.suffix.lower() in image_extensions
)
print(f"Images inside test split: {len(copied_images)}")
# Turn the recount into a real check: a mismatch means the copy step was
# incomplete or the destination holds files from another run.
assert len(copied_images) == sample_count, (
    f"Expected {sample_count} images in {test_root}, found {len(copied_images)}"
)
print("Sample preview:")
for preview_path in copied_images[:5]:
    print(f" - {preview_path.relative_to(test_root)}")

Images inside test split: 10000
Sample preview:
 - ApromaWp/ApromaWp_13040905.png
 - ApromaWp/ApromaWp_854586d5.png
 - ApromaWp/ApromaWp_978606d8.png
 - ApromaWp/ApromaWp_e36bbc7f.png
 - ApromaWp/ApromaWp_f167e7ad.png
