In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sautkin/imagenet1k1")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/sautkin/imagenet1k1?dataset_version_number=2...


100%|██████████| 11.0G/11.0G [01:26<00:00, 137MB/s] 

Extracting files...





Path to dataset files: /home/sagemaker-user/.cache/kagglehub/datasets/sautkin/imagenet1k1/versions/2


In [4]:
import os
import random
from pathlib import Path

def create_split_with_n_classes(
    root_dir,
    n_classes=500,
    val_ratio=0.2,
    train_file="train.txt",
    val_file="val.txt",
    exts=(".jpg", ".jpeg", ".png")
):
    root = Path(root_dir)

    # --- Step 1: choose N random classes ---
    all_classes = [d for d in root.iterdir() if d.is_dir()]
    chosen_classes = all_classes

    # --- Step 2: collect samples ---
    samples = []
    for class_dir in sorted(chosen_classes):
        class_name = class_dir.name
        for img_path in class_dir.rglob("*"):
            if img_path.suffix.lower() in exts:
                rel = img_path.relative_to(root)
                samples.append((f'imagenet/{rel.as_posix()}', class_name))

    # Shuffle
    random.shuffle(samples)

    # --- Step 3: train/val split ---
    val_size = int(len(samples) * val_ratio)
    val_samples = samples[:val_size]
    train_samples = samples[val_size:]

    # --- Step 4: write txt files ---
    with open(train_file, "w") as f:
        for p, c in train_samples:
            f.write(f"{p} {c}\n")

    with open(val_file, "w") as f:
        for p, c in val_samples:
            f.write(f"{p} {c}\n")

    print(f"Train: {len(train_samples)}   Val: {len(val_samples)}")
    print(f"Saved → {train_file}, {val_file}")


# Example
create_split_with_n_classes(
    root_dir="Data/imagenet",
    n_classes=10,
    val_ratio=0.1,
)


Selected classes: ['00500', '00501', '00502', '00503', '00504', '00505', '00506', '00507', '00508', '00509', '00510', '00511', '00512', '00513', '00514', '00515', '00516', '00517', '00518', '00519', '00520', '00521', '00522', '00523', '00524', '00525', '00526', '00527', '00528', '00529', '00530', '00531', '00532', '00533', '00534', '00535', '00536', '00537', '00538', '00539', '00540', '00541', '00542', '00543', '00544', '00545', '00546', '00547', '00548', '00549', '00550', '00551', '00552', '00553', '00554', '00555', '00556', '00557', '00558', '00559', '00560', '00561', '00562', '00563', '00564', '00565', '00566', '00567', '00568', '00569', '00570', '00571', '00572', '00573', '00574', '00575', '00576', '00577', '00578', '00579', '00580', '00581', '00582', '00583', '00584', '00585', '00586', '00587', '00588', '00589', '00590', '00591', '00592', '00593', '00594', '00595', '00596', '00597', '00598', '00599', '00600', '00601', '00602', '00603', '00604', '00605', '00606', '00607', '00608', 