In [2]:
import os
import json

root = "imagenet100"   # <<< CHANGE THIS ###

train_folders = ["train.X1", "train.X2", "train.X3", "train.X4"]
val_folder = "val.X"


def _class_dirs(split_path):
    """Return the sorted names of the sub-directories directly under split_path."""
    return sorted(
        name
        for name in os.listdir(split_path)
        if os.path.isdir(os.path.join(split_path, name))
    )


def _write_split(out_file, split_path, label_of):
    """Write one "<path> <label>" line for every regular file in each class dir.

    out_file   -- open, writable text file object
    split_path -- directory whose sub-directories are the class folders
    label_of   -- mapping {class folder name: integer label}
    """
    for cls_name in _class_dirs(split_path):
        cls_path = os.path.join(split_path, cls_name)
        label = label_of[cls_name]

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # generated list identical from run to run and machine to machine, so
        # anything that later indexes into these lines (e.g. seeded sampling)
        # stays reproducible.
        for fname in sorted(os.listdir(cls_path)):
            fpath = os.path.join(cls_path, fname)
            if os.path.isfile(fpath):
                out_file.write(f"{fpath} {label}\n")


# -------------------------------
# STEP 1 — Collect all class names
# -------------------------------
val_path = os.path.join(root, val_folder)

# Union of the class folders seen in every train split and in val.
all_classes = set()
for split in train_folders:
    all_classes.update(_class_dirs(os.path.join(root, split)))
all_classes.update(_class_dirs(val_path))

# sort + create mapping {class_string: index}
all_classes = sorted(all_classes)
class_to_idx = {cls_name: idx for idx, cls_name in enumerate(all_classes)}

print("Total classes:", len(class_to_idx))
print("Example mapping:", list(class_to_idx.items())[:10])

# Save mapping to json
with open("class_to_idx.json", "w") as f:
    json.dump(class_to_idx, f, indent=4)

# -------------------------------
# STEP 2 — Write train.txt
# -------------------------------
with open("train.txt", "w") as train_txt:
    for split in train_folders:
        _write_split(train_txt, os.path.join(root, split), class_to_idx)

# -------------------------------
# STEP 3 — Write val.txt
# -------------------------------
with open("val.txt", "w") as val_txt:
    _write_split(val_txt, val_path, class_to_idx)

print("✔ train.txt, val.txt, and class_to_idx.json created successfully!")


Total classes: 100
Example mapping: [('n01440764', 0), ('n01443537', 1), ('n01484850', 2), ('n01491361', 3), ('n01494475', 4), ('n01496331', 5), ('n01498041', 6), ('n01514668', 7), ('n01514859', 8), ('n01531178', 9)]
✔ train.txt, val.txt, and class_to_idx.json created successfully!


In [3]:
import random
from collections import defaultdict

TRAIN_TXT = "train.txt"
VAL_TXT = "val.txt"
NEW_TRAIN_TXT = "train_new.txt"
NEW_VAL_TXT = "val_new.txt"

SAMPLES_PER_CLASS = 100
RANDOM_SEED = 42
MOVE_INSTEAD_OF_DUPLICATE = True  # set False if you want to KEEP them in train as well

random.seed(RANDOM_SEED)

# -----------------------------
# 1. Read existing train & val
# -----------------------------
with open(TRAIN_TXT, "r") as f:
    train_lines = [line.strip() for line in f if line.strip()]

with open(VAL_TXT, "r") as f:
    val_lines = [line.strip() for line in f if line.strip()]

# Parse train into (path, label), splitting on the LAST space so that a path
# containing spaces is still handled.
train_entries = []
for entry_no, line in enumerate(train_lines, start=1):
    path, sep, label = line.rpartition(" ")
    if not sep:
        # Fail with the offending entry instead of an opaque unpacking error.
        raise ValueError(
            f"{TRAIN_TXT}: non-blank entry #{entry_no} has no label: {line!r}"
        )
    train_entries.append((path, label))

# ---------------------------------------
# 2. Group train entries by class/label
# ---------------------------------------
by_class = defaultdict(list)  # label -> list of indices in train_entries

for idx, (path, label) in enumerate(train_entries):
    by_class[label].append(idx)

print(f"Found {len(by_class)} classes in train.txt")

# -------------------------------------------------------
# 3. Sample up to SAMPLES_PER_CLASS per class from train
# -------------------------------------------------------
selected_indices = set()
selected_lines_for_val = []

# Every key in by_class was created by an append above, so each index list has
# at least one element and no empty-class guard is needed.
for label, indices in by_class.items():
    n_samples = min(SAMPLES_PER_CLASS, len(indices))
    sampled = random.sample(indices, n_samples)

    for idx in sampled:
        path, lbl = train_entries[idx]
        selected_lines_for_val.append(f"{path} {lbl}")
        selected_indices.add(idx)

    print(f"Class {label}: selected {n_samples} samples to move/add to val")

# ---------------------------------------
# 4. Build new train and val lists
# ---------------------------------------
if MOVE_INSTEAD_OF_DUPLICATE:
    # Option A: move them (drop the sampled entries from train)
    new_train_lines = [
        f"{path} {label}"
        for i, (path, label) in enumerate(train_entries)
        if i not in selected_indices
    ]
else:
    # Option B: keep all original train entries (sampled ones appear in both files)
    new_train_lines = [f"{path} {label}" for (path, label) in train_entries]

# New val = old val + new sampled ones
new_val_lines = val_lines + selected_lines_for_val

# ---------------------------------------
# 5. Write out new files
# ---------------------------------------
with open(NEW_TRAIN_TXT, "w") as f:
    f.writelines(line + "\n" for line in new_train_lines)

with open(NEW_VAL_TXT, "w") as f:
    f.writelines(line + "\n" for line in new_val_lines)

print("Done!")
print(f"Original train: {len(train_lines)} lines")
print(f"New train:      {len(new_train_lines)} lines")
print(f"Original val:   {len(val_lines)} lines")
print(f"New val:        {len(new_val_lines)} lines")


Found 100 classes in train.txt
Class 0: selected 100 samples to move/add to val
Class 2: selected 100 samples to move/add to val
Class 4: selected 100 samples to move/add to val
Class 9: selected 100 samples to move/add to val
Class 20: selected 100 samples to move/add to val
Class 23: selected 100 samples to move/add to val
Class 29: selected 100 samples to move/add to val
Class 31: selected 100 samples to move/add to val
Class 41: selected 100 samples to move/add to val
Class 53: selected 100 samples to move/add to val
Class 55: selected 100 samples to move/add to val
Class 59: selected 100 samples to move/add to val
Class 61: selected 100 samples to move/add to val
Class 62: selected 100 samples to move/add to val
Class 65: selected 100 samples to move/add to val
Class 73: selected 100 samples to move/add to val
Class 74: selected 100 samples to move/add to val
Class 75: selected 100 samples to move/add to val
Class 78: selected 100 samples to move/add to val
Class 82: selected 100 

In [5]:
from collections import defaultdict

train_file = "train.txt"
EXPECTED_PER_CLASS = 1000  # number of images every class is expected to have

# label (as the string read from the file) -> number of lines carrying it
class_counts = defaultdict(int)

with open(train_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # crashing on the unpack below.
            continue
        # Split on the last space so a path containing spaces still parses.
        _, label = line.rsplit(" ", 1)
        class_counts[label] += 1

# Print counts in numeric label order
for label in sorted(class_counts, key=int):
    print(f"Class {label}: {class_counts[label]} images")

# Check that every class has exactly EXPECTED_PER_CLASS images.
# An empty file has no classes at all, which must not be reported as OK.
all_ok = bool(class_counts) and all(
    count == EXPECTED_PER_CLASS for count in class_counts.values()
)

print(f"\nAll classes have {EXPECTED_PER_CLASS} images?:", all_ok)


Class 0: 1200 images
Class 1: 1200 images
Class 2: 1200 images
Class 3: 1200 images
Class 4: 1200 images
Class 5: 1200 images
Class 6: 1200 images
Class 7: 1200 images
Class 8: 1200 images
Class 9: 1200 images
Class 10: 1200 images
Class 11: 1200 images
Class 12: 1200 images
Class 13: 1200 images
Class 14: 1200 images
Class 15: 1200 images
Class 16: 1200 images
Class 17: 1200 images
Class 18: 1200 images
Class 19: 1200 images
Class 20: 1200 images
Class 21: 1200 images
Class 22: 1200 images
Class 23: 1200 images
Class 24: 1200 images
Class 25: 1200 images
Class 26: 1200 images
Class 27: 1200 images
Class 28: 1200 images
Class 29: 1200 images
Class 30: 1200 images
Class 31: 1200 images
Class 32: 1200 images
Class 33: 1200 images
Class 34: 1200 images
Class 35: 1200 images
Class 36: 1200 images
Class 37: 1200 images
Class 38: 1200 images
Class 39: 1200 images
Class 40: 1200 images
Class 41: 1200 images
Class 42: 1200 images
Class 43: 1200 images
Class 44: 1200 images
Class 45: 1200 image

In [8]:
import random

inp = "val.txt"
out = "val_shuffled.txt"
SHUFFLE_SEED = 42  # fixed seed so the shuffled order is reproducible

# Read the non-blank lines and make sure each one ends with exactly one
# newline. Without this, a final line lacking "\n" would be glued to the
# following record once the shuffle moves it away from the end of the file.
with open(inp, "r") as f:
    lines = [line.rstrip("\n") + "\n" for line in f if line.strip()]

# Use a private generator: the result is reproducible and the global
# `random` state shared with other cells is left untouched.
random.Random(SHUFFLE_SEED).shuffle(lines)

with open(out, "w") as f:
    f.writelines(lines)

# Report the files that were actually read and written.
print(f"✔ Shuffled {inp} → {out}")


✔ Shuffled train.txt → train_shuffled.txt


In [1]:
import random

in_file = "train.txt"
out_file = "train_expanded.txt"
factor = 10    # expand to 10× size
EXPAND_SEED = 42  # fixed seed so the expanded list is reproducible

# Read original lines (blank lines dropped, surrounding whitespace removed)
with open(in_file, "r") as f:
    lines = [line.strip() for line in f if line.strip()]

orig_len = len(lines)
target_len = orig_len * factor

# Random sampling with replacement, drawn from a private generator so the
# global `random` state shared with other cells is not disturbed.
rng = random.Random(EXPAND_SEED)
expanded = rng.choices(lines, k=target_len)

# Write new file, terminating EVERY line (including the last) with a newline
# so the file can be appended to or shuffled line-wise without two records
# ending up on the same line.
with open(out_file, "w") as f:
    f.writelines(f"{line}\n" for line in expanded)

print(f"Original length: {orig_len}")
print(f"Expanded length: {len(expanded)}")
print(f"Saved to: {out_file}")


Original length: 100000
Expanded length: 1000000
Saved to: train_expanded.txt


In [2]:
import random

input_file = "val.txt"
output_file = "val_shuffled.txt"
SHUFFLE_SEED = 42  # fixed seed so the shuffled order is reproducible

lines = []
with open(input_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # ignore blank lines rather than failing on the unpack

        # Split on the LAST whitespace run: the label is the final token, and
        # a path that itself contains spaces stays intact.
        path, cls = line.rsplit(None, 1)

        # Remove the first part of the path (everything up to the first '/').
        # A path without any '/' has no leading component, so it is kept as is
        # instead of raising IndexError.
        _, sep, remainder = path.partition("/")
        new_path = remainder if sep else path

        lines.append(f"{new_path} {cls}\n")

# Shuffle with a private generator: reproducible, and the global `random`
# state shared with other cells is left untouched.
random.Random(SHUFFLE_SEED).shuffle(lines)

with open(output_file, "w") as f:
    f.writelines(lines)

print("Done! Saved to", output_file)


Done! Saved to val_shuffled.txt
