In [5]:
import os
import shutil
import random
import cv2
import numpy as np

from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img

In [6]:
# Input: one sub-folder per class, each holding that class's raw images.
RAW_DATA_ROOT = '../../data/images/NGD_HACK'

# Output: train/<class_name> and test/<class_name> sub-folders are created here.
AUG_DATA_ROOT = '../../data/aug_data'

# Train/Test split ratio
TRAIN_SPLIT = 0.8  # 80% train, 20% test

# Fixed seed so the shuffle-based split and the random augmentations can be
# repeated from a fresh kernel. Python's `random` drives the shuffling/sampling
# in this notebook; NumPy's global RNG is seeded as well because Keras'
# ImageDataGenerator is expected to draw its transform parameters from it
# (NOTE(review): confirm for the installed Keras version).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Typical augmentations: rotation, zoom, shifts, horizontal flips, brightness.
# Tweak these depending on your scenario.
datagen = ImageDataGenerator(
    rotation_range=20,            # rotation range passed to Keras (20)
    zoom_range=0.1,               # zoom range passed to Keras (0.1)
    width_shift_range=0.1,        # horizontal shift range
    height_shift_range=0.1,       # vertical shift range
    horizontal_flip=True,
    vertical_flip=False,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'           # how pixels exposed by a transform are filled
)


In [7]:

# ---------------------------------------
# FIND ALL CLASSES AND IMAGES
# ---------------------------------------
# File extensions (compared case-insensitively) that count as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

class_to_images = {}  # { "4011": [list_of_image_paths], ... }

# os.listdir returns entries in arbitrary order, so both listings are sorted:
# the class order and the per-class file order then do not depend on the
# filesystem, which keeps any later seeded shuffle repeatable.
for class_name in sorted(os.listdir(RAW_DATA_ROOT)):
    class_path = os.path.join(RAW_DATA_ROOT, class_name)
    if not os.path.isdir(class_path):
        continue  # skip non-directory entries
    # Keep only image files; paths are RAW_DATA_ROOT-relative joins
    # (not absolute paths, since RAW_DATA_ROOT itself is relative).
    class_to_images[class_name] = [
        os.path.join(class_path, file_name)
        for file_name in sorted(os.listdir(class_path))
        if file_name.lower().endswith(IMAGE_EXTENSIONS)
    ]

In [8]:
# ---------------------------------------
# DETERMINE CLASS DISTRIBUTION & TARGET
# ---------------------------------------
# Image count per class; classes without any images are left out.
class_sizes = {
    name: len(paths)
    for name, paths in class_to_images.items()
    if paths
}
for name, n_images in class_sizes.items():
    print(f"Class '{name}' has {n_images} images.")

# Size of the largest class (0 when no class has images).
max_count = max(class_sizes.values(), default=0)
print(f"\nMax number of images in any class is {max_count}")

# Every class's train set is oversampled up to this many images.
# To use a fixed target instead of the largest class size, override it here,
# e.g. TARGET_IMAGES_PER_CLASS = 2000.
TARGET_IMAGES_PER_CLASS = max_count

Class '4011' has 234 images.
Class '4015' has 484 images.
Class '4088' has 350 images.
Class '4196' has 474 images.
Class '7020097009819' has 362 images.
Class '7020097026113' has 122 images.
Class '7023026089401' has 198 images.
Class '7035620058776' has 52 images.
Class '7037203626563' has 94 images.
Class '7037206100022' has 324 images.
Class '7038010009457' has 140 images.
Class '7038010013966' has 322 images.
Class '7038010021145' has 140 images.
Class '7038010054488' has 240 images.
Class '7038010068980' has 326 images.
Class '7039610000318' has 232 images.
Class '7040513000022' has 276 images.
Class '7040513001753' has 110 images.
Class '7040913336684' has 140 images.
Class '7044610874661' has 576 images.
Class '7048840205868' has 136 images.
Class '7071688004713' has 102 images.
Class '7622210410337' has 148 images.
Class '90433917' has 186 images.
Class '90433924' has 270 images.
Class '94011' has 450 images.

Max number of images in any class is 576


In [9]:
# Make sure the two top-level output folders exist:
# data/aug_data/train/<class_name>  and data/aug_data/test/<class_name>
# (the per-class sub-folders are created later, when each class is processed).
for split_name in ('train', 'test'):
    os.makedirs(os.path.join(AUG_DATA_ROOT, split_name), exist_ok=True)

In [10]:
def _copy_images(paths, out_dir):
    """Copy every file in `paths` into `out_dir`, keeping the original file names."""
    for src_path in paths:
        shutil.copy2(src_path, os.path.join(out_dir, os.path.basename(src_path)))


def augment_class_images(class_name, image_paths, target_count, seed=42):
    """
    Split one class's raw images into train/test and oversample the train set.

    The images are shuffled and split according to TRAIN_SPLIT. Test images are
    copied unchanged to AUG_DATA_ROOT/test/<class_name>. Train images are copied
    unchanged to AUG_DATA_ROOT/train/<class_name>, and augmented variants
    (named aug_<class_name>_<i>.jpg) are generated with `datagen` until the
    train folder holds `target_count` images.

    Args:
        class_name: Name of the class; used as the output sub-folder name.
        image_paths: Paths of the raw images of this class. Not modified.
        target_count: Desired number of train images (originals + augmented).
            If the train split already has at least this many images, nothing
            is generated.
        seed: Seed for the train/test shuffle and for choosing which train
            image each augmented sample is derived from. With a fixed seed,
            re-running on the same inputs reproduces the same split, so a
            re-run does not mix earlier test images into the train folder.
            Pass None for a non-repeatable split. The augmentation
            parameters themselves come from `datagen` and are not controlled
            by this argument.

    Returns:
        None. Returns immediately when `image_paths` is empty.

    Raises:
        OSError: If an augmented image could not be written to disk.
    """
    # Work on a sorted copy: the caller's list stays untouched, and the seeded
    # shuffle below starts from the same order on every run.
    paths = sorted(image_paths)
    if not paths:
        return

    rng = random.Random(seed)
    rng.shuffle(paths)

    # Determine train/test sizes
    train_size = int(len(paths) * TRAIN_SPLIT)
    train_paths = paths[:train_size]
    test_paths = paths[train_size:]

    # Create subdirs
    train_out_dir = os.path.join(AUG_DATA_ROOT, 'train', class_name)
    test_out_dir = os.path.join(AUG_DATA_ROOT, 'test', class_name)
    os.makedirs(train_out_dir, exist_ok=True)
    os.makedirs(test_out_dir, exist_ok=True)

    # Test images are copied as-is (never augmented); the original train
    # images are kept as well, augmented samples are added on top of them.
    _copy_images(test_paths, test_out_dir)
    _copy_images(train_paths, train_out_dir)

    current_count = len(train_paths)
    print(f"Augmenting {class_name} from {current_count} up to {target_count} images in train set...")

    if current_count < target_count:
        if not train_paths:
            # Very small classes can end up with an empty train split
            # (e.g. a single image with TRAIN_SPLIT < 1); there is nothing
            # to augment from, so skip instead of failing on an empty choice.
            print(f"Skipping augmentation for {class_name}: train split is empty.")
            return

        # Simple approach: repeatedly pick a random train image and generate
        # one augmented sample from it until the target is reached.
        for aug_index in range(target_count - current_count):
            source_img_path = rng.choice(train_paths)
            img = load_img(source_img_path)
            x = np.expand_dims(img_to_array(img), axis=0)  # add a leading batch axis

            # datagen.flow yields batches of augmented images; take the single sample
            batch = next(datagen.flow(x, batch_size=1))
            # Clip before casting so out-of-range values cannot wrap around in uint8.
            augmented_img = np.clip(batch[0], 0, 255).astype('uint8')

            aug_filename = f"aug_{class_name}_{aug_index}.jpg"
            aug_out_path = os.path.join(train_out_dir, aug_filename)
            # The array is converted RGB -> BGR before being handed to cv2.imwrite.
            written = cv2.imwrite(aug_out_path, cv2.cvtColor(augmented_img, cv2.COLOR_RGB2BGR))
            if not written:
                # cv2.imwrite reports failure via its return value; surface it
                # instead of silently ending up with fewer images than requested.
                raise OSError(f"Failed to write augmented image: {aug_out_path}")
    else:
        print(f"No augmentation needed for {class_name} (already has >= target_count)")

In [11]:

# ---------------------------------------
# RUN AUGMENTATION FOR EACH CLASS
# ---------------------------------------

# One progress-bar step per class; classes without images are skipped.
class_progress = tqdm(class_to_images.items(), desc="Classes")
for class_name, image_paths in class_progress:
    if image_paths:
        augment_class_images(class_name, image_paths, TARGET_IMAGES_PER_CLASS)

print("\nData augmentation complete!")
print(f"Augmented data directory: {AUG_DATA_ROOT}")


Classes:   0%|          | 0/26 [00:00<?, ?it/s]

Augmenting 4011 from 187 up to 576 images in train set...


Classes:   4%|▍         | 1/26 [00:20<08:28, 20.33s/it]

Augmenting 4015 from 387 up to 576 images in train set...


Classes:   8%|▊         | 2/26 [00:31<05:55, 14.83s/it]

Augmenting 4088 from 280 up to 576 images in train set...


Classes:  12%|█▏        | 3/26 [00:46<05:49, 15.22s/it]

Augmenting 4196 from 379 up to 576 images in train set...


Classes:  15%|█▌        | 4/26 [00:57<04:57, 13.51s/it]

Augmenting 7020097009819 from 289 up to 576 images in train set...


Classes:  19%|█▉        | 5/26 [01:12<04:54, 14.04s/it]

Augmenting 7020097026113 from 97 up to 576 images in train set...


Classes:  23%|██▎       | 6/26 [01:36<05:48, 17.41s/it]

Augmenting 7023026089401 from 158 up to 576 images in train set...


Classes:  27%|██▋       | 7/26 [01:58<05:56, 18.79s/it]

Augmenting 7035620058776 from 41 up to 576 images in train set...


Classes:  31%|███       | 8/26 [02:23<06:16, 20.90s/it]

Augmenting 7037203626563 from 75 up to 576 images in train set...


Classes:  35%|███▍      | 9/26 [02:49<06:20, 22.35s/it]

Augmenting 7037206100022 from 259 up to 576 images in train set...


Classes:  38%|███▊      | 10/26 [03:06<05:34, 20.88s/it]

Augmenting 7038010009457 from 112 up to 576 images in train set...


Classes:  42%|████▏     | 11/26 [03:30<05:23, 21.58s/it]

Augmenting 7038010013966 from 257 up to 576 images in train set...


Classes:  46%|████▌     | 12/26 [03:47<04:44, 20.32s/it]

Augmenting 7038010021145 from 112 up to 576 images in train set...


Classes:  50%|█████     | 13/26 [04:10<04:32, 21.00s/it]

Augmenting 7038010054488 from 192 up to 576 images in train set...


Classes:  54%|█████▍    | 14/26 [04:29<04:05, 20.44s/it]

Augmenting 7038010068980 from 260 up to 576 images in train set...


Classes:  58%|█████▊    | 15/26 [04:46<03:33, 19.40s/it]

Augmenting 7039610000318 from 185 up to 576 images in train set...


Classes:  62%|██████▏   | 16/26 [05:07<03:18, 19.81s/it]

Augmenting 7040513000022 from 220 up to 576 images in train set...


Classes:  65%|██████▌   | 17/26 [05:26<02:56, 19.66s/it]

Augmenting 7040513001753 from 88 up to 576 images in train set...


Classes:  69%|██████▉   | 18/26 [05:51<02:49, 21.21s/it]

Augmenting 7040913336684 from 112 up to 576 images in train set...


Classes:  73%|███████▎  | 19/26 [06:13<02:30, 21.49s/it]

Augmenting 7044610874661 from 460 up to 576 images in train set...


Classes:  77%|███████▋  | 20/26 [06:20<01:42, 17.16s/it]

Augmenting 7048840205868 from 108 up to 576 images in train set...


Classes:  81%|████████  | 21/26 [06:42<01:33, 18.69s/it]

Augmenting 7071688004713 from 81 up to 576 images in train set...


Classes:  85%|████████▍ | 22/26 [07:06<01:20, 20.19s/it]

Augmenting 7622210410337 from 118 up to 576 images in train set...


Classes:  88%|████████▊ | 23/26 [07:28<01:02, 20.89s/it]

Augmenting 90433917 from 148 up to 576 images in train set...


Classes:  92%|█████████▏| 24/26 [07:50<00:42, 21.01s/it]

Augmenting 90433924 from 216 up to 576 images in train set...


Classes:  96%|█████████▌| 25/26 [08:07<00:20, 20.04s/it]

Augmenting 94011 from 360 up to 576 images in train set...


Classes: 100%|██████████| 26/26 [08:19<00:00, 19.22s/it]


Data augmentation complete!
Augmented data directory: ../../data/aug_data





In [None]:
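# ---------------------------------------
#  SPLIT INTO TRAIN/TEST FOLDERS (DISABLED)
# ---------------------------------------
# NOTE: everything in this cell is commented out and does not run. It is a
# plain 80/20 train/test copy without augmentation that writes to
# ../../data/train and ../../data/test. `augment_class_images` above appears
# to cover the same split (plus oversampling) - confirm before removing this cell.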
# import os
# import shutil
# import random

# SOURCE_DIR = '../../data/images/NGD_HACK'
# TRAIN_DIR = '../../data/train'
# TEST_DIR = '../../data/test'
# SPLIT_RATIO = 0.8  # 80% train, 20% test

# os.makedirs(TRAIN_DIR, exist_ok=True)
# os.makedirs(TEST_DIR, exist_ok=True)

# for class_name in os.listdir(SOURCE_DIR):
#     class_path = os.path.join(SOURCE_DIR, class_name)
#     if not os.path.isdir(class_path):
#         continue
    
#     # Gather all image files
#     all_files = [f for f in os.listdir(class_path) 
#                  if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
#     if len(all_files) == 0:
#         continue
    
#     # Shuffle for randomization
#     random.shuffle(all_files)
#     split_index = int(len(all_files) * SPLIT_RATIO)
    
#     train_files = all_files[:split_index]
#     test_files = all_files[split_index:]
    
#     # Make sure class subfolders exist in train/test
#     train_class_dir = os.path.join(TRAIN_DIR, class_name)
#     test_class_dir = os.path.join(TEST_DIR, class_name)
#     os.makedirs(train_class_dir, exist_ok=True)
#     os.makedirs(test_class_dir, exist_ok=True)
    
#     # Copy images into train
#     for file_name in train_files:
#         src = os.path.join(class_path, file_name)
#         dst = os.path.join(train_class_dir, file_name)
#         shutil.copy2(src, dst)
    
#     # Copy images into test
#     for file_name in test_files:
#         src = os.path.join(class_path, file_name)
#         dst = os.path.join(test_class_dir, file_name)
#         shutil.copy2(src, dst)

# print("Split complete! Train/Test folders created.")
