In [17]:
import os
import random
import cv2
from PIL import Image
from imgaug import augmenters as iaa

In [18]:
# Input root: directory holding the class folders
data_dir = "./img_dataset"
# Output root: where the resized and augmented images are written
new_data_dir = "./new_img_dataset"

# Target number of images per class
target_images_per_class = 200

# Augmentation pipeline; each listed augmenter is a child of the Sequential
seq = iaa.Sequential(
    [
        # horizontal flips (argument 0.5)
        iaa.Fliplr(0.5),
        # rotation, range (-10, 10)
        iaa.Affine(rotate=(-10, 10)),
        # Gaussian blur, sigma range (0, 1.0)
        iaa.GaussianBlur(sigma=(0, 1.0)),
        # additive Gaussian noise, scale range from 0 up to 5% of 255
        iaa.AdditiveGaussianNoise(scale=(0, 0.05 * 255)),
    ]
)

In [19]:
# Build a class-balanced, resized copy of the dataset under `new_data_dir`.
# Classes with more than `target_images_per_class` images are randomly
# down-sampled; classes with fewer are topped up with augmented copies.


def _load_rgb(path):
    """Read an image from `path` and return it as an RGB array.

    cv2.imread yields channels in BGR order, whereas PIL's Image.fromarray
    interprets a 3-channel array as RGB, so the channels are reordered here.
    Returns None when OpenCV cannot read the file.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        return None
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


def _save_resized(rgb, path, size=(224, 224)):
    """Resize an RGB array to `size` with bicubic resampling and save it to `path`."""
    Image.fromarray(rgb).resize(size, Image.BICUBIC).save(path)


# Iterate through each class folder
for class_folder in os.listdir(data_dir):
    class_path = os.path.join(data_dir, class_folder)
    if not os.path.isdir(class_path):
        continue

    # Create a corresponding folder in the new dataset directory
    new_class_path = os.path.join(new_data_dir, class_folder)
    os.makedirs(new_class_path, exist_ok=True)

    # List all image files in the class folder
    image_files = [f for f in os.listdir(class_path) if f.endswith(('.jpg', '.png'))]
    num_images = len(image_files)

    # Calculate the number of additional images needed to reach the target
    images_needed = max(0, target_images_per_class - num_images)

    print(f"Class: {class_folder}, Original images: {num_images}, Images needed: {images_needed}")

    # Randomly select a subset of images if there are more than the target
    if num_images > target_images_per_class:
        selected_images = random.sample(image_files, target_images_per_class)
    else:
        selected_images = image_files

    # Load, resize and save the selected images, remembering which ones
    # could actually be read so augmentation only draws from valid files.
    readable_images = []
    for img_file in selected_images:
        rgb = _load_rgb(os.path.join(class_path, img_file))
        if rgb is None:
            print(f"  Skipping unreadable image: {os.path.join(class_path, img_file)}")
            continue
        _save_resized(rgb, os.path.join(new_class_path, img_file))
        readable_images.append(img_file)

    # Unreadable files also leave a gap, so top up against what was saved.
    shortfall = max(0, target_images_per_class - len(readable_images))
    if shortfall and not readable_images:
        print(f"  No readable images in class {class_folder}; cannot augment.")
        continue

    # Generate augmented images one at a time and save them immediately
    # instead of holding every full-resolution result in memory.
    for i in range(shortfall):
        random_image = random.choice(readable_images)
        rgb = _load_rgb(os.path.join(class_path, random_image))
        if rgb is None:
            # File became unreadable between the two passes; skip this slot.
            continue
        augmented_image = seq.augment_image(rgb)
        _save_resized(augmented_image, os.path.join(new_class_path, f"augmented_{i}.jpg"))


Class: distcc_exec_backdoor2, Original images: 4, Images needed: 196
Class: vsftpd, Original images: 1143, Images needed: 0
Class: mirai, Original images: 1140, Images needed: 0
Class: netbios_ssn2, Original images: 1137, Images needed: 0
Class: hydra_ftp2, Original images: 131, Images needed: 69
Class: smtp22, Original images: 31, Images needed: 169
Class: replayAttacks, Original images: 975, Images needed: 0
Class: vsftpd2, Original images: 200, Images needed: 0
Class: distcc_exec_backdoor, Original images: 4, Images needed: 196
Class: blackEnergy, Original images: 35, Images needed: 165
Class: unreallrcd, Original images: 383, Images needed: 0
Class: smtp, Original images: 31, Images needed: 169
Class: ruby_drb2, Original images: 379, Images needed: 0
Class: zeus, Original images: 67, Images needed: 133
Class: hydra_ssh2, Original images: 349, Images needed: 0
Class: netbios_ssn, Original images: 286, Images needed: 0
Class: ruby_drb, Original images: 200, Images needed: 0
Class: 0d