In [1]:
import os
import shutil
import random
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Source dataset root: one sub-folder per class label.
dataset_path = r"E:\train_gen_joints"
# Destination root for the resampled copy of the dataset.
balanced_dataset_path = r"E:\train_gen_joints_balanced"

# exist_ok=True creates the directory only when it is missing, without the
# separate check-then-create step that can race with another process.
os.makedirs(balanced_dataset_path, exist_ok=True)

# Load images and labels
def load_images_and_labels(dataset_path):
    """Collect file paths and class labels from a folder-per-class dataset.

    Every immediate sub-directory of ``dataset_path`` is treated as one class,
    and its name is used as the label for each regular file found directly
    inside it. Entries that are not regular files (e.g. nested directories)
    are skipped, because downstream code copies each path as a single file.

    Directory listings are sorted so the returned order is the same on every
    run and platform (``os.listdir`` gives no ordering guarantee).

    Args:
        dataset_path: Root directory containing one sub-directory per class.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists:
        ``images[i]`` is a file path and ``labels[i]`` is the name of the
        class directory that contains it.
    """
    images = []
    labels = []
    for emotion_dir in sorted(os.listdir(dataset_path)):
        emotion_path = os.path.join(dataset_path, emotion_dir)
        # Stray files at the dataset root are not classes.
        if not os.path.isdir(emotion_path):
            continue
        for img_file in sorted(os.listdir(emotion_path)):
            img_path = os.path.join(emotion_path, img_file)
            # Only regular files can be copied later with shutil.copy.
            if not os.path.isfile(img_path):
                continue
            images.append(img_path)
            labels.append(emotion_dir)
    return images, labels

# Gather every path found under dataset_path together with its class label
# (the name of the sub-folder that contains it).
images, labels = load_images_and_labels(dataset_path)

# Randomize dataset with target size between 50k-60k
def balance_with_random_target_size(images, labels, min_size=50000, max_size=60000, seed=None):
    """Resample each class to its own random size in ``[min_size, max_size]``.

    A separate target size is drawn for every class, so the resulting classes
    are roughly, not exactly, equal in size.

    * Classes larger than their target are undersampled without replacement.
    * Classes smaller than their target are oversampled: the full class is
      repeated as many whole times as fits, and the remainder is filled with
      a random sample (without replacement) of the class.
    * Classes that already match their target are kept unchanged.

    Args:
        images: Sequence of image identifiers (e.g. file paths).
        labels: Sequence of class labels, aligned with ``images``.
        min_size: Inclusive lower bound for the per-class target size.
        max_size: Inclusive upper bound for the per-class target size.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global random state is
            left untouched. When ``None`` (default) the module-level
            ``random`` functions are used, as before.

    Returns:
        A tuple ``(balanced_images, balanced_labels)`` of two aligned lists,
        grouped by class in order of each label's first appearance.

    Raises:
        ValueError: If ``min_size`` is greater than ``max_size`` (raised by
            ``randint`` when at least one class is present).
    """
    rng = random if seed is None else random.Random(seed)

    # Group images by class; dict insertion order keeps first-seen label order.
    label_to_images = {}
    for img, label in zip(images, labels):
        label_to_images.setdefault(label, []).append(img)

    balanced_images = []
    balanced_labels = []
    for label, imgs in label_to_images.items():
        # Each class draws its own target size.
        target_size = rng.randint(min_size, max_size)

        if len(imgs) > target_size:
            # Undersample: keep a random subset of exactly target_size items.
            imgs = rng.sample(imgs, target_size)
        elif len(imgs) < target_size:
            # Oversample: whole repetitions first, then a random partial pass
            # so no item is repeated more than one extra time over the others.
            full_copies, remainder = divmod(target_size, len(imgs))
            imgs = imgs * full_copies + rng.sample(imgs, remainder)

        balanced_images.extend(imgs)
        balanced_labels.extend([label] * len(imgs))

    return balanced_images, balanced_labels

# Resample every class to its own randomly drawn size between 50000 and 60000
# (inclusive): larger classes are undersampled, smaller ones oversampled.
balanced_images, balanced_labels = balance_with_random_target_size(images, labels, min_size=50000, max_size=60000)

def save_balanced_dataset(images, labels, output_path):
    """Copy each image into ``output_path/<label>/``, one file per list entry.

    The same source path may appear several times in ``images`` (for example
    after oversampling). Copying all of them under the source basename would
    overwrite the earlier copies and leave fewer files on disk than entries
    in ``images``. The first occurrence of a name therefore keeps the original
    basename, and each later occurrence within the same label directory gets a
    ``_copyN`` suffix before the extension (``a.png``, ``a_copy1.png``, ...).

    Args:
        images: Sequence of source file paths.
        labels: Sequence of class labels, aligned with ``images``; each label
            is used as the name of a sub-directory of ``output_path``.
        output_path: Root directory of the output dataset. Label
            sub-directories are created as needed.

    Returns:
        None.
    """
    used_names = {}   # label -> set of file names already written this call
    next_suffix = {}  # (label, basename) -> next "_copyN" index to try

    for img_path, label in zip(images, labels):
        label_dir = os.path.join(output_path, label)
        if label not in used_names:
            # First file for this label: make sure its directory exists.
            os.makedirs(label_dir, exist_ok=True)
            used_names[label] = set()
        taken = used_names[label]

        img_filename = os.path.basename(img_path)
        if img_filename in taken:
            # Duplicate name: derive a unique one instead of overwriting.
            stem, ext = os.path.splitext(img_filename)
            key = (label, img_filename)
            index = next_suffix.get(key, 1)
            candidate = f"{stem}_copy{index}{ext}"
            while candidate in taken:
                index += 1
                candidate = f"{stem}_copy{index}{ext}"
            # Remember where to resume so repeated duplicates stay cheap.
            next_suffix[key] = index + 1
            img_filename = candidate
        taken.add(img_filename)

        shutil.copy(img_path, os.path.join(label_dir, img_filename))

# Write the resampled dataset to disk, one sub-folder per class label.
save_balanced_dataset(balanced_images, balanced_labels, balanced_dataset_path)

# Report per-class counts before and after resampling. Both counts are taken
# from the in-memory label lists, not from the files actually present on disk.
print(f"Original dataset distribution: {Counter(labels)}")
print(f"Balanced dataset distribution: {Counter(balanced_labels)}")


Original dataset distribution: Counter({'Happiness': 101214, 'Peace': 93486, 'Sadness': 53310, 'Anger': 44798, 'Surprise': 38936, 'Fear': 34626, 'Aversion': 29767})
Balanced dataset distribution: Counter({'Aversion': 57758, 'Sadness': 57347, 'Peace': 55694, 'Happiness': 54907, 'Surprise': 51685, 'Fear': 51145, 'Anger': 51073})
