In [None]:

import os
import cv2
import shutil
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import albumentations as A
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Central path configuration for this notebook. All paths are relative to the
# notebook's working directory.
config = {
    # Input metadata CSV read by load_and_preprocess_data().
    "csv_path": "processed_data/cleaned_imbalance_metadata.csv",
    # Train / validation splits written by load_and_preprocess_data().
    "train_set_csv": "processed_data/train_cleaned_imbalance_metadata.csv",
    "val_set_csv": "processed_data/val_cleaned_imbalance_metadata.csv",
    # File that receives the fitted LabelEncoder's classes_ array (via np.save).
    "label_encoder_path": "processed_data/le_cleaned_imbalance_metadata.npy",
    # Source images, laid out as <dir>/<label>/<image_id>.
    "original_images_dir": "Dataset/train_images/",
    # Output directory passed to generate_augmented_images() further down.
    "generated_images_dir": "Dataset/generated_images/",
    # NOTE(review): not referenced by any code visible in this notebook -- confirm it is still needed.
    "augmented_images_dir": "Dataset/augmented_images/",
}

def load_and_preprocess_data(random_state=42):
    """Load the metadata CSV, encode labels, and write a stratified 80/20 split.

    Reads ``config["csv_path"]``, adds a ``label_encoded`` column produced by a
    freshly fitted ``LabelEncoder``, saves the encoder's ``classes_`` array to
    ``config["label_encoder_path"]``, and writes the train and validation
    frames to ``config["train_set_csv"]`` / ``config["val_set_csv"]``.

    Args:
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(train_df, val_df)`` tuple of DataFrames.
    """
    metadata = pd.read_csv(config["csv_path"])

    encoder = LabelEncoder()
    metadata['label_encoded'] = encoder.fit_transform(metadata['label'])
    print(f"Label classes: {encoder.classes_}")

    # Persist the class order so encoded integers can be mapped back to names later.
    with open(config['label_encoder_path'], 'wb') as classes_file:
        np.save(classes_file, encoder.classes_)

    # Stratify on the raw label so both splits keep the class proportions.
    train_df, val_df = train_test_split(
        metadata,
        test_size=0.2,
        stratify=metadata['label'],
        random_state=random_state,
    )

    for split_frame, config_key in ((train_df, 'train_set_csv'), (val_df, 'val_set_csv')):
        split_frame.to_csv(config[config_key], index=False)

    return train_df, val_df


In [None]:
def move_images_based_on_csv(csv_path, src_dir, dest_dir):
    """Copy every image listed in a CSV into ``dest_dir``, keeping label folders.

    Despite the name, files are copied with ``shutil.copy2`` and the sources
    are left in place.

    Args:
        csv_path: CSV file with at least ``label`` and ``image_id`` columns.
        src_dir: Root directory containing ``<label>/<image_id>`` files.
        dest_dir: Root directory that receives the same ``<label>/<image_id>``
            layout; missing label folders are created.
    """
    records = pd.read_csv(csv_path).to_dict("records")
    for record in tqdm(records, total=len(records)):
        relative_path = os.path.join(record['label'], record['image_id'])
        target = os.path.join(dest_dir, relative_path)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        shutil.copy2(os.path.join(src_dir, relative_path), target)

In [None]:
def generate_augmented_images(df_resampled, original_dir, output_dir):
    """Create an augmented image for every row that has no original file.

    For each row of ``df_resampled`` whose ``<label>/<image_id>`` does not
    exist as a ``.jpg`` under ``original_dir``, a random ``.jpg`` of the same
    label is read from ``original_dir``, passed through a random
    flip / CLAHE / gamma / hue-saturation pipeline, and written to
    ``output_dir/<label>/<image_id>``.

    Rows are skipped (and counted in a final "Skipped" message) when the label
    has no source ``.jpg`` files, when the chosen source image cannot be
    decoded, or when the output file cannot be written.

    Args:
        df_resampled: DataFrame with ``label`` and ``image_id`` columns.
        original_dir: Root directory of the original ``<label>/<file>.jpg`` images.
        output_dir: Root directory that receives the generated images.
    """
    os.makedirs(output_dir, exist_ok=True)

    aug = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.CLAHE(p=0.5),
        A.RandomGamma(p=0.5),
        A.HueSaturationValue(p=0.5, hue_shift_limit=20, sat_shift_limit=20, val_shift_limit=20),
    ])

    # List every class directory once up front instead of once per row: the
    # listing does not change while the loop runs.
    class_images = {}
    for label in os.listdir(original_dir):
        label_dir = os.path.join(original_dir, label)
        if os.path.isdir(label_dir):
            class_images[label] = [
                f for f in os.listdir(label_dir) if f.endswith('.jpg')
            ]

    # Relative "<label>/<file>" paths of images that already exist as originals.
    original_images = {
        os.path.join(label, f)
        for label, files in class_images.items()
        for f in files
    }

    generated_count = 0
    skipped_count = 0

    for _, row in tqdm(df_resampled.iterrows(), total=len(df_resampled)):
        original_path = os.path.join(row['label'], row['image_id'])

        # Rows backed by a real original image need no synthetic counterpart.
        if original_path in original_images:
            continue

        candidates = class_images.get(row['label'])
        if not candidates:
            # Unknown label or a class folder without any .jpg to augment from.
            skipped_count += 1
            continue

        source_path = os.path.join(original_dir, row['label'], random.choice(candidates))
        original_img = cv2.imread(source_path)
        if original_img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # rather than raising; feeding that to the pipeline would crash.
            skipped_count += 1
            continue

        augmented = aug(image=original_img)['image']

        output_path = os.path.join(output_dir, row['label'], row['image_id'])
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # cv2.imwrite reports failure through its boolean return value.
        if cv2.imwrite(output_path, augmented):
            generated_count += 1
        else:
            skipped_count += 1

    print(f"Generated {generated_count} synthetic images in {output_dir}")
    if skipped_count:
        print(f"Skipped {skipped_count} rows (no source images, unreadable source, or failed write)")

  0%|          | 0/17420 [00:00<?, ?it/s]

100%|██████████| 17420/17420 [00:20<00:00, 854.84it/s] 

Generated 6125 synthetic images in Dataset/SMOT_images





In [None]:
def _copy_missing_files(src_dir, dst_dir):
    """Copy regular files from ``src_dir`` to ``dst_dir`` without overwriting existing ones."""
    for file_name in os.listdir(src_dir):
        src = os.path.join(src_dir, file_name)
        dst = os.path.join(dst_dir, file_name)
        # Only regular files are copied; shutil.copy2 cannot copy a directory.
        if os.path.isfile(src) and not os.path.exists(dst):
            shutil.copy2(src, dst)


def merge_datasets(original_dir, smote_dir, output_dir):
    """
    Merges original and generated ("SMOTE") images into a single dataset.

    For every label sub-directory of ``original_dir`` an equally named folder
    is created under ``output_dir``. Original files are copied first, then the
    files from ``smote_dir/<label>`` (if that folder exists). A file that is
    already present in the output folder is never overwritten, so originals
    win on a name clash.

    Entries of ``original_dir`` that are not directories (stray files such as
    ``.DS_Store``) are ignored, as are nested directories inside a label folder.

    Args:
        original_dir: Root of the original ``<label>/<file>`` images.
        smote_dir: Root of the generated ``<label>/<file>`` images.
        output_dir: Root directory of the merged dataset; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through all class subdirectories
    for label in tqdm(os.listdir(original_dir)):
        original_label_dir = os.path.join(original_dir, label)
        if not os.path.isdir(original_label_dir):
            # Not a class folder; listing it would raise NotADirectoryError.
            continue

        smote_label_dir = os.path.join(smote_dir, label)
        output_label_dir = os.path.join(output_dir, label)
        os.makedirs(output_label_dir, exist_ok=True)

        _copy_missing_files(original_label_dir, output_label_dir)

        if os.path.isdir(smote_label_dir):
            _copy_missing_files(smote_label_dir, output_label_dir)

    print(f"Merged dataset created at: {output_dir}")

In [None]:

def count_images_in_folders(data_dir):
    """Print the number of image files in each label sub-directory of ``data_dir``.

    Only direct sub-directories are treated as labels. A file counts as an
    image when its lower-cased name ends in ``.jpg``, ``.jpeg`` or ``.png``.
    One line is printed per label, followed by the overall total. Nothing is
    returned.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    folder_counts = {}

    for entry in os.listdir(data_dir):
        entry_path = os.path.join(data_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        folder_counts[entry] = sum(
            1
            for file_name in os.listdir(entry_path)
            if file_name.lower().endswith(image_suffixes)
        )

    print("=== Image Counts by Label (From Folders) ===")
    total = 0
    for label, count in folder_counts.items():
        print(f"{label}: {count} images")
        total += count
    print(f"TOTAL: {total} images")
    

# Build the train/validation split; this also writes both splits to the CSV
# paths in `config` (train_df / val_df are not used again in the visible cells).
train_df, val_df = load_and_preprocess_data()
# Re-read the training split that was just written to disk.
# NOTE(review): no resampling happens in the visible code, so despite its name
# this frame holds the plain training rows -- confirm this is intended, since
# generate_augmented_images() skips every row whose image already exists under
# the original images directory.
df_resampled = pd.read_csv(config['train_set_csv']) 
generate_augmented_images(
    df_resampled,
    original_dir=config['original_images_dir'],
    output_dir=config['generated_images_dir'],
)

=== Image Counts by Label (From Folders) ===
bacterial_leaf_blight: 1004 images
bacterial_leaf_streak: 1075 images
bacterial_panicle_blight: 1114 images
blast: 24 images
brown_spot: 584 images
dead_heart: 225 images
downy_mildew: 887 images
hispa: 126 images
tungro: 487 images
TOTAL: 5526 images


In [None]:
# Per-label image counts for the directory the generation step writes to.
data_path = config['generated_images_dir']  
count_images_in_folders(data_path)

In [None]:
# Per-label image counts for the original training images.
# NOTE(review): this literal is the same directory as config['original_images_dir']
# (minus the trailing slash); consider reading it from `config` instead.
data_path = "Dataset/train_images"  
count_images_in_folders(data_path)

=== Image Counts by Label (From Folders) ===
bacterial_leaf_blight: 479 images
bacterial_leaf_streak: 380 images
bacterial_panicle_blight: 337 images
blast: 1738 images
brown_spot: 965 images
dead_heart: 1442 images
downy_mildew: 620 images
hispa: 1594 images
normal: 1764 images
tungro: 1088 images
TOTAL: 10407 images


In [None]:



# Usage: combine the original images with the generated ones into one dataset.
# NOTE(review): smote_dir is hard-coded to "Dataset/SMOT_images", whereas the
# generation cell in this notebook writes to config['generated_images_dir']
# ("Dataset/generated_images/"). Confirm which directory should be merged;
# on a fresh run this cell depends on SMOT_images existing from earlier work.
merge_datasets(
    original_dir="Dataset/train_images",   
    smote_dir="Dataset/SMOT_images",        
    output_dir="Dataset/merged_SMOT_train" 
)

100%|██████████| 10/10 [00:03<00:00,  2.77it/s]

Merged dataset created at: Dataset/merged_SMOT_train





In [None]:
# Per-label image counts for the merged dataset (the output_dir of the merge_datasets call).
data_path = "Dataset/merged_SMOT_train"  
count_images_in_folders(data_path)

=== Image Counts by Label (From Folders) ===
bacterial_leaf_blight: 1483 images
bacterial_leaf_streak: 1455 images
bacterial_panicle_blight: 1451 images
blast: 1762 images
brown_spot: 1549 images
dead_heart: 1667 images
downy_mildew: 1507 images
hispa: 1720 images
normal: 1764 images
tungro: 1575 images
TOTAL: 15933 images
