In [1]:
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
import cv2
import matplotlib.pyplot as plt
import os
from os import listdir, path
import time    

%matplotlib inline

In [2]:
# Time string formatting
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m}:{round(s,1)}"

In [3]:
# Augmenting data function
def augment_data(file_dir, target_count, save_to_dir):
    os.makedirs(save_to_dir, exist_ok=True)  # Ensure save directory exists
    data_gen = ImageDataGenerator(
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest'
    )

    samples_to_generate = target_count  # Calculate the required samples to generate

    if samples_to_generate <= 0:
        print(f"No need to augment {file_dir} (already has samples).")
        return

    print(f"Augmenting {file_dir}: Generating samples...")

    # Generate the required number of samples
    i = 0
    for filename in listdir(file_dir):
        file_path = os.path.join(file_dir, filename)
        # Load the image
        image = cv2.imread(file_path)
        if image is None:
            print(f"Skipped file: {filename} (invalid image)")
            continue
        # Reshape the image
        image = image.reshape((1,) + image.shape)
        # Prefix for generated sample names
        save_prefix = 'aug_' + os.path.splitext(filename)[0]
        # Generate 'samples_to_generate' sample images
        for batch in data_gen.flow(
            x=image, 
            batch_size=1, 
            save_to_dir=save_to_dir, 
            save_prefix=save_prefix, 
            save_format='png'
        ):
            i += 1
            if i >= samples_to_generate:
                return  # Stop once the required samples are generated

In [4]:
# Paths to original and augmented data directories
base_dir = r'C:/_D/Uni/Minor Project/brain-tumor-classification/dataset'
cropped_dir = os.path.join(base_dir, 'cropped/train')
augmented_dir = os.path.join(base_dir, 'augmented/train')

# Class mappings for tumor types
class_counts = {
    '1': 'Glioma', 
    '2': 'Meningioma', 
    '3': 'Pituitary Tumor'
}

# 1. Find the maximum class size
max_existing_count = 0
for class_id in class_counts.keys():
    class_cropped_path = os.path.join(cropped_dir, class_id)
    class_count = len(listdir(class_cropped_path))
    if class_count > max_existing_count:
        max_existing_count = class_count

# 2. Generate 5 times the maximum existing count as the target count
target_count = max_existing_count * 5

# 3. Perform data augmentation for all classes
start_time = time.time()
for class_id, tumor_name in class_counts.items():
    class_cropped_path = os.path.join(cropped_dir, class_id)
    class_augmented_path = os.path.join(augmented_dir, class_id)
    augment_data(file_dir=class_cropped_path, target_count=target_count, save_to_dir=class_augmented_path)

end_time = time.time()
execution_time = end_time - start_time
print(f"Elapsed time: {hms_string(execution_time)}")

Augmenting C:/_D/Uni/Minor Project/brain-tumor-classification/dataset\cropped/train\1: Generating samples...
Augmenting C:/_D/Uni/Minor Project/brain-tumor-classification/dataset\cropped/train\2: Generating samples...
Augmenting C:/_D/Uni/Minor Project/brain-tumor-classification/dataset\cropped/train\3: Generating samples...
Elapsed time: 0:4:14.0


Let's see how many glioma, meningioma and pituitary tumor examples are there after performing data augmentation:

In [5]:
# Summary function to print dataset information
def data_summary(main_path):
    tumor_counts = {}
    for dir_num, tumor_name in class_counts.items():
        dir_path = f"{main_path}/{dir_num}/"
        tumor_counts[tumor_name] = len(listdir(dir_path))
    
    total_count = sum(tumor_counts.values())
    print(f"Number of examples: {total_count}")
    
    for tumor_name, count in tumor_counts.items():
        percentage = (count * 100.0) / total_count if total_count else 0
        print(f"Percentage of {tumor_name} examples: {percentage:.2f}%, number of {tumor_name} examples: {count}")

In [7]:
# Display data summary
data_summary(augmented_dir)

Number of examples: 13059
Percentage of Glioma examples: 33.38%, number of Glioma examples: 4359
Percentage of Meningioma examples: 33.30%, number of Meningioma examples: 4348
Percentage of Pituitary Tumor examples: 33.33%, number of Pituitary Tumor examples: 4352
