<a href="https://colab.research.google.com/github/sumaiya008/CCNY-DSE-Capstone-Project-Segmenting-Coral-Branch-tips/blob/main/notebooks/1_2_Data_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import cv2
import numpy as np
import random
import gdown
import pickle
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Define the path to your image folders
image_folder_path = "/content/drive/MyDrive/Capstone/Coral_images/"

# Initialize lists to store images and labels
images = []
labels = []

# Iterate through each folder in the main directory
for folder_name in os.listdir(image_folder_path):
    folder_path = os.path.join(image_folder_path, folder_name)

    # Check if the item in the main directory is a folder
    if os.path.isdir(folder_path):
        # Iterate through each image file in the subfolder
        for image_filename in os.listdir(folder_path):
            image_path = os.path.join(folder_path, image_filename)

            # Load the image using OpenCV
            image = cv2.imread(image_path)

            # Check if the image was loaded successfully
            if image is not None:
                # Resize the image to 1080x1080 if needed
                if image.shape[0] >= 1080 and image.shape[1] >= 1080:
                    image = cv2.resize(image, (1080, 1080))
                else:
                    print(f"Image dimensions are too small for resizing: {image_path}")

                # Append the image to the list of images
                images.append(image)

                # Append the label (folder name) to the list of labels
                labels.append(folder_name)
            else:
                # Handle the case where the image couldn't be loaded
                print(f"Error loading image: {image_path}")

Error loading image: /content/drive/MyDrive/Capstone/Coral_images/Pseudodiploria/.DS_Store


In [None]:
# Convert the lists to NumPy arrays for further analysis
images = np.array(images)
labels = np.array(labels)

# Now, check the shapes
print(f"Images shape: {images.shape}")
print(f"Labels shape: {labels.shape}")

Images shape: (907, 1080, 1080, 3)
Labels shape: (907,)


In [None]:
# Define the data augmentation parameters
datagen = ImageDataGenerator(
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    brightness_range=[0.5, 1.5],
    fill_mode='nearest'
)


In [None]:
# Divide the data into 20 batches
batch_size = len(images) // 20

for batch_number in range(20):
    # Define the start and end indices for the current batch
    start_idx = batch_number * batch_size
    end_idx = (batch_number + 1) * batch_size if batch_number < 19 else len(images)

    # Extract the batch of images and labels
    batch_images = images[start_idx:end_idx]
    batch_labels = labels[start_idx:end_idx]

    # Apply data augmentation to the batch
    augmented_images = []
    augmented_labels = []

    for i in range(len(batch_images)):
        img = batch_images[i]
        label = batch_labels[i]

        img = np.expand_dims(img, axis=0)

        for j in range(2):
            x_augmented = datagen.flow(img, batch_size=1).next()
            augmented_images.append(x_augmented[0])
            augmented_labels.append(label)

    # Convert the augmented data to NumPy arrays
    augmented_images = np.array(augmented_images)
    augmented_labels = np.array(augmented_labels)

    # Save the augmented data to a compressed NumPy file
    save_path = f'/content/drive/MyDrive/Capstone/Coral_images/data_augmented_{batch_number}.npz'
    np.savez_compressed(save_path, images=augmented_images, labels=augmented_labels)

    print(f'Saved batch {batch_number} to {save_path}')

Saved batch 0 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_0.npz
Saved batch 1 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_1.npz
Saved batch 2 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_2.npz
Saved batch 3 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_3.npz
Saved batch 4 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_4.npz
Saved batch 5 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_5.npz
Saved batch 6 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_6.npz
Saved batch 7 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_7.npz
Saved batch 8 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_8.npz
Saved batch 9 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_9.npz
Saved batch 10 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_10.npz
Saved batch 11 to /content/drive/MyDrive/Capstone/Coral_images/data_augmented_11.npz


In [1]:
import numpy as np

# Initialize a dictionary to store the number of images in each batch
image_counts = {}

# Define the path where the files are stored
base_path = '/content/drive/MyDrive/Capstone/Coral_images/'

for batch_number in range(20):
    # Construct the file path
    file_path = f'{base_path}data_augmented_{batch_number}.npz'

    # Load the NumPy file
    data = np.load(file_path)

    # Get the images from the loaded data
    images = data['images']

    # Get the number of images in the batch
    num_images = len(images)

    # Store the count in the dictionary
    image_counts[batch_number] = num_images

    # Close the NumPy file
    data.close()

# Print the number of images in each batch
for batch_number, count in image_counts.items():
    print(f'Batch {batch_number}: {count} images')


Batch 0: 90 images
Batch 1: 90 images
Batch 2: 90 images
Batch 3: 90 images
Batch 4: 90 images
Batch 5: 90 images
Batch 6: 90 images
Batch 7: 90 images
Batch 8: 90 images
Batch 9: 90 images
Batch 10: 90 images
Batch 11: 90 images
Batch 12: 90 images
Batch 13: 90 images
Batch 14: 90 images
Batch 15: 90 images
Batch 16: 90 images
Batch 17: 90 images
Batch 18: 90 images
Batch 19: 104 images


In [1]:
import numpy as np
import h5py

# Initialize an HDF5 file
hdf5_path = '/content/drive/MyDrive/Capstone/Coral_images/all_data_augmented.h5'
hdf5_file = h5py.File(hdf5_path, 'w')

# Define the path where the files are stored
base_path = '/content/drive/MyDrive/Capstone/Coral_images/'

for batch_number in range(20):
    # Construct the file path
    file_path = f'{base_path}data_augmented_{batch_number}.npz'

    # Load the NumPy file
    data = np.load(file_path)

    # Get the images and labels from the loaded data
    batch_images = data['images']
    batch_labels = data['labels']

    # Create datasets in the HDF5 file for this batch
    hdf5_file.create_dataset(f'images_{batch_number}', data=batch_images)
    hdf5_file.create_dataset(f'labels_{batch_number}', data=batch_labels)

    # Close the NumPy file
    data.close()

# Close the HDF5 file
hdf5_file.close()

# Print a confirmation message
print(f'Saved all data to {hdf5_path}')


TypeError: ignored

In [7]:
import numpy as np

# Paths to the two files to be combined
file_path_1 = '/content/drive/MyDrive/Capstone/Coral_images/data_augmented_1.npz'
file_path_2 = '/content/drive/MyDrive/Capstone/Coral_images/data_augmented_0.npz'

# Load the data from the first file and check available keys
data_1 = np.load(file_path_1)
print("Keys in file 1:", list(data_1.keys()))

# Load the data from the second file and check available keys
data_2 = np.load(file_path_2)
print("Keys in file 2:", list(data_2.keys()))

# Assuming the correct keys for images and labels are different, update as needed
images_1 = data_1['your_correct_images_key']
labels_1 = data_1['your_correct_labels_key']
images_2 = data_2['your_correct_images_key']
labels_2 = data_2['your_correct_labels_key']

# Combine the images and labels from both files
combined_images = np.concatenate([images_1, images_2], axis=0)
combined_labels = np.concatenate([labels_1, labels_2], axis=0)

# Define the path for the combined data
combined_file_path = '/content/drive/MyDrive/Capstone/Coral_images/data_augmented_com.npz'

# Save the combined data to a new .npz file
np.savez_compressed(combined_file_path, images=combined_images, labels=combined_labels)

# Print a confirmation message
print(f'Saved combined data to {combined_file_path}')


Keys in file 1: ['images', 'labels']
Keys in file 2: ['images', 'labels']


KeyError: ignored