# Step 1: Create a Google Colab Account

**Completed.** *Account: thomas.sanger92@gmail.com*

# Step 2: Load Dataset and Summary

In [None]:
# Import necessary libraries (standard library first, then third-party),
# grouped ahead of any statement that has side effects
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive so the dataset files stored under MyDrive are reachable
drive.mount('/content/drive')

# Define paths to image and label data
images_path = '/content/drive/MyDrive/ColabNotebooks/images.npy'
labels_path = '/content/drive/MyDrive/ColabNotebooks/Labels.csv'

# Check if the data files exist. The True/False result for each file is still
# printed, but a missing file now stops the cell with a message naming the
# path instead of printing False and failing later inside the loader.
missing_paths = []
for data_path in (images_path, labels_path):
    path_found = os.path.exists(data_path)
    print(path_found)
    if not path_found:
        missing_paths.append(data_path)
if missing_paths:
    raise FileNotFoundError("Dataset file(s) not found: " + ", ".join(missing_paths))

# Load the image and label data
images = np.load(images_path)  # image data
labels = pd.read_csv(labels_path)  # label data

# Every image needs exactly one label row; the plots and the later
# train/test split pair the two by position, so catch a mismatch here.
if images.shape[0] != len(labels):
    raise ValueError(
        f"Found {images.shape[0]} images but {len(labels)} label rows; "
        "the two files must describe the same samples."
    )

# Explore the dataset: print column names, shapes, class distribution
print(labels.columns)  # names of the columns in the labels DataFrame
print(f"Images shape: {images.shape}")  # dimensions of the image data
print(f"Labels shape: {labels.shape}")  # dimensions of the label data
print(labels['Label'].value_counts(normalize=True))  # share of samples per species

# Visualize sample images: show the first image of each species
# (at most 12 species fit the fixed 4x3 grid)
unique_labels = labels['Label'].unique()  # distinct species names, in order of first appearance
plt.figure(figsize=(12, 12))
for i, label in enumerate(unique_labels[:12], 1):
    plt.subplot(4, 3, i)
    # Row label of the first sample of this species; read_csv assigns a
    # default 0..n-1 index, so it doubles as the position inside `images`.
    idx = labels[labels['Label'] == label].index[0]
    plt.imshow(images[idx])
    plt.title(label)
    plt.axis('off')
plt.tight_layout()
plt.show()

# Step 3: Perform EDA on the Images

In [None]:
# Count Plot: Visualize the distribution of plant species using a count plot
import seaborn as sns

# Bar chart of how many samples each species contributes
plt.figure(figsize=(12, 6))
count_ax = sns.countplot(data=labels, x='Label')
count_ax.set_title("Class Distribution")
# Tilt the species names so neighbouring tick labels do not overlap
plt.setp(count_ax.get_xticklabels(), rotation=45)
plt.show()

# Gallery with one example per species: the first row in `labels` that
# carries each species name (`unique_labels` comes from the Step 2 cell)
gallery = plt.figure(figsize=(12, 12))
for slot, species in enumerate(unique_labels[:12], start=1):
    first_row = labels.index[labels['Label'] == species][0]
    panel = gallery.add_subplot(4, 3, slot)  # 4 rows x 3 columns, slots numbered from 1
    panel.imshow(images[first_row])
    panel.set_title(species)
    panel.axis('off')
gallery.tight_layout()
plt.show()

# Step 4: Illustrate Insights from EDA

My first insight from the exploratory data analysis is that all of the
seedlings are planted against the same mulch/pebble background. Secondly, the
dataset appears relatively balanced, with each weed species having a similar
number of samples. Thirdly, upon visual inspection, the plant species show
distinct visual characteristics that make them easily identifiable. Lastly, no
apparent missing or corrupted images were found in the dataset.

# Step 5: Data Pre-Processing

In [None]:
# Import necessary libraries
from skimage.filters import gaussian
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Apply Gaussian blurring (sigma=1) to every image to suppress pixel noise.
# Each result is written straight into one pre-allocated float32 array:
#   * float32 is Keras' default float type, so storing float64 only doubles
#     the memory held by the dataset;
#   * filling in place avoids keeping a Python list of per-image arrays alive
#     next to the stacked copy that np.array() would build from it.
blurred_images = np.empty(images.shape, dtype=np.float32)
for position, image in enumerate(images):
    blurred_images[position] = gaussian(image, sigma=1, channel_axis=-1)

# Clip pixel values to the range [0, 1] (in place, no extra copy)
np.clip(blurred_images, 0, 1, out=blurred_images)

# Normalize images: no division by 255 is performed here.
# NOTE(review): this relies on skimage's gaussian() converting integer
# (e.g. uint8) input to floats in [0, 1] on its own. If images.npy holds
# floats in the 0-255 range instead, the clip above would saturate almost
# every pixel to 1 -- confirm the dtype of `images`.
normalized_images = blurred_images

# Visualize the effect of preprocessing: original next to preprocessed image
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.imshow(images[0])
plt.title("Original Image")
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(normalized_images[0])
plt.title("Preprocessed Image")
plt.axis('off')

# Adjust subplot spacing and display the plot
plt.tight_layout()
plt.show()

# Split data into training and testing sets (80% / 20%).
# 'stratify' keeps the class proportions the same in both sets and
# 'random_state' makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_images, labels['Label'], test_size=0.2, stratify=labels['Label'], random_state=42
)

# Step 6: Make Data Compatible

In [None]:
# Import libraries for label encoding and one-hot encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Create LabelEncoder and OneHotEncoder instances
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False)

# Encode labels using LabelEncoder and OneHotEncoder
# Fit_transform on training data, transform on testing data
y_train_encoded = onehot_encoder.fit_transform(label_encoder.fit_transform(y_train).reshape(-1, 1))
y_test_encoded = onehot_encoder.transform(label_encoder.transform(y_test).reshape(-1, 1))

# Reshape image data to be compatible with CNN input shape (samples, height, width, channels)
X_train = X_train.reshape(X_train.shape[0], images.shape[1], images.shape[2], images.shape[3])
X_test = X_test.reshape(X_test.shape[0], images.shape[1], images.shape[2], images.shape[3])

# Print shapes to verify data compatibility
print(f"X_train shape: {X_train.shape}")
print(f"y_train_encoded shape: {y_train_encoded.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test_encoded shape: {y_test_encoded.shape}")

# Step 7: Model Building

In [None]:
# Import necessary layers for building the CNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input

# Define the input for the model: one image of shape (height, width, channels),
# taken from the loaded dataset
input_tensor = Input(shape=(images.shape[1], images.shape[2], images.shape[3]))

# One output unit per class. The count is read from the width of the one-hot
# training targets (built in Step 6) rather than hard-coded, so the softmax
# layer always matches the labels the model is trained against.
num_classes = y_train_encoded.shape[1]

# Create the Sequential model: two conv/pool/dropout stages, then a dense head
model = Sequential([
    input_tensor,
    Conv2D(32, (3, 3), activation='relu'),   # 32 filters, 3x3 kernels
    MaxPooling2D((2, 2)),                    # 2x2 max pooling
    Dropout(0.25),                           # dropout for regularization
    Conv2D(64, (3, 3), activation='relu'),   # 64 filters, 3x3 kernels
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Flatten(),                               # flatten feature maps for the dense layers
    Dense(128, activation='relu'),
    Dropout(0.5),                            # heavier dropout before the output layer
    Dense(num_classes, activation='softmax'),  # class probabilities
])

# Compile the model; categorical cross-entropy pairs with the one-hot targets
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Step 8: Model Training

In [None]:
# Train the model.
#   epochs=20     -> the model sees the entire training set 20 times
#   batch_size=32 -> weights are updated after every 32 samples
# NOTE(review): the split passed as validation_data is the same X_test that
# Step 9 evaluates on, so the curves below are not an independent check.
training_options = dict(
    validation_data=(X_test, y_test_encoded),
    epochs=20,
    batch_size=32,
)
history = model.fit(X_train, y_train_encoded, **training_options)

# Plot training vs. validation curves over epochs, one figure per metric
for metric, axis_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(12, 6))
    plt.plot(history.history[metric], label=f'Training {axis_name}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {axis_name}')
    plt.legend()
    plt.title(f"Model {axis_name}")
    plt.xlabel("Epochs")
    plt.ylabel(axis_name)
    plt.show()

# Step 9: Model Performance Evaluation

In [None]:
# Import necessary libraries for evaluation
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Species names in the order of the integer ids assigned by the label encoder
class_names = label_encoder.classes_

# Predict class probabilities for the test images, then reduce both the
# predictions and the one-hot targets to integer class ids
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test_encoded.argmax(axis=1)

# Per-class precision, recall, F1-score and support.
# zero_division=0 reports 0 for a class that has no predicted samples.
report = classification_report(
    y_test_classes,
    y_pred_classes,
    target_names=class_names,
    zero_division=0,
)
print(report)

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_test_classes, y_pred_classes)
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,                # write the count inside every cell
    fmt='d',                   # format the counts as integers
    cmap='Blues',              # color map for the heatmap
    xticklabels=class_names,   # class names on the predicted axis
    yticklabels=class_names,   # class names on the true axis
)
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("True")
plt.show()

# Step 10: Conclusions and Key Takeaways

The model was able to successfully classify plant seedlings into 12 distinct
species with above average accuracy. Both the training and validation
performance demonstrated that the model effectively learned the visual
features associated with each plant category. The confusion matrix revealed
that most species were correctly classified, though some species with visually
similar characteristics experienced higher misclassification rates. This
suggests the model could benefit from additional data or more advanced
techniques to better differentiate between visually similar species.

To further improve the model, strategies such as data augmentation, which
enhances the diversity of training samples, could be implemented. Transfer
learning using pre-trained models might also improve classification
performance, particularly for more challenging species. Additionally,
fine-tuning hyperparameters or increasing the dataset size could enhance the
model's generalization. Overall, this project highlights the potential of
deep learning in agriculture to automate tasks such as plant identification,
paving the way for more efficient and sustainable farming practices.