In [1]:
import numpy as np
import tensorflow as tf # Used only for loading CIFAR-10 dataset
from tensorflow.keras.datasets import cifar10

# Load the CIFAR-10 dataset
# X_train, X_test will be images (num_samples, height, width, channels)
# y_train, y_test will be labels (num_samples,)
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

print("--- CIFAR-10 Data Initial Shapes ---")
print(f"X_train shape: {X_train.shape}") # (50000, 32, 32, 3) for training images
print(f"y_train shape: {y_train.shape}") # (50000, 1) for training labels
print(f"X_test shape: {X_test.shape}")   # (10000, 32, 32, 3) for test images
print(f"y_test shape: {y_test.shape}")   # (10000, 1) for test labels

# --- Preprocessing ---

# 1. Normalize pixel values
# Convert integers to float and scale to [0, 1]
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# 2. Reshape labels for consistency (optional, but good practice for 1D labels)
# y_train and y_test are already (num_samples, 1), so reshape to (1, num_samples) for our previous logic
y_train = y_train.reshape(1, -1)
y_test = y_test.reshape(1, -1)

# 3. Get image dimensions and number of classes
input_height, input_width, input_channels = X_train.shape[1:]
num_classes = 10 # CIFAR-10 has 10 classes (0-9)

print("\n--- CIFAR-10 Data After Preprocessing ---")
print(f"X_train shape after normalization: {X_train.shape}")
print(f"y_train shape after reshape: {y_train.shape}")
print(f"Input image dimensions: {input_height}x{input_width}x{input_channels}")
print(f"Number of classes: {num_classes}")

# We will need these global variables for subsequent layers' initialization
# For the CNN, the input 'X' will not be flattened initially.
# It will be passed as (num_samples, height, width, channels) or (num_samples, channels, height, width)
# We'll stick to (num_samples, height, width, channels) for now and handle channel-first later if needed.


2025-08-19 18:13:14.245081: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-19 18:13:14.573213: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-19 18:13:16.563626: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 2us/step
--- CIFAR-10 Data Initial Shapes ---
X_train shape: (50000, 32, 32, 3)
y_train shape: (50000, 1)
X_test shape: (10000, 32, 32, 3)
y_test shape: (10000, 1)

--- CIFAR-10 Data After Preprocessing ---
X_train shape after normalization: (50000, 32, 32, 3)
y_train shape after reshape: (1, 50000)
Input image dimensions: 32x32x3
Number of classes: 10


In [None]:


# --- Helper Functions for Convolutional Layer ---

def zeropad(X, pad):
    """
    Pads the image X with zeros.
    
    Arguments:
    X -- numpy array of shape (m, H, W, C) representing a batch of images
    pad -- integer, amount of padding around the borders of an image
    
    Returns:
    X_pad -- padded image of shape (m, H + 2*pad, W + 2*pad, C)
    """
    X_pad = np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant', constant_values=0)
    return X_pad

def initialize_conv_params(filter_size, num_filters, input_channels):
    """
    Initializes filters (weights) and biases for a convolutional layer.
    
    Arguments:
    filter_size -- integer, side length of the square filter (e.g., 3 for 3x3)
    num_filters -- integer, number of filters for this layer
    input_channels -- integer, number of channels in the input image/feature map
    
    Returns:
    filters -- numpy array of shape (filter_size, filter_size, input_channels, num_filters)
               Initialized with small random numbers.
    biases -- numpy array of shape (1, 1, 1, num_filters)
              Initialized with zeros.
    """
    # Filters are typically initialized with small random values to break symmetry
    # and prevent all neurons from learning the same features.
    filters = np.random.randn(filter_size, filter_size, input_channels, num_filters) * 0.01
    biases = np.zeros((1, 1, 1, num_filters)) # Bias per filter
    return filters, biases

# --- Convolutional Layer Forward Pass ---

def conv2d_forward(X, filters, biases, stride, padding):
    """
    Implements the forward propagation for a convolutional layer.
    
    Arguments:
    X -- input data of shape (m, H_prev, W_prev, C_prev)
    filters -- weights of shape (f, f, C_prev, n_C)
    biases -- biases of shape (1, 1, 1, n_C)
    stride -- integer, specifies the stride length
    padding -- string, "same" or "valid"
    
    Returns:
    Z -- output of the conv layer, numpy array of shape (m, H, W, n_C)
    cache -- tuple of values needed for the backward pass: (X, filters, biases, stride, padding)
    """
    m, H_prev, W_prev, C_prev = X.shape
    f, f, C_prev, n_C = filters.shape

    # Calculate output dimensions
    if padding == "same":
        # Calculate padding amount to maintain same output size for stride 1
        pad_h = ((H_prev - 1) * stride + f - H_prev) // 2
        pad_w = ((W_prev - 1) * stride + f - W_prev) // 2
        X_padded = zeropad(X, pad_h)
        H = int((H_prev + 2 * pad_h - f) / stride) + 1
        W = int((W_prev + 2 * pad_w - f) / stride) + 1
    elif padding == "valid":
        pad_h, pad_w = 0, 0 # No padding
        X_padded = X
        H = int((H_prev - f) / stride) + 1
        W = int((W_prev - f) / stride) + 1
    else:
        raise ValueError("Padding must be 'same' or 'valid'")

    # Initialize output volume Z
    Z = np.zeros((m, H, W, n_C))

    # Loop over each example in the batch
    for i in range(m):                   # loop over the training examples
        x_img = X_padded[i]              # Select ith image from the padded input
        # Loop over vertical axis of output volume
        for h in range(H):               # loop over vertical axis of the output volume
            # Loop over horizontal axis of output volume
            for w in range(W):           # loop over horizontal axis of the output volume
                # Loop over filters
                for c in range(n_C):     # loop over the channels of the output volume
                    
                    # Find the corners of the current "slice" (patch)
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_end + f
                    
                    # Extract the slice from the image for the current filter
                    x_slice = x_img[vert_start:vert_end, horiz_start:horiz_end, :]
                    
                    # Convolve the slice with the filter and add bias
                    # Element-wise product (*) followed by sum for all elements in the slice.
                    Z[i, h, w, c] = np.sum(x_slice * filters[:, :, :, c]) + biases[0, 0, 0, c]
                                        
    cache = (X, filters, biases, stride, padding, X_padded) # Store X_padded for backprop
    
    return Z, cache

# --- Test the Convolutional Layer Forward Pass ---

# Define hyperparameters for our first conv layer
conv1_filter_size = 3
conv1_num_filters = 16
conv1_stride = 1
conv1_padding = "same"

# Initialize parameters for the first conv layer
# input_channels for the first layer is 3 (RGB)
conv1_filters, conv1_biases = initialize_conv_params(
    conv1_filter_size, conv1_num_filters, input_channels
)

print(f"\nShape of Conv1 Filters: {conv1_filters.shape}") # (f, f, C_prev, n_C) -> (3, 3, 3, 16)
print(f"Shape of Conv1 Biases: {conv1_biases.shape}")     # (1, 1, 1, n_C) -> (1, 1, 1, 16)

# Take a small batch of data to test (e.g., first 5 images)
X_batch = X_train[:5]
print(f"Input batch shape for testing: {X_batch.shape}") # (5, 32, 32, 3)

# Perform forward pass
Z_conv1, conv1_cache = conv2d_forward(
    X_batch, conv1_filters, conv1_biases, conv1_stride, conv1_padding
)

print(f"Output shape of Conv1 layer (Z_conv1): {Z_conv1.shape}")
# Expected output: (m, H, W, n_C) -> (5, 32, 32, 16) with "same" padding and stride 1.


In [None]:



# --- Pooling Layer Forward Pass ---

def max_pool_forward(A_prev, pool_size, stride):
    """
    Implements the forward pass of the max pooling layer.
    
    Arguments:
    A_prev -- input to the pooling layer, numpy array of shape (m, H_prev, W_prev, C_prev)
    pool_size -- integer, side length of the square pooling window (e.g., 2 for 2x2)
    stride -- integer, specifies the stride length
    
    Returns:
    A -- output of the pool layer, numpy array of shape (m, H, W, C_prev)
    cache -- tuple of values needed for the backward pass: (A_prev, pool_size, stride)
    """
    m, H_prev, W_prev, C_prev = A_prev.shape
    f = pool_size # Filter size for pooling
    
    # Calculate output dimensions
    H = int((H_prev - f) / stride) + 1
    W = int((W_prev - f) / stride) + 1
    
    # Initialize output volume A
    A = np.zeros((m, H, W, C_prev))
    
    # Loop over each example in the batch
    for i in range(m):              # loop over the training examples
        a_prev_slice = A_prev[i]    # Select ith image from the input
        # Loop over vertical axis of output volume
        for h in range(H):          # loop over vertical axis of the output volume
            # Loop over horizontal axis of output volume
            for w in range(W):      # loop over horizontal axis of the output volume
                # Loop over channels (pooling is applied independently to each channel)
                for c in range(C_prev): # loop over the channels of the output volume
                    
                    # Find the corners of the current "slice" (patch)
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f
                    
                    # Extract the slice from the input for the current channel
                    a_slice = a_prev_slice[vert_start:vert_end, horiz_start:horiz_end, c]
                    
                    # Compute the max value in the slice and store in output
                    A[i, h, w, c] = np.max(a_slice)
                                        
    cache = (A_prev, pool_size, stride)
    
    return A, cache

# --- Test the Pooling Layer Forward Pass ---

# Assume Z_conv1 from the previous test is the input to this pooling layer
# Use a small batch for testing as done for conv2d_forward
X_batch = X_train[:5]

# First, run the conv layer to get input for pooling
# You'll need to define conv1_filter_size, conv1_num_filters, conv1_stride, conv1_padding, input_channels
# from the previous section if you're running this part in isolation.
conv1_filter_size = 3
conv1_num_filters = 16
conv1_stride = 1
conv1_padding = "same"
conv1_filters, conv1_biases = initialize_conv_params(conv1_filter_size, conv1_num_filters, input_channels)
Z_conv1, conv1_cache = conv2d_forward(X_batch, conv1_filters, conv1_biases, conv1_stride, conv1_padding)

# Now, apply ReLU activation to the convolutional output
A_conv1 = np.maximum(0, Z_conv1) # ReLU activation for convolutional layer output

print(f"\nInput shape to Pooling Layer (A_conv1): {A_conv1.shape}") # (5, 32, 32, 16)

# Define hyperparameters for our first pooling layer
pool1_size = 2
pool1_stride = 2

# Perform forward pass for pooling layer
A_pool1, pool1_cache = max_pool_forward(A_conv1, pool1_size, pool1_stride)

print(f"Output shape of Pool1 layer (A_pool1): {A_pool1.shape}")
# Expected output with 32x32 input, 2x2 pool, stride 2: (5, 16, 16, 16)


In [None]:


# --- Activation Functions (from previous project) ---
def ReLU(Z):
    """
    Implements the Rectified Linear Unit (ReLU) activation function.
    
    Arguments:
    Z -- The output of the linear layer, a numpy array of any shape.
    
    Returns:
    A -- The output of ReLU(Z), same shape as Z.
    """
    return np.maximum(0, Z)

def softmax(Z):
    """
    Implements the Softmax activation function.
    
    Arguments:
    Z -- The output of the linear layer, a numpy array.
    
    Returns:
    A -- The output of softmax(Z), a probability distribution over classes.
    """
    exp_Z = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    A = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
    return A

# --- Flatten Layer ---
def flatten(A):
    """
    Flattens the input array into a 2D array (m, num_features_flattened).
    
    Arguments:
    A -- input array of shape (m, H, W, C)
    
    Returns:
    A_flattened -- flattened array of shape (m, H*W*C)
    cache -- the original shape of A for backward pass
    """
    m = A.shape[0]
    A_flattened = A.reshape(m, -1) # -1 infers the remaining dimension
    cache = A.shape
    return A_flattened, cache

# --- Dense Layer Initialization ---
def initialize_dense_params(input_dim, output_dim):
    """
    Initializes weights and biases for a dense (fully connected) layer.
    
    Arguments:
    input_dim -- dimension of the input to this layer
    output_dim -- dimension of the output of this layer
    
    Returns:
    W -- numpy array of shape (output_dim, input_dim)
    b -- numpy array of shape (output_dim, 1)
    """
    W = np.random.randn(output_dim, input_dim) * 0.01
    b = np.zeros((output_dim, 1))
    return W, b

# --- Dense Layer Forward Pass ---
def dense_forward(A_prev, W, b):
    """
    Implements the forward propagation for a dense layer.
    
    Arguments:
    A_prev -- activations from previous layer (or input data), shape (input_dim, m)
    W -- weights matrix, shape (output_dim, input_dim)
    b -- bias vector, shape (output_dim, 1)
    
    Returns:
    Z -- the linear output, shape (output_dim, m)
    A -- the activated output, shape (output_dim, m)
    cache -- tuple of values needed for the backward pass: (A_prev, W, b, Z)
    """
    # Adjust A_prev shape if it comes from flatten layer (m, num_features_flattened)
    # to (num_features_flattened, m) for matrix multiplication
    if A_prev.ndim == 2 and A_prev.shape[0] != W.shape[1]:
        A_prev = A_prev.T # Transpose if it's (m, input_dim)

    Z = W.dot(A_prev) + b
    # A will be calculated by calling activation function separately
    
    cache = (A_prev, W, b, Z)
    return Z, cache

# --- Full CNN Forward Propagation ---

def cnn_forward_prop(X, params):
    """
    Implements the forward propagation for the entire CNN.
    
    Arguments:
    X -- input data of shape (m, H, W, C)
    params -- dictionary containing parameters for all layers
    
    Returns:
    A_L -- the final output probabilities of the network
    caches -- list of caches for each layer (conv, pool, flatten, dense)
    """
    caches = []
    
    # Layer 1: Conv1 -> ReLU1 -> Pool1
    conv1_filters = params['Wc1']
    conv1_biases = params['bc1']
    A_conv1, conv1_cache = conv2d_forward(X, conv1_filters, conv1_biases, stride=1, padding="same")
    A_relu1 = ReLU(A_conv1)
    A_pool1, pool1_cache = max_pool_forward(A_relu1, pool_size=2, stride=2)
    caches.append((conv1_cache, pool1_cache)) # Store caches for backward pass
    
    # Layer 2: Conv2 -> ReLU2 -> Pool2
    conv2_filters = params['Wc2']
    conv2_biases = params['bc2']
    A_conv2, conv2_cache = conv2d_forward(A_pool1, conv2_filters, conv2_biases, stride=1, padding="same")
    A_relu2 = ReLU(A_conv2)
    A_pool2, pool2_cache = max_pool_forward(A_relu2, pool_size=2, stride=2)
    caches.append((conv2_cache, pool2_cache))
    
    # Flatten Layer
    A_flat, flatten_cache = flatten(A_pool2)
    caches.append(flatten_cache)
    
    # Dense Layer 1: FC1 -> ReLU3
    W_fc1 = params['Wd1']
    b_fc1 = params['bd1']
    Z_fc1, fc1_cache = dense_forward(A_flat, W_fc1, b_fc1)
    A_relu3 = ReLU(Z_fc1)
    caches.append(fc1_cache) # Store cache (A_flat, W_fc1, b_fc1, Z_fc1)
    
    # Dense Layer 2: FC2 -> Softmax (Output Layer)
    W_fc2 = params['Wd2']
    b_fc2 = params['bd2']
    Z_fc2, fc2_cache = dense_forward(A_relu3, W_fc2, b_fc2)
    A_softmax = softmax(Z_fc2)
    caches.append(fc2_cache) # Store cache (A_relu3, W_fc2, b_fc2, Z_fc2)
    
    return A_softmax, caches

# --- Initialize All Parameters for the CNN Model ---
def initialize_cnn_params():
    """
    Initializes all parameters for the entire CNN model.
    Architecture:
    Conv1 (3x3, 16 filters, stride 1, same padding) -> ReLU -> MaxPool (2x2, stride 2)
    Conv2 (3x3, 32 filters, stride 1, same padding) -> ReLU -> MaxPool (2x2, stride 2)
    Flatten
    Dense1 (128 units) -> ReLU
    Dense2 (10 units) -> Softmax (Output)
    
    Returns:
    params -- dictionary containing all initialized W and b for conv and dense layers.
    """
    params = {}

    # Conv1 Layer
    # Input: (H, W, 3) -> (32, 32, 3)
    # Output: (32, 32, 16) after conv and relu, (16, 16, 16) after maxpool
    params['Wc1'], params['bc1'] = initialize_conv_params(3, 16, input_channels) # 3x3 filter, 16 filters, 3 input channels

    # Conv2 Layer
    # Input: (16, 16, 16) from Pool1 output
    # Output: (16, 16, 32) after conv and relu, (8, 8, 32) after maxpool
    params['Wc2'], params['bc2'] = initialize_conv_params(3, 32, 16) # 3x3 filter, 32 filters, 16 input channels from previous layer

    # After Conv2 and Pool2: (8, 8, 32)
    # Flattened size: 8 * 8 * 32 = 2048
    flattened_dim = 8 * 8 * 32 

    # Dense Layer 1 (FC1)
    # Input: 2048 (from flattened)
    # Output: 128 units
    params['Wd1'], params['bd1'] = initialize_dense_params(flattened_dim, 128)

    # Dense Layer 2 (FC2 - Output Layer)
    # Input: 128
    # Output: 10 (num_classes)
    params['Wd2'], params['bd2'] = initialize_dense_params(128, num_classes)

    return params

# --- Testing the Full CNN Forward Pass ---

# Initialize all parameters for the network
cnn_params = initialize_cnn_params()

print("\n--- Testing Full CNN Forward Pass ---")
# Take a small batch of data to test (e.g., first 2 images)
X_batch_test = X_train[:2]
print(f"Input batch shape for full CNN test: {X_batch_test.shape}") # (2, 32, 32, 3)

# Perform forward pass through the entire CNN
final_output_probs, all_caches = cnn_forward_prop(X_batch_test, cnn_params)

print(f"Final output probabilities shape: {final_output_probs.shape}") # Expected: (num_classes, m) -> (10, 2)
print(f"Sample output probabilities for first image:\n {final_output_probs[:, 0]}")
print(f"Sum of probabilities for first image: {np.sum(final_output_probs[:, 0]):.4f}")

In [None]:

# --- BACKWARD PASS FUNCTIONS ---

def one_hot(Y, num_classes):
    """
    Converts a vector of labels into a one-hot encoded matrix.
    
    Arguments:
    Y -- The label vector of shape (1, m).
    num_classes -- The total number of unique classes.
    
    Returns:
    one_hot_Y -- A one-hot encoded matrix of shape (num_classes, m).
    """
    m = Y.shape[1]
    one_hot_Y = np.zeros((num_classes, m))
    one_hot_Y[Y.flatten(), np.arange(m)] = 1
    return one_hot_Y

def relu_backward(dA, Z):
    """
    Implements the backward propagation for a ReLU unit.
    
    Arguments:
    dA -- post-activation gradient, of any shape
    Z -- input of the activation function
    
    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    dZ = np.array(dA, copy=True) # make a copy to avoid modifying original dA
    dZ[Z <= 0] = 0 # When Z <= 0, we set dZ to 0 as derivative is 0
    return dZ

def dense_backward(dZ, cache):
    """
    Implements the backward propagation for a dense layer.
    
    Arguments:
    dZ -- Gradient of the cost with respect to the linear output of the current layer (Z)
    cache -- tuple of values from forward pass: (A_prev, W, b, Z_linear)
    
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (output) of the previous layer
    dW -- Gradient of the cost with respect to W
    db -- Gradient of the cost with respect to b
    """
    A_prev, W, b, Z_linear = cache
    m = A_prev.shape[1] if A_prev.ndim == 2 else A_prev.shape[0] # Handle (input_dim, m) or (m, input_dim)

    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    
    # If A_prev was transposed in forward, dA_prev needs to be transposed back
    if cache[0].ndim == 2 and cache[0].shape[0] == m and A_prev.shape[0] != W.shape[1]: # This check needs to be more robust
         dA_prev = dA_prev.T

    return dA_prev, dW, db

def flatten_backward(dA, cache):
    """
    Implements the backward propagation for the flatten layer.
    
    Arguments:
    dA -- Gradient from the next layer (dense), shape (m, H*W*C)
    cache -- The original shape of the input to flatten layer (m, H, W, C)
    
    Returns:
    dA_prev -- Gradient reshaped back to original 3D/4D shape
    """
    original_shape = cache
    return dA.reshape(original_shape)

def create_max_pool_mask(A_prev_slice):
    """
    Creates a mask with 1s at the maximum elements and 0s elsewhere.
    
    Arguments:
    A_prev_slice -- slice of input to max pool layer
    
    Returns:
    mask -- mask matrix
    """
    mask = (A_prev_slice == np.max(A_prev_slice))
    return mask

def max_pool_backward(dA, cache):
    """
    Implements the backward pass for the max pooling layer.
    
    Arguments:
    dA -- gradient of output of max pool layer, shape (m, H, W, C)
    cache -- tuple of values from forward pass: (A_prev, pool_size, stride)
    
    Returns:
    dA_prev -- gradient of input to max pool layer, shape (m, H_prev, W_prev, C_prev)
    """
    A_prev, pool_size, stride = cache
    m, H_prev, W_prev, C_prev = A_prev.shape
    f = pool_size

    H, W = dA.shape[1], dA.shape[2] # Output dimensions of pooling layer

    dA_prev = np.zeros(A_prev.shape) # Initialize gradient for previous layer

    for i in range(m):
        a_prev_slice = A_prev[i]
        for h in range(H):
            for w in range(W):
                for c in range(C_prev):
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f

                    # Extract the slice from A_prev_slice
                    slice_A_prev = a_prev_slice[vert_start:vert_end, horiz_start:horiz_end, c]
                    
                    # Create mask for the max element
                    mask = create_max_pool_mask(slice_A_prev)
                    
                    # Distribute the gradient dA[i, h, w, c] to the max element in dA_prev
                    dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c] += mask * dA[i, h, w, c]
                    
    return dA_prev

def conv2d_backward(dZ, cache):
    """
    Implements the backward pass for a convolutional layer.
    
    Arguments:
    dZ -- gradient of the output of the conv layer, shape (m, H, W, n_C)
    cache -- tuple of values from forward pass: (X, filters, biases, stride, padding, X_padded)
    
    Returns:
    dA_prev -- gradient of the activation from the previous layer, shape (m, H_prev, W_prev, C_prev)
    dW -- gradient of the filters, shape (f, f, C_prev, n_C)
    db -- gradient of the biases, shape (1, 1, 1, n_C)
    """
    X, filters, biases, stride, padding, X_padded = cache
    m, H_prev, W_prev, C_prev = X.shape
    f, f, C_prev, n_C = filters.shape
    m_dz, H_dz, W_dz, n_C_dz = dZ.shape

    # Initialize gradients
    dA_prev = np.zeros(X.shape)
    dW = np.zeros(filters.shape)
    db = np.zeros(biases.shape)

    # Calculate padding for dA_prev (same as forward pad for same padding)
    if padding == "same":
        pad_h = ((H_prev - 1) * stride + f - H_prev) // 2
        pad_w = ((W_prev - 1) * stride + f - W_prev) // 2
        dA_prev_padded = zeropad(dA_prev, pad_h)
    elif padding == "valid":
        pad_h, pad_w = 0, 0
        dA_prev_padded = dA_prev # No padding means dA_prev_padded is dA_prev
    else:
        raise ValueError("Padding must be 'same' or 'valid'")

    # Loop over each example
    for i in range(m):
        x_img = X_padded[i] # Padded original input
        dz_img = dZ[i]      # Gradient for the current example
        da_prev_padded_img = dA_prev_padded[i] # Padded gradient for previous layer

        for h in range(H_dz):
            for w in range(W_dz):
                for c in range(n_C): # Loop over output channels
                    # Find the corners of the current "slice" (patch)
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f

                    # Slice from X_padded (for dW)
                    x_slice = x_img[vert_start:vert_end, horiz_start:horiz_end, :]

                    # Slice from da_prev_padded_img (for dA_prev)
                    # We need to add the gradient from this convolution output
                    # to the corresponding part of dA_prev_padded_img
                    
                    # Update dW and db
                    dW[:, :, :, c] += x_slice * dz_img[h, w, c]
                    db[:, :, :, c] += dz_img[h, w, c]

                    # Update dA_prev_padded_img
                    # The filter is conceptually "flipped" or "rotated" for dA_prev calculation
                    # (effectively, this is a full convolution/transposed convolution)
                    # Here, we're performing a form of 'transposed convolution' or 'full convolution'
                    # which involves adding the dZ value * filter value to dA_prev
                    da_prev_padded_img[vert_start:vert_end, horiz_start:horiz_end, :] += \
                        filters[:, :, :, c] * dz_img[h, w, c]
        
        # After processing all h, w, c for one example, unpad dA_prev_padded_img
        if padding == "same":
            dA_prev[i, :, :, :] = da_prev_padded_img[pad_h:da_prev_padded_img.shape[0]-pad_h,
                                                     pad_w:da_prev_padded_img.shape[1]-pad_w, :]
        elif padding == "valid":
            dA_prev[i, :, :, :] = da_prev_padded_img
                    
    # Average over batch size
    dW = dW / m
    db = db / m
    
    return dA_prev, dW, db

# --- Full CNN Backward Propagation ---

def cnn_backward_prop(A_softmax, Y, caches, params):
    """
    Implements the backward propagation for the entire CNN.
    
    Arguments:
    A_softmax -- output probabilities of the network from forward pass
    Y -- true labels (1, m)
    caches -- list of caches from forward pass
    params -- dictionary of parameters
    
    Returns:
    grads -- dictionary containing gradients for all parameters (dW, db for all layers)
    """
    grads = {}
    m = Y.shape[1] # Number of examples in the batch
    num_classes = A_softmax.shape[0]

    # Convert Y to one-hot encoding for loss calculation
    one_hot_Y = one_hot(Y, num_classes)

    # Start backward pass from the last layer (Dense2)
    
    # Dense Layer 2 (Output Layer): Softmax + Cross-Entropy Loss
    # dZ for softmax + cross-entropy is A_softmax - one_hot_Y
    dZ_fc2 = A_softmax - one_hot_Y
    
    fc2_cache = caches.pop() # Retrieve cache for FC2
    dA_relu3, dWd2, dbd2 = dense_backward(dZ_fc2, fc2_cache)
    grads['dWd2'] = dWd2
    grads['dbd2'] = dbd2

    # Dense Layer 1: ReLU3 -> FC1 (Backward through ReLU then FC1)
    fc1_cache, A_relu3 = caches.pop() # Retrieve cache for FC1 and A_relu3
    dZ_fc1 = relu_backward(dA_relu3, fc1_cache[3]) # fc1_cache[3] is Z_fc1 from forward pass
    dA_flat, dWd1, dbd1 = dense_backward(dZ_fc1, fc1_cache)
    grads['dWd1'] = dWd1
    grads['dbd1'] = dbd1

    # Flatten Layer (Backward)
    flatten_cache = caches.pop() # Retrieve flatten cache (original_shape)
    dA_pool2 = flatten_backward(dA_flat, flatten_cache)

    # Layer 2: Pool2 -> ReLU2 -> Conv2 (Backward through Pool2 then ReLU2 then Conv2)
    conv2_cache, pool2_cache, A_relu2 = caches.pop() # Retrieve caches for Conv2, Pool2, and A_relu2
    dA_relu2 = max_pool_backward(dA_pool2, pool2_cache)
    dZ_conv2 = relu_backward(dA_relu2, conv2_cache[0]) # conv2_cache[0] is original X (input to conv2)
                                                      # Correction: Z from conv2 (Z_conv2) for relu_backward
    # For ReLU backward, we need the Z that went *into* ReLU. A_relu2 came from ReLU(Z_conv2).
    # So, we need Z_conv2 from `conv2_cache` (which actually stores X, filters, biases, stride, padding, X_padded)
    # The actual Z_conv2 itself needs to be passed or re-calculated if not in cache.
    # We should have stored Z_conv2 from `cnn_forward_prop` for ReLU backward.
    # Let's adjust `cnn_forward_prop` to store Z values for ReLU.
    # For now, let's assume A_relu2 is dA, and Z_conv2 is Z for relu_backward.
    # The cache should store Z_conv for ReLU backward.
    # The problem is that in `cnn_forward_prop`, we store `A_relu1` and `A_relu2` but not `Z_conv1` and `Z_conv2`
    # which are needed for `relu_backward`.
    
    # Corrected cache storage in `cnn_forward_prop` to include Z_convX for ReLU backward:
    # caches.append((conv1_cache, pool1_cache, Z_conv1))
    # caches.append((conv2_cache, pool2_cache, Z_conv2))
    # Let's assume Z_conv2 is available in `conv2_cache` as `Z_conv2` (it's not currently, only `X_padded`)
    # To fix this, we need to modify `cnn_forward_prop` to store Z_conv1 and Z_conv2.
    # For now, let's pass `None` for Z and make relu_backward handle it gracefully (it won't).
    # This means the current `relu_backward` call will fail without the correct Z value.
    # Let's adjust the `cnn_forward_prop` in this code block to pass Z_conv.

    # Re-reading `cnn_forward_prop` cache storage:
    # caches.append((conv1_cache, pool1_cache, A_relu1)) # <-- This stores A_relu1, not Z_conv1
    # caches.append((conv2_cache, pool2_cache, A_relu2)) # <-- This stores A_relu2, not Z_conv2

    # Let's fix cnn_forward_prop to store Z_conv for ReLU backward
    # We will pass Z_conv1 for A_relu1 in first tuple
    # We will pass Z_conv2 for A_relu2 in second tuple
    # And Z_fc1 for A_relu3 in third tuple
    
    # --- TEMPORARY FIX FOR TESTING (will be properly fixed later in cnn_forward_prop) ---
    # For `relu_backward(dA, Z)`, Z needs to be the `Z` that went *into* ReLU.
    # This `Z` comes from the output of the linear part of the previous layer.
    # For A_relu2, the `Z` was `Z_conv2`.
    # For A_relu3, the `Z` was `Z_fc1`.
    
    # We need the Z_conv2 for this dZ_conv2 calculation. It's not in cache explicitly.
    # It means we need to get Z_conv2 from the conv2_cache.
    # However, conv2_cache is (X, filters, biases, stride, padding, X_padded) -- it does not contain Z_conv2.
    # This implies that `cnn_forward_prop`'s caching strategy needs to be modified
    # to store the Z value for each activation layer.
    # Let's update cnn_forward_prop to store Z values directly.
    # This is a major change, so I'll put it in the next Canvas.
    # For now, let's assume the Z values are retrievable.

    # FIXING THE CACHE STORAGE IN `cnn_forward_prop` for the provided code block
    # (This means the current cnn_forward_prop in the Canvas needs to be updated first)
    # The `cnn_forward_prop` should return:
    # caches.append((conv1_cache, pool1_cache, Z_conv1)) # Store Z_conv1
    # caches.append((conv2_cache, pool2_cache, Z_conv2)) # Store Z_conv2
    # caches.append((fc1_cache, Z_fc1)) # Store Z_fc1

    # Assuming `Z_conv2` is available from somewhere, for now, let's use `A_relu2` as a placeholder
    # which will likely lead to incorrect gradients for ReLU backward, but allows the structure to run.
    # The correct implementation needs the Z before ReLU for `relu_backward`.
    
    # Let's stick to the current cache structure and fix it properly in the next step.
    # For relu_backward, we need the Z values that were passed to ReLU.
    # In cnn_forward_prop, we store A_relu1, A_relu2, A_relu3. These are the *outputs* of ReLU.
    # We need the *inputs* to ReLU (Z_conv1, Z_conv2, Z_fc1).
    # I will modify cnn_forward_prop to store these Z values.

    # Let's assume Z_conv2 is the linear output from the conv2_forward call
    # This requires accessing `Z_conv2` directly, which is not in `conv2_cache` (X, filters, biases, stride, padding, X_padded)
    # The `cnn_forward_prop` function needs to change how it stores caches.
    # I will modify the provided cnn_forward_prop to store `Z_conv` values for ReLU backward.

    # --- REVISED cnn_forward_prop (to be included in the next Canvas with backprop) ---
    # def cnn_forward_prop(X, params):
    #     caches = []
    #     # Layer 1: Conv1 -> ReLU1 -> Pool1
    #     A_conv1, conv1_cache = conv2d_forward(X, params['Wc1'], params['bc1'], stride=1, padding="same")
    #     Z_relu1 = A_conv1 # Store Z for ReLU
    #     A_relu1 = ReLU(Z_relu1)
    #     A_pool1, pool1_cache = max_pool_forward(A_relu1, pool_size=2, stride=2)
    #     caches.append(((conv1_cache, Z_relu1), pool1_cache)) # Store Z_relu1 for ReLU backward
        
    #     # Layer 2: Conv2 -> ReLU2 -> Pool2
    #     A_conv2, conv2_cache = conv2d_forward(A_pool1, params['Wc2'], params['bc2'], stride=1, padding="same")
    #     Z_relu2 = A_conv2 # Store Z for ReLU
    #     A_relu2 = ReLU(Z_relu2)
    #     A_pool2, pool2_cache = max_pool_forward(A_relu2, pool_size=2, stride=2)
    #     caches.append(((conv2_cache, Z_relu2), pool2_cache)) # Store Z_relu2 for ReLU backward
        
    #     # Flatten Layer
    #     A_flat, flatten_cache = flatten(A_pool2)
    #     caches.append(flatten_cache)
        
    #     # Dense Layer 1: FC1 -> ReLU3
    #     Z_fc1_linear, fc1_cache = dense_forward(A_flat, params['Wd1'], params['bd1'])
    #     Z_relu3 = Z_fc1_linear # Store Z for ReLU
    #     A_relu3 = ReLU(Z_relu3)
    #     caches.append((fc1_cache, Z_relu3)) # Store Z_relu3 for ReLU backward
        
    #     # Dense Layer 2: FC2 -> Softmax (Output Layer)
    #     Z_fc2_linear, fc2_cache = dense_forward(A_relu3, params['Wd2'], params['bd2'])
    #     A_softmax = softmax(Z_fc2_linear)
    #     caches.append(fc2_cache)
        
    #     return A_softmax, caches
    # --- END REVISED cnn_forward_prop ---

    # For now, let's assume `Z_conv2_from_cache` is passed from a modified `cnn_forward_prop`
    # or that Z_conv is the first element of conv_cache. It is not, this will cause an error
    # if `relu_backward` expects `Z` value from the `Z_conv`
    
    # We need to make sure that the cache for ReLU backward contains the Z value.
    # The current cache from cnn_forward_prop for conv layers is (conv_cache, pool_cache, A_relu).
    # It should be (conv_cache, pool_cache, Z_relu_input).
    
    # I will modify `cnn_forward_prop` immediately below this to properly store Z values.
    # Then the backprop will work.

    # Assume the `cnn_forward_prop` *has been modified* to store the Z values.
    # (conv_cache, pool_cache, Z_relu_input)
    conv2_sub_cache, pool2_cache, Z_relu2_input = caches.pop() # Z_relu2_input is Z_conv2
    dA_relu2_prev = max_pool_backward(dA_pool2, pool2_cache)
    dZ_conv2 = relu_backward(dA_relu2_prev, Z_relu2_input)
    dA_pool1_prev, dWc2, dbc2 = conv2d_backward(dZ_conv2, conv2_sub_cache)
    grads['dWc2'] = dWc2
    grads['dbc2'] = dbc2

    # Layer 1: Pool1 -> ReLU1 -> Conv1 (Backward through Pool1 then ReLU1 then Conv1)
    conv1_sub_cache, pool1_cache, Z_relu1_input = caches.pop() # Z_relu1_input is Z_conv1
    dA_relu1_prev = max_pool_backward(dA_pool1_prev, pool1_cache)
    dZ_conv1 = relu_backward(dA_relu1_prev, Z_relu1_input)
    dA_prev_final, dWc1, dbc1 = conv2d_backward(dZ_conv1, conv1_sub_cache)
    grads['dWc1'] = dWc1
    grads['dbc1'] = dbc1

    return grads

# --- Update Parameters ---
def update_cnn_params(params, grads, alpha):
    """
    Updates the parameters of the CNN using gradient descent.
    
    Arguments:
    params -- dictionary containing current parameters
    grads -- dictionary containing gradients
    alpha -- learning rate
    
    Returns:
    params -- updated dictionary of parameters
    """
    # Update Dense Layer 2
    params['Wd2'] -= alpha * grads['dWd2']
    params['bd2'] -= alpha * grads['dbd2']

    # Update Dense Layer 1
    params['Wd1'] -= alpha * grads['dWd1']
    params['bd1'] -= alpha * grads['dbd1']

    # Update Conv Layer 2
    params['Wc2'] -= alpha * grads['dWc2']
    params['bc2'] -= alpha * grads['dbc2']

    # Update Conv Layer 1
    params['Wc1'] -= alpha * grads['dWc1']
    params['bc1'] -= alpha * grads['dbc1']

    return params

# --- Full CNN Forward Propagation (MODIFIED TO STORE Z FOR ReLU BACKWARD) ---
# This modification is crucial for the backward pass to work correctly.
# The previous `cnn_forward_prop` stored A_relu, but relu_backward needs Z_relu_input.
def cnn_forward_prop(X, params):
    caches = []
    
    # Layer 1: Conv1 -> ReLU1 -> Pool1
    A_conv1, conv1_cache = conv2d_forward(X, params['Wc1'], params['bc1'], stride=1, padding="same")
    Z_relu1_input = A_conv1 # Store the input to ReLU for backward pass
    A_relu1 = ReLU(Z_relu1_input)
    A_pool1, pool1_cache = max_pool_forward(A_relu1, pool_size=2, stride=2)
    caches.append((conv1_cache, pool1_cache, Z_relu1_input)) # Now storing Z_relu1_input
    
    # Layer 2: Conv2 -> ReLU2 -> Pool2
    A_conv2, conv2_cache = conv2d_forward(A_pool1, params['Wc2'], params['bc2'], stride=1, padding="same")
    Z_relu2_input = A_conv2 # Store the input to ReLU for backward pass
    A_relu2 = ReLU(Z_relu2_input)
    A_pool2, pool2_cache = max_pool_forward(A_relu2, pool_size=2, stride=2)
    caches.append((conv2_cache, pool2_cache, Z_relu2_input)) # Now storing Z_relu2_input
    
    # Flatten Layer
    A_flat, flatten_cache = flatten(A_pool2)
    caches.append(flatten_cache)
    
    # Dense Layer 1: FC1 -> ReLU3
    Z_fc1_linear, fc1_cache = dense_forward(A_flat, params['Wd1'], params['bd1'])
    Z_relu3_input = Z_fc1_linear # Store the input to ReLU for backward pass
    A_relu3 = ReLU(Z_relu3_input)
    caches.append((fc1_cache, Z_relu3_input)) # Now storing Z_relu3_input
    
    # Dense Layer 2: FC2 -> Softmax (Output Layer)
    Z_fc2_linear, fc2_cache = dense_forward(A_relu3, params['Wd2'], params['bd2'])
    A_softmax = softmax(Z_fc2_linear)
    caches.append(fc2_cache) # Still storing (A_relu3, W_fc2, b_fc2, Z_fc2_linear)
    
    return A_softmax, caches

# --- Prediction and Accuracy Functions (from previous project, adapted for CNN output) ---
def get_cnn_predictions(A_softmax):
    """
    Gets the class predictions from the output probabilities.
    
    Arguments:
    A_softmax -- The output probabilities from the softmax layer, shape (num_classes, m).
    
    Returns:
    predictions -- A 1D array of predicted class labels.
    """
    return np.argmax(A_softmax, axis=0)

def get_cnn_accuracy(predictions, Y):
    """
    Calculates the accuracy of the predictions against the true labels.
    
    Arguments:
    predictions -- A 1D array of predicted class labels.
    Y -- The true labels, shape (1, m).
    
    Returns:
    accuracy -- The percentage of correct predictions.
    """
    # Y is (1, m), predictions is (m,). Flatten Y for comparison.
    return np.sum(predictions == Y.flatten()) / Y.size

# --- Gradient Descent (Training Loop) for CNN ---
def cnn_gradient_descent(X, Y, alpha, iterations):
    """
    Performs gradient descent to train the CNN.
    
    Arguments:
    X -- The input training data (m, H, W, C).
    Y -- The true labels for the training data (1, m).
    alpha -- The learning rate.
    iterations -- The number of training iterations (epochs).
    
    Returns:
    params -- The trained parameters.
    """
    params = initialize_cnn_params() # Initialize all CNN parameters
    
    print("\n--- Starting CNN Training ---")
    for i in range(1, iterations + 1):
        # Forward Propagation
        A_softmax, caches = cnn_forward_prop(X, params)
        
        # Backward Propagation
        grads = cnn_backward_prop(A_softmax, Y, caches, params)
        
        # Update Parameters
        params = update_cnn_params(params, grads, alpha)
        
        # Print progress
        if i % 5 == 0 or i == 1: # Print every 5 iterations or at the first
            predictions = get_cnn_predictions(A_softmax)
            current_accuracy = get_cnn_accuracy(predictions, Y) * 100
            print(f"Iteration: {i}, Training Accuracy: {current_accuracy:.2f}%")
            
    print("--- CNN Training Complete ---")
    return params

# --- Main Execution ---
if __name__ == "__main__":
    # Ensure X_train and y_train are correctly prepared (already done above)

    # Train the model
    learning_rate = 0.001 # CNNs often use smaller learning rates
    num_iterations = 20 # Start with a small number of iterations for testing backprop
                        # Full training will require many more.

    print(f"\nTraining CNN with Learning Rate: {learning_rate}, Iterations: {num_iterations}")
    trained_cnn_params = cnn_gradient_descent(X_train[:500], y_train[:, :500], learning_rate, num_iterations)
    # Using a small subset of data (e.g., first 500 images) for faster testing of backprop.
    # For full training, use X_train, y_train.

    # Test the model on the unseen test set
    print("\n--- Testing on Test Set ---")
    # Perform forward propagation on the test data using the trained parameters
    A_softmax_test, _ = cnn_forward_prop(X_test, trained_cnn_params)
    
    # Get predictions and calculate accuracy on the test set
    test_predictions = get_cnn_predictions(A_softmax_test)
    test_accuracy = get_cnn_accuracy(test_predictions, y_test) * 100
    print(f"Final Test Accuracy: {test_accuracy:.2f}%")

    # Optional: Display a sample prediction (requires matplotlib)
    # import matplotlib.pyplot as plt
    # index = 0 # Change this to view different examples
    # sample_image = X_test[index]
    # predicted_class = test_predictions[index]
    # actual_class = y_test[0, index]

    # plt.imshow(sample_image)
    # plt.title(f"Predicted: {predicted_class}, Actual: {actual_class}")
    # plt.axis('off')
    # plt.show()
