In [19]:
import numpy as np
import pandas as pd

def load_data(filename):
    """
    Loads data from a CSV file and processes it into suitable format.
    For training and validation data, it splits into features and labels.
    For test data, it returns only features and assumes the first column is a placeholder.
    """
    data = pd.read_csv(filename, header=None)
    if 'train' in filename or 'validate' in filename:
        # Split into features and labels
        labels = data.iloc[:, 0].values - 1  # Convert class labels to zero-indexed
        features = data.iloc[:, 1:].values
    else:
        # Test data does not include labels
        labels = None
        features = data.iloc[:, 1:].values  # Ignore the placeholder column
    
    # Reshape features into 32x32x3 format for RGB images
    features = features.reshape((-1, 32, 32, 3))
    return features, labels


# Example usage
train_features, train_labels = load_data('./data/train.csv')
validate_features, validate_labels = load_data('./data/validate.csv')
test_features, _ = load_data('./data/test.csv')  # Test data doesn't have labels

### 1. Define Helper Functions for Each Layer

In [34]:
import numpy as np
from numpy.lib.stride_tricks import as_strided
import numpy as np
from numpy.lib.stride_tricks import as_strided
import numpy as np
from numpy.lib.stride_tricks import as_strided
def dropout_forward(A, dropout_rate):
    """
    Applies dropout to the activations A.
    
    Parameters:
      A (np.ndarray): Activations from a layer (any shape).
      dropout_rate (float): Probability of dropping a unit (e.g., 0.5 for 50% dropout).
      
    Returns:
      A_dropout (np.ndarray): The activations after dropout is applied.
      mask (np.ndarray): The dropout mask used (should be stored for the backward pass).
    """
    # Create a mask that drops neurons with probability dropout_rate.
    # We divide by (1 - dropout_rate) so that the expected value of the activations remains unchanged.
    mask = (np.random.rand(*A.shape) >= dropout_rate) / (1 - dropout_rate)
    A_dropout = A * mask
    return A_dropout, mask

def convolve2d(images, kernel, bias=None, stride=1, padding=1):
    """
    Convolve a batch of images with a set of kernels using a vectorized approach.
    
    Parameters:
      images : numpy.ndarray
          Input tensor of shape (batch_size, image_height, image_width, image_channels).
      kernel : numpy.ndarray
          Convolution kernel tensor of shape (kernel_height, kernel_width, image_channels, num_kernels).
      bias : numpy.ndarray, optional
          Bias tensor of shape (1, 1, 1, num_kernels). If provided, will be added to the output.
      stride : int, optional
          Stride length. (Default is 1)
      padding : int, optional
          Padding width applied to the input images. (Default is 1)
    
    Returns:
      output : numpy.ndarray
          Convolved output of shape (batch_size, out_height, out_width, num_kernels).
      cache : dict
          Dictionary with values needed for the backward pass.
    """
    # Unpack dimensions.
    batch_size, image_height, image_width, image_channels = images.shape
    kernel_height, kernel_width, kernel_channels, num_kernels = kernel.shape

    # Ensure the kernel depth matches the image depth.
    assert kernel_channels == image_channels, "Kernel and image channels must match."

    # Apply padding.
    padded_images = np.pad(images, 
                           ((0, 0), (padding, padding), (padding, padding), (0, 0)), 
                           mode='constant')
    padded_height = image_height + 2 * padding
    padded_width = image_width + 2 * padding

    # Calculate output dimensions.
    out_height = (padded_height - kernel_height) // stride + 1
    out_width = (padded_width - kernel_width) // stride + 1

    # Create a sliding window view of the padded images.
    # The new shape is (batch_size, out_height, out_width, kernel_height, kernel_width, image_channels)
    shape = (batch_size, out_height, out_width, kernel_height, kernel_width, image_channels)
    strides = (padded_images.strides[0],
               stride * padded_images.strides[1],
               stride * padded_images.strides[2],
               padded_images.strides[1],
               padded_images.strides[2],
               padded_images.strides[3])
    windows = as_strided(padded_images, shape=shape, strides=strides)

    # Perform the convolution using tensordot.
    # This contracts over the window dimensions (kernel_height, kernel_width, image_channels),
    # resulting in an output of shape (batch_size, out_height, out_width, num_kernels).
    output = np.tensordot(windows, kernel, axes=([3, 4, 5], [0, 1, 2]))

    # Add bias if provided.
    if bias is not None:
        # Ensure the bias has the shape (1, 1, 1, num_kernels) for broadcasting.
        output = output + bias

    # Store necessary values for backpropagation.
    cache = {
        "images_padded": padded_images,
        "kernel": kernel,
        "bias": bias,
        "padding": padding,
        "stride": stride
    }

    return output, cache




def relu(x):
    return np.where(x > 0, x, 0.01 * x)
    #return np.maximum(0, x)


import numpy as np
from numpy.lib.stride_tricks import as_strided

def max_pooling(images, size=2, stride=2):
    """
    Perform max pooling on a 4D tensor of images with shape
    (batch_size, image_height, image_width, num_feature_maps).

    Returns:
      - output: pooled output with shape (batch_size, out_height, out_width, num_feature_maps)
      - cache: tuple containing (images, mask, size, stride) for backpropagation.
    """
    batch_size, image_height, image_width, num_feature_maps = images.shape
    out_height = (image_height - size) // stride + 1
    out_width = (image_width - size) // stride + 1

    # Create a sliding window view of the images:
    # The new shape will be (batch_size, out_height, out_width, size, size, num_feature_maps)
    new_shape = (batch_size, out_height, out_width, size, size, num_feature_maps)
    new_strides = (images.strides[0],
                   stride * images.strides[1],
                   stride * images.strides[2],
                   images.strides[1],
                   images.strides[2],
                   images.strides[3])
    windows = as_strided(images, shape=new_shape, strides=new_strides)

    # Compute the max over each window (axes 3 and 4 correspond to the pooling window)
    output = np.max(windows, axis=(3, 4))

    # To build the mask, first reshape the windows so each window becomes a vector.
    windows_reshaped = windows.reshape(batch_size, out_height, out_width, size * size, num_feature_maps)
    # Find the indices of the maximum values within each window.
    argmax_indices = np.argmax(windows_reshaped, axis=3)  # shape: (batch_size, out_height, out_width, num_feature_maps)
    # Convert the flattened indices back into 2D indices (row, col) inside the pooling window.
    max_row = argmax_indices // size  # integer division gives the row
    max_col = argmax_indices % size   # modulo gives the col

    # Build the mask array of zeros (same shape as images)
    mask = np.zeros_like(images)

    # Create index arrays for each dimension
    # batch indices with shape (batch_size, 1, 1, 1)
    batch_range = np.arange(batch_size)[:, None, None, None]
    # y and x indices for the pooled output with shapes (1, out_height, 1, 1) and (1, 1, out_width, 1)
    y_range = np.arange(out_height)[None, :, None, None]
    x_range = np.arange(out_width)[None, None, :, None]
    # Channel indices with shape (1, 1, 1, num_feature_maps)
    channel_range = np.arange(num_feature_maps)[None, None, None, :]

    # Compute the actual row and column indices in the original images:
    # For each pooling window, the top-left corner in the original image is:
    # (y_range*stride, x_range*stride). Then add the offset from max pooling (max_row, max_col).
    row_indices = y_range * stride + max_row  # shape: (batch_size, out_height, out_width, num_feature_maps)
    col_indices = x_range * stride + max_col  # shape: (batch_size, out_height, out_width, num_feature_maps)

    # Use advanced indexing to set the mask to 1 at the maximum locations.
    mask[batch_range, row_indices, col_indices, channel_range] = 1

    cache = (images, mask, size, stride)
    return output, cache







def fully_connected(x, weights, biases):
    """
    Apply a fully connected layer and return both the output and a cache for the backward pass.
    
    Parameters:
    x (numpy.ndarray): Input data or features.
    weights (numpy.ndarray): Weights matrix of the fully connected layer.
    biases (numpy.ndarray): Biases vector of the fully connected layer.
    
    Returns:
    numpy.ndarray: Output of the fully connected layer.
    tuple: Cache containing inputs and weights needed for the backward pass.
    """
    # Compute the output of the fully connected layer
    y = np.dot(x, weights) + biases
    cache = {"input": x, "weights": weights, "biases": biases}
    return y, cache



### 2. Forward and Backward Propagation Functions

In [21]:
import numpy as np

def softmax(logits):
    """
    Apply the softmax function to the last layer's output to convert logits into probabilities.

    Args:
    logits (numpy.ndarray): Logits array from the last fully connected layer of shape (batch_size, num_classes).

    Returns:
    numpy.ndarray: The probabilities after applying softmax of shape (batch_size, num_classes).
    """
    # Shift the logits by subtracting the maximum value to prevent large exponentials
    exp_shifted = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    # Normalize the exponentials so that the sum of each row is 1, resulting in probabilities
    probabilities = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    
    return probabilities
def batch_norm(x, epsilon=1e-5):
    mean = np.mean(x, axis=0, keepdims=True)
    variance = np.var(x, axis=0, keepdims=True)
    return (x - mean) / np.sqrt(variance + epsilon)

def forward_pass(X_batch, params, dropout_rate=0.5, training=True):
    """
    Forward pass through the CNN with two convolutional layers and two fully connected layers,
    with batch normalization and dropout applied.
    
    Parameters:
      X_batch (np.ndarray): Input batch.
      params (dict): Model parameters.
      dropout_rate (float): Dropout probability.
      training (bool): Flag indicating whether to apply dropout (True during training).
      
    Returns:
      softmax_output (np.ndarray): Final output probabilities.
      caches (dict): A dictionary of caches needed for the backward pass.
    """
    caches = {}

    # First Convolutional Layer
    conv1_output, conv1_cache = convolve2d(X_batch, params['conv1_w'], bias=params['conv1_b'])
    bn1_output = batch_norm(conv1_output) if training else conv1_output  # Apply Batch Normalization only during training
    relu1_output = relu(bn1_output)        # Apply ReLU
    relu1_cache = bn1_output
    pool1_output, pool1_cache = max_pooling(relu1_output)

    # Second Convolutional Layer
    conv2_output, conv2_cache = convolve2d(pool1_output, params['conv2_w'], bias=params['conv2_b'])
    bn2_output = batch_norm(conv2_output) if training else conv2_output  # Apply Batch Normalization only during training
    relu2_output = relu(bn2_output)        # Apply ReLU
    relu2_cache = bn2_output
    pool2_output, pool2_cache = max_pooling(relu2_output)

    # Flatten for fully connected layers
    batch_size = pool2_output.shape[0]
    flattened_pool_output = pool2_output.reshape(batch_size, -1)

    # First Fully Connected Layer
    fc1_output, fc1_cache = fully_connected(flattened_pool_output, params['fc1_w'], params['fc1_b'])
    bn_fc1_output = batch_norm(fc1_output) if training else fc1_output  # Apply Batch Normalization only during training
    fc1_output = relu(bn_fc1_output)       # Apply ReLU
    fc1_cache['output_shape'] = pool2_output.shape

    # Apply dropout after the first fully connected layer (only during training)
    if training:
        fc1_output, dropout1_cache = dropout_forward(fc1_output, dropout_rate)
        fc1_cache['dropout_cache'] = dropout1_cache
    else:
        fc1_cache['dropout_cache'] = None

    # Second Fully Connected Layer (Output Layer)
    fc2_output, fc2_cache = fully_connected(fc1_output, params['fc2_w'], params['fc2_b'])
    fc2_cache['output_shape'] = fc2_output.shape

    # Apply softmax activation to get final probabilities
    softmax_output = softmax(fc2_output)

    # Store caches for backward pass
    caches = {
        'conv1_cache': conv1_cache,
        'relu1_cache': relu1_cache,
        'pool1_cache': pool1_cache,
        'conv2_cache': conv2_cache,
        'relu2_cache': relu2_cache,
        'pool2_cache': pool2_cache,
        'fc1_cache': fc1_cache,
        'fc2_cache': fc2_cache
    }
    return softmax_output, caches



def compute_loss(probs, labels):
    batch_size = probs.shape[0]
    num_classes = probs.shape[1]
    label_smoothing = 0.1
    true_labels = np.eye(num_classes)[labels] * (1 - label_smoothing) + (label_smoothing / num_classes)

    # true_labels = np.eye(num_classes)[labels] 
    cross_entropy_loss = -np.sum(true_labels * np.log(probs + 1e-12)) / batch_size
    return cross_entropy_loss

# def compute_loss(predictions, targets):

#     num_samples = 10
#     num_classes = predictions.shape[1]
#     # Avoid numerical instability by adding a small epsilon value
#     epsilon = 1e-7
#     predictions = np.clip(predictions, epsilon, 1 - epsilon)
#     true_labels = np.eye(num_classes)[targets] 
#     loss = -np.sum(true_labels * np.log(predictions)) / num_samples
#     return loss


### 3. Training Loop and Parameter Updates

In [22]:
import numpy as np
import numpy as np
from numpy.lib.stride_tricks import as_strided

def dropout_backward(dA, mask):
    """
    Backward pass for dropout.
    
    Parameters:
      dA (np.ndarray): Gradient of the loss with respect to the dropout layer's output.
      mask (np.ndarray): Dropout mask that was used in the forward pass.
      
    Returns:
      dA_prev (np.ndarray): Gradient of the loss with respect to the dropout layer's input.
    """
    dA_prev = dA * mask
    return dA_prev

def conv_backward(dout, cache):
    """
    Optimized backward pass for a convolutional layer.
    
    Args:
      dout (numpy.ndarray): Upstream gradients of shape 
                            (batch_size, output_height, output_width, num_filters).
      cache (dict): Dictionary containing:
          - 'images_padded': Padded input images of shape (batch_size, padded_height, padded_width, num_channels).
          - 'kernel': Convolution kernel of shape (kernel_height, kernel_width, num_channels, num_filters).
          - 'stride': Stride used during the forward pass.
          - 'padding': Padding applied during the forward pass.
    
    Returns:
      dA_prev (numpy.ndarray): Gradient with respect to the input images (without padding).
      dW (numpy.ndarray): Gradient with respect to the kernel weights.
      db (numpy.ndarray): Gradient with respect to the biases.
    """
    # Extract information from the cache.
    images_padded = cache['images_padded']  # Padded input images.
    kernel = cache['kernel']
    stride = cache['stride']
    padding = cache['padding']

    # Unpack shapes.
    batch_size, padded_height, padded_width, num_channels = images_padded.shape
    kernel_height, kernel_width, _, num_filters = kernel.shape
    _, out_height, out_width, _ = dout.shape

    # -----------------------
    # Compute dW (gradient with respect to the kernel weights)
    # -----------------------
    # Create a sliding window view of the padded images.
    shape = (batch_size, out_height, out_width, kernel_height, kernel_width, num_channels)
    strides = (images_padded.strides[0],
               stride * images_padded.strides[1],
               stride * images_padded.strides[2],
               images_padded.strides[1],
               images_padded.strides[2],
               images_padded.strides[3])
    windows = as_strided(images_padded, shape=shape, strides=strides)
    
    # Use tensordot to contract over batch, out_height, and out_width.
    # This produces dW with shape (kernel_height, kernel_width, num_channels, num_filters).
    dW = np.tensordot(windows, dout, axes=([0, 1, 2], [0, 1, 2]))
    
    # -----------------------
    # Compute db (gradient with respect to the biases)
    # -----------------------
    # Sum over batch, out_height, and out_width.
    db = np.sum(dout, axis=(0, 1, 2), keepdims=True)  # Shape: (1, 1, 1, num_filters)
    
    # -----------------------
    # Compute dA_prev (gradient with respect to the input images)
    # -----------------------
    # Initialize gradient for padded input.
    dA_prev_padded = np.zeros_like(images_padded)
    
    # Instead of four nested loops, we loop over the kernel dimensions only.
    # For each (i, j) position in the kernel, add contributions to the appropriate
    # positions in dA_prev_padded.
    for i in range(kernel_height):
        for j in range(kernel_width):
            # The slice of dA_prev_padded to update:
            # It starts at index i and j, and then covers the region that the kernel "saw"
            # during the forward pass. The region has shape (batch_size, out_height, out_width, num_channels).
            i_end = i + out_height * stride
            j_end = j + out_width * stride
            
            # Using np.einsum to multiply dout (shape: [batch_size, out_height, out_width, num_filters])
            # with the kernel slice (shape: [num_channels, num_filters]).
            # The einsum 'bxyf,cf->bxyc' computes, for each batch b and spatial position (x,y),
            # the sum over f: dout[b,x,y,f] * kernel[i, j, c, f], resulting in shape (batch_size, out_height, out_width, num_channels).
            dA_prev_padded[:, i:i_end:stride, j:j_end:stride, :] += np.einsum('bxyf,cf->bxyc', dout, kernel[i, j, :, :])

    # Remove the padding to obtain the gradient with respect to the original images.
    if padding != 0:
        dA_prev = dA_prev_padded[:, padding:-padding, padding:-padding, :]
    else:
        dA_prev = dA_prev_padded

    return dA_prev, dW, db

  
def relu_backward(dout, relu_cache):
    """
    Backward pass for the ReLU activation function.

    Parameters:
      dout: Upstream derivatives.
      relu_cache: Cache from the forward pass, typically the input to the ReLU.

    Returns:
      The gradient with respect to the input of the ReLU.
    """
    # Only propagate the gradient for positive inputs.
    return dout * (relu_cache > 0)


def pool_backward(dout, cache):
    """
    Backward pass for the max pooling layer.
    
    Parameters:
      dout: Gradient of the loss with respect to the output of the pooling layer.
            Shape: (batch_size, out_height, out_width, channels)
      cache: Tuple containing (image, mask, size, stride), where:
             - image: the original input to the pooling layer, shape (batch_size, image_height, image_width, channels)
             - mask: a binary mask of the same shape as image indicating the positions of the max values during the forward pass.
             - size: the pooling window size.
             - stride: the stride used during pooling.
             
    Returns:
      dimage: Gradient of the loss with respect to the input of the pooling layer.
    """
    image, mask, size, stride = cache
    dimage = np.zeros_like(image)
    
    batch_size, image_height, image_width, image_channels = image.shape
    _, out_height, out_width, _ = dout.shape
    
    # Loop over the batch, spatial dimensions, and channels.
    for i in range(batch_size):
        for z in range(image_channels):
            for y in range(out_height):
                for x in range(out_width):
                    # Calculate the window boundaries.
                    start_y = y * stride
                    start_x = x * stride
                    end_y = start_y + size
                    end_x = start_x + size

                    # Extract the mask window corresponding to this pooling region.
                    window = mask[i, start_y:end_y, start_x:end_x, z]
                    
                    # Propagate the gradient only to the max element(s) (where mask == 1).
                    dimage[i, start_y:end_y, start_x:end_x, z] += window * dout[i, y, x, z]
    
    return dimage


def fc_backward(dout, weights, input_cache):
    """
    Backward pass for the fully connected layer.
    
    Parameters:
      dout: Upstream derivatives, expected shape (batch_size, num_classes)
            or (num_classes,) if a single example is provided.
      weights: Weights used in the forward pass, shape (num_features, num_classes)
      input_cache: Input data used in the forward pass, shape (batch_size, num_features)
                   or (num_features,) for a single example.
    
    Returns:
      dinput: Gradient with respect to the input, shape (batch_size, num_features)
      dweights: Gradient with respect to the weights, shape (num_features, num_classes)
      dbiases: Gradient with respect to the biases, shape (num_classes,)
    """
    # Ensure dout and input_cache are at least 2D.
    dout = np.atleast_2d(dout)         # Now shape is (batch_size, num_classes)
    input_cache = np.atleast_2d(input_cache)  # Now shape is (batch_size, num_features)

    # Compute gradients
    dinput = dout.dot(weights.T)       # (batch_size, num_classes) dot (num_classes, num_features) -> (batch_size, num_features)
    dweights = input_cache.T.dot(dout)   # (num_features, batch_size) dot (batch_size, num_classes) -> (num_features, num_classes)
    dbiases = np.sum(dout, axis=0)       # Sum over the batch dimension

    # If you want to average gradients over the batch, you can uncomment the following lines:
    # batch_size = input_cache.shape[0]
    # dweights /= batch_size
    # dbiases /= batch_size

    # Optionally, if you require dinput to be 1D when the batch size is 1, you can do:
    # if dinput.shape[0] == 1:
    #     dinput = dinput.flatten()

    return dinput, dweights, dbiases

In [23]:
def backward_pass(dout, caches, dropout_rate=0.5, training=True):
    """
    Backward pass for the CNN with two convolutional layers and two fully connected layers.
    
    Parameters:
      dout: Gradient from the loss w.r.t. the output of the softmax (shape: (batch_size, num_classes))
      caches: A dictionary containing caches from the forward pass for:
              - conv1_cache, conv2_cache: caches from the convolutional layers
              - relu1_cache, relu2_cache: caches from the ReLU activations
              - pool1_cache, pool2_cache: caches from the max pooling layers
              - fc1_cache, fc2_cache: caches from the fully connected layers, which must include:
                  - 'weights': the FC layer weights,
                  - 'input': the FC layer input (flattened output from pooling),
                  - 'output_shape': the shape of the pooled output before flattening.
              - training (bool): Flag indicating whether dropout was applied.
    Returns:
      grads: Dictionary containing gradients for:
             - 'fc1_w', 'fc1_b', 'fc2_w', 'fc2_b'
             - 'conv1_w', 'conv1_b', 'conv2_w', 'conv2_b'
    """
    # Unpack caches
    conv1_cache = caches['conv1_cache']
    relu1_cache = caches['relu1_cache']
    pool1_cache = caches['pool1_cache']

    conv2_cache = caches['conv2_cache']
    relu2_cache = caches['relu2_cache']
    pool2_cache = caches['pool2_cache']

    fc1_cache = caches['fc1_cache']
    fc2_cache = caches['fc2_cache']

    grads = {}

    # Backprop through Fully Connected Layer 2 (Output Layer)
    dinput_fc1, grads['fc2_w'], grads['fc2_b'] = fc_backward(
        dinput_fc2, fc2_cache['weights'], fc2_cache['input']
    )

    # Apply dropout backward for FC1 (if training)
    if training and fc1_cache.get('dropout_cache') is not None:
        dinput_fc1 = dropout_backward(dinput_fc1, fc1_cache['dropout_cache'])

    # Backprop through Fully Connected Layer 1
    dinput_pool, grads['fc1_w'], grads['fc1_b'] = fc_backward(
        dinput_fc1, fc1_cache['weights'], fc1_cache['input']
    )

    # Reshape the gradient to the shape of the pooling layer output
    dinput_pool = dinput_pool.reshape(fc1_cache['output_shape'])

    # Backprop through Pooling Layer 2
    dinput_relu2 = pool_backward(dinput_pool, pool2_cache)

    # Backprop through ReLU 2
    dinput_conv2 = relu_backward(dinput_relu2, relu2_cache)

    # Backprop through Convolution Layer 2
    dinput_pool1, grads['conv2_w'], grads['conv2_b'] = conv_backward(dinput_conv2, conv2_cache)

    # Backprop through Pooling Layer 1
    dinput_relu1 = pool_backward(dinput_pool1, pool1_cache)

    # Backprop through ReLU 1
    dinput_conv1 = relu_backward(dinput_relu1, relu1_cache)

    # Backprop through Convolution Layer 1
    _, grads['conv1_w'], grads['conv1_b'] = conv_backward(dinput_conv1, conv1_cache)
    # print(grads['conv2_w'][0])
    return grads




In [24]:
def update_params(params, grads, learning_rate):
    """
    Update the parameters of the network using gradient descent with L2 regularization
    and potentially adaptive learning rate techniques.

    Args:
    params (dict): Dictionary containing the parameters of the model.
    grads (dict): Dictionary containing the gradients of the parameters.
    learning_rate (float): Learning rate for the update step.

    Returns:
    dict: Updated parameters.
    """
    # Regularization strength
    reg_lambda = params.get('reg_lambda', 0.01)  # Default to some small lambda if not specified

    # Update convolutional weights with L2 regularization
    params['conv1_w'] -= learning_rate * (grads['conv1_w'] + reg_lambda * params['conv1_w'])
    # print(grads['conv1_w'][0])
    # params['conv1_w'] -= learning_rate * (grads['conv1_w'])
    params['conv2_w'] -= learning_rate * (grads['conv2_w'] + reg_lambda * params['conv2_w'])
    # params['conv2_w'] -= learning_rate * (grads['conv2_w'])

    # Update fully connected layer weights with L2 regularization
    params['fc1_w'] -= learning_rate * (grads['fc1_w'] + reg_lambda * params['fc1_w'])
    params['fc2_w'] -= learning_rate * (grads['fc2_w'] + reg_lambda * params['fc2_w'])
    # params['fc3_w'] -= learning_rate * (grads['fc3_w'] + reg_lambda * params['fc3_w'])
    # params['fc1_w'] -= learning_rate * (grads['fc1_w'])
    # params['fc2_w'] -= learning_rate * (grads['fc2_w'])
    # params['fc3_w'] -= learning_rate * (grads['fc3_w'])
    
    # If biases are included in the params and grads, update them as well
        # Biases typically do not have regularization applied
    params['fc1_b'] -= learning_rate * grads['fc1_b']
    params['fc2_b'] -= learning_rate * grads['fc2_b']
    # params['fc3_b'] -= learning_rate * grads['fc3_b']
    
    # Update convolutional biases if provided (biases typically are not regularized).
    params['conv1_b'] -= learning_rate * grads['conv1_b']
    params['conv2_b'] -= learning_rate * grads['conv2_b']

    return params


In [25]:
import numpy as np

def evaluate(X_val, y_val, params):
    """
    Evaluate the model's accuracy on the validation set using batch processing.

    X_val: Validation features (numpy array of shape [batch_size, features])
    y_val: Validation labels (numpy array of shape [batch_size])
    params: Trained parameters of the network
    
    Returns:
    float: The accuracy of the model on the validation set
    """
    # Process the entire validation set as a single batch
    logits, _ = forward_pass(X_val, params, dropout_rate=0.5, training=True)  # forward_pass must handle batch processing
    predicted_classes = np.argmax(logits, axis=1)  # axis=1 for batch processing

    # Calculate the number of correctly predicted examples
    total_correct = np.sum(predicted_classes == y_val)
    accuracy = total_correct / len(X_val)

    return accuracy


In [35]:
import time
def train(X_train, y_train, X_val, y_val, epochs, learning_rate, params, batch_size=32, dropout_rate=0.2):
    n_samples = len(X_train)
    decay_rate = 0.95  # Decay rate per epoch
    min_lr = 0.001
    max_lr = 0.01
    for epoch in range(epochs):
        loss_total = 0
        # Learning rate decay
        # current_lr = learning_rate
        # current_lr = learning_rate * (0.95 ** epoch)
        # current_lr = max(0.005, min(learning_rate * (decay_rate ** epoch), 0.01))
        current_lr = min(0.01, learning_rate * (1 + 0.05 * epoch))  # Slowly increases up to 0.01
        current_lr = min_lr + 0.5 * (max_lr - min_lr) * (1 + np.cos(np.pi * epoch /epochs))

        for start in range(0, n_samples, batch_size):
            
            start_time = time.time()  # Start timing
            end = start + batch_size
            X_batch = X_train[start:end]
            y_batch = y_train[start:end]

            # Forward pass with dropout (training=True)
            softmax_output, caches = forward_pass(X_batch, params, dropout_rate, training=True)
            
            # Debug: Check for non-finite values in softmax_output
            if not np.all(np.isfinite(softmax_output)):
                print("Warning: non-finite values found in softmax output.")
            
            # Compute loss (with epsilon for numerical stability)
            loss = compute_loss(softmax_output, y_batch)
            if np.isnan(loss) or np.isinf(loss):
                print("Warning: loss is non-finite!")
            loss_total += loss

            # Backward pass with dropout (training=True)
            grads = backward_pass(softmax_output, caches, dropout_rate, training=True)
            
            # Optionally clip gradients to prevent exploding gradients
            grads = clip_gradients(grads, max_norm=2.0)
            
            # Update parameters
            params = update_params(params, grads, current_lr)
            end_time = time.time()  # End timing

            # Uncomment if you want to print batch details:
            # print(f"Batch from {start} to {end}, loss: {loss}, total: {n_samples}, Time: {elapsed_time:.2f} seconds")

        val_accuracy = evaluate(X_val, y_val, params)
        print(f'Epoch {epoch + 1}/{epochs}, Average Loss: {loss_total / (n_samples // batch_size)}, Validation Accuracy: {val_accuracy}')
    
    return params




import numpy as np
def init_params():
    params = {}

    # He Initialization for ReLU activations
    def he_init(shape):
        return np.random.randn(*shape) * np.sqrt(2.0 / np.prod(shape[:-1]))

    # Convolutional Layer 1 Parameters
    kernel_height1, kernel_width1, input_channels1, num_filters1 = 3, 3, 3, 8
    params['conv1_w'] = he_init((kernel_height1, kernel_width1, input_channels1, num_filters1))
    params['conv1_b'] = np.zeros((1, 1, 1, num_filters1))

    # Convolutional Layer 2 Parameters
    kernel_height2, kernel_width2, input_channels2, num_filters2 = 3, 3, num_filters1, 16
    params['conv2_w'] = he_init((kernel_height2, kernel_width2, input_channels2, num_filters2))
    params['conv2_b'] = np.zeros((1, 1, 1, num_filters2))

    # Fully Connected Layer 1 Parameters
    num_inputs_fc1, num_outputs_fc1 = 1024, 512
    params['fc1_w'] = he_init((num_inputs_fc1, num_outputs_fc1))
    params['fc1_b'] = np.zeros((1, num_outputs_fc1))

    # Fully Connected Layer 2 (Output Layer) Parameters
    num_inputs_fc2, num_outputs_fc2 = num_outputs_fc1, 10  # Assuming 10 output classes
    params['fc2_w'] = he_init((num_inputs_fc2, num_outputs_fc2))
    params['fc2_b'] = np.zeros((1, num_outputs_fc2))

    # Regularization strength
    params['reg_lambda'] = 0.001

    return params



def clip_gradients(grads, max_norm=2.0):
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads.values()))
    if total_norm > max_norm:
        scaling_factor = max_norm / (total_norm + 1e-6)
        for key in grads:
            grads[key] *= scaling_factor
    return grads



params = init_params()
num_epochs = 1000
learning_rate = 0.005
batch_size = 32
trained_params = train(train_features, train_labels, validate_features, validate_labels, num_epochs, learning_rate, params,batch_size)


ValueError: operands could not be broadcast together with shapes (32,32) (32,27) (32,32) 