<a href="https://colab.research.google.com/github/topister/simpleConv2d/blob/main/simpleConv2d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

In [3]:
class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, learning_rate=0.01):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.learning_rate = learning_rate

        # Initialize weights and biases
        self.weights = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) * np.sqrt(2. / (in_channels * kernel_size * kernel_size))
        self.biases = np.zeros((out_channels, 1))

    def forward(self, x):
        self.x = x

        # Add padding to the input
        if self.padding > 0:
            self.x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        else:
            self.x_padded = x

        # Calculate the dimensions of the output
        self.out_height = int((self.x_padded.shape[2] - self.kernel_size) / self.stride) + 1
        self.out_width = int((self.x_padded.shape[3] - self.kernel_size) / self.stride) + 1

        # Initialize output
        out = np.zeros((x.shape[0], self.out_channels, self.out_height, self.out_width))

        # Perform the convolution
        for i in range(self.out_height):
            for j in range(self.out_width):
                x_slice = self.x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size]
                for k in range(self.out_channels):
                    out[:, k, i, j] = np.sum(x_slice * self.weights[k, :, :, :], axis=(1, 2, 3)) + self.biases[k]

        return out

    def backward(self, d_out):
        d_weights = np.zeros_like(self.weights)
        d_biases = np.zeros_like(self.biases)
        d_x_padded = np.zeros_like(self.x_padded)

        # Backpropagation to calculate gradients
        for i in range(self.out_height):
            for j in range(self.out_width):
                x_slice = self.x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size]
                for k in range(self.out_channels):
                    d_weights[k, :, :, :] += np.sum(x_slice * d_out[:, k, i, j][:, None, None, None], axis=0)
                    d_biases[k] += np.sum(d_out[:, k, i, j], axis=0)
                    d_x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size] += self.weights[k, :, :, :] * d_out[:, k, i, j][:, None, None, None]

        # Remove padding from d_x_padded to get d_x
        if self.padding > 0:
            d_x = d_x_padded[:, :, self.padding:-self.padding, self.padding:-self.padding]
        else:
            d_x = d_x_padded

        # Update the weights and biases
        self.weights -= self.learning_rate * d_weights
        self.biases -= self.learning_rate * d_biases

        return d_x


In [5]:
# usage
np.random.seed(0)

# Create a dummy input (batch_size=1, in_channels=1, height=28, width=28)
x = np.random.randn(1, 1, 28, 28)

# Initialize Conv2d layer
conv = Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1)

# Forward pass
out = conv.forward(x)
print("Output shape:", out.shape)

# Dummy gradient to test backward pass
d_out = np.random.randn(*out.shape)

# Backward pass
d_x = conv.backward(d_out)
print("d_x shape:", d_x.shape)


Output shape: (1, 8, 28, 28)
d_x shape: (1, 1, 28, 28)


### [Problem 2] Experiments with 2D convolutional layers on small arrays

In [6]:
import numpy as np

# Input array
x = np.array([[[[ 1,  2,  3,  4],
                [ 5,  6,  7,  8],
                [ 9, 10, 11, 12],
                [13, 14, 15, 16]]]])

# Weights (2 filters, 1 channel, 3x3 dimension)
w = np.array([[[[ 0.,  0.,  0.],
                [ 0.,  1.,  0.],
                [ 0., -1.,  0.]]],

              [[[ 0.,  0.,  0.],
                [ 0., -1.,  1.],
                [ 0.,  0.,  0.]]]])

# Expected output from forward propagation:
expected_forward_output = np.array([[[[-4, -4],
                                      [-4, -4]],

                                     [[ 1,  1],
                                      [ 1,  1]]]])

# Error delta for backward propagation:
delta = np.array([[[ -4,  -4],
                   [ 10,  11]],

                  [[  1,  -7],
                   [  1, -11]]])


In [14]:
class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, learning_rate=0.01):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.learning_rate = learning_rate

        # Initialize weights and biases
        self.weights = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) * np.sqrt(2. / (in_channels * kernel_size * kernel_size))
        self.biases = np.zeros((out_channels, 1))

    def set_weights(self, weights):
        self.weights = weights

    def forward(self, x):
        self.x = x

        # Add padding to the input
        if self.padding > 0:
            self.x_padded = np.pad(x, ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        else:
            self.x_padded = x

        # Calculate the dimensions of the output
        self.out_height = int((self.x_padded.shape[2] - self.kernel_size) / self.stride) + 1
        self.out_width = int((self.x_padded.shape[3] - self.kernel_size) / self.stride) + 1

        # Initialize output
        out = np.zeros((x.shape[0], self.out_channels, self.out_height, self.out_width))

        # Perform the convolution
        for i in range(self.out_height):
            for j in range(self.out_width):
                x_slice = self.x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size]
                for k in range(self.out_channels):
                    out[:, k, i, j] = np.sum(x_slice * self.weights[k, :, :, :], axis=(1, 2, 3)) + self.biases[k]

        return out

    def backward(self, d_out):
        d_weights = np.zeros_like(self.weights, dtype=np.float64)
        d_biases = np.zeros_like(self.biases, dtype=np.float64)
        d_x_padded = np.zeros_like(self.x_padded, dtype=np.float64)

        batch_size, _, height, width = d_out.shape

        # Backpropagation to calculate gradients
        for i in range(self.out_height):
            for j in range(self.out_width):
                x_slice = self.x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size]
                for k in range(self.out_channels):
                    # Adjust the summation to handle batch size correctly
                    d_weights[k, :, :, :] += np.sum(x_slice * d_out[:, k:k+1, i:i+1, j:j+1], axis=0)
                    d_biases[k] += np.sum(d_out[:, k, i, j], axis=0)
                    d_x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size] += self.weights[k, :, :, :] * d_out[:, k:k+1, i:i+1, j:j+1]

        # Remove padding from d_x_padded to get d_x
        if self.padding > 0:
            d_x = d_x_padded[:, :, self.padding:-self.padding, self.padding:-self.padding]
        else:
            d_x = d_x_padded

        # Update the weights and biases
        self.weights -= self.learning_rate * d_weights
        self.biases -= self.learning_rate * d_biases

        return d_x






In [15]:
# Initialize the Conv2d layer
conv = Conv2d(in_channels=1, out_channels=2, kernel_size=3, stride=1, padding=0)

# Set custom weights
conv.set_weights(w)

# Perform forward propagation
forward_output = conv.forward(x)
print("Forward output:\n", forward_output)
print("Expected forward output:\n", expected_forward_output)


Forward output:
 [[[[-4. -4.]
   [-4. -4.]]

  [[ 1.  1.]
   [ 1.  1.]]]]
Expected forward output:
 [[[[-4 -4]
   [-4 -4]]

  [[ 1  1]
   [ 1  1]]]]


In [16]:
# Reshape delta to be 4D
delta = delta.reshape(1, 2, 2, 2)


In [17]:
# Perform backward propagation
backward_output = conv.backward(delta)
print("Backward output:\n", backward_output)


Backward output:
 [[[[  0.   0.   0.   0.]
   [  0.  -5.   4.  -7.]
   [  0.  13.  27. -11.]
   [  0. -10. -11.   0.]]]]


### [Problem 3] Output size after 2-dimensional convolution

In [19]:
def calculate_output_size(N_in, P, F, S):
    """
    Calculate the output size of a 2D convolution operation.

    Parameters:
    N_in (int): Size of the input (number of features).
    P (int): Number of paddings in one direction.
    F (int): Filter size (kernel size).
    S (int): Stride size.

    Returns:
    int: Size of the output feature map.
    """
    return (N_in + 2 * P - F) // S + 1

# usage:
N_in_h = 28  # Input height
N_in_w = 28  # Input width
P_h = 1      # Padding in height direction
P_w = 1      # Padding in width direction
F_h = 3      # Filter height
F_w = 3      # Filter width
S_h = 1      # Stride in height direction
S_w = 1      # Stride in width direction

# Calculate output height and width
N_out_h = calculate_output_size(N_in_h, P_h, F_h, S_h)
N_out_w = calculate_output_size(N_in_w, P_w, F_w, S_w)

print(f"Output Height: {N_out_h}")
print(f"Output Width: {N_out_w}")


Output Height: 28
Output Width: 28


### [Problem 4] Creation of maximum pooling layer


In [20]:
import numpy as np

class MaxPool2D:
    def __init__(self, pool_size, stride):
        self.pool_size = pool_size  # Pooling size (height, width)
        self.stride = stride        # Stride size (height, width)

    def forward(self, x):
        self.x = x
        batch_size, channels, height, width = x.shape

        # Calculate output dimensions
        out_height = (height - self.pool_size[0]) // self.stride[0] + 1
        out_width = (width - self.pool_size[1]) // self.stride[1] + 1

        # Initialize output and indices to track max positions
        out = np.zeros((batch_size, channels, out_height, out_width))
        self.max_indices = np.zeros_like(out, dtype=int)

        # Perform max pooling
        for i in range(out_height):
            for j in range(out_width):
                x_slice = x[:, :, i*self.stride[0]:i*self.stride[0]+self.pool_size[0],
                                  j*self.stride[1]:j*self.stride[1]+self.pool_size[1]]
                out[:, :, i, j] = np.max(x_slice, axis=(2, 3))

                # Save the indices of the max values for backpropagation
                max_indices = np.argmax(x_slice.reshape(batch_size, channels, -1), axis=2)
                self.max_indices[:, :, i, j] = max_indices

        return out

    def backward(self, d_out):
        batch_size, channels, out_height, out_width = d_out.shape
        d_x = np.zeros_like(self.x)

        for i in range(out_height):
            for j in range(out_width):
                d_out_slice = d_out[:, :, i, j]

                # Get the original x slice shape and max indices
                h_start = i * self.stride[0]
                w_start = j * self.stride[1]
                h_end = h_start + self.pool_size[0]
                w_end = w_start + self.pool_size[1]

                x_slice = self.x[:, :, h_start:h_end, w_start:w_end]
                max_indices = self.max_indices[:, :, i, j]

                # Backpropagate the gradient to the max indices
                for b in range(batch_size):
                    for c in range(channels):
                        max_index = np.unravel_index(max_indices[b, c], x_slice.shape[2:])
                        d_x[b, c, h_start:h_end, w_start:w_end][max_index] += d_out_slice[b, c]

        return d_x

# usage:
x = np.array([[[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12],
                [13, 14, 15, 16]]]])

pool = MaxPool2D(pool_size=(2, 2), stride=(2, 2))
out = pool.forward(x)
print("Forward output:\n", out)

# delta for backpropagation
delta = np.array([[[[1, -1],
                    [2, -2]]]])

d_x = pool.backward(delta)
print("Backward output:\n", d_x)


Forward output:
 [[[[ 6.  8.]
   [14. 16.]]]]
Backward output:
 [[[[ 0  0  0  0]
   [ 0  1  0 -1]
   [ 0  0  0  0]
   [ 0  2  0 -2]]]]


### [Problem 5] (Advance task) Creating average pooling

In [22]:
import numpy as np

class AveragePool2D:
    def __init__(self, pool_size, stride):
        self.pool_size = pool_size  # Pooling size (height, width)
        self.stride = stride        # Stride size (height, width)

    def forward(self, x):
        self.x = x
        batch_size, channels, height, width = x.shape

        # Calculate output dimensions
        out_height = (height - self.pool_size[0]) // self.stride[0] + 1
        out_width = (width - self.pool_size[1]) // self.stride[1] + 1

        # Initialize output
        out = np.zeros((batch_size, channels, out_height, out_width))

        # Perform average pooling
        for i in range(out_height):
            for j in range(out_width):
                x_slice = x[:, :, i*self.stride[0]:i*self.stride[0]+self.pool_size[0],
                                  j*self.stride[1]:j*self.stride[1]+self.pool_size[1]]
                out[:, :, i, j] = np.mean(x_slice, axis=(2, 3))

        return out

    def backward(self, d_out):
        batch_size, channels, out_height, out_width = d_out.shape
        d_x = np.zeros_like(self.x, dtype=np.float64)  # Ensure d_x is float64

        # Calculate the gradient for each slice
        for i in range(out_height):
            for j in range(out_width):
                h_start = i * self.stride[0]
                w_start = j * self.stride[1]
                h_end = h_start + self.pool_size[0]
                w_end = w_start + self.pool_size[1]

                gradient = d_out[:, :, i, j][:, :, None, None] / (self.pool_size[0] * self.pool_size[1])
                d_x[:, :, h_start:h_end, w_start:w_end] += gradient

        return d_x

# usage:
x = np.array([[[[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12],
                [13, 14, 15, 16]]]])

pool = AveragePool2D(pool_size=(2, 2), stride=(2, 2))
out = pool.forward(x)
print("Forward output:\n", out)

# delta for backpropagation
delta = np.array([[[[1, -1],
                    [2, -2]]]])

d_x = pool.backward(delta)
print("Backward output:\n", d_x)


Forward output:
 [[[[ 3.5  5.5]
   [11.5 13.5]]]]
Backward output:
 [[[[ 0.25  0.25 -0.25 -0.25]
   [ 0.25  0.25 -0.25 -0.25]
   [ 0.5   0.5  -0.5  -0.5 ]
   [ 0.5   0.5  -0.5  -0.5 ]]]]


### [Problem 6] Smoothing

In [23]:
import numpy as np

class Flatten:
    def __init__(self):
        self.input_shape = None

    def forward(self, x):
        # Store the input shape for use in the backward pass
        self.input_shape = x.shape
        # Flatten the input array
        return x.reshape(x.shape[0], -1)

    def backward(self, d_out):
        # Reshape the gradient to match the original input shape
        return d_out.reshape(self.input_shape)

# usage:
x = np.array([[[[1, 2, 3],
                [4, 5, 6]],

               [[7, 8, 9],
                [10, 11, 12]]]])

flatten = Flatten()
out = flatten.forward(x)
print("Forward output:\n", out)

# delta for backpropagation
delta = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]])

d_x = flatten.backward(delta)
print("Backward output:\n", d_x)


Forward output:
 [[ 1  2  3  4  5  6  7  8  9 10 11 12]]
Backward output:
 [[[[ 1  2  3]
   [ 4  5  6]]

  [[ 7  8  9]
   [10 11 12]]]]


### [Problem 7] Learning and estimation

In [37]:
def softmax(x):
    # Subtracting the max value for numerical stability
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / np.sum(e_x, axis=1, keepdims=True)


In [38]:
def cross_entropy_loss(y_true, y_pred):
    # Avoiding log(0) by adding a small value
    return -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))


In [4]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Define the Conv2d class
class Conv2d:
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.weights = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) * 0.01
        self.biases = np.zeros(out_channels)
        self.d_weights = np.zeros_like(self.weights)
        self.d_biases = np.zeros_like(self.biases)
        self.x_padded = None

    def forward(self, x):
        if self.padding > 0:
            x_padded = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        else:
            x_padded = x

        self.x_padded = x_padded
        batch_size, in_channels, height, width = x_padded.shape
        out_height = (height - self.kernel_size) // self.stride + 1
        out_width = (width - self.kernel_size) // self.stride + 1

        out = np.zeros((batch_size, self.out_channels, out_height, out_width))

        for i in range(out_height):
            for j in range(out_width):
                x_slice = x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size]
                for k in range(self.out_channels):
                    out[:, k, i, j] = np.sum(x_slice * self.weights[k, :, :, :], axis=(1, 2, 3)) + self.biases[k]

        return out

    def backward(self, d_out):
        batch_size, out_channels, out_height, out_width = d_out.shape
        _, in_channels, height, width = self.x_padded.shape

        d_x_padded = np.zeros_like(self.x_padded)
        d_weights = np.zeros_like(self.weights)
        d_biases = np.zeros_like(self.biases)

        for i in range(out_height):
            for j in range(out_width):
                x_slice = self.x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size]
                for k in range(self.out_channels):
                    d_out_k = d_out[:, k, i, j][:, np.newaxis, np.newaxis, np.newaxis]  # Shape (batch_size, 1, 1, 1)

                    # Compute gradient w.r.t. weights
                    d_weights[k, :, :, :] += np.sum(x_slice * d_out_k, axis=0)

                    # Compute gradient w.r.t. biases
                    d_biases[k] += np.sum(d_out[:, k, i, j])

                    # Compute gradient w.r.t. input
                    # Expand dimensions for broadcasting: (batch_size, in_channels, kernel_size, kernel_size)
                    expanded_weights = self.weights[k, :, :, :][np.newaxis, :, :, :]  # Shape (1, in_channels, kernel_size, kernel_size)
                    d_x_padded[:, :, i*self.stride:i*self.stride+self.kernel_size, j*self.stride:j*self.stride+self.kernel_size] += expanded_weights * d_out_k

        # Remove padding from d_x_padded
        if self.padding > 0:
            d_x_padded = d_x_padded[:, :, self.padding:-self.padding, self.padding:-self.padding]

        return d_x_padded




    def update(self, learning_rate):
        self.weights -= learning_rate * self.d_weights
        self.biases -= learning_rate * self.d_biases


# Define the ReLU activation function
def relu(x):
    return np.maximum(x, 0)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

# Define the Flatten class
class Flatten:
    def forward(self, x):
        self.input_shape = x.shape
        return x.reshape(x.shape[0], -1)

    def backward(self, d_out):
        return d_out.reshape(self.input_shape)

# Define the FullyConnected layer
class FullyConnected:
    def __init__(self, in_features, out_features):
        self.weights = np.random.randn(in_features, out_features) * 0.01
        self.biases = np.zeros(out_features)
        self.d_weights = np.zeros_like(self.weights)
        self.d_biases = np.zeros_like(self.biases)
        self.x = None

    def forward(self, x):
        self.x = x
        return np.dot(x, self.weights) + self.biases

    def backward(self, d_out):
        d_x = np.dot(d_out, self.weights.T)
        self.d_weights = np.dot(self.x.T, d_out)
        self.d_biases = np.sum(d_out, axis=0)
        return d_x

    def update(self, learning_rate):
        self.weights -= learning_rate * self.d_weights
        self.biases -= learning_rate * self.d_biases

# Define the Softmax function
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

# Define the loss function and its derivative
def cross_entropy_loss(y_true, y_pred):
    return -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))

def cross_entropy_loss_derivative(y_true, y_pred):
    return y_pred - y_true



In [6]:
def train_and_evaluate_conv2d():
    # Load and preprocess data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = np.expand_dims(x_train, axis=1).astype(np.float32) / 255.0
    x_test = np.expand_dims(x_test, axis=1).astype(np.float32) / 255.0
    y_train = to_categorical(y_train, 10)
    y_test = to_categorical(y_test, 10)

    # Define network architecture
    conv1 = Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1)
    flatten = Flatten()
    fc1 = FullyConnected(in_features=8 * 28 * 28, out_features=10)

    # Training parameters
    learning_rate = 0.001
    epochs = 1
    batch_size = 128

    # Training loop
    for epoch in range(epochs):
        for i in range(0, len(x_train), batch_size):
            x_batch = x_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            # Forward pass
            out = conv1.forward(x_batch)
            out = relu(out)
            out_flat = flatten.forward(out)
            out = fc1.forward(out_flat)
            out = softmax(out)

            # Compute loss
            loss = cross_entropy_loss(y_batch, out)
            print(f'Epoch {epoch+1}, Batch {i//batch_size+1}, Loss: {loss}')

            # Backward pass
            d_out = cross_entropy_loss_derivative(y_batch, out)
            d_out = fc1.backward(d_out)
            d_out = flatten.backward(d_out)
            d_out = relu_derivative(conv1.forward(x_batch)) * d_out
            d_out = conv1.backward(d_out)

            # Update weights
            fc1.update(learning_rate)
            conv1.update(learning_rate)

        # Evaluation after each epoch
        test_out = conv1.forward(x_test)
        test_out = relu(test_out)
        test_out = flatten.forward(test_out)
        test_out = fc1.forward(test_out)
        test_out = softmax(test_out)

        accuracy = np.mean(np.argmax(test_out, axis=1) == np.argmax(y_test, axis=1))
        print(f'Epoch {epoch+1}, Accuracy: {accuracy * 100:.2f}%')

train_and_evaluate_conv2d()


Epoch 1, Batch 1, Loss: 2.302859377245677
Epoch 1, Batch 2, Loss: 2.3024328154442553
Epoch 1, Batch 3, Loss: 2.301961384684776
Epoch 1, Batch 4, Loss: 2.300749422898787
Epoch 1, Batch 5, Loss: 2.301431071226006
Epoch 1, Batch 6, Loss: 2.3017124166985936
Epoch 1, Batch 7, Loss: 2.3006252759185206
Epoch 1, Batch 8, Loss: 2.298890888552586
Epoch 1, Batch 9, Loss: 2.299764526900762
Epoch 1, Batch 10, Loss: 2.298851757968366
Epoch 1, Batch 11, Loss: 2.300083192736074
Epoch 1, Batch 12, Loss: 2.299542113769463
Epoch 1, Batch 13, Loss: 2.2986976119316846
Epoch 1, Batch 14, Loss: 2.2979698969620808
Epoch 1, Batch 15, Loss: 2.2978214991043027
Epoch 1, Batch 16, Loss: 2.298580056265103
Epoch 1, Batch 17, Loss: 2.2975747962894233
Epoch 1, Batch 18, Loss: 2.2985463491072817
Epoch 1, Batch 19, Loss: 2.297231103042497
Epoch 1, Batch 20, Loss: 2.296445054811387
Epoch 1, Batch 21, Loss: 2.296446867855397
Epoch 1, Batch 22, Loss: 2.294685629632518
Epoch 1, Batch 23, Loss: 2.292800422966841
Epoch 1, Bat

### [Problem 8] (Advance assignment) LeNet

In [7]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical


In [9]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
x_test = x_test.reshape(-1, 28, 28, 1).astype('float32') / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)


In [10]:
model = models.Sequential()

# First Convolutional Layer
model.add(layers.Conv2D(6, (5, 5), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))

# Second Convolutional Layer
model.add(layers.Conv2D(16, (5, 5), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))

# Flattening Layer
model.add(layers.Flatten())

# Fully Connected Layer 1
model.add(layers.Dense(120, activation='relu'))

# Fully Connected Layer 2
model.add(layers.Dense(84, activation='relu'))

# Output Layer
model.add(layers.Dense(10, activation='softmax'))


In [11]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [12]:
model.fit(x_train, y_train, epochs=10, batch_size=128, validation_data=(x_test, y_test))

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 53ms/step - accuracy: 0.8105 - loss: 0.6545 - val_accuracy: 0.9727 - val_loss: 0.0900
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 53ms/step - accuracy: 0.9710 - loss: 0.0947 - val_accuracy: 0.9763 - val_loss: 0.0794
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 53ms/step - accuracy: 0.9791 - loss: 0.0662 - val_accuracy: 0.9847 - val_loss: 0.0498
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 52ms/step - accuracy: 0.9839 - loss: 0.0492 - val_accuracy: 0.9844 - val_loss: 0.0501
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 58ms/step - accuracy: 0.9860 - loss: 0.0432 - val_accuracy: 0.9857 - val_loss: 0.0442
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 54ms/step - accuracy: 0.9891 - loss: 0.0347 - val_accuracy: 0.9875 - val_loss: 0.0374
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x79312b56eaa0>

In [13]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9863 - loss: 0.0444
Test Accuracy: 0.9885


### [Problem 9] (Advance assignment) Survey of famous image recognition models........

1. AlexNet (2012)

Developed by Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton.


### Architecture summary:
- Layers - 8 layers (5 convolutional layers followed by 3 fully connected layers).

- Innovations
1. Introduced the use of ReLU (Rectified Linear Unit) activation function, which helped in faster training.


2. Used dropout in the fully connected layers to reduce overfitting.
3. Employed data augmentation techniques like random cropping and flipping to enhance generalization.
4. Implemented overlapping max-pooling to reduce the spatial dimensions while retaining important information.

- Impact - AlexNet won the 2012 ImageNet Large Scale Visual Recognition Challenge (ILSVRC) with a significant margin, marking a breakthrough in deep learning and computer vision.

### Applications
- Image classification
-  Object detection
- Feature extraction for various computer vision tasks

2. VGG16 (2014)

Developed by Karen Simonyan and Andrew Zisserman from the University of Oxford.

### Architecture summary
- Layers - 16 layers (13 convolutional layers followed by 3 fully connected layers).

- Innovations
1. The architecture consists of very small (3x3) convolution filters stacked together, which was a novel approach at the time.
2. VGG16 increased the depth of the network significantly compared to AlexNet, leading to better feature extraction capabilities.
3. The consistent use of 3x3 filters across all layers allowed the network to capture more complex patterns, leading to improved performance.

- Impact - VGG16 demonstrated that increasing the depth of the network could significantly improve accuracy in image recognition tasks. It became one of the most popular models for transfer learning.

### Applications
1. Image classification
2. Object detection (used in pipelines like Fast R-CNN)
3. Feature extraction in various computer vision applications

### [Problem 10] Calculation of output size and number of parameters

### 1. Convolution Layer 1

Input size: 144 x 144, 3 channels

Filter size: 3 x 3, 6 output channels

Stride: 1

Padding: None (0)

Output Size:

𝐻<sub>out</sub> = (144 - 3 + 2 × 0) /1 + 1 = 142

𝑊<sub>out</sub> = (144 - 3 + 2 × 0) /1 + 1 = 142


So, the output size is 142 × 142 with 6 output channels.

Number of Parameters = (3 × 3 × 3 × 6) + 6=162 + 6=168


### 2. Convolution Layer 2
Input size: 60 x 60, 24 channels

Filter size: 3 x 3, 48 output channels

Stride: 1

Padding: None (0)

Output Size:

𝐻<sub>out</sub> = (60 - 3 + 2 × 0) /1 + 1 = 58

𝑊<sub>out</sub> = (0 - 3 + 2 × 0) /1 + 1 = 58

So, the output size is 58×58 with 48 output channels.

Number of Parameters=(3×3×24×48)+48=31,104+48=31,152

### 3. Convolution Layer 3
Input size: 20 x 20, 10 channels

Filter size: 3 x 3, 20 output channels

Stride: 2

Padding: None (0)

𝐻<sub>out</sub> = (20 - 3 + 2 × 0) /1 + 1 = 9.5

Since you can't have a fractional output size in practical scenarios, the framework would truncate or round down the size, so:

𝐻<sub>out</sub>=𝑊<sub>out</sub> = 9

So, the output size is 9×9 with 20 output channels.

Number of Parameters=(3×3×10×20)+20=1,800+20=1,820





### [Problem 11] (Advance assignment) Survey on filter size

1. Efficiency and computational cost

A 3×3 filter has fewer parameters than a larger 7×7 filter, which reduces the computational cost and memory usage. For example, a 3×3 filter has 9 parameters, while a 7×7 filter has 49. Using smaller filters allows for more manageable and efficient training of deep neural networks

Instead of using a single 7×7 filter, stacking multiple 3×3 filters can achieve a larger receptive field (the area of the input image that affects a particular output pixel) with fewer parameters. For instance, two consecutive 3×3 convolutions have a receptive field equivalent to a 5×5 filter, and three consecutive 3×3 filters cover the same area as a 7×7 filter, but with greater depth and computational efficiency.

2. Effective capture of local patterns

Smaller filters like 3×3 are better suited for capturing fine-grained features, such as edges and textures, which are crucial for recognizing complex patterns in images. Larger filters might overlook these subtle details.

By stacking multiple 3×3 filters, non-linearities (like ReLU) are introduced between layers, which enhances the model's ability to capture complex features and improve the overall representational power of the network.


3. Flexibility and modularity

3×3 filters are versatile and can be combined in various ways to build deep networks, making them a flexible choice in designing architectures like VGG and ResNet. The modular nature allows for easy scaling and adaptation to different tasks and datasets.

### The effect of a 1×1 filter with no height or width direction
1. Dimensionality reduction

A 1×1 filter acts on individual pixels across the depth (channels) of the input image, allowing the model to perform a weighted combination of the input channels. This is particularly useful for reducing the number of channels (dimensionality reduction) while retaining important information, which can decrease the computational cost in subsequent layers.

In architectures like ResNet, 1×1 convolutions are used in bottleneck layers to compress the input features, allowing for deeper networks without significantly increasing the number of parameters.

2. Adding non-linearity

Although a 1×1 filter doesn't change the spatial dimensions, it introduces non-linearity through the activation function (e.g., ReLU) applied after the convolution. This adds complexity and enables the network to learn more intricate patterns and relationships between channels.

3. Cross-channel interactions

1×1 convolutions enable interaction between different feature maps, allowing the network to combine information from multiple channels in a flexible way. This can be particularly effective in refining features learned from previous layers and improving the overall expressiveness of the model.

4. Efficient use in deep architectures

In architectures like MobileNet, 1×1 convolutions are used in conjunction with depthwise separable convolutions to significantly reduce the computational load while maintaining accuracy. This makes 1×1 filters an essential tool for building efficient deep learning models, especially for mobile and embedded devices.
