# FeedForward Neural Network

In [42]:
import numpy as np


class FeedForwardLayer():
    '''
    Two-layer fully connected network with sigmoid activations.

    The network computes ``sigmoid(sigmoid(X @ W1 + b1) @ W2 + b2)`` and is
    trained with full-batch gradient descent on a mean squared error loss.
    '''

    def __init__(self, input_size, output_size, hidden_size=1024) -> None:
        '''
        input_size: dimension D of each input sample
        output_size: dimension O of each output sample
        hidden_size: number of hidden units (defaults to 1024)
        '''
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # All parameters are drawn from a standard normal distribution.
        self.W1 = np.random.randn(input_size, self.hidden_size)
        self.b1 = np.random.randn(self.hidden_size)
        self.W2 = np.random.randn(self.hidden_size, output_size)
        self.b2 = np.random.randn(self.output_size)

    def sigmoid(self, X):
        '''
        Element-wise logistic function 1 / (1 + exp(-X)).

        Evaluated as exp(-log(1 + exp(-X))) so that large negative inputs
        do not overflow inside exp().
        '''
        return np.exp(-np.logaddexp(0, -X))

    def sigmoid_derivative(self, X):
        '''
        Element-wise derivative of the logistic function evaluated at X.
        '''
        sig_X = self.sigmoid(X)
        return sig_X * (1 - sig_X)

    def forward(self, X):
        '''
        X: input data, shape: (N, D), where N is the number of samples, D is the dimension of the input data

        Returns the network output of shape (N, O). The intermediate values
        z1, a1, z2 and a2 are cached on the instance for use by backward().
        '''
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)

        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y, learning_rate):
        '''
        X: input data, shape: (N, D)
        y: output labels, shape: (N, O)
        learning_rate: step size of the gradient descent update

        Must be called after forward(X), because it reads the cached
        activations. Updates W1, b1, W2 and b2 in place.
        '''
        # Average over the batch so the step size does not grow with N.
        # This is the gradient of loss = 1 / (2N) * sum((a2 - y) ** 2).
        num_samples = max(X.shape[0], 1)
        a2_grad = (self.a2 - y) / num_samples

        # sigmoid'(z) = a * (1 - a); reuse the cached activations instead of
        # evaluating the sigmoid again.
        z2_grad = a2_grad * self.a2 * (1 - self.a2)
        W2_grad = self.a1.T @ z2_grad
        b2_grad = np.sum(z2_grad, axis=0)

        # Back-propagate through W2 before it is updated below.
        a1_grad = z2_grad @ self.W2.T
        z1_grad = a1_grad * self.a1 * (1 - self.a1)
        W1_grad = X.T @ z1_grad
        b1_grad = np.sum(z1_grad, axis=0)

        # Update parameters
        self.W2 -= learning_rate * W2_grad
        self.b2 -= learning_rate * b2_grad
        self.W1 -= learning_rate * W1_grad
        self.b1 -= learning_rate * b1_grad

    def train(self, X, y, epochs, learning_rate):
        '''
        Run `epochs` full-batch gradient descent steps on (X, y).
        '''
        for _ in range(epochs):
            self.forward(X)
            self.backward(X, y, learning_rate=learning_rate)





In [45]:
import numpy as np

# Problem dimensions for the synthetic regression task
num_samples = 1000
input_size = 10
output_size = 5

# Inputs are standard-normal samples
X = np.random.randn(num_samples, input_size)

# Ground-truth parameters of a random linear map
weights = np.random.randn(input_size, output_size)
bias = np.random.randn(output_size)

# Targets: the linear map squashed by a sigmoid ...
y = 1 / (1 + np.exp(-(X.dot(weights) + bias)))

# ... and perturbed with Gaussian noise (standard deviation 0.1)
noise = np.random.normal(loc=0, scale=0.1, size=y.shape)
y = y + noise

# Build the network to be fitted
layer = FeedForwardLayer(input_size, output_size)

# Fit it with full-batch gradient descent
epochs = 100
learning_rate = 0.01
layer.train(X, y, epochs=epochs, learning_rate=learning_rate)

# Fresh inputs for evaluation
test_samples = 10
test_X = np.random.randn(test_samples, input_size)

# Noise-free targets for the evaluation inputs
test_y = 1 / (1 + np.exp(-(test_X.dot(weights) + bias)))

# Network predictions for the evaluation inputs
predicted_y = layer.forward(test_X)

print("Predicted output:")
print(predicted_y.round(1))
print("True output:")
print(test_y.round(1))

Predicted output:
[[1.  0.  0.  1.  0. ]
 [0.  0.  0.  1.  0. ]
 [0.1 0.  0.  0.  0. ]
 [1.  0.  0.  0.2 0. ]
 [1.  1.  0.  1.  0. ]
 [1.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0. ]
 [1.  0.  0.  1.  0. ]
 [1.  1.  0.  0.  0. ]]
True output:
[[0.8 0.4 0.7 0.8 0.7]
 [0.  0.  0.  0.7 0.1]
 [0.3 0.  0.  0.1 0.7]
 [0.8 0.  0.  0.5 0.9]
 [1.  0.9 0.  1.  0.8]
 [0.9 0.1 0.2 0.3 1. ]
 [0.  0.  0.  0.2 0.3]
 [0.4 0.  0.3 0.1 0.4]
 [1.  0.  1.  1.  0.4]
 [1.  0.6 0.8 0.2 0.3]]


# FeedForward with Softmax

For classification problems, we normally use the softmax activation function in the output layer, paired with the cross-entropy loss function. Compared to the MSE loss, this combination has a simple derivative, is easy to implement, and is widely used in practice.  

For a detailed derivation of the softmax and cross-entropy loss, you can refer to https://chat.openai.com/share/018aa327-76e9-43bd-a326-4c7b03f5db02.

The derivative of the softmax function (as for all normalizing functions) is a Jacobian matrix. It is a square matrix whose number of rows and columns equals the number of classes. Writing $s = \mathrm{softmax}(z)$, the diagonal elements are $\partial s_i / \partial z_i = s_i (1 - s_i)$ and the off-diagonal elements are $\partial s_i / \partial z_j = -s_i s_j$ for $i \neq j$. For more details, see https://e2eml.school/softmax.

In [None]:
import numpy as np


class FeedForwardWithSoftmax():
    '''
    Two-layer classifier: sigmoid hidden layer followed by a softmax output.

    Trained with full-batch gradient descent on the mean cross-entropy loss.
    '''

    def __init__(self, input_size, output_size, hidden_size=1024):
        '''
        input_size: dimension I of each input sample
        output_size: number of classes O
        hidden_size: number of hidden units (defaults to 1024)
        '''
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        # All parameters are drawn from a standard normal distribution.
        self.W1 = np.random.randn(self.input_size, self.hidden_size)
        self.b1 = np.random.randn(self.hidden_size)

        self.W2 = np.random.randn(self.hidden_size, self.output_size)
        self.b2 = np.random.randn(self.output_size)

    def sigmoid(self, x):
        '''
        Element-wise logistic function 1 / (1 + exp(-x)).

        Evaluated as exp(-log(1 + exp(-x))) so that large negative inputs
        do not overflow inside exp().
        '''
        return np.exp(-np.logaddexp(0, -x))

    def d_sigmoid(self, x):
        '''
        Element-wise derivative of the logistic function evaluated at x.
        '''
        sig_x = self.sigmoid(x)
        return sig_x * (1 - sig_x)

    def softmax(self, x):
        '''
        Softmax over the last axis of x, so each sample (row) sums to 1.

        The maximum along that axis is subtracted before exponentiating;
        this leaves the result unchanged and keeps exp() from overflowing.
        '''
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def d_softmax(self, x):
        '''
        Jacobian of the softmax over the last axis of x.

        For x of shape [..., O] the result has shape [..., O, O] with
        entry [..., i, j] = s_i * (delta_ij - s_j), where s = softmax(x).
        '''
        s = self.softmax(np.asarray(x))
        identity = np.eye(s.shape[-1])
        return s[..., :, None] * (identity - s[..., None, :])

    def forward(self, X):
        '''
        X has shape [N, I], N is number of sample. I is input dimension

        Returns the class probabilities of shape [N, O]. The intermediate
        values z1, h1, z2 and output are cached for use by backward().
        '''
        self.z1 = X @ self.W1 + self.b1
        self.h1 = self.sigmoid(self.z1)

        self.z2 = self.h1 @ self.W2 + self.b2
        self.output = self.softmax(self.z2)

        return self.output

    @staticmethod
    def cross_entropy_loss(y_true, y_pred):
        '''
        y_true has shape [N, O]. N is number of sample. O is number of class.
        y_pred has the same shape and holds the predicted probabilities.

        Returns the cross-entropy averaged over the N samples. Declared as
        a static method so it can be called on the class or on an instance.
        '''
        y_true = np.asarray(y_true)
        # Clip so that log() never sees an exact 0.
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.sum(y_true * np.log(y_pred))
        mean_loss = loss / y_true.shape[0]
        return mean_loss

    def backward(self, X, y, learning_rate):
        '''
        X has shape [N, I]
        y has shape [N, O]

        Must be called after forward(X), because it reads the cached
        activations. Updates W1, b1, W2 and b2 in place.
        '''
        # Gradient of the mean cross-entropy loss w.r.t. the softmax input:
        # (output - y) per sample, averaged over the batch to match
        # cross_entropy_loss.
        num_samples = max(X.shape[0], 1)
        d_z2 = (self.output - y) / num_samples
        d_w2 = self.h1.T @ d_z2
        d_b2 = np.sum(d_z2, axis=0)

        # Back-propagate through W2 before it is updated below.
        d_h1 = d_z2 @ self.W2.T
        d_z1 = d_h1 * self.h1 * (1 - self.h1)
        d_W1 = X.T @ d_z1
        d_b1 = np.sum(d_z1, axis=0)

        self.W1 -= learning_rate * d_W1
        self.b1 -= learning_rate * d_b1
        self.W2 -= learning_rate * d_w2
        self.b2 -= learning_rate * d_b2

    def train(self, X, y, epochs, learning_rate):
        '''
        Run `epochs` full-batch gradient descent steps on (X, y).
        '''
        for _ in range(epochs):
            self.forward(X)
            self.backward(X, y, learning_rate)

    


## Finding the Jacobian matrix of the softmax function

In [55]:
import numpy as np

def softmax(x, axis=None):
    '''
    Softmax of x, normalized over `axis`.

    x: array-like of scores
    axis: axis (or axes) to normalize over; the default None normalizes
          over all elements of x

    The maximum is subtracted before exponentiating. This does not change
    the result but keeps exp() from overflowing to inf (and the ratio
    from becoming NaN) for large scores.
    '''
    x = np.asarray(x)
    if x.size == 0:
        # np.max is undefined for empty input; there is nothing to normalize.
        return np.exp(x)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def softmax_jacob(x):
    '''
    Jacobian matrix of softmax evaluated at the 1-D vector x.

    Entry [i, j] is s_i * (delta_ij - s_j) with s = softmax(x): the
    diagonal holds s_i * (1 - s_i) and the off-diagonal holds -s_i * s_j,
    so the matrix is symmetric.
    '''
    # Evaluate the softmax once and reuse it for both terms.
    probs = softmax(x)
    return np.diag(probs) - np.outer(probs, probs)


# Ten standard-normal scores to use as a sample softmax input
x = np.random.standard_normal(size=10)



array([[0.16285523, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.07200342, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.08273631, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.04645867, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.06146745,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.15080146, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.04559796, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.       