#### Operations

In the forward pass, an operation in a neural network receives an input, applies some computation to it (which may depend on a parameter, such as the weight matrix in a matrix multiplication), and returns an output. In the backward pass, the operation receives an output gradient: the gradient of the loss function with respect to the operation's output. This is exactly the input gradient computed by the next operation in the network, passed backward to the current operation. From it, the operation computes the gradient of the loss with respect to its input (the input gradient) and, if it has parameters such as weights, the gradient of the loss with respect to those parameters as well. Note that the output and the output gradient must have the same shape, and the same holds for the input and the input gradient.

**TODO:** needs clarification on the input/output grad confusion! and the shapes (assertion checks)!

Layers are a series of linear operations followed by a nonlinear operation, e.g. sigmoid function, called the activation function, which outputs the activations. The zeroth layer of a network is the input layer, $x$, the last one is the output layer and the layers in between are the hidden layers.

In [None]:
class Op:
    """Base class for a single operation.

    Subclasses implement ``_output`` (forward computation) and
    ``_input_grad`` (backward computation). ``forward`` and ``backward``
    cache their argument/result on the instance so that the helpers can
    read them as ``self.input_``, ``self.output`` and ``self.input_grad``.
    """
    def __init__(self):
        pass

    def forward(self, _input):
        """Store the input, compute the output via ``_output`` and return it."""
        # The helpers read ``self.input_``; the original assigned from an
        # undefined name (``input_``) and raised NameError on every call.
        self.input_ = _input
        self.output = self._output()
        return self.output

    def backward(self, output_grad):
        """Compute, store and return the gradient wrt. this operation's input.

        ``output_grad`` is the gradient of the loss wrt. this operation's
        output.
        """
        self.input_grad = self._input_grad(output_grad)
        return self.input_grad

    def _output(self):
        """Helper method to calculate the forward pass of the operation.
        Each operation needs to define this method.
        """
        raise NotImplementedError(f"_output helper function not implemented for {self}")

    def _input_grad(self, output_grad):
        """Helper method to calculate the backward pass of the operation.
        Each operation needs to define this method.
        """
        raise NotImplementedError(f"_input_grad helper function not implemented for {self}")
        

In [None]:
class ParamOp(Op):
    """An operation that owns a parameter.

    On top of the input gradient, the backward pass also computes the
    gradient of the loss wrt. the parameter and keeps it in ``param_grad``.
    """
    def __init__(self, param):
        super().__init__()
        self.param = param

    def backward(self, output_grad):
        """Compute both gradients; return only the input gradient."""
        # The base class computes and stores ``self.input_grad``.
        grad_wrt_input = super().backward(output_grad)
        self.param_grad = self._param_grad(output_grad)
        return grad_wrt_input

    def _param_grad(self, output_grad):
        """Helper that returns the gradient wrt. the operation's parameter.
        Every parameterised operation has to override it.
        """
        raise NotImplementedError(f"_param_grad helper function not implemented for {self}")

In [None]:
class WeightMultiply(ParamOp):
    """Matrix multiplication operation: output = input . W."""
    def __init__(self, W):
        """Store the weight matrix ``W`` as this operation's parameter."""
        super().__init__(W)

    def _output(self):
        """Return the matrix product of the stored input and the weights."""
        return np.dot(self.input_, self.param)

    def _input_grad(self, output_grad):
        """Given output=X.W, and loss:=loss(output), computes
        dloss/dinput = dloss/doutput . doutput/dinput = output_grad . W^T
        """
        # The weights live in ``self.param``; the original referenced a bare
        # ``W`` that does not exist in this scope (NameError).
        return np.dot(output_grad, np.transpose(self.param, (1, 0)))

    def _param_grad(self, output_grad):
        """Given output=X.W, and loss:=loss(output), computes
        dloss/dparam = dloss/doutput . doutput/dparam = X^T . output_grad
        """
        return np.dot(np.transpose(self.input_, (1, 0)), output_grad)

In [21]:
class BiasAdd(ParamOp):
    """Operation that adds a bias to its input."""
    def __init__(self, B):
        """Keep the bias ``B`` as the parameter.

        B is expected to be a single row, i.e. B.shape[0] == 1.
        """
        super().__init__(B)

    def _output(self):
        """Return input + bias."""
        return self.input_ + self.param

    def _input_grad(self, output_grad):
        """For output = input + B the local derivative wrt. the input is one
        everywhere, so dloss/dinput = 1 * output_grad.
        """
        local_derivative = np.ones_like(self.input_)
        return local_derivative * output_grad

    def _param_grad(self, output_grad):
        """For output = input + B the local derivative wrt. B is one
        everywhere; the result is summed over axis 0 and reshaped to one row.
        """
        per_row = np.ones_like(self.param) * output_grad
        column_totals = np.sum(per_row, axis=0)
        # Back to a single row with one entry per column of ``per_row``.
        return column_totals.reshape(1, per_row.shape[1])

In [20]:
class Sigmoid(Op):
    """Sigmoid activation: output = 1 / (1 + exp(-input))."""

    def _output(self):
        """Apply the sigmoid element-wise to the stored input."""
        denominator = 1.0 + np.exp(-self.input_)
        return 1.0 / denominator

    def _input_grad(self, output_grad):
        """The sigmoid's derivative is output * (1 - output); multiply it by
        the incoming gradient to get dloss/dinput.
        """
        local_slope = self.output * (1.0 - self.output)
        return local_slope * output_grad

In [30]:
class Linear(Op):
    """The identity activation function."""
    def __init__(self):
        super().__init__()

    def _output(self):
        """Return the stored input unchanged."""
        # ``Op.forward`` stores the input as ``self.input_``; the original
        # read ``self._input``, which is never set (AttributeError).
        return self.input_

    def _input_grad(self, output_grad):
        """Given output=input, and loss:=loss(output), computes
        dloss/dinput = dloss/doutput * doutput/dinput = output_grad * I
        """
        return np.ones_like(self.input_) * output_grad

In [19]:
class Layer:
    """The base class for layers.

    A layer holds an ordered list of operations (``ops``). ``forward``
    feeds the input through them in order; ``backward`` feeds a gradient
    through them in reverse and then collects the parameter gradients.
    """
    def __init__(self, neurons):
        self.neurons = neurons # number of outputs of the layer
        self.first = True # True until the first forward call has run _setup
        self.params = []
        self.param_grads = []
        self.ops = []

    def _setup(self, num_in):
        """Define the operations of the layer.
        Each layer needs to define this method.

        ``forward`` calls this once, passing the first input it receives.
        """
        raise NotImplementedError(f"_setup helper function is not implemented for {self}")

    def forward(self, input_):
        """Feeds the input through the operations in the layer."""
        if self.first:
            # Build the operations lazily, on the first input seen.
            self._setup(input_)
            self.first = False
        self.input_ = input_
        for op in self.ops:
            input_ = op.forward(input_)
        self.output = input_
        return self.output

    def backward(self, output_grad):
        """Feed the output_grad backward through the operations of the layer.

        Returns the gradient wrt. the layer's input and refreshes
        ``self.param_grads`` as a side effect.
        """
        # Each operation's input gradient is the output gradient of the
        # operation that precedes it.
        for op in reversed(self.ops):
            output_grad = op.backward(output_grad)
        input_grad = output_grad
        self._param_grads()
        return input_grad

    def _param_grads(self):
        """Gather param_grads from param operations of the layer."""
        # Rebuild the list on every call. Appending to the existing list made
        # it grow with each backward pass, with stale gradients at the front.
        self.param_grads = [op.param_grad for op in self.ops
                            if isinstance(op, ParamOp)]

    def _params(self):
        """Gather all the params from the operations of the layer."""
        # Rebuilt for the same reason as in ``_param_grads``.
        self.params = [op.param for op in self.ops
                       if isinstance(op, ParamOp)]

In [18]:
class Dense(Layer):
    """A fully connected layer: weight multiply, bias add, then activation."""
    def __init__(self, neurons, activation=None):
        """Create the layer.

        ``activation`` is the operation applied last; when omitted, a fresh
        ``Sigmoid()`` is created for this layer.
        """
        super().__init__(neurons)
        # Operations cache their input/output between forward and backward.
        # A ``Sigmoid()`` default in the signature would be one instance
        # shared by every Dense layer, so a later layer's forward pass would
        # overwrite the state an earlier layer needs in backward.
        self.activation = Sigmoid() if activation is None else activation

    def _setup(self, input_):
        """Create the weights and bias from the first input and build the ops.

        The weights have shape (input_.shape[1], neurons) and the bias
        (1, neurons), both drawn from ``np.random.randn``.
        """
        # The model attaches ``seed`` to the layer, if any; tolerate its
        # absence so the layer also works without one.
        seed = getattr(self, "seed", None)
        if seed is not None:
            np.random.seed(seed)
        weights = np.random.randn(input_.shape[1], self.neurons)
        bias = np.random.randn(1, self.neurons)
        self.params = [weights, bias]
        self.ops = [WeightMultiply(weights), BiasAdd(bias), self.activation]
        return None

In [22]:
class Loss:
    """Base class every loss function derives from.

    Subclasses provide ``_output`` and ``_input_grad``; both read the
    ``predictions`` and ``targets`` cached by ``forward``.
    """
    def __init__(self):
        pass

    def forward(self, predictions, targets):
        """Cache predictions and targets, then return the loss value."""
        self.predictions, self.targets = predictions, targets
        return self._output()

    def backward(self):
        """Compute, cache and return the gradient produced by ``_input_grad``."""
        grad = self._input_grad()
        self.input_grad = grad
        return grad

    def _output(self):
        """Loss value; must be overridden by every loss function."""
        raise NotImplementedError(f"_output helper function has not been implemented for {self}")

    def _input_grad(self):
        """Loss gradient; must be overridden by every loss function."""
        raise NotImplementedError(f"_input_grad helper function has not been implemented for {self}")

In [23]:
class MeanSquaredError(Loss):
    """Sum of squared errors divided by the size of the first axis."""
    def __init__(self):
        super().__init__()

    def _output(self):
        """Computes loss = sum((p_i - y_i)^2) / num_p, where num_p is
        predictions.shape[0].
        """
        # Square the *difference*. The original called
        # np.power(predictions, targets, 2), which hands ``2`` to the ``out``
        # argument and raises TypeError.
        squared_errors = np.power(self.predictions - self.targets, 2)
        return np.sum(squared_errors) / self.predictions.shape[0]

    def _input_grad(self):
        """Given loss=sum(p_i - y_i)^2/num_p, computes
        dloss/dp = 2(p_i - y_i)/num_p
        """
        return 2.0 * (self.predictions - self.targets) / self.predictions.shape[0]

In [27]:
class Net:
    """The neural network comprised of multiple layers."""
    def __init__(self, layers, loss, learning_rate, seed = 42):
        """Store the configuration and hand the seed to every layer.

        ``layers`` is iterated in order by ``forward`` and in reverse by
        ``backward``. ``loss`` must provide ``forward(predictions, targets)``
        and ``backward()``. ``learning_rate`` is only stored here.
        """
        self.layers = layers
        self.loss = loss # loss object, used through its forward/backward
        self.learning_rate = learning_rate
        self.seed = seed
        # Always set the attribute, even for a falsy seed, so that layers
        # reading ``self.seed`` never hit an AttributeError.
        for layer in layers:
            setattr(layer, "seed", self.seed)

    def forward(self, x_batch):
        """Feed ``x_batch`` through all layers in order; return the result."""
        x_out = x_batch
        for layer in self.layers:
            x_out = layer.forward(x_out)
        return x_out

    def backward(self, loss_grad):
        """Feed ``loss_grad`` backward through all layers, last layer first."""
        grad = loss_grad
        for layer in reversed(self.layers):
            grad = layer.backward(grad)
        return None

    def train_batch(self, x_batch, y_batch):
        """Forward pass, get the loss, do the backward pass in a batch.

        Returns the loss value. No parameters are updated here.
        """
        predictions = self.forward(x_batch)
        loss = self.loss.forward(predictions, y_batch)
        self.backward(self.loss.backward())
        return loss

    def params(self):
        """Yield the parameters of the network layer by layer."""
        for layer in self.layers:
            yield from layer.params

    def param_grads(self):
        """Yield the gradients of the parameters of the network layer by layer."""
        for layer in self.layers:
            yield from layer.param_grads

In [28]:
# an example linear regression network: one dense layer with a single output.
# The identity activation is passed explicitly because Dense defaults to a
# sigmoid, which would squash the output instead of keeping the model linear.
linear_regression = Net(layers=[Dense(neurons=1, activation=Linear())],
                        loss = MeanSquaredError(),
                        learning_rate=0.01)

In [31]:
# Example multi-layer perceptron: a 13-neuron sigmoid hidden layer followed
# by a single-neuron layer with the identity activation.
mlp = Net(
    layers=[
        Dense(neurons=13, activation=Sigmoid()),
        Dense(neurons=1, activation=Linear()),
    ],
    loss=MeanSquaredError(),
    learning_rate=0.01,
)