#### Operations

In the forward pass, an operation in a neural network receives an input, performs some computation on it (which may depend on another parameter, such as the second matrix in a matrix multiplication), and outputs the result. In the backward pass, the operation receives an output gradient: the gradient of the loss function with respect to the output of the operation. This output gradient is calculated by the next operation in the network as that operation's input gradient and is passed backward to the current operation node. The current operation then calculates the gradient of the loss with respect to its input, called the input gradient. If the node has parameters, like weights, it also calculates the gradient of the loss with respect to those parameters. Note that the shape of the output and the output gradient must be equal, and the same holds for the input and the input gradient.

**TODO:** needs clarification on the input/output grad confusion! and the shapes!

In [12]:
class Op:
    """Base class for a single operation.

    Subclasses implement ``_output`` (the forward computation) and
    ``_input_grad`` (the backward computation). ``forward`` and ``backward``
    cache their results on the instance as ``input_``, ``output`` and
    ``input_grad``.
    """

    def __init__(self):
        pass

    def forward(self, _input):
        """Run the forward pass.

        Stores ``_input`` on the instance as ``self.input_`` (the attribute
        the ``_output`` helpers read), computes the output via ``_output``,
        caches it as ``self.output`` and returns it.
        """
        # The parameter is named `_input`; the attribute is `input_`.
        # Assigning from the parameter (not from an undefined `input_` name)
        # is what makes the forward pass callable at all.
        self.input_ = _input
        self.output = self._output()
        return self.output

    def backward(self, output_grad):
        """Run the backward pass.

        Computes the gradient with respect to the input via ``_input_grad``,
        caches it as ``self.input_grad`` and returns it.
        """
        self.input_grad = self._input_grad(output_grad)
        return self.input_grad

    def _output(self):
        """Helper method to calculate the forward pass of the operation.
        Each operation needs to define this method.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(f"_output helper function not implemented for {self}")

    def _input_grad(self, output_grad):
        """Helper method to calculate the backward pass of the operation.
        Each operation needs to define this method.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(f"_input_grad helper function not implemented for {self}")
        

In [13]:
class ParamOp(Op):
    """An operation that carries a parameter.

    In addition to the gradient with respect to its input, such an operation
    also calculates the gradient with respect to its parameter.
    """

    def __init__(self, param):
        super().__init__()
        self.param = param

    def backward(self, output_grad):
        """Compute and cache both gradients; return the input gradient.

        The input gradient is computed first (cached as ``self.input_grad``),
        then the parameter gradient (cached as ``self.param_grad``).
        """
        # The base-class backward computes and stores self.input_grad.
        super().backward(output_grad)
        self.param_grad = self._param_grad(output_grad)
        return self.input_grad

    def _param_grad(self, output_grad):
        """Helper that calculates the gradient with respect to the parameter.

        Every operation with a parameter has to provide its own version.
        """
        raise NotImplementedError(f"_param_grad helper function not implemented for {self}")

A layer is a series of linear operations followed by a nonlinear operation called the activation function (e.g. the sigmoid function), which outputs the activations. The zeroth layer of a network is the input layer, $x$; the last one is the output layer, and the layers in between are the hidden layers.

In [14]:
class WeightMultiply(ParamOp):
    """Matrix multiplication operation: output = input . W.

    The weight matrix ``W`` is stored as ``self.param``.
    """

    def __init__(self, W):
        super().__init__(W)

    def _output(self):
        """Return the matrix product of the stored input and the weights."""
        return np.dot(self.input_, self.param)

    def _input_grad(self, output_grad):
        """Given output=X.W, and loss:=loss(output), computes
        dloss/dinput = dloss/doutput . doutput/dinput = output_grad . W^T
        """
        # The weights live on the instance as self.param; the constructor
        # argument name `W` is not in scope inside this method.
        return np.dot(output_grad, np.transpose(self.param, (1, 0)))

    def _param_grad(self, output_grad):
        """Given output=X.W, and loss:=loss(output), computes
        dloss/dparam = dloss/doutput . doutput/dparam = X^T . output_grad
        """
        return np.dot(np.transpose(self.input_, (1, 0)), output_grad)

In [15]:
class BiasAdd(ParamOp):
    """Bias addition operation: output = input + B."""

    def __init__(self, B):
        """Store the bias B as the parameter.

        B is expected to satisfy B.shape[0] == 1, i.e. a single row
        (assumption carried over from the original note; not checked here).
        """
        super().__init__(B)

    def _output(self):
        """Return the stored input with the bias added."""
        return self.input_ + self.param

    def _input_grad(self, output_grad):
        """With output = input + B and loss := loss(output):
        dloss/dinput = dloss/doutput . doutput/dinput = output_grad * I
        """
        unit = np.ones_like(self.input_)
        return unit * output_grad

    def _param_grad(self, output_grad):
        """With output = input + B and loss := loss(output):
        dloss/dB = dloss/doutput . doutput/dB = output_grad * I,
        summed over axis 0.
        """
        per_row = np.ones_like(self.param) * output_grad
        n_cols = per_row.shape[1]
        # Summing over axis 0 drops that axis; reshape restores a single
        # leading row so the result has shape (1, n_cols).
        summed = np.sum(per_row, axis=0)
        return summed.reshape(1, n_cols)

In [16]:
class Sigmoid(Op):
    """Element-wise sigmoid activation: output = 1 / (1 + exp(-input))."""

    def __init__(self):
        super().__init__()

    def _output(self):
        """Apply the sigmoid function to the stored input."""
        decay = np.exp(-self.input_)
        return 1.0 / (1.0 + decay)

    def _input_grad(self, output_grad):
        """With output = 1.0/(1+exp(-input)) and loss := loss(output):
        dloss/dinput = dloss/doutput * doutput/dinput
                     = output * (1 - output) * output_grad
        """
        activation = self.output
        # Local derivative of the sigmoid, expressed via its cached output.
        local_slope = activation * (1.0 - activation)
        return local_slope * output_grad