### Lecture 12 - Building a Neural Network Framework from Scratch

In [1]:
import numpy as np

In [2]:
class Node:
    """
    Base class for every node of the computation graph.

    A node has the following properties:
    -- inputs: the inbound nodes whose values this node consumes
    -- outputs: the outbound nodes that consume this node's value
    -- value: the result of the forward pass (None until forward() has run)
    -- gradients: a dictionary filled in by backward(); subclasses key it by
        inbound node (an input node keys it by itself) and store the partial
        derivative of the cost with respect to that node
    """
    def __init__(self, inputs=None):
        """
        inputs: optional list of inbound nodes; None (the default) or an
            empty list means the node has no inbound nodes.
        """
        self.inputs = inputs or []
        self.outputs = []

        for n in self.inputs:
            # register 'self' as an outbound node of each inbound node
            n.outputs.append(self)

        # These must be set outside the loop above: otherwise a node without
        # inbound nodes would never get 'value' / 'gradients' attributes.
        self.value = None
        self.gradients = {}

    def forward(self):
        """
        Forward propagation
        Compute the output value based on the inbound nodes and store the
        result in self.value. Subclasses must override this method.
        """
        # NotImplemented is a comparison sentinel, not an exception;
        # NotImplementedError is the exception for abstract methods.
        raise NotImplementedError

    def backward(self):
        """
        Backward propagation
        Compute the partial derivatives and store them in self.gradients.
        Subclasses must override this method.
        """
        raise NotImplementedError
    

In [3]:
class Input(Node):
    """
    An input node: a node with no inbound nodes whose value is supplied
    from outside the graph.
    """

    def __init__(self):
        """
        An input node has no inbound nodes,
        so nothing is passed to the 'Node' initializer.
        """
        super().__init__()

    def forward(self, value=None):
        """
        The input node is the only node where the value can be passed as
        an argument to forward(), since it has no inbound nodes;
        all other node implementations receive their values from
        the previous nodes.

        value: the new value for this node; when None (the default) the
            current self.value is left untouched.
        """
        if value is not None:
            self.value = value

    def backward(self):
        """
        Store the partial derivative of the cost with respect to this node
        in self.gradients[self].
        """
        self.gradients = {self: 0}
        for n in self.outputs:
            grad_cost = n.gradients[self]
            # Sum the contributions of all outbound nodes: a node feeding
            # several consumers receives gradient from each of them, so
            # overwriting would keep only the last one.
            self.gradients[self] = self.gradients[self] + grad_cost * 1
            

In [4]:
class Add(Node):
    """
    Define a subclass of "Node" where the calculation is simply adding all inputs.
    """

    def __init__(self, nodes):
        """
        nodes: list of inbound nodes whose values are summed.
        """
        super().__init__(nodes)

    def forward(self):
        """
        Store the sum of all inbound values in self.value.
        """
        self.value = sum(n.value for n in self.inputs)
        # alternatively, self.value = sum(map(lambda n: n.value, self.inputs))

    def backward(self):
        """
        "add" has no trainable parameters, but it still has to pass the
        gradient on: the partial derivative of the sum with respect to each
        input is 1, so every inbound node receives the incoming gradient
        unchanged.
        assumes all inbound values share one shape (no broadcasting) -- TODO confirm
        """
        self.gradients = {n: np.zeros_like(n.value) for n in self.inputs}

        for out in self.outputs:
            # get the partial (grad_cost) with regard to this node
            grad_cost = out.gradients[self]
            for n in self.inputs:
                # accumulate over all outbound nodes
                self.gradients[n] = self.gradients[n] + grad_cost

In [5]:
class Linear(Node):
    """
    Define a subclass of "Node" where the calculation is a linear function
    f(X) = XW + b.
    assumes X is (batch, n_in), W is (n_in, n_out) and b is (n_out,) -- TODO confirm
    """
    def __init__(self, nodes, weights, bias):
        """
        Initialize the Linear node with three inbound nodes:
         -- nodes = the node providing the input X
         -- weights = the node providing the parameter matrix W
         -- bias = the node providing the intercept vector b
        """
        super().__init__([nodes, weights, bias])

    def forward(self):
        """
        Compute XW + b from the three inbound values and store it in
        self.value.
        """
        inputs = self.inputs[0].value
        weights = self.inputs[1].value
        bias = self.inputs[2].value

        self.value = np.dot(inputs, weights) + bias

    def backward(self):
        """
        Compute the partial derivatives of the cost with respect to X, W
        and b, and store them in self.gradients keyed by inbound node.
        """
        x_node, w_node, b_node = self.inputs

        # initialize a partial for each of the inbound nodes
        self.gradients = {n: np.zeros_like(n.value) for n in self.inputs}

        for n in self.outputs:
            # get the partial (grad_cost) with regard to this node
            grad_cost = n.gradients[self]

            # Each partial is accumulated (not overwritten) so that the
            # contributions of several outbound nodes add up.

            # dCost/dX = grad_cost . W^T  -> same shape as X
            self.gradients[x_node] = self.gradients[x_node] + np.dot(
                grad_cost, w_node.value.T
            )

            # dCost/dW = X^T . grad_cost  -> same shape as W
            # (the operand order matters: grad_cost . X^T has the wrong shape)
            self.gradients[w_node] = self.gradients[w_node] + np.dot(
                x_node.value.T, grad_cost
            )

            # dCost/db: the partial of f on b is 1, and b is shared by every
            # row of the batch, so the rows of grad_cost are summed to get a
            # gradient with the same shape as b
            self.gradients[b_node] = self.gradients[b_node] + np.sum(
                grad_cost, axis=0, keepdims=False
            )
             

In [6]:
class sigmoid(Node):
    """
    Define a subclass of "Node" where the calculation is a sigmoid function
    f(x) = 1 / (1 + e^(-x))
    """
    def __init__(self, node):
        """
        node: the single inbound node whose value is passed through the
            sigmoid.
        """
        super().__init__([node])

    def _sigmoid(self, x):
        """Return the element-wise sigmoid of x."""
        return 1 / (1 + np.exp(-x))

    def forward(self):
        """
        Store the inbound value in self.x and its sigmoid in self.value.
        """
        self.x = self.inputs[0].value
        self.value = self._sigmoid(self.x)

    def backward(self):
        """
        Compute the partial derivative of the cost with respect to the
        inbound node and store it in self.gradients.
        """
        # note that for a sigmoid function, its derivative has the following property:
        # f'(x) = f(x)[1-f(x)]
        # f(x) is evaluated once and reused for both factors
        activation = self._sigmoid(self.x)
        self.partial = activation * (1 - activation)

        self.gradients = {n: np.zeros_like(n.value) for n in self.inputs}

        for n in self.outputs:
            # get the partial with respect to this node
            grad_cost = n.gradients[self]

            # note - use * (element-wise) to keep all the dimensions consistent!
            # The contributions of all outbound nodes are accumulated.
            self.gradients[self.inputs[0]] = (
                self.gradients[self.inputs[0]] + grad_cost * self.partial
            )
            

In [7]:
class MSE(Node):
    """
    Define the loss function: the mean squared error between the true
    targets and the predictions.
    """
    def __init__(self, y_true, y_hat):
        """
        y_true: the node providing the true targets
        y_hat: the node providing the predictions
        """
        super().__init__([y_true, y_hat])

    def forward(self):
        """
        Store the mean of the squared differences in self.value; self.diff
        and self.n are kept for the backward pass.
        """
        # reshape both values into column vectors so that the subtraction
        # is element-wise and never broadcasts
        y_true = self.inputs[0].value.reshape(-1, 1)
        y_hat = self.inputs[1].value.reshape(-1, 1)
        assert y_true.shape == y_hat.shape, "y_true and y_hat differ in size"

        # number of elements averaged by np.mean below
        self.n = y_true.shape[0]
        self.diff = y_true - y_hat

        self.value = np.mean(self.diff ** 2)  # mean squared error

    def backward(self):
        """
        Store the partial derivatives of the MSE with respect to both
        inbound nodes in self.gradients. MSE is the final node of the graph,
        so no gradient is read from outbound nodes.
        """
        scale = 2 / self.n
        self.gradients = {
            self.inputs[0]: scale * self.diff,  # the partial of MSE on y_true
            self.inputs[1]: -1 * scale * self.diff,  # the partial of MSE on y_hat
        }
    

In [11]:
def run_one_epoch(output_node, graph: list):
    """
    Run the neural net for one round: a forward pass followed by a
    backward pass.

    output_node: accepted for interface compatibility; it is not used here.
    graph: the nodes in topologically sorted order.
    """
    # forward pass: every node computes self.value, inbound nodes first
    for node in graph:
        node.forward()

    # backward pass: walk a reversed snapshot of the graph so that every
    # node runs after all the nodes that consume its value
    backward_order = list(reversed(graph))
    for node in backward_order:
        node.backward()

# Note:
# in practice, it is common to feed in multiple data examples in each epoch,
# since they can be processed in parallel;
# the number of examples is called the 'batch size'.

In [12]:
def topological_sort(graph):
    """
    Define a topological sort procedure where the input is a @graph,
    and the output is a @sorted_list (Kahn's algorithm).

    graph: an iterable of nodes, each exposing `inputs` and `outputs`
        lists. Any nodes of the graph are enough: every node connected to
        them through `inputs` / `outputs` is discovered. If graph is a dict,
        its keys are the nodes and each mapped value is assigned to the
        `value` attribute of its node (a "feed dict").

    Returns a list with every discovered node, ordered so that each node
    comes after all of its inbound nodes.

    Raises ValueError if the graph contains a cycle.
    """
    from collections import deque

    feed = graph if isinstance(graph, dict) else None
    seeds = list(graph)

    # 1. discover every node connected to the seeds; a dict is used as an
    #    insertion-ordered set so that the result is deterministic
    seen = dict.fromkeys(seeds)
    pending = deque(seen)
    while pending:
        node = pending.popleft()
        for neighbour in [*node.inputs, *node.outputs]:
            if neighbour not in seen:
                seen[neighbour] = None
                pending.append(neighbour)

    # 2. build the edges from the `inputs` lists and count, for every node,
    #    how many inbound edges are still unresolved
    successors = {node: [] for node in seen}
    in_degree = dict.fromkeys(seen, 0)
    for node in seen:
        for parent in node.inputs:
            successors[parent].append(node)
            in_degree[node] += 1

    # 3. repeatedly emit a node whose inbound nodes were all emitted
    ready = deque(node for node in seen if in_degree[node] == 0)
    sorted_list = []
    while ready:
        node = ready.popleft()
        if feed is not None and node in feed:
            node.value = feed[node]
        sorted_list.append(node)
        for child in successors[node]:
            in_degree[child] -= 1
            if in_degree[child] == 0:
                ready.append(child)

    # nodes on a cycle never reach an in-degree of zero
    if len(sorted_list) != len(seen):
        raise ValueError("graph contains a cycle; no topological order exists")

    return sorted_list

In [13]:
def gradient_descent_update(trainable_nodes, learning_rate=1e-3):
    """
    Apply one gradient-descent step to every trainable node.

    trainable_nodes: iterable of nodes; each must expose `value` and a
        `gradients` dictionary holding, under the node itself as key, the
        partial derivative of the cost with respect to that node.
    learning_rate: the step size.

    Each node's value is replaced by value - learning_rate * gradient.
    """
    for node in trainable_nodes:
        # move against the gradient; a new value is bound instead of
        # modifying the old one in place, so an array shared with the
        # caller is never changed
        node.value = node.value - learning_rate * node.gradients[node]

In [14]:
from sklearn.datasets import load_boston

In [15]:
# Load the dataset returned by scikit-learn's `load_boston`.
# NOTE(review): `load_boston` appears to have been removed from recent
# scikit-learn releases (1.2+) -- confirm the installed version provides it.
data = load_boston()

In [16]:
# the entry stored under 'data'; presumably the feature matrix with one
# row per example -- TODO confirm
X_ = data['data']

In [17]:
# the entry stored under 'target'; presumably one target per example
# -- TODO confirm
y_ = data['target']

In [None]:
# standardise X_ along axis 0: subtract the mean and divide by the
# standard deviation
X_ = (X_ - np.mean(X_, axis=0)) / np.std(X_, axis=0) # normalize X

In [18]:
x, y = Input(), Input()

In [19]:
W1, b1 = Input(), Input()

In [20]:
W2, b2 = Input(), Input()

In [None]:
# Build the graph: Linear -> sigmoid -> Linear -> MSE.
# The first layer must consume the input node `x`, not the raw array `X_`:
# a node registers itself in the `outputs` list of each inbound node, which
# an array does not have.
output_1 = Linear(x, W1, b1)
# the activation consumes the output of the first linear layer
sigmoid_1 = sigmoid(output_1)
linear2 = Linear(sigmoid_1, W2, b2)
# mean squared error between the true target y and the prediction
loss = MSE(y, linear2)

In [21]:
from sklearn.utils import resample, shuffle

In [None]:
# topological_sort requires the graph as its argument; the input nodes
# created above are passed as the nodes the graph is built from
topological_sorted_list = topological_sort([x, y, W1, b1, W2, b2])

In [None]:
epochs = 1000
batch_size = 16  # we supply 16 values for each input
# floor division keeps batch_num an integer, as required by range();
# a trailing partial batch is dropped
batch_num = X_.shape[0] // batch_size

for epoch in range(epochs):
    loss = 0
    
    for batch in range(batch_num):
        
    