# MLC v2

In [11]:
import numpy as np

# The Equation for Backpropagation
$$\begin{array}{cll}
\delta^L & = & \nabla_a C \odot \sigma'(z^L) \\ 
\delta^l & = & ((w^{l+1})^T \delta^{l+1}) \odot \sigma'(z^l) \\
\frac{\partial C}{\partial b^l_j} & = & \delta^l_j \\
\frac{\partial C}{\partial w^l_{jk}} & = & a_k^{l-1} \delta^l_j
\end{array}$$

# Layer Class

In [12]:
class Layer:
    def __init__(self, input_size, output_size, activation):
        self.weights = np.random.rand(input_size, output_size)
        self.bias = np.random.rand(1, output_size)
        self.activate, self.d_activate = activation
    def forward(self, x):
        self.inputs = x
        self.outputs = self.activate(x @ self.weights + self.bias)
        return self.outputs
    # Returns the delta for this layer
    def err(self, delta_p, w_p):
        #print(f"delta: {delta_p}")
        delta = (delta_p @ w_p.T) * self.d_activate(self.outputs)
        #delta = (w_p @ delta_p.T) * self.d_activate(self.outputs)
        return delta
    # Perform backpropagation, returning the weights and bias gradients for this layer
    def back(self, delta):
        #print(f"self.inputs: {self.inputs.T}")
        #print(f"delta input: {delta}")
        print(f"delta: {delta}")
        print(f"self.inputs.T: {self.inputs.T}")
        
        grad_b = delta
        grad_w =  self.inputs.T @ delta 

        return (grad_w, grad_b)

# Network 

In [13]:
class Network:
    def __init__(self, loss, d_loss):
        self.layers = []
        self.loss = loss
        self.d_loss = d_loss
    def add(self, layer):
        self.layers.append(layer)
    def fit(self, inputs, labels, epochs, learn_rate):
        # epoch is deprecated; no use pls
        for epoch in range(epochs):
            print(f"Epoch {epoch}")
            epoch_loss = 0 
            grad_w = []
            grad_b = []
            for x in enumerate(inputs):
                output = x[1]
                # forward propagate
                for layer in self.layers:
                    output = layer.forward(output)

                epoch_loss += self.loss(output, labels[x[0]])
                
                lg_w = [] # gradients are stored in reverse of layers
                lg_b = []
                # backpropagate
                #delta = self.d_loss(output, labels[x[0]]) *  self.layers[-1].d_activate(output)
                delta = self.d_loss(output, labels[x[0]]) * self.layers[-1].d_activate(output) 
                #print(f"initial delta: {delta}")
                #print(f"# of layers: {len(self.layers)}")
                for i in range(len(self.layers)-1, -1, -1):
                    #print(f"======At layer {i}")
                    gw, gb = self.layers[i].back(delta)
                    lg_w.append(gw)
                    lg_b.append(gb)
                    if i-1 >= 0:
                        delta = self.layers[i-1].err(delta, layer.weights)
                grad_w.append(lg_w)
                grad_b.append(lg_b)

            # Sum and average
            #grad_w = np.add.reduce(grad_w) / len(inputs)
            #grad_b = np.add.reduce(grad_b) / len(inputs)
            #print(f"grad_b: {grad_b}")
            grad_b_avg = []
            for i in range(len(grad_b[0])):
                b0 = grad_b[0][i]
                for j in range(1, len(grad_b)):
                    #print(f"+=+= {grad_b[j][i]}")
                    b0 += grad_b[j][i]
                grad_b_avg.append(b0 / len(grad_b))

            #print(f"grad_w: {grad_w}")
            grad_w_avg = []
            for i in range(len(grad_w[0])):
                w0 = grad_w[0][i]
                for j in range(1, len(grad_w)):
                    #print(f"+=+= {grad_w[j][i]}")
                    w0 += grad_w[j][i]
                #print(f"number of grad_ws: {len(grad_w)}")
                grad_w_avg.append(w0 / len(grad_w))
            #print(f"grad_w_avg: {grad_w_avg}")
                
            
            #print(f"grad_w: {grad_w}")
            #print(f"grad_b: {grad_b}")
            for i in range(len(self.layers)-1, -1, -1):
                layer = self.layers[i]
                #print(f"layer weights: {layer.weights}")
                #print(f"grad_w_avg: {grad_w_avg}")
                layer.weights -= learn_rate * grad_w_avg[-i+1]
                layer.bias -= learn_rate * grad_b_avg[-i+1]
            
            print(f"loss: {epoch_loss / len(inputs)}")
            

# Activator functions

In [14]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    return 1 / (1 + np.exp(-x))
    
def d_sigmoid(x: np.ndarray) -> np.ndarray:
    return np.exp(-x) / ((1 + np.exp(-x))**2)

# Loss functions

In [15]:
def mse(dy, y):
    return np.mean(np.power(dy-y, 2));

def mse_prime(dy, y):
    return 2*(dy-y)/dy.size;

def tanh(x):
    return np.tanh(x);

def tanh_prime(x):
    return 1-np.tanh(x)**2;

# Batchtize

In [47]:
from random import randint

def batchtize(inputs, labels, batch_size):
    if len(inputs) != len(labels):
        raise
    if len(inputs) % batch_size != 0:
        raise
        
    c = 0
    chosen_indexes = []
    total_x = []
    total_y = []
    batch_x = []
    batch_y = []
    while c < len(x):
        i = randint(0, len(inputs)-1)
        while i in chosen_indexes:
            i = randint(0, len(inputs)-1)
        chosen_indexes += [i]
        if c != 0 and c % batch_size == 0:
            total_x += batch_x
            total_y += batch_y
            batch_x = []
            batch_y = []
        #print(f"inserting input: {inputs[i]}")
        batch_x.append(inputs[i])
        batch_y.append(labels[i])
        c+= 1
        #print("ran!")

    total_x += batch_x
    total_y += batch_y
    
    return (total_x, total_y)
    

# Test Batchtize

In [48]:
x = [[1,0], [0,1], [1,1], [0,0]]
y = [[1], [1], [0], [0]]

print(batchtize(x, y, 1))

([[0, 1], [1, 1], [0, 0], [1, 0]], [[1], [0], [0], [1]])


# Example

In [56]:
net = Network(mse, mse_prime)
activator = (tanh, tanh_prime)

net.add(Layer(2, 3, activator))
net.add(Layer(3, 1, activator))

#x_train = np.array([[[0,0]], [[0,1]], [[1,0]], [[1,1]]])
#y_train = np.array([[[0]], [[1]], [[1]], [[0]]])
#net.fit(x_train, y_train, epochs=1000, learn_rate=0.1)

# training data
x_train = [np.array([[0,0]]), np.array([[0,1]]), np.array([[1,0]]), np.array([[1,1]])]
y_train = [np.array([[0]]), np.array([[1]]), np.array([[1]]), np.array([[0]])]
#net.fit(x_train, y_train, epochs=1000, learn_rate=0.1)

epochs = 1000
batch_size = 1
for i in range(epochs):
    x_b, y_b = batchtize(x_train, y_train, batch_size)
    for i in range(len(x_b)):
        print(f"x_b[i]: {x_b}")
        net.fit([x_train[i]], [y_train[i]], epochs=1, learn_rate=0.1)


x_b[i]: [array([[0, 0]]), array([[1, 0]]), array([[0, 1]]), array([[1, 1]])]
Epoch 0
delta: [[0.86532852]]
self.inputs.T: [[0.23863582]
 [0.09947087]
 [0.72542194]]
delta: [[0.39980036 0.34511047 0.48405496]]
self.inputs.T: [0 0]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 2)