# Training a neural network from scratch

Formulas from https://udlbook.com.

In [7]:
import numpy as np

## Load data

In [223]:
train_X = np.load("data/train_images.npy") / 255
train_y = np.load("data/train_labels.npy").astype(int).flatten()

print(f"train_X.shape: {train_X.shape}")
print(f"train_y.shape: {train_y.shape}")

train_X.shape: (60000, 784)
train_y.shape: (60000,)


## Utility functions

ReLU, softmax

In [215]:
def ReLU(x):
    return np.clip(x, 0, None)

In [216]:
def softmax(z):
    """
    Takes in an ndarray (..., ..., N) and applies softmax
        along the last dimension.
    """
    if not z.any():
        return np.ones(z.shape) / z.shape[-1]

    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

# Do a sense check
print(softmax(np.array([
    [1, 2, 3, 4],
    [1, 0, 0, 0],
    [10, -100, -100, -100],
    [0, 0, 0, 0]
])))

[[3.20586033e-02 8.71443187e-02 2.36882818e-01 6.43914260e-01]
 [4.75366886e-01 1.74877705e-01 1.74877705e-01 1.74877705e-01]
 [1.00000000e+00 1.68891188e-48 1.68891188e-48 1.68891188e-48]
 [2.50000000e-01 2.50000000e-01 2.50000000e-01 2.50000000e-01]]


## Implement the model

In [217]:
class Layer:
    """
    Represents a layer in a deep neural network and its incoming weights.
    Takes in an ndarray as input and returns outputs.
    Has utilities for computing gradients etc.
    """
    def __init__(self, in_size, out_size, act_func="relu"):
        """
        in_size: number of input nodes
        out_size: number of output nodes
        act_func: activation function (string)
        Use He initialization for weights.
        """
        self.weights = np.random.normal(
            loc=0,
            scale=np.sqrt(4 / (in_size + out_size)),
            size=(out_size, in_size)
        )
        self.biases = np.zeros(out_size)
        self.act_func = act_func.lower()

        self.activations = None

    def __call__(self, inputs):
        """
        Feed forward.
        inputs.shape: (batch_size, in_size)
        """
        preactivations = (self.weights @ inputs.T).T

        # Add biases
        preactivations += np.broadcast_to(self.biases, preactivations.shape)

        # Apply the activation function
        if self.act_func == "relu":
            self.activations = ReLU(preactivations)
        else:
            self.activations = preactivations
        return self.activations

In [218]:
class Network:
    def __init__(self, in_size, out_size, hidden_sizes):
        """
        Initialize the deep network.
        If params is defined, it should have this structure:
            [
                (weights_0, biases_0),
                (weights_1, biases_1),
                ...
                (weights_(n_layers-1), biases_(k-1))
            ]
        """
        self.layers = []
        self.layers.append(Layer(in_size, hidden_sizes[0]))
        for i in range(len(hidden_sizes) - 1):
            self.layers.append(Layer(hidden_sizes[i], hidden_sizes[i+1]))
        self.layers.append(Layer(hidden_sizes[-1], out_size))

    def __call__(self, inputs):
        """
        inputs must have shape (batch_size, in_size)
        Returns the output logits (pre-normalization outputs)
        """
        cur = inputs
        for layer in self.layers:
            cur = layer(cur)
        return cur
    
    def predict(self, inputs):
        """
        Runs the inputs through the model, but argmax-es the last layer.
        """
        return np.argmax(self.__call__(inputs), axis=-1)

In [225]:
nn = Network(784, 10, [200])
y_pred = nn(train_X[:3])
print(y_pred)
print(f"Network output shape: {y_pred.shape}")

[[0.         0.         0.50142435 0.         0.         0.7360939
  0.85285735 0.         0.08255522 0.        ]
 [0.         0.51219257 1.46683306 0.6032505  0.09178937 0.
  0.23770187 1.86382545 0.         0.        ]
 [0.         0.         0.56915337 0.         0.         0.
  0.8843121  0.12999033 0.         0.        ]]
Network output shape: (3, 10)


## Loss function

In [220]:
def cross_entropy_loss(outputs, y_true, from_logits=True):
    """
    Computes the average loss across these examples.
    outputs has shape (N, out_size)
    y_true has shape (out_size)
    """
    if from_logits:
        out_probs = softmax(outputs)
    else:
        out_probs = outputs

    # Terrible hack to select only the outputs we care about
    # https://stackoverflow.com/questions/70664524/numpy-use-all-rows-in-multidimensional-integer-indexing

    if len(out_probs.shape) == 1:
        assert np.issubdtype(y_true, int), "Output and label dimensions are incompatible."
        return -np.log(out_probs[y_true])

    return -np.sum(np.log(np.take_along_axis(out_probs, y_true[:,None], 1))) / len(y_true)

In [224]:
for i in range(len(train_X)):
    outputs = nn(train_X[i])
    try:
        loss = cross_entropy_loss(outputs, train_y[i])
    except RuntimeWarning as e:
        print(e)
        print("oops", i)