In [None]:
import tempfile
from pathlib import Path

import torch
import numpy as np
import matplotlib.pylab as plt

from utils import (
    load_cifar10,
    batch_plot,
    seed_everything,
    matrix_to_diagonals,
    accuracy,
    weights_to_images,
)


# seed_everything is a project helper from utils; presumably it fixes the RNG seeds
# so the random test tensors below are reproducible — TODO confirm.
seed_everything()
# Show numpy floats with 3 digits after the decimal point.
np.set_printoptions(precision=3)

# Automatically reload edited project modules before each cell is executed.
%load_ext autoreload
%autoreload 2


# Feature flags that gate the optional (slow or file-writing) cells of this notebook.
benchmark = False  # time the alternative backward implementations with %timeit
search = False  # run the hyper-parameter grid search
save_weights_animation = False  # forwarded as store_weights= to model.fit in the training cells
run_with_cross_entropy = False  # also train the variant that ends in CrossEntropy instead of LogSoftmax + NLLLoss
compare_with_linear = False  # train the small comparison model and dump weight images to a temp dir
save_weights_update = False  # render history["weights"] into images/neural_network_weights_update.gif

# Softmax

## Softmax Derivative

$$
f(x_{i})=\frac{e^{x_i}}{\sum_{j}e^{x_j}} \tag{1}
$$

If $i=j$,

$$
\begin{align}
\frac{\partial f(x_{i})}{\partial x_j} &= \frac{e^{x_i}\cdot \sum_{j}e^{x_j} - e^{x_i}\cdot e^{x_j}}{\left(\sum_{j}e^{x_j}\right)^2} \tag{D1} \\
    &= \frac{e^{x_i}\cdot(\sum_{j}e^{x_j} - e^{x_j})}{\left(\sum_{j}e^{x_j}\right)^2} \\
    &= \frac{e^{x_i}}{\sum_{j}e^{x_j}} \cdot \frac{\sum_{j}e^{x_j} - e^{x_j}}{\sum_{j}e^{x_j}} \\
    &= \frac{e^{x_i}}{\sum_{j}e^{x_j}} \cdot \left( 1 - \frac{e^{x_j}}{\sum_{j}e^{x_j}} \right) \\
    &= f(x_{i})\cdot\left(1-f(x_{j}) \right) \\
    &= f(x_{i}) - f(x_{i})\cdot f(x_{j})
\end{align}
$$

If $i\neq j$,

$$
\begin{align}
\frac{\partial f(x_{i})}{\partial x_j} &= \frac{0\cdot \sum_{j}e^{x_j} - e^{x_i}\cdot e^{x_j}}{(\sum_{j}e^{x_j})^2} \tag{D2} \\
    &= \frac{-e^{x_i}\cdot e^{x_j}}{(\sum_{j}e^{x_j})^2} \\
    &= - \frac{e^{x_i}}{\sum_{j}e^{x_j}} \cdot \frac{e^{x_j}}{\sum_{j}e^{x_j}} \\
    &= - f(x_{i})\cdot f(x_{j})
\end{align}
$$


$$
\frac{\partial f(x_{i})}{\partial x_j}=
\begin{cases}
f(x_{i}) - f(x_{i})\cdot f(x_{j}) &i=j\\
- f(x_{i})\cdot f(x_{j}) &i\neq j
\end{cases}
$$

## Cross Entropy with Softmax

$$
L(x_{i}) = - y_i \log f(x_i) = - \log\left({\frac{e^{x_i}}{\sum_{j}e^{x_j}}}\right)
$$

If $i=j$,

$$
\begin{align}
\frac{\partial L(x_{i})}{\partial x_j} &= -\frac{1}{f(x_i)} \cdot \frac{\partial f(x_{i})}{\partial x_j} \\
    &= -\frac{1}{f(x_i)} \cdot f(x_{i})\cdot\left(1-f(x_{j}) \right) \\
    &= \left(f(x_{j}) -1 \right) \\
    &= f(x_{j}) - 1
\end{align}
$$

If $i\neq j$,

$$
\begin{align}
\frac{\partial L(x_{i})}{\partial x_j} &= -\frac{1}{f(x_i)} \cdot \frac{\partial f(x_{i})}{\partial x_j} \\
    &= -\frac{1}{f(x_i)} \cdot \left(-f(x_{i})\cdot f(x_{j}) \right) \\
    &= f(x_{j})
\end{align}
$$

## Log Softmax Derivative

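$$
f(x_{i})=\log\left({\frac{e^{x_i}}{\sum_{j}e^{x_j}}}\right) \tag{2}
$$

In this section the function being differentiated, $f(x_i)$ on the left-hand side, is the log-softmax $(2)$, while $f(x_j)$ on the right-hand side of the final results still denotes the softmax $(1)$.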

If $i=j$,

$$
\begin{align}
\frac{\partial f(x_{i})}{\partial x_j} &= \frac{\sum_{j}e^{x_j}}{e^{x_i}} \cdot \frac{e^{x_i}\cdot \sum_{j}e^{x_j} - e^{x_i}\cdot e^{x_j}}{\left(\sum_{j}e^{x_j}\right)^2} \tag{D1} \\
    &= \frac{\sum_{j}e^{x_j}}{e^{x_i}} \cdot \frac{e^{x_i}\cdot(\sum_{j}e^{x_j} - e^{x_j})}{\left(\sum_{j}e^{x_j}\right)^2} \\
    &= \frac{\sum_{j}e^{x_j} - e^{x_j}}{\sum_{j}e^{x_j}} \\
    &= 1 - \frac{e^{x_j}}{\sum_{j}e^{x_j}} \\
    &= 1-f(x_{j})
\end{align}
$$

If $i\neq j$,

$$
\begin{align}
\frac{\partial f(x_{i})}{\partial x_j} &= \frac{\sum_{j}e^{x_j}}{e^{x_i}} \cdot \frac{0\cdot \sum_{j}e^{x_j} - e^{x_i}\cdot e^{x_j}}{(\sum_{j}e^{x_j})^2} \tag{D2} \\
    &= \frac{\sum_{j}e^{x_j}}{e^{x_i}} \cdot \frac{-e^{x_i}\cdot e^{x_j}}{(\sum_{j}e^{x_j})^2} \\
    &= -\frac{e^{x_j}}{\sum_{j}e^{x_j}} \\
    &= -f(x_{j})
\end{align}
$$


$$
\frac{\partial f(x_{i})}{\partial x_j}=
\begin{cases}
1 - f(x_{j}) &i=j\\
-f(x_{j}) &i\neq j
\end{cases}
$$

In [None]:
def log_softmax(x):
    """Return the logarithm of the softmax of ``x`` along its last axis.

    The maximum along the last axis is subtracted before exponentiating so
    that large inputs do not overflow.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    log_normalizer = np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    return shifted - log_normalizer


def softmax(x):
    """Return the softmax of ``x`` along its last axis.

    The maximum along the last axis is subtracted first; this leaves the
    result unchanged but keeps the exponentials from overflowing.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)


def softmax_backward_1(z):
    """Build the softmax Jacobian of every row of ``z`` with an explicit loop.

    For each row ``p`` (softmax outputs of one sample) the Jacobian is
    ``diag(p) - outer(p, p)``; the per-row matrices are stacked into one array.
    """
    jacobians = []
    for row in z:
        jacobians.append(np.diag(row) - np.outer(row, row))
    return np.asarray(jacobians)


def softmax_backward_2(z):
    """Vectorised counterpart of ``softmax_backward_1``.

    Subtracts the batched outer product ``z[b, i] * z[b, j]`` from the result
    of the project helper ``matrix_to_diagonals(z)`` (presumably one diagonal
    matrix per row of ``z`` — confirm against utils).
    """
    diagonals = matrix_to_diagonals(z)
    outer_products = np.einsum("bi,bj->bij", z, z)
    return diagonals - outer_products


def log_softmax_backward_1(z):
    """Log-softmax Jacobian for a batch of softmax outputs, built with ``np.tile``.

    ``z`` has shape ``(n, c)``. Entry ``[b, i, j]`` of the result is
    ``identity[i, j] - z[b, j]``, so the result has shape ``(n, c, c)``.
    """
    batch, classes = z.shape
    identity_stack = np.tile(np.identity(classes), (batch, 1, 1))
    softmax_rows = np.tile(z[:, None, :], (1, classes, 1))
    return identity_stack - softmax_rows


def log_softmax_backward_2(z):
    """Log-softmax Jacobian for a batch of softmax outputs, built with ``np.repeat``.

    ``z`` has shape ``(n, c)``. Entry ``[b, i, j]`` of the result is
    ``eye[i, j] - z[b, j]``, so the result has shape ``(n, c, c)``.
    """
    batch, classes = z.shape
    identity_stack = np.repeat(np.eye(classes)[None, :, :], batch, axis=0)
    softmax_rows = np.repeat(z[:, None, :], classes, axis=1)
    return identity_stack - softmax_rows


# Random toy logits of shape (4, 5); later cells use x.shape[0] as the number
# of samples and x.shape[1] as the number of classes.
x = np.random.randn(4, 5)
z = softmax(x)
s = log_softmax(x)

# Every softmax row must sum to one.
assert np.allclose(z.sum(axis=1), 1)
# The two implementations of each Jacobian must agree.
assert np.allclose(softmax_backward_1(z), softmax_backward_2(z))
assert np.allclose(log_softmax_backward_1(z), log_softmax_backward_2(z))

# Optional timing comparison of the alternative implementations.
if benchmark:
    %timeit softmax_backward_1(z)
    %timeit softmax_backward_2(z)

    %timeit log_softmax_backward_1(z)
    %timeit log_softmax_backward_2(z)

In [None]:
# Analytic Jacobians computed from the numpy softmax output z.
bz = softmax_backward_2(z)
bs = log_softmax_backward_2(z)

# PyTorch reference: the same logits as a float32 tensor that tracks gradients.
tx = torch.tensor(x, requires_grad=True, dtype=torch.float)
tx.retain_grad()
tz = torch.nn.functional.softmax(tx, dim=1)
ts = torch.nn.functional.log_softmax(tx, dim=1)

# Forward results must match the numpy implementations.
assert np.allclose(z, tz.detach().numpy())
assert np.allclose(s, ts.detach().numpy())

# Compare the Jacobian row of output 0: autograd differentiates output column 0
# of every sample with respect to the inputs tx.
assert np.allclose(bz[:, 0, :], torch.autograd.grad(list(tz[:, 0]), tx, retain_graph=True)[0].numpy())
assert np.allclose(bs[:, 0, :], torch.autograd.grad(list(ts[:, 0]), tx, retain_graph=True)[0].numpy())

In [None]:
from functional import nll_loss, nll_loss_derivative
from nn import NLLLoss

# Check the numpy NLL loss (functional and module form) against PyTorch.
reduction = "mean"

loss = NLLLoss(reduction=reduction)

# each element in target has to have 0 <= value < C
# np.random.randint excludes `high`, so pass C itself; `C - 1` would never
# draw the last class and leave it untested.
y = np.random.randint(0, high=x.shape[1], size=x.shape[0])

# log-probabilities are the input of the NLL loss
sx = log_softmax(x)
o = nll_loss(sx, y, reduction=reduction)
lo = loss.forward(sx, y)
bo = loss.backward(sx, y)

ty = torch.tensor(y, dtype=torch.long)
ts = torch.nn.functional.log_softmax(tx, dim=1)
to = torch.nn.functional.nll_loss(ts, ty, reduction=reduction)
# reference gradient of the loss with respect to the log-probabilities
torch_grad = torch.autograd.grad(to, ts, retain_graph=True)[0].detach().numpy()

assert np.allclose(o, to.detach().numpy())
assert np.allclose(lo, to.detach().numpy())
# differentiate at the log-probabilities (the loss input), consistent with the
# other call sites, rather than at the raw logits
assert np.allclose(nll_loss_derivative(sx, y, reduction=reduction), torch_grad)
assert np.allclose(bo, torch_grad)

# Linear

## Softmax with Cross Entropy

In [None]:
from functional import (
    linear,
    log_softmax,
    log_softmax_derivative,
    nll_loss,
    nll_loss_derivative,
    softmax,
    cross_entropy,
)


# Random parameters of a linear layer mapping x.shape[1] inputs to num_hidden outputs.
# The weight uses torch's (out_features, in_features) layout.
num_hidden = 64
weight = np.random.normal(size=(num_hidden, x.shape[1]), scale=10)
bias = np.random.randn(num_hidden)

torch_weight = torch.tensor(weight, requires_grad=True, dtype=torch.float)
torch_bias = torch.tensor(bias, requires_grad=True, dtype=torch.float)

# The weight matrix doubles as a batch of logits here: num_hidden rows with
# x.shape[1] classes each. `high` is exclusive, so it must be the class count
# itself; `x.shape[1] - 1` would never produce the last class.
torch_test_target = torch.randint_like(torch_bias, high=x.shape[1], dtype=torch.long)

# The numpy cross entropy (with built-in softmax) must match torch per sample.
np.testing.assert_array_almost_equal(
    torch.nn.functional.cross_entropy(torch_weight, torch_test_target, reduction="none").detach().numpy(),
    cross_entropy(weight, torch_test_target.detach().numpy(), reduction="none", with_softmax=True),
    decimal=5,
)

In [None]:
# forward
# weight is stored in torch's (out_features, in_features) layout, so the numpy
# linear() receives its transpose.
linear_output = linear(x, weight.T, bias)
loss = cross_entropy(linear_output, y, reduction="mean", with_softmax=True)

torch_linear_output = torch.nn.functional.linear(tx, torch_weight, torch_bias)
# torch's cross entropy has the softmax embedded, so it takes the raw linear outputs
torch_loss = torch.nn.functional.cross_entropy(torch_linear_output, ty, reduction="mean")

# Both the linear outputs and the final loss must agree with torch.
np.testing.assert_array_almost_equal(linear_output, torch_linear_output.detach().numpy(), decimal=5)
assert np.allclose(loss, torch_loss.detach().numpy())

## Log Softmax with NLLLoss

In [None]:
# forward
linear_output = linear(x, weight.T, bias)
log_output, softmax_output = log_softmax(linear_output, with_softmax=True)
loss = nll_loss(log_output, y)
# backward
loss_grad = nll_loss_derivative(log_output, y)
# chain rule through log-softmax: one row vector times one Jacobian per sample
# (assumes log_softmax_derivative returns a (batch, classes, classes) array — confirm)
log_grad = (np.expand_dims(loss_grad, 1) @ log_softmax_derivative(softmax_output)).squeeze(axis=1)
weight_grad = x.T @ log_grad
bias_grad = np.sum(log_grad, axis=0, keepdims=True)


# .backward() accumulates into .grad, so clear gradients left over from a
# previous run of this cell; otherwise re-running it fails the checks below.
tx.grad = None
torch_weight.grad = None
torch_bias.grad = None

torch_linear_output = torch.nn.functional.linear(tx, torch_weight, torch_bias)
torch_log_output = torch.nn.functional.log_softmax(torch_linear_output, dim=1)
torch_loss = torch.nn.functional.nll_loss(torch_log_output, ty)
torch_loss.backward()

np.testing.assert_array_almost_equal(linear_output, torch_linear_output.detach().numpy(), decimal=5)
np.testing.assert_array_almost_equal(log_output, torch_log_output.detach().numpy(), decimal=5)
assert np.allclose(loss, torch_loss.detach().numpy())

# weight_grad is (in_features, out_features); torch stores the transpose
assert np.allclose(weight_grad.T, torch_weight.grad.detach().numpy())
assert np.allclose(bias_grad, torch_bias.grad.detach().numpy())

# Simple Neural Network

In [None]:
# Load the CIFAR-10 train/test split from the local python-batches directory
# (path is relative to the notebook's working directory).
(X_train, y_train), (X_test, y_test) = load_cifar10("../code/cs231n/datasets/cifar-10-batches-py/")

# Sanity-check the expected dataset layout: 32x32 RGB images, channels last.
assert X_train.shape == (50000, 32, 32, 3)
assert y_train.shape == (50000,)
assert X_test.shape == (10000, 32, 32, 3)
assert y_test.shape == (10000,)

In [None]:
# Keep only the first four classes of the dataset:
# 0: airplane
# 1: automobile
# 2: bird
# 3: cat
target_classes = [0, 1, 2, 3]
train_indices = np.isin(y_train, target_classes)
test_indices = np.isin(y_test, target_classes)

# fraction of the filtered training set that is held out for validation
valid_rate = 0.2

X_train, y_train = X_train[train_indices], y_train[train_indices]
X_test, y_test = X_test[test_indices], y_test[test_indices]

# use valid_rate (previously a duplicated hard-coded 0.2) so the knob above
# actually controls the split; the validation set is the tail of the train set
num_valid = int(len(X_train) * valid_rate)
X_valid, y_valid = X_train[-num_valid:], y_train[-num_valid:]
X_train, y_train = X_train[:-num_valid], y_train[:-num_valid]

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_valid: {X_valid.shape}, y_valid: {y_valid.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

In [None]:
# Class histogram of the train and validation labels. The two bar series are
# drawn on top of each other, so label them to keep the figure readable.
plt.bar(*np.unique(y_train, return_counts=True), label="train")
plt.bar(*np.unique(y_valid, return_counts=True), label="valid")
plt.xlabel("class")
plt.ylabel("count")
plt.legend()
plt.show()

In [None]:
# Get the indexes of 'batch_size' random training images (drawn with replacement)
batch_size = 16
random_indexes = np.random.randint(X_train.shape[0], size=batch_size)
# Plot the sampled images with their labels
batch_plot(X_train[random_indexes], y_train[random_indexes], with_border=False)

In [None]:
# Per-pixel mean over the training images only; the same mean is subtracted
# from the validation and test images.
mean_image = np.mean(X_train, axis=0)

# Centre each split and flatten every image into one row vector.
# NOTE: X_train / X_valid / X_test are overwritten here.
X_train = (X_train.astype(float) - mean_image).reshape((X_train.shape[0], -1))
X_valid = (X_valid.astype(float) - mean_image).reshape((X_valid.shape[0], -1))
X_test = (X_test.astype(float) - mean_image).reshape((X_test.shape[0], -1))

# Search a better model

In [None]:
from nn import Linear, ReLU, LogSoftmax, NLLLoss, SGD, Sequential, CrossEntropy


# A two-layer neural network with random weights and bias (no training yet)
hidden_units = 100
num_classes = len(target_classes)
# m: number of training samples, n: flattened input size, o: number of labels.
# `o` is derived from the training labels; it used to read `y`, the random toy
# targets of the NLL check cell, which has nothing to do with this dataset.
(m, n), o = X_train.shape, y_train.max() + 1
model = Sequential(
    [
        Linear(n, hidden_units),
        ReLU(),
        Linear(hidden_units, num_classes),
        LogSoftmax(),
    ]
)

# test accuracy of the untrained model, shown as the cell output
preds = np.argmax(model.predict(X_test), axis=1)
accuracy(y_test, preds)

## compare with svm and softmax

In [None]:
# Train a small two-layer model on train+valid, print its predictions on a few
# sample images before and after training, and save weight visualisations to a
# temporary directory.
if compare_with_linear:
    import io
    from PIL import Image
    from functional import relu

    tmp_dir = Path(tempfile.mkdtemp())

    # predict on sample images (values index into the train+valid concatenation)
    sample_images = {"cat": 110, "bird": 314, "airplane_0": 153, "automobile": 49, "airplane_1": 25}
    # With optimized weights and bias
    hidden_units = len(target_classes) * 4
    num_epochs = 50
    # first and last training epoch, used to index the stored weight snapshots
    snapshot_epochs = [0, num_epochs - 1]

    model = Sequential(
        [
            Linear(n, hidden_units, regularization=5),
            ReLU(),
            Linear(hidden_units, num_classes),
        ]
    )

    # fancy indexing copies the rows, so X_sample can be reused after training
    X_sample = np.concatenate([X_train, X_valid])[list(sample_images.values())]
    preds = model.predict(X_sample)
    preds = softmax(preds - np.max(preds, axis=-1, keepdims=True))
    print("preds on epoch 0" + "=" * 50)
    print(preds)

    optimizer = SGD(params=model.parameters, learning_rate=3e-4)
    loss = CrossEntropy()
    model.compile(loss=loss, optimizer=optimizer)
    history = model.fit(
        np.concatenate([X_train, X_valid]),
        np.concatenate([y_train, y_valid]),
        epochs=num_epochs,
        store_weights=True,
    )

    train_preds = np.argmax(model.predict(X_train), axis=1)
    valid_preds = np.argmax(model.predict(X_valid), axis=1)
    test_preds = np.argmax(model.predict(X_test), axis=1)

    print(f"train acc: {accuracy(y_train, train_preds)}")
    print(f"test acc: {accuracy(y_test, test_preds)}")

    preds = model.predict(X_sample)
    preds = softmax(preds - np.max(preds, axis=-1, keepdims=True))
    # these are the predictions after training; the banner used to say "epoch 0"
    print(f"preds on epoch {num_epochs}" + "=" * 50)
    print(preds)

    template_weights = softmax(model.parameters[2].value)
    for epoch in snapshot_epochs:
        # layer 2 as weighted sum of layer 1 templates
        weights = (relu(history["weights"][epoch]) @ template_weights).T
        weights_image = weights_to_images(weights.reshape((-1, 32, 32, 3)))

        img_buf = io.BytesIO()
        batch_plot(weights_image, with_border=False, save_path=img_buf, flatten_layout=True, flatten_columns=True)
        Image.open(img_buf).save(tmp_dir.joinpath(f"neural_network_weighted_weights_epoch_{epoch:04d}.png"))
        img_buf.close()

    # name the last snapshot explicitly instead of relying on the loop variable
    # that leaked out of the for-loop above
    final_epoch = snapshot_epochs[-1]
    weights = history["weights"][-1]
    weights_image = weights_to_images(weights.T.reshape((-1, 32, 32, 3)))

    img_buf = io.BytesIO()
    batch_plot(weights_image, with_border=False, save_path=img_buf, flatten_layout=True, flatten_columns=True)
    Image.open(img_buf).save(tmp_dir.joinpath(f"neural_network_weights_epoch_{final_epoch:04d}.png"))
    img_buf.close()
    # for i, img in enumerate(weights_image):
    #     Image.fromarray(img).resize((128, 128)).save(tmp_dir.joinpath(f"neural_network_weights_{i}_epoch_{epoch:04d}.png"))
    print(f"templates weights: {template_weights}")

In [None]:
!open {tmp_dir}

In [None]:
import itertools


def grid_search(epoch=30, learning_rates=None, regularization_strengths=None, num_hiddens=None, verbose=True):
    """Train one two-layer network per hyper-parameter combination and keep the best.

    Every combination of learning rate, regularization strength and hidden
    size is trained on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_valid``/``y_valid``. The input size ``n``, ``num_classes`` and the
    ``nn`` building blocks are also read from the notebook namespace.

    Args:
        epoch: number of epochs each candidate model is trained for.
        learning_rates: iterable of SGD learning rates; defaults to ``[1e-4, 3e-4]``.
        regularization_strengths: iterable of regularization values for the
            first ``Linear`` layer; defaults to ``np.logspace(-2, 2, 3)``.
        num_hiddens: iterable of hidden-layer sizes; defaults to ``[64, 128, 256]``.
        verbose: print a line whenever a new best validation accuracy is found.

    Returns:
        ``(scores, best_model)`` where ``scores`` maps ``(lr, reg, hidden)`` to
        ``(train_acc, valid_acc)`` and ``best_model`` is the model with the
        highest validation accuracy (ties go to the later candidate), or
        ``None`` when the grid is empty.
    """
    if learning_rates is None:
        learning_rates = [1e-4, 3e-4]
    if regularization_strengths is None:
        regularization_strengths = np.logspace(-2, 2, 3, endpoint=True)
    # must test the parameter `num_hiddens`; testing the unrelated notebook
    # global `num_hidden` meant this default was never applied
    if num_hiddens is None:
        num_hiddens = np.power(2, np.arange(6, 9))

    scores = {}
    best_val_acc = -1
    # defined up front so an empty grid returns None instead of raising
    best_model = None

    for lr, reg, hidden in itertools.product(learning_rates, regularization_strengths, num_hiddens):
        model = Sequential(
            [
                Linear(n, hidden, regularization=reg),
                ReLU(),
                Linear(hidden, num_classes),
                LogSoftmax(),
            ]
        )

        optimizer = SGD(params=model.parameters, learning_rate=lr)
        loss = NLLLoss()

        model.compile(loss=loss, optimizer=optimizer)
        model.fit(X_train, y_train, epochs=epoch, validation_data=(X_valid, y_valid), verbose=False)

        train_preds = np.argmax(model.predict(X_train), axis=1)
        valid_preds = np.argmax(model.predict(X_valid), axis=1)

        train_acc = accuracy(y_train, train_preds)
        valid_acc = accuracy(y_valid, valid_preds)

        scores[(lr, reg, hidden)] = (train_acc, valid_acc)
        if valid_acc >= best_val_acc:
            best_val_acc = valid_acc
            best_model = model

            if verbose:
                print(
                    f"best lr {lr:.2e} reg {reg:.2e} hiddens {hidden:3} train accuracy: {train_acc:.3f} val accuracy: {valid_acc:.3f}"
                )

    return scores, best_model


# Optional hyper-parameter search; prints the score of every combination tried.
if search:
    scores, best_model = grid_search(
        learning_rates=[2e-4, 3e-4],
        num_hiddens=[64, 81, 100, 121],
        regularization_strengths=[5, 10, 20, 30],
        verbose=True,
    )
    print("=" * 20)
    # full table of results, keyed by (learning rate, regularization, hidden size)
    for (lr, reg, hidden), (train_acc, valid_acc) in scores.items():
        print(
            f"lr {lr:.2e} reg {reg:.2e} hiddens {hidden} train accuracy: {train_acc:.3f} val accuracy: {valid_acc:.3f}"
        )

In [None]:
# Optional variant of the training cell below: the network ends with the raw
# linear outputs and is trained with CrossEntropy instead of LogSoftmax + NLLLoss.
if run_with_cross_entropy:
    # With optimized weights and bias
    hidden_units = 121

    model = Sequential(
        [
            Linear(n, hidden_units, regularization=5),
            ReLU(),
            Linear(hidden_units, num_classes),
        ]
    )

    optimizer = SGD(params=model.parameters, learning_rate=3e-4)
    loss = CrossEntropy()
    model.compile(loss=loss, optimizer=optimizer)
    history = model.fit(
        X_train, y_train, epochs=50, validation_data=(X_valid, y_valid), store_weights=save_weights_animation
    )

    # predicted class = index of the largest output per sample
    train_preds = np.argmax(model.predict(X_train), axis=1)
    valid_preds = np.argmax(model.predict(X_valid), axis=1)
    test_preds = np.argmax(model.predict(X_test), axis=1)

    print(f"train acc: {accuracy(y_train, train_preds)}")
    print(f"valid acc: {accuracy(y_valid, valid_preds)}")
    print(f"test acc: {accuracy(y_test, test_preds)}")

In [None]:
# With optimized weights and bias: train the final two-layer network
# (LogSoftmax output + NLLLoss) with fixed hyper-parameters.
hidden_units = 121

model = Sequential(
    [
        Linear(n, hidden_units, regularization=5),
        ReLU(),
        Linear(hidden_units, num_classes),
        LogSoftmax(),
    ]
)

optimizer = SGD(params=model.parameters, learning_rate=3e-4)
loss = NLLLoss()
model.compile(loss=loss, optimizer=optimizer)
# The GIF cell further down reads history["weights"] when save_weights_update
# is set, so the weights must be stored for either flag.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_valid, y_valid),
    store_weights=save_weights_animation or save_weights_update,
)

# predicted class = index of the largest log-probability per sample
train_preds = np.argmax(model.predict(X_train), axis=1)
valid_preds = np.argmax(model.predict(X_valid), axis=1)
test_preds = np.argmax(model.predict(X_test), axis=1)

print(f"train acc: {accuracy(y_train, train_preds)}")
print(f"valid acc: {accuracy(y_valid, valid_preds)}")
print(f"test acc: {accuracy(y_test, test_preds)}")

In [None]:
# Training and validation loss recorded by model.fit; label the two curves so
# they can be told apart.
plt.plot(history["train_loss"], label="train")
plt.plot(history["valid_loss"], label="valid")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Reshape the value of the model's first parameter back into 32x32x3 images,
# one per row of the transposed matrix (presumably the first Linear layer's
# weight, one image per hidden unit — confirm against nn.Linear).
linear_weights = model.parameters[0].value.T.reshape((-1, 32, 32, 3))
linear_weights_image = weights_to_images(linear_weights)

batch_plot(linear_weights_image, with_border=False)

In [None]:
# Render the weight snapshots stored during training into an animated GIF.
# Requires that model.fit stored the weights in history["weights"].
if save_weights_update:
    import io
    from PIL import Image

    total_weights = []
    # use every `update_steps`-th stored snapshot as one frame
    update_steps = 2

    for weights in history["weights"][::update_steps]:
        weights_image = weights_to_images(weights.T.reshape((-1, 32, 32, 3)))

        # batch_plot writes the figure into an in-memory buffer; copy the
        # resized frame so it stays valid after the buffer is closed
        img_buf = io.BytesIO()
        batch_plot(weights_image, with_border=False, save_path=img_buf)
        total_weights.append(Image.open(img_buf).resize((512, 512)).copy())
        img_buf.close()

    # the output directory may not exist on a fresh checkout
    Path("images").mkdir(parents=True, exist_ok=True)
    total_weights[0].save(
        "images/neural_network_weights_update.gif",
        save_all=True,
        append_images=total_weights[1:],
        optimize=False,
        duration=100,
        loop=0,
    )

<img src="images/neural_network_weights_update.gif" width="800">

<img src="images/neural_network_training_epoch_0049.png">