Implement the softmax activation class.

In [1]:
import numpy as np

In [2]:
class Softmax:
    """Softmax activation with the matching cross-entropy gradient.

    ``forward`` turns logits into probabilities along ``axis`` and caches the
    result on ``self.a``; ``backward`` uses that cache to return the gradient
    of the cross-entropy loss with respect to the logits.
    """

    def __init__(self, axis=0):
        """Create the layer.

        Args:
            axis: Axis along which the probabilities are normalised, i.e. the
                axis that indexes the classes. The default of 0 means each
                column of a 2-D input is treated as one sample.
        """
        self.axis = axis
        # Output of the most recent forward pass; stays None until forward()
        # has run so backward() can detect misuse.
        self.a = None

    def forward(self, z):
        """Compute softmax probabilities of the logits ``z``.

        Args:
            z: Array of logits.

        Returns:
            Array of probabilities with the same shape as ``z``; the entries
            sum to 1 along ``self.axis``. The same array is stored on
            ``self.a`` for use by ``backward``.
        """
        # Subtracting the maximum along the class axis does not change the
        # softmax value but keeps np.exp from overflowing on large logits.
        z_shift = z - np.max(z, axis=self.axis, keepdims=True)
        exp_z = np.exp(z_shift)
        self.a = exp_z / np.sum(exp_z, axis=self.axis, keepdims=True)
        return self.a

    def backward(self, y):
        """Gradient of softmax followed by cross-entropy w.r.t. the logits.

        This is the combined softmax + cross-entropy gradient, not the
        Jacobian of softmax alone. The result is not averaged over samples.

        Args:
            y: Target distribution (e.g. one-hot labels) laid out like the
                output of the last ``forward`` call.

        Returns:
            ``self.a - y``.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        if self.a is None:
            raise RuntimeError(
                "Softmax.backward() called before forward(); "
                "run a forward pass first."
            )
        return self.a - y


In [3]:
# Toy batch of logits: rows index the 3 classes, columns index the 3 samples
# (Softmax normalises down each column).
z = np.array([
    [2.0, 1.0, 0.5],
    [1.0, 3.0, 0.2],
    [0.1, 0.5, 2.0],
])

# One-hot targets: sample j belongs to class j, which is the identity matrix.
y = np.eye(3, dtype=int)

# Forward pass: each column of `probs` is a probability distribution.
softmax = Softmax()
probs = softmax.forward(z)
print("Softmax probabilities:\n", probs)

# Backward pass: probabilities minus targets, using the cached forward output.
dz = softmax.backward(y)
print("Gradient wrt logits:\n", dz)


Softmax probabilities:
 [[0.65900114 0.11116562 0.16070692]
 [0.24243297 0.82140902 0.11905462]
 [0.09856589 0.06742536 0.72023846]]
Gradient wrt logits:
 [[-0.34099886  0.11116562  0.16070692]
 [ 0.24243297 -0.17859098  0.11905462]
 [ 0.09856589  0.06742536 -0.27976154]]
