In [None]:
## Load libraries
import numpy as np
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [None]:
import tensorflow as tf

In [None]:
# Display the installed TensorFlow version (recorded for reproducibility)
tf.__version__

'2.15.0'

In [None]:
## Resolve DATA_DIR; when running inside Google Colab, mount Google Drive first
if 'google.colab' not in sys.modules:
    # Not on Colab: data is read from a folder relative to the notebook
    DATA_DIR = 'Data/'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    # Adjust the path below (everything after
    # /content/drive/MyDrive/Colab Notebooks/) to match how the data is
    # organized inside your own Colab Notebooks folder in Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/OddSem2023MAHE'
    DATA_DIR = DIR + '/Data/'

Mounted at /content/drive


In [None]:
# Generate artificial data with 5 samples, 4 features per sample
# and 3 output classes
num_samples = 5 # number of samples
num_features = 4 # number of features (a.k.a. dimensionality)
num_labels = 3 # number of output labels
# Seed the generator so that the data (and every number derived from it
# further down) is identical on each Restart & Run All
np.random.seed(42)
# Data matrix (each column = single sample)
X = np.random.choice(np.arange(3, 10), size = (num_features, num_samples), replace = True)
# Integer class labels, drawn from 0, ..., num_labels - 1
y_int = np.random.choice(np.arange(num_labels), size = num_samples, replace = True)
print(X)
print('------')
print(y_int)
print('------')
# One-hot encode class labels.
# num_classes is passed explicitly: by default to_categorical infers the
# number of columns from max(label) + 1, so a draw that happens to miss the
# highest label would produce fewer than num_labels columns.
y = tf.keras.utils.to_categorical(y_int, num_classes = num_labels)
print(y)

[[5 6 4 6 7]
 [5 8 7 9 8]
 [9 4 3 3 6]
 [7 5 8 9 8]]
------
[2 0 2 0 2]
------
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


---

A generic layer class with forward and backward methods

----

In [None]:
class Layer:
  """Minimal base class for network layers.

  Subclasses are expected to override `forward` and `backward`; the base
  implementations do nothing and return None.
  """

  def __init__(self):
    # Both slots start empty; concrete layers fill them in during forward()
    self.input = self.output = None

  def forward(self, input):
    """Forward pass placeholder: does nothing and returns None."""
    return None

  def backward(self, output_gradient, learning_rate):
    """Backward pass placeholder: does nothing and returns None."""
    return None

---

The softmax classifier steps for a generic sample $\mathbf{x}$ with (one-hot encoded) true label $\mathbf{y}$ (3 possible categories) using a randomly initialized weights matrix (with the bias absorbed as its last column):

1. Calculate raw scores vector $$\mathbf{z} = \mathbf{Wx}.$$
2. Calculate softmax probabilities (that is, softmax-activate the raw scores) $$\mathbf{a} = \text{softmax}(\mathbf{z})\Rightarrow\begin{bmatrix}a_1\\a_2\\a_3\end{bmatrix}= \text{softmax}\left(\begin{bmatrix}z_1\\z_2\\z_3\end{bmatrix}\right)=\begin{bmatrix}\frac{e^{z_1}}{e^{z_1}+e^{z_2}+e^{z_3}}\\\frac{e^{z_2}}{e^{z_1}+e^{z_2}+e^{z_3}}\\\frac{e^{z_3}}{e^{z_1}+e^{z_2}+e^{z_3}}\end{bmatrix}$$
3. The predicted probability vector, whose entries are the probabilities that the sample belongs to each of the output categories, is $\hat{\mathbf{y}} = \mathbf{a}.$
4. Softmax loss for this sample is
$$\begin{align*}L &=  -\log(a_y) \\&= -\log\left(\left[\text{softmax}(\mathbf{z})\right]_y\right)\\ &= -\log\left(\left[\text{softmax}(\mathbf{Wx})\right]_y\right)\\\Rightarrow L\left(\mathbf{y},\hat{\mathbf{y}}\right)&=\sum_{k=1}^3-y_k\log\left(\hat{y}_k\right)\end{align*},$$
which is also referred to as the categorical crossentropy loss.
5. Calculate the gradient of loss w.r.t. weights:
$$\begin{align*} L\\\downarrow\\ \hat{\mathbf{y}} &= \mathbf{a}\\\downarrow\\\mathbf{z}\\\downarrow\\\mathbf{W}\end{align*}$$
$$\begin{align*}\nabla_\mathbf{W}(L) &= \nabla_\mathbf{W}(\mathbf{z}) \times\nabla_\mathbf{z}(\mathbf{a})\times\nabla_\mathbf{a}(L).\end{align*}$$

$\nabla_\mathbf{z}(\mathbf{a}) = \nabla_\mathbf{z}\left(\begin{bmatrix}a_1\\a_2\\a_3\end{bmatrix}\right) = \begin{bmatrix}\nabla_\mathbf{z}(a_1)&\nabla_\mathbf{z}(a_2)&\nabla_\mathbf{z}(a_3)\end{bmatrix}.$

---

In [None]:
# Quick check of TensorFlow's softmax on a small raw-score vector
np.asarray(tf.nn.softmax([-1., 0., 1.]))

array([0.09003057, 0.24472848, 0.66524094], dtype=float32)

In [None]:
# prompt: softmax activation class

class Softmax(Layer):
  """Softmax activation layer for the raw-score vector of a single sample.

  `forward` delegates to `tf.nn.softmax`, which normalizes over the last
  axis, so the input is expected to be a 1-D vector of raw scores.
  """

  def __init__(self):
    super().__init__()

  def forward(self, input):
    """Softmax-activate the raw scores.

    Stores the input and the resulting probabilities on the layer and
    returns the probabilities as a NumPy array.
    """
    self.input = input
    self.output = np.array(tf.nn.softmax(self.input))
    return self.output

  def backward(self, output_gradient, learning_rate):
    """Propagate the loss gradient from the activations back to the raw scores.

    Parameters:
      output_gradient: gradient of the loss w.r.t. the softmax output `a`
        (same length as the output of the last `forward` call).
      learning_rate: unused (softmax has no trainable parameters); accepted
        only to match the `Layer.backward` signature.

    Returns:
      Gradient of the loss w.r.t. the raw scores `z`, in the same layout
      (1-D or column) as `output_gradient`.
    """
    # Use an explicit column vector so that the outer product below is the
    # full n x n matrix regardless of whether the stored output is 1-D or
    # (n, 1). The previous broadcasting form `(I - a.T) * a` silently
    # produced (delta_ij - a_j) * a_j for 1-D outputs, i.e. wrong
    # off-diagonal entries.
    a = np.reshape(self.output, (-1, 1))
    # Softmax Jacobian: d a_i / d z_j = a_i * (delta_ij - a_j)
    #                                 = diag(a) - a a^T   (symmetric)
    jacobian = np.diagflat(a) - np.dot(a, a.T)
    return np.dot(jacobian, output_gradient)


In [None]:
def cce(y, yhat, eps=1e-12):
  """Categorical crossentropy loss -sum_k y_k * log(yhat_k) for one sample.

  Parameters:
    y: one-hot encoded true label.
    yhat: predicted class probabilities, same length as `y`.
    eps: lower bound applied to `yhat` before taking the log. Without it a
      probability of exactly 0 gives log(0) = -inf, and 0 * -inf = nan for
      the non-target classes.

  Returns:
    The scalar loss value.
  """
  y = np.asarray(y)
  # Only the lower end is clipped; values in [eps, 1] are left untouched
  yhat = np.clip(np.asarray(yhat), eps, None)
  return(-np.sum(y*np.log(yhat)))
def cce_gradient(y, yhat, eps=1e-12):
  """Gradient of the categorical crossentropy loss w.r.t. the predictions.

  Parameters:
    y: one-hot encoded true label.
    yhat: predicted class probabilities, same length as `y`.
    eps: lower bound applied to `yhat` before dividing, so that a
      probability of exactly 0 does not yield inf (or nan for 0/0).

  Returns:
    Array with entries -y_k / yhat_k.
  """
  # Coerce to arrays so plain Python lists are accepted as well
  y = np.asarray(y)
  yhat = np.clip(np.asarray(yhat), eps, None)
  return(-y/yhat)

In [None]:
# Quick check of cce() on a hand-made example.
# Dedicated names are used here on purpose: reusing `y` would overwrite the
# one-hot label matrix of the artificial data set, which later cells still
# need.
y_demo = np.array([0, 1, 0])
yhat_demo = np.array([0.75, 0.1, 0.15])
print(cce(y_demo, yhat_demo))

2.3025850929940455


In [None]:
# Step-1: Add the bias feature to all the samples.
# The augmented matrix gets its own name so that re-running this cell does
# not keep stacking additional rows of ones onto X.
X_aug = np.vstack([X, np.ones(X.shape[1])])

# The loop below indexes y as a (num_samples, num_labels) one-hot matrix;
# fail early with a readable message if it has any other shape.
assert np.shape(y) == (X_aug.shape[1], num_labels), (
    "y must be the one-hot label matrix of shape (num_samples, num_labels)"
)

# Step-2: Initialize the entries of the weights matrix randomly
np.random.seed(42)
W = np.random.rand(num_labels, num_features + 1)  # +1 for the bias feature

# Step-3: Create softmax layer object
softmax = Softmax()

# Step size for gradient descent (single place to tune it)
learning_rate = 0.01

# Step-4: Run over each sample
for i in range(X_aug.shape[1]):
    x_i = X_aug[:, i]  # i-th sample, bias feature included
    y_i = y[i, :]      # its one-hot encoded label

    # Step-5: Forward step
    # (a) Calculate the raw scores vector for a generic sample
    z = np.dot(W, x_i)

    # (b) Softmax activation
    a = softmax.forward(z)

    # (c) Calculate categorical crossentropy (CCE) loss for the sample
    loss = cce(y_i, a)

    # Step-6: Backward step
    # (a) Gradient of the loss w.r.t. the raw scores z (chain rule through
    #     the loss and the softmax activation)
    grad_z = softmax.backward(cce_gradient(y_i, a), learning_rate=learning_rate)

    # (b) Gradient of the loss w.r.t. the weights: since z = W x,
    #     dL/dW[k, j] = dL/dz[k] * x[j], i.e. an outer product with the
    #     same shape as W
    gradient = np.outer(grad_z, x_i)

    # (c) Print CCE loss
    print(f"Sample {i + 1} - CCE Loss: {loss}")

    # (d) Print gradient
    print(f"Sample {i + 1} - Gradient: {gradient}")

    # Step-7: Update weights using gradient descent
    W -= learning_rate * gradient

# Print the final weights
print("\nFinal Weights:")
print(W)
