In [None]:
import numpy as np
import wandb
# Start the Weights & Biases run that the training loop logs its metrics to.
wandb.init(project="fashion-mnist-q8_a", name="CE_vs_Squared_Error")
from keras.datasets import fashion_mnist
def load_fashion_mnist(flatten=True, normalize=True, one_hot=True, num_classes=10):
    """Load Fashion-MNIST and optionally preprocess both splits.

    Args:
        flatten: If true, reshape each image array to (num_images, -1).
        normalize: If true, divide the image values by 255.0.
        one_hot: If true, replace each label array by the matching rows of a
            ``num_classes`` x ``num_classes`` identity matrix.
        num_classes: Width of the one-hot encoding.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    processed = []
    # load_data() yields the (images, labels) pair for train, then for test.
    for images, labels in fashion_mnist.load_data():
        if flatten:
            images = images.reshape(images.shape[0], -1)
        if normalize:
            images = images / 255.0
        if one_hot:
            identity = np.eye(num_classes)
            labels = identity[labels]
        processed.append((images, labels))
    (x_train, y_train), (x_test, y_test) = processed
    return x_train, y_train, x_test, y_test
class Activate:
    """Namespace of activation functions used by the network."""

    @staticmethod
    def relu(Z):
        """Element-wise rectified linear unit: max(0, z)."""
        rectified = np.maximum(0, Z)
        return rectified

    @staticmethod
    def relu_der(Z):
        """Element-wise ReLU derivative: 1.0 where z > 0, otherwise 0.0."""
        positive_mask = np.greater(Z, 0)
        return positive_mask.astype(float)

    @staticmethod
    def softmax(Z):
        """Row-wise softmax (normalises along axis 1).

        The row maximum is subtracted before exponentiating so that large
        logits do not overflow.
        """
        row_max = np.max(Z, axis=1, keepdims=True)
        exponentials = np.exp(Z - row_max)
        row_totals = np.sum(exponentials, axis=1, keepdims=True)
        return exponentials / row_totals
class FNN:
    """Fully connected classifier with ReLU hidden layers and a softmax output.

    Parameters are kept in two dictionaries: ``self.weights['W<l>']`` has shape
    ``(layers[l-1], layers[l])`` and ``self.biases['b<l>']`` has shape
    ``(1, layers[l])`` for ``l = 1..L``. ``forward`` stores pre-activations and
    activations in ``self.cache``; ``backward`` stores gradients in
    ``self.grads``; optimizer state lives in ``self.opt_caches``.
    """

    _OPTIMIZERS = ('sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam')

    def __init__(self, input_size, hidden_layers, output_size, seed=42):
        """Build the layer list and initialise the parameters.

        Args:
            input_size: Number of input features.
            hidden_layers: Sequence with the width of each hidden layer
                (may be empty).
            output_size: Number of output classes.
            seed: Passed to ``np.random.seed``; note that this reseeds NumPy's
                global generator, which ``train`` also uses for shuffling.
        """
        np.random.seed(seed)
        self.layers = [input_size] + list(hidden_layers) + [output_size]
        self.L = len(self.layers) - 1
        self.weights = {}
        self.biases = {}
        for l in range(1, self.L + 1):
            fan_in, fan_out = self.layers[l - 1], self.layers[l]
            # Glorot/Xavier uniform initialisation.
            limit = np.sqrt(6 / (fan_in + fan_out))
            self.weights['W' + str(l)] = np.random.uniform(-limit, limit, (fan_in, fan_out))
            self.biases['b' + str(l)] = np.zeros((1, fan_out))
        self.opt_caches = {}
        # Number of parameter updates performed by train(); used as the
        # bias-correction timestep for Adam/Nadam.
        self.step_count = 0

    def forward(self, X):
        """Run a forward pass and cache every layer's Z and A.

        Args:
            X: Input batch with one sample per row.

        Returns:
            The softmax output of the last layer.
        """
        self.cache = {'A0': X}
        A = X
        for l in range(1, self.L + 1):
            Z = np.dot(A, self.weights['W' + str(l)]) + self.biases['b' + str(l)]
            # Hidden layers use ReLU; only the last layer uses softmax.
            A = Activate.softmax(Z) if l == self.L else Activate.relu(Z)
            self.cache['Z' + str(l)] = Z
            self.cache['A' + str(l)] = A
        return A

    def backward(self, Y_true, loss_type="cross_entropy"):
        """Back-propagate from the cached forward pass into ``self.grads``.

        Args:
            Y_true: One-hot targets for the batch last given to ``forward``.
            loss_type: ``"cross_entropy"`` or ``"squared_error"``.

        Raises:
            ValueError: If ``loss_type`` is not one of the supported names.
        """
        m = Y_true.shape[0]
        self.grads = {}
        Y_pred = self.cache['A' + str(self.L)]
        if loss_type == "cross_entropy":
            # Softmax + cross-entropy collapses to this simple form.
            dZ = Y_pred - Y_true
        elif loss_type == "squared_error":
            # Gradient of sum((A - Y)^2) pushed through the full softmax
            # Jacobian: dZ_j = A_j * (g_j - sum_i g_i * A_i), g = 2 * (A - Y).
            dA = 2.0 * (Y_pred - Y_true)
            dZ = Y_pred * (dA - np.sum(dA * Y_pred, axis=1, keepdims=True))
        else:
            raise ValueError(
                "Unknown loss_type %r; expected 'cross_entropy' or 'squared_error'" % (loss_type,)
            )
        self.grads['dW' + str(self.L)] = np.dot(self.cache['A' + str(self.L - 1)].T, dZ) / m
        self.grads['db' + str(self.L)] = np.sum(dZ, axis=0, keepdims=True) / m
        dA_prev = np.dot(dZ, self.weights['W' + str(self.L)].T)
        for l in reversed(range(1, self.L)):
            dZ = dA_prev * Activate.relu_der(self.cache['Z' + str(l)])
            self.grads['dW' + str(l)] = np.dot(self.cache['A' + str(l - 1)].T, dZ) / m
            self.grads['db' + str(l)] = np.sum(dZ, axis=0, keepdims=True) / m
            if l > 1:
                # Nothing consumes the gradient w.r.t. the network input.
                dA_prev = np.dot(dZ, self.weights['W' + str(l)].T)

    def _state(self, name, like):
        """Return optimizer state ``name``, creating it as zeros shaped like ``like``."""
        if name not in self.opt_caches:
            self.opt_caches[name] = np.zeros_like(like)
        return self.opt_caches[name]

    def _parameter_step(self, optimizer, key, grad, lr, beta1, beta2, epsilon, t):
        """Update the optimizer state for parameter ``key`` and return the
        amount to subtract from that parameter."""
        if optimizer == 'sgd':
            return lr * grad
        if optimizer == 'momentum':
            # Exponential-moving-average form of momentum.
            v = beta1 * self._state('v' + key, grad) + (1 - beta1) * grad
            self.opt_caches['v' + key] = v
            return lr * v
        if optimizer == 'nesterov':
            v = beta1 * self._state('v' + key, grad) + lr * grad
            self.opt_caches['v' + key] = v
            # Look-ahead form: (1 + beta1) * v_new - beta1 * v_prev, which
            # simplifies to beta1 * v_new + lr * grad.
            return beta1 * v + lr * grad
        if optimizer == 'rmsprop':
            s = beta2 * self._state('s' + key, grad) + (1 - beta2) * (grad ** 2)
            self.opt_caches['s' + key] = s
            return lr * grad / (np.sqrt(s) + epsilon)
        # 'adam' and 'nadam' share the moment estimates and bias correction.
        v = beta1 * self._state('v' + key, grad) + (1 - beta1) * grad
        s = beta2 * self._state('s' + key, grad) + (1 - beta2) * (grad ** 2)
        self.opt_caches['v' + key] = v
        self.opt_caches['s' + key] = s
        v_corr = v / (1 - beta1 ** t)
        s_corr = s / (1 - beta2 ** t)
        if optimizer == 'nadam':
            # Nesterov look-ahead: blend the corrected first moment with the
            # bias-corrected current gradient.
            v_corr = beta1 * v_corr + (1 - beta1) * grad / (1 - beta1 ** t)
        return lr * v_corr / (np.sqrt(s_corr) + epsilon)

    def update_parameters(self, lr=0.01, optimizer='sgd', beta1=0.9, beta2=0.999, epsilon=1e-8, t=1):
        """Apply one optimizer step using the gradients in ``self.grads``.

        Args:
            lr: Learning rate.
            optimizer: One of 'sgd', 'momentum', 'nesterov', 'rmsprop',
                'adam', 'nadam'.
            beta1: Decay for the first-moment / momentum accumulator.
            beta2: Decay for the second-moment accumulator (rmsprop, adam, nadam).
            epsilon: Added to the denominator of adaptive steps.
            t: 1-based count of updates performed so far, including this one;
                only used for Adam/Nadam bias correction.

        Raises:
            ValueError: If ``optimizer`` is unknown, or if ``t < 1`` for
                'adam'/'nadam' (the bias correction would divide by zero).
        """
        if optimizer not in self._OPTIMIZERS:
            raise ValueError(
                "Unknown optimizer %r; expected one of %s" % (optimizer, ", ".join(self._OPTIMIZERS))
            )
        if optimizer in ('adam', 'nadam') and t < 1:
            raise ValueError("t must be >= 1 for %s, got %r" % (optimizer, t))
        for l in range(1, self.L + 1):
            targets = (
                (self.weights, 'W' + str(l), self.grads['dW' + str(l)]),
                (self.biases, 'b' + str(l), self.grads['db' + str(l)]),
            )
            for store, key, grad in targets:
                store[key] -= self._parameter_step(optimizer, key, grad, lr, beta1, beta2, epsilon, t)

    def cross_entropy_loss(self, Y_pred, Y_true):
        """Mean cross-entropy over the batch (1e-8 is added inside the log)."""
        m = Y_true.shape[0]
        return -np.sum(Y_true * np.log(Y_pred + 1e-8)) / m

    def squared_error_loss(self, Y_pred, Y_true):
        """Mean over the batch of the per-sample sum of squared errors."""
        return np.mean(np.sum((Y_pred - Y_true) ** 2, axis=1))

    def compute_loss(self, Y_pred, Y_true):
        """Default loss; delegates to ``cross_entropy_loss``."""
        return self.cross_entropy_loss(Y_pred, Y_true)

    def train(self, X_train, Y_train, X_test, Y_test, epochs=10, batch_size=64, lr=0.01, optimizer='sgd',
              loss_type="cross_entropy"):
        """Mini-batch training loop that logs both losses and test accuracy to wandb.

        Both the cross-entropy and the squared-error loss are recorded for
        every batch; only ``loss_type`` is used to compute gradients.

        Args:
            X_train, Y_train: Training inputs and one-hot targets.
            X_test, Y_test: Evaluation inputs and one-hot targets.
            epochs: Number of passes over the training data.
            batch_size: Samples per mini-batch.
            lr: Learning rate.
            optimizer: Optimizer name accepted by ``update_parameters``.
            loss_type: Loss used for back-propagation, as accepted by ``backward``.

        Returns:
            Dict with per-epoch lists under ``"cross_entropy_loss"``,
            ``"squared_error_loss"`` and ``"test_accuracy"``.

        Raises:
            ValueError: If ``X_train`` contains no samples.
        """
        num_samples = X_train.shape[0]
        if num_samples == 0:
            raise ValueError("X_train contains no samples")
        loss_history_ce = []
        loss_history_se = []
        acc_history = []
        for epoch in range(epochs):
            perm = np.random.permutation(num_samples)
            X_shuf, Y_shuf = X_train[perm], Y_train[perm]
            epoch_ce, epoch_se, num_batches = 0, 0, 0
            for i in range(0, num_samples, batch_size):
                Xb = X_shuf[i:i + batch_size]
                Yb = Y_shuf[i:i + batch_size]
                Y_pred = self.forward(Xb)
                loss_ce = self.cross_entropy_loss(Y_pred, Yb)
                loss_se = self.squared_error_loss(Y_pred, Yb)
                self.backward(Yb, loss_type=loss_type)
                # Adam/Nadam bias correction needs the number of updates made
                # so far, not the epoch index.
                self.step_count = getattr(self, 'step_count', 0) + 1
                self.update_parameters(lr=lr, optimizer=optimizer, t=self.step_count)
                epoch_ce += loss_ce
                epoch_se += loss_se
                num_batches += 1
            epoch_ce /= num_batches
            epoch_se /= num_batches
            loss_history_ce.append(epoch_ce)
            loss_history_se.append(epoch_se)
            Y_test_pred = self.forward(X_test)
            test_acc = np.mean(
                np.argmax(Y_test_pred, axis=1) == np.argmax(Y_test, axis=1)
            )
            acc_history.append(test_acc)
            wandb.log({
                "epoch": epoch,
                "cross_entropy_loss": epoch_ce,
                "squared_error_loss": epoch_se,
                "test_accuracy": test_acc
            })
        # One combined chart so both loss curves can be compared directly.
        wandb.log({
            "loss_comparison": wandb.plot.line_series(
                xs=list(range(len(loss_history_ce))),
                ys=[loss_history_ce, loss_history_se],
                keys=["Cross Entropy", "Squared Error"],
                title="Loss Comparison: Cross-Entropy vs Squared Error",
                xname="Epoch"
            )
        })
        return {
            "cross_entropy_loss": loss_history_ce,
            "squared_error_loss": loss_history_se,
            "test_accuracy": acc_history,
        }

In [None]:
# Load Fashion-MNIST with the loader defaults: flattened images scaled by
# 1/255 and one-hot labels.
X_train, Y_train, X_test, Y_test = load_fashion_mnist()

# 784 inputs -> 128 -> 64 -> 10 outputs.
model = FNN(input_size=784, hidden_layers=[128, 64], output_size=10)

# Train with Adam; metrics are logged to the wandb run each epoch.
model.train(X_train, Y_train, X_test, Y_test, epochs=10, batch_size=64, lr=0.01, optimizer='adam')
wandb.finish()