In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist
from sklearn.model_selection  import train_test_split
import itertools
import math
%matplotlib inline
!pip install wandb
import wandb
from wandb.keras import WandbCallback

In [None]:
# load_data() returns a (train, test) pair, each an (images, labels) tuple.
train_split, test_split = fashion_mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

Q1 : Printing sample images from each class

In [None]:
# Log the first training image of every class to Weights & Biases, one log call per class.
wandb.init(project='CS6910-assignment1')
samples = list(y_train)
class_names = ["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"]
for label, name in enumerate(class_names):
    # Position of the first training sample carrying this label.
    first_idx = samples.index(label)
    wandb.log({'label': label, 'image': [wandb.Image(x_train[first_idx], caption=name)]})

In [None]:
# Hold out 10% of the training data for validation; the fixed seed keeps the split reproducible.
split = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=42,
)
x_train, x_val, y_train, y_val = split

In [None]:
# Flatten every image to a 1-D feature vector.
x_train = x_train.reshape(x_train.shape[0], -1).astype(np.float64)
x_val = x_val.reshape(x_val.shape[0], -1).astype(np.float64)
x_test = x_test.reshape(x_test.shape[0], -1).astype(np.float64)

# Centre and scale all three splits with statistics taken from the training split
# only. Normalising validation/test data with their own mean and maximum (as was
# done before) feeds the network inputs that were transformed differently from the
# data it was trained on.
train_mean = x_train.mean(axis=0)          # per-feature mean of the training split
train_scale = (x_train - train_mean).max() # largest centred training value

x_train = (x_train - train_mean) / train_scale
x_val = (x_val - train_mean) / train_scale
x_test = (x_test - train_mean) / train_scale

Q2, Q3 : y_hat gives the probability distribution over the 10 classes. All the functions are written inside the class "NeuralNetwork".

In [None]:
class NeuralNetwork:
    """Fully connected feed-forward classifier with hand-written backpropagation.

    Every layer except the last applies the same activation (``'relu'``, ``'tanh'``
    or ``'sigmoid'``); the last layer's pre-activation goes through a softmax. The
    loss is the cross-entropy of the true class plus ``L2regConst/2 * sum(W**2)``.

    Samples are processed one at a time. Within a mini-batch the per-sample
    gradients (each including the ``L2regConst * W`` term) are summed, not averaged,
    and the optimiser is applied once per mini-batch.

    Attributes:
        W: list with one weight matrix per layer, shape ``(fan_out, fan_in)``.
        b: list with one bias vector per layer, shape ``(fan_out,)``.
    """

    def __init__(self, sizeOfInput, numberOfNeuronsEachLayer, numberOfLayers, activationFunction, typeOfInit, L2regConst = 0):
        """Store the architecture and draw the initial parameters.

        Args:
            sizeOfInput: length of one (flattened) input vector.
            numberOfNeuronsEachLayer: width of every layer, output layer last.
            numberOfLayers: number of weight layers (hidden layers + output layer).
            activationFunction: ``'relu'``, ``'tanh'`` or ``'sigmoid'``.
            typeOfInit: ``'random'`` or ``'xavier'``.
            L2regConst: L2 regularisation strength (0 disables it).
        """
        self.activationFunction = activationFunction
        self.sizeOfInput = sizeOfInput
        self.numberOfLayers = numberOfLayers
        self.numberOfNeuronsEachLayer = numberOfNeuronsEachLayer
        self.L2regConst = L2regConst
        self.W, self.b = self.initializer(typeOfInit)

    #Initialize weights and biases
    def initializer(self, init):
        """Return freshly drawn ``(W, b)`` lists for the configured architecture.

        ``'random'`` draws weights from a standard normal; ``'xavier'`` draws them
        from a normal with standard deviation ``sqrt(2 / (fan_in + fan_out))``.
        Biases are drawn uniformly from [0, 1) in both cases.

        Raises:
            ValueError: if ``init`` is neither ``'random'`` nor ``'xavier'``.
        """
        widths = self.numberOfNeuronsEachLayer
        fan_in = [self.sizeOfInput] + list(widths[:self.numberOfLayers - 1])

        # All weights are drawn before any bias so a fixed NumPy seed always
        # produces the same parameters.
        if init == 'random':
            W = [np.random.randn(widths[i], fan_in[i]) for i in range(self.numberOfLayers)]
        elif init == 'xavier':
            W = [np.random.normal(0, math.sqrt(2/(widths[i] + fan_in[i])), (widths[i], fan_in[i]))
                 for i in range(self.numberOfLayers)]
        else:
            raise ValueError("unknown weight initialisation: %r" % (init,))

        b = [np.random.rand(widths[i]) for i in range(self.numberOfLayers)]
        return W, b

    def initialize(self, sizeOfInput, numberOfLayers, numberOfNeuronsEachLayer):
        """Return all-zero ``(W, b)`` lists shaped like the network parameters.

        Used for gradient accumulators and optimiser state.
        """
        fan_in = [sizeOfInput] + list(numberOfNeuronsEachLayer[:numberOfLayers - 1])
        W = [np.zeros((numberOfNeuronsEachLayer[i], fan_in[i])) for i in range(numberOfLayers)]
        b = [np.zeros(numberOfNeuronsEachLayer[i]) for i in range(numberOfLayers)]
        return W, b

    #Function calls for all optimizers
    def optimize(self, X, Y, valImages, valLabels, optimizer, learningRate, epochs, batchSize):
        """Train the network with the optimiser selected by name.

        Args:
            X, Y: training inputs (one flattened sample per row) and integer labels.
            valImages, valLabels: validation data evaluated after every epoch.
            optimizer: ``'sgd'``, ``'momentum'``, ``'nag'``, ``'rmsprop'``,
                ``'adam'`` or ``'nadam'``.
            learningRate: step size.
            epochs: number of passes over ``X``.
            batchSize: samples per update (ignored by ``'sgd'``, which updates
                after every sample).

        Raises:
            ValueError: if ``optimizer`` is not one of the names above.
        """
        if optimizer == 'sgd':
            self.stochastic_gradient_descent(X, Y, valImages, valLabels, learningRate, epochs)
            return

        batched = {
            'momentum': self.momentum_gradient_descent,
            'nag': self.nesterov_accelerated_gradient_descent,
            'rmsprop': self.rmsprop,
            'adam': self.adam,
            'nadam': self.nadam,
        }
        if optimizer not in batched:
            # Previously an unknown name silently trained nothing.
            raise ValueError("unknown optimizer: %r" % (optimizer,))
        batched[optimizer](X, Y, valImages, valLabels, learningRate, epochs, batchSize)

    def activation(self,x):
        """Apply the configured hidden-layer activation element-wise."""
        if self.activationFunction == 'relu':
            return self.ReLU(x)
        elif self.activationFunction == 'tanh':
            return self.tanh(x)
        elif self.activationFunction == 'sigmoid':
            return self.sigmoid(x)
        raise ValueError("unknown activation function: %r" % (self.activationFunction,))

    def activationDerivative(self,x):
        """Derivative of the configured activation, evaluated at pre-activation ``x``."""
        if self.activationFunction == 'relu':
            return self.ReLUDerivative(x)
        elif self.activationFunction == 'tanh':
            return self.tanhDerivative(x)
        elif self.activationFunction == 'sigmoid':
            return self.sigmoidDerivative(x)
        raise ValueError("unknown activation function: %r" % (self.activationFunction,))

    def ReLU(self,Z):
        """Element-wise max(0, z)."""
        return np.maximum(0,Z)

    def ReLUDerivative(self,Z):
        """1 where z > 0, else 0 (as a float array)."""
        return (np.asarray(Z) > 0).astype(float)

    def tanh(self, Z):
        """Element-wise hyperbolic tangent.

        ``np.tanh`` is used because the explicit exp-ratio overflows to nan for
        large ``|z|``.
        """
        return np.tanh(np.asarray(Z, dtype=float))

    def tanhDerivative(self, Z):
        """1 - tanh(z)**2."""
        return 1 - self.tanh(Z)**2

    def sigmoidDerivative(self,Z):
        """sigmoid(z) * (1 - sigmoid(z))."""
        s = self.sigmoid(Z)
        return s*(1 - s)

    def sigmoid(self,x):
        """Element-wise logistic function, evaluated without overflow.

        ``exp(-|x|)`` never exceeds 1; the two branches below are the usual
        ``1/(1+exp(-x))`` for x >= 0 and ``exp(x)/(1+exp(x))`` for x < 0.
        """
        x = np.asarray(x, dtype=float)
        e = np.exp(-np.abs(x))
        return np.where(x >= 0, 1/(1 + e), e/(1 + e))

    def softmaxFunction(self,Z):
        """Softmax over axis 0; the maximum is subtracted first to avoid overflow."""
        shifted = np.exp(Z - Z.max())
        return shifted/np.sum(shifted, axis=0)

    #Forward and backward Propagation
    def forwardPropagation(self,Input):
        """Run one sample through the network.

        Returns:
            A: pre-activations of every layer (``numberOfLayers`` entries).
            H: activations of every layer except the last (``numberOfLayers - 1`` entries).
            y_hat: softmax of the last pre-activation.
        """
        A = []
        H = []
        Input = np.array(Input)
        A.append(self.W[0].dot(Input) + self.b[0])
        for i in range(1, self.numberOfLayers):
            H.append(self.activation(A[-1]))
            A.append(self.W[i].dot(H[-1]) + self.b[i])
        y_hat = self.softmaxFunction(A[-1])
        return A, H, y_hat

    def backwardPropagation(self, A, H, y_hat, y, Input):
        """Gradients of the regularised cross-entropy loss for one sample.

        Args:
            A, H, y_hat: values returned by ``forwardPropagation`` for ``Input``.
            y: integer index of the true class.
            Input: the sample itself.

        Returns:
            ``(delW, delb)``: per-layer gradient lists ordered like ``self.W`` /
            ``self.b``. ``delW`` includes the ``L2regConst * W`` term.
        """
        # Build a new list instead of inserting into H, so the caller's list is
        # left untouched.
        layer_inputs = [np.asarray(Input)] + list(H)

        target = np.zeros(self.numberOfNeuronsEachLayer[-1])
        target[y] = 1
        delta = y_hat - target  # gradient w.r.t. the last pre-activation

        delW = [None]*self.numberOfLayers
        delb = [None]*self.numberOfLayers
        for i in range(self.numberOfLayers - 1, -1, -1):
            delW[i] = np.outer(delta, layer_inputs[i]) + self.L2regConst*self.W[i]
            delb[i] = delta
            if i > 0:
                # Propagate to the previous layer; skipped for layer 0, where the
                # result would never be used.
                delta = np.multiply(self.W[i].T.dot(delta), self.activationDerivative(A[i-1]))
        return delW, delb

    def _l2_penalty(self):
        """Return ``L2regConst/2 * sum of squared weights`` (0 when regularisation is off)."""
        if not self.L2regConst:
            return 0.0
        return self.L2regConst/2*sum(float(np.sum(w*w)) for w in self.W)

    def _sample_loss(self, y_hat, label):
        """Cross-entropy of one prediction; the probability is floored so log(0) cannot occur."""
        return -math.log(max(float(y_hat[label]), np.finfo(float).tiny))

    #Validation loss and validation accuracy
    def valLossAccuracy(self,valImages,valLabels):
        """Return ``(mean regularised loss, accuracy in percent)`` on the given data."""
        count = 0
        error = 0
        penalty = self._l2_penalty()  # weights do not change while evaluating
        for i in range(valImages.shape[0]):
            A,H,y_hat = self.forwardPropagation(valImages[i])
            error += self._sample_loss(y_hat, valLabels[i]) + penalty
            if np.argmax(y_hat) == valLabels[i]:
                count += 1
        return error/valImages.shape[0], count/valImages.shape[0]*100

    #Test result and accuracy
    def test(self, testImages, testLabels):
        """Return ``(predicted class index per image, accuracy in percent)``."""
        count = 0
        predictions = []
        for i in range(testImages.shape[0]):
            A,H,y = self.forwardPropagation(testImages[i])
            predicted = int(np.argmax(y))
            if predicted == testLabels[i]:
                count += 1
            predictions.append(predicted)
        return np.array(predictions, dtype=int), count/testImages.shape[0]*100

    def _log_epoch(self, epoch, trainLoss, trainAccuracy, validLoss, validAccuracy):
        """Log one epoch's metrics to Weights & Biases.

        Both key sets that were previously in use are written (``loss`` /
        ``accuracy`` / ``val_*`` and ``train_*`` / ``valid_*``) so that every
        optimiser reports the same keys. ``epoch`` is 1-based.
        """
        wandb.log({
            'epoch': epoch,
            'loss': trainLoss, 'accuracy': trainAccuracy,
            'val_loss': validLoss, 'val_accuracy': validAccuracy,
            'train_loss': trainLoss, 'train_accuracy': trainAccuracy,
            'valid_loss': validLoss, 'valid_accuracy': validAccuracy,
        })

    def _train(self, X, Y, valImages, valLabels, epochs, batchSize, step, before_batch=None):
        """Shared mini-batch loop used by every optimiser.

        For each batch of ``batchSize`` consecutive samples (the last batch of an
        epoch may be shorter) the per-sample gradients are summed and handed to
        ``step(gradW, gradb)``, which must update ``self.W`` / ``self.b``.
        ``before_batch()``, if given, runs before the first sample of each batch.
        After every epoch the training and validation metrics are logged.
        """
        n = X.shape[0]
        if n == 0:
            raise ValueError("X must contain at least one sample")
        if batchSize < 1:
            raise ValueError("batchSize must be at least 1")

        for epoch in range(epochs):
            correct = 0
            loss = 0.0
            gradW, gradb = self.initialize(self.sizeOfInput, self.numberOfLayers, self.numberOfNeuronsEachLayer)
            in_batch = 0
            penalty = 0.0

            for i in range(n):
                if in_batch == 0:
                    if before_batch is not None:
                        before_batch()
                    # Weights are constant within a batch, so is the penalty.
                    penalty = self._l2_penalty()

                A, H, y_hat = self.forwardPropagation(X[i])
                loss += self._sample_loss(y_hat, Y[i]) + penalty
                if np.argmax(y_hat) == Y[i]:
                    correct += 1

                w, b = self.backwardPropagation(A, H, y_hat, Y[i], X[i])
                for k in range(self.numberOfLayers):
                    gradW[k] += w[k]
                    gradb[k] += b[k]
                in_batch += 1

                if in_batch == batchSize or i == n - 1:
                    step(gradW, gradb)
                    gradW, gradb = self.initialize(self.sizeOfInput, self.numberOfLayers, self.numberOfNeuronsEachLayer)
                    in_batch = 0

            validLoss, validAccuracy = self.valLossAccuracy(valImages, valLabels)
            self._log_epoch(epoch + 1, loss/n, correct/n*100, validLoss, validAccuracy)

    #All optimisation functions are coded below

    def stochastic_gradient_descent(self, X, Y, valImages, valLabels, learningRate, epochs):
        """Plain gradient descent with one update per training sample."""
        def step(gradW, gradb):
            for k in range(self.numberOfLayers):
                self.W[k] = self.W[k] - learningRate*gradW[k]
                self.b[k] = self.b[k] - learningRate*gradb[k]

        self._train(X, Y, valImages, valLabels, epochs, 1, step)

    def nesterov_accelerated_gradient_descent(self, X, Y, valImages, valLabels, learningRate, epochs, batchSize, gamma = 0.5):
        """Nesterov accelerated gradient descent.

        Per batch: gradients are evaluated at the look-ahead point
        ``theta - gamma*update``; then ``update = gamma*update + lr*grad`` and
        ``theta = theta - update``.
        """
        updateW, updateb = self.initialize(self.sizeOfInput, self.numberOfLayers, self.numberOfNeuronsEachLayer)
        thetaW = list(self.W)
        thetab = list(self.b)

        def look_ahead():
            for k in range(self.numberOfLayers):
                # Remember the real parameters, then move the network to the
                # look-ahead point for this batch's forward/backward passes.
                thetaW[k] = self.W[k]
                thetab[k] = self.b[k]
                self.W[k] = thetaW[k] - gamma*updateW[k]
                self.b[k] = thetab[k] - gamma*updateb[k]

        def step(gradW, gradb):
            for k in range(self.numberOfLayers):
                updateW[k] = gamma*updateW[k] + learningRate*gradW[k]
                updateb[k] = gamma*updateb[k] + learningRate*gradb[k]
                self.W[k] = thetaW[k] - updateW[k]
                self.b[k] = thetab[k] - updateb[k]

        self._train(X, Y, valImages, valLabels, epochs, batchSize, step, before_batch=look_ahead)

    def momentum_gradient_descent(self,X, Y, valImages, valLabels, learningRate, epochs, batchSize, gamma = 0.6):
        """Gradient descent with momentum: ``update = gamma*update + lr*grad`` once per batch."""
        updateW, updateb = self.initialize(self.sizeOfInput,self.numberOfLayers,self.numberOfNeuronsEachLayer)

        def step(gradW, gradb):
            for k in range(self.numberOfLayers):
                updateW[k] = gamma*updateW[k] + learningRate*gradW[k]
                updateb[k] = gamma*updateb[k] + learningRate*gradb[k]
                self.W[k] = self.W[k] - updateW[k]
                self.b[k] = self.b[k] - updateb[k]

        self._train(X, Y, valImages, valLabels, epochs, batchSize, step)

    def rmsprop(self,X, Y, valImages,valLabels, learningRate, epochs, batchSize, beta = 0.89, epsilon = 1e-6):
        """RMSProp: scale each step by a running average of squared gradients.

        The running average is updated exactly once per batch; ``epsilon`` is
        added inside the square root.
        """
        v_W, v_b = self.initialize(self.sizeOfInput,self.numberOfLayers,self.numberOfNeuronsEachLayer)

        def step(gradW, gradb):
            for k in range(self.numberOfLayers):
                v_W[k] = beta*v_W[k] + (1-beta)*gradW[k]**2
                v_b[k] = beta*v_b[k] + (1-beta)*gradb[k]**2
                self.W[k] = self.W[k] - (learningRate*gradW[k])/np.sqrt(v_W[k] + epsilon)
                self.b[k] = self.b[k] - (learningRate*gradb[k])/np.sqrt(v_b[k] + epsilon)

        self._train(X, Y, valImages, valLabels, epochs, batchSize, step)

    def adam(self, X, Y, valImages, valLabels, learningRate, epochs, batchSize, beta1 = 0.89, beta2 = 0.989, epsilon = 1e-8):
        """Adam with bias-corrected first and second moment estimates.

        The moments are divided by ``1 - beta**t``, where ``t`` counts the updates
        performed so far (starting at 1). ``epsilon`` is added inside the square root.
        """
        m_W, m_b = self.initialize(self.sizeOfInput,self.numberOfLayers,self.numberOfNeuronsEachLayer)
        v_W, v_b = self.initialize(self.sizeOfInput,self.numberOfLayers,self.numberOfNeuronsEachLayer)
        updates = [0]  # one-element list so the closure can increment it

        def step(gradW, gradb):
            updates[0] += 1
            correction1 = 1 - math.pow(beta1, updates[0])
            correction2 = 1 - math.pow(beta2, updates[0])
            for k in range(self.numberOfLayers):
                m_W[k] = beta1*m_W[k] + (1-beta1)*gradW[k]
                m_b[k] = beta1*m_b[k] + (1-beta1)*gradb[k]
                v_W[k] = beta2*v_W[k] + (1-beta2)*gradW[k]*gradW[k]
                v_b[k] = beta2*v_b[k] + (1-beta2)*gradb[k]*gradb[k]
                m_hat_W = m_W[k]/correction1
                m_hat_b = m_b[k]/correction1
                v_hat_W = v_W[k]/correction2
                v_hat_b = v_b[k]/correction2
                self.W[k] = self.W[k] - (learningRate*m_hat_W)/np.sqrt(v_hat_W + epsilon)
                self.b[k] = self.b[k] - (learningRate*m_hat_b)/np.sqrt(v_hat_b + epsilon)

        self._train(X, Y, valImages, valLabels, epochs, batchSize, step)

    def nadam(self, X, Y, valImages, valLabels, learningRate, epochs, batchSize, beta1 = 0.89, beta2 = 0.989, epsilon = 1e-8):
        """Nadam: Adam whose step blends the corrected momentum with the current gradient.

        The step direction is
        ``beta1*m_hat + (1-beta1)*grad/(1 - beta1**t)`` divided by
        ``sqrt(v_hat + epsilon)``, with ``t`` the 1-based update count.
        """
        m_W, m_b = self.initialize(self.sizeOfInput,self.numberOfLayers,self.numberOfNeuronsEachLayer)
        v_W, v_b = self.initialize(self.sizeOfInput,self.numberOfLayers,self.numberOfNeuronsEachLayer)
        updates = [0]  # one-element list so the closure can increment it

        def step(gradW, gradb):
            updates[0] += 1
            correction1 = 1 - math.pow(beta1, updates[0])
            correction2 = 1 - math.pow(beta2, updates[0])
            for k in range(self.numberOfLayers):
                m_W[k] = beta1*m_W[k] + (1-beta1)*gradW[k]
                m_b[k] = beta1*m_b[k] + (1-beta1)*gradb[k]
                v_W[k] = beta2*v_W[k] + (1-beta2)*gradW[k]**2
                v_b[k] = beta2*v_b[k] + (1-beta2)*gradb[k]**2
                v_hat_W = v_W[k]/correction2
                v_hat_b = v_b[k]/correction2
                direction_W = beta1*(m_W[k]/correction1) + (1-beta1)*gradW[k]/correction1
                direction_b = beta1*(m_b[k]/correction1) + (1-beta1)*gradb[k]/correction1
                self.W[k] = self.W[k] - (learningRate*direction_W)/np.sqrt(v_hat_W + epsilon)
                self.b[k] = self.b[k] - (learningRate*direction_b)/np.sqrt(v_hat_b + epsilon)

        self._train(X, Y, valImages, valLabels, epochs, batchSize, step)



Q4, Q5, Q6 : Hyperparameter sweep using Bayesian search. I ran 271 sweep runs and displayed the plots in the report.

In [None]:
# Sweep: candidate values for every hyperparameter explored by the Bayesian search.
search_space = {
    'activation': ['sigmoid', 'tanh', 'relu'],
    'batch_size': [16, 32, 64],
    'epochs': [5,10],
    'hidden_inputsize': [32,64,128],
    'number_hidden': [3,4,5],
    'learning_rate': [1e-3, 1e-4],
    'optimizer': ['sgd', 'momentum', 'nag', 'rmsprop', 'adam', 'nadam'],
    'weight_decay': [0, 0.0005, 0.5],
    'weight_init': ['random', 'xavier'],
}

sweep_config = {
    'name': "my-sweep",
    'method': 'bayes',
    'metric': {'goal': 'maximize', 'name': 'valid_accuracy'},
    # wandb expects each parameter wrapped as {'values': [...]}.
    'parameters': {param: {'values': choices} for param, choices in search_space.items()},
}
sweep_id = wandb.sweep(sweep_config, project='CS6910-assignment1')


In [None]:
def train():
    """Train one network with the hyperparameters chosen by the sweep agent."""
    run = wandb.init()
    cfg = run.config
    wandb.run.name = f"hl_{cfg.hidden_inputsize}_bs_{cfg.batch_size}_ac_" + cfg.activation
    np.random.seed(1)

    # number_hidden equally sized hidden layers followed by the 10-way output layer.
    layer_sizes = [cfg.hidden_inputsize]*cfg.number_hidden + [10]
    obj = NeuralNetwork(
        x_train.shape[1],
        layer_sizes,
        cfg.number_hidden+1,
        cfg.activation,
        cfg.weight_init,
        cfg.weight_decay,
    )
    obj.optimize(
        x_train, y_train, x_val, y_val,
        cfg.optimizer, cfg.learning_rate, cfg.epochs, cfg.batch_size,
    )

wandb.agent(sweep_id, train)


Q7 : The best accuracy was obtained with the configuration shown below.
     Generating the predictions (y_pred) needed to plot the confusion matrix.

In [None]:
# Hyperparameters of the best run found by the sweep.
best_config = dict(
    activation="relu",
    batch_size=64,
    epochs=10,
    hidden_inputsize=128,
    learning_rate=1e-03,
    number_hidden=4,
    optimizer="nadam",
    weight_decay=0.0005,
    weight_init="xavier",
)

wandb.init(config = best_config,project = "CS6910-assignment1")
config = wandb.config

# number_hidden equally sized hidden layers followed by the 10-way output layer.
layer_sizes = [config.hidden_inputsize]*config.number_hidden + [10]
obj = NeuralNetwork(
    x_train.shape[1],
    layer_sizes,
    config.number_hidden+1,
    config.activation,
    config.weight_init,
    config.weight_decay,
)
obj.optimize(
    x_train, y_train, x_val, y_val,
    config.optimizer, config.learning_rate, config.epochs, config.batch_size,
)

# Predicted labels are reused below for the confusion matrix.
y_pred, test_accuracy = obj.test(x_test, y_test)


Plotting confusion matrix

In [None]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat','Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# NeuralNetwork.test() returns predicted class indices (a 1-D array), not a
# probability matrix, so they must be passed as `preds`; `probs` expects a 2-D
# (samples x classes) array.
wandb.log({"conf_mat" : wandb.plot.confusion_matrix(
    preds=np.asarray(y_pred).tolist(),
    y_true=np.asarray(y_test).tolist(),
    class_names=class_names,
)})

Q8 : Comparing all the models on mean-squared loss and cross-entropy loss



In [None]:
# Candidate values for the random search comparing the two loss settings.
loss_search_space = {
    'learn_rate': [1e-3, 1e-4],
    'weight_initial': ['random','xavier'],
    'hidden_size': [32, 64, 128],
    'optimizer': ['momentum_gradient_descent', 'nesterov', 'rmsprop', 'adam', 'nadam','stochastic_gradient_descent'],
    'batch_size': [16, 32, 64],
    'activation': ['sigmoid','tanh','relu'],
    'hidden_layer': [3, 4, 5],
    'losscomputation': ['cross_entropy','mean_square'],
    'weight_decay': [0, 0.0005,  0.5],
    'epochs': [5, 10],
}

sweep_config = {
    'name': "my-sweep",
    'method': 'random',
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    # wandb expects each parameter wrapped as {'values': [...]}.
    'parameters': {param: {'values': choices} for param, choices in loss_search_space.items()},
}

sweep_id = wandb.sweep(sweep_config, project="CS6910-assignment1")

def train():
    """Train and evaluate one network with the hyperparameters chosen by this sweep.

    The parameter names read here are the ones this cell's sweep_config defines
    (`hidden_size`, `hidden_layer`, `weight_initial`, `learn_rate`); reading the
    names used by the earlier sweep raised AttributeError.
    """
    # The sweep spells some optimisers differently from NeuralNetwork.optimize().
    optimizer_names = {
        'stochastic_gradient_descent': 'sgd',
        'momentum_gradient_descent': 'momentum',
        'nesterov': 'nag',
    }

    with wandb.init():
        config = wandb.config
        wandb.run.name = "hl_" + str(config.hidden_size)+"_bs_"+str(config.batch_size)+"_ac_"+ config.activation+"_lf_"+str(config.losscomputation)
        np.random.seed(1)

        # NOTE(review): NeuralNetwork as defined in this notebook takes no loss
        # argument, so `losscomputation` only affects the run name here; the loss
        # actually optimised is whatever the class implements.
        layer_sizes = [config.hidden_size]*config.hidden_layer + [10]
        obj = NeuralNetwork(x_train.shape[1], layer_sizes, config.hidden_layer+1, config.activation, config.weight_initial, config.weight_decay)
        optimizer = optimizer_names.get(config.optimizer, config.optimizer)
        obj.optimize(x_train, y_train, x_val, y_val, optimizer, config.learn_rate, config.epochs, config.batch_size)

        # Report the test accuracy instead of computing and discarding it.
        _, test_accuracy = obj.test(x_test, y_test)
        wandb.log({'test_accuracy': test_accuracy})

wandb.agent(sweep_id, train, project="CS6910-assignment1")


Q10 : Importing the MNIST data to test the best configurations. The 4 configurations tried are listed below in the sweep_config.

In [None]:
from keras.datasets import mnist
(x_train,y_train),(x_test,y_test) = mnist.load_data()

# The freshly loaded MNIST arrays replace the Fashion-MNIST ones, so the same
# preparation has to be repeated; otherwise x_val / y_val would still hold the
# Fashion-MNIST hold-out and x_train.shape[1] would not be the flattened input size.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

# Flatten every image to a 1-D feature vector.
x_train = x_train.reshape(x_train.shape[0], -1).astype(np.float64)
x_val = x_val.reshape(x_val.shape[0], -1).astype(np.float64)
x_test = x_test.reshape(x_test.shape[0], -1).astype(np.float64)

# Centre and scale all splits with statistics from the training split only.
train_mean = x_train.mean(axis=0)
train_scale = (x_train - train_mean).max()

x_train = (x_train - train_mean) / train_scale
x_val = (x_val - train_mean) / train_scale
x_test = (x_test - train_mean) / train_scale

In [None]:
# Candidate values for the MNIST sweep; only the hidden width and the optimiser vary.
mnist_search_space = {
    'epochs': [10],
    'number_hidden': [5],
    'hidden_inputsize': [64, 128],
    'weight_decay': [0],
    'learning_rate': [1e-3],
    'optimizer': ['adam', 'nadam'],
    'batch_size': [64],
    'weight_init': ['xavier'],
    'activation': ['relu'],
}

sweep_config = {
    'name': "my-sweep-mnist",
    'method': 'bayes',
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    # wandb expects each parameter wrapped as {'values': [...]}.
    'parameters': {param: {'values': choices} for param, choices in mnist_search_space.items()},
}

sweep_id = wandb.sweep(sweep_config, project="CS6910-assignment1")


In [None]:
def train():
    """Train one network with the sweep's hyperparameters and report its test accuracy."""
    with wandb.init():
        config = wandb.config
        wandb.run.name = "hl_" + str(config.hidden_inputsize)+"_bs_"+str(config.batch_size)+"_ac_"+ config.activation
        np.random.seed(1)

        # number_hidden equally sized hidden layers followed by the 10-way output layer.
        layer_sizes = [config.hidden_inputsize]*config.number_hidden + [10]
        obj = NeuralNetwork(x_train.shape[1], layer_sizes, config.number_hidden+1, config.activation, config.weight_init, config.weight_decay)
        obj.optimize(x_train, y_train, x_val, y_val, config.optimizer, config.learning_rate, config.epochs, config.batch_size)

        # Report the test accuracy instead of computing and discarding it.
        _, test_accuracy = obj.test(x_test, y_test)
        wandb.log({'test_accuracy': test_accuracy})

wandb.agent(sweep_id,train,project="CS6910-assignment1")
