# 1. Library Dependencies

In [15]:
!pip install -q wandb

In [16]:
import numpy as np
import wandb
from keras.datasets import fashion_mnist,mnist
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib
import matplotlib.cm as cm

In [17]:
# Fashion-MNIST images and integer class labels, already split into training and test sets
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# 2. Neural Network Class

In [18]:
# function constants
ACTIVATION_SIGMOID   = 'sigmoid'
ACTIVATION_SOFTMAX   = 'softmax'
ACTIVATION_THRESHOLD = 'threshold'
ACTIVATION_RELU      = 'relu'
ACTIVATION_TANH      = 'tanh'

LOSS_SQERROR         = 'sq_error'
LOSS_CROSSENTROPY    = 'cross_entropy'

GDOPT_NONE           = 'vanilla'
GDOPT_MOMENTUM       = 'momentum'
GDOPT_NESTEROV       = 'nesterov'
GDOPT_ADAGRAD        = 'adagrad'
GDOPT_RMSPROP        = 'rmsprop'
GDOPT_ADAM           = 'adam'
GDOPT_NADAM          = 'nadam'

WINIT_RANDOM         = 'random'
WINIT_XAVIER         = 'xavier'
WINIT_SAVED          = 'saved'

# function limits
EXP_INPUT_UPPER_TOL  = 300                        # largest argument passed to np.exp
EXP_INPUT_LOWER_TOL  = -300                       # smallest argument passed to np.exp
EXP_OUTPUT_UPPER_TOL = 10**EXP_INPUT_UPPER_TOL
EXP_OUTPUT_LOWER_TOL = 10**EXP_INPUT_LOWER_TOL    # floor applied before np.log
DIVZERO_TOL          = 1e-300                     # floor applied to denominators

class neuralNetwork:
  """
  Fully-connected feed-forward neural network trained with mini-batch gradient descent.

  The network is configured through a hyperparameter dict. Required keys:
    "layerSizes"    list of layer widths, input layer first, output layer last
    "activations"   one ACTIVATION_* constant per non-input layer
    "lossFn"        LOSS_SQERROR or LOSS_CROSSENTROPY
    "batchSize"     mini-batch size, or -1 to use the whole dataset as a single batch
    "learningRate"  step size of the optimizer
    "epochs"        number of passes over the training data
  Optional keys and their defaults are listed in __init__.

  Internally all data is column-wise (layer dimension x dataset size). Weight matrices and
  bias vectors live in self.wmat / self.bias and are indexed from 1, so that index i refers
  to layer i; index 0 holds a dummy placeholder.
  """
  def __init__(self,hyperparams):
    """
    Initialize parameters and hyperparameters of neural network
    hyperparams: dict of hyperparameters; entries override the defaults below
    """
    # assign default hyperparameters to neural network
    self.hyperparams = {
        "optimizer": GDOPT_NONE,
        "initWeightMethod": WINIT_RANDOM,
        "initWeightBounds": (-0.5,0.5),
        "regparam": 0,
        "beta_1": 0.9,    # momentum / first-moment decay
        "beta_2": 0.999,  # second-moment decay
        "epsilon": 1e-8,  # added to the second-moment estimate before the inverse square root
        "wandb": False
    }
    self.setHyperparameters(hyperparams)

    # initialize the weight and bias matrices of the NN
    self.initModel()
  
  def setHyperparameters(self,hp):
    """
    Set hyperparameters of neural network
    hp: dict whose entries overwrite the current hyperparameters
    Raises KeyError if a required hyperparameter is still missing after the update
    """
    # change values of only the hyperparameters specified in the input variable
    self.hyperparams.update(hp)

    required = ("layerSizes","batchSize","learningRate","epochs","activations","lossFn")
    missing  = [key for key in required if key not in self.hyperparams]
    if missing:
      raise KeyError("missing required hyperparameters: " + ", ".join(missing))
    
    # use member variables for commonly used hyperparameters
    self.layerSizes       = self.hyperparams["layerSizes"]
    self.batchSize        = self.hyperparams["batchSize"]
    self.learningRate     = self.hyperparams["learningRate"]
    self.epochs           = self.hyperparams["epochs"]
    self.numLayers        = len(self.layerSizes) - 1
    
    # set math functions object hyperparameters
    # (the message reads the merged dict: "activations" need not be present in hp itself)
    assert len(self.hyperparams["activations"])==self.numLayers, "number of layers (%d) and number of activations (%d) don't match"%(self.numLayers,len(self.hyperparams["activations"]))
    self.activations = self.hyperparams["activations"]
    self.lossFn = self.hyperparams["lossFn"]
    self.regparam = self.hyperparams["regparam"]

  def initModel(self):
    """
    Initialize parameters (weight and bias matrices) of neural network
    Values are drawn uniformly from "initWeightBounds"; with WINIT_XAVIER the bounds of
    layer i are replaced by +-1/sqrt(size of layer i-1)
    """
    # checking bounds arg
    assert len(self.hyperparams["initWeightBounds"])==2, "bounds arg has to be a list/tuple of 2 numbers"
    bounds = self.hyperparams["initWeightBounds"]

    # create list of weight matrices and bias vectors
    # the goal is to make the indexing same as that in lecture derivation, hence the dummy values
    self.wmat = [np.array([1],ndmin=2)]
    self.bias = [np.array([1],ndmin=2)]
    
    # create random initial parameters and append them to the above initialized lists
    for i in range(1,self.numLayers+1):
      if self.hyperparams["initWeightMethod"]==WINIT_XAVIER:
        limit  = 1/(self.layerSizes[i-1])**0.5
        bounds = (-limit,limit)
      span = bounds[1]-bounds[0]
      self.wmat.append(span*np.random.rand(self.layerSizes[i],self.layerSizes[i-1])+bounds[0])
      self.bias.append(span*np.random.rand(self.layerSizes[i],1)+bounds[0])

  def activation(self,layerNum,x):
    """
    Compute and return activation values for a given layer and its sum values
    layerNum: 1-based layer index
    x: pre-activation values; softmax normalises along axis 0
    Raises ValueError for an unknown activation name
    """
    kind = self.activations[layerNum-1] # layers are numbered from 1, the activation list from 0
    if   kind==ACTIVATION_SIGMOID:
      return 1/(1+np.exp(-np.clip(x,EXP_INPUT_LOWER_TOL,EXP_INPUT_UPPER_TOL)))
    elif kind==ACTIVATION_SOFTMAX:
      # shift every column so its largest entry maps to exp(EXP_INPUT_UPPER_TOL); entries that
      # fall below the lower tolerance after the shift are clamped instead of underflowing
      z = np.exp(np.maximum(x - np.amax(x, axis=0) + EXP_INPUT_UPPER_TOL, EXP_INPUT_LOWER_TOL))
      return z/np.sum(z, axis=0)
    elif kind==ACTIVATION_THRESHOLD:
      return (x>=0)+0
    elif kind==ACTIVATION_RELU:
      return np.maximum(x,0)
    elif kind==ACTIVATION_TANH:
      return np.tanh(np.clip(x,EXP_INPUT_LOWER_TOL,EXP_INPUT_UPPER_TOL))
    raise ValueError("unknown activation function: %r"%(kind,))
  
  def activationDerivative(self,layerNum,**kwargs):
    """
    Compute and return activation derivative values for a given layer and its partial-sum or output values depending on the given argument
    Exactly one keyword must be given: x (pre-activation values) or y (activation outputs)
    The result is elementwise; for softmax this is only the diagonal y*(1-y) of the Jacobian
    Raises ValueError for an unknown activation name
    """
    assert ( len(kwargs)==1 and ("x" in kwargs or "y" in kwargs) ), "activationDerivative argument malformed. " \
      "Use activationDerivative(layerNum,x=x_val) or activationDerivative(layerNum,y=y_val)"
    kind = self.activations[layerNum-1] # layers are numbered from 1, the activation list from 0
    
    if "y" in kwargs:
      y = kwargs["y"]
    else:
      x = kwargs["x"]
      if kind==ACTIVATION_THRESHOLD:
        # the step function has no useful derivative in x; the sigmoid derivative is used as a
        # surrogate, written as s*(1-s) on clipped input so that it cannot overflow
        s = 1/(1+np.exp(-np.clip(x,EXP_INPUT_LOWER_TOL,EXP_INPUT_UPPER_TOL)))
        return s*(1-s)
      if kind==ACTIVATION_RELU:
        return (x>0)+0
      # every other derivative is expressed through the output, so compute it with the
      # numerically guarded forward function (softmax is normalised per column there)
      y = self.activation(layerNum,x)

    if   kind==ACTIVATION_SIGMOID:
      return y*(1-y)
    elif kind==ACTIVATION_SOFTMAX:
      return y*(1-y)
    elif kind==ACTIVATION_THRESHOLD:
      return y*(1-y)
    elif kind==ACTIVATION_RELU:
      # outputs are never negative, so the active units are exactly those with y > 0
      return (y>0)+0
    elif kind==ACTIVATION_TANH:
      return 1 - y**2
    raise ValueError("unknown activation function: %r"%(kind,))
  
  def loss(self, outputData, targetData):
    """
    Compute and return loss values for given output and target data
    Both arrays are column-wise (layer dimension x dataset size)
    Returns the data loss plus the weight-decay penalty, divided by the dataset size
    Raises ValueError for an unknown loss name
    """
    # non-regularised loss
    if   self.lossFn == LOSS_SQERROR:
      nonRegLoss   = 0.5 * np.sum((outputData - targetData)**2)
    elif self.lossFn == LOSS_CROSSENTROPY:
      # per-class binary cross entropy; the floor keeps np.log away from zero
      nonRegLoss   = - np.sum(targetData * np.log( np.maximum(outputData, EXP_OUTPUT_LOWER_TOL)) 
      + (1-targetData) * np.log( np.maximum((1-outputData), EXP_OUTPUT_LOWER_TOL)))
    else:
      raise ValueError("unknown loss function: %r"%(self.lossFn,))
    
    # weight decay regularisation loss
    wdRegLoss      = 0
    if self.hyperparams["regparam"]!=0:
      # sum of squared entries of the real parameters only (index 0 is a dummy); this is the
      # penalty whose gradient is the regparam*W term used in backwardPass
      modtheta_sq  = sum( np.sum(W**2) + np.sum(B**2) for W, B in zip(self.wmat[1:], self.bias[1:]) )
      wdRegLoss    = 0.5 * self.hyperparams["regparam"] * modtheta_sq
    
    return (nonRegLoss + wdRegLoss)/len(targetData[0])
  
  def lossOutputDerivative(self, outputData, targetData):
    """
    Compute and return (non-regularised) loss derivatives for given output and target data
    Raises ValueError for an unknown loss name
    """
    if self.lossFn==LOSS_SQERROR:
      return outputData - targetData
    elif self.lossFn==LOSS_CROSSENTROPY:
      # denominators are floored so that outputs of exactly 0 or 1 do not divide by zero
      return ( - targetData/np.maximum(outputData,DIVZERO_TOL) + (1-targetData)/np.maximum(1-outputData,DIVZERO_TOL) )
    raise ValueError("unknown loss function: %r"%(self.lossFn,))
  
  def lossMetrics(self, outputData, targetData):
    """
    Compute loss metrics (loss, accuracy) for given data
    Accuracy is the fraction of columns whose argmax agrees between output and target
    """
    # calculate metrics
    loss = self.loss(outputData, targetData)
    acc  = np.count_nonzero( np.argmax(targetData, axis=0) == np.argmax(outputData, axis=0) )/len(targetData[0])
    return loss, acc
  
  def forwardPass(self, inputData):
    """
    Compute output activations of all layers of neural network
    Data can also be given as sets of datapoints (dimensions being layer dimension x dataset size)
    Returns a list h with h[0] = inputData and h[i] = output of layer i
    """
    #                                              # --- PSEUDOCODE ---
    h     = inputData                              # h[0] = x
    hData = [h]                                    #
    #                                              #
    for i in range(1,self.numLayers+1):            # for i from 1 to L:
      a   = self.wmat[i] @ h + self.bias[i]        #     a[i] = w[i] @ h[i-1] + b[i]
      h   = self.activation(i,a)                   #     h[i] = f(a[i])
      hData.append(h)
    
    return hData
  
  def backwardPass(self, layerwiseOutputData, targetData):
    """
    Compute weight and bias gradients for all layers of neural network
    Data can also be given as sets of datapoints (dimensions being layer dimension x dataset size)
    layerwiseOutputData: list returned by forwardPass
    Returns (gradW, gradB), lists indexed like self.wmat / self.bias (index 0 is a dummy)
    """
    #                                                                                        # --- PSEUDOCODE ---
    Delta       = self.lossOutputDerivative(layerwiseOutputData[-1], targetData)             # Delta[L] = d(loss)/dh[L]
    datasetSize = np.shape(targetData)[1]                                                    #
    gradW       = [np.array([0],ndmin=2)]*(self.numLayers+1)                                 # dummy at index 0
    gradB       = [np.array([0],ndmin=2)]*(self.numLayers+1)                                 # dummy at index 0
    #                                                                                        #
    for i in range(self.numLayers,0,-1):                                                     # for i from L to 1:
      stocBiasCorr = self.activationDerivative(i,y=layerwiseOutputData[i]) * Delta           #     stochastic_bias_corrections = f'(a[i]) * Delta[i]
      gradW[i]     = (stocBiasCorr @ layerwiseOutputData[i-1].T + self.regparam*self.wmat[i])/datasetSize          #     grad(W[i]) = stochastic_bias_corrections x (h[i-1]).T
      gradB[i]     = (np.sum(stocBiasCorr,axis=1,keepdims=True) + self.regparam*self.bias[i])/datasetSize          #     grad(b[i]) = sum(stochastic_bias_corrections)
      if i>1:                                                                                #     // nothing to propagate into the input layer
        Delta      = self.wmat[i].T @ stocBiasCorr                                           #     Delta[i-1] = W[i] x stochastic_bias_corrections
    
    return (gradW,gradB)

  def initOptimizerCollector(self):
    """
    Create variable for optimizer state collection
    Returns a dict holding the batch counter "t" (starting at 1) and, per optimizer, one
    list per state quantity indexed like self.wmat / self.bias
    Raises ValueError for an unknown optimizer name
    """
    optType = self.hyperparams["optimizer"]
    stateKeys = {
        GDOPT_NONE:     (),
        GDOPT_MOMENTUM: ("update_w","update_b"),
        GDOPT_NESTEROV: ("update_w","update_b"),
        GDOPT_ADAGRAD:  ("v_w","v_b"),
        GDOPT_RMSPROP:  ("v_w","v_b"),
        GDOPT_ADAM:     ("m_w","m_b","v_w","v_b"),
        GDOPT_NADAM:    ("m_w","m_b","v_w","v_b"),
    }
    if optType not in stateKeys:
      raise ValueError("unknown optimizer: %r"%(optType,))

    opt = {
        "t": 1
    }
    for key in stateKeys[optType]:
      opt[key] = [0]*(self.numLayers+1)
    
    return opt
  
  def updateParameters(self, inputData, targetData, opt):
    """
    Perform parameter updates for given input and target datapoints
    inputData, targetData: one column-wise batch
    opt: state dict created by initOptimizerCollector; updated in place
    Raises ValueError for an unknown optimizer name
    """
    # pre-common processing
    optType = self.hyperparams["optimizer"]
    eta     = self.hyperparams["learningRate"]
    beta_1  = self.hyperparams["beta_1"]
    beta_2  = self.hyperparams["beta_2"]
    epsilon = self.hyperparams["epsilon"]
    t       = opt["t"]
    if optType == GDOPT_NESTEROV:
      # move to the look-ahead point so that the gradient below is evaluated there
      for i in range(1,self.numLayers+1):
        self.wmat[i] -= beta_1*opt["update_w"][i]
        self.bias[i] -= beta_1*opt["update_b"][i]

    # common processing
    layerwiseOutputData = self.forwardPass(inputData)
    (gradW, gradB)      = self.backwardPass(layerwiseOutputData,targetData)
    
    # post-common processing: weights and biases follow the same rule, so both are handled
    # by one loop; sfx selects the matching optimizer state lists
    for i in range(1,self.numLayers+1):
      for params, grads, sfx in ((self.wmat,gradW,"_w"),(self.bias,gradB,"_b")):
        g = grads[i]

        if   optType == GDOPT_NONE:
          step = eta*g
        
        elif optType == GDOPT_MOMENTUM:
          update = opt["update"+sfx]
          update[i] = beta_1*update[i] + eta*g
          step = update[i]
        
        elif optType == GDOPT_NESTEROV:
          # the momentum part of the step was already applied before the forward pass
          update = opt["update"+sfx]
          update[i] = beta_1*update[i] + eta*g
          step = eta*g
        
        elif optType == GDOPT_ADAGRAD:
          v = opt["v"+sfx]
          v[i] = v[i] + g**2
          step = eta * (v[i] + epsilon)**-0.5 * g
        
        elif optType == GDOPT_RMSPROP:
          v = opt["v"+sfx]
          v[i] = beta_2*v[i] + (1-beta_2)*g**2
          step = eta * (v[i] + epsilon)**-0.5 * g
        
        elif optType == GDOPT_ADAM or optType == GDOPT_NADAM:
          m = opt["m"+sfx]; v = opt["v"+sfx]
          m[i] = beta_1*m[i] + (1-beta_1)*g
          v[i] = beta_2*v[i] + (1-beta_2)*g**2
          if optType == GDOPT_ADAM:
            m_hat = m[i]/(1-beta_1**t)
          else:
            m_hat = (beta_1/(1-beta_1**(t+1)))*m[i] + ((1-beta_1)/(1-beta_1**t))*g
          v_hat = v[i]/(1-beta_2**t)
          step = eta * (v_hat + epsilon)**-0.5 * m_hat
        
        else:
          raise ValueError("unknown optimizer: %r"%(optType,))

        params[i] -= step
    
    # increment batch counter
    opt["t"] += 1
  
  def infer(self,inputData,**kwargs):
    """
    Perform inference on input dataset using the neural network
    Unless colwiseData=True is given as an argument, data will be interpreted as being dataset size x layer dimension
    If targetData is given it is used as passed (it is never transposed here, so it must be
    column-wise) and the return value is (outputs, (loss, accuracy)); otherwise only the
    column-wise outputs of the last layer are returned
    """
    # resolving input dimensions
    inputData  = np.array(inputData,ndmin=2)
    if not ("colwiseData" in kwargs and kwargs["colwiseData"]==True):
      inputData  = inputData.T
    assert np.shape(inputData)[0]==self.layerSizes[0], "size of input datapoint (%d) differs from size of input vector given as hyperparameter (%d)"%(np.shape(inputData)[0],self.layerSizes[0])
    
    # perform forward pass and return last-layer outputs and loss metrics
    outputData = self.forwardPass(inputData)[-1]
    if "targetData" in kwargs:
      return outputData, self.lossMetrics(outputData, kwargs["targetData"])
    else:
      return outputData

  def _logMetrics(self, epoch, inputTrainingData, targetTrainingData, inputValidationData, targetValidationData):
    """
    Log loss and accuracy of the current parameters to wandb; all data is column-wise
    Validation metrics are included only when validation data is not None
    """
    _, (loss_train, acc_train) = self.infer(inputTrainingData, colwiseData = True, targetData = targetTrainingData)
    metrics = {"Epoch": epoch, "Training Loss": loss_train, "Training Accuracy": acc_train}
    if inputValidationData is not None:
      _, (loss_val, acc_val) = self.infer(inputValidationData, colwiseData = True, targetData = targetValidationData)
      metrics["Validation Loss"]     = loss_val
      metrics["Validation Accuracy"] = acc_val
    wandb.log(metrics)
  
  def train(self, inputTrainingData, targetTrainingData, inputValidationData = None, targetValidationData = None, **kwargs):
    """
    Train the network on the given input and target datasets
    Unless colwiseData=True is given as an argument, data will be interpreted as being dataset size x layer dimension
    Validation data is optional (None or empty means absent); it is only used for the
    metrics logged to wandb and never for parameter updates
    Batches are taken in dataset order; the data is not shuffled
    """
    colwiseData   = "colwiseData" in kwargs and kwargs["colwiseData"]==True
    hasValidation = inputValidationData is not None and np.size(inputValidationData)>0

    # resolving input and target dimensions
    inputTrainingData    = np.array(inputTrainingData,ndmin=2)
    targetTrainingData   = np.array(targetTrainingData,ndmin=2)
    if hasValidation:
      inputValidationData  = np.array(inputValidationData,ndmin=2)
      targetValidationData = np.array(targetValidationData,ndmin=2)
    else:
      inputValidationData  = None
      targetValidationData = None
    
    if not colwiseData:
      inputTrainingData    = inputTrainingData.T
      targetTrainingData   = targetTrainingData.T
      if hasValidation:
        inputValidationData  = inputValidationData.T
        targetValidationData = targetValidationData.T
    
    assert np.shape(inputTrainingData)[1]==np.shape(targetTrainingData)[1], "input (%d) and target (%d) training datasets have different sizes"%(np.shape(inputTrainingData)[1],np.shape(targetTrainingData)[1])
    assert np.shape(inputTrainingData)[0]==self.layerSizes[0], "size of input training datapoint (%d) differs from size of input training vector given as hyperparameter (%d)"%(np.shape(inputTrainingData)[0],self.layerSizes[0])
    assert np.shape(targetTrainingData)[0]==self.layerSizes[-1], "size of target training datapoint (%d) differs from size of target training vector given as hyperparameter (%d)"%(np.shape(targetTrainingData)[0],self.layerSizes[-1])
    
    if hasValidation:
      assert np.shape(inputValidationData)[1]==np.shape(targetValidationData)[1], "input (%d) and target (%d) validation datasets have different sizes"%(np.shape(inputValidationData)[1],np.shape(targetValidationData)[1])
      assert np.shape(inputValidationData)[0]==self.layerSizes[0], "size of input validation datapoint (%d) differs from size of input validation vector given as hyperparameter (%d)"%(np.shape(inputValidationData)[0],self.layerSizes[0])
      assert np.shape(targetValidationData)[0]==self.layerSizes[-1], "size of target validation datapoint (%d) differs from size of target validation vector given as hyperparameter (%d)"%(np.shape(targetValidationData)[0],self.layerSizes[-1])
    
    datasetSize = np.shape(targetTrainingData)[1]
    assert datasetSize>0, "training dataset is empty"

    # calculate batch parameters
    batchSize = datasetSize if self.batchSize==-1 else self.batchSize
    assert batchSize>0, "batchSize (%d) has to be positive, or -1 for full-batch training"%(batchSize)
    numBatches = int(np.ceil(datasetSize / batchSize))

    # initialize optimizer state processing object
    opt = self.initOptimizerCollector()

    # run training loop
    numEpochs = self.hyperparams["epochs"]
    for epoch in range(numEpochs):
      # put in wandb's log if defined (metrics of the parameters the epoch starts with)
      if self.hyperparams["wandb"]:
        self._logMetrics(epoch, inputTrainingData, targetTrainingData, inputValidationData, targetValidationData)

      # run batch loop
      for batchIndex in range(numBatches):
        # create data batches
        startIndex            = batchSize * batchIndex
        endIndex              = min(startIndex + batchSize, datasetSize)
        inputTrainingBatch    = inputTrainingData[:,startIndex:endIndex]
        targetTrainingBatch   = targetTrainingData[:,startIndex:endIndex]

        # perform parameter update
        self.updateParameters(inputTrainingBatch, targetTrainingBatch, opt)

    # put in wandb's log if defined (metrics of the final parameters)
    if self.hyperparams["wandb"]:
      self._logMetrics(numEpochs, inputTrainingData, targetTrainingData, inputValidationData, targetValidationData)

# 3. Dataset Initialization

In [19]:
# distinct class labels and, for each of them, the index of its first training sample
classes, idx_sample_class = np.unique(y_train, return_index=True)

# Reshaping the 'x' data: every image becomes one flat row vector
len_1D = x_train.shape[1]*x_train.shape[2]
x_train_1D = x_train.reshape(len(x_train), len_1D)
x_test_1D  = x_test.reshape(len(x_test), len_1D)

# Transforming 'y' data - changing scalar i to vector e(i)
# (one fancy-indexed assignment sets the single 1.0 of every row)
y_train_1D = np.zeros( (len(y_train), len(classes)) , dtype = float)
y_train_1D[np.arange(len(y_train)), y_train] = 1.0
y_test_1D = np.zeros( (len(y_test), len(classes)) , dtype = float)
y_test_1D[np.arange(len(y_test)), y_test] = 1.0

# hold out a random fraction of the training samples for validation
frac_val = 0.1
all_idx = np.arange(len(x_train))
val_idx = np.random.choice(all_idx, int(frac_val*len(x_train)), replace=False)
# remaining indices in ascending order; setdiff1d avoids scanning val_idx once per sample
tr2_idx = np.setdiff1d(all_idx, val_idx)

x_train2 = x_train_1D[tr2_idx]
y_train2 = y_train_1D[tr2_idx]
x_val = x_train_1D[val_idx]
y_val = y_train_1D[val_idx]

# 4. Runs

## 4.1. Local Runs

In [None]:
# Smoke test without W&B: a single softmax layer trained full-batch on 128 samples
hyp = dict(
    layerSizes       = [len(x_train_1D[0]), len(y_train_1D[0])],
    batchSize        = -1,
    learningRate     = 1e-2,
    epochs           = 500,
    activations      = [ACTIVATION_SOFTMAX],
    lossFn           = LOSS_CROSSENTROPY,
    initWeightBounds = (-1, 1),
    initWeightMethod = WINIT_XAVIER,
    optimizer        = GDOPT_NONE,
    beta_1           = 0.9,    # momentum scaling hyperparam
    beta_2           = 0.999,  # eta scaling hyperparam
    epsilon          = 1e-8,   # eta scaling hyperparam
    regparam         = 0,
    wandb            = False,
)

NN = neuralNetwork(hyp)
NN.train(x_train2[:128]/255, y_train2[:128], x_val[:128]/255, y_val[:128])
# The validation arrays only feed the per-epoch loss/accuracy computation;
# they never take part in the parameter updates.

## 4.2. W&B Runs

In [None]:
!rm -rf outputs && mkdir outputs
wandb.init(project= "sample-runs")
hyp = {
    "layerSizes": [len(x_train_1D[0]),128,128,128,128,128,len(y_train_1D[0])],
    "batchSize": 32,
    "learningRate": 1e-3,
    "epochs": 2,
    "activations": [ ACTIVATION_TANH, ACTIVATION_TANH, ACTIVATION_TANH, ACTIVATION_TANH, ACTIVATION_TANH, ACTIVATION_SOFTMAX],
    "lossFn": LOSS_CROSSENTROPY,
    "initWeightBounds": (-1,1),
    "initWeightMethod": WINIT_XAVIER,
    "optimizer": GDOPT_ADAM,
    "beta_1": 0.9,    # momentum scaling hyperparam
    "beta_2": 0.999,  # eta scaling hyperparam
    "epsilon": 1e-8,  # eta scaling hyperparam
    "regparam": 0.5,
    "wandb": True
}

NN = neuralNetwork(hyp)
NN.train(x_train2[:]/255, y_train2[:], x_val/255, y_val)
# Note that x_val, y_val are given as an input just to calculate the loss and error at each epoch.
# They are not used anywhere to train the neural network


# 5. Sweeps

In [None]:
def runSweep():
  """
  Run one trial of a W&B sweep: read the trial's settings from wandb.config, build the
  matching hyperparameter dict, rename the run after those settings and train a network
  on the training/validation split held in the notebook globals.
  """
  wandb.init()
  cfg = wandb.config

  # all hidden layers share one width and one activation; the output layer is softmax
  hiddenSizes       = [cfg.hiddenLayerSize]*cfg.numHiddenLayers
  hiddenActivations = [cfg.activationFn]*cfg.numHiddenLayers

  hyp = {
      "wandb": True,
      "epochs": cfg.epochs,
      "layerSizes": [len(x_train_1D[0])] + hiddenSizes + [len(y_train_1D[0])],
      "regparam": cfg.L2Reg,
      "learningRate": cfg.learningRate,
      "optimizer": cfg.optimizer,
      "batchSize": cfg.batchSize,
      "initWeightMethod": cfg.initWeightMethod,
      "activations": hiddenActivations + [ACTIVATION_SOFTMAX],
      "lossFn": cfg.loss,
      "beta_1": 0.9,    # momentum scaling hyperparam
      "beta_2": 0.999,  # eta scaling hyperparam
      "epsilon": 1e-8,  # eta scaling hyperparam
  }

  # append the trial settings to the generated run name; the leading "" yields the "_" separator
  nameParts = [
      "",
      "ep", str(hyp["epochs"]),
      "numhl", str(cfg.numHiddenLayers),
      "hlsize", str(cfg.hiddenLayerSize),
      "l2reg", str(hyp["regparam"]),
      "eta", str(hyp["learningRate"]),
      str(hyp["optimizer"]),
      "bs", str(hyp["batchSize"]),
      str(hyp["initWeightMethod"]),
      hyp["lossFn"],
      str(cfg.activationFn),
  ]
  wandb.run.name = wandb.run.name + "_".join(nameParts)
  wandb.run.save()

  network = neuralNetwork(hyp)
  network.train(x_train2/255, y_train2, x_val/255, y_val)

# Bayesian search that maximizes the logged "Validation Accuracy".
# Every swept hyperparameter is a discrete choice, so each (name, choices) pair below
# expands to the {"values": [...]} entry of the sweep configuration.
sweepCfg = {
    "name": "NN Fashion MNIST Parameter Sweep - Cross Entropy Loss",
    "metric": {"name": "Validation Accuracy", "goal": "maximize"},
    "method": "bayes",
    "parameters": {
        paramName: {"values": choices}
        for paramName, choices in (
            ("epochs",           [25]),
            ("numHiddenLayers",  [3, 4, 5]),
            ("hiddenLayerSize",  [32, 64, 128]),
            ("L2Reg",            [0, 5e-4, 0.5]),
            ("learningRate",     [1e-3, 1e-5]),
            ("optimizer",        [GDOPT_NONE, GDOPT_MOMENTUM, GDOPT_NESTEROV, GDOPT_RMSPROP, GDOPT_ADAM, GDOPT_NADAM]),
            ("batchSize",        [32, 64, 128]),
            ("initWeightMethod", [WINIT_RANDOM, WINIT_XAVIER]),
            ("activationFn",     [ACTIVATION_SIGMOID, ACTIVATION_RELU, ACTIVATION_TANH]),
            ("loss",             [LOSS_CROSSENTROPY]),
        )
    },
}

# register the sweep and let this notebook act as its agent
sweepId = wandb.sweep(sweepCfg)
wandb.agent(sweepId, function = runSweep)

# 6. Confusion Matrix

In [None]:
wandb.init(project="Test run - 1")

# five tanh hidden layers of width 128 followed by a softmax output layer, trained with Adam
hyp = dict(
    layerSizes       = [len(x_train_1D[0])] + [128]*5 + [len(y_train_1D[0])],
    batchSize        = 64,
    learningRate     = 1e-3,
    epochs           = 1,
    activations      = [ACTIVATION_TANH]*5 + [ACTIVATION_SOFTMAX],
    lossFn           = LOSS_SQERROR,  # alternatives: LOSS_CROSSENTROPY, LOSS_SQERROR
    initWeightBounds = (-1, 1),
    initWeightMethod = WINIT_XAVIER,
    optimizer        = GDOPT_ADAM,
    beta_1           = 0.9,    # momentum scaling hyperparam
    beta_2           = 0.999,  # eta scaling hyperparam
    epsilon          = 1e-3,   # eta scaling hyperparam
    regparam         = 5e-4,
    wandb            = True,
)

# append the settings to the generated run name; the leading "" yields the "_" separator
# and the hidden-layer count (5) and width (128) are written out literally
run_name = wandb.run.name
wandb.run.name = run_name + "_".join([
    "",
    "ep", str(hyp["epochs"]),
    "numhl", str(5),
    "hlsize", str(128),
    "l2reg", str(hyp["regparam"]),
    "eta", str(hyp["learningRate"]),
    str(hyp["optimizer"]),
    "bs", str(hyp["batchSize"]),
    str(hyp["initWeightMethod"]),
    str(hyp["lossFn"]),
    "tanh",
])
wandb.run.save()

NN = neuralNetwork(hyp)
NN.train(x_train_1D/(255), y_train_1D, x_test_1D/(255), y_test_1D)
# Here the full training set is used for training and the test set is passed in the
# validation slot, so the logged "Validation" metrics are test-set metrics.
# That data is never used for the parameter updates.

In [23]:
# network outputs for the scaled test images, one column per image
y_pred_1D = NN.infer(x_test_1D / 255)
# predicted class = row index of the largest output in each column
y_pred = y_pred_1D.argmax(axis=0)
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
X, Y = np.meshgrid( np.arange(len(classes))-0.3, np.arange(len(classes))-0.3 )
x_vec = X.ravel()
y_vec = Y.ravel()
z_vec = 0

dx = 0.6*np.ones_like(x_vec)
dy = 0.6*np.ones_like(y_vec)
dz = conf_mat.ravel()

%matplotlib inline

fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111, projection='3d')

minima = np.amin(conf_mat)
maxima = np.amax(conf_mat)

norm = matplotlib.colors.Normalize(vmin=minima, vmax=maxima, clip=True)
mapper = cm.ScalarMappable(norm=norm, cmap=cm.viridis)

color_vec = [ mapper.to_rgba(v) for v_vec in conf_mat for v in v_vec]

bar3d = ax.bar3d(x_vec, y_vec, z_vec, dx, dy, dz,
         zsort='average', color=color_vec, shade=True,alpha=0.4, edgecolor='black')
ax.set_xlabel('Predicted Class')
ax.set_ylabel('True Class')
ax.set_xticks(np.arange(10))
ax.set_yticks(np.arange(10))
ax.set_title('Confusion Matrix')
ax.view_init(50,-100)

fig.colorbar(bar3d, boundaries=np.linspace(np.amin(dz), np.amax(dz), 20 ))

plt.show()

In [None]:
# Log the confusion-matrix figure `fig` to W&B as an image under the key "Confusion Matrix"
wandb.log({"Confusion Matrix": wandb.Image(fig)})

# 7. Three Best Runs from Fashion-MNIST on MNIST Dataset


In [25]:
# From here on the dataset globals refer to MNIST instead of Fashion-MNIST
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# distinct class labels and, for each of them, the index of its first training sample
classes, idx_sample_class = np.unique(y_train, return_index=True)
sample_images = x_train[ idx_sample_class ]

# Reshaping the 'x' data: every image becomes one flat row vector
len_1D = x_train.shape[1]*x_train.shape[2]
x_train_1D = x_train.reshape(len(x_train), len_1D)
x_test_1D  = x_test.reshape(len(x_test), len_1D)

# Transforming 'y' data - changing scalar i to vector e(i)
# (one fancy-indexed assignment sets the single 1.0 of every row)
y_train_1D = np.zeros( (len(y_train), len(classes)) , dtype = float)
y_train_1D[np.arange(len(y_train)), y_train] = 1.0
y_test_1D = np.zeros( (len(y_test), len(classes)) , dtype = float)
y_test_1D[np.arange(len(y_test)), y_test] = 1.0

# hold out a random fraction of the training samples for validation
frac_val = 0.1
all_idx = np.arange(len(x_train))
val_idx = np.random.choice(all_idx, int(frac_val*len(x_train)), replace=False)
# remaining indices in ascending order; setdiff1d avoids scanning val_idx once per sample
tr2_idx = np.setdiff1d(all_idx, val_idx)

x_train2 = x_train_1D[tr2_idx]
y_train2 = y_train_1D[tr2_idx]
x_val = x_train_1D[val_idx]
y_val = y_train_1D[val_idx]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [26]:
def fullMNISTHarness(hyp):
  """
  Start a W&B run and train a fresh network configured by `hyp` on the global
  x_train_1D / y_train_1D arrays. The global x_test_1D / y_test_1D arrays are passed in
  the validation slot, so the logged "Validation" metrics are computed on them.
  """
  wandb.init(project="mnist-training-using-top-fmnist-xentropy-runs")

  network = neuralNetwork(hyp)
  # pixel values are divided by 255 before they reach the network
  scaledTrain = x_train_1D/(255)
  scaledTest  = x_test_1D/(255)
  network.train(scaledTrain, y_train_1D, scaledTest, y_test_1D)

In [None]:
# The three configurations differ only in their hidden-layer widths and optimizer;
# everything else (tanh hidden layers, softmax output, cross entropy, Xavier init) is shared.
hyps = [
    {
        "layerSizes": [len(x_train_1D[0])] + hiddenSizes + [len(y_train_1D[0])],
        "batchSize": 128,
        "learningRate": 1e-3,
        "epochs": 25,
        "activations": [ACTIVATION_TANH]*len(hiddenSizes) + [ACTIVATION_SOFTMAX],
        "lossFn": LOSS_CROSSENTROPY,
        "initWeightMethod": WINIT_XAVIER,
        "optimizer": optimizer,
        "beta_1": 0.9,    # momentum scaling hyperparam
        "beta_2": 0.999,  # eta scaling hyperparam
        "epsilon": 1e-3,  # eta scaling hyperparam
        "regparam": 0,
        "wandb": True,
    }
    for hiddenSizes, optimizer in (
        ([64, 64, 64],     GDOPT_ADAM),   # Best of top 3
        ([32, 32, 32, 32], GDOPT_NADAM),  # Second best of top 3
        ([32, 32, 32, 32], GDOPT_ADAM),   # Third best of top 3
    )
]

# run top 3 configs
for hyp in hyps:
  fullMNISTHarness(hyp)