<a href="https://colab.research.google.com/github/vasid99/cs6910-dl/blob/main/Assignment01/Assignment01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- TODO: change backpropagation to account for L2 regularisation

- TODO: calculate the loss on the validation set, not the training set

- TODO: change the error calculation

- Assumption: the error to be reported is the 0-1 (misclassification) error





In [None]:
!pip install wandb

In [None]:
# imports
import numpy as np
import wandb
from keras.datasets import fashion_mnist

In [None]:
# open a Weights & Biases run; the wandb.log call in a later cell reports to it
wandb.init()

In [None]:
# Fashion-MNIST as shipped with keras: a (train, test) pair of (images, labels) tuples
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

In [None]:
# np.unique with return_index gives, for each distinct label, the index of its first occurrence
classes, idx_sample_class = np.unique(y_train, return_index=True)
# one representative training image per class, in the same order as `classes`
sample_images = np.take(x_train, idx_sample_class, axis=0)

In [None]:
# log one captioned image per class under a single "Sample_Images" key
wandb.log({
    "Sample_Images": [
        wandb.Image(image, caption="Label:"+str(label))
        for image, label in zip(sample_images, classes)
    ]
})

In [None]:
# function constants
# Each group is a run of consecutive integer ids starting at 0, in the order listed.

# activation functions (ids 0..4)
(ACTIVATION_SIGMOID,
 ACTIVATION_SOFTMAX,
 ACTIVATION_THRESHOLD,
 ACTIVATION_RELU,
 ACTIVATION_TANH) = range(5)

# loss functions (ids 0..1)
(LOSS_SQERROR,
 LOSS_CROSSENTROPY) = range(2)

# gradient descent optimisers (ids 0..6)
(GDOPT_NONE,
 GDOPT_MOMENTUM,
 GDOPT_NESTEROV,
 GDOPT_ADAGRAD,
 GDOPT_RMSPROP,
 GDOPT_ADAM,
 GDOPT_NADAM) = range(7)

# weight initialisation methods (ids 0..1)
(WINIT_RANDOM,
 WINIT_XAVIER) = range(2)

# math functions class
class neuralNetworkMathFunctions:
  """
  Helper class meant to hold the mathematical operations of neural network passes (activations and loss calculation)
  At present it is a placeholder: both methods accept their arguments and store nothing
  """
  def __init__(self):
    """
    Create an empty math helper; the owning neural network is expected to supply hyperparameters separately
    """
    pass

  def setHyperparameters(self,hp):
    """
    Placeholder for setting the mathematical hyperparameters; `hp` is currently ignored
    """
    return None


class neuralNetwork:
  """
  Class for a neural network made up of multiple layers of perceptrons
  """
  def __init__(self,hyperparams):
    """
    Build a network from a hyperparameter dictionary: store the hyperparameters, then create the weight and bias matrices
    """
    # math helper object (created empty)
    self.fns = neuralNetworkMathFunctions()

    # start from an empty hyperparameter store and let setHyperparameters fill it
    # and derive the frequently used member variables from it
    self.hyperparams = dict()
    self.setHyperparameters(hyperparams)

    # parameters (weights and biases) are created last because their shapes depend on layerSizes
    self.initModel(hyperparams)
  
  def setHyperparameters(self,hp):
    """
    Set hyperparameters of neural network
    """
    # change values of only the hyperparameters specified in the input variable
    self.hyperparams.update(hp)
    
    # use member variables for commonly used hyperparameters
    self.layerSizes       = self.hyperparams["layerSizes"]
    self.batchSize        = self.hyperparams["batchSize"]
    self.learningRate     = self.hyperparams["learningRate"]
    self.epochs           = self.hyperparams["epochs"]
    self.numLayers        = len(self.layerSizes) - 1
    
    # set math functions object hyperparameters
    assert len(self.hyperparams["activations"])==self.numLayers, "number of layers (%d) and number of activations (%d) don't match"%(self.numLayers,len(hp["activations"]))
    self.activations = self.hyperparams["activations"]
    self.lossFn = self.hyperparams["lossFn"]
    self.regparam = self.hyperparams["regparam"]

  def initModel(self,hyperparams):
    """
    Initialize parameters (weight and bias matrices) of neural network
    """
    # checking bounds arg
    bounds = (0,1)
    if "initWeightBounds" in hyperparams.keys():
      assert len(hyperparams["initWeightBounds"])==2, "bounds arg has to be a list/tuple of 2 numbers"
      bounds = hyperparams["initWeightBounds"]

    # create list of weight matrices and bias vectors
    # the goal is to make the indexing same as that in lecture derivation, hence the dummy values
    self.wmat = [np.array([1],ndmin=2)]
    self.bias = [np.array([1],ndmin=2)]
    
    # create random initial parameters and append them to the above initialized lists
    for i in range(1,self.numLayers+1):
      if self.hyperparams["initWeightMethod"]==WINIT_XAVIER:
        bounds = (-1/(self.layerSizes[i-1])**0.5,1/(self.layerSizes[i-1])**0.5)
      self.wmat.append((bounds[1]-bounds[0])*np.random.rand(self.layerSizes[i],self.layerSizes[i-1])+bounds[0])
      self.bias.append((bounds[1]-bounds[0])*np.random.rand(self.layerSizes[i],1)+bounds[0])

  def activation(self,layerNum,x):
    """
    Compute and return activation values for a given layer and its sum values
    """
    layerNum -= 1 # index adjustment
    if self.activations[layerNum]==ACTIVATION_SIGMOID:
      return 1/(1+np.exp(-x))
    elif self.activations[layerNum]==ACTIVATION_SOFTMAX:
      z = np.exp(x)
      return z/np.sum(z)
    elif self.activations[layerNum]==ACTIVATION_THRESHOLD:
      return (x>=0)+0
    elif self.activations[layerNum]==ACTIVATION_RELU:
      return np.maximum(x,0)
    elif self.activations[layerNum]==ACTIVATION_TANH:
      return np.tanh(x)
  
  def activationDerivative(self,layerNum,**kwargs):
    """
    Compute and return activation derivative values for a given layer and its sum or output values depending on the given argument
    """
    assert ( len(kwargs.keys())==1 and np.any([_ in kwargs.keys() for _ in ["x","y"]]) ), "activationDerivative argument malformed. \
    Use activationDerivative(layerNum,x=x_val) or activationDerivative(layerNum,y=y_val)"
    layerNum -= 1 # index adjustment
    
    if "y" in kwargs.keys():
      y = kwargs["y"]
      if self.activations[layerNum]==ACTIVATION_SIGMOID:
        return y*(1-y)
      elif self.activations[layerNum]==ACTIVATION_SOFTMAX:
        return y*(1-y)
      elif self.activations[layerNum]==ACTIVATION_THRESHOLD:
        return y*(1-y)
      elif self.activations[layerNum]==ACTIVATION_RELU:
        return (y>=0)+0
      elif self.activations[layerNum]==ACTIVATION_TANH:
        return 1/y
    else:
      x = kwargs["x"]
      if self.activations[layerNum]==ACTIVATION_SIGMOID:
        return np.exp(-x)/(1+np.exp(-x))**2
      elif self.activations[layerNum]==ACTIVATION_SOFTMAX:
        z = np.exp(x)
        s = np.sum(z)
        return z*(s-z)/(s**2)
      elif self.activations[layerNum]==ACTIVATION_THRESHOLD:
        return np.exp(-x)/(1+np.exp(-x))**2
      elif self.activations[layerNum]==ACTIVATION_RELU:
        return (x>=0)+0
      elif self.activations[layerNum]==ACTIVATION_RELU:
        return 1/np.tanh(x)
  
  def lossOutputDerivative(self,outputData,targetData):
    """
    Compute and return loss derivatives for given output and target data
    """
    print('outputData = ',outputData)
    if self.lossFn==LOSS_SQERROR:
      return outputData-targetData
    elif self.lossFn==LOSS_CROSSENTROPY:
      return -targetData / outputData 
  
  def forwardPass(self, inputData):
    """
    Compute output activations of all layers of neural network
    Data can also be given as sets of datapoints (dimensions being layer dimension x dataset size)
    """
    #                                              # --- PSEUDOCODE ---
    h     = inputData                              # h[0] = x
    hData = [h]                                    #
    datasetSize = np.shape(inputData)[1]           #
    #                                              #
    for i in range(1,self.numLayers+1):            # for i from 1 to L:
      a   = self.wmat[i] @ h + self.bias[i]        #     a[i] = w[i] @ h[i-1] + b[i]
      h   = self.activation(i,a)               #     h[i] = f(a[i])
      hData.append(h)
    
    return hData
  
  def backwardPass(self, layerwiseOutputData, targetData):
    """
    Compute weight and bias gradients for all layers of neural network
    Data can also be given as sets of datapoints (dimensions being layer dimension x dataset size)
    """
    #                                                                                        # --- PSEUDOCODE ---
    lossData    = self.lossOutputDerivative(layerwiseOutputData[-1], targetData)             # loss_derivative = d(loss)/dh[L]
    Delta       = lossData                                                                   # Delta[L] = loss_derivative
    datasetSize = np.shape(targetData)[1]                                                    #
    biasInputs  = np.array(np.ones(datasetSize),ndmin=2).T                                   #
    gradW       = []                                                                         #
    gradB       = []                                                                         #
    #                                                                                        #
    for iFwd in range(self.numLayers):                                                       # for i from L to 1:
      i            = self.numLayers - iFwd                                                   #     // index correction
      stocBiasCorr = self.activationDerivative(i,y=layerwiseOutputData[i]) * Delta           #     stochastic_bias_corrections = f'(a[i]) * Delta[i]
      gW           = stocBiasCorr @ layerwiseOutputData[i-1].T + self.regparam*self.wmat[i]  #     grad(W[i]) = stochastic_bias_corrections x (h[i-1]).T
      gB           = stocBiasCorr @ biasInputs + self.regparam*self.bias[i]                              #     grad(b[i]) = sum(stochastic_bias_corrections)
      Delta        = self.wmat[i].T @ stocBiasCorr                                           #     Delta[i-1] = W[i] x stochastic_bias_corrections
      
      gradW.append(gW)
      gradB.append(gB)
    
    # dummy element and order handling
    gradW.append(np.array([0],ndmin=2))
    gradW.reverse()
    gradB.append(np.array([0],ndmin=2))
    gradB.reverse()
    
    return (gradW,gradB)
  
  def infer(self,inputData,**kwargs):
    """
    Perform inference on input dataset using the neural network
    Unless colwiseData=True is given as an argument, data will be interpreted as being dataset size x layer dimension
    """
    # resolving input dimensions
    inputData  = np.array(inputData,ndmin=2)
    if "colwiseData" in kwargs and kwargs["colwiseData"]==True:
      pass
    else:
      inputData  = inputData.T
    assert np.shape(inputData)[0]==self.layerSizes[0], "size of input datapoint differs from size of input vector given as hyperparameter"
    
    # perform forward pass and return last-layer outputs
    return self.forwardPass(inputData)[-1]

  def gradtheta_for_batchindex(self, inputData, targetData, datasetSize, batchSize, numBatches, batchIndex):##
    # create data batches
    startIndex  = batchSize * batchIndex
    endIndex    = min(startIndex + batchSize, datasetSize)
    inputBatch  = inputData[:,startIndex:endIndex]
    targetBatch = targetData[:,startIndex:endIndex]
    # perform forward and backward passes to compute gradients
    layerwiseOutputData = self.forwardPass(inputBatch)
    print("fwd_done")
    (gradW, gradB)      = self.backwardPass(layerwiseOutputData,targetBatch)
    return gradW, gradB ##

  def update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val):
    y_pred_train = self.infer(inputData, colwiseData =True)
    y_pred_val   = self.infer(x_val, colwiseData =True)
    modW = np.sum( np.array( [ np.linalg.norm(W) for W in self.wmat ] ) )
    modB = np.sum( np.array( [ np.linalg.norm(B) for B in self.bias ] ) )
    modtheta_sq = (modW + modB)**2
    if self.lossFn==LOSS_SQERROR:
      loss_train = 0.5*np.sum(np.linalg.norm(y_pred_train - targetData, axis=0)**2) + 0.5*self.regparam*modtheta_sq
      loss_val   = 0.5*np.sum(np.linalg.norm(  y_pred_val - y_val     , axis=0)**2) + 0.5*self.regparam*modtheta_sq
    elif self.lossFn==LOSS_CROSSENTROPY:
      loss_train = np.sum(targetData * np.log(y_pred_train)) + 0.5*self.regparam*modtheta_sq
      loss_val   = np.sum(     y_val * np.log(y_pred_val  )) + 0.5*self.regparam*modtheta_sq
    error_train = 1 - np.count_nonzero( np.argmax(targetData, axis=0) == np.argmax(y_pred_train, axis=0) )/len(targetData[0])
    error_val   = 1 - np.count_nonzero( np.argmax(     y_val, axis=0) == np.argmax(  y_pred_val, axis=0) )/len(y_val[0])
    self.loss_train.append(loss_train)
    self.error_train.append(error_train)
    self.loss_val.append(loss_val)
    self.error_val.append(error_val)

  def sgd(self, inputData, targetData, datasetSize, batchSize, numBatches, x_val, y_val):
    # run training loop
    for epoch in range(self.epochs):
      update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val)
      for batchIndex in range(numBatches):       
        #Get grad theta
        (gradW, gradB) = self.gradtheta_for_batchindex(inputData, targetData, datasetSize, batchSize, numBatches, batchIndex) ##
        # perform parameter update
        for i in range(1,self.numLayers+1):
          self.wmat[i] += -self.learningRate * gradW[i]
          self.bias[i] += -self.learningRate * gradB[i]
    update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val)

  def momentumGD(self, inputData, targetData, datasetSize, batchSize, numBatches, x_val, y_val):
    eta = self.learningRate
    gamma = self.hyperparams["beta_1"]
    #initialize
    self.loss_lst_train = [] ##
    self.error_lst_train = [] ##
    update_w = [0]*(self.numLayers+1)
    update_b = [0]*(self.numLayers+1)
    # run training loop
    for epoch in range(self.epochs):
      update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val, x_val, y_val)
      for batchIndex in range(numBatches):       
        #Get grad theta
        (gradW, gradB) = self.gradtheta_for_batchindex(inputData, targetData, datasetSize, batchSize, numBatches, batchIndex) ##
        # perform parameter update
        for i in range(1,self.numLayers+1):
          update_w[i] = gamma*update_w[i] + eta*gradW[i]
          update_b[i] = gamma*update_b[i] + eta*gradB[i]
          self.wmat[i] += -update_w[i]
          self.bias[i] += -update_b[i]
    update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val) 

  def NAG(self, inputData, targetData, datasetSize, batchSize, numBatches, x_val, y_val):
    eta = self.learningRate
    gamma = self.hyperparams["beta_1"]
    #initialize
    update_w = [0]*(self.numLayers+1)
    update_b = [0]*(self.numLayers+1)
    # run training loop
    for epoch in range(self.epochs):
      update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val)
      for batchIndex in range(numBatches):
        # perform look ahead parameter update
        for i in range(1,self.numLayers+1):
          self.wmat[i] += -gamma*update_w[i]
          self.bias[i] += -gamma*update_b[i]
        (gradW, gradB) = self.gradtheta_for_batchindex(inputData, targetData, datasetSize, batchSize, numBatches, batchIndex) ##
        # perform parameter update
        for i in range(1,self.numLayers+1):
          update_w[i] = gamma*update_w[i] + eta*gradW[i]
          update_b[i] = gamma*update_b[i] + eta*gradB[i]
          self.wmat[i] += -eta*gradW[i]
          self.bias[i] += -eta*gradW[i]
    update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val) 

  def rmsprop(self, inputData, targetData, datasetSize, batchSize, numBatches, x_val, y_val):
    beta = self.hyperparams["beta_2"]
    epsilon = self.hyperparams["epsilon"]
    #Initialise
    v_w = [0]*(self.numLayers+1)
    v_b = [0]*(self.numLayers+1)
    # run training loop
    for epoch in range(self.epochs):
      update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val)
      for batchIndex in range(numBatches):        
        (gradW, gradB) = self.gradtheta_for_batchindex(inputData, targetData, datasetSize, batchSize, numBatches, batchIndex) ##
        # perform parameter update
        for i in range(1,self.numLayers+1):
          v_w[i] = beta*v_w[i] + (1-beta)*gradW[i]**2
          v_b[i] = beta*v_b[i] + (1-beta)*gradB[i]**2 
          self.wmat[i] += -eta * (v_w[i] + epsilon)**-0.5 * gradW[i]
          self.bias[i] += -eta * (v_b[i] + epsilon)**-0.5 * gradB[i]
    update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val) 
  
  def adam(self, inputData, targetData, datasetSize, batchSize, numBatches, x_val, y_val):
    eta = self.learningRate
    beta_1 = self.hyperparams["beta_1"]
    beta_2 = self.hyperparams["beta_2"]
    epsilon = self.hyperparams["epsilon"]
    #initialize
    m_w = [0]*(self.numLayers+1)
    m_b = [0]*(self.numLayers+1)
    v_w = [0]*(self.numLayers+1)
    v_b = [0]*(self.numLayers+1)
    t = 1 #tracks the iteration number
    # run training loop
    for epoch in range(self.epochs):
      update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val)
      for batchIndex in range(numBatches): 
        (gradW, gradB) = self.gradtheta_for_batchindex(inputData, targetData, datasetSize, batchSize, numBatches, batchIndex) ##
        # perform parameter update
        for i in range(1,self.numLayers+1):
          m_w[i] = beta_1*m_w[i] + (1-beta_1)*gradW[i]
          m_b[i] = beta_1*m_b[i] + (1-beta_1)*gradB[i]
          v_w[i] = beta_2*v_w[i] + (1-beta_2)*gradW[i]**2
          v_b[i] = beta_2*v_b[i] + (1-beta_2)*gradB[i]**2
          m_w_hat = m_w[i]/(1-beta_1**t)
          m_b_hat = m_b[i]/(1-beta_1**t)
          v_w_hat = v_w[i]/(1-beta_2**t)
          v_b_hat = v_b[i]/(1-beta_2**t)
          self.wmat[i] += -eta * (v_w_hat + epsilon)**-0.5 * m_w_hat
          self.bias[i] += -eta * (v_b_hat + epsilon)**-0.5 * m_b_hat
        t += 1
    update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val) 

  def nadam(self, inputData, targetData, datasetSize, batchSize, numBatches, x_val, y_val):
    eta = self.learningRate
    beta_1 = self.hyperparams["beta_1"]
    beta_2 = self.hyperparams["beta_2"]
    epsilon = self.hyperparams["epsilon"]
    #initialize
    m_w = [0]*(self.numLayers+1)
    m_b = [0]*(self.numLayers+1)
    v_w = [0]*(self.numLayers+1)
    v_b = [0]*(self.numLayers+1)
    t = 1 #tracks the iteration number
    # run training loop
    for epoch in range(self.epochs):
      update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val)
      for batchIndex in range(numBatches): 
        (gradW, gradB) = self.gradtheta_for_batchindex(inputData, targetData, datasetSize, batchSize, numBatches, batchIndex) ##
        # perform parameter update
        for i in range(1,self.numLayers+1):
          m_w[i] = beta_1*m_w[i] + (1-beta_1)*gradW[i]
          m_b[i] = beta_1*m_b[i] + (1-beta_1)*gradB[i]
          v_w[i] = beta_2*v_w[i] + (1-beta_2)*gradW[i]**2
          v_b[i] = beta_2*v_b[i] + (1-beta_2)*gradB[i]**2
          m_w_hat = (beta_1/(1-beta_1**(t+1)))*m_w[i] + ((1-beta_1)/(1-beta_1**t))*gradW[i]
          m_b_hat = (beta_1/(1-beta_1**(t+1)))*m_b[i] + ((1-beta_1)/(1-beta_1**t))*gradB[i]
          v_w_hat = v_w[i]/(1-beta_2**t)
          v_b_hat = v_b[i]/(1-beta_2**t)
          self.wmat[i] += -eta * (v_w_hat + epsilon)**-0.5 * m_w_hat
          self.bias[i] += -eta * (v_b_hat + epsilon)**-0.5 * m_b_hat
        t += 1
    update_val_train_loss_and_error(self, inputData, targetData, x_val, y_val) 


  def train(self, inputData, targetData, x_val, y_val, **kwargs):
    """
    Train the network on the given input and target datasets
    Unless colwiseData=True is given as an argument, data will be interpreted as being dataset size x layer dimension
    """
    # resolving input and target dimensions
    inputData  = np.array(inputData,ndmin=2)
    targetData = np.array(targetData,ndmin=2)
    if "colwiseData" in kwargs and kwargs["colwiseData"]==True:
      pass
    else:
      inputData  = inputData.T
      targetData = targetData.T
      x_val = x_val.T
      y_val = y_val.T
    assert np.shape(inputData)[1]==np.shape(targetData)[1], "input and target datasets have different dataset sizes"
    assert np.shape(inputData)[0]==self.layerSizes[0], "size of input datapoint differs from size of input vector given as hyperparameter"
    assert np.shape(targetData)[0]==self.layerSizes[-1], "size of target datapoint differs from size of target vector given as hyperparameter"
    datasetSize = np.shape(targetData)[1]

    # calculate batch parameters
    batchSize = datasetSize if self.batchSize==-1 else self.batchSize
    numBatches = int(np.ceil(datasetSize / batchSize))

    #Initialise loss and error lists
    self.loss_train = []
    self.error_train = []
    self.loss_val = []
    self.error_val = []

    if self.hyperparams["optimizer"] == GDOPT_NONE:
      self.sgd(inputData, targetData, datasetSize, batchSize, numBatches)
    elif self.hyperparams["optimizer"] == GDOPT_MOMENTUM:
      self.momentumGD(inputData, targetData, datasetSize, batchSize, numBatches)
    elif self.hyperparams["optimizer"] == GDOPT_NESTEROV:
      self.NAG(inputData, targetData, datasetSize, batchSize, numBatches)
    elif self.hyperparams["optimizer"] == GDOPT_RMSPROP:
      self.rmsprop(inputData, targetData, datasetSize, batchSize, numBatches)
    elif self.hyperparams["optimizer"] == GDOPT_ADAM:
      self.adam(inputData, targetData, datasetSize, batchSize, numBatches)
    elif self.hyperparams["optimizer"] == GDOPT_NADAM:
      self.nadam(inputData, targetData, datasetSize, batchSize, numBatches)
    
    self.loss_train = np.array(self.loss_train)
    self.error_train = np.array(self.error_train)
    self.loss_val = np.array(self.loss_val)
    self.error_val = np.array(self.error_val)

        
        

In [None]:
# Smoke test: fit a tiny 2-4-1 sigmoid network to three datapoints
hyp = {
    "layerSizes": [2,4,1],
    "batchSize": 1,
    "learningRate": 1,
    "epochs": 500,
    "activations": [ACTIVATION_SIGMOID, ACTIVATION_SIGMOID],
    "lossFn": LOSS_SQERROR,
    "initWeightMethod": WINIT_RANDOM,  # initModel reads this key
    "initWeightBounds": (-0.1,0.1),
    "optimizer": GDOPT_ADAM,
    "beta_1": 0.9,   # momentum scaling hyperparam
    "beta_2": 0.999, # eta scaling hyperparam
    "epsilon": 1e-8,  # eta scaling hyperparam
    "regparam":0.1    # key has to be spelled "regparam"; that is what setHyperparameters reads
}

x = neuralNetwork(hyp)
inp = np.array([[1,0.5],[-0.5,0.25],[1,2]])
tar = np.array([[0.5],[0.75],[0.67]])
print("Target data:")
print(tar.T)
print("Output before training:")
print(x.infer(inp))
print("Performing training now")
# the toy dataset is too small to split, so it doubles as the validation set
x.train(inp,tar,inp,tar)
print("Output after training for %d epochs with learning rate of %.2f:"%(x.epochs,x.learningRate))
print(x.infer(inp))

In [None]:
# Second smoke test on the same network object: learn the logical AND of two inputs with ReLU layers
x.setHyperparameters({
    "learningRate":0.2,
    "epochs":5000,
    "activations":[ACTIVATION_RELU, ACTIVATION_RELU]
    # square error loss will work as Hamming distance in this case
})
inp = np.array([[0,0],[0,1],[1,0],[1,1],[0,0],[0,1],[1,0],[1,1]])
tar = np.array([[0],[0],[0],[1],[0],[0],[0],[1]])
print("Target data:")
print(tar.T)
print("Output before training:")
print(x.infer(inp))
print("Performing training now")
# train requires validation data; the truth table is reused for it
x.train(inp,tar,inp,tar)
print("Output after training for %d epochs with learning rate of %.2f:"%(x.epochs,x.learningRate))
print(x.infer(inp))

In [None]:
#Reshaping the 'x' data: every image is flattened into a vector of length len_1D
len_1D = x_train.shape[1]*x_train.shape[2]
x_train_1D = x_train.reshape(len(x_train), len_1D)
x_test_1D  = x_test.reshape(len(x_test), len_1D)

#Transforming 'y' data - changing scalar i to vector e(i)
# row i of the identity matrix is e(i), so indexing it with the labels one-hot encodes them
# (labels are used directly as column indices, as before)
y_train_1D = np.eye(len(classes))[y_train]
# the test targets have to come from the test labels
y_test_1D = np.eye(len(classes))[y_test]

# hold out a random fraction of the training data as validation set
frac_val = 0.1
all_idx = np.arange(len(x_train))
val_idx = np.random.choice(all_idx, int(frac_val*len(x_train)), replace=False)
# all indices not drawn for validation, in ascending order
tr2_idx = np.setdiff1d(all_idx, val_idx)

x_train2 = x_train_1D[tr2_idx]
y_train2 = y_train_1D[tr2_idx]
x_val = x_train_1D[val_idx]
y_val = y_train_1D[val_idx]

In [None]:
# display the number of training images
len(x_train)

In [None]:
# Single softmax layer trained on a small slice of Fashion-MNIST as an end-to-end check
hyp = {
    "layerSizes": [len(x_train_1D[0]),len(y_train_1D[0])],
    "batchSize": 32,
    "learningRate": 1e-3,
    "epochs": 1,
    "activations": [ ACTIVATION_SOFTMAX],
    "lossFn": LOSS_CROSSENTROPY,
    "initWeightMethod": WINIT_RANDOM,  # initModel reads this key
    "initWeightBounds": (-1,1),
    "optimizer": GDOPT_ADAM,
    "beta_1": 0.9,   # momentum scaling hyperparam
    "beta_2": 0.999, # eta scaling hyperparam
    "epsilon": 1e-8,  # eta scaling hyperparam
    "regparam": 0.1
}

#wandb.config.update(hyp)

NN = neuralNetwork(hyp)
# only the pixel values are scaled to [0,1]; the one-hot targets must stay 0/1
NN.train(x_train2[:256]/255, y_train2[:256], x_val/255, y_val)
# the per-epoch validation metrics are stored on the network object by train
(val_loss, val_error) = (NN.loss_val, NN.error_val)
#(loss_lst, error_lst) = NN.train(x_train_1D, y_train_1D)



Two things (20210310_0242):
1. The very large `exp` values were caused by the inputs being raw pixel values in the range 0 to 255. After scaling them down by 255, the outputs became well-behaved without any special handling of NaN. `np.nan_to_num()` may still be helpful if NaNs show up again later.
2. Why do the softmax outputs not add up to 1?

In [None]:
# display the validation loss obtained in the training cell above
val_loss

In [None]:
# display the validation error obtained in the training cell above
val_error

In [None]:
# scratch cell for checking two numpy idioms on small 3x2 arrays
a = np.array([ [0,1],[1,0],[0,1] ])
b = np.array( [ [0,0],[1,0],[0,0] ] )
# number of columns in which a and b agree in every row
np.count_nonzero(np.all(a==b, axis=0))
# indexing with an integer array selects rows 0 and 1 of a; being the last expression, this is what the cell displays
a[np.array([0,1])]

In [None]:
# wandb log checks
'''wandb.log(
    {"Parameter 1": 
     ["test string","another test string","yet another test"], 
     "Test text": wandb.Html("<b><i>Working</i></b>"), 
     "Test table": wandb.Table(columns=["I like this column", "Well, I like this one"], data=[["The header above likes me", "Same here"],["No idiot, he likes the whole column", "Good point there, neighbour"]])})
'''

# wandb sweep checks
def fsweeptest():
  """
  Dummy sweep target: starts a wandb run (daIndex defaults to 10) and logs ten steps of a cosine metric
  whose frequency is set by the run's daIndex
  """
  wandb.init(config={"daIndex":10})
  for step in range(10):
    phase = np.pi*step*wandb.config.daIndex/80
    wandb.log({"x": step, "daMetric": np.cos(phase)})

# grid sweep over daIndex that exercises fsweeptest
sweepCfg = {"name":"Test sweep",
            "metric":{
                "name":"daMetric",
                # wandb accepts "minimize" or "maximize" (American spelling) as goal
                "goal":"maximize"
            },
            "method":"grid",
            "parameters":{"daIndex":{"values":[10,20,40,80,160]}}}
sweepId = wandb.sweep(sweepCfg)
wandb.agent(sweepId, function = fsweeptest)

In [None]:
def runSweep():
  """
  Build a neuralNetwork from the hyperparameters of the current wandb sweep run

  Reads numHiddenLayers, hiddenLayerSize, batchSize, learningRate, epochs, activationFns, optimizer,
  l2Reg and weightInit from wandb.config. "loss" is read if present and falls back to cross entropy.
  Training and metric logging are not implemented yet
  """
  cfg = wandb.config

  numHidden = cfg.numHiddenLayers
  hyp = {
      # 784 input pixels, numHidden equally sized hidden layers, 10 output classes
      "layerSizes": [784] + [cfg.hiddenLayerSize]*numHidden + [10],
      "batchSize": cfg.batchSize,
      "learningRate": cfg.learningRate,
      "epochs": cfg.epochs,
      # NOTE(review): the swept activation is applied to every hidden layer and the output layer is
      # assumed to be softmax - confirm this is the intended architecture
      "activations": [cfg.activationFns]*numHidden + [ACTIVATION_SOFTMAX],
      "lossFn": cfg.get("loss", LOSS_CROSSENTROPY),
      "optimizer": cfg.optimizer,
      "regparam": cfg.l2Reg,
      "initWeightMethod": cfg.weightInit,
      # not swept: same values as in the manual runs of this notebook
      "initWeightBounds": (-1,1),
      "beta_1": 0.9,   # momentum scaling hyperparam
      "beta_2": 0.999, # eta scaling hyperparam
      "epsilon": 1e-8,  # eta scaling hyperparam
  }

  nn = neuralNetwork(hyp)
  ## run train functions. Also, put wandb.log statements inside after loss/err calculation

# Bayesian sweep over the network hyperparameters; every parameter lists its candidate values
# The parameter names are the keys that a sweep run sees in wandb.config
sweepCfg = {
    "name":"NN Fashion MNIST - Test Sweep", 
    # the sweep minimises a metric named "valLoss"
    "metric":{
        "name":"valLoss", 
        "goal":"minimize"
    }, 
    "method": "bayes", 
    "parameters":{
        "epochs":{
          "values":[5,10]
        },
        "numHiddenLayers":{
          "values":[3,4,5]
        },
        "hiddenLayerSize":{
          "values":[32,64,128]
        },
        # L2 regularisation strength
        "l2Reg":{
          "values":[0,5e-4,0.5]
        },
        "learningRate":{
          "values":[1e-3, 1e-4]
        },
        # every GDOPT_* constant except GDOPT_ADAGRAD
        "optimizer":{
          "values":[GDOPT_NONE, GDOPT_MOMENTUM, GDOPT_NESTEROV, GDOPT_RMSPROP, GDOPT_ADAM, GDOPT_NADAM]
        },
        "batchSize":{
          "values":[16,32,64]
        },
        "weightInit":{
          "values":[WINIT_RANDOM, WINIT_XAVIER]
        },
        "activationFns":{
          "values":[ACTIVATION_SIGMOID, ACTIVATION_RELU, ACTIVATION_TANH]},
    }
}

# in wandb harness: