<a href="https://colab.research.google.com/github/ykamen/CS4342/blob/main/CS4342_HW5_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive
from google.colab import files
import scipy.optimize

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
tr_labels = np.load('drive/MyDrive/fashion_mnist_train_labels.npy', 'r')
tr_images = np.load('drive/MyDrive/fashion_mnist_train_images.npy', 'r').T / 255.0
te_images = np.load('drive/MyDrive/fashion_mnist_test_images.npy', 'r').T / 255.0
te_labels = np.load('drive/MyDrive/fashion_mnist_test_labels.npy', 'r')

n_values = np.max(tr_labels) + 1
tr_labels = np.eye(n_values)[tr_labels]

n_values = np.max(te_labels) + 1
te_labels = np.eye(n_values)[te_labels]

In [89]:
NUM_INPUT = 784  # Number of input neurons
NUM_HIDDEN = 40  # Number of hidden neurons
NUM_OUTPUT = 10  # Number of output neurons
NUM_CHECK = 5  # Number of examples on which to check the gradient

# Given a vector w containing all the weights and biased vectors, extract
# and return the individual weights and biases W1, b1, W2, b2.
# This is useful for performing a gradient check with check_grad.
def unpack (w):
    # Unpack arguments
    start = 0
    end = NUM_HIDDEN*NUM_INPUT
    W1 = w[0:end]
    start = end
    end = end + NUM_HIDDEN
    b1 = w[start:end]
    start = end
    end = end + NUM_OUTPUT*NUM_HIDDEN
    W2 = w[start:end]
    start = end
    end = end + NUM_OUTPUT
    b2 = w[start:end]
    # Convert from vectors into matrices
    W1 = W1.reshape(NUM_HIDDEN, NUM_INPUT)
    W2 = W2.reshape(NUM_OUTPUT, NUM_HIDDEN)
    return W1,b1,W2,b2

# Given individual weights and biases W1, b1, W2, b2, concatenate them and
# return a vector w containing all of them.
# This is useful for performing a gradient check with check_grad.
def pack (W1, b1, W2, b2):
    return np.hstack((W1.flatten(), b1.flatten(), W2.flatten(), b2.flatten()))

def relu(x):
    return np.maximum(0, x)

def softmax(Z):
  yhat = np.exp(Z)
  for i in range(Z.shape[0]):
    temp = np.sum(yhat[i],axis=0)
    yhat[i] = yhat[i]/temp
  return yhat.T

def pc(yhat, y):
  c = 0
  for i in range(y.shape[0]):
    if np.argmax(yhat[i]) == np.argmax(y[i]):
      c = c+1
  return c/y.shape[0]

# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the cross-entropy (CE) loss, accuracy,
# as well as the intermediate values of the NN.
def fCE (X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    z1 = W1.dot(X.T).T + b1 #np.tile(b1,(X.shape[1],1)).T - for multiple units in batch
    h1 = relu(z1)
    z2 = W2.dot(h1.T).T + b2 #np.tile(b2,(X.shape[1],1)).T – for multiple units in batch
    yhat = softmax(z2)
    cost = -np.sum(Y.dot(np.log(yhat)))/(yhat.shape[1])
    acc = pc(yhat,Y)
    return cost, acc, X, z1, h1, W1, W2, yhat


def reluDerivative(x):
  x[x<=0] = 0
  x[x>0] = 1
  return x

# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the gradient of fCE. You might
# want to extend this function to return multiple arguments (in which case you
# will also need to modify slightly the gradient check code below).
def gradCE (X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    cost, acc, X, z1, h1, W1, W2, yhat = fCE(X,Y,w)
    temp1 = yhat.T-Y
    temp2 = temp1.dot(W2)
    g = (temp2*reluDerivative(z1.T).T).T
    grad_b1 = g
    grad_b2 = (yhat.T-Y)
    grad_W1 = g.dot(X)
    temp3 = (h1.T).T
    grad_W2 = (yhat.T - Y).T.dot(temp3)
    return pack(grad_W1, grad_b1, grad_W2, grad_b2)


#average of the b1 / b2 and then transform into a row / column vector

# Given training and testing datasets and an initial set of weights/biases b,
# train the NN.
def train (trainX, trainY, testX, testY, w):
    batchSize = 256
    epsilon = .01
    alpha = .01
    a = trainX.shape[1] / batchSize
    indeces = np.arange(trainX.shape[1])
    np.random.shuffle(indeces)
    y = trainY
    ind = np.split(indeces,a)
    for i in range(int(a)):
      cost, acc, X, z1, h1, W1, W2, yhat = fCE(trainX,trainY,w)
      batchy = y[ind[i]]
      batchx = trainX[:,ind[i]]
      grad_b1, grad_b2, grad_W1, grad_W2 = gradCE(batchx,batchy,w)
      W1,b1,W2,b2 = unpack(w)
      W1 = W1-epsilon*(grad_W1+alpha*W1/batchSize)
      b1 = b1-epsilon*grad_b1
      W2 = W2-epsilon*(grad_W2+alpha*W2/batchSize)
      b2 = b2-epsilon*grad_b2
      w = pack(W1,b1,W2,b2)
      if (i >= (int(a)-5)):
        cost = -np.sum(trainY*np.log(yhat))/(yhat.shape[0])
        acc = pc(yhat,trainY)
        print(f"Loss at batch {i+1} is {cost}")
    return a

if __name__ == "__main__":
    # Load data
    trainX,trainY,testX,testY = tr_images,tr_labels,te_images,te_labels

    # Initialize weights randomly
    W1 = 2*(np.random.random(size=(NUM_HIDDEN, NUM_INPUT))/NUM_INPUT**0.5) - 1./NUM_INPUT**0.5
    b1 = 0.01 * np.ones(NUM_HIDDEN)
    W2 = 2*(np.random.random(size=(NUM_OUTPUT, NUM_HIDDEN))/NUM_HIDDEN**0.5) - 1./NUM_HIDDEN**0.5
    b2 = 0.01 * np.ones(NUM_OUTPUT)
    
    # Concatenate all the weights and biases into one vector; this is necessary for check_grad
    w = pack(W1, b1, W2, b2)

    # Check that the gradient is correct on just a few examples (randomly drawn).
    idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]
    print("Numerical gradient:")
  #  print(scipy.optimize.approx_fprime(w, lambda w_: fCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[:,idxs]), w_)[0], 1e-10))
    print("Analytical gradient:")
   # print(gradCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[:,idxs]), w))
    print("Discrepancy:")
    #print(scipy.optimize.check_grad(lambda w_: fCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[:,idxs]), w_), \
     #                               lambda w_: gradCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[:,idxs]), w_), \
      #                              w))

    # Train the network using SGD.
  #  train(trainX, trainY, testX, testY, w)



Numerical gradient:
Analytical gradient:
Discrepancy:


In [90]:
idxs = 0
print("Numerical gradient:")
print(scipy.optimize.approx_fprime(w, lambda w_: fCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[idxs]), w_)[0], 1e-10))
print("Analytical gradient:")
print(gradCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[idxs]), w))
print("Discrepancy:")
print(scipy.optimize.check_grad(lambda w_: fCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[idxs]), w_)[0], \
                               lambda w_: gradCE(np.atleast_2d(trainX[:,idxs]), np.atleast_2d(trainY[idxs]), w_), \
                              w))

Numerical gradient:
[ 0.          0.          0.         ...  0.10310863  0.09741985
 -0.91501029]
Analytical gradient:
[ 0.          0.          0.         ...  0.10310648  0.09741803
 -0.91501308]
Discrepancy:
1.1153376700730655e-06


In [None]:
W1 = 2*(np.random.random(size=(NUM_HIDDEN, NUM_INPUT))/NUM_INPUT**0.5) - 1./NUM_INPUT**0.5
b1 = 0.01 * np.ones(NUM_HIDDEN)

W2 = 2*(np.random.random(size=(NUM_OUTPUT, NUM_HIDDEN))/NUM_HIDDEN**0.5) - 1./NUM_HIDDEN**0.5
b2 = 0.01 * np.ones(NUM_OUTPUT)

w = pack(W1, b1, W2, b2)

print(trainX.shape[1])



#fCE(tr_images,tr_labels,w)
#gradCE(tr_images,tr_labels,w)
#print(fCE(tr_images,tr_labels,w).shape)
train(trainX, trainY, testX, testY, w)

60000
(257, 10)
(257, 40)
(40, 257)


ValueError: ignored