# In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchvision.transforms import ToTensor
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import matplotlib.cm as cm 
import os

# Learning rate; reported together with the accuracy results further down.
alpha = 0.1

# MNIST training split. ToTensor turns each PIL image into a float tensor.
mnist_data_train = torchvision.datasets.MNIST('.', train=True,download=True, transform=ToTensor())
# Single-sample, in-order loader over the training split.
train_data_loader = torch.utils.data.DataLoader(mnist_data_train, batch_size=1, shuffle=False)
# MNIST test split. It gets the same ToTensor transform as the training split
# so that both splits yield tensors rather than raw PIL images.
mnist_data_test = torchvision.datasets.MNIST('.', train=False,download=True, transform=ToTensor())

#print(len(train_data_loader))

# Scratch containers for collected features / labels.
train_features_array = []
train_labels_array = []

def init_params():
  """Create randomly initialised weights for the 784-300-200-10 network.

  Every entry is drawn independently from the uniform distribution on
  [-1, 1). Each matrix has shape (outputs, inputs), so a layer is applied
  as ``W @ x``. The network has no bias terms.

  Returns:
    Tuple ``(W1, W2, W3)`` with shapes (300, 784), (200, 300) and (10, 200).
  """
  W1 = np.random.uniform(-1, 1, [300, 784])
  W2 = np.random.uniform(-1, 1, [200, 300])
  W3 = np.random.uniform(-1, 1, [10, 200])
  return W1, W2, W3

def sigmoid (x):
  """Element-wise logistic function ``1 / (1 + exp(-x))``.

  Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so that
  large negative inputs do not overflow inside ``exp``.

  Args:
    x: Scalar or NumPy array of pre-activations.

  Returns:
    Values in [0, 1] with the same shape as ``x``.
  """
  # logaddexp(0, -x) == log(exp(0) + exp(-x)) computed without overflow.
  return np.exp(-np.logaddexp(0, -x))

def softmax(Z):
  """Softmax along axis 0 (each column of a 2-D input sums to 1).

  The per-column maximum is subtracted before exponentiating. This leaves
  the result unchanged mathematically but prevents ``exp`` from overflowing
  to inf (and the quotient from becoming NaN) for large logits.

  Args:
    Z: NumPy array of logits; classes run along axis 0.

  Returns:
    Array with the same shape as ``Z`` holding the class probabilities.
  """
  shifted = Z - np.max(Z, axis=0, keepdims=True)
  exp_z = np.exp(shifted)
  return exp_z / np.sum(exp_z, axis=0, keepdims=True)
    
#def forward_prop(W1, W2, W3, X):
#  Z1 = W1.dot(X)
#  A1 = sigmoid(Z1)
#  Z2 = W2.dot(A1)
#  A2 = sigmoid(Z2)
#  Z3 = W3.dot(A2)
#  A3 = softmax(Z3)
#  return Z1, A1, Z2, A2, Z3, A3

def forward_prop(W1, W2, W3, X):
  """Run one forward pass through the three-layer network.

  The two hidden layers use sigmoid activations and the output layer uses
  softmax. No bias terms are applied.

  Args:
    W1, W2, W3: Weight matrices of the first, second and output layer.
    X: Input vector (or column) whose leading dimension matches W1's columns.

  Returns:
    Tuple ``(Z1, A1, Z2, A2, Z3, A3)`` of pre-activations ``Z*`` and
    activations ``A*`` for each layer, in layer order.
  """
  hidden1_pre = W1 @ X
  hidden1_act = sigmoid(hidden1_pre)

  hidden2_pre = W2 @ hidden1_act
  hidden2_act = sigmoid(hidden2_pre)

  output_pre = W3 @ hidden2_act
  output_act = softmax(output_pre)

  return hidden1_pre, hidden1_act, hidden2_pre, hidden2_act, output_pre, output_act



def softmax_deriv(Y, Y_hat):
  """Return the output-layer error ``Y_hat - Y``.

  Args:
    Y: Target values (e.g. a one-hot column).
    Y_hat: Predicted values with a shape compatible with ``Y``.

  Returns:
    The difference ``Y_hat - Y``.
  """
  error = Y_hat - Y
  return error

def sigmoid_deriv(x):
  """Build the diagonal matrix of sigmoid slopes from sigmoid *outputs*.

  Args:
    x: 1-D array of sigmoid activations (not pre-activations).

  Returns:
    Square matrix with ``x * (1 - x)`` on the diagonal and zeros elsewhere.
    Note that ``np.diag`` extracts a diagonal instead when given a 2-D
    array, so callers should pass a 1-D vector.
  """
  slope = x * (1 - x)
  return np.diag(slope)

def onehot(y, num_classes=10):
  """Encode a class index as a one-hot column vector.

  Args:
    y: Integer class index.
    num_classes: Length of the encoding; defaults to 10.

  Returns:
    Float array of shape ``(num_classes, 1)`` that is 1 at row ``y`` and 0
    elsewhere.

  Raises:
    IndexError: If ``y`` is outside the valid index range.
  """
  one_hot_Y = np.zeros((num_classes, 1))
  one_hot_Y[y, 0] = 1
  return one_hot_Y

#def back_prop(W1, W2, W3, A1, A2, A3, X, Y):
# print(onehot(Y))
#  print(A2.T)
#  dW3 = (A3 - onehot(Y)).dot(A2.T)
#  A2_deriv = sigmoid_deriv(A2.squeeze())
#  A = np.matmul (A2_deriv,W3.T)
#  B = np.matmul(A,(A3-onehot(Y)))
#  dW2 = np.matmul(B,A1.T)
#  A1_deriv = sigmoid_deriv(A1.squeeze())
#  C = np.matmul (A1_deriv, W2.T)
#  D = np.matmul ( C, B)
#  dW1 = np.matmul(D, X.T)
#  return dW1, dW2, dW3


def back_prop (W2, W3, A1, A2, A3, X, Y):
    """Compute the weight gradients for a single training sample.

    The output error is taken as ``A3 - onehot(Y)`` and propagated back
    through the two sigmoid hidden layers.

    Args:
        W2, W3: Weight matrices of the second hidden layer and output layer.
        A1, A2: Hidden-layer activations as column vectors.
        A3: Output-layer activations as a column vector.
        X: Input sample as a column vector.
        Y: Integer class label of the sample.

    Returns:
        Tuple ``(dW1, dW2, dW3)`` of gradients, each shaped like the
        corresponding weight matrix.
    """
    A2 = np.atleast_2d(np.array(A2))
    output_error = A3 - onehot(Y)
    dW3 = np.matmul(output_error, A2.T)

    # Multiplying by diag(a * (1 - a)) on the left only scales row i of the
    # right-hand matrix by a_i * (1 - a_i). Broadcasting a column of slopes
    # does the same without materialising a dense n x n diagonal matrix.
    A2_slope = (A2 * (1 - A2)).reshape(-1, 1)
    B = np.matmul(A2_slope * W3.T, output_error)
    dW2 = np.matmul(B, A1.T)

    A1_slope = (A1 * (1 - A1)).reshape(-1, 1)
    D = np.matmul(A1_slope * W2.T, B)
    dW1 = np.matmul(D, X.T)
    return dW1, dW2, dW3

def update_params(W1, W2, W3, dW1, dW2, dW3, alpha):
  """Apply one gradient-descent step to all three weight matrices.

  Args:
    W1, W2, W3: Current weights.
    dW1, dW2, dW3: Gradients matching the respective weights.
    alpha: Step size.

  Returns:
    Tuple ``(W1, W2, W3)`` of new weights ``W - alpha * dW``; the inputs are
    not modified in place.
  """
  weights = (W1, W2, W3)
  grads = (dW1, dW2, dW3)
  new_W1, new_W2, new_W3 = (w - alpha * g for w, g in zip(weights, grads))
  return new_W1, new_W2, new_W3

def get_predictions(A3):
    """Return the index of the largest entry along axis 0 of ``A3``.

    For a column of class scores this is the predicted class; a 2-D input
    yields one index per column.
    """
    predicted_class = np.argmax(A3, axis=0)
    return predicted_class

def gradient_descent(W1, W2, W3, X, Y, alpha):
    """Perform one stochastic-gradient step on a single sample.

    Args:
        W1, W2, W3: Current weight matrices.
        X: Flat feature vector of the sample (1-D, or a single row); it is
            turned into a column vector internally.
        Y: Integer class label of the sample.
        alpha: Learning rate.

    Returns:
        Tuple ``(W1, W2, W3, L)`` with the updated weights and the
        cross-entropy loss ``L`` of the sample as a ``(1, 1)`` array. The
        loss is computed from the forward pass made *before* the update.
    """
    # atleast_2d turns a 1-D vector into a row; transpose to get a column.
    X = np.atleast_2d(np.array(X)).T
    _, a1, _, a2, _, a3 = forward_prop(W1, W2, W3, X)
    dW1, dW2, dW3 = back_prop(W2, W3, a1, a2, a3, X, Y)
    W1, W2, W3 = update_params(W1, W2, W3, dW1, dW2, dW3, alpha)
    # The one-hot row picks out the probability assigned to the true class.
    y_cap = np.matmul(onehot(Y).T, a3)
    L = -np.log(y_cap)
    return W1, W2, W3, L

# Experiment: train the network from scratch on the first `g` MNIST training
# samples for each `g` in `train_size`, measure accuracy on the full test
# split, and plot accuracy against training-set size.
mnist_data_train = torchvision.datasets.MNIST('.', train=True,download=True, transform=ToTensor())
mnist_data_test = torchvision.datasets.MNIST('.', train=False,download=True, transform=ToTensor())

train_size = [10, 50, 100, 500, 1000, 5000, 10000, 50000]
test_size = len(mnist_data_test)
epochs = 10
acc_arr = []


def _flatten_samples(dataset, count):
  """Return the first `count` samples as (flat NumPy feature vector, label) pairs."""
  samples = []
  for idx in range(count):
    image, label = dataset[idx]
    samples.append((torch.flatten(image.squeeze()).numpy(), label))
  return samples


# Convert every sample once up front. Indexing the dataset re-applies the
# transform on each access, which would otherwise be repeated for every
# epoch and every training-set size.
train_samples = _flatten_samples(mnist_data_train, max(train_size))
test_samples = _flatten_samples(mnist_data_test, test_size)

for g in train_size:
  W1, W2, W3 = init_params()
  # Mean per-sample loss of each epoch for this training-set size.
  Loss = []

  for epoch in range(epochs):
    epoch_loss = 0.0
    for In, Y in train_samples[:g]:
      # Train with the same learning rate that is reported below.
      W1, W2, W3, L = gradient_descent(W1, W2, W3, In, Y, alpha)
      epoch_loss += L.item()
    Loss.append(epoch_loss / g)

  correct_pred = 0
  for In, Y in test_samples:
    a3 = forward_prop(W1, W2, W3, In)[-1]
    if get_predictions(a3) == Y:
      correct_pred += 1

  acc = correct_pred/test_size
  acc_arr.append(acc)
  print("Training set size : {} , Learning rate : {}".format(g, alpha))
  print("Total number of predictions (test set size) : {}".format(test_size))
  print("Number of correct predictions : {}".format(correct_pred))
  print("Number of errors (wrong predictions) : {}".format(test_size - correct_pred))
  print("Accuracy : {}\n".format(acc))

print(acc_arr)
plt.plot(train_size,acc_arr)
plt.xscale("log")
plt.xlabel("Training set size")

plt.ylabel("Accuracy")
plt.grid()
plt.show()
