In [20]:
import numpy as np
from torchvision.datasets import MNIST

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
                    transform=lambda x: np.array(x).flatten(),
                    download=True,
                    train=is_train)
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    return np.array(mnist_data), np.array(mnist_labels)
train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

In [21]:
train_X = train_X/255.0
test_X = test_X/255.0

train_Y = np.eye(10)[train_Y]
test_Y = np.eye(10)[test_Y]

In [22]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

def sigmoid_derivative(y):
    return y*(1-y)

In [23]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True)) 
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [24]:
input_size = 784
output_size = 10
hidden_size = 100

np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size)*0.01
b1 = np.zeros((1,hidden_size))
W2 = np.random.randn(hidden_size, output_size)*0.01
b2 = np.zeros((1,output_size))

In [25]:
def forward_propagation(X):
    z1 = np.dot(X, W1) + b1
    y1 = sigmoid(z1)
    z2 = np.dot(y1, W2) + b2
    y2 = softmax(z2)
    return y1, z1, y2, z2

In [26]:
def backward(X, y_true, y1, y2, learning_rate):
    global W1, W2, b1, b2
    
    output_error = y2-y_true
    
    hidden_error = np.dot(output_error,W2.T)
    hidden_delta = hidden_error*sigmoid_derivative(y1)
    
    w_hidden_output_gradient = np.dot(y1.T, output_error)
    w_input_hidden_gradient = np.dot(X.T,hidden_delta)
    
    b2_hidden_output_gradient = np.sum(output_error, axis=0, keepdims=True)
    b1_input_hidden_gradient = np.sum(hidden_delta, axis=0, keepdims=True)
    
    W1-=learning_rate*w_input_hidden_gradient
    W2-=learning_rate*w_hidden_output_gradient
    b1-=learning_rate*b1_input_hidden_gradient
    b2-=learning_rate*b2_hidden_output_gradient

In [27]:
def cross_entropy_loss(y_pred, y_true, epsilon=1e-12):
    m = y_true.shape[0]  
    y_pred = np.clip(y_pred, epsilon, 1. - epsilon)
    log_likelihood = -np.log(y_pred[range(m), np.argmax(y_true, axis=1)])
    loss = np.sum(log_likelihood) / m
    return loss

def accuracy(y_pred, y_true):
    return np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_true, axis=1))

In [13]:
def train(train_X, train_Y, test_X, test_Y, epochs=20, batch_size=64, learning_rate=0.05):
    n_samples = train_X.shape[0]

    for epoch in range(epochs):
        permutation = np.random.permutation(n_samples)
        train_X_shuffled = train_X[permutation]
        train_Y_shuffled = train_Y[permutation]

        for i in range(0, n_samples, batch_size):
            X_batch = train_X_shuffled[i:i+batch_size]
            Y_batch = train_Y_shuffled[i:i+batch_size]

            y1, z1, y2, z2 = forward_propagation(X_batch)

            backward(X_batch, Y_batch, y1, y2, learning_rate)

        _, _, _, val_pred_train = forward_propagation(train_X)
        train_acc = accuracy(val_pred_train, train_Y)
        _, _, _, val_pred = forward_propagation(test_X)
        val_loss = cross_entropy_loss(val_pred, test_Y)
        val_acc = accuracy(val_pred, test_Y)
        
        

        print(f"Epoch {epoch+1}/{epochs},Train Accuracy: {train_acc:.4f} ,Loss: {val_loss:.4f} ,Validation Accuracy: {val_acc:.4f}")

train(train_X, train_Y, test_X, test_Y, epochs=120, batch_size=100, learning_rate=0.01)

Epoch 1/135,Train Accuracy: 0.9266 ,Loss: 0.0693 ,Validation Accuracy: 0.9293
Epoch 2/135,Train Accuracy: 0.9502 ,Loss: 0.0350 ,Validation Accuracy: 0.9472
Epoch 3/135,Train Accuracy: 0.9597 ,Loss: 0.0238 ,Validation Accuracy: 0.9561
Epoch 4/135,Train Accuracy: 0.9672 ,Loss: 0.0207 ,Validation Accuracy: 0.9626
Epoch 5/135,Train Accuracy: 0.9721 ,Loss: 0.0200 ,Validation Accuracy: 0.9653
Epoch 6/135,Train Accuracy: 0.9767 ,Loss: 0.0118 ,Validation Accuracy: 0.9704
Epoch 7/135,Train Accuracy: 0.9809 ,Loss: 0.0118 ,Validation Accuracy: 0.9721
Epoch 8/135,Train Accuracy: 0.9815 ,Loss: 0.0142 ,Validation Accuracy: 0.9719
Epoch 9/135,Train Accuracy: 0.9832 ,Loss: 0.0114 ,Validation Accuracy: 0.9738
Epoch 10/135,Train Accuracy: 0.9857 ,Loss: 0.0115 ,Validation Accuracy: 0.9752
Epoch 11/135,Train Accuracy: 0.9874 ,Loss: 0.0140 ,Validation Accuracy: 0.9765
Epoch 12/135,Train Accuracy: 0.9885 ,Loss: 0.0141 ,Validation Accuracy: 0.9760
Epoch 13/135,Train Accuracy: 0.9911 ,Loss: 0.0113 ,Validation

KeyboardInterrupt: 

In [28]:
def train_with_learning_rate_scheduler(train_X, train_Y, test_X, test_Y, epochs=20, batch_size=64, learning_rate=0.05):
    n_samples = train_X.shape[0]
    learning_rates_acc = []
    
    for epoch in range(epochs):
        permutation = np.random.permutation(n_samples)
        train_X_shuffled = train_X[permutation]
        train_Y_shuffled = train_Y[permutation]

        for i in range(0, n_samples, batch_size):
            X_batch = train_X_shuffled[i:i+batch_size]
            Y_batch = train_Y_shuffled[i:i+batch_size]

            y1, z1, y2, z2 = forward_propagation(X_batch)

            backward(X_batch, Y_batch, y1, y2, learning_rate)

        _, _, _, val_pred_train = forward_propagation(train_X)
        train_acc = accuracy(val_pred_train, train_Y)
        _, _, _, val_pred = forward_propagation(test_X)
        val_loss = cross_entropy_loss(val_pred, test_Y)
        val_acc = accuracy(val_pred, test_Y)
        learning_rates_acc.append(val_acc)
        
        if epoch % 10==0:
            if abs(learning_rates_acc[-1] - np.mean(learning_rates_acc[-11:-1])) < 0.01:
                learning_rate = learning_rate/1.05
                print("New learning rate:", learning_rate)
        

        print(f"Epoch {epoch+1}/{epochs},Train Accuracy: {train_acc:.4f} ,Loss: {val_loss:.4f} ,Validation Accuracy: {val_acc:.4f}, Learning Rate: {learning_rate:.5f}")

train_with_learning_rate_scheduler(train_X, train_Y, test_X, test_Y, epochs=120, batch_size=100, learning_rate=0.01)

Epoch 1/120,Train Accuracy: 0.9266 ,Loss: 0.0693 ,Validation Accuracy: 0.9293, Learning Rate: 0.01000
Epoch 2/120,Train Accuracy: 0.9502 ,Loss: 0.0350 ,Validation Accuracy: 0.9472, Learning Rate: 0.01000
Epoch 3/120,Train Accuracy: 0.9597 ,Loss: 0.0238 ,Validation Accuracy: 0.9561, Learning Rate: 0.01000
Epoch 4/120,Train Accuracy: 0.9672 ,Loss: 0.0207 ,Validation Accuracy: 0.9626, Learning Rate: 0.01000
Epoch 5/120,Train Accuracy: 0.9721 ,Loss: 0.0200 ,Validation Accuracy: 0.9653, Learning Rate: 0.01000
Epoch 6/120,Train Accuracy: 0.9767 ,Loss: 0.0118 ,Validation Accuracy: 0.9704, Learning Rate: 0.01000
Epoch 7/120,Train Accuracy: 0.9809 ,Loss: 0.0118 ,Validation Accuracy: 0.9721, Learning Rate: 0.01000
Epoch 8/120,Train Accuracy: 0.9815 ,Loss: 0.0142 ,Validation Accuracy: 0.9719, Learning Rate: 0.01000
Epoch 9/120,Train Accuracy: 0.9832 ,Loss: 0.0114 ,Validation Accuracy: 0.9738, Learning Rate: 0.01000
Epoch 10/120,Train Accuracy: 0.9857 ,Loss: 0.0115 ,Validation Accuracy: 0.9752, Le

KeyboardInterrupt: 