In [5]:
import numpy as np
import torch
import torchvision
from torchvision import datasets, transforms

In [6]:
transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])
test_dataset = datasets.MNIST('../data', train=True, download=True, transform=transform)
train_dataset = datasets.MNIST('../data', train=False, transform=transform)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 12066901.08it/s]


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 37180998.72it/s]

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 12952781.16it/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 6129513.76it/s]


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw



#### 1. Unbatch

In [34]:
# Creating a simple Multi layer perceptron
# h = sigmoid(Wx + b)
# y = softmax(Vh + c)


train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1)

Ni = 784
Nh = 500
No = 10

W = np.random.normal(size = (Nh, Ni)) # Nh x Ni
b = np.random.normal(size = (Nh,)) # Nh
V = np.random.normal(size = (No, Nh)) # No x Nh
c = np.random.normal(size = (No,))

def sigmoid(x: np.ndarray):
    return 1 / (1.0 + np.exp(-x))

def softmax(x):
    return np.exp(x) / (np.einsum("x->", np.exp(x)) + 1e-8)

x, target = 10, 20 # samples, no batch
iterations = 100 # number of samples

# parameters
lr = 1e-4
for i in range(iterations):
    total_loss = 0
    for data, target in train_loader:
        x = data.numpy().reshape(Ni) # data shape [1,1,28,28] to [784,]
        target = target.numpy() 
        target = np.eye(No)[target].reshape(No) # target shape [10,]
        # Forward propagation

        ha = np.einsum("hi,i->h", W, x) + b #(Nh,)
        h = sigmoid(ha) # (Nh,)
        ya = np.einsum("oh,h->o", V, h) + c #(No,)
        y = softmax(ya) #(No,)

        #print(f"ha {ha.mean()}, h: {h.mean()}, ya: {ya.mean()}, y: {y.mean()}")

        # Loss
        loss = -target * y
        loss = np.einsum("k->", loss)
        total_loss += loss

        # Backpropagation

        dL_dya = y - target
        dL_dV = np.einsum("o,h->oh", dL_dya, h)
        dL_dc = dL_dya
        dL_dh = np.einsum("o,oh->h", dL_dya, V) 
        dL_dha = dL_dh * np.einsum("h,h->h", h, (1-h))
        dL_dW = np.einsum("h,i->hi", dL_dha, x)
        dL_db = dL_dha

        #print(f"dya {dL_dya.mean()}, dV {dL_dV.mean()} dc: {dL_dc.mean()} dh {dL_dh.mean()} dha {dL_dha.mean()} dw {dL_dW.mean()}")

        # Update parameters
        W = W - lr * dL_dW
        b = b - lr * dL_db
        V = V - lr * dL_dV
        c = c - lr * dL_dc

    test_loss = 0
    test_accuracy = 0
    for data, target in test_loader:
        x = data.numpy().reshape(Ni) # data shape [1,1,28,28] to [784,]
        target = target.numpy() 
        target = np.eye(No)[target].reshape(No) # target shape [10,]
        # Forward propagation

        ha = np.einsum("hi,i->h", W, x) + b #(Nh,)
        h = sigmoid(ha) # (Nh,)
        ya = np.einsum("oh,h->o", V, h) + c #(No,)
        y = softmax(ya) #(No,)

        # Loss
        loss = -target * y
        loss = np.einsum("k->", loss)
        test_loss += loss

        # Accuracy
        y_pred = np.argmax(y) #(1,)
        y_true = np.argmax(target) #(1,)
        test_accuracy += (y_pred == y_true).mean()

    print(f"Train Loss is: {total_loss/len(train_loader):.2f} Test loss: {test_loss/len(test_loader):.2f}, Test accuracy {test_accuracy/len(test_loader)*100:.2f}")

Train Loss is: -0.10 Test loss: -0.13, Test accuracy 13.55
Train Loss is: -0.19 Test loss: -0.22, Test accuracy 22.19
Train Loss is: -0.27 Test loss: -0.29, Test accuracy 29.70
Train Loss is: -0.35 Test loss: -0.35, Test accuracy 35.83
Train Loss is: -0.40 Test loss: -0.40, Test accuracy 40.49
Train Loss is: -0.45 Test loss: -0.44, Test accuracy 44.53
Train Loss is: -0.49 Test loss: -0.47, Test accuracy 47.70
Train Loss is: -0.53 Test loss: -0.50, Test accuracy 50.58
Train Loss is: -0.56 Test loss: -0.52, Test accuracy 52.80
Train Loss is: -0.58 Test loss: -0.54, Test accuracy 54.80
Train Loss is: -0.60 Test loss: -0.56, Test accuracy 56.56
Train Loss is: -0.62 Test loss: -0.57, Test accuracy 58.13
Train Loss is: -0.64 Test loss: -0.59, Test accuracy 59.44
Train Loss is: -0.66 Test loss: -0.60, Test accuracy 60.64
Train Loss is: -0.67 Test loss: -0.61, Test accuracy 61.59
Train Loss is: -0.68 Test loss: -0.62, Test accuracy 62.54
Train Loss is: -0.69 Test loss: -0.63, Test accuracy 63.

#### 1. Batch

In [None]:
# Previously, x is no batch, but now it is batched

# Creating a simple Multi layer perceptron
# h = sigmoid(Wx + b)
# y = softmax(Vh + c)

batch_size = 16
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

Ni = 784
Nh = 500
No = 10

W = np.random.normal(size = (Nh, Ni)) # Nh x Ni
b = np.random.normal(size = (Nh,)) # Nh
V = np.random.normal(size = (No, Nh)) # No x Nh
c = np.random.normal(size = (No,))

def sigmoid(x: np.ndarray):
    return 1 / (1.0 + np.exp(-x))

def softmax(x):
    return np.exp(x) / (np.einsum("bx->b", np.exp(x)) + 1e-8)

iterations = 100 # number of samples

# parameters
lr = 1e-4
for i in range(iterations):
    total_loss = 0
    for data, target in train_loader:
        x = data.numpy().reshape(batch_size, Ni) # data shape [bs,1,28,28] to [bs, 784]
        target = target.numpy() 
        onehot = np.zeros((batch_size, No)) # target shape [bs, 10]
        onehot[np.arange(batch_size), target] = 1
        target = onehot
        # Forward propagation

        ha = np.einsum("hi,bi->bh", W, x) + b #(bs, Nh)
        h = sigmoid(ha) # (bs,Nh)
        ya = np.einsum("oh,bh->bo", V, h) + c #(bs,No)
        y = softmax(ya) #(bs,No)

        #print(f"ha {ha.mean()}, h: {h.mean()}, ya: {ya.mean()}, y: {y.mean()}")

        # Loss
        loss = -target * y
        loss = np.einsum("bk->", loss)
        total_loss += loss/batch_size

        # Backpropagation

        dL_dya = y - target
        dL_dV = np.einsum("bo,bh->oh", dL_dya, h) / batch_size ## Need to divide as omitting leter in output will be summed
        dL_dc = np.einsum("bo -> o", dL_dya) / batch_size ### We divide batch because omitting letter in output will be sumed, then we need to divide
        dL_dh = np.einsum("bo,oh->bh", dL_dya, V) 
        dL_dha = dL_dh * np.einsum("bh,bh->bh", h, (1-h))
        dL_dW = np.einsum("bh,bi->hi", dL_dha, x) / batch_size
        dL_db = np.einsum("bh -> h", dL_dha) / batch_size

        #print(f"dya {dL_dya.mean()}, dV {dL_dV.mean()} dc: {dL_dc.mean()} dh {dL_dh.mean()} dha {dL_dha.mean()} dw {dL_dW.mean()}")

        # Update parameters
        W = W - lr * dL_dW
        b = b - lr * dL_db
        V = V - lr * dL_dV
        c = c - lr * dL_dc

    test_loss = 0
    test_accuracy = 0
    for data, target in test_loader:
        x = data.numpy().reshape(batch_size, Ni) # data shape [1,1,28,28] to [784,]
        target = target.numpy() 
        target = np.eye(No)[target].reshape(No) # target shape [10,]
        # Forward propagation

        ha = np.einsum("hi,i->h", W, x) + b #(Nh,)
        h = sigmoid(ha) # (Nh,)
        ya = np.einsum("oh,h->o", V, h) + c #(No,)
        y = softmax(ya) #(No,)

        # Loss
        loss = -target * y
        loss = np.einsum("k->", loss)
        test_loss += loss

        # Accuracy
        y_pred = np.argmax(y) #(1,)
        y_true = np.argmax(target) #(1,)
        test_accuracy += (y_pred == y_true).mean()

    print(f"Train Loss is: {total_loss/len(train_loader):.2f} Test loss: {test_loss/len(test_loader):.2f}, Test accuracy {test_accuracy/len(test_loader)*100:.2f}")