In [2]:
# MLP manully
# -----------

import torch
import torchvision
import matplotlib.pyplot as plt
from torch.utils import data
from torchvision.transforms import transforms

# load datasets
def load_minibatch_data(batch_size, n_workers, resize=None):
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    train_set = torchvision.datasets.FashionMNIST(root='../data', 
                                                  train=True, 
                                                  transform=trans, 
                                                  download=True)
    test_set = torchvision.datasets.FashionMNIST(root='../data', 
                                                  train=False, 
                                                  transform=trans, 
                                                  download=True)
    return (data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=n_workers), 
            data.DataLoader(test_set, batch_size=batch_size, shuffle=True, num_workers=n_workers))

batch_size = 256
train_iter, test_iter = load_minibatch_data(batch_size, n_workers=4)
for X, y in train_iter:
    print('Xshape[0] = row number:', X.shape[0])
    print('Xshape[1] = column number:', X.shape[1])
    print('Xshape[2 * 3] = image shape:', X.shape[2], '*', X.shape[3])
    break

Xshape[0] = row number: 256
Xshape[1] = column number: 1
Xshape[2 * 3] = image shape: 28 * 28


In [5]:
# initialize parameters
from torch import nn

num_inputs = 28 * 28
num_outputs = 10
num_hiddens = 16 * 16

# attention to size/shape of parameter!
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))

# attention to size/shape of parameter!
W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))

params = [W1, b1, W2, b2]

# activation: 
# ReLU activation function
# for hidden layer
def relu(X):
    zeros = torch.zeros_like(X)
    return torch.max(X, zeros)

# softmax activation function combined with cross entropy loss
# for output layer
def sm_cross_entropy(y_hat, y):
    max = torch.max(y_hat, dim=1).values
    y_log = y_hat.T - max - torch.log(torch.sum(torch.exp(y_hat.T - max)))
    y_log = y_log.T
    return - y_log[range(y_log.shape[0]), y]

# single hidden layer network
def net(X):
    X = X.reshape((-1, num_inputs))
    H = relu(torch.matmul(X, W1) + b1)  # '@' means matrix multiplication
    return (torch.matmul(H, W2) + b2)

# loss
loss = nn.CrossEntropyLoss()

# optimizer
def SGD(params, lr, batch_size):
    with torch.no_grad():      # we dont want to compute gradient when updating parameter
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

optimizer = torch.optim.SGD(params, lr=0.1)

# accuracy
def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    compare = y_hat.type(y.dtype) == y
    return float(compare.type(y.dtype).sum() / len(y))


# train
n_epochs = 10
lr = 0.1
accuracy_epoch = []
loss_epoch = []

def train(net, minibatch_data, epochs, loss, optimizer):
    for epoch in range(epochs):
        accuracy_batch = []
        loss_batch = []
        for X, y in minibatch_data:
            # train
            y_hat = net(X)
            # accuracy
            a = accuracy(y_hat, y)
            accuracy_batch.append(a)
            # loss
            l = loss(y_hat, y)
            loss_batch.append(sum(l) / len(l))
            l.sum().backward()
            # optimization
            optimizer(params, lr, X.shape[0])
            # optimizer.zero_grad()
            # l.backward()
            # optimizer.step()
        a_avg = sum(accuracy_batch) / len(accuracy_batch)
        l_avg = sum(loss_batch) / len(loss_batch)
        print(a_avg)
        print(l_avg)
        accuracy_epoch.append(a_avg)
        loss_epoch.append(l_avg)
    return (loss_epoch, accuracy_epoch)

MLP_loss, MLP_accuracy = train(net, train_iter, n_epochs, sm_cross_entropy, SGD)
# MLP_loss, MLP_accuracy = train(net, train_iter, n_epochs, loss, optimizer)
print('loss: %.3f' % (sum(MLP_loss) / n_epochs))
print('accuracy: %.3f' % (sum(MLP_accuracy) / n_epochs * 100), '%')

0.6420933066530431
tensor(6.6608, grad_fn=<DivBackward0>)
0.7885028811211282
tensor(6.1965, grad_fn=<DivBackward0>)
0.8168661348363186
tensor(6.1118, grad_fn=<DivBackward0>)
0.8317542109083622
tensor(6.0676, grad_fn=<DivBackward0>)
0.8398769946808511
tensor(6.0358, grad_fn=<DivBackward0>)
0.8458832002700644
tensor(6.0140, grad_fn=<DivBackward0>)
0.8527316045253834
tensor(5.9963, grad_fn=<DivBackward0>)
0.8552138741980208
tensor(5.9849, grad_fn=<DivBackward0>)
0.8613918438870856
tensor(5.9686, grad_fn=<DivBackward0>)
0.861968085106383
tensor(5.9629, grad_fn=<DivBackward0>)
loss: 6.100
accuracy: 81.963 %


In [11]:
# MLP with torch

# load dataset
# done

# MLP model
import torch
from torch import nn

net = nn.Sequential(nn.Flatten(), 
                    nn.Linear(28*28, 256), 
                    nn.ReLU(), 
                    nn.Linear(256, 10))

# initialize parameters
def initialize_weights(net):
    if type(net) == nn.Linear:
        nn.init.normal(net.weight, std=0.01)
net.apply(initialize_weights)

# loss
ce_loss = nn.CrossEntropyLoss()

# optimizer
sgd_optim = torch.optim.SGD(net.parameters(), lr=0.1)

# accuracy
def accuracy(y_hat, y):
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    compare = y_hat.type(y.dtype) == y
    return float(compare.type(y.dtype).sum() / len(y))

# train
n_epochs = 20
accuracy_epoch = []
loss_epoch = []
def train(net, minibatch_data, n_epochs, loss, optimizer):
    net.train()
    for epoch in range(n_epochs):
        accuracy_batch = []
        loss_batch = []
        for X, y in minibatch_data:
            y_hat = net(X)
            accuracy_batch.append(accuracy(y_hat, y))
            l = loss(y_hat, y)
            loss_batch.append(l)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        a_avg = sum(accuracy_batch) / len(accuracy_batch)
        l_avg = sum(loss_batch) / len(loss_batch)
        # print(a_avg)
        # print(l_avg)
        accuracy_epoch.append(a_avg)
        loss_epoch.append(l_avg)
    return (loss_epoch, accuracy_epoch)

MLP_loss, MLP_accuracy = train(net, train_iter, n_epochs, ce_loss, sgd_optim)
print('loss: %.3f' % (sum(MLP_loss) / n_epochs))
print('accuracy: %.3f' % (sum(MLP_accuracy) / n_epochs * 100), '%')

  nn.init.normal(net.weight, std=0.01)


loss: 0.429
accuracy: 84.890 %
