In [1]:
%matplotlib inline
import d2lzh as d2l
from mxnet import autograd, nd

In [2]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

In [3]:
num_inputs = 784
num_outputs = 10

W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)

In [4]:
W, b

(
 [[ 0.01163079  0.00483805  0.00299563 ... -0.0235563   0.0054144
    0.02678506]
  [ 0.01254634 -0.00548774 -0.00681064 ... -0.02757963  0.0107628
   -0.00614132]
  [ 0.01830765 -0.01146806  0.00053838 ...  0.00201315  0.00350055
    0.00536052]
  ...
  [-0.00507716 -0.00258162 -0.00671208 ...  0.00318219 -0.00042557
   -0.00010889]
  [-0.00392015 -0.0097427   0.00342802 ... -0.01475318  0.01448503
    0.00457099]
  [-0.01297282 -0.01351191 -0.00353406 ...  0.01087382 -0.00354935
   -0.00361599]]
 <NDArray 784x10 @cpu(0)>, 
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 <NDArray 10 @cpu(0)>)

In [5]:
W.attach_grad()
b.attach_grad()

In [6]:
# X 256 1 28 28 (before transform X 256 28 28 1)
# net(X) reshape to 256 1 784 * W 784 10 = 256 1 10 + b
# softmax(net(X)) 

In [7]:
def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition

In [38]:
def net(X):
    return softmax(nd.dot(X.reshape(-1, num_inputs), W) + b)

In [9]:
def cross_entropy(y_hat, y):
    return -nd.pick(y_hat, y).log()

In [10]:
def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1) == y.astype('float32')).mean().asscalar()

In [11]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        y = y.astype('float32')
        acc_sum += (net(X).argmax(axis=1) == y).sum().asscalar()
        n += y.size
        
    return acc_sum / n

In [12]:
evaluate_accuracy(test_iter, net)

0.0856

In [39]:
num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, 
             params = None, lr = None, trainer = None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            # print((y_hat.argmax(axis=1) == y).sum().asscalar())
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
            break
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train_acc %.3f, test_acc %.3f' 
             % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
    

In [40]:
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)

epoch 1, loss 0.5030, train_acc 0.832, test_acc 0.844
epoch 2, loss 0.4981, train_acc 0.828, test_acc 0.840
epoch 3, loss 0.5626, train_acc 0.816, test_acc 0.839
epoch 4, loss 0.5373, train_acc 0.832, test_acc 0.838
epoch 5, loss 0.3961, train_acc 0.867, test_acc 0.841
