In [1]:
import d2l
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

In [2]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Initialize Model Parameters 

In [4]:
net = nn.Sequential()
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

# The Softmax 

In the implementation of `softmax`, we first subtract $\max(z_i)$ from all $z_i$. This won't change the result of the calculation. But after the substraction there maybe some `z_j` that is very negative. Thus, $e^{z_j}$ will be very close to zero and might be rounded to zero. So we combine the `softmax` and `cross_entropy` together to avoid calculating $e^{z_j}$ directly.

In [5]:
loss = gloss.SoftmaxCrossEntropyLoss()

# Optimization Algorithm

In [6]:
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

# Training 

In [7]:
num_epochs = 5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 0.7882, train acc 0.748, test acc 0.802
epoch 2, loss 0.5745, train acc 0.810, test acc 0.823
epoch 3, loss 0.5296, train acc 0.824, test acc 0.830
epoch 4, loss 0.5059, train acc 0.829, test acc 0.832
epoch 5, loss 0.4889, train acc 0.835, test acc 0.840


# Problems 

1. Try adjusting the hyper-parameters, such as batch size, epoch, and learning rate, to see what the results are.

In [9]:
num_epochs = 5
batch_size = 128

net.initialize(init.Normal(sigma=0.01), force_reinit=True)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 0.8060, train acc 0.743, test acc 0.812
epoch 2, loss 0.5712, train acc 0.806, test acc 0.812
epoch 3, loss 0.5275, train acc 0.821, test acc 0.841
epoch 4, loss 0.5063, train acc 0.827, test acc 0.844
epoch 5, loss 0.4858, train acc 0.834, test acc 0.847


In [11]:
num_epochs = 10
batch_size = 128

net.initialize(init.Normal(sigma=0.01), force_reinit=True)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 0.8051, train acc 0.742, test acc 0.812
epoch 2, loss 0.5803, train acc 0.805, test acc 0.833
epoch 3, loss 0.5227, train acc 0.821, test acc 0.835
epoch 4, loss 0.4983, train acc 0.830, test acc 0.845
epoch 5, loss 0.4872, train acc 0.833, test acc 0.837
epoch 6, loss 0.4741, train acc 0.837, test acc 0.845
epoch 7, loss 0.4655, train acc 0.840, test acc 0.826
epoch 8, loss 0.4720, train acc 0.838, test acc 0.852
epoch 9, loss 0.4585, train acc 0.842, test acc 0.852
epoch 10, loss 0.4492, train acc 0.844, test acc 0.843


In [16]:
num_epochs = 10
batch_size = 128
trainer.set_learning_rate(0.3)

net.initialize(init.Normal(sigma=0.01), force_reinit=True)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 2.1196, train acc 0.706, test acc 0.825
epoch 2, loss 1.1644, train acc 0.783, test acc 0.831
epoch 3, loss 1.0939, train acc 0.792, test acc 0.830
epoch 4, loss 1.0196, train acc 0.799, test acc 0.841
epoch 5, loss 1.0268, train acc 0.804, test acc 0.851
epoch 6, loss 0.9546, train acc 0.808, test acc 0.838
epoch 7, loss 0.9468, train acc 0.810, test acc 0.848
epoch 8, loss 0.9631, train acc 0.809, test acc 0.851
epoch 9, loss 0.9427, train acc 0.812, test acc 0.836
epoch 10, loss 0.8656, train acc 0.818, test acc 0.855


2. Why might the test accuracy decrease again after a while? How could we fix this?