In [1]:
import d2l
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

# The Model

In [2]:
# Baseline MLP: one 256-unit ReLU hidden layer followed by a 10-unit output
# layer. Weights are drawn from a normal distribution with sigma = 0.01.
net = nn.Sequential()
net.add(
    nn.Dense(units=256, activation='relu'),
    nn.Dense(units=10),
)
net.initialize(init=init.Normal(sigma=0.01))

# Training

In [3]:
# Run-level settings. The exercise cells further down reuse these names
# (batch_size, num_epochs, loss, train_iter, test_iter), so keep them global.
num_epochs = 10
batch_size = 256

# Fashion-MNIST minibatch iterators for training and evaluation.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Softmax cross-entropy objective, optimized by SGD with learning rate 0.5.
loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.5},
)

# The two None positionals are left unset because a Trainer is passed instead
# (presumably the manual params / lr slots of train_ch3 -- confirm against d2l).
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer,
)

epoch 1, loss 0.7982, train acc 0.696, test acc 0.832
epoch 2, loss 0.4873, train acc 0.821, test acc 0.849
epoch 3, loss 0.4318, train acc 0.840, test acc 0.860
epoch 4, loss 0.3976, train acc 0.854, test acc 0.858
epoch 5, loss 0.3695, train acc 0.863, test acc 0.867
epoch 6, loss 0.3532, train acc 0.869, test acc 0.875
epoch 7, loss 0.3374, train acc 0.877, test acc 0.871
epoch 8, loss 0.3274, train acc 0.880, test acc 0.878
epoch 9, loss 0.3174, train acc 0.883, test acc 0.877
epoch 10, loss 0.3044, train acc 0.887, test acc 0.882


# Problems 

1. Try adding a few more hidden layers to see how the result changes.

In [5]:
# Exercise 1: deepen the MLP. Three ReLU hidden layers (256 -> 128 -> 64)
# now sit in front of the 10-unit output layer; initializer, optimizer and
# learning rate are the same as in the baseline run.
net = nn.Sequential()
net.add(
    nn.Dense(units=256, activation='relu'),
    nn.Dense(units=128, activation='relu'),
    nn.Dense(units=64, activation='relu'),
    nn.Dense(units=10),
)
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)

# A new Trainer is required: it must be bound to the parameters of the
# freshly built network rather than those of the previous one.
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.5},
)

d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer,
)

epoch 1, loss 2.2036, train acc 0.135, test acc 0.205
epoch 2, loss 1.7057, train acc 0.322, test acc 0.508
epoch 3, loss 1.0973, train acc 0.586, test acc 0.445
epoch 4, loss 0.8512, train acc 0.665, test acc 0.743
epoch 5, loss 0.5751, train acc 0.770, test acc 0.778
epoch 6, loss 0.5137, train acc 0.807, test acc 0.769
epoch 7, loss 0.4665, train acc 0.825, test acc 0.839
epoch 8, loss 0.4441, train acc 0.834, test acc 0.854
epoch 9, loss 0.4177, train acc 0.843, test acc 0.857
epoch 10, loss 0.3946, train acc 0.852, test acc 0.851


2. Try out different activation functions. Which ones work best?

In [9]:
# Exercise 2: same single-hidden-layer architecture as the baseline, but the
# hidden layer uses the softsign activation instead of ReLU.
net = nn.Sequential()
net.add(
    nn.Dense(units=256, activation='softsign'),
    nn.Dense(units=10),
)
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)

# Bind a new Trainer to the parameters of the rebuilt network.
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.5},
)

d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer,
)

epoch 1, loss 0.7386, train acc 0.733, test acc 0.824
epoch 2, loss 0.5115, train acc 0.812, test acc 0.838
epoch 3, loss 0.4651, train acc 0.830, test acc 0.851
epoch 4, loss 0.4319, train acc 0.842, test acc 0.860
epoch 5, loss 0.4183, train acc 0.848, test acc 0.860
epoch 6, loss 0.3980, train acc 0.854, test acc 0.868
epoch 7, loss 0.3834, train acc 0.860, test acc 0.869
epoch 8, loss 0.3780, train acc 0.862, test acc 0.864
epoch 9, loss 0.3638, train acc 0.867, test acc 0.864
epoch 10, loss 0.3608, train acc 0.867, test acc 0.870


3. Try out different initializations of the weights.

In [11]:
# List the capitalized names exported by mxnet's `init` module. The filter is
# purely by first letter, so the result can also contain non-initializer
# helpers (e.g. NDArray, InitDesc) alongside the actual initializer classes.
list(filter(lambda attr_name: attr_name[0].isupper(), dir(init)))

['Bilinear',
 'Constant',
 'FusedRNN',
 'InitDesc',
 'Initializer',
 'LSTMBias',
 'Load',
 'MSRAPrelu',
 'Mixed',
 'NDArray',
 'Normal',
 'One',
 'Orthogonal',
 'Uniform',
 'Xavier',
 'Zero']

In [14]:
# Exercise 3: baseline architecture again, but the weights come from
# init.Uniform() at its default settings instead of Normal(sigma=0.01).
net = nn.Sequential()
net.add(
    nn.Dense(units=256, activation='relu'),
    nn.Dense(units=10),
)
net.initialize(init=init.Uniform(), force_reinit=True)

# Bind a new Trainer to the parameters of the rebuilt network.
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.5},
)

d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer,
)

epoch 1, loss 0.7367, train acc 0.728, test acc 0.829
epoch 2, loss 0.4693, train acc 0.827, test acc 0.854
epoch 3, loss 0.4137, train acc 0.848, test acc 0.865
epoch 4, loss 0.3826, train acc 0.859, test acc 0.867
epoch 5, loss 0.3574, train acc 0.869, test acc 0.872
epoch 6, loss 0.3443, train acc 0.873, test acc 0.868
epoch 7, loss 0.3266, train acc 0.879, test acc 0.882
epoch 8, loss 0.3143, train acc 0.884, test acc 0.882
epoch 9, loss 0.3044, train acc 0.887, test acc 0.884
epoch 10, loss 0.2984, train acc 0.889, test acc 0.883
