In [1]:
from mxnet import autograd, nd, init, gluon
from mxnet.gluon import nn
from mxnet.gluon import data as gdata
from mxnet.gluon import loss as gloss

# Generating Data Sets 

In [2]:
# Synthetic regression data: labels = features . true_w + true_b + noise.
num_inputs, num_examples = 2, 1000
true_w, true_b = nd.array([2, -3.4]), 4.2

# Standard-normal features, one row per example.
features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))

# Noiseless targets first, then add small Gaussian noise (std 0.01).
clean_labels = nd.dot(features, true_w) + true_b
labels = clean_labels + nd.random.normal(scale=0.01, shape=clean_labels.shape)

# Reading Data 

Gluon provides the `data` module to read data. We combine the features and labels of the training data using `ArrayDataset`, and obtain a data iterator by using `DataLoader`.

In [3]:
# Mini-batch size; also passed to the trainer's step() during training.
batch_size = 10

# Pair each feature row with its label, then serve them in shuffled batches.
dataset = gdata.ArrayDataset(features, labels)
data_iter = gdata.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [4]:
# Pull a single mini-batch from the loader and show it as a sanity check.
X, y = next(iter(data_iter))
print(X, y)


[[-0.6904091   0.09003334]
 [-0.74876964 -0.68402195]
 [-0.23816614 -0.3334923 ]
 [ 0.29402137 -1.5258902 ]
 [-0.10250564 -0.06988182]
 [-0.34302703 -0.15515815]
 [ 1.174712    1.3677438 ]
 [-1.4996531  -2.1551476 ]
 [-0.74845386  1.5153002 ]
 [ 0.60099405 -0.04193413]]
<NDArray 10x2 @cpu(0)> 
[ 2.5108895  5.0226946  4.8740754  9.98281    4.2365274  4.0597463
  1.9208252  8.532497  -2.4259458  5.538153 ]
<NDArray 10 @cpu(0)>


# Define the Model 

The `nn` module in `gluon` provides a large number of predefined layers, which allows us to focus on which layers to use when constructing the model. A `Sequential` instance can be used as a container that chains layers together in sequence. We use a single `Dense` layer to create a linear regression model.

In [5]:
# Linear regression is a single fully connected layer with one output unit.
net = nn.Sequential()
net.add(nn.Dense(units=1))

# Initialize Model Parameters

In [6]:
# Register a normal initializer (sigma=0.01) for the network's parameters.
net.initialize(init.Normal(sigma=0.01))

The parameters haven't actually been initialized yet, because Gluon doesn't know how many dimensions the input will have at this point. Initialization is deferred until the first time data is sent through the network.

# Define the Loss Function

In [7]:
# Squared (L2) loss from Gluon's loss module; used as the training objective below.
loss = gloss.L2Loss()

# Define the Optimization Algorithm 

In [8]:
# Plain mini-batch SGD over every parameter of `net`, learning rate 0.03.
training = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.03},
)

# Training 

1. Generate predictions `net(X)` and calculate loss `l` -- forward pass.
2. Call `l.backward()` to do the gradient calculation -- backward pass.
3. Update parameters by invoking SGD optimizer.

In [9]:
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    # One pass over the shuffled mini-batches.
    for X, y in data_iter:
        # Forward pass, recorded so autograd can differentiate it.
        with autograd.record():
            l = loss(net(X), y)
        # Backward pass: compute gradients of the batch loss.
        l.backward()
        # Parameter update; passing batch_size lets the trainer rescale the
        # gradients by 1 / batch_size.
        training.step(batch_size)
    # Report the mean loss over the full training set after each epoch.
    l = loss(net(features), labels)
    # asscalar() yields a plain scalar; formatting a 1-element ndarray with
    # %f relies on implicit array-to-float conversion, which NumPy deprecates.
    print('epoch %d, loss: %f' % (epoch, l.mean().asscalar()))

epoch 1, loss: 0.035105
epoch 2, loss: 0.000129
epoch 3, loss: 0.000049


In [10]:
# Compare the learned parameters of the Dense layer with the ground truth.
dense = net[0]
w, b = dense.weight.data(), dense.bias.data()
# true_w is reshaped to the layer's weight shape so the subtraction lines up.
print('Error in estimating w', true_w.reshape(w.shape) - w)
print('Error in estimating b', true_b - b)

Error in estimating w 
[[ 0.00063837 -0.00077081]]
<NDArray 1x2 @cpu(0)>
Error in estimating b 
[1.5735626e-05]
<NDArray 1 @cpu(0)>


# Problems 

1. If we replace `l = loss(output, y)` with `l = loss(output, y).mean()`, we need to change `trainer.step(batch_size)` to `trainer.step(1)` accordingly. Why?

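Because `Trainer.step(batch_size)` rescales the gradients by 1 / batch_size for us. If the loss has already been averaged with `.mean()`, the gradient is already a per-example average, so we pass 1 to avoid dividing by the batch size a second time.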

2. Review the MXNet documentation to see what loss functions and initialization methods are provided in the modules `gluon.loss` and `init`. Replace the loss by Huber's loss.

In [14]:
# Swap the objective for Huber's loss and retrain from a fresh initialization.
loss = gloss.HuberLoss()

# force_reinit=True re-draws the parameters even though they were already
# initialized by the earlier training run.
net.initialize(init.Normal(sigma=0.01), force_reinit=True)
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        # Forward pass, recorded so autograd can differentiate it.
        with autograd.record():
            l = loss(net(X), y)
        l.backward()
        # Passing batch_size lets the trainer rescale gradients by 1 / batch_size.
        training.step(batch_size)
    # Mean Huber loss over the full training set after each epoch.
    l = loss(net(features), labels)
    # asscalar() yields a plain scalar; formatting a 1-element ndarray with
    # %f relies on implicit array-to-float conversion, which NumPy deprecates.
    print('epoch %d, loss: %f' % (epoch, l.mean().asscalar()))

epoch 1, loss: 2.253408
epoch 2, loss: 0.373638
epoch 3, loss: 0.001061


3. How do you access the gradient of `dense.weight`?

In [30]:
# Access the gradient through the layer itself rather than by the
# auto-generated parameter name ('dense0_weight'), which depends on how many
# Dense layers have been created in this kernel session.
net[0].weight.grad()


[[-0.21444759  0.42217892]]
<NDArray 1x2 @cpu(0)>