## 生成数据集

In [1]:
# Same synthetic dataset as in the previous section:
# labels = true_w[0] * x0 + true_w[1] * x1 + true_b + noise.
from mxnet import autograd, nd

num_inputs = 2  # number of features per example
num_examples = 1000  # number of examples in the dataset
true_w = [2, -3.4]  # ground-truth weights the model should recover
true_b = 4.2  # ground-truth bias the model should recover
# Feature matrix of shape (num_examples, num_inputs), sampled with scale=1.
features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
# Noise-free linear response, one label per example.
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
# Perturb the labels in place with small noise (scale=0.01).
labels += nd.random.normal(scale=0.01, shape=labels.shape)

In [2]:
from mxnet.gluon import data as gdata

batch_size = 10
# Pair every feature row with its label so they are sampled together.
dataset = gdata.ArrayDataset(features, labels)
# Serve shuffled mini-batches of batch_size examples. The loader is consumed
# the same way as the hand-written data_iter of the previous section; the
# next cell reads and prints the first mini-batch.
data_iter = gdata.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [3]:
# Read and print only the first mini-batch (features X, labels y), then stop.
for X, y in data_iter:
    print(X, y)
    break


[[ 0.60375047 -0.5491552 ]
 [ 1.5329047   0.8233045 ]
 [-1.3713115   0.44100854]
 [-0.5489446   0.9783404 ]
 [ 0.61842227  0.0640344 ]
 [-0.8622762  -0.22448552]
 [-0.8627405  -1.9521301 ]
 [-2.0252197   0.14331104]
 [-0.5558463  -0.440662  ]
 [-0.9930084   1.0062088 ]]
<NDArray 10x2 @cpu(0)> 
[ 7.2679596   4.480722   -0.02090577 -0.22527038  5.2208033   3.2509205
  9.108723   -0.34202975  4.5954723  -1.2033253 ]
<NDArray 10 @cpu(0)>


In [4]:
# Setup: Gluon building blocks for the model, initializer, loss and optimizer.
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

# Sequential container that chains layers in the order they are added.
net = nn.Sequential()
# A single fully connected layer with one output: the linear-regression model.
net.add(nn.Dense(1))
# Weights are sampled from a normal distribution with mean 0 and standard
# deviation 0.01; the bias parameter is initialized to zero by default.
net.initialize(init.Normal(sigma=0.01))
# Squared loss, also known as the L2-norm loss.
loss = gloss.L2Loss()
# Trainer running mini-batch stochastic gradient descent (sgd) with a learning
# rate of 0.03. It iterates over every parameter of the layers nested in net
# through add(); collect_params() is what gathers those parameters.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': 0.03},
)

In [5]:
# Train the model.
# l is a 1-D NDArray of length batch_size, so l.backward() is equivalent to
# l.sum().backward() and produces the gradient summed over the mini-batch.
# Passing batch_size to trainer.step() rescales that gradient by 1/batch_size,
# so the parameter update uses the average gradient of the mini-batch.
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        with autograd.record():
            l = loss(net(X), y)
        l.backward()
        # Only difference from the from-scratch version, which called its own
        # sgd function at this point.
        trainer.step(batch_size)
    # Evaluate the loss on the full dataset once the epoch is finished.
    l = loss(net(features), labels)
    # asscalar() returns a plain scalar for '%f'. Formatting the one-element
    # array returned by asnumpy() relies on implicit array-to-float
    # conversion, which newer NumPy releases deprecate.
    print('epoch %d, loss: %f' % (epoch, l.mean().asscalar()))

epoch 1, loss: 0.035009
epoch 2, loss: 0.000129
epoch 3, loss: 0.000049


In [6]:
# Take the first layer of net (the Dense layer) and display the true weights
# next to the learned ones.
dense = net[0]
true_w, dense.weight.data()

([2, -3.4], 
 [[ 1.9997326 -3.3994465]]
 <NDArray 1x2 @cpu(0)>)

In [7]:
# Display the true bias next to the learned one.
true_b, dense.bias.data()

(4.2, 
 [4.1996355]
 <NDArray 1 @cpu(0)>)

## 练习

* 如果将`l = loss(net(X), y)`替换成`l = loss(net(X), y).mean()`，我们需要将`trainer.step(batch_size)`相应地改成`trainer.step(1)`。这是为什么呢？
* 答：`l`是长度为`batch_size`的一维NDArray，`l.backward()`等价于`l.sum().backward()`，因此自动求梯度模块得到的是小批量中各样本梯度之和。`trainer.step(batch_size)`会把梯度乘以`1/batch_size`，也就是用小批量的平均梯度来更新`w`和`b`。如果损失已经通过`.mean()`对小批量取了平均，那么反向传播得到的就已经是平均梯度，而不是梯度和，无需再除以`batch_size`，所以要改成`trainer.step(1)`；两种写法的更新效果相同。
* 查阅MXNet文档，看看`gluon.loss`和`init`模块里提供了哪些损失函数和初始化方法。
* 如何访问`dense.weight`的梯度？
* 答：调用`dense.weight.grad()`即可。

In [8]:
# Print the documentation of Trainer.step to check how batch_size is used.
help(trainer.step)
# Gradient will be normalized by `1/batch_size`.

Help on method step in module mxnet.gluon.trainer:

step(batch_size, ignore_stale_grad=False) method of mxnet.gluon.trainer.Trainer instance
    Makes one step of parameter update. Should be called after
    `autograd.backward()` and outside of `record()` scope.
    
    For normal parameter updates, `step()` should be used, which internally calls
    `allreduce_grads()` and then `update()`. However, if you need to get the reduced
    gradients to perform certain transformation, such as in gradient clipping, then
    you may want to manually call `allreduce_grads()` and `update()` separately.
    
    Parameters
    ----------
    batch_size : int
        Batch size of data processed. Gradient will be normalized by `1/batch_size`.
        Set this to 1 if you normalized loss manually with `loss = mean(loss)`.
    ignore_stale_grad : bool, optional, default=False
        If true, ignores Parameters with stale gradient (gradient that has not
        been updated by `backward` after last 

In [9]:
# Print the Parameter documentation for the Dense layer's weight.
help(dense.weight)

Help on Parameter in module mxnet.gluon.parameter object:

class Parameter(builtins.object)
 |  A Container holding parameters (weights) of Blocks.
 |  
 |  :py:class:`Parameter` holds a copy of the parameter on each :py:class:`Context` after
 |  it is initialized with ``Parameter.initialize(...)``. If :py:attr:`grad_req` is
 |  not ``'null'``, it will also hold a gradient array on each :py:class:`Context`::
 |  
 |      ctx = mx.gpu(0)
 |      x = mx.nd.zeros((16, 100), ctx=ctx)
 |      w = mx.gluon.Parameter('fc_weight', shape=(64, 100), init=mx.init.Xavier())
 |      b = mx.gluon.Parameter('fc_bias', shape=(64,), init=mx.init.Zero())
 |      w.initialize(ctx=ctx)
 |      b.initialize(ctx=ctx)
 |      out = mx.nd.FullyConnected(x, w.data(ctx), b.data(ctx), num_hidden=64)
 |  
 |  Parameters
 |  ----------
 |  name : str
 |      Name of this parameter.
 |  grad_req : {'write', 'add', 'null'}, default 'write'
 |      Specifies how to update gradient to grad arrays.
 |  
 |      - ``'wr