# 用线性回归函数构建单层神经网络

In [None]:
from mxnet import nd
from mxnet import autograd as ag

In [None]:
# Synthetic linear-regression data: labels = features . true_w + true_b + noise.
true_w = [2, -3.4]
true_b = 4.2
num_inputs = 2
# NOTE(review): "num_exanples" is a misspelling of "num_examples"; the name is
# kept because it is a module-level variable other cells may reference.
num_exanples = 1000
features = nd.random.normal(scale=1, shape=(num_exanples, num_inputs))
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
# Small Gaussian perturbation (std 0.01) added to every label.
noise = nd.random.normal(scale=0.01, shape=labels.shape)
labels += noise

In [None]:
# Display the number of rows in `features` (the first-axis length).
len(features)

In [None]:
# Display the generated feature matrix and the noisy labels.
features,labels

In [None]:
from IPython import display
def use_svg_display():
    """Ask IPython to render matplotlib figures in the 'svg' format."""
    figure_format = 'svg'
    display.set_matplotlib_formats(figure_format)
    
def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size.

    Args:
        figsize: Value stored under the ``'figure.figsize'`` rc parameter.
    """
    use_svg_display()
    rc = plt.rcParams
    rc['figure.figsize'] = figsize
    
import matplotlib.pyplot as plt
%matplotlib inline
# Scatter the second feature column against the labels with marker size 1.
set_figsize()
second_feature = features[:, 1].asnumpy()
plt.scatter(second_feature, labels.asnumpy(), 1)

In [None]:
import random
def data_iter(batch_size, features, labels):
    """Yield (features, labels) mini-batches in a random order.

    Args:
        batch_size: Number of examples per batch; the last batch may be smaller.
        features: Array-like supporting ``len()`` and ``take()``.
        labels: Array-like supporting ``take()``, aligned with ``features``.

    Yields:
        Tuples ``(features.take(idx), labels.take(idx))`` for each index chunk.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)  # visit the examples in a fresh random order
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        picked = nd.array(order[start:stop])
        # take() returns the elements at the given indices
        yield features.take(picked), labels.take(picked)

In [None]:
# Demonstrate take(): select the entry of `features` at index 1.
features.take(nd.array([1]))

In [None]:
# Print only the first mini-batch produced by data_iter as a sanity check.
batch_size = 10
first_batch = next(data_iter(batch_size, features, labels), None)
if first_batch is not None:
    x, y = first_batch
    print(x, y)

In [None]:
# Model parameters: small random weights (std 0.01) and a zero bias.
w = nd.random.normal(shape=(num_inputs, 1), scale=0.01)
b = nd.zeros((1,))

In [None]:
# Allocate gradient buffers for both parameters so autograd can fill them.
for param in (w, b):
    param.attach_grad()

In [None]:
def square_loss(y, y_hat):
    """Return the element-wise halved squared error.

    ``y`` is reshaped to ``y_hat.shape`` before subtracting, so the result
    has the same shape as ``y_hat``.
    """
    residual = y.reshape(y_hat.shape) - y_hat
    return residual ** 2 / 2

def sgd(params, batch_size, lr):
    """Apply one mini-batch SGD step to every parameter in place.

    Args:
        params: Iterable of arrays exposing a ``.grad`` attribute.
        batch_size: Divisor applied to the gradient.
        lr: Learning rate.
    """
    for p in params:
        step = lr * p.grad / batch_size
        # Slice assignment writes the result into the existing array.
        p[:] -= step
        
def linreg(x, w, b):
    """Linear model: the product ``nd.dot(x, w)`` plus the bias ``b``."""
    projection = nd.dot(x, w)
    return projection + b

In [None]:
# Training configuration for the from-scratch linear regression.
loss = square_loss
net = linreg
epoch = 5  # number of passes over the data (used as the loop bound below)
lr = 0.03

for i in range(epoch):
    for x, y in data_iter(batch_size, features, labels):
        # Record only the forward computation; run backward once recording
        # has ended.
        with ag.record():
            batch_loss = loss(y, net(x, w, b)).sum()
        batch_loss.backward()
        sgd([w, b], batch_size, lr)
    # Loss over the full data set with the parameters reached this epoch.
    l = loss(labels, net(features, w, b))
    # asscalar() yields a plain Python number; formatting a one-element
    # numpy array with %f relies on a deprecated implicit conversion.
    print("epoch%d  :  loss=%f" % (i + 1, l.mean().asscalar()))

In [None]:
# Compare the ground-truth parameters with the learned ones.
print(true_w,w)
print(true_b,b)

# 用softmax函数构建单层神经网络

In [None]:
import d2lzh as d2l
from mxnet import nd
from mxnet import autograd as ag

# Mini-batch size handed to the d2l data loader.
batch_size=256
# presumably returns iterators over the Fashion-MNIST train/test splits — confirm against d2lzh
train_iter,test_iter =d2l.load_data_fashion_mnist(batch_size)

In [None]:
# Softmax-regression parameters: 28*28 inputs mapped to 10 outputs.
num_inputs, num_outputs = 28 * 28, 10

w = nd.random.normal(shape=(num_inputs, num_outputs), scale=0.01)
b = nd.zeros(shape=num_outputs)

# Allocate gradient buffers for both parameters.
for param in (w, b):
    param.attach_grad()

In [None]:
def Softmax(X):
    """Row-wise softmax of a 2-D array.

    The row maximum is subtracted before exponentiating. The result is
    mathematically unchanged, but large logits no longer overflow to
    inf (which would turn the row into nan).

    Args:
        X: Array with ``max``, ``exp`` and ``sum`` methods; rows are
            normalised along axis 1.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    # keepdims keeps the reduced axis so the operations broadcast per row.
    shifted = X - X.max(axis=1, keepdims=True)
    X_exp = shifted.exp()
    X_exp_sum = X_exp.sum(axis=1, keepdims=True)
    return X_exp / X_exp_sum

In [None]:
# Sanity check: apply Softmax to a random 2x5 array and display the result
# together with its row sums.
X=nd.random.normal(shape=(2,5))
X_prob=Softmax(X)
X_prob,X_prob.sum(axis=1)

In [None]:
def net(X):
    """Softmax regression: flatten, apply the affine map, then Softmax.

    Reads the module-level ``w``, ``b`` and ``num_inputs``.
    """
    flat = X.reshape(-1, num_inputs)
    logits = nd.dot(flat, w) + b
    return Softmax(logits)

In [None]:
def cross_entropy(y_hat, y):
    """Negative log of the entries of ``y_hat`` picked by the indices ``y``."""
    picked = nd.pick(y_hat, y)
    return -picked.log()

In [None]:
def evaluate_accuracy(data_iter, net):
    """Return the fraction of examples whose argmax prediction equals the label.

    Args:
        data_iter: Iterable of ``(inputs, labels)`` batches.
        net: Callable mapping a batch of inputs to per-class scores.
    """
    correct, total = 0.0, 0
    for batch_x, batch_y in data_iter:
        scores = net(batch_x)
        # Labels are cast to float32 before the comparison with argmax.
        targets = batch_y.astype('float32')
        predicted = scores.argmax(axis=1)
        correct += (predicted == targets).sum().asscalar()
        total += targets.size
    return correct / total

In [None]:
# Accuracy of `net` on the test iterator at this point in the notebook.
evaluate_accuracy(test_iter,net)

In [None]:
def sgd(params, lr, batch_size):
    """In-place mini-batch SGD: subtract ``lr * grad / batch_size`` from each parameter."""
    for weight in params:
        update = lr * weight.grad / batch_size
        # Slice assignment stores the result in the existing array.
        weight[:] -= update

In [None]:
# Module-level settings for the softmax-regression training run below.
n, lr, epochs = 0.0, 0.1, 10
def train_ch3(net, train_iter, test_iter, loss, epochs, batch_size, w, b, lr, trainer):
    """Train ``net`` and print loss/accuracy after every epoch.

    Args:
        net: Callable mapping an input batch to per-class scores.
        train_iter: Iterable of ``(x, y)`` training batches.
        test_iter: Iterable of ``(x, y)`` batches passed to ``evaluate_accuracy``.
        loss: Callable ``loss(y_hat, y)`` returning per-example losses.
        epochs: Number of passes over ``train_iter``.
        batch_size: Forwarded to ``trainer``.
        w, b: Parameters handed to ``trainer`` as ``[w, b]``.
        lr: Learning rate forwarded to ``trainer``.
        trainer: Callable ``trainer(params, lr, batch_size)`` that updates the
            parameters in place.
    """
    for epoch in range(epochs):
        train_loss, train_accuracy, n = 0.0, 0.0, 0
        for x, y in train_iter:
            with ag.record():
                # Keep the forward output so accuracy is measured on the same
                # predictions as the loss, without a second forward pass.
                y_hat = net(x)
                l = loss(y_hat, y).sum()
            l.backward()
            trainer([w, b], lr, batch_size)
            n += y.size
            train_loss += l.asscalar()
            y = y.astype("float32")
            train_accuracy += (y_hat.argmax(axis=1) == y).sum().asscalar()
        test_accuracy = evaluate_accuracy(test_iter, net)
        print("NO.%s :train_loss: %.4f, train_accuracy: %.4f, test_accuracy: %.4f"%(epoch+1,train_loss/n,train_accuracy/n,test_accuracy))
# Train the softmax-regression model with cross-entropy loss and the sgd updater.
train_ch3(net,train_iter,test_iter,cross_entropy,epochs,batch_size,w,b,lr,sgd)

In [None]:
# Show the first nine test images of the first batch, titled "true\npredicted".
for x, y in test_iter:
    true_labels = d2l.get_fashion_mnist_labels(y.asnumpy())
    predictions = net(x).argmax(axis=1)
    pred_labels = d2l.get_fashion_mnist_labels(predictions.asnumpy())
    titles = ["\n".join(pair) for pair in zip(true_labels, pred_labels)]
    d2l.show_fashion_mnist(x[:9], titles[:9])
    break  # only the first batch is displayed

# 多层感知机

In [None]:
from mxnet import autograd as ag
from mxnet import ndarray as nd
import d2lzh as d2l

In [None]:
# Mini-batch size handed to the d2l data loader.
batch_size=256
# presumably returns iterators over the Fashion-MNIST train/test splits — confirm against d2lzh
train_iter,test_iter=d2l.load_data_fashion_mnist(batch_size)

In [None]:
# One-hidden-layer MLP: 28*28 inputs -> 256 hidden units -> 10 outputs.
num_inputs = 28 * 28
num_outputs = 10
num_hiddens = 256

# Weights start as small Gaussian noise (std 0.01); biases start at zero.
w1 = nd.random.normal(shape=(num_inputs, num_hiddens), scale=0.01)
b1 = nd.zeros(num_hiddens)
w2 = nd.random.normal(shape=(num_hiddens, num_outputs), scale=0.01)
b2 = nd.zeros(num_outputs)

# Allocate gradient buffers for every parameter.
for param in (w1, b1, w2, b2):
    param.attach_grad()

In [None]:
def relu(X):
    """Element-wise ``max(X, 0)`` computed with ``nd.maximum``."""
    rectified = nd.maximum(X, 0)
    return rectified

In [None]:
def Softmax(X):
    """Row-wise softmax of a 2-D array.

    The row maximum is subtracted before exponentiating so large logits
    cannot overflow to inf/nan, and the exponential is computed once
    instead of twice.

    Args:
        X: Array with ``max``, ``exp`` and ``sum`` methods; rows are
            normalised along axis 1.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    # keepdims keeps the reduced axis so the subtraction and division
    # broadcast row by row (easy to get wrong without it).
    X_exp = (X - X.max(axis=1, keepdims=True)).exp()
    X_sum_exp = X_exp.sum(axis=1, keepdims=True)
    return X_exp / X_sum_exp

def net(X):
    """MLP forward pass: flatten -> affine + relu -> affine -> Softmax.

    Reads the module-level ``w1``, ``b1``, ``w2``, ``b2`` and ``num_inputs``.
    """
    flat = X.reshape(-1, num_inputs)
    hidden = relu(nd.dot(flat, w1) + b1)
    logits = nd.dot(hidden, w2) + b2
    return Softmax(logits)

In [None]:
def cross_entropy(y_hat, y):
    """Per-example loss: minus the log of the ``y_hat`` entry selected by ``y``."""
    selected = nd.pick(y_hat, y)
    log_selected = selected.log()
    return -log_selected

In [None]:
def evaluate_accuracy(data_iter, net):
    """Share of examples in ``data_iter`` whose argmax prediction matches the label."""
    hits, seen = 0.0, 0
    for inputs, targets in data_iter:
        targets = targets.astype("float32")  # cast labels before comparing
        guesses = net(inputs).argmax(axis=1)
        matches = guesses == targets
        hits += matches.sum().asscalar()
        seen += targets.size
    return hits / seen

In [None]:
def sgd(params, lr, batch_size):
    """Mini-batch SGD step applied in place to each entry of ``params``."""
    scale = None  # placeholder so the per-parameter step is named below
    for tensor in params:
        scale = lr * tensor.grad / batch_size
        tensor[:] -= scale  # slice assignment keeps the same array object

In [None]:
def train_ch3(train_iter, test_iter, batch_size, lr, net, params, epochs, loss, trainer):
    """Train ``net`` and print loss/accuracy after every epoch.

    Args:
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        batch_size: Forwarded to ``trainer``.
        lr: Learning rate forwarded to ``trainer``.
        net: Callable mapping an input batch to per-class scores.
        params: Parameters handed to ``trainer``.
        epochs: Number of passes over ``train_iter``.
        loss: Callable ``loss(y_hat, y)`` returning per-example losses.
        trainer: Callable ``trainer(params, lr, batch_size)`` that updates the
            parameters in place.
    """
    for epoch in range(epochs):
        # Reset the running totals once per epoch. Resetting them inside the
        # batch loop would make the report cover only the final batch.
        n, train_cross_entropy, train_acc = 0, 0.0, 0.0
        for X, y in train_iter:
            y = y.astype("float32")
            with ag.record():
                # Keep the forward output so it can be reused for accuracy.
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            trainer(params, lr, batch_size)
            n += y.size
            train_cross_entropy += l.asscalar()
            train_acc += (y_hat.argmax(axis=1) == y).sum().asscalar()
        test_acc = evaluate_accuracy(test_iter, net)
        print("NO.%s ,train_loss is %.3f, train_acc is %.4f, test_acc is %.4f"%(epoch+1,train_cross_entropy/n,train_acc/n,test_acc))
    
# Hyper-parameters for the MLP run, followed by the training call.
batch_size = 256
lr = 0.5
epochs = 10
train_ch3(
    train_iter,
    test_iter,
    batch_size,
    lr,
    net,
    [w1, b1, w2, b2],
    epochs,
    cross_entropy,
    sgd,
)

In [None]:
# IPython help query for nd.sum (notebook-only syntax, not plain Python).
?nd.sum

# 正则化

## 权重衰减（L2范数）

In [None]:
from mxnet import autograd as ag
from mxnet import nd

In [None]:
def sgd(params, lr, batch_size):
    """Update every parameter in place by ``-lr * grad / batch_size``."""
    for current in params:
        delta = lr * current.grad / batch_size
        # Slice assignment stores the new values in the existing array.
        current[:] -= delta

In [None]:
def l2_penalty(w):
    """Return half the sum of the squared entries of ``w``."""
    squared = w ** 2
    return squared.sum() / 2

In [None]:
def entropy_loss(y_hat, y):
    """Summed loss: minus the sum of the logs of the ``y_hat`` entries picked by ``y``."""
    log_picked = nd.pick(y_hat, y).log()
    return -log_picked.sum()

In [None]:
def evaluate_accuracy(data_iter, net):
    """Fraction of examples for which ``net``'s argmax equals the label."""
    n_correct = 0.0
    n_total = 0
    for X, y in data_iter:
        labels_f32 = y.astype('float32')  # cast labels before comparing
        is_right = net(X).argmax(axis=1) == labels_f32
        n_correct += is_right.sum().asscalar()
        n_total += labels_f32.size
    return n_correct / n_total

In [None]:
def trainer_ch3(batch_size, lr, train_iter, test_iter, net, train, epochs, loss, l2_penalty, params):
    """Train ``net`` with an added L2 penalty on ``params[0]``.

    Args:
        batch_size: Forwarded to ``train``.
        lr: Learning rate forwarded to ``train``.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        net: Callable mapping an input batch to per-class scores.
        train: Update rule called as ``train(params, lr, batch_size)``.
        epochs: Number of passes over ``train_iter``.
        loss: Callable ``loss(y_hat, y)``; its output is summed to a scalar.
        l2_penalty: Callable returning the penalty for a weight array.
        params: Model parameters; ``params[0]`` is the penalised one.
    """
    for epoch in range(epochs):
        train_acc, n = 0.0, 0
        for X, y in train_iter:
            with ag.record():
                # The forward pass has to run inside the recording scope,
                # otherwise backward() has nothing to differentiate.
                y_hat = net(X)
                # .sum() is a no-op for an already-summed loss and reduces
                # a per-example loss to one value.
                l = loss(y_hat, y).sum() + l2_penalty(params[0])
            l.backward()
            train(params, lr, batch_size)
            n += y.size
            # Cast labels to float32 before comparing, as evaluate_accuracy does.
            y = y.astype('float32')
            train_acc += (y_hat.argmax(axis=1) == y).sum().asscalar()
        test_acc = evaluate_accuracy(test_iter, net)
        # Both reported numbers are accuracies, and each one needs its own
        # format specifier.
        print("NO.%d: train_acc=%.3f, test_acc=%.3f" % (epoch + 1, train_acc / n, test_acc))

## Dropout法

In [None]:
from mxnet import autograd as ag
from mxnet import nd

In [None]:
def dropout(X, drop_prob):
    """Zero entries of ``X`` at random and rescale the rest by ``1 / keep_prob``.

    Args:
        X: Input array.
        drop_prob: Drop probability; must satisfy ``0 <= drop_prob <= 1``.

    Returns:
        An all-zero array like ``X`` when everything is dropped, otherwise
        the masked and rescaled array.
    """
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # Nothing survives; also avoids dividing by zero below.
        return X.zeros_like()
    uniform_draws = nd.random.uniform(0, 1, X.shape)
    keep_mask = uniform_draws < keep_prob
    return keep_mask * X / keep_prob

In [None]:
# Two-hidden-layer MLP sizes: 784 inputs -> 256 -> 256 -> 10 outputs.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256

# Weights start as small Gaussian noise (std 0.01); biases start at zero.
w1 = nd.random.normal(shape=(num_inputs, num_hiddens1), scale=0.01)
b1 = nd.zeros(num_hiddens1)
w2 = nd.random.normal(shape=(num_hiddens1, num_hiddens2), scale=0.01)
b2 = nd.zeros(num_hiddens2)
w3 = nd.random.normal(shape=(num_hiddens2, num_outputs), scale=0.01)
b3 = nd.zeros(num_outputs)

# Allocate gradient buffers for every parameter.
params = [w1, b1, w2, b2, w3, b3]
for param in params:
    param.attach_grad()

In [None]:
# Drop probabilities applied after the first and second hidden layers.
drop_prob1 = 0.2
drop_prob2 = 0.5

def relu(X):
    """Rectified linear unit: element-wise maximum of ``X`` and 0."""
    floor = 0
    return nd.maximum(X, floor)

def net(X):
    """Two-hidden-layer MLP; dropout is applied only while ``ag.is_training()``.

    Reads the module-level weights ``w1..w3``, biases ``b1..b3``,
    ``num_inputs`` and the two drop probabilities.
    """
    flat = X.reshape(-1, num_inputs)

    hidden1 = relu(nd.dot(flat, w1) + b1)
    if ag.is_training():
        hidden1 = dropout(hidden1, drop_prob1)

    hidden2 = relu(nd.dot(hidden1, w2) + b2)
    if ag.is_training():
        hidden2 = dropout(hidden2, drop_prob2)

    return nd.dot(hidden2, w3) + b3

Practice

In [None]:
def dropout(X, dropout_prob):
    """Zero entries of ``X`` at random and rescale the rest by ``1 / keep_prob``.

    Args:
        X: Input array.
        dropout_prob: Drop probability; must satisfy ``0 <= dropout_prob <= 1``.

    Returns:
        An all-zero array like ``X`` when everything is dropped, otherwise
        the masked and rescaled array.
    """
    assert 0 <= dropout_prob <= 1
    keep_prob = 1 - dropout_prob
    if keep_prob == 0:
        # Nothing survives; also avoids dividing by zero below.
        return X.zeros_like()
    # ``shape`` is an attribute, not a method; calling it raises TypeError.
    mask = nd.random.uniform(0, 1, X.shape) < keep_prob
    return mask * X / keep_prob

def net(X, dropout_prob):
    """Two-hidden-layer MLP with the same drop probability after each hidden layer.

    ReLU is applied in every mode; only dropout is restricted to training.
    Applying ReLU only inside the training branch would leave the
    inference-time network purely linear.

    Args:
        X: Input batch; flattened to ``(-1, num_inputs)`` like the sibling
            ``net`` does, which leaves already-flat input unchanged.
        dropout_prob: Drop probability used after both hidden layers.

    Returns:
        The output-layer scores ``nd.dot(h2, w3) + b3``.
    """
    X = X.reshape(-1, num_inputs)
    h1 = relu(nd.dot(X, w1) + b1)
    if ag.is_training():
        h1 = dropout(h1, dropout_prob)
    h2 = relu(nd.dot(h1, w2) + b2)
    if ag.is_training():
        h2 = dropout(h2, dropout_prob)
    h3 = nd.dot(h2, w3) + b3
    return h3

# 用mxnet直接生成模型

In [1]:
from mxnet.gluon import nn
from mxnet import nd

## gluon's nn module already provides a Sequential container
net = nn.Sequential()
with net.name_scope():
    # add() takes several blocks at once; they are appended in order.
    net.add(nn.Dense(256, activation="relu"),
            nn.Dense(10))
print(net)

Sequential(
  (0): Dense(None -> 256, Activation(relu))
  (1): Dense(None -> 10, linear)
)


In [16]:
## A hand-written container defined like Sequential, only named MySquential
class MySquential(nn.Block):
    """Minimal sequential container: children run in the order they were added."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def add(self, block):
        """Register ``block`` as a child, keyed by its ``name``."""
        key = block.name
        self._children[key] = block

    def forward(self, x):
        """Feed ``x`` through every child in turn and return the final output."""
        out = x
        for layer in self._children.values():
            out = layer(out)
        return out

In [31]:
# Random input of shape (2, 30).
X=nd.random.uniform(shape=(2,30))

# Build a two-layer network with the hand-written container and run it.
net=MySquential()
net.add(nn.Dense(256,activation="relu"))
net.add(nn.Dense(10))
net.initialize()
net(X)## the result shows it behaves the same way as Sequential


[[-0.03479836  0.08300613 -0.02447348  0.03858991  0.01280021 -0.20667431
   0.04105026  0.11398264 -0.06973129 -0.01379004]
 [-0.03529222  0.09777347  0.03061453  0.01371823  0.03054728 -0.09273786
  -0.05212441  0.09239373  0.00355052 -0.03384817]]
<NDArray 2x10 @cpu(0)>

In [21]:
class MLP(nn.Block):
    """Block with a 256-unit relu Dense layer followed by a 10-unit Dense layer."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.hidden = nn.Dense(256, activation="relu")
        self.output = nn.Dense(10)

    def forward(self, x):
        """Apply the hidden layer, then the output layer."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)

In [19]:
# Run a freshly initialised MLP on a random input of shape (2, 30).
X=nd.random.uniform(shape=(2,30))
net=MLP()
net.initialize()
net(X)


[[ 0.00483723 -0.01825137 -0.07284753 -0.06607331 -0.07486404  0.10269828
   0.00578628 -0.00612297  0.00765912 -0.06142711]
 [ 0.04935986 -0.07191715 -0.02122155 -0.09590983 -0.02557939  0.07625449
  -0.025772    0.02371885  0.05016173 -0.04302575]]
<NDArray 2x10 @cpu(0)>

In [20]:
# Name of the hidden layer of the MLP built without an explicit prefix.
print('default prefix:', net.hidden.name)

# Same check for an MLP constructed with prefix='another_mlp_'.
# NOTE(review): the recorded output shows a plain "denseN" name here, so the
# prefix does not appear in the child layer's name — presumably because the
# layers are not created inside name_scope(); confirm against the gluon docs.
net3 = MLP(prefix='another_mlp_')
print('customized prefix:', net3.hidden.name)

default prefix: dense8
customized prefix: dense10


In [40]:
class FancyMLP(nn.Block):
    """Block mixing a shared Dense layer, a constant weight and Python control flow."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # get_constant creates a constant parameter, i.e. one that is not
        # updated during training.
        constant_init = nd.random.uniform(shape=(20, 20))
        self.rand_weight = self.params.get_constant('rand_weight', constant_init)
        self.dense = nn.Dense(20, activation="relu")

    def forward(self, x):
        """Return the sum of the rescaled activations.

        The same ``self.dense`` layer is applied before and after the
        constant-weight projection. The result is halved until its norm is
        at most 1, then multiplied by 10 if the norm is below 0.8.
        """
        out = self.dense(x)
        projected = nd.dot(out, self.rand_weight.data()) + 1
        out = nd.relu(projected)
        out = self.dense(out)  # second pass through the same layer
        while out.norm().asscalar() > 1:
            out /= 2
        if out.norm().asscalar() < 0.8:
            out *= 10
        return out.sum()

In [41]:
class NestMLP(nn.Block):
    """Block that nests a Sequential (64 -> 32 relu Dense layers) before a 16-unit relu Dense layer."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        inner = nn.Sequential()
        inner.add(nn.Dense(64, activation='relu'),
                  nn.Dense(32, activation='relu'))
        self.net = inner
        self.dense = nn.Dense(16, activation='relu')

    def forward(self, x):
        """Run the nested Sequential, then the final Dense layer."""
        nested_out = self.net(x)
        return self.dense(nested_out)

# Chain NestMLP, a 20-unit Dense layer and FancyMLP, then run the model on X.
net = nn.Sequential()
for stage in (NestMLP(), nn.Dense(20), FancyMLP()):
    net.add(stage)
net.initialize()
net(X)


[20.839413]
<NDArray 1 @cpu(0)>