In [1]:
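# A convolutional neural network (CNN) is a neural network built mainly from convolutional layers.
# Problems it addresses: fully connected models on images are computationally expensive, and once an
# image is flattened into a vector, pixels that are close together in the image may end up far apart.
# Terminology: kernel / filter (the weights of a convolutional layer; each output value is the sum of
# the element-wise products between the kernel and the input window it covers), pooling (pooling layer).
# Implemented here with nd.Convolution.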

In [16]:
from mxnet import ndarray as nd

# Single input channel -> single output channel.

# Input/output layout: batch x channel x height x width; batch and channel are both 1 here.
# Weight layout: (num_filter, channel, kernel[0], kernel[1]),
# i.e. output_channels x in_channels x height x width.
w = nd.arange(4).reshape((1,1,2,2))
# One bias entry for the single output channel.
b = nd.array([1])
data = nd.arange(9).reshape((1,1,3,3))
# kernel=w.shape[2:] is the spatial size of the filter: the last two axes of w, here (2, 2).
# num_filter is the number of output channels, which is axis 0 of w. Axis 1 is the number of
# input channels; it only happens to hold the same value (1) in this example.
out = nd.Convolution(data, w, b, kernel=w.shape[2:], num_filter=w.shape[0])

print('input:', data, '\n\nweight:', w, '\n\nbias:', b, '\n\noutput:', out)

input: 
[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]]]
<NDArray 1x1x3x3 @cpu(0)> 

weight: 
[[[[ 0.  1.]
   [ 2.  3.]]]]
<NDArray 1x1x2x2 @cpu(0)> 

bias: 
[ 1.]
<NDArray 1 @cpu(0)> 

output: 
[[[[ 20.  26.]
   [ 38.  44.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [17]:
# Single input channel -> single output channel, with stride 2 and zero padding of 1
# on both spatial axes. Reuses data, w and b from the previous example.
# num_filter is the output-channel count, i.e. axis 0 of w (axis 1 is the input channels).
out = nd.Convolution(data, w, b, kernel=w.shape[2:], num_filter=w.shape[0],
                     stride=(2,2), pad=(1,1))

print('input:', data, '\n\nweight:', w, '\n\nbias:', b, '\n\noutput:', out)

input: 
[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]]]
<NDArray 1x1x3x3 @cpu(0)> 

weight: 
[[[[ 0.  1.]
   [ 2.  3.]]]]
<NDArray 1x1x2x2 @cpu(0)> 

bias: 
[ 1.]
<NDArray 1 @cpu(0)> 

output: 
[[[[  1.   9.]
   [ 22.  44.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [21]:
# Multiple input channels -> single output channel.
# w holds 1 filter that spans 2 input channels; data has the matching 2 channels.
w = nd.arange(8).reshape((1, 2, 2, 2))
data = nd.arange(18).reshape((1, 2, 3, 3))

# b is not redefined in this cell: the one-element bias from the first example is reused.
out = nd.Convolution(
    data=data,
    weight=w,
    bias=b,
    kernel=w.shape[2:],      # spatial size of the filter
    num_filter=w.shape[0],   # number of output channels
)

print('input:', data, '\n\nweight:', w, '\n\nbias:', b, '\n\noutput:', out)


input: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 

weight: 
[[[[ 0.  1.]
   [ 2.  3.]]

  [[ 4.  5.]
   [ 6.  7.]]]]
<NDArray 1x2x2x2 @cpu(0)> 

bias: 
[ 1.]
<NDArray 1 @cpu(0)> 

output: 
[[[[ 269.  297.]
   [ 353.  381.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [25]:
# Multiple input channels -> multiple output channels.
data = nd.arange(18).reshape((1, 2, 3, 3))
# 2 filters, each spanning the 2 input channels with a 2x2 window.
w = nd.arange(16).reshape((2, 2, 2, 2))
# One bias entry per output channel.
b = nd.array([1, 2])

out = nd.Convolution(
    data=data,
    weight=w,
    bias=b,
    kernel=w.shape[2:],
    num_filter=w.shape[0],
)
print('input:', data, '\n\nweight:', w, '\n\nbias:', b, '\n\noutput:', out)

input: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 

weight: 
[[[[  0.   1.]
   [  2.   3.]]

  [[  4.   5.]
   [  6.   7.]]]


 [[[  8.   9.]
   [ 10.  11.]]

  [[ 12.  13.]
   [ 14.  15.]]]]
<NDArray 2x2x2x2 @cpu(0)> 

bias: 
[ 1.  2.]
<NDArray 2 @cpu(0)> 

output: 
[[[[  269.   297.]
   [  353.   381.]]

  [[  686.   778.]
   [  962.  1054.]]]]
<NDArray 1x2x2x2 @cpu(0)>


In [30]:
# Pooling demo on a 1 x 2 x 3 x 3 input: each channel is pooled independently
# with a 2x2 window, once taking the maximum and once the average.
data = nd.arange(18).reshape((1, 2, 3, 3))

max_pool, avg_pool = (
    nd.Pooling(data=data, pool_type=mode, kernel=(2, 2))
    for mode in ("max", "avg")
)

print('data:', data, '\n\nmax pooling:', max_pool, '\n\navg pooling:', avg_pool)

data: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 

max pooling: 
[[[[  4.   5.]
   [  7.   8.]]

  [[ 13.  14.]
   [ 16.  17.]]]]
<NDArray 1x2x2x2 @cpu(0)> 

avg pooling: 
[[[[  2.   3.]
   [  5.   6.]]

  [[ 11.  12.]
   [ 14.  15.]]]]
<NDArray 1x2x2x2 @cpu(0)>


In [67]:
# Load the Fashion-MNIST data and choose the compute context.
import sys

import mxnet as mx

# Extend the import path with the current directory before importing from utils
# (presumably utils.py sits next to this notebook -- confirm).
sys.path.append('.')
from utils import load_data_fashion_mnist

# Everything in this notebook runs on the CPU.
ctx = mx.cpu()

# Mini-batch size handed to the data loader; also used to scale the SGD step later on.
batch_size = 256
train_data, test_data = load_data_fashion_mnist(batch_size)

In [76]:
# Define the model parameters.
from mxnet import autograd

# Scale passed to nd.random_normal for every weight array.
weight_scale = .01


def _gaussian(shape):
    """Return an NDArray of the given shape on ctx, drawn with nd.random_normal(scale=weight_scale)."""
    return nd.random_normal(shape=shape, scale=weight_scale, ctx=ctx)


# First convolution: 20 output channels, 1 input channel, 5x5 kernel.
W1 = _gaussian((20, 1, 5, 5))
b1 = nd.zeros(W1.shape[0], ctx=ctx)

# Second convolution: 50 output channels, 20 input channels, 3x3 kernel.
W2 = _gaussian((50, 20, 3, 3))
b2 = nd.zeros(W2.shape[0], ctx=ctx)

# First dense layer: 1250 inputs -> 128 outputs.
# NOTE(review): 1250 is presumably 50 channels x 5 x 5 left after the two conv/pool
# stages on a 28x28 input -- confirm against the data loader.
W3 = _gaussian((1250, 128))
b3 = nd.zeros(W3.shape[1], ctx=ctx)

# Second dense layer: 128 inputs -> 10 outputs.
W4 = _gaussian((W3.shape[1], 10))
b4 = nd.zeros(W4.shape[1], ctx=ctx)

# Attach a gradient buffer to every parameter so backward() can fill it in.
params = [W1, b1, W2, b2, W3, b3, W4, b4]
for param in params:
    param.attach_grad()



In [77]:
# Define the network function.
# A convolutional block has three basic components:
# convolution + activation + pooling

In [84]:
def net(X, verbose=False):
    """Run the forward pass: two conv/ReLU/max-pool blocks followed by two dense layers.

    Uses the module-level parameters W1, b1, W2, b2, W3, b3, W4, b4.

    Args:
        X: Input batch of images. It is copied to W1's context before use.
            Presumably shaped (batch, 1, 28, 28), given W1's single input
            channel and W3's 1250 input rows -- confirm against the data loader.
        verbose: Debug switch. When True, print the shape of each stage's
            output and the values of the final output.

    Returns:
        The output of the second dense layer with no activation applied:
        one row per sample and W4.shape[1] columns.
    """
    # Make sure the input lives on the same device as the parameters.
    X = X.as_in_context(W1.context)

    # First convolutional block: convolution -> ReLU -> 2x2 max pooling with stride 2.
    h1_conv = nd.Convolution(
        data=X, weight=W1, bias=b1, kernel=W1.shape[2:], num_filter=W1.shape[0])
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(
        data=h1_activation, pool_type="max", kernel=(2,2), stride=(2,2))

    # Second convolutional block, same structure.
    h2_conv = nd.Convolution(
        data=h1, weight=W2, bias=b2, kernel=W2.shape[2:], num_filter=W2.shape[0])
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(
        data=h2_activation, pool_type="max", kernel=(2,2), stride=(2,2))
    # Collapse the feature maps of each sample into one row for the dense layers.
    h2 = nd.flatten(h2)

    # First dense (fully connected) layer: affine transform followed by ReLU.
    h3_linear = nd.dot(h2, W3) + b3
    h3 = nd.relu(h3_linear)

    # Second dense layer: affine transform only; its output is returned as-is.
    h4_linear = nd.dot(h3, W4) + b4

    if verbose:
        print('1st conv block:', h1.shape)
        print('2nd conv block:', h2.shape)
        print('1st dense:', h3.shape)
        print('2nd dense:', h4_linear.shape)
        print('output:', h4_linear)
    return h4_linear

In [85]:
# Train the network with mini-batch SGD and report metrics after every epoch.
from mxnet import autograd as autograd
from utils import SGD, accuracy, evaluate_accuracy
from mxnet import gluon

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

learning_rate = .2
# Step size handed to SGD: the learning rate divided by the batch size.
step_size = learning_rate/batch_size
# Number of mini-batches per epoch, used to average the running metrics.
num_batches = len(train_data)

for epoch in range(5):
    train_loss = 0.
    train_acc = 0.
    for data, label in train_data:
        label = label.as_in_context(ctx)
        # Record the forward pass so that backward() can compute gradients.
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        SGD(params, step_size)

        # Accumulate the mean loss and the accuracy of this batch.
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, label)

    test_acc = evaluate_accuracy(test_data, net, ctx)
    mean_loss = train_loss/num_batches
    mean_acc = train_acc/num_batches
    print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" % (
        epoch, mean_loss, mean_acc, test_acc))

Epoch 0. Loss: 1.060994, Train acc 0.595670, Test acc 0.697716
Epoch 1. Loss: 0.623971, Train acc 0.753990, Test acc 0.784054
Epoch 2. Loss: 0.546025, Train acc 0.789413, Test acc 0.803385
Epoch 3. Loss: 0.499387, Train acc 0.812366, Test acc 0.770232
Epoch 4. Loss: 0.469481, Train acc 0.825020, Test acc 0.844651


In [None]:
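# Open question: why does the test accuracy dip at epoch 3
# (Epoch 3. Loss: 0.499387, Train acc 0.812366, Test acc 0.770232) before recovering at epoch 4?
# Likely ordinary mini-batch SGD noise: test accuracy is not guaranteed to improve monotonically,
# and a single large step can temporarily hurt generalization -- TODO confirm by re-running with a
# smaller learning rate or a fixed random seed.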