# 实验5  

将下面的代码补充完整

## 读取MNIST数据集，并将其划分为train/val/test数据集

In [93]:
import gzip
import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    """加载MNIST数据集"""
    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    with gzip.open(labels_path, 'rb') as lbpath:
        struct.unpack('>II', lbpath.read(8))
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8)

    with gzip.open(images_path, 'rb') as imgpath:
        struct.unpack('>IIII', imgpath.read(16))
        images = np.frombuffer(imgpath.read(), dtype=np.uint8).reshape(len(labels), 784)

    return images, labels



# 数据集划分
def data_split(images, labels, ratio):
    
    total_len = images.shape[0]
    offset = int(total_len * ratio)

    val_img = images[:offset][:]
    val_lb = labels[:offset]

    train_img = images[offset:][:]
    train_lb = labels[offset:]

    return train_img, train_lb, val_img, val_lb    

# 读取训练集和测试集数据
[images, labels] = load_mnist('./MNIST', kind='train')
[test_img, test_lb] = load_mnist('./MNIST',kind='test')
train_img, train_lb, val_img, val_lb = data_split(images, labels, 1/6)

# 对标签进行热编码
one_hot_train_lb = np.eye(10)[train_lb]
one_hot_val_lb = np.eye(10)[val_lb]
one_hot_test_lb= np.eye(10)[test_lb]

# 打印查看数据集格式
print('训练集图像格式为:', train_img.shape, '训练集标签格式为:', train_lb.shape,'热编码训练集标签格式为:', one_hot_train_lb.shape)
print('验证集图像格式为:', val_img.shape, '验证集标签格式为:', val_lb.shape,'热编码验证集标签格式为:', one_hot_val_lb.shape)
print('测试集图像格式为:', test_img.shape, '测试集标签格式为:', test_lb.shape,'热编码测试集标签格式为:', one_hot_test_lb.shape)

训练集图像格式为: (50000, 784) 训练集标签格式为: (50000,) 热编码训练集标签格式为: (50000, 10)
验证集图像格式为: (10000, 784) 验证集标签格式为: (10000,) 热编码验证集标签格式为: (10000, 10)
测试集图像格式为: (10000, 784) 测试集标签格式为: (10000,) 热编码测试集标签格式为: (10000, 10)


## 神经网络(2层线性分类器)+L2 Loss+梯度后向传播+Adam优化器

### 定义Adam

In [94]:
def adam_opt(dx, first_momentum, second_momentum, i, beta1=0.9, beta2=0.999, eps=1e-8):
      # momentum
      first_momentum = beta1 * first_momentum + (1-beta1) * dx
      # adagrad
      second_momentum = beta2 * second_momentum + (1-beta2) * dx * dx
      # bias correction
      first_unbias = first_momentum / (1 - beta1 ** i)
      second_unbias = second_momentum / (1 - beta2 ** i)
      return first_unbias / (np.sqrt(second_unbias) + eps),first_momentum,second_momentum

### 题目1：Train network without bias (Adam) 

In [121]:
#模型为两层线性分类器
#隐藏层神经元数量64，使用sigmoid激活函数，忽略偏置b1
#输出层没有使用激活函数,忽略偏置b2
#使用L2损失函数
def train1(x, y, epoch):
    """
    Inputs have dimension D=784, there are C=10 classes, and we operate on N=1000 examples.
    
    Inputs:
    - x: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a numpy array of shape (N,C) containing training labels; y[c] = 1 means
          that X[i] has label c, where 0 <= c < C.
    - epoch: Training iterations, an integer.
    
    Returns:
    - 训练得到的参数
    """
    
    # set the hyperparameters
    learning_rate = 1e-3

    # Initialize first and second momentums in Adam
    mW1, vW1 = 0, 0
    mW2, vW2 = 0, 0

    # initialize W1 (D,64) and w2(64,C) 
    w1 = np.random.randn(x.shape[1], 64) * 0.0001
    w2 = np.random.randn(64,y.shape[1]) * 0.0001


    for t in range (1,epoch+1):
      h = 1 / (1 + np.exp(-(x.dot (w1)) ) )  #(N,64)
      y_pred = h.dot(w2)      #(N,C)
      loss = np.square(y_pred - y).mean()
      scores = np.argmax(y_pred, axis=1)
      lb     = np.argmax(y,      axis=1)
      accuracy = np.mean(scores == lb) * 100

      # compute the numeric gradient
      grad_y_pred = (y_pred-y)*2 #Nx10
      grad_w2 = h.T.dot(grad_y_pred) #64x10
      grad_h = grad_y_pred.dot(w2.T) #50000*64
      grad_sig=h*(1-h)*grad_h
      grad_w1 = x.T.dot(grad_sig)


      # update W and b with Adam
      grad_w2, mW2, vW2 = adam_opt(grad_w2, mW2, vW2, t)
      grad_w1, mW1, vW1 = adam_opt(grad_w1, mW1, vW1, t)

      w2 -= learning_rate *grad_w2
      w1 -= learning_rate *grad_w1

        # print the result
      print("Epoch: %d  Loss: %.3f  Acc: %.3f%%" % (t, loss, accuracy))
    
    return w1, w2

#### 在训练集上进行训练

In [74]:
# train the linear classifier with adam in 1000 examples
epoch = 1000
best_X = train1(train_img, one_hot_train_lb, epoch)

Epoch: 1  Loss: 0.100  Acc: 7.288%
Epoch: 2  Loss: 0.095  Acc: 29.914%
Epoch: 3  Loss: 0.091  Acc: 30.122%
Epoch: 4  Loss: 0.090  Acc: 12.548%
Epoch: 5  Loss: 0.091  Acc: 11.230%
Epoch: 6  Loss: 0.092  Acc: 11.230%
Epoch: 7  Loss: 0.091  Acc: 11.230%
Epoch: 8  Loss: 0.091  Acc: 11.230%
Epoch: 9  Loss: 0.090  Acc: 11.230%
Epoch: 10  Loss: 0.090  Acc: 11.230%
Epoch: 11  Loss: 0.089  Acc: 11.230%
Epoch: 12  Loss: 0.090  Acc: 11.230%
Epoch: 13  Loss: 0.089  Acc: 11.230%
Epoch: 14  Loss: 0.089  Acc: 23.182%
Epoch: 15  Loss: 0.089  Acc: 25.944%
Epoch: 16  Loss: 0.089  Acc: 43.162%
Epoch: 17  Loss: 0.088  Acc: 45.474%
Epoch: 18  Loss: 0.088  Acc: 47.610%
Epoch: 19  Loss: 0.088  Acc: 47.344%
Epoch: 20  Loss: 0.087  Acc: 46.210%
Epoch: 21  Loss: 0.087  Acc: 45.916%
Epoch: 22  Loss: 0.087  Acc: 45.906%
Epoch: 23  Loss: 0.087  Acc: 48.990%
Epoch: 24  Loss: 0.086  Acc: 50.148%
Epoch: 25  Loss: 0.086  Acc: 51.266%
Epoch: 26  Loss: 0.085  Acc: 51.886%
Epoch: 27  Loss: 0.085  Acc: 52.282%
Epoch: 28  

  h = 1 / (1 + np.exp(-(x.dot (w1)) ) )  #(N,64)


Epoch: 373  Loss: 0.054  Acc: 79.488%
Epoch: 374  Loss: 0.054  Acc: 79.426%
Epoch: 375  Loss: 0.054  Acc: 79.526%
Epoch: 376  Loss: 0.054  Acc: 79.464%
Epoch: 377  Loss: 0.054  Acc: 79.440%
Epoch: 378  Loss: 0.054  Acc: 79.582%
Epoch: 379  Loss: 0.053  Acc: 79.474%
Epoch: 380  Loss: 0.053  Acc: 79.606%
Epoch: 381  Loss: 0.053  Acc: 79.578%
Epoch: 382  Loss: 0.053  Acc: 79.560%
Epoch: 383  Loss: 0.053  Acc: 79.356%
Epoch: 384  Loss: 0.053  Acc: 79.486%
Epoch: 385  Loss: 0.052  Acc: 79.696%
Epoch: 386  Loss: 0.052  Acc: 79.584%
Epoch: 387  Loss: 0.052  Acc: 79.758%
Epoch: 388  Loss: 0.052  Acc: 79.662%
Epoch: 389  Loss: 0.052  Acc: 81.118%
Epoch: 390  Loss: 0.052  Acc: 80.958%
Epoch: 391  Loss: 0.052  Acc: 80.840%
Epoch: 392  Loss: 0.051  Acc: 81.044%
Epoch: 393  Loss: 0.051  Acc: 80.998%
Epoch: 394  Loss: 0.051  Acc: 81.256%
Epoch: 395  Loss: 0.051  Acc: 80.842%
Epoch: 396  Loss: 0.051  Acc: 81.000%
Epoch: 397  Loss: 0.051  Acc: 81.658%
Epoch: 398  Loss: 0.051  Acc: 81.674%
Epoch: 399  

#### 在验证集上测试分类效果

In [76]:
# test the classification accuracy on validation dataset
def sigmoid(x):
    return 1/(1+np.exp(-x))
res=val_img.dot(best_X[0])
res=sigmoid(res)
res=res.dot(best_X[1])
scores = np.argmax(res, axis=1)
lb     = np.argmax(one_hot_val_lb,axis=1)
accuracy = np.mean(scores == lb) * 100
print("Numeric_Gradient+Adam优化在验证集上的分类精度为: %.3f%%" %(accuracy))


Numeric_Gradient+Adam优化在验证集上的分类精度为: 82.090%


  return 1/(1+np.exp(-x))


### 题目2： Train network with bias (Adam) 

In [128]:
#在完成上一部分程序的基础上，给两个线性分类器加上bias
def train2(x, y, epoch):
    """
    Inputs have dimension D=784, there are C=10 classes, and we operate on N=1000 examples.
    
    Inputs:
    - x: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a numpy array of shape (N,C) containing training labels; y[c] = 1 means
          that X[i] has label c, where 0 <= c < C.
    - epoch: Training iterations, an integer.
    
    Returns:
    - 训练得到的参数
    """
    
    # set the hyperparameters
    learning_rate = 1e-3

    # Initialize first and second momentums in Adam
    mW1, vW1 = 0, 0
    mW2, vW2 = 0, 0
    mb1, vb1 = 0, 0
    mb2, vb2 = 0, 0

    # initialize W1 (D,64) and w2(64,C) 
    w1 = np.random.randn(x.shape[1], 64) * 0.0001
    w2 = np.random.randn(64,y.shape[1]) * 0.0001
    b1 = np.random.randn(64 ) * 0.0001
    b2 = np.random.randn(10) * 0.0001

    for t in range (1,epoch+1):
      h = 1 / (1 + np.exp(-((x.dot (w1))) ) )  #(N,64)
      h=h+b1
      y_pred =h.dot(w2)     #(N,C)
      y_pred=y_pred+b2
      loss = np.square(y_pred - y).mean( )
      scores = np.argmax(y_pred, axis=1)
      lb     = np.argmax(y,      axis=1)
      accuracy = np.mean(scores == lb) * 100
      # compute the numeric gradient
      grad_y_pred = (y_pred-y)*2 #Nx10
      grad_w2 = h.T.dot(grad_y_pred) #64x10
      grad_h = grad_y_pred.dot(w2.T)#50000*64
      grad_sig=h*(1-h)*grad_h
      grad_w1 = x.T.dot(grad_sig)
      grad_b2 =np.sum(grad_y_pred,axis=0)
      grad_b1 =np.sum(grad_sig,axis=0)


      # update W and b with Adam
      grad_w2, mW2, vW2 = adam_opt(grad_w2, mW2, vW2, t)
      grad_b2, mb2, vb2 = adam_opt(grad_b2, mb2, vb2, t)
      grad_w1, mW1, vW1 = adam_opt(grad_w1, mW1, vW1, t)
      grad_b1, mb1, vb1 = adam_opt(grad_b1, mb1, vb1, t)

      w2 -= learning_rate*grad_w2
      b2 -= learning_rate*grad_b2
      w1 -= learning_rate*grad_w1
      b1 -= learning_rate*grad_b1
      

        # print the result
      print("Epoch: %d  Loss: %.3f  Acc: %.3f%%" % (t, loss, accuracy))
    
    return w1, b1, w2, b2

#### 在训练集上进行训练

In [131]:
# train the linear classifier with adam in 1000 examples
epoch = 100
best_X = train2(train_img, one_hot_train_lb, epoch)

Epoch: 1  Loss: 0.100  Acc: 9.846%
Epoch: 2  Loss: 0.095  Acc: 25.956%
Epoch: 3  Loss: 0.091  Acc: 17.508%
Epoch: 4  Loss: 0.090  Acc: 12.716%
Epoch: 5  Loss: 0.091  Acc: 14.172%
Epoch: 6  Loss: 0.092  Acc: 11.230%
Epoch: 7  Loss: 0.092  Acc: 11.230%
Epoch: 8  Loss: 0.091  Acc: 11.230%
Epoch: 9  Loss: 0.090  Acc: 11.230%
Epoch: 10  Loss: 0.090  Acc: 11.230%
Epoch: 11  Loss: 0.089  Acc: 11.230%
Epoch: 12  Loss: 0.089  Acc: 11.230%
Epoch: 13  Loss: 0.089  Acc: 11.230%
Epoch: 14  Loss: 0.089  Acc: 14.684%
Epoch: 15  Loss: 0.088  Acc: 35.000%
Epoch: 16  Loss: 0.088  Acc: 49.180%
Epoch: 17  Loss: 0.087  Acc: 56.408%
Epoch: 18  Loss: 0.086  Acc: 61.904%
Epoch: 19  Loss: 0.085  Acc: 64.388%
Epoch: 20  Loss: 0.084  Acc: 68.222%
Epoch: 21  Loss: 0.083  Acc: 70.786%
Epoch: 22  Loss: 0.082  Acc: 71.032%
Epoch: 23  Loss: 0.080  Acc: 70.942%
Epoch: 24  Loss: 0.079  Acc: 71.520%


  h = 1 / (1 + np.exp(-((x.dot (w1))) ) )  #(N,64)


Epoch: 25  Loss: 0.078  Acc: 72.086%
Epoch: 26  Loss: 0.076  Acc: 74.182%
Epoch: 27  Loss: 0.075  Acc: 75.454%
Epoch: 28  Loss: 0.073  Acc: 77.382%
Epoch: 29  Loss: 0.072  Acc: 80.138%
Epoch: 30  Loss: 0.070  Acc: 82.184%
Epoch: 31  Loss: 0.069  Acc: 83.916%
Epoch: 32  Loss: 0.068  Acc: 83.884%
Epoch: 33  Loss: 0.066  Acc: 84.492%
Epoch: 34  Loss: 0.065  Acc: 85.142%
Epoch: 35  Loss: 0.064  Acc: 85.362%
Epoch: 36  Loss: 0.063  Acc: 85.718%
Epoch: 37  Loss: 0.062  Acc: 86.188%
Epoch: 38  Loss: 0.061  Acc: 86.426%
Epoch: 39  Loss: 0.060  Acc: 86.402%
Epoch: 40  Loss: 0.059  Acc: 86.904%
Epoch: 41  Loss: 0.058  Acc: 87.126%
Epoch: 42  Loss: 0.057  Acc: 87.250%
Epoch: 43  Loss: 0.056  Acc: 87.300%
Epoch: 44  Loss: 0.056  Acc: 87.610%
Epoch: 45  Loss: 0.055  Acc: 87.610%
Epoch: 46  Loss: 0.054  Acc: 87.632%
Epoch: 47  Loss: 0.054  Acc: 87.852%
Epoch: 48  Loss: 0.053  Acc: 87.922%
Epoch: 49  Loss: 0.053  Acc: 87.862%
Epoch: 50  Loss: 0.052  Acc: 88.098%
Epoch: 51  Loss: 0.052  Acc: 87.950%
E

#### 在验证集上测试分类效果

In [133]:
# test the classification accuracy on validation dataset
def sigmoid(x):
    return 1/(1+np.exp(-x))
res=val_img.dot(best_X[0])
res+=best_X[1]
res=sigmoid(res)
res=res.dot(best_X[2])
res+=best_X[3]
scores = np.argmax(res, axis=1)
lb     = np.argmax(one_hot_val_lb,axis=1)
accuracy = np.mean(scores == lb) * 100
print("Numeric_Gradient+Adam优化在验证集上的分类精度为: %.3f%%" %(accuracy))


Numeric_Gradient+Adam优化在验证集上的分类精度为: 86.930%


  return 1/(1+np.exp(-x))


### 题目3： 题目2中第二个线性分类器没有激活函数，请为其添加Softmax函数，其他条件不变，完成训练和测试。

In [169]:
def softmax_front(x):
    for i in range(x.shape[0]):
        x[i]=np.exp(x[i])
        x[i]=x[i]/np.sum(x[i])
    return x
def softmax_back(x,g):
    grad=np.zeros(x.shape)
    for i in range(x.shape[0]):
        grad[i]=x[i]*g[i]-x[i]*(x[i].dot(g[i]))
    return grad

In [170]:
#在完成上一部分程序的基础上，给两个线性分类器加上bias
def train3(x, y, epoch):
    """
    Inputs have dimension D=784, there are C=10 classes, and we operate on N=1000 examples.
    
    Inputs:
    - x: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a numpy array of shape (N,C) containing training labels; y[c] = 1 means
          that X[i] has label c, where 0 <= c < C.
    - epoch: Training iterations, an integer.
    
    Returns:
    - 训练得到的参数
    """
    
    # set the hyperparameters
    learning_rate = 1e-3

    # Initialize first and second momentums in Adam
    mW1, vW1 = 0, 0
    mW2, vW2 = 0, 0
    mb1, vb1 = 0, 0
    mb2, vb2 = 0, 0

    # initialize W1 (D,64) and w2(64,C) 
    w1 = np.random.randn(x.shape[1], 64) * 0.0001
    w2 = np.random.randn(64,y.shape[1]) * 0.0001
    b1 = np.random.randn(64 ) * 0.0001
    b2 = np.random.randn(10) * 0.0001

    for t in range (1,epoch+1):
      h = 1 / (1 + np.exp(-((x.dot (w1))) ) )  #(N,64)
      h=h+b1
      h2=h.dot(w2)     #(N,C)
      h2=h2+b2
      y_pred=softmax_front(h2)
      loss = np.square(y_pred - y).mean( )
      scores = np.argmax(y_pred, axis=1)
      lb     = np.argmax(y,      axis=1)
      accuracy = np.mean(scores == lb) * 100
      # compute the numeric gradient
      grad_y_pred = (y_pred-y)*2 #Nx10
      grad_h2=softmax_back(h2,grad_y_pred)
      grad_w2 = h.T.dot(grad_h2) #64x10
      grad_h = grad_h2.dot(w2.T)#50000*64
      grad_sig=h*(1-h)*grad_h
      grad_w1 = x.T.dot(grad_sig)
      grad_b2 =np.sum(grad_y_pred,axis=0)
      grad_b1 =np.sum(grad_sig,axis=0)


      # update W and b with Adam
      grad_w2, mW2, vW2 = adam_opt(grad_w2, mW2, vW2, t)
      grad_b2, mb2, vb2 = adam_opt(grad_b2, mb2, vb2, t)
      grad_w1, mW1, vW1 = adam_opt(grad_w1, mW1, vW1, t)
      grad_b1, mb1, vb1 = adam_opt(grad_b1, mb1, vb1, t)

      w2 -= learning_rate*grad_w2
      b2 -= learning_rate*grad_b2
      w1 -= learning_rate*grad_w1
      b1 -= learning_rate*grad_b1
      

        # print the result
      print("Epoch: %d  Loss: %.3f  Acc: %.3f%%" % (t, loss, accuracy))
    
    return w1, b1, w2, b2

In [176]:
epoch = 100
best_X = train3(train_img, one_hot_train_lb, epoch)

Epoch: 1  Loss: 0.090  Acc: 10.620%
Epoch: 2  Loss: 0.090  Acc: 11.230%
Epoch: 3  Loss: 0.090  Acc: 16.860%
Epoch: 4  Loss: 0.089  Acc: 38.342%
Epoch: 5  Loss: 0.089  Acc: 44.520%
Epoch: 6  Loss: 0.089  Acc: 44.372%
Epoch: 7  Loss: 0.088  Acc: 47.036%
Epoch: 8  Loss: 0.088  Acc: 49.546%
Epoch: 9  Loss: 0.087  Acc: 51.024%
Epoch: 10  Loss: 0.087  Acc: 52.114%
Epoch: 11  Loss: 0.087  Acc: 52.682%
Epoch: 12  Loss: 0.086  Acc: 53.866%
Epoch: 13  Loss: 0.085  Acc: 55.994%
Epoch: 14  Loss: 0.085  Acc: 58.850%
Epoch: 15  Loss: 0.084  Acc: 59.860%
Epoch: 16  Loss: 0.084  Acc: 60.858%
Epoch: 17  Loss: 0.083  Acc: 61.720%
Epoch: 18  Loss: 0.083  Acc: 62.200%
Epoch: 19  Loss: 0.082  Acc: 62.978%
Epoch: 20  Loss: 0.082  Acc: 64.024%
Epoch: 21  Loss: 0.081  Acc: 64.742%
Epoch: 22  Loss: 0.080  Acc: 65.042%
Epoch: 23  Loss: 0.080  Acc: 65.594%
Epoch: 24  Loss: 0.079  Acc: 66.120%


  h = 1 / (1 + np.exp(-((x.dot (w1))) ) )  #(N,64)


Epoch: 25  Loss: 0.079  Acc: 66.382%
Epoch: 26  Loss: 0.078  Acc: 66.804%
Epoch: 27  Loss: 0.077  Acc: 67.288%
Epoch: 28  Loss: 0.077  Acc: 67.820%
Epoch: 29  Loss: 0.076  Acc: 68.264%
Epoch: 30  Loss: 0.076  Acc: 69.752%
Epoch: 31  Loss: 0.075  Acc: 71.302%
Epoch: 32  Loss: 0.075  Acc: 72.920%
Epoch: 33  Loss: 0.074  Acc: 74.648%
Epoch: 34  Loss: 0.073  Acc: 74.832%
Epoch: 35  Loss: 0.073  Acc: 75.788%
Epoch: 36  Loss: 0.072  Acc: 76.654%
Epoch: 37  Loss: 0.072  Acc: 76.744%
Epoch: 38  Loss: 0.071  Acc: 77.532%
Epoch: 39  Loss: 0.071  Acc: 77.880%
Epoch: 40  Loss: 0.070  Acc: 78.056%
Epoch: 41  Loss: 0.070  Acc: 78.744%
Epoch: 42  Loss: 0.070  Acc: 78.996%
Epoch: 43  Loss: 0.069  Acc: 79.418%
Epoch: 44  Loss: 0.069  Acc: 79.814%
Epoch: 45  Loss: 0.068  Acc: 79.572%
Epoch: 46  Loss: 0.068  Acc: 79.794%
Epoch: 47  Loss: 0.067  Acc: 79.964%
Epoch: 48  Loss: 0.067  Acc: 79.796%
Epoch: 49  Loss: 0.067  Acc: 79.950%
Epoch: 50  Loss: 0.066  Acc: 80.270%
Epoch: 51  Loss: 0.066  Acc: 80.364%
E

In [177]:
def sigmoid(x):
    return 1/(1+np.exp(-x))
res=val_img.dot(best_X[0])
res+=best_X[1]
res=sigmoid(res)
res=res.dot(best_X[2])
res+=best_X[3]
res=softmax_front(res)
scores = np.argmax(res, axis=1)
lb     = np.argmax(one_hot_val_lb,axis=1)
accuracy = np.mean(scores == lb) * 100
print("Numeric_Gradient+Adam优化在验证集上的分类精度为: %.3f%%" %(accuracy))

Numeric_Gradient+Adam优化在验证集上的分类精度为: 80.320%


  return 1/(1+np.exp(-x))
