In [90]:
import numpy as np
import time
from tensorflow import keras

# 数据处理

In [91]:
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

In [92]:
print("Training data: {} {}".format(X_train.shape, y_train.shape))
print("Test data: {} {}".format(X_test.shape, y_test.shape))

Training data: (60000, 28, 28) (60000,)
Test data: (10000, 28, 28) (10000,)


In [93]:
# 1. 现在的trainning_data是28*28纬度的，我们需要把它变成784维度的向量
# 2. 每一个像素都是0-255的灰度值，我们需要把它变成0-1之间的数值
# 3. 我们需要把y变成one-hot编码，用于计算loss

X_train_flat = X_train.reshape(len(X_train), (28*28))
X_test_flat = X_test.reshape(len(X_test), (28*28))

X_train_flat = X_train_flat / 255.0
X_test_flat = X_test_flat / 255.0

In [94]:
def one_hot(x, k, dtype=np.float32):
    """Create a one-hot encoding of x of size k."""
    return np.array(x[:, None] == np.arange(k), dtype)

# One-hot encode labels
num_labels = 10 # 0-9的数字
y_train = one_hot(y_train.astype('int32'), num_labels)
y_test = one_hot(y_test.astype('int32'), num_labels)

In [95]:
# test = np.array([0, 2, 1, 3, 4])
# one_hot(test.astype('int32'), 5)

In [96]:
print("Training data: {} {}".format(X_train_flat.shape, y_train.shape))
print("Test data: {} {}".format(X_test_flat.shape, y_test.shape))

Training data: (60000, 784) (60000, 10)
Test data: (10000, 784) (10000, 10)


# 模型构造

In [97]:
class DeepNeuralNetwork():
    def __init__(self, sizes, activation='sigmoid'):
        self.sizes = sizes
        
        # Choose activation function
        if activation == 'relu':
            self.activation = self.relu
        elif activation == 'sigmoid':
            self.activation = self.sigmoid
        else:
            raise ValueError("Activation function is currently not support, please use 'relu' or 'sigmoid' instead.")
        
        # 保存所有weights
        self.params = self.initialize()
        # 保存所有需要更新的中间值, i.e. activations
        self.cache = {}
        
    def relu(self, x, derivative=False):
        '''
            Derivative of ReLU is a bit more complicated since it is not differentiable at x = 0
        
            Forward path:
            relu(x) = max(0, x)
            In other word,
            relu(x) = 0, if x < 0
                    = x, if x >= 0

            Backward path:
            ∇relu(x) = 0, if x < 0
                     = 1, if x >=0
        '''
        if derivative:
            x = np.where(x < 0, 0, x)
            x = np.where(x >= 0, 1, x)
            return x
        return np.maximum(0, x)

    def sigmoid(self, x, derivative=False):
        '''
            Forward path:
            σ(x) = 1 / 1+exp(-z)
            
            Backward path:
            ∇σ(x) = exp(-z) / (1+exp(-z))^2
        '''
        if derivative:
            return (np.exp(-x))/((np.exp(-x)+1)**2)
        return 1/(1 + np.exp(-x))

    def softmax(self, x):
        '''
            softmax(x) = exp(x) / ∑exp(x)
        '''
        # Numerically stable with large exponentials
        exps = np.exp(x - x.max())
        return exps / np.sum(exps, axis=0)

    def initialize(self):
        # 初始化模型每一层的nodes数量
        input_layer=self.sizes[0]
        hidden_layer=self.sizes[1]
        output_layer=self.sizes[2]
        
        # 参数的初始化
        params = {
            "W1": np.random.randn(hidden_layer, input_layer) * np.sqrt(1./input_layer),
            "b1": np.zeros((hidden_layer, 1)) * np.sqrt(1./input_layer),
            "W2": np.random.randn(output_layer, hidden_layer) * np.sqrt(1./hidden_layer),
            "b2": np.zeros((output_layer, 1)) * np.sqrt(1./hidden_layer)
        }
        return params
    
    def initialize_momemtum_optimizer(self):
        momemtum_opt = {
            "W1": np.zeros(self.params["W1"].shape),
            "b1": np.zeros(self.params["b1"].shape),
            "W2": np.zeros(self.params["W2"].shape),
            "b2": np.zeros(self.params["b2"].shape),
        }
        return momemtum_opt

    def feed_forward(self, x):
        '''
            y = (wX + b)
            根据参数更新中间值,并保存到cache中,供后续计算使用
        '''
        self.cache["X"] = x
        self.cache["Z1"] = np.matmul(self.params["W1"], self.cache["X"].T) + self.params["b1"]
        self.cache["A1"] = self.activation(self.cache["Z1"])
        self.cache["Z2"] = np.matmul(self.params["W2"], self.cache["A1"]) + self.params["b2"]
        self.cache["A2"] = self.softmax(self.cache["Z2"])

        # 把softmax的概率结果作为输出返回
        return self.cache["A2"]
    
    def back_propagate(self, y, output):
        '''
            这是backpropagation algorith, 用于更新参数

        '''
        current_batch_size = y.shape[0]
        
        dZ2 = output - y.T
        dW2 = (1./current_batch_size) * np.matmul(dZ2, self.cache["A1"].T)
        db2 = (1./current_batch_size) * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.matmul(self.params["W2"].T, dZ2)
        dZ1 = dA1 * self.activation(self.cache["Z1"], derivative=True)
        dW1 = (1./current_batch_size) * np.matmul(dZ1, self.cache["X"])
        db1 = (1./current_batch_size) * np.sum(dZ1, axis=1, keepdims=True)

        self.grads = {"W1": dW1, "b1": db1, "W2": dW2, "b2": db2}
        return self.grads
    
    def cross_entropy_loss(self, y, output):
        '''
            L(y, ŷ) = −∑ylog(ŷ).
        '''
        l_sum = np.sum(np.multiply(y.T, np.log(output)))
        m = y.shape[0]
        l = -(1./m) * l_sum
        return l
                
    def optimize(self, l_rate=0.1, beta=.9):
        '''
            Stochatic Gradient Descent (SGD):
            θ^(t+1) <- θ^t - η∇L(y, ŷ)
            
            Momentum:
            v^(t+1) <- βv^t + (1-β)∇L(y, ŷ)^t
            θ^(t+1) <- θ^t - ηv^(t+1)
        '''
        if self.optimizer == "sgd":
            for key in self.params:
                self.params[key] = self.params[key] - l_rate * self.grads[key]
        elif self.optimizer == "momentum":
            for key in self.params:
                self.momemtum_opt[key] = (beta * self.momemtum_opt[key] + (1. - beta) * self.grads[key])
                self.params[key] = self.params[key] - l_rate * self.momemtum_opt[key]
        else:
            raise ValueError("Optimizer is currently not support, please use 'sgd' or 'momentum' instead.")

    def accuracy(self, y, output):
        return np.mean(np.argmax(y, axis=-1) == np.argmax(output.T, axis=-1))

    def train(self, x_train, y_train, x_test, y_test, epochs=10, 
              batch_size=64, optimizer='momentum', l_rate=0.1, beta=.9):
        # Hyperparameters
        # 把每一个batch过一遍叫一个epoch，batch_size是每次训练的样本数(共同组成测试数据)
        self.epochs = epochs
        self.batch_size = batch_size
        num_batches = -(-x_train.shape[0] // self.batch_size)
        
        # Initialize optimizer
        self.optimizer = optimizer
        if self.optimizer == 'momentum':
            self.momemtum_opt = self.initialize_momemtum_optimizer()
        
        start_time = time.time()
        template = "Epoch {}: {:.2f}s, train acc={:.2f}, train loss={:.2f}, test acc={:.2f}, test loss={:.2f}"
        
        # Train
        for i in range(self.epochs):
            # Shuffle打乱数据和标签的排序
            permutation = np.random.permutation(x_train.shape[0])
            x_train_shuffled = x_train[permutation]
            y_train_shuffled = y_train[permutation]

            for j in range(num_batches):
                # Batch
                begin = j * self.batch_size
                end = min(begin + self.batch_size, x_train.shape[0]-1)
                # 把数据分批次训练，每一批次训练完成后更新一次参数
                x = x_train_shuffled[begin:end]
                y = y_train_shuffled[begin:end]
                
                # Forward
                output = self.feed_forward(x)
                # Backprop
                grad = self.back_propagate(y, output)
                # Optimize
                self.optimize(l_rate=l_rate, beta=beta)

            # Evaluate performance
            # 模型权重训练好后，这时候再把所有数据都过一遍，计算准确率和损失
            output = self.feed_forward(x_train)
            train_acc = self.accuracy(y_train, output)
            train_loss = self.cross_entropy_loss(y_train, output)
            # Test data
            output = self.feed_forward(x_test)
            test_acc = self.accuracy(y_test, output)
            test_loss = self.cross_entropy_loss(y_test, output)
            print(template.format(i+1, time.time()-start_time, train_acc, train_loss, test_acc, test_loss))

In [98]:
# Sigmoid + Momentum
dnn = DeepNeuralNetwork(sizes=[784, 64, 10], activation='sigmoid')
dnn.train(X_train_flat, y_train, X_test_flat, y_test, batch_size=128, optimizer='momentum', l_rate=4, beta=.9)

Epoch 1: 1.56s, train acc=0.95, train loss=0.16, test acc=0.95, test loss=0.17
Epoch 2: 2.65s, train acc=0.97, train loss=0.11, test acc=0.96, test loss=0.13
Epoch 3: 3.63s, train acc=0.98, train loss=0.07, test acc=0.97, test loss=0.10
Epoch 4: 4.51s, train acc=0.98, train loss=0.06, test acc=0.97, test loss=0.09
Epoch 5: 5.63s, train acc=0.99, train loss=0.05, test acc=0.97, test loss=0.08
Epoch 6: 6.65s, train acc=0.99, train loss=0.04, test acc=0.97, test loss=0.08
Epoch 7: 7.89s, train acc=0.99, train loss=0.04, test acc=0.98, test loss=0.08
Epoch 8: 8.92s, train acc=0.99, train loss=0.03, test acc=0.98, test loss=0.08
Epoch 9: 10.28s, train acc=0.99, train loss=0.03, test acc=0.98, test loss=0.08
Epoch 10: 11.96s, train acc=0.99, train loss=0.03, test acc=0.97, test loss=0.09


In [99]:
# ReLU + SGD
dnn = DeepNeuralNetwork(sizes=[784, 64, 10], activation='relu')
dnn.train(X_train_flat, y_train, X_test_flat, y_test, batch_size=128, optimizer='sgd', l_rate=0.05)

Epoch 1: 1.22s, train acc=0.89, train loss=0.41, test acc=0.89, test loss=0.39
Epoch 2: 2.25s, train acc=0.90, train loss=0.34, test acc=0.91, test loss=0.32
Epoch 3: 3.13s, train acc=0.91, train loss=0.31, test acc=0.92, test loss=0.29
Epoch 4: 3.86s, train acc=0.92, train loss=0.29, test acc=0.92, test loss=0.28
Epoch 5: 4.73s, train acc=0.92, train loss=0.28, test acc=0.92, test loss=0.27
Epoch 6: 5.65s, train acc=0.92, train loss=0.27, test acc=0.92, test loss=0.27
Epoch 7: 6.47s, train acc=0.92, train loss=0.27, test acc=0.93, test loss=0.26
Epoch 8: 7.08s, train acc=0.92, train loss=0.26, test acc=0.93, test loss=0.25
Epoch 9: 7.69s, train acc=0.93, train loss=0.26, test acc=0.93, test loss=0.25
Epoch 10: 8.42s, train acc=0.93, train loss=0.25, test acc=0.93, test loss=0.25
