In [2]:
import struct
import numpy as np
import matplotlib.pyplot as plt

In [5]:
def decode_labels(file):
    """Read an IDX label file (as used by MNIST) into a column vector.

    The file layout read here is an 8-byte header of two big-endian uint32
    values (a magic number, which is ignored, and the item count) followed by
    one unsigned byte per label.

    Args:
        file: Path of the label file to read.

    Returns:
        An integer ndarray of shape (num_items, 1).

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file is shorter than the 8-byte header.
        ValueError: If the file holds fewer label bytes than the header claims.
    """
    with open(file, 'rb') as f:
        binary_data = f.read()

    _, num_items = struct.unpack_from('>II', binary_data, 0)
    # Interpret the payload directly as uint8 instead of unpacking every byte
    # into a Python int through a num_items-character struct format string.
    labels = np.frombuffer(binary_data, dtype=np.uint8, count=num_items, offset=8)
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin yields the same default integer dtype.
    return labels.reshape(-1, 1).astype(int)

def decode_images(file):
    """Read an IDX image file (as used by MNIST) into a 2-D pixel matrix.

    The file layout read here is a 16-byte header of four big-endian uint32
    values (a magic number, which is ignored, then image count, rows, cols)
    followed by one unsigned byte per pixel.

    Args:
        file: Path of the image file to read.

    Returns:
        An integer ndarray of shape (num_images, rows * cols); each row is one
        flattened image with pixel values in 0..255.

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file is shorter than the 16-byte header.
        ValueError: If the file holds fewer pixel bytes than the header claims.
    """
    with open(file, 'rb') as f:
        binary_data = f.read()

    _, num_images, rows, cols = struct.unpack_from('>IIII', binary_data, 0)
    # Interpret the payload directly as uint8. The previous approach built a
    # format string with one 'B' per pixel and a tuple of Python ints of the
    # same length, which is slow and memory-hungry for a full dataset.
    pixels = np.frombuffer(
        binary_data, dtype=np.uint8, count=num_images * rows * cols, offset=16
    )
    # Widen to NumPy's default integer type, matching what np.array() produced
    # from a tuple of Python ints, so downstream arithmetic cannot wrap at 255.
    return pixels.reshape(-1, rows * cols).astype(np.int_)

# Locations of the four MNIST IDX files, in the order:
# training images, training labels, test images, test labels.
filepath = [
    "../stage_1/data/mnist/train-images-idx3-ubyte",
    "../stage_1/data/mnist/train-labels-idx1-ubyte",
    "../stage_1/data/mnist/t10k-images-idx3-ubyte",
    "../stage_1/data/mnist/t10k-labels-idx1-ubyte",
]

# Decode each file with the matching reader, in the order listed above.
t_images, t_labels, v_images, v_labels = (
    reader(path)
    for reader, path in zip(
        (decode_images, decode_labels, decode_images, decode_labels),
        filepath,
    )
)

# Longer aliases for the same arrays (no copies are made).
train_images, train_labels = t_images, t_labels
test_images, test_labels = v_images, v_labels

# bp1
1. 2 linear layers
2. hidden layer with 256 units
3. softmax cross-entropy loss
4. mini batch = 128 (note: the code cells below currently use `batch_size = 100`)
5. lr = 0.1
6. train 10 epochs
7. drop last
8. SGD

In [124]:
# Two-layer network (linear -> sigmoid -> linear -> softmax) trained with
# mini-batch SGD on cross-entropy loss; test accuracy is reported every epoch.
# NOTE(review): the notes above list "mini batch = 128" but this cell uses
# batch_size = 100 — confirm which value is intended.
all_image   = t_images.shape[0]
num_feature = t_images.shape[1]
num_hidden  = 256
num_classes = 10
batch_size  = 100
batch_count = all_image // batch_size  # floor division drops the last, incomplete batch
lr          = 0.1
epochs      = 10


def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    return 1 / (1 + np.exp(-x))


def log_softmax(x):
    """Row-wise log-softmax computed with the log-sum-exp trick.

    Subtracting each row's maximum keeps np.exp from overflowing, and working
    in log space avoids taking log(0) when a probability underflows.
    """
    shifted = x - x.max(axis=1, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))


def softmax(x):
    """Row-wise softmax (each row of the result sums to 1)."""
    return np.exp(log_softmax(x))


np.random.seed(3)
train_img_idx  = list(range(all_image))

# init params: zero-mean normal weights scaled by 1/sqrt(fan_in), zero biases
layer1_weight  = np.random.normal(0, 1 / np.sqrt(num_feature), size=(num_feature, num_hidden))
layer1_bias    = np.zeros((1, num_hidden))
layer2_weight  = np.random.normal(0, 1 / np.sqrt(num_hidden), size=(num_hidden, num_classes))
layer2_bias    = np.zeros((1, num_classes))

# The test inputs never change, so normalise them once instead of every epoch.
norm_test_images = (v_images / 255 - 0.5).astype(np.float32)
row_index        = np.arange(batch_size)

for epoch in range(epochs):
    # Visit the training samples in a new random order each epoch.
    np.random.shuffle(train_img_idx)   # in place

    for batch in range(batch_count):
        start  = batch * batch_size
        end    = start + batch_size    # drop last
        indexs = train_img_idx[start:end]

        images = t_images[indexs]
        labels = t_labels[indexs]

        # Scale pixels from [0, 255] to [-0.5, 0.5].
        images = (images / 255 - 0.5).astype(np.float32)

        # One-hot targets: the label value itself is the column index.
        one_hot = np.zeros((batch_size, num_classes))
        one_hot[row_index, labels.ravel()] = 1

        # Forward pass
        hidden = images @ layer1_weight + layer1_bias
        hidden_activation = sigmoid(hidden)
        output = hidden_activation @ layer2_weight + layer2_bias

        log_probability = log_softmax(output)
        probability     = np.exp(log_probability)
        loss   = -np.sum(one_hot * log_probability) / batch_size

        # Backward pass
        # Gradient of the mean cross-entropy w.r.t. the logits.
        d_output     = (probability - one_hot) / batch_size
        do_l2_bias   = np.sum(d_output, axis=0)
        do_l2_weight = hidden_activation.T @ d_output
        do_l2_hid    = d_output @ layer2_weight.T
        # sigmoid'(z) = s * (1 - s); reuse the activation from the forward pass.
        d_hidden     = do_l2_hid * hidden_activation * (1 - hidden_activation)
        dh_l1_weight = images.T @ d_hidden
        dh_l1_bias   = np.sum(d_hidden, axis=0)

        # SGD
        layer2_bias    -= lr * do_l2_bias
        layer2_weight  -= lr * do_l2_weight
        layer1_bias    -= lr * dh_l1_bias
        layer1_weight  -= lr * dh_l1_weight

    # Evaluate on the whole test set with the current parameters.
    hidden = norm_test_images @ layer1_weight + layer1_bias
    hidden_activation = sigmoid(hidden)
    output        = hidden_activation @ layer2_weight + layer2_bias
    probability   = softmax(output)
    predict_label = probability.argmax(axis=1).reshape(-1, 1)
    accuracy      = (predict_label == v_labels).sum() / v_labels.shape[0]

    # `loss` here is the loss of the last mini-batch of the epoch only.
    print(f'epoch: {epoch:02d}, loss: {loss:.15f}, accuracy: {accuracy*100:.2f}%')
        

2.6008399440403376
2.4849523108895535
2.54630303316331
2.5397303683669445
2.5186141368464554
2.540926927169826
2.686494472744272
2.754920268546467
2.691405645527509
2.605006667028606


In [210]:
# exercise
# Re-implementation of the two-layer network above: linear -> sigmoid ->
# linear -> softmax, trained with mini-batch SGD on cross-entropy loss.

num_image, num_feature = t_images.shape
num_classes   = 10
num_hidden    = 256
batch_size    = 100
batch_round   = num_image // batch_size  # floor division drops the last, incomplete batch
epochs        = 10  # number of passes over the training set
lr            = 0.1


def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    return 1 / (1 + np.exp(-x))


def log_softmax(x):
    """Row-wise log-softmax computed with the log-sum-exp trick.

    Subtracting each row's maximum keeps np.exp from overflowing, and working
    in log space avoids taking log(0) when a probability underflows.
    """
    shifted = x - x.max(axis=1, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))


def softmax(x):
    """Row-wise softmax (each row of the result sums to 1)."""
    return np.exp(log_softmax(x))


image_indexs  = list(range(num_image))

np.random.seed(3)

# Zero-mean normal weights scaled by 1/sqrt(fan_in); zero biases.
w1 = np.random.normal(0, 1 / np.sqrt(num_feature), size=(num_feature, num_hidden))
b1 = np.zeros((1, num_hidden))
w2 = np.random.normal(0, 1 / np.sqrt(num_hidden), size=(num_hidden, num_classes))
b2 = np.zeros((1, num_classes))

# The test inputs never change, so normalise them once instead of every epoch.
norm_img  = (v_images / 255 - 0.5).astype(np.float32)
row_index = np.arange(batch_size)

for epoch in range(epochs):
    np.random.shuffle(image_indexs)
    for batch_i in range(batch_round):
        start   = batch_i * batch_size
        end     = start + batch_size   # drop last
        indexs  = image_indexs[start:end]
        images  = t_images[indexs]
        labels  = t_labels[indexs]

        # normalize: scale pixels from [0, 255] to [-0.5, 0.5]
        images  = (images / 255 - 0.5).astype(np.float32)

        # one-hot: labels are 0-9, so each label doubles as its column index
        y = np.zeros((batch_size, num_classes))
        y[row_index, labels.ravel()] = 1

        # inference
        z1     = images @ w1 + b1
        a1     = sigmoid(z1)
        z2     = a1 @ w2 + b2
        log_p  = log_softmax(z2)
        p      = np.exp(log_p)
        loss   = -np.sum(y * log_p) / batch_size

        # BP
        d_loss = (p - y) / batch_size   # gradient of the mean loss w.r.t. z2
        dp_dw  = a1.T @ d_loss          # gradient w.r.t. w2
        dp_db  = np.sum(d_loss, axis=0) # gradient w.r.t. b2
        dp_da  = d_loss @ w2.T          # gradient w.r.t. a1
        # sigmoid'(z1) = a1 * (1 - a1); reuse the forward-pass activation.
        da_ds  = dp_da * a1 * (1 - a1)  # gradient w.r.t. z1
        das_dw = images.T @ da_ds       # gradient w.r.t. w1
        das_db = np.sum(da_ds, axis=0)  # gradient w.r.t. b1

        # SGD
        w1    -= lr * das_dw
        b1    -= lr * das_db
        w2    -= lr * dp_dw
        b2    -= lr * dp_db

    # Evaluate on the whole test set with the current parameters.
    hidden    = norm_img @ w1 + b1
    active    = sigmoid(hidden)
    output    = active @ w2 + b2
    y         = softmax(output)
    # The argmax position is the predicted digit, so the index is used as the value.
    predict   = np.argmax(y, axis=1).reshape(-1, 1)
    accuracy  = np.sum(predict == v_labels) / len(v_labels)
    # `loss` here is the loss of the last mini-batch of the epoch only.
    print(f'epoch: {epoch:02d}, loss: {loss:.15f}, accuracy: {accuracy*100:.2f}%')

epoch: 00, loss: 0.472000416626598, accuracy: 87.38%
epoch: 01, loss: 0.362744560155856, accuracy: 90.05%
epoch: 02, loss: 0.330843038298329, accuracy: 90.65%
epoch: 03, loss: 0.329966971208065, accuracy: 91.23%
epoch: 04, loss: 0.243502847987609, accuracy: 91.21%
epoch: 05, loss: 0.269793026722386, accuracy: 91.91%
epoch: 06, loss: 0.318829279245691, accuracy: 91.98%
epoch: 07, loss: 0.183251989126592, accuracy: 92.11%
epoch: 08, loss: 0.248526564437781, accuracy: 92.50%
epoch: 09, loss: 0.173685304172950, accuracy: 92.72%


In [162]:
# Sanity check: decode_labels reshapes labels to a column vector, so this prints (N, 1).
print(v_labels.shape)

(10000, 1)
