In [24]:
import numpy as np
from numpy import random

In [2]:
filename_train_labels = './dataset/train-labels-idx1-ubyte'
dataset_labels = []

with open(filename_train_labels, 'rb') as f:
    first_4_bytes = int.from_bytes(f.read(4), byteorder='big', signed=False)
    print('Magic number: %d' % first_4_bytes)
    number_of_items = int.from_bytes(f.read(4), byteorder='big', signed=False)
    print('Number of items: %d' % number_of_items)
    
    item = f.read(number_of_items)
    dataset_labels = [i for i in item]
        

Magic number: 2049
Number of items: 60000


In [3]:
filename_train_images = './dataset/train-images-idx3-ubyte'
dataset_images = []

with open(filename_train_images, 'rb') as f:
    first_4_bytes = int.from_bytes(f.read(4), byteorder='big', signed=False)
    print('Magic number: %d' % first_4_bytes)
    number_of_items = int.from_bytes(f.read(4), byteorder='big', signed=False)
    print('Number of items: %d' % number_of_items)
    number_of_rows = int.from_bytes(f.read(4), byteorder='big', signed=False)
    print('Number of rows: %d' % number_of_rows)
    number_of_columns = int.from_bytes(f.read(4), byteorder='big', signed=False)
    print('Number of columns: %d' % number_of_columns)
    
    for _ in range(number_of_items):
        item = f.read(number_of_rows*number_of_columns)
        dataset_images.append([i for i in item])

Magic number: 2051
Number of items: 60000
Number of rows: 28
Number of columns: 28


In [10]:
# 使用numpy实现一个全连接模型

learning_rate = 0.1

n_items = len(dataset_images)
train_test_ratio = 9
train_size = int(n_items * train_test_ratio / (1 + train_test_ratio))

image_size = number_of_rows * number_of_columns
hidden_size = 64
output_size = 10

train_index = np.random.choice(list(range(n_items)), size=train_size)
test_index = list(set(list(range(n_items))) - set(train_index))

train_images = [dataset_images[i] for i in train_index]
test_images = [dataset_images[i] for i in test_index]

train_labels = [dataset_labels[i] for i in train_index]
test_labels = [dataset_labels[i] for i in test_index]

In [5]:
def softmax(x):
#     return np.exp(x) / np.sum(np.exp(x), axis=0)  # not stable
    exps = np.exp(x - x.max())
    return exps / np.sum(exps)

In [45]:
random.seed(0)

W1 = random.random([hidden_size, image_size])
W2 = random.random([output_size, hidden_size])

b1 = random.random(hidden_size)
b2 = random.random(output_size)

# train forward
X = train_images
Y = train_labels

for idx, x in enumerate(X):
    print(idx)

    x1 = W1.dot(x) + b1
    
    print('W1.shape=' + str(W1.shape))
    print('b1.shape=' + str(b1.shape))
    print('x1.shape=' + str(x1.shape))
    
    x2 = W2.dot(x1) + b2
   
    y = np.zeros(output_size)
    y[Y[idx]] = 1.0    
    y_ = softmax(x2)
    pred = np.argmax(y_)
    E = 0.5 * np.sum(np.square(y - y_))  # error
    
    if idx % 100 == 0:
        print('Error = ' + str(E))
    
    # softmax layer
    dE_dy = y_-y  # (10,)
    dy_dx2 = y_*(1-y_)  # (10,)
    dE_dx2 = dE_dy*dy_dx2  # (10,)*(10,) => (10,)
    
    # layer 2
    dx2_dW2 = x1  # (784,)
    dE_dW2 = dx2_dW2*np.expand_dims(dE_dx2, axis=1)  # (10,64)*(64,) => (10,64)
    
    dx2_db2 = np.ones(b2.shape[0])  # (10,) => (10,)
    dE_db2 = dE_dx2*dx2_db2  # (10,)*(10,) => (10,)
    
    # layer 1    
    dx1_dW1 = x  # (784,)
    dx1_db1 = np.ones(b1.shape[0])  # (64,)
    
    dx2_dx1 = W2  # (10,64)
    dE_dx1 = np.transpose(dx2_dx1)*dE_dx2 # (64,10)*(10,) => (64,)
    dx1_dW1_ = np.repeat(np.expand_dims(dx1_dW1, axis=0), dE_dx1.shape[1], axis=0)
    dE_dW1 = np.dot(dE_dx1, dx1_dW1_) # (64,10)*(10,784) => (64,784)
    dx1_db1_ = np.repeat(np.expand_dims(dx1_db1, axis=0), dE_dx1.shape[1], axis=0)
    dE_db1 = dE_dx1.dot(dx1_db1_)  # () => (64,)
    print('dx1_db1_.shape=' + str(dx1_db1_.shape))
    print('dE_dx1.shape=' + str(dE_dx1.shape))
    
    # update parameter
    W2 = W2 - learning_rate*dE_dW2
    W1 = W1 - learning_rate*dE_dW1
    b2 = b2 - learning_rate*dE_db2
    b1 = b1 - learning_rate*dE_db1  #FIXME dimension error
    print('='*10)
    
    
    
print('y = ' + str(y))
print('y_ = ' + str(y_))
print('pred = ' + str(pred))
    

0
W1.shape=(64, 784)
b1.shape=(64,)
x1.shape=(64,)
Error = 1.0
dx1_db1_.shape=(10, 64)
dE_dx1.shape=(64, 10)
1
W1.shape=(64, 784)
b1.shape=(64, 64)
x1.shape=(64, 64)


ValueError: operands could not be broadcast together with shapes (10,64) (10,) 

In [130]:
b1.shape

(64,)

In [131]:
# x1 = W1*x + b1  # (64,) <= (64,784)*(784,) + (64,)
# x2 = W2*x1 + b2  # (10,) <= (10,64)*(64,) + (10,)
# y = softmax(x2)  # (10,) <= (10,)

# np.repeat(dx1_db1, dE_dx1.shape[1], axis=0).shape
# dx1_db1_ = np.repeat(np.expand_dims(dx1_db1, axis=0), dE_dx1.shape[1], axis=0)

dx1_db1 = np.ones(b1.shape[0])
dE_db1 = dE_dx1*dx1_db1  # (64,) <= (64,10)*(64,)

ValueError: operands could not be broadcast together with shapes (64,10) (64,) 

In [125]:
print(dE_dx1.shape)

(64, 10)


In [124]:
print(dx1_db1_.shape)

(10, 64)

In [48]:
# W1.dot(x).shape
print(str(W1.shape))
print(str(len(x)))
print(str(W1.dot(x).shape))
print(str(W2.shape))

(64, 784)
784
(64,)
(10, 64)


In [62]:
(dx2_dW2*np.expand_dims(dE_dx2, axis=1)).shape

(10, 64)