In [32]:
import numpy as np
from keras.datasets import fashion_mnist
def load_fashion_mnist(flatten=True, normalize=True, one_hot=True, num_classes=10):
    """Load Fashion-MNIST through Keras and optionally preprocess it.

    Args:
        flatten: If true, reshape each image array to ``(n_samples, -1)``.
        normalize: If true, divide the image arrays by 255.0.
        one_hot: If true, replace integer labels with rows of an identity
            matrix of size ``num_classes``.
        num_classes: Width of the one-hot encoding.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``. Note the order differs
        from the nested ``(x, y), (x, y)`` layout returned by Keras.
    """
    train_split, test_split = fashion_mnist.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split

    if flatten:
        # Keep the sample axis, collapse every remaining axis into one.
        train_images = train_images.reshape(train_images.shape[0], -1)
        test_images = test_images.reshape(test_images.shape[0], -1)

    if normalize:
        # assumes pixel values lie in [0, 255] -- TODO confirm against the dataset
        train_images = train_images / 255.0
        test_images = test_images / 255.0

    if one_hot:
        # Indexing an identity matrix by label selects the matching one-hot row.
        train_labels = np.eye(num_classes)[train_labels]
        test_labels = np.eye(num_classes)[test_labels]

    return train_images, train_labels, test_images, test_labels

In [33]:
import numpy as np
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified
def relu_der(z):
    """Derivative of ReLU: 1.0 where ``z`` is strictly positive, else 0.0."""
    positive_mask = z > 0
    return positive_mask.astype(float)
def soft(z):
    """Row-wise softmax over axis 1 of ``z``.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_peak)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals

In [34]:
def ini_para(layer_sizes, seed=42):
    """Create the initial weights and biases for a fully connected network.

    Args:
        layer_sizes: Sequence of layer widths, input layer first and output
            layer last.
        seed: Seed for the private random generator used to draw the weights.
            The default of 42 reproduces the values previously obtained via
            ``np.random.seed(42)``.

    Returns:
        Dict with, for every layer ``l`` from 1 to ``len(layer_sizes) - 1``:
        ``'W<l>'`` of shape ``(layer_sizes[l-1], layer_sizes[l])`` drawn from
        a standard normal scaled by 0.01, and ``'b<l>'`` of shape
        ``(1, layer_sizes[l])`` filled with zeros.
    """
    # A local generator gives the same stream as seeding the global one, but
    # does not reset NumPy's process-wide random state as a side effect.
    rng = np.random.RandomState(seed)
    params = {}
    for l in range(1, len(layer_sizes)):
        fan_in, fan_out = layer_sizes[l-1], layer_sizes[l]
        params['W'+str(l)] = rng.randn(fan_in, fan_out) * 0.01
        params['b'+str(l)] = np.zeros((1, fan_out))
    return params

In [35]:
def forward(X, params):
    """Run a forward pass through the network.

    Hidden layers (1 .. L-1) use ReLU; the last layer L uses softmax, where
    L is half the number of entries in ``params``.

    Args:
        X: Input batch; stored in the cache as ``'A0'``.
        params: Dict holding ``'W<l>'`` and ``'b<l>'`` for every layer.

    Returns:
        Tuple ``(AL, cache)`` where ``AL`` is the softmax output and ``cache``
        maps ``'Z<l>'`` / ``'A<l>'`` to each layer's pre-activation and
        activation.
    """
    n_layers = len(params) // 2
    cache = {'A0': X}

    # Hidden layers: affine transform followed by ReLU.
    for idx in range(1, n_layers):
        previous = cache[f'A{idx-1}']
        pre_activation = np.dot(previous, params[f'W{idx}']) + params[f'b{idx}']
        cache[f'Z{idx}'] = pre_activation
        cache[f'A{idx}'] = relu(pre_activation)

    # Output layer: affine transform followed by softmax.
    last_hidden = cache[f'A{n_layers-1}']
    logits = np.dot(last_hidden, params[f'W{n_layers}']) + params[f'b{n_layers}']
    probabilities = soft(logits)
    cache[f'Z{n_layers}'] = logits
    cache[f'A{n_layers}'] = probabilities
    return probabilities, cache

In [36]:
def comp_loss(Y_pred, Y_true):
    """Mean cross-entropy between predictions and targets.

    Args:
        Y_pred: Predicted values, same shape as ``Y_true``.
        Y_true: Target values; its first dimension is used as the sample
            count.

    Returns:
        ``-sum(Y_true * log(Y_pred + 1e-8))`` divided by the sample count.
    """
    n_samples = Y_true.shape[0]
    # The small constant keeps log() finite when a prediction is exactly 0.
    log_pred = np.log(Y_pred + 1e-8)
    weighted_sum = np.sum(Y_true * log_pred)
    return -weighted_sum / n_samples

In [37]:
def backward(Y_pred, Y_true, params, cache, learning_rate=0.01):
    """Backpropagate and apply one gradient-descent step in place.

    Args:
        Y_pred: Output of the forward pass (softmax activations).
        Y_true: Targets with the same shape as ``Y_pred``; its first
            dimension is used as the sample count.
        params: Dict of ``'W<l>'`` / ``'b<l>'`` arrays. Updated IN PLACE.
        cache: Dict of ``'A<l>'`` / ``'Z<l>'`` values produced by the forward
            pass for the same batch.
        learning_rate: Step size for the update.

    Returns:
        Dict of the gradients that were applied, keyed ``'dW<l>'`` and
        ``'db<l>'``. Previously these were discarded; returning them allows
        gradient checking and monitoring. Callers may ignore the value.
    """
    grads = {}
    L = len(params)//2
    m = Y_true.shape[0]

    # Output layer: for softmax combined with cross-entropy the gradient with
    # respect to the pre-activation is simply prediction minus target.
    dZ = Y_pred - Y_true
    grads['dW'+str(L)] = np.dot(cache['A'+str(L-1)].T, dZ)/m
    grads['db'+str(L)] = np.sum(dZ, axis=0, keepdims=True)/m
    dA_prev = np.dot(dZ, params['W'+str(L)].T)

    # Hidden layers, walked from last to first.
    for l in reversed(range(1, L)):
        dZ = dA_prev * relu_der(cache['Z'+str(l)])
        grads['dW'+str(l)] = np.dot(cache['A'+str(l-1)].T, dZ)/m
        grads['db'+str(l)] = np.sum(dZ, axis=0, keepdims=True)/m
        if l > 1:
            # No gradient is needed with respect to the network input.
            dA_prev = np.dot(dZ, params['W'+str(l)].T)

    # All gradients are computed before any weight changes, so every layer's
    # gradient is based on the pre-update parameters.
    for l in range(1, L+1):
        params['W'+str(l)] -= learning_rate * grads['dW'+str(l)]
        params['b'+str(l)] -= learning_rate * grads['db'+str(l)]

    return grads

In [38]:
def train(X_train, Y_train, X_test, Y_test, hidden_layers=[128,64], lr=0.01, epochs=20):
    """Train the network with full-batch gradient descent and report accuracy.

    Each epoch performs one forward and one backward pass over the whole of
    ``X_train``. The training loss is printed every fifth epoch, and the test
    accuracy is printed once training finishes.

    Args:
        X_train: Training inputs; the second dimension sets the input width.
        Y_train: One-hot training targets; the second dimension sets the
            width of the output layer.
        X_test: Test inputs.
        Y_test: One-hot test targets.
        hidden_layers: Sequence of hidden-layer widths. The default list is
            only read, never mutated.
        lr: Learning rate passed to ``backward``.
        epochs: Number of full-batch updates.

    Returns:
        The trained parameter dict.
    """
    # Size the output layer from the targets instead of assuming 10 classes,
    # so the function also works when the data was encoded with a different
    # ``num_classes``.
    n_outputs = Y_train.shape[1]
    layer_sizes = [X_train.shape[1]] + list(hidden_layers) + [n_outputs]
    params = ini_para(layer_sizes)

    for epoch in range(epochs):
        Y_pred, cache = forward(X_train, params)
        # The loss is measured before this epoch's parameter update.
        loss = comp_loss(Y_pred, Y_train)
        backward(Y_pred, Y_train, params, cache, learning_rate=lr)
        if (epoch+1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}")

    Y_pred_test, _ = forward(X_test, params)
    predicted_labels = np.argmax(Y_pred_test, axis=1)
    true_labels = np.argmax(Y_test, axis=1)
    accuracy = np.mean(predicted_labels == true_labels)
    print(f"Test Accuracy: {accuracy*100:.2f}%")
    return params