In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
mnist = fetch_openml('mnist_784', version=1)

In [3]:
X = mnist.data  
y = mnist.target
X = X.astype(np.float64)
y = y.astype(np.int64)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [6]:
def one_hot (y, num_class):
    """
    Convert interger labels to one-hot labels
    """
    y = np.array(y)
    num_sample = y.shape[0]
    one_hot = np.zeros((num_sample, num_class)) #số hàng là số sample, số cột là số class (số label)
    one_hot[np.arange(num_sample), y] = 1 
    return one_hot

y_train, y_test = one_hot(y_train, 10), one_hot(y_test, 10)


In [None]:
class NeuralNetwork:    
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.b1 = np.zeros((1,self.hidden_size))
        self.b2 = np.zeros((1,self.output_size)) 
        #pp chuẩn
        # self.W1 = np.random.randn(self.input_size, self.hidden_size)
        # self.W2 = np.random.randn(self.hidden_size, self.output_size)
        
        #Xavier initialization
        # self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(1 / self.input_size)
        # self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(1 / self.hidden_size)

        #He initialization
        self.W1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2 / self.input_size)
        self.W2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2 / self.hidden_size)

    def sigmoid(self, z):
        return 1/(1+np.exp(-z))
    
    def sigmoid_derivative(self, z):
        return self.sigmoid(z)* self.sigmoid(1-z) #shape (num_samples, hidden_size)
    
    def relu(self, z):
        return np.maximum(0,z)
    
    def relu_derivative(self, z):
        return (z > 0).astype(float)
    
    def softmax(self, z):
        """Compute softmax values for each sets of scores in x."""
        e_z = np.exp(z-np.max(z))
        return e_z/np.sum(e_z, axis = 1, keepdims = True)
    
    def loss(self, y_pred, y_true):
        return -np.mean(y_true*np.log(y_pred))
    
    def forward(self, X):
        """
        forward method for neural network
        args:
        X (array): input array of shape (num_samples, 784)
        returns:
        output (array): output array of shape (num_samples, 10)
        """
        # dùng sigmoid
        # self.z1 = np.dot(X, self.W1) + self.b1 # shape (num_samples, hidden_size)
        # self.a1 = self.sigmoid(self.z1) # shape (num_samples, hidden_size)  
        # self.z2 = np.dot(self.a1, self.W2) + self.b2 # shape (num_samples, 10)  
        # self.a2 = self.softmax(self.z2)

        #dùng relu
        self.z1 = np.dot(X, self.W1) + self.b1 # shape (num_samples, hidden_size)
        self.a1 = self.relu(self.z1) # shape (num_samples, hidden_size)
        self.z2 = np.dot(self.a1, self.W2) + self.b2 # shape (num_samples, 10)  
        self.a2 = self.softmax(self.z2)
        return self.a2

    def backward(self, X, y, y_pred):
        dz2 = y_pred - y # shape (num_samples, 10)
        dW2 = np.dot(self.a1.T, dz2) # shape (hidden_size, 10)
        db2 = np.sum(dz2, axis = 0, keepdims = True)

        #np.dot(dz2, self.W2.T) shape (num_samples, hidden_size)

        #dùng sigmoid
        # dz1 = np.dot(dz2, self.W2.T) * self.sigmoid_derivative(self.z1)
        # dW1 = np.dot(X.T, dz1)
        # db1 = np.sum(dz1, axis = 0, keepdims = True)

        #dùng relu
        dz1 = np.dot(dz2, self.W2.T) * self.relu_derivative(self.z1)
        dW1 = np.dot(X.T, dz1)
        db1 = np.sum(dz1, axis = 0, keepdims = True)
        return  dW2, db2,dW1, db1
    
    def predict(self, X):
        y_pred = self.forward(X)
        return np.argmax(y_pred, axis = 1) #trả về số của class có xác suất lớn nhất

    def train(self, X ,y, epochs=101, lr=1e-3):
        for epoch in range(epochs):
            y_pred = self.forward(X)
            dW2, db2, dW1, db1 = self.backward(X, y, y_pred)
            self.W2 -= lr*dW2
            self.b2 -= lr*db2
            self.W1 -= lr*dW1
            self.b1 -= lr*db1
            if(epoch % 10 == 0):
                print(f"Loss at epoch {epoch} is {self.loss(y_pred, y)}")        

In [None]:
nn= NeuralNetwork(input_size=784, hidden_size=100, output_size=10)
nn.train(X_train, y_train, epochs=201, lr=1e-5)
#tăng hidden_size từ 64 lên 100 thì bị nan 
# => thay đổi learning rate thì không bị nan nữa, nhưng accuracy chỉ có 0.87
# => thay đổi learning rate và Xavier initialization cho W1, W2 thì accuracy tăng lên 0.94
# => giảm lr, tăng epochs và sử dụng Xavier thì accuracy tăng lên 0.959


# nếu dùng ReLU thì tiếp tục giảm lr và dùng He initialization thì accuracy là 0.959

Loss at epoch 0 is 0.2906791746017246
Loss at epoch 10 is 0.03986490209447911
Loss at epoch 20 is 0.029854139091926073
Loss at epoch 30 is 0.02518774063802912
Loss at epoch 40 is 0.022238461094865037
Loss at epoch 50 is 0.02011289540068013
Loss at epoch 60 is 0.01846892265156566
Loss at epoch 70 is 0.017141956879043466
Loss at epoch 80 is 0.01603552840791515
Loss at epoch 90 is 0.015090037427681054
Loss at epoch 100 is 0.014269759724405803
Loss at epoch 110 is 0.013547860529563336
Loss at epoch 120 is 0.01290859374801702
Loss at epoch 130 is 0.012337010098474006
Loss at epoch 140 is 0.011818185867742435
Loss at epoch 150 is 0.011345494889074407
Loss at epoch 160 is 0.010911238412258228
Loss at epoch 170 is 0.010508700045007055
Loss at epoch 180 is 0.010134083790199935
Loss at epoch 190 is 0.009784402187798906
Loss at epoch 200 is 0.009456469116142075


In [11]:
y_pred = nn.predict(X_test)
y_test_labels = np.argmax(y_test, axis=1)  # Convert one-hot encoded y_test back to integer labels
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test_labels)
accuracy

0.9592142857142857

In [12]:
#f1score
from sklearn.metrics import f1_score
f1 = f1_score(y_true=y_test_labels, y_pred=y_pred, average='weighted')

print("F1-Score:", f1)

F1-Score: 0.9591850880574626
