In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import fashion_mnist

In [2]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def softmax(x):
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps, axis=1, keepdims=True)

In [3]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Initialize weights and biases
        self.W1 = np.random.randn(self.input_size, self.hidden_size)
        self.b1 = np.zeros((1, self.hidden_size))
        self.W2 = np.random.randn(self.hidden_size, self.output_size)
        self.b2 = np.zeros((1, self.output_size))
        
    def forward(self, X):
        # Forward propagation
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = softmax(self.z2)
        return self.a2
    
    def backward(self, X, y):
        # Backpropagation
        m = X.shape[0]
        dz2 = self.a2 - y
        self.dW2 = np.dot(self.a1.T, dz2) / m
        self.db2 = np.sum(dz2, axis=0, keepdims=True) / m
        dz1 = np.dot(dz2, self.W2.T) * sigmoid_derivative(self.z1)
        self.dW1 = np.dot(X.T, dz1) / m
        self.db1 = np.sum(dz1, axis=0, keepdims=True) / m
        
    def update_params(self, optimizer):
        optimizer.update_params(self)

In [4]:
class SGD:
    def __init__(self, lr):
        self.lr = lr
        
    def update_params(self, nn):
        pass  # SGD doesn't need additional update step

class Momentum:
    def __init__(self, lr, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.VdW1 = 0
        self.Vdb1 = 0
        self.VdW2 = 0
        self.Vdb2 = 0
        
    def update_params(self, nn):
        self.VdW1 = self.momentum * self.VdW1 + self.lr * nn.dW1
        self.Vdb1 = self.momentum * self.Vdb1 + self.lr * nn.db1
        self.VdW2 = self.momentum * self.VdW2 + self.lr * nn.dW2
        self.Vdb2 = self.momentum * self.Vdb2 + self.lr * nn.db2
        
        nn.W1 -= self.VdW1
        nn.b1 -= self.Vdb1
        nn.W2 -= self.VdW2
        nn.b2 -= self.Vdb2

class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0
        self.mW1 = 0
        self.vW1 = 0
        self.mb1 = 0
        self.vb1 = 0
        self.mW2 = 0
        self.vW2 = 0
        self.mb2 = 0
        self.vb2 = 0
        
    def update_params(self, nn):
        self.t += 1
        self.mW1 = self.beta1 * self.mW1 + (1 - self.beta1) * nn.dW1
        self.vW1 = self.beta2 * self.vW1 + (1 - self.beta2) * (nn.dW1 ** 2)
        mW1_hat = self.mW1 / (1 - self.beta1 ** self.t)
        vW1_hat = self.vW1 / (1 - self.beta2 ** self.t)
        nn.W1 -= self.lr * mW1_hat / (np.sqrt(vW1_hat) + self.epsilon)
        
        self.mb1 = self.beta1 * self.mb1 + (1 - self.beta1) * nn.db1
        self.vb1 = self.beta2 * self.vb1 + (1 - self.beta2) * (nn.db1 ** 2)
        mb1_hat = self.mb1 / (1 - self.beta1 ** self.t)
        vb1_hat = self.vb1 / (1 - self.beta2 ** self.t)
        nn.b1 -= self.lr * mb1_hat / (np.sqrt(vb1_hat) + self.epsilon)
        
        self.mW2 = self.beta1 * self.mW2 + (1 - self.beta1) * nn.dW2
        self.vW2 = self.beta2 * self.vW2 + (1 - self.beta2) * (nn.dW2 ** 2)
        mW2_hat = self.mW2 / (1 - self.beta1 ** self.t)
        vW2_hat = self.vW2 / (1 - self.beta2 ** self.t)
        nn.W2 -= self.lr * mW2_hat / (np.sqrt(vW2_hat) + self.epsilon)
        
        self.mb2 = self.beta1 * self.mb2 + (1 - self.beta1) * nn.db2
        self.vb2 = self.beta2 * self.vb2 + (1 - self.beta2) * (nn.db2 ** 2)
        mb2_hat = self.mb2 / (1 - self.beta1 ** self.t)
        vb2_hat = self.vb2 / (1 - self.beta2 ** self.t)
        nn.b2 -= self.lr * mb2_hat / (np.sqrt(vb2_hat) + self.epsilon)

In [5]:
# Load Fashion-MNIST data
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
X_train = X_train.reshape(-1, 784) / 255.0
X_test = X_test.reshape(-1, 784) / 255.0
y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)

# Initialize neural network
input_size = 784
hidden_size = 128
output_size = 10
nn = NeuralNetwork(input_size, hidden_size, output_size)

# Training parameters
epochs = 100
batch_size = 64
lr = 0.01

# Choose optimizer
optimizer = Adam(lr=lr)

# Training loop
for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        # Mini-batch
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        
        # Forward pass
        y_pred = nn.forward(X_batch)
        
        # Compute loss
        loss = -np.sum(y_batch * np.log(y_pred)) / batch_size
        
        # Backpropagation
        nn.backward(X_batch, y_batch, lr)
        
        # Update parameters using chosen optimizer
        optimizer.update_params(nn)
        
    # Validation loss
    y_val_pred = nn.forward(X_val)
    val_loss = -np.sum(y_val * np.log(y_val_pred)) / len(X_val)
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}")

# Evaluate on test set
y_test_pred = np.argmax(nn.forward(X_test), axis=1)
accuracy = np.mean(y_test_pred == np.argmax(y_test, axis=1))
print(f"Test Accuracy: {accuracy:.4f}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


AttributeError: 'NeuralNetwork' object has no attribute 'dW1'