**Samson Zhang notebook**

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Read the training data from the CSV file and echo it for a quick visual check
data = pd.read_csv('train3.csv')
print(data)

# Convert to a NumPy array and shuffle the rows in place, so the dev/train
# split taken below is a random one
data = np.array(data)
np.random.shuffle(data)

# Hold out the first 1000 shuffled rows as the development set.
# After the transpose every column is one example; row 0 is used as the
# label and the remaining rows as the features.
data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:].astype(float) / 255.0  # Divide by 255 (assumes 0-255 pixel values -- confirm)

# Everything after the first 1000 rows is the training set, laid out the same way
data_train = data[1000:].T
Y_train = data_train[0]
X_train = data_train[1:].astype(float) / 255.0  # Same scaling as X_dev
_, m_train = X_train.shape  # m_train = number of columns, i.e. training examples

def ReLU(Z):
    """Apply the rectified linear unit element-wise.

    Every entry of Z below zero is replaced by zero; all other entries
    are passed through unchanged.
    """
    rectified = np.maximum(Z, 0)
    return rectified

def softmax(Z):
    """Return the softmax of Z taken along axis 0 (one distribution per column).

    Args:
        Z: Array of scores; each column is normalised independently.

    Returns:
        Array of the same shape as Z whose columns each sum to 1.
    """
    # Shift every column by its OWN maximum. Softmax is unchanged by a
    # per-column shift, and this keeps the largest exponent in each column at
    # exp(0) = 1. Shifting by the global maximum instead lets a column whose
    # scores are far below that maximum underflow to all zeros, which turns
    # the normalisation below into 0 / 0 = NaN.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    expZ = np.exp(shifted)
    return expZ / np.sum(expZ, axis=0, keepdims=True)

def init_params():
    """Create the starting parameters of the 784-10-10 network.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero.

    Returns:
        Tuple (W1, b1, W2, b2) with shapes (10, 784), (10, 1), (10, 10), (10, 1).
    """
    n_inputs, n_hidden, n_outputs = 784, 10, 10
    weight_scale = 0.01

    # The draw order (W1 first, then W2) is kept so that a seeded run
    # produces the same values as before.
    hidden_weights = np.random.randn(n_hidden, n_inputs) * weight_scale
    output_weights = np.random.randn(n_outputs, n_hidden) * weight_scale

    hidden_bias = np.zeros((n_hidden, 1))
    output_bias = np.zeros((n_outputs, 1))
    return hidden_weights, hidden_bias, output_weights, output_bias

def forward_prop(W1, b1, W2, b2, X):
    """Run one forward pass through the two-layer network.

    Args:
        W1, b1: Weights and bias of the hidden (ReLU) layer.
        W2, b2: Weights and bias of the output (softmax) layer.
        X: Input matrix multiplied by W1 from the left.

    Returns:
        Tuple (Z1, A1, Z2, A2): pre-activation and activation of the hidden
        layer, followed by pre-activation and softmax output of the last layer.
    """
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = ReLU(hidden_pre)
    output_pre = np.dot(W2, hidden_act) + b2
    probabilities = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, probabilities

def one_hot(Y, num_classes):
    """Encode integer labels as one-hot columns.

    Args:
        Y: Array of integer class labels.
        num_classes: Number of rows (classes) in the encoding.

    Returns:
        Float array of shape (num_classes, Y.size) in which column i holds a
        single 1 at row Y[i] and zeros elsewhere.
    """
    n_examples = Y.size
    encoded = np.zeros((num_classes, n_examples))
    column_idx = np.arange(n_examples)
    # Fancy indexing pairs each label with its own column
    encoded[Y, column_idx] = 1
    return encoded

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """Compute the parameter gradients for one batch.

    Args:
        Z1, A1: Hidden-layer pre-activation and activation from the forward pass.
        Z2: Output-layer pre-activation (accepted but not used here).
        A2: Softmax output from the forward pass.
        W1: Hidden-layer weights (accepted but not used here).
        W2: Output-layer weights.
        X: Input matrix used in the forward pass.
        Y: Labels; cast to int and one-hot encoded into 10 classes.

    Returns:
        Tuple (dW1, db1, dW2, db2), each averaged over Y.size examples.
    """
    targets = one_hot(Y.astype(int), num_classes=10)
    n_examples = Y.size

    # Output layer: softmax output minus the one-hot targets
    output_err = A2 - targets
    inv_n = 1 / n_examples
    grad_W2 = inv_n * output_err.dot(A1.T)
    grad_b2 = inv_n * np.sum(output_err, axis=1, keepdims=True)

    # Hidden layer: push the error back through W2 and gate it with the
    # ReLU derivative (1 where Z1 > 0, else 0)
    relu_mask = Z1 > 0
    hidden_err = W2.T.dot(output_err) * relu_mask
    grad_W1 = inv_n * hidden_err.dot(X.T)
    grad_b1 = inv_n * np.sum(hidden_err, axis=1, keepdims=True)

    return grad_W1, grad_b1, grad_W2, grad_b2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Take one gradient-descent step on all four parameters.

    Each parameter is decreased by alpha times its gradient using an
    augmented assignment, so NumPy arrays passed in are modified in place.

    Returns:
        Tuple (W1, b1, W2, b2) of the updated parameters.
    """
    stepped = []
    for param, grad in ((W1, dW1), (b1, db1), (W2, dW2), (b2, db2)):
        param -= alpha * grad
        stepped.append(param)
    new_W1, new_b1, new_W2, new_b2 = stepped
    return new_W1, new_b1, new_W2, new_b2

def get_predictions(A2):
    """Return, for every column of A2, the row index holding the largest value."""
    predicted_rows = np.argmax(A2, axis=0)
    return predicted_rows

def get_accuracy(predictions, Y):
    """Return the fraction of entries in predictions that equal Y.

    The count of matching entries is divided by Y.size.
    """
    matches = predictions == Y
    n_correct = np.sum(matches)
    return n_correct / Y.size

def gradient_descent(X, Y, alpha, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: Input matrix passed to forward_prop.
        Y: Labels passed to backward_prop and used for the accuracy report.
        alpha: Learning rate.
        iterations: Number of update steps.

    Returns:
        Tuple (W1, b1, W2, b2) of the trained parameters.
    """
    params = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(*params, X)
        grads = backward_prop(Z1, A1, Z2, A2, params[0], params[2], X, Y)
        params = update_params(*params, *grads, alpha)

        if i % 10 != 0:
            continue
        # A2 comes from the forward pass made before this step's update, so
        # the reported accuracy describes the parameters going into step i.
        accuracy = get_accuracy(get_predictions(A2), Y)
        print(f"Iteration: {i}, Accuracy: {accuracy}")

    W1, b1, W2, b2 = params
    return W1, b1, W2, b2

# Train the model on the whole training set: 500 gradient-descent steps
# with learning rate 0.1
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, alpha=0.1, iterations=500)

# Evaluate on development set
def make_predictions(X, W1, b1, W2, b2):
    """Predict a class for every column of X with the given parameters.

    Runs a forward pass and returns the arg-max of the softmax output.
    """
    probabilities = forward_prop(W1, b1, W2, b2, X)[-1]
    return get_predictions(probabilities)

# Score the trained parameters on the 1000 held-out development examples
dev_predictions = make_predictions(X_dev, W1, b1, W2, b2)
accuracy_dev = get_accuracy(dev_predictions, Y_dev)
print(f"Development Set Accuracy: {accuracy_dev}")

       label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0          1       0       0       0       0       0       0       0       0   
1          0       0       0       0       0       0       0       0       0   
2          1       0       0       0       0       0       0       0       0   
3          4       0       0       0       0       0       0       0       0   
4          0       0       0       0       0       0       0       0       0   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
41995      0       0       0       0       0       0       0       0       0   
41996      1       0       0       0       0       0       0       0       0   
41997      7       0       0       0       0       0       0       0       0   
41998      6       0       0       0       0       0       0       0       0   
41999      9       0       0       0       0       0       0       0       0   

       pixel8  ...  pixel774  pixel775 

**Train with 500 forward/backward propagation steps and learning rate α = 0.1. (The accuracy of the trained network should be about 82 to 85 percent.)**

In [None]:
import numpy as np
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt

# Load the MNIST dataset through Keras. Only the training split is used in
# this cell; X_test / Y_test are not referenced again.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Flatten every image into a single row (one example per row), convert to
# float32 and scale the grayscale values to [0, 1]
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32') / 255.0

# Define functions
def init_params():
    """Create the starting parameters of the 784-10-10 network.

    Every weight and bias is drawn uniformly from [-0.5, 0.5).

    Returns:
        Tuple (W1, b1, W2, b2) with shapes (10, 784), (10, 1), (10, 10), (10, 1).
    """
    shapes = ((10, 784), (10, 1), (10, 10), (10, 1))
    # The generator is consumed in order, so the random draws happen in the
    # sequence W1, b1, W2, b2.
    W1, b1, W2, b2 = (np.random.rand(*shape) - 0.5 for shape in shapes)
    return W1, b1, W2, b2

def ReLU(Z):
    """Rectified linear unit: the element-wise maximum of Z and 0."""
    floor = 0
    return np.maximum(Z, floor)

def softmax(Z):
    """Return the softmax of Z along axis 0, so every column sums to 1.

    Each column is shifted by its own maximum before exponentiating, which
    keeps the exponentials from overflowing.
    """
    col_max = np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(Z - col_max)
    col_totals = np.sum(exp_shifted, axis=0, keepdims=True)
    return exp_shifted / col_totals

def forward_prop(W1, b1, W2, b2, X):
    """Run one forward pass: linear -> ReLU -> linear -> softmax.

    Returns:
        Tuple (Z1, A1, Z2, A2) holding the pre-activation and activation of
        the hidden layer and of the output layer, in that order.
    """
    pre_hidden = np.dot(W1, X) + b1
    act_hidden = ReLU(pre_hidden)
    pre_output = np.dot(W2, act_hidden) + b2
    act_output = softmax(pre_output)
    return pre_hidden, act_hidden, pre_output, act_output

def get_predictions(A2):
    """Pick the row index of the largest entry in each column of A2."""
    best_row_per_column = np.argmax(A2, axis=0)
    return best_row_per_column

def get_accuracy(predictions, Y):
    """Return the number of entries where predictions equals Y, divided by Y.size."""
    hits = np.sum(predictions == Y)
    total = Y.size
    return hits / total

# Hold out the first 1000 examples as the development set. The feature
# matrices are transposed so that each column is one example.
# X_dev must be sliced before X_train is rebound on the following line.
X_dev, Y_dev = X_train[:1000].T, Y_train[:1000]
X_train, Y_train = X_train[1000:].T, Y_train[1000:]
_, m_train = X_train.shape  # m_train = number of training examples (columns)

def gradient_descent(X, Y, alpha, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: Feature matrix with one example per column.
        Y: Integer class labels, one per column of X.
        alpha: Learning rate.
        iterations: Number of update steps.

    Returns:
        Tuple (W1, b1, W2, b2) of the trained parameters.

    Every 100 iterations the accuracy on the module-level X_dev / Y_dev
    arrays is printed.
    """
    W1, b1, W2, b2 = init_params()

    # Size everything from the batch that was actually passed in rather than
    # from the global m_train, so the function also works for other datasets.
    m = X.shape[1]

    # The one-hot targets depend only on Y, so build them once instead of on
    # every iteration. The row count matches the output layer (rows of W2).
    one_hot_Y = np.zeros((W2.shape[0], m))
    one_hot_Y[Y, np.arange(m)] = 1

    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)

        # Output layer: softmax output minus the one-hot targets
        dZ2 = A2 - one_hot_Y
        dW2 = 1 / m * dZ2.dot(A1.T)
        db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
        # Hidden layer: back-propagate through W2, gated by the ReLU derivative
        dZ1 = W2.T.dot(dZ2) * (Z1 > 0)
        dW1 = 1 / m * dZ1.dot(X.T)
        db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)

        W1 -= alpha * dW1
        b1 -= alpha * db1
        W2 -= alpha * dW2
        b2 -= alpha * db2

        if i % 100 == 0:
            _, _, _, A2_dev = forward_prop(W1, b1, W2, b2, X_dev)
            predictions_dev = get_predictions(A2_dev)
            acc_dev = get_accuracy(predictions_dev, Y_dev)
            print(f"Iteration {i}: Development Set Accuracy = {acc_dev}")

    return W1, b1, W2, b2

# Train the model: 500 gradient-descent steps with learning rate 0.1
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, alpha=0.1, iterations=500)

# Calculate the final accuracy on the development set with the trained parameters
_, _, _, A2_dev = forward_prop(W1, b1, W2, b2, X_dev)
predictions_dev = get_predictions(A2_dev)
acc_dev = get_accuracy(predictions_dev, Y_dev)
print(f"Final Development Set Accuracy: {acc_dev}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Iteration 0: Development Set Accuracy = 0.187
Iteration 100: Development Set Accuracy = 0.635
Iteration 200: Development Set Accuracy = 0.751
Iteration 300: Development Set Accuracy = 0.791
Iteration 400: Development Set Accuracy = 0.816
Final Development Set Accuracy: 0.835


**In the hope of getting a higher accuracy, enlarge the neural network by adding an additional hidden layer and more interior nodes. Try 20 nodes in the first and 10 nodes in the second hidden layer. Run propagation with 1000 steps and learning rate α = 0.1. How much improvement does the larger network provide?**

In [None]:
import numpy as np
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt

# Load the MNIST dataset through Keras. Only the training split is used in
# this cell; X_test / Y_test are not referenced again.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Flatten every image into a single row (one example per row), convert to
# float32 and scale the grayscale values to [0, 1]
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32') / 255.0

# Define functions
def init_params():
    """Create the starting parameters of the 784-20-10-10 network.

    Every weight and bias is drawn uniformly from [-0.5, 0.5).

    Returns:
        Tuple (W1, b1, W2, b2, W3, b3) with shapes (20, 784), (20, 1),
        (10, 20), (10, 1), (10, 10), (10, 1).
    """
    param_shapes = [
        (20, 784), (20, 1),   # first hidden layer
        (10, 20), (10, 1),    # second hidden layer
        (10, 10), (10, 1),    # output layer
    ]
    # The list is built front to back, so the random draws happen in the
    # sequence W1, b1, W2, b2, W3, b3.
    drawn = [np.random.rand(rows, cols) - 0.5 for rows, cols in param_shapes]
    return tuple(drawn)

def ReLU(Z):
    """Clamp negative entries of Z to zero and keep the rest unchanged."""
    clamped = np.maximum(Z, 0)
    return clamped

def softmax(Z):
    """Normalise each column of Z into a probability distribution.

    The column maximum is subtracted before exponentiating so the
    exponentials cannot overflow; the result is unchanged by that shift.
    """
    stabilised = Z - np.max(Z, axis=0, keepdims=True)
    numerators = np.exp(stabilised)
    denominators = np.sum(numerators, axis=0, keepdims=True)
    return numerators / denominators

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run one forward pass through the three-layer network.

    Layers: linear -> ReLU -> linear -> ReLU -> linear -> softmax.

    Returns:
        Tuple (Z1, A1, Z2, A2, Z3, A3) with the pre-activation and activation
        of each layer, in order.
    """
    pre_1 = np.dot(W1, X) + b1
    act_1 = ReLU(pre_1)

    pre_2 = np.dot(W2, act_1) + b2
    act_2 = ReLU(pre_2)

    pre_3 = np.dot(W3, act_2) + b3
    act_3 = softmax(pre_3)

    return pre_1, act_1, pre_2, act_2, pre_3, act_3

def get_predictions(A3):
    """Return the index of the largest entry in every column of A3."""
    winners = np.argmax(A3, axis=0)
    return winners

def get_accuracy(predictions, Y):
    """Return the share of labels predicted correctly.

    Counts the positions where predictions equals Y and divides by Y.size.
    """
    correct_mask = predictions == Y
    return np.sum(correct_mask) / Y.size

# Hold out the first 1000 examples as the development set. The feature
# matrices are transposed so that each column is one example.
# X_dev must be sliced before X_train is rebound on the following line.
X_dev, Y_dev = X_train[:1000].T, Y_train[:1000]
X_train, Y_train = X_train[1000:].T, Y_train[1000:]
_, m_train = X_train.shape  # m_train = number of training examples (columns)

def gradient_descent(X, Y, alpha, iterations):
    """Train the three-layer network with full-batch gradient descent.

    Args:
        X: Feature matrix with one example per column.
        Y: Integer class labels, one per column of X.
        alpha: Learning rate.
        iterations: Number of update steps.

    Returns:
        Tuple (W1, b1, W2, b2, W3, b3) of the trained parameters.

    Every 100 iterations the accuracy on the module-level X_dev / Y_dev
    arrays is printed.
    """
    W1, b1, W2, b2, W3, b3 = init_params()

    # Size everything from the batch that was actually passed in rather than
    # from the global m_train, so the function also works for other datasets.
    m = X.shape[1]

    # The one-hot targets depend only on Y, so build them once instead of on
    # every iteration. The row count matches the output layer (rows of W3).
    one_hot_Y = np.zeros((W3.shape[0], m))
    one_hot_Y[Y, np.arange(m)] = 1

    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)

        # Output layer: softmax output minus the one-hot targets
        dZ3 = A3 - one_hot_Y
        dW3 = 1 / m * dZ3.dot(A2.T)
        db3 = 1 / m * np.sum(dZ3, axis=1, keepdims=True)
        # Second hidden layer: back-propagate through W3, gated by ReLU'
        dZ2 = W3.T.dot(dZ3) * (Z2 > 0)
        dW2 = 1 / m * dZ2.dot(A1.T)
        db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
        # First hidden layer: back-propagate through W2, gated by ReLU'
        dZ1 = W2.T.dot(dZ2) * (Z1 > 0)
        dW1 = 1 / m * dZ1.dot(X.T)
        db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)

        W1 -= alpha * dW1
        b1 -= alpha * db1
        W2 -= alpha * dW2
        b2 -= alpha * db2
        W3 -= alpha * dW3
        b3 -= alpha * db3

        if i % 100 == 0:
            _, _, _, _, _, A3_dev = forward_prop(W1, b1, W2, b2, W3, b3, X_dev)
            predictions_dev = get_predictions(A3_dev)
            acc_dev = get_accuracy(predictions_dev, Y_dev)
            print(f"Iteration {i}: Development Set Accuracy = {acc_dev}")

    return W1, b1, W2, b2, W3, b3

# Train the model: 1000 gradient-descent steps with learning rate 0.1
W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, Y_train, alpha=0.1, iterations=1000)

# Calculate the final accuracy on the development set with the trained parameters
_, _, _, _, _, A3_dev = forward_prop(W1, b1, W2, b2, W3, b3, X_dev)
predictions_dev = get_predictions(A3_dev)
acc_dev = get_accuracy(predictions_dev, Y_dev)
print(f"Final Development Set Accuracy: {acc_dev}")

Iteration 0: Development Set Accuracy = 0.108
Iteration 100: Development Set Accuracy = 0.731
Iteration 200: Development Set Accuracy = 0.811
Iteration 300: Development Set Accuracy = 0.828
Iteration 500: Development Set Accuracy = 0.86
Iteration 600: Development Set Accuracy = 0.868
Iteration 700: Development Set Accuracy = 0.874
Iteration 800: Development Set Accuracy = 0.877
Iteration 900: Development Set Accuracy = 0.882
Final Development Set Accuracy: 0.887


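**This larger network, with more nodes and an additional hidden layer, improves accuracy compared to the previous smaller network: in the runs above, the final development-set accuracy rises from 0.835 to 0.887, a gain of about 5 percentage points (note that the larger network was also trained for 1000 steps instead of 500). The printout during training shows the development-set accuracy at different iterations, providing insight into the network's learning progress.**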