# Neural Network Traiing

## Loss Function - Cross Entropy

In [2]:
# import packa
import numpy as np


In [3]:
# y is predicted value
# t is true value (label)
def cross_entroy_error(y, t):
    delta = 1e-7  # To avoid log(0)
    return -np.sum(t * np.log(y + delta))


In [8]:
def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c)  # prevent overflow
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

In [9]:
# test the function
y1 = np.array([0.1, 0.9, 0.8])
y2 = np.array([0.8, 0.1, 0.5])

y1_prob = softmax(y1)
y2_prob = softmax(y2)

t = np.array([0, 1, 0])
loss1 = cross_entroy_error(y1_prob, t)
print(f'Cross-Entropy Loss between {y1_prob} and {t} = {loss1}.')
loss2 = cross_entroy_error(y2_prob, t)
print(f'Cross-Entropy Loss between {y2_prob} and {t} = {loss2}')

Cross-Entropy Loss between [0.19086542 0.42477881 0.38435576] and [0 1 0] = 0.856186451532546.
Cross-Entropy Loss between [0.44694665 0.22194714 0.33110622] and [0 1 0] = 1.5053156021257363


## Mini-Batch Version of `cross_entropy_error()`

In [10]:
# show what shape is changed
print(f'y1 shape: {y1.shape} -> y1_prob shape: {y1_prob.shape}')
print(f'y2 shape: {y2.shape} -> y2_prob shape: {y2_prob.shape}')

y1 shape: (3,) -> y1_prob shape: (3,)
y2 shape: (3,) -> y2_prob shape: (3,)


In [11]:
y1.shape[0]

3

In [12]:
# batch version of cross_entropy_error
def cross_entropy_error_batch(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    delta = 1e-7  # To avoid log(0)
    return -np.sum(t * np.log(y + delta)) / batch_size

In [23]:
y1 = np.array([[0.1, 0.9, 0.8],
    [0.8, 0.1, 0.5],
    [0.2, 0.3, 0.5]])
y2 = np.array([[0.9, 0.1, 0.8],
    [0.8, 0.1, 0.5],
    [0.2, 0.3, 0.5]])
y1_prob = np.apply_along_axis(softmax, 1, y1)
y2_prob = np.apply_along_axis(softmax, 1, y2)

t = np.array([[0, 1, 0],
     [1, 0, 0],
     [0, 0, 1]])

print(f'y1 = {y1}')
print(f'y2 = {y2}')
print(f'y1_prob = {y1_prob}')
print(f'y2_prob = {y2_prob}')
print(f't = {t}')

y1 = [[0.1 0.9 0.8]
 [0.8 0.1 0.5]
 [0.2 0.3 0.5]]
y2 = [[0.9 0.1 0.8]
 [0.8 0.1 0.5]
 [0.2 0.3 0.5]]
y1_prob = [[0.19086542 0.42477881 0.38435576]
 [0.44694665 0.22194714 0.33110622]
 [0.28943311 0.31987306 0.39069383]]
y2_prob = [[0.42477881 0.19086542 0.38435576]
 [0.44694665 0.22194714 0.33110622]
 [0.28943311 0.31987306 0.39069383]]
t = [[0 1 0]
 [1 0 0]
 [0 0 1]]


In [22]:
# test cross_entropy_error_batch
loss1 = cross_entropy_error_batch(y1_prob, t)
loss2 = cross_entropy_error_batch(y2_prob, t)
print(f'Cross-Entropy Loss (batch) of y1_prob = {loss1}.')
print(f'Cross-Entropy Loss (batch) of y2_prob = {loss2}.')

Cross-Entropy Loss (batch) of y1_prob = 0.8671110284550632.
Cross-Entropy Loss (batch) of y2_prob = 1.1337775989508578.


## Numberical Differentiation

## 

In [24]:
def numerical_diff(f, x):
    h = 1e-4  # 0.0001
    return (f(x + h) - f(x - h)) / (2 * h)

In [25]:
# define a sample function
def function_1(x):  
    return x**2 + 0.1 * x

In [26]:
s2 = numerical_diff(function_1, 0.2)
s4 = numerical_diff(function_1, 0.4)
print(f'The numerical differentiation of function_1 at x=0.2 is {s2}.')
print(f'The numerical differentiation of function_1 at x=0.4 is {s4}.')


The numerical differentiation of function_1 at x=0.2 is 0.49999999999994493.
The numerical differentiation of function_1 at x=0.4 is 0.9000000000000674.


## Partial Derivatives

In [27]:
# Partial Derivatives when x0 = 3, x1 = 4

def function_temp1(x0):
    return x0**2 + 4.0**2


def function_temp2(x1):
    return 3.0**2 + x1**2  

dx0 = numerical_diff(function_temp1, 3.0)
dx1 = numerical_diff(function_temp2, 4.0)
print(f'The partial derivative with respect to x0 at (3, 4) is {dx0}.')
print(f'The partial derivative with respect to x1 at (3, 4) is {dx1}.')

The partial derivative with respect to x0 at (3, 4) is 6.00000000000378.
The partial derivative with respect to x1 at (3, 4) is 7.999999999999119.


## Slopes - `numerical_gradient()`

Slopes are a vector of the partical derivaties of all variables.

In [8]:
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # Initialize gradient array

    for idx in range(x.size):
        tmp_val = x[idx]

        # f(x + h)
        x[idx] = tmp_val + h
        fxh1 = f(x)

        # f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # Restore original value

    return grad

In [31]:
# test the function
def function_2(x):
    return x[0]**2 + x[1]**2

grad = numerical_gradient(function_2, np.array([3.0, 4.0]))
print(f'The numerical gradient of function_2 at (3.0, 4.0) is {grad}.') 

grad = numerical_gradient(function_2, np.array([0.0, 2.0]))
print(f'The numerical gradient of function_2 at (0.0, 2.0) is {grad}.')

grad = numerical_gradient(function_2, np.array([0.0, 0.0])) 
print(f'The numerical gradient of function_2 at (0.0, 0.0) is {grad}.')


The numerical gradient of function_2 at (3.0, 4.0) is [6. 8.].
The numerical gradient of function_2 at (0.0, 2.0) is [0. 4.].
The numerical gradient of function_2 at (0.0, 0.0) is [0. 0.].


## Gradient Decent Method

In [6]:
import numpy as np

In [19]:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    x = init_x

    for i in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad

        # if grad.all() < 1e-5:
        #     break

    return x #, i

In [2]:
# test the function
def function_2(x):
    return x[0]**2 + x[1]**2

In [34]:
int_x = np.array([400.0, 300.0])
x = gradient_descent(function_2, int_x, lr=0.1, step_num=200)
print(f'The local minimum point found by gradient descent is {x}.')

The local minimum point found by gradient descent is [1.65965717e-17 1.24478705e-17].


In [23]:
x = gradient_descent(function_2, int_x, lr=0.0001, step_num=100)
print(f'The local minimum point found by gradient descent is {x}.')

The local minimum point found by gradient descent is [4.69649683e-09 3.52237262e-09].


## SimpleNet class

In [None]:
class Utility:
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def softmax(self, a):
        c = np.max(a)
        exp_a = np.exp(a - c)  # prevent overflow
        sum_exp_a = np.sum(exp_a)
        y = exp_a / sum_exp_a
        return y

    def cross_entropy_error_batch(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
        batch_size = y.shape[0]
        delta = 1e-7  # To avoid log(0)
        return -np.sum(t * np.log(y + delta)) / batch_size
    
    def numerical_gradient(self, f, x):
        h = 1e-4  # 0.0001
        grad = np.zeros_like(x)  # Initialize gradient array

        for idx in range(x.size):
            tmp_val = x[idx]

            # f(x + h)
            x[idx] = tmp_val + h
            fxh1 = f(x)

            # f(x - h)
            x[idx] = tmp_val - h
            fxh2 = f(x)

            grad[idx] = (fxh1 - fxh2) / (2 * h)
            x[idx] = tmp_val  # Restore original value

        return grad

In [38]:
# import utility 
 
class SimpleNet:
    def __init__(self):
        self.W = np.random.randn(2, 3)  # weight initialization
        self.util = Utility()

    def predict(self, x):
        return np.dot(x, self.W)

    def loss(self, x, t):
        z = self.predict(x)
        y = self.util.softmax(z)
        loss = self.util.cross_entropy_error_batch(y, t)
        return loss

In [39]:
# test the SimpleNet class
net = SimpleNet()
print(f'Initial weight W: {net.W}')
x = np.array([0.6, 0.9])
p = net.predict(x)
print(f'Predicted value: {p}')
t = np.array([0, 0, 1])  # true label
loss = net.loss(x, t)
print(f'Loss value: {loss}.')

Initial weight W: [[ 2.50649385  1.9619885  -0.01547902]
 [ 1.65268115  0.71875699  0.62257074]]
Predicted value: [2.99130935 1.82407439 0.55102625]
Loss value: 2.7755833883781973.


## Two-Layer Net for MNIST

In [1]:
import numpy as np
from utility import Utility

class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size):
        # Weight initialization
        self.params = {}
        self.params['w1'] = np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['w2'] = np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        self.util = Utility()
    
    def predict(self, x):
        w1, b1 = self.params['w1'], self.params['b1']
        w2, b2 = self.params['w2'], self.params['b2']

        a1 = np.dot(x, w1) + b1
        z1 = self.util.sigmoid(a1)
        a2 = np.dot(z1, w2) + b2
        y = self.util.softmax(a2)

        return y
    
    def loss(self, x, t):   
        y_hat = self.predict(x)
        return self.util.cross_entropy_error_batch(y_hat, t)
    
    def accuracy(self, x, t):
        y_hat = self.predict(x)
        y_pred = np.argmax(y_hat, axis=1)
        t_true = np.argmax(t, axis=1)

        accuracy = np.sum(y_pred == t_true) / float(x.shape[0])
        return accuracy
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['w1'] = self.util.numerical_gradient(loss_W, self.params['w1'])
        grads['b1'] = self.util.numerical_gradient(loss_W, self.params['b1'])
        grads['w2'] = self.util.numerical_gradient(loss_W, self.params['w2'])
        grads['b2'] = self.util.numerical_gradient(loss_W, self.params['b2'])
        
        return grads

In [2]:
#import mnist_data
from mnist_data import MnistData

mnist = MnistData()

Downloading train-images-idx3-ubyte.gz...
train-images-idx3-ubyte.gz already exists. Skipping download.
Downloading train-labels-idx1-ubyte.gz...
train-labels-idx1-ubyte.gz already exists. Skipping download.
Downloading t10k-images-idx3-ubyte.gz...
t10k-images-idx3-ubyte.gz already exists. Skipping download.
Downloading t10k-labels-idx1-ubyte.gz...
t10k-labels-idx1-ubyte.gz already exists. Skipping download.
mnist.pkl already exists. Loading dataset from pickle file.


In [3]:
# test the Two-Layer Net for MNIST
net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
(train_images, train_labels), (test_images, test_labels) = mnist.get_dataset()

In [10]:
# convert train_labels and test_labels to one-hot encoding
num_classes = 10

def to_one_hot(labels, num_classes):
    labels = np.array(labels).reshape(-1)
    return np.eye(num_classes, dtype=np.float32)[labels]

if train_labels.ndim != 2 or train_labels.shape[1] != num_classes:
    train_labels = to_one_hot(train_labels, num_classes)

if test_labels.ndim != 2 or test_labels.shape[1] != num_classes:
    test_labels = to_one_hot(test_labels, num_classes)

print(f"Converted train_labels -> {train_labels.shape}, test_labels -> {test_labels.shape}")

Converted train_labels -> (60000, 10), test_labels -> (10000, 10)


In [12]:
train_labels.shape

(60000, 10)

In [4]:
# let's flat the images
train_images = train_images.reshape(train_images.shape[0], 784)
test_images = test_images.reshape(test_images.shape[0], 784)

In [11]:
train_images.shape

(60000, 784)

In [6]:
test_images.shape

(10000, 784)

In [5]:
iter_num = 100  #10,000 preferably 
train_size = int(train_images.shape[0]/6) # make this 1/6th for faster run
batch_size = 100
learning_rate = 0.1

iter_per_epoch = max(train_size / batch_size, 1)

In [8]:
train_size

10000

In [6]:
train_loss_list = []
train_acc_list = []
test_acc_list = []

In [13]:
for i in range(iter_num):
    # Mini-batch selection
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = train_images[batch_mask]
    t_batch = train_labels[batch_mask]

    # Gradient calculation
    grad = net.numerical_gradient(x_batch, t_batch)

    # Parameter update
    for key in ('w1', 'b1', 'w2', 'b2'):
        net.params[key] -= learning_rate * grad[key]

    # Record the loss
    loss = net.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Evaluate accuracy at each epoch
    if i % iter_per_epoch == 0:
        train_acc = net.accuracy(train_images, train_labels)
        test_acc = net.accuracy(test_images, test_labels)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(f"Iteration {i}: Train Accuracy = {train_acc}, Test Accuracy = {test_acc}")



# plot the loss curve
import matplotlib.pyplot as plt

plt.title('Training Loss')
plt.plot(x, train_acc_list, label='Train Accuracy')
plt.plot(x, test_acc_list, label='Test Accuracy', linestyle='--')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

Iteration 0: Train Accuracy = 0.10905, Test Accuracy = 0.099


KeyboardInterrupt: 