# Neural Networks

Neural networks are widely considered the state-of-the-art approach to machine learning, but it should be noted that a neural network is not a single method, such as Logistic Regression or SVMs; rather, it comes in various flavors and forms. Here we develop the theory from the simplest form, a single neuron (the perceptron), to a complete ML paradigm called "deep learning".

## A single neuron - "Perceptron"

The perceptron represents a single neuron and gives its name to neural networks, i.e. networks of multiple interconnected neurons. It is a function that maps inputs, such as $x_1$ and $x_2$ plus a bias input $-1$, to the desired output $y$ through a nonlinear mapping such as the logistic sigmoid, i.e. $y=\operatorname{logsig}(w_1x_1+w_2x_2-w_0)$. The neuron "learns" from training data by adjusting its weights, $\vec{w}=(w_0, w_1, w_2)^T$, through gradient descent.



### Demo: Training a single neuron to represent $x_1$ OR $x_2$ logical function
Let's train a single neuron using the gradient rules derived in the lectures. In this example you should try different values for the learning rate and for the initialization of the network weights. Both affect the speed of convergence.

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt

# Truth table of the logical OR function: each row of x is one input pair
# (x_1, x_2) and the same row of the column vector y holds its target.
x = np.array([
    [0, 0],
    [1, 0],
    [1, 1],
    [0, 1],
])
y = np.array([0, 1, 1, 1]).reshape(-1, 1)

# Settings of the training run.
# Weight order is w1, w2, w0. Random initialization is good practice;
# other starting points worth testing: [-10,0,0] and [-100,0,0].
w_t = [0, 0, 0]
num_of_epochs = 1000
learning_rate = 0.5

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of a scalar x.

    The result is computed in a numerically stable way: for negative x the
    equivalent form e^x / (1 + e^x) is used, so math.exp never receives a
    large positive argument and cannot raise OverflowError.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # exp(x) underflows harmlessly to 0.0 for very negative x
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

# One error value per epoch, measured after that epoch's weight updates
MSE = np.zeros([num_of_epochs,1])
# Allocated up front so the final report also works with zero epochs
y_h = np.zeros([x.shape[0],1])
for e in range(num_of_epochs):
    # Stochastic gradient descent: the weights change after every sample
    for n in range(x.shape[0]):
        # Scalar target; y[n] alone is a 1-element array and would turn
        # the weights into arrays as well
        y_gt = y[n,0]
        y_hat = sigmoid(w_t[0]*x[n,0] + w_t[1]*x[n,1] - w_t[2])
        # Sigmoid derivative expressed through the neuron's own output:
        # s'(a) = s(a)*(1 - s(a)) = y_hat*(1 - y_hat)
        sigma_sigmoid = y_hat*(1-y_hat)

        # Gradients of the squared error (y - y_hat)^2 w.r.t. each weight.
        # w0 enters the activation with a minus sign, hence the sign flip.
        sigma_w1 = -2*(y_gt-y_hat)*sigma_sigmoid*x[n,0]
        sigma_w2 = -2*(y_gt-y_hat)*sigma_sigmoid*x[n,1]
        sigma_w0 = 2*(y_gt-y_hat)*sigma_sigmoid

        # Gradient descent step
        w_t[0] = w_t[0] - learning_rate*sigma_w1
        w_t[1] = w_t[1] - learning_rate*sigma_w2
        w_t[2] = w_t[2] - learning_rate*sigma_w0

    # Evaluate the updated neuron on all samples and store the mean
    # squared error of this epoch
    for n in range(x.shape[0]):
        y_h[n] = sigmoid(w_t[0]*x[n,0] + w_t[1]*x[n,1] - w_t[2])
    MSE[e] = np.mean((y-y_h)**2)

# Report the learned weights (order w1, w2, w0) and the final predictions
print(w_t)
for n in range(x.shape[0]):
    y_h[n] = sigmoid(w_t[0]*x[n,0] + w_t[1]*x[n,1] - w_t[2])
    print(f'GT: y({n})={y[n]} ; Pred: y_h({n})={y_h[n]}')

# Learning curve: error as a function of the epoch number
plt.plot(range(num_of_epochs),MSE)
plt.show()

## A fully connected network

A fully connected neural network, à la the "Multi-layer Perceptron", is the vanilla ice cream of neural computation.


### Training a fully connected network of three neurons to represent $x_1$ XOR $x_2$

We implement the backward pass of the gradient (backpropagation) to train this network.

In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt

# Truth table of the logical XOR function: each row of x is one input pair
# (x_1, x_2) and the same row of the column vector y holds its target.
x = np.array([
    [0, 0],
    [1, 0],
    [1, 1],
    [0, 1],
])
y = np.array([0, 1, 0, 1]).reshape(-1, 1)

# One 3x1 weight column (order w1, w2, w0) per neuron, drawn from a normal
# distribution with mean 0 and standard deviation 1.
# Fixed starting points that can be tried instead:
#   w_1_t = [-10,-10,-10], w_2_t = [100,-100,50], w_3_t = [-100,100,50]
w_1_t, w_2_t, w_3_t = (np.random.normal(0, 1, [3, 1]) for _ in range(3))
num_of_epochs = 1000
learning_rate = 0.5

def sigmoid(x):
    """Logistic sigmoid 1 / (1 + e^(-x)), evaluated with np.exp.

    Accepts a scalar or a NumPy array; arrays are processed element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def neuron_forward(x1,x2,w1,w2,w0):
    """Output of one two-input neuron.

    The inputs x1 and x2 are weighted by w1 and w2, the bias weight w0 is
    subtracted, and the result is passed through sigmoid().
    """
    activation = w1*x1 + w2*x2 - w0
    return sigmoid(activation)


def _network_forward(x1, x2):
    """Forward pass of the three-neuron network for one input pair.

    Reads the current module-level weights w_1_t, w_2_t and w_3_t and
    returns (y_1, y_2, y_3): the output of the final neuron followed by
    the outputs of the two hidden neurons that feed it.
    """
    y_2 = neuron_forward(x1, x2, w_2_t[0], w_2_t[1], w_2_t[2])
    y_3 = neuron_forward(x1, x2, w_3_t[0], w_3_t[1], w_3_t[2])
    y_1 = neuron_forward(y_2, y_3, w_1_t[0], w_1_t[1], w_1_t[2])
    return y_1, y_2, y_3


# One error value per epoch, measured after that epoch's weight updates
MSE = np.zeros([num_of_epochs,1])
# Allocated up front so the final report also works with zero epochs
y_h = np.zeros([x.shape[0],1])
for e in range(num_of_epochs):

    # Stochastic gradient descent: the weights change after every sample
    for n in range(x.shape[0]):

        x1 = x[n,0]
        x2 = x[n,1]
        y_gt = y[n]

        # Forward flow from inputs to output
        y_1, y_2, y_3 = _network_forward(x1, x2)

        # Gradient of the squared error (y_gt - y_1)^2 w.r.t. the output
        sigma_loss = -2*(y_gt-y_1)

        # Gradient at the activation of f1 (y*(1-y) is the sigmoid derivative)
        delta_f1 = sigma_loss*y_1*(1-y_1)

        # f1 gradient backward flow. This must use the weights that produced
        # y_1 in the forward pass, so it is computed BEFORE f1 is updated.
        sigma_f1_f2 = delta_f1*w_1_t[0]
        sigma_f1_f3 = delta_f1*w_1_t[1]

        # f1 weight gradients (the bias weight enters with a minus sign)
        sigma_f1_w1 = delta_f1*y_2
        sigma_f1_w2 = delta_f1*y_3
        sigma_f1_w0 = -delta_f1

        # f2 weight gradients
        delta_f2 = sigma_f1_f2*y_2*(1-y_2)
        sigma_f2_w1 = delta_f2*x1
        sigma_f2_w2 = delta_f2*x2
        sigma_f2_w0 = -delta_f2

        # f3 weight gradients
        delta_f3 = sigma_f1_f3*y_3*(1-y_3)
        sigma_f3_w1 = delta_f3*x1
        sigma_f3_w2 = delta_f3*x2
        sigma_f3_w0 = -delta_f3

        # Gradient descent step, applied only after every gradient is known
        w_1_t[0] = w_1_t[0] - learning_rate*sigma_f1_w1
        w_1_t[1] = w_1_t[1] - learning_rate*sigma_f1_w2
        w_1_t[2] = w_1_t[2] - learning_rate*sigma_f1_w0

        w_2_t[0] = w_2_t[0] - learning_rate*sigma_f2_w1
        w_2_t[1] = w_2_t[1] - learning_rate*sigma_f2_w2
        w_2_t[2] = w_2_t[2] - learning_rate*sigma_f2_w0

        w_3_t[0] = w_3_t[0] - learning_rate*sigma_f3_w1
        w_3_t[1] = w_3_t[1] - learning_rate*sigma_f3_w2
        w_3_t[2] = w_3_t[2] - learning_rate*sigma_f3_w0

    # Evaluate the updated network on all samples and store the mean
    # squared error of this epoch
    for n in range(x.shape[0]):
        y_h[n] = _network_forward(x[n,0], x[n,1])[0]
    MSE[e] = np.mean((y-y_h)**2)

# Report the learned weights (hidden neurons first, then the output neuron)
print(w_2_t)
print(w_3_t)
print(w_1_t)
for n in range(x.shape[0]):
    y_h[n] = _network_forward(x[n,0], x[n,1])[0]
    print(f'Input x=({x[n,:]}) GT: y({n})={y[n,0]:.2f} ; Pred: y_h({n})={y_h[n,0]:.2f}')

# Learning curve: error as a function of the epoch number
plt.plot(range(num_of_epochs),MSE)
plt.show()

## Convolutional neural network

For large images a fully connected network has too many parameters to train, and it does not learn invariance to small translations, which is essential. Both of these problems are solved by convolutional neural networks.

### Vanilla neural network using Keras and TensorFlow for MNIST Handwritten Digits dataset

In [None]:
import tensorflow as tf
import keras
import numpy as np
# Report which TensorFlow version this notebook is running against
print("TensorFlow version:", tf.__version__)

Let's first load the MNIST dataset

In [None]:
# Fetch the MNIST train and test arrays through the Keras dataset helper
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the pixel values by 255 (this also converts them to floating point)
x_train = x_train / 255.0
x_test = x_test / 255.0

# Report the shape of every array, then the range of the class labels
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)
print(f' Min y (class) values is {np.min(y_test)} and max {np.max(y_test)}')

Let's then display examples of data

In [None]:
import matplotlib.pyplot as plt

print(x_train.shape)

# Show the first few training images side by side, each titled with its
# class label (same title format for every panel)
num_examples = 4
fig, axes = plt.subplots(1, num_examples, figsize=[12,3])
for idx, ax in enumerate(axes):
    ax.imshow(x_train[idx,:,:])
    ax.set_title(f'Class number {y_train[idx]}')
plt.show()

Let's make one fully connected layer. You may play with the number of neurons in that layer.

In [None]:
model = tf.keras.models.Sequential()

# Flatten each 28x28 input image to a vector of 784 values
model.add(keras.layers.Flatten(input_shape=(28,28)))
print(model.output_shape)

# Add a fully connected hidden layer (try different numbers of neurons)
model.add(keras.layers.Dense(32, activation='sigmoid'))
print(model.output_shape)

# Add the final layer: one raw score (logit) per class, 10 classes in total.
# No activation here, because the loss this model is compiled with is created
# with from_logits=True and therefore expects unnormalized scores.
model.add(keras.layers.Dense(10))
print(model.output_shape)

Let's compile the model.

In [None]:
# Sparse categorical cross-entropy takes care of the one-hot encoding of the
# targets (see https://keras.io/api/losses/).
# NOTE(review): from_logits=True expects the model's last layer to output raw
# scores rather than probabilities - confirm against the model definition.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    loss=loss_fn,
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Let's see how well it does without training

In [None]:
# Baseline before any training: with random weights the accuracy should be
# near chance level (1/10)
model.evaluate(x=x_test, y=y_test, verbose=2)

Let's train for some number of epochs

In [None]:
# Train on the training images and their labels for 10 epochs
model.fit(x=x_train, y=y_train, epochs=10)

Let's test the trained model with our withheld test data.

In [None]:
# Predict on the whole test set, then keep the class scores of the first
# ten samples only
y_test_hat = model.predict(x_test)[:10]

# Ground-truth labels of those ten samples ...
print(y_test[:10])

# ... followed by the predicted class (index of the highest score per row)
print(y_test_hat.argmax(axis=1))

# Loss and accuracy over the complete test set
model.evaluate(x=x_test, y=y_test, verbose=2)

### Convolutional neural network (CNN)

In [None]:
model2 = tf.keras.models.Sequential()

# Input: 28x28 images with an explicit single-channel axis
model2.add(keras.layers.Input(shape=(28,28,1)))

# Convolutional layer: 16 filters of size 5x5 applied with stride 2.
# Shapes are queried only from here on; a Sequential that holds nothing but
# the Input may not be built yet, in which case output_shape is unavailable.
model2.add(keras.layers.Conv2D(16,kernel_size=(5,5),strides=(2,2)))
print(model2.input_shape)
print(model2.output_shape)

# Flatten the feature maps to a vector
model2.add(keras.layers.Flatten())
print(model2.output_shape)

# Output layer: fully connected, one probability per class. Softmax makes
# the 10 outputs sum to one, as the categorical cross-entropy loss expects.
model2.add(keras.layers.Dense(10, activation='softmax'))
print(model2.output_shape)

In [None]:
# Categorical cross-entropy is selected by name here; the targets are
# therefore supplied as one-hot vectors.
# An alternative to experiment with is
# tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), which
# takes care of the one-hot encoding (see https://keras.io/api/losses/).
model2.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model2.summary()

We need to convert the training data to the format assumed by a convolutional filter (adding one more dimension to make the channel axis explicit) and convert y explicitly to one-hot encoding.

In [None]:
# Make sure images have shape (28, 28, 1)
# Make sure images have shape (28, 28, 1).
# The channel axis is added only while the arrays are still 3-D, so running
# this cell a second time does not append yet another axis.
if x_train.ndim == 3:
    x_train = np.expand_dims(x_train, -1)
if x_test.ndim == 3:
    x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

# convert class vectors to binary class matrices (one-hot, 10 classes)
y_train_cat = keras.utils.to_categorical(y_train, 10)
y_test_cat = keras.utils.to_categorical(y_test, 10)

Now training.

In [None]:
# Train the CNN on the images and their one-hot targets for 5 epochs
model2.fit(x=x_train, y=y_train_cat, epochs=5)

Testing.

In [None]:
# Predict on the whole test set, then keep the class scores of the first
# ten samples only
y_test_hat = model2.predict(x_test)[:10]

# Ground-truth labels of those ten samples ...
print(y_test[:10])

# ... followed by the predicted class (index of the highest score per row)
print(y_test_hat.argmax(axis=1))

# Loss and accuracy over the complete test set (one-hot targets)
model2.evaluate(x=x_test, y=y_test_cat, verbose=2)

This version uses more tricks and flavors, but we need to learn about backbone networks first.

In [None]:
# With more tricks (ReLu and MaxPooling)
model2 = tf.keras.models.Sequential()

model2.add(keras.layers.Input(shape=(28,28,1)))
print(model2.output_shape)
           
# Flatten input image to a vector
model2.add(keras.layers.Conv2D(16,kernel_size=(3,3)))
print(model2.output_shape)

# Flatten input image to a vector
model2.add(keras.layers.MaxPooling2D(pool_size=(2,2)))
print(model2.output_shape)

# Flatten input image to a vector
model2.add(keras.layers.Conv2D(32,kernel_size=(3,3)))
print(model2.output_shape)

# Flatten input image to a vector
model2.add(keras.layers.MaxPooling2D(pool_size=(2,2)))
print(model2.output_shape)


# Flatten input image to a vector
model2.add(keras.layers.Flatten(input_shape=(28,28)))
print(model2.output_shape)

# Add dropout "layer"
model2.add(keras.layers.Dropout(0.2))
print(model2.output_shape)

# Add a full connected layer
model2.add(keras.layers.Dense(10, activation='softmax'))
print(model2.output_shape)

## References

TensorFlow Tutorials. URL: https://www.tensorflow.org/tutorials