In this assignment we implement a Deep Neural Network
Implement from scratch (hence not using NN libraries such as tensorflow,
keras, pytorch) a Neural Network that is able to approximate a function of
two variables f(x, y).\
The network will have at least two hidden layers and it
is trained using the backpropagation algorithm.\
The test case we consider is f(x, y) = x · y (we can say that we teach the network to multiply two numbers).\
As training data for the algorithm construct a matrix of input output triplets
of the function f(x, y) = x · y.\
When x, y ∈ [0, 1] the matrix would contain training triplets of the form [xi
, yj , xi· yj ], where xi and yj are random values in [0, 1].\
Try different activation functions and find out which one performs best with your network.\
For validation accuracy calculate the root mean square error.\
Add visualization of the RMSE over training epochs, and perform simple
tests to check accuracy.

\begin{equation}
\text{standard form}\\
o_h=\sum_{l=1}^{10}w_l^4\phi\left(\sum_{k-1}^{10}w_{lk}^3\phi\left(\sum_{j=1}^{10}w_{jk}^2\phi\left(\sum_{i=1}^2w_{ji}^1x_{hi}+b_j^1\right)+b_k^2\right)b_l^3\right)+c\\
=======================================\\
\text{vector form}\\
o_h=W^4\phi\left(W^3\phi\left(W^2\phi\left(W^1x_h+B^1\right)+B^2\right)+B^3\right)+c\\
=======================================\\
W^1\rightarrow 10\times2\\
W^2\rightarrow 10\times10\\
W^3\rightarrow 10\times10\\
W^4\rightarrow 1\times10\\
x_h\rightarrow 2\times1\\
B^1\rightarrow 10\times1\\
B^2\rightarrow 10\times1\\
B^3\rightarrow 10\times1\\
c\rightarrow \text{scalar}\\
o_h\rightarrow \text{scalar}\\
=======================================\\
o_h=W^4\phi\left(W^3\phi\left(W^2\phi\left((10\times2)(2\times1)+(10\times1)\right)+B^2\right)+B^3\right)+c\\
o_h=W^4\phi\left(W^3\phi\left(W^2\phi\left(10\times1\right)+B^2\right)+B^3\right)+c\\
o_h=W^4\phi\left(W^3\phi\left((10\times10)\left(10\times1\right)+(10\times1)\right)+B^3\right)+c\\
o_h=W^4\phi\left((10\times10)\left(10\times1\right)+(10\times1)\right)+c\\
o_h=(1\times10)\left(10\times1\right)+c\\
o_h=\text{scalar}+\text{scalar}\\
\end{equation}

In [82]:
# initialize the weights
import numpy as np
W1 = np.random.randn(10,2) * np.sqrt(1/2) # 10x2
W2 = np.random.randn(10,10) * np.sqrt(1/10) # 10x10
W3 = np.random.randn(10,10) * np.sqrt(1/10) # 10x10
W4 = np.random.randn(1,10) * np.sqrt(1/10) # 1x10

In [83]:
#initialize the biases
import numpy as np
b1 = np.zeros((10,1)) # 10x1
b2 = np.zeros((10,1)) # 10x1
b3 = np.zeros((10,1)) # 10x1
c = 0 # scalar

In [67]:
# activations
import numpy as np
def htan(x):
  return np.tanh(x)

# predictions and recordings
def predict_and_record(x):
  preA1 = (W1@x) + b1 # calculate pre-activations for layer 1
  A1 = htan(preA1) # calculate activations for layer 1
  preA2 = (W2@A1) + b2 # calculate pre-activations for layer 2
  A2 = htan(preA2) # calculate activations for layer 2
  preA3 = (W3@A2) + b3 # calculate pre-activations for layer 3
  A3 = htan(preA3) # calculate activations for layer 3

  # pre-activation values for the output layer will also be the activation values
  # because we are using a linear activation function for the output layer
  O = (W4@A3) + c # scalar

  #return W4@htan( W3@htan( W2@htan( W1@x +b1 ) +b2 ) +b3 ) +c
  return preA1, A1, preA2, A2, preA3, A3, O

In [68]:
# build the data
import numpy as np
def create_data(n_observations):
    train_data = np.random.rand(2*n_observations).reshape((n_observations, 2))
    label_data = train_data[:,0] * train_data[:,1]
    return train_data, label_data


In [77]:
# visualize the data
training_size = 10000
X, y = create_data(training_size)

print(X)
print(y)

[[0.64852341 0.23496672]
 [0.96615254 0.97809654]
 [0.002566   0.20810023]
 ...
 [0.87495187 0.75396298]
 [0.77661787 0.77773342]
 [0.79637192 0.29797985]]
[1.52381419e-01 9.44990466e-01 5.33984944e-04 ... 6.59681317e-01
 6.04001676e-01 2.37302788e-01]


In [85]:
# train the network
import numpy as np
def htan_prime(x):
  return 1 - np.tanh(x)**2

mu = 0.1
def single_backPropagation(x, W1, W2, W3, W4, b1, b2, b3, c, z1, a1, z2, a2, z3, a3, o, y):
  # calculate the gradient of the loss with respect to the pre-activations of each layer
  output_gradient = o-y # scalar
  layer3_gradient = (W4.T * output_gradient) * htan_prime(o) # 10x1
  layer2_gradient = (W3.T @ layer3_gradient) * htan_prime(z3) # 10x1
  layer1_gradient = (W2.T @ layer2_gradient) * htan_prime(z2) # 10x1

  # gradient of right layer, pre-activations of right layer, activations of previous layer
  loss_W4 = output_gradient * htan_prime(o) * a3 # 10x1
  loss_c = output_gradient * htan_prime(o) # 10x1
  W4 -= (mu * loss_W4).T
  c -= (mu * loss_c)
  # update (W3) with EW3 and (B3) with Eb3
  loss_W3 = layer3_gradient * htan_prime(z3) @ a2.T # 10x10
  loss_b3 = layer3_gradient * htan_prime(z3) # 10x1
  W3 -= (mu * loss_W3)
  b3 -= (mu * loss_b3)
  # update (W2) with EW2 and (B2) with Eb2
  loss_W2 = layer2_gradient * htan_prime(z2) @ a1.T # 10x10
  loss_b2 = layer2_gradient * htan_prime(z2) # 10x1
  W2 -= (mu * loss_W2)
  b2 -= (mu * loss_b2)
  # update (W1) with EW1 and (B1) with Eb1
  loss_W1 = layer1_gradient * htan_prime(z1) @ x.T # 10x2
  loss_b1 = layer1_gradient * htan_prime(z1) # 10x1
  W1 -= (mu * loss_W1)
  b1 -= (mu * loss_b1)

# how are we applying gradient descent and back propogation?
def train():
  for i in range(training_size):
    # forward pass
    z1, a1, z2, a2, z3, a3, o = predict_and_record(X[i,:].reshape(-1,1))
    # back propagation
    single_backPropagation(X[i,:].reshape(-1,1), W1, W2, W3, W4, b1, b2, b3, c, z1, a1, z2, a2, z3, a3, o, y[i])

In [114]:
# @title
# print the weights and biases to see change
print("W1: ", W1)
print("b1: ", b1)
print("W2: ", W2)
print("b2: ", b2)
print("W3: ", W3)
print("b3: ", b3)
print("W4: ", W4)
print("c: ", c)


W1:  [[ 0.5437343  -0.10105098]
 [-0.49968224 -0.13815597]
 [ 0.34846873 -1.45473262]
 [ 0.86946095 -0.15662208]
 [-0.77928211 -0.8892002 ]
 [-1.00053264  0.62889549]
 [ 0.62052342 -0.88689792]
 [-0.37055379  0.12619055]
 [ 0.54474871 -0.17302919]
 [-0.94795946 -0.01770722]]
b1:  [[-0.0517297 ]
 [ 0.53091369]
 [ 0.01495449]
 [-0.1746472 ]
 [-0.41684088]
 [-0.29272764]
 [-0.5285055 ]
 [-0.05183221]
 [-0.05725584]
 [ 0.37395342]]
W2:  [[ 3.24753802e-01 -2.85789546e-01  5.05913870e-02  2.41474483e-01
   2.16028918e-01 -1.25652103e-01  5.32195537e-01  2.20360763e-01
   4.32695170e-02 -8.03312895e-02]
 [-2.50315809e-01  9.27269577e-01  9.57505167e-02 -5.75827236e-02
  -5.31140653e-01 -4.87761485e-01  6.22448566e-01 -3.49080678e-02
  -4.99226816e-01  5.01886094e-01]
 [-1.11115747e-02 -5.52644499e-02  5.79526695e-01 -5.85067428e-02
   6.58585514e-02  3.51695764e-01  3.38668485e-01  1.29872815e-01
   2.01025596e-01  1.52634960e-04]
 [ 1.53309553e-02 -1.19525929e-01 -6.44863511e-02 -2.14057845e

In [97]:
# test the network
import numpy as np
for i in range(5):
  # train the network over all inputs with SGD
  train()

  # print out RMSE
  summed_error = 0.0
  for i in range(100):
    z1, a1, z2, a2, z3, a3, o = predict_and_record(X[i,:])
    summed_error += (o[0][0] - y[i])**2

  RMSE = np.sqrt(summed_error/100)
  print("RMSE: ", RMSE)

RMSE:  0.2947941880729724
RMSE:  0.2949915072354146
RMSE:  0.29519201543514845
RMSE:  0.2953952559684475
RMSE:  0.2956008047089855


TypeError: Field elements must be 2- or 3-tuples, got '0.5'

In [113]:
test = np.array([[1000000],[-1000000]])
z1, a1, z2, a2, z3, a3, o = predict_and_record(test)
print(o)

[[1.33193953]]


In [None]:
# visualize the test results