# Neural Networks from scratch
https://www.freecodecamp.org/news/building-a-neural-network-from-scratch/

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [17]:
def print_green(input):
    print("\033[92m {}\033[00m" .format(input))

In [7]:
def init_params(layer_dims):
    np.random.seed(3)
    params = {}
    L = len(layer_dims)

    for l in range(1, L):
        params['W'+str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])*0.01
        params['b'+str(l)] = np.zeros((layer_dims[l], 1)) # Bias initilized as Zero

    return params

In [5]:
params = init_params([1,1])
params

{'W1': array([[0.01788628]]), 'b1': array([[0.]])}

# ( W ) (Weight Matrix):

This is a matrix of weights. Each element in this matrix represents the strength of the connection between the input features and the neurons in the layer.

Dimensions: If you have ( n ) input features and ( m ) neurons in the layer, ( W ) will be an ( m * n )matrix.

# ( X ) (Input Vector/Matrix):

This is the input data. It can be a vector (for a single data point) or a matrix (for multiple data points).
Dimensions: If you have ( n ) input features, ( X ) will be an ( n * 1 ) vector for a single data point or an ( n * k ) matrix for ( k ) data points.

# ( b ) (Bias Vector):

This is a vector of biases. Each element in this vector is added to the corresponding neuron's weighted sum to shift the activation function.
Dimensions: If you have ( m ) neurons, ( b ) will be an ( m * 1 ) vector.


# ( W ) (Weight Matrix):

This is a matrix of weights. Each element in this matrix represents the strength of the connection between the input features and the neurons in the layer.

Dimensions: If you have ( n ) input features and ( m ) neurons in the layer, ( W ) will be an ( m \times n ) matrix.

# ( X ) (Input Vector/Matrix):

This is the input data. It can be a vector (for a single data point) or a matrix (for multiple data points).
Dimensions: If you have ( n ) input features, ( X ) will be an ( n \times 1 ) vector for a single data point or an ( n \times k ) matrix for ( k ) data points.

# ( b ) (Bias Vector):

This is a vector of biases. Each element in this vector is added to the corresponding neuron's weighted sum to shift the activation function.
Dimensions: If you have ( m ) neurons, ( b ) will be an ( m \times 1 ) vector.

### Example

Consider a simple example with a single data point:

- **Input Vector \( X \)**: 
  \[
  \begin{bmatrix}
  x_1 \\
  x_2
  \end{bmatrix}
  \]

- **Weight Matrix \( W \)**: 
  \[
  \begin{bmatrix}
  w_{11} & w_{12} \\
  w_{21} & w_{22}
  \end{bmatrix}
  \]

- **Bias Vector \( b \)**: 
  \[
  \begin{bmatrix}
  b_1 \\
  b_2
  \end{bmatrix}
  \]

The computation would be:

\[
Z = W dot X + b = 
\begin{bmatrix}
w_{11} & w_{12} \\
w_{21} & w_{22}
\end{bmatrix}
dot
\begin{bmatrix}
x_1 \\
x_2
\end{bmatrix}
+
\begin{bmatrix}
b_1 \\
b_2
\end{bmatrix}
\]

Breaking it down:

\[
Z = 
\begin{bmatrix}
(w_{11} \cdot x_1 + w_{12} \cdot x_2) \\
(w_{21} \cdot x_1 + w_{22} \cdot x_2)
\end{bmatrix}
+
\begin{bmatrix}
b_1 \\
b_2
\end{bmatrix}
\]

Linear Transformation
The equation Z = W * X + b represents a linear transformation of the input X using the weight matrix W and the bias vector b. This is a common step in neural networks before applying an activation function.

\[
Z = 
\begin{bmatrix}
(w_{11} \cdot x_1 + w_{12} \cdot x_2 + b_1) \\
(w_{21} \cdot x_1 + w_{22} \cdot x_2 + b_2)
\end{bmatrix}
\]

Sigmoid Function
The expression 1/(1+np.exp(np.dot(-1, Z))) is the sigmoid activation function applied to Z. The sigmoid function is defined as:

\begin{equation}
\sigma(Z) = \frac{1}{1 + e^{-Z}} 
\end{equation}




In [69]:
# Z (linear hypothesis) - Z = W*X + b , 
# * - dot product,
# W - weight matrix, b- bias vector, X- Input 
# This is a linear function called Sigmoid ativation function.
# It is used to introduce non-linearity in the model.
# Z can be a scalar a vector or a matrix.

# In this case Z = -1 dot Z + b
def sigmoid(Z):
    A = 1/(1+np.exp(np.dot(-1, Z))) # The dot product multiple the matrix with -1.
    cache = (Z)

    return A, cache

def sigmoid_new(Z):
    A = 1 / (1 + np.exp(-Z))  # Simplified negation
    cache = Z  # Cache the value of Z for backpropagation

    return A, cache
print(sigmoid(1), sigmoid_orig(1))

(0.7310585786300049, 1) (0.7310585786300049, 1)


In [32]:
def sigmoid_derivative(Z):
    A, _ = sigmoid(Z)
    return A * (1 - A)

In [7]:
Z = np.array([1, 2, 3])
np.dot(-1, Z) # This is equal to -1*Z or -Z

array([-1, -2, -3])

In [8]:
np.exp(np.dot(-1, Z))

array([0.36787944, 0.13533528, 0.04978707])

In [9]:
# Example
import numpy as np

# Define the weight matrix, input vector, and bias vector
W = np.array([[0.2, 0.8], [0.5, 0.1]])
X = np.array([[1], [2]])
b = np.array([[0.1], [0.2]])

# Perform the linear transformation
Z = np.dot(W, X) + b

# Apply the sigmoid activation function
A = 1 / (1 + np.exp(-Z)) # - is used to multiply the matrix with -1 and works seemlessly also with arrays

print("Z:", Z)
print("A:", A)

Z: [[1.9]
 [0.9]]
A: [[0.86989153]
 [0.7109495 ]]


In [9]:
def forward_prop(X, params):

    A = X # input to first layer i.e. training data
    caches = []
    L = len(params)//2 # The number of layers in the network. Each layer has a weight and a bias, so the total number of parameters is twice the number of layers.
    print(f'Length of params: {len(params)}, L: {L}, Params: {params}')
    for l in range(1, L+1):
        A_prev = A

        # Debug
        print(f"Shape of W{str(l)}): {params['W'+str(l)].shape}")
        print(f"Shape of A_prev: {A_prev.shape}")
        
        # Linear Hypothesis - Using the formula Z = W*X + b
        print(f'Weight {"W"+str(l)}: {params["W"+str(l)]}, A_prev: {A_prev}, Bias: {params["b"+str(l)]}')
        Z = np.dot(params['W'+str(l)], A_prev) + params['b'+str(l)] 

        # Storing the linear cache
        linear_cache = (A_prev, params['W'+str(l)], params['b'+str(l)]) 
        print(f'Linear Cache: {linear_cache}')
        # Applying sigmoid on linear hypothesis
        A, activation_cache = sigmoid(Z) 
        print(f'Sigmoid: {A}, Activation Cache: {activation_cache}')
         # storing the both linear and activation cache
        cache = (linear_cache, activation_cache)
        caches.append(cache)

    return A, caches


![image.png](attachment:image.png)

```mermaid
graph LR
  subgraph Input Layer
    A[Input Vector/Matrix #40;X#41;] --> B[Weight Matrix #40;W#41;]
    A --> C[Bias Vector #40;b#41;]
    B --> D[Linear Transformation #40;Z = W * X + b#41;]
    C --> D
    D --> E[Activation Function #40;A = sigmoid#40;Z#41;#41;]
    E --> F[Output #40;A#41;]
  end

  subgraph Hidden Layer 1
    D -->|Cache| G[Cache]
  end

  subgraph Output Layer
    F --> H[Final Output #40;A#41;]
  end

  subgraph Variables
    A#40;#40;A#41;#41; -->|numpy.ndarray| A_value{{array[[0.50665373]]}}
    W#40;#40;W#41;#41; -->|numpy.ndarray| W_value{{array[[0.2, 0.8], [0.5, 0.1]]}}
    X#40;#40;X#41;#41; -->|numpy.ndarray| X_value{{array[1, 2]}}
    Z#40;#40;Z#41;#41; -->|numpy.ndarray| Z_value{{array[[1.9], [0.9]]}}
    b#40;#40;b#41;#41; -->|numpy.ndarray| b_value{{array[[0.1], [0.2]]}}
    caches#40;#40;caches#41;#41; -->|list| caches_value{{[((array[1, 2]), array[[0.01788628, 0.0043651]], array[[0.]])), array[[0.02661648]]]]}}
    params#40;#40;params#41;#41; -->|dict| params_value{{{'W1': array[[0.01788628, 0.0043651]], 'b1': array[[0.]]}}}}
  end

  A_value --> A
  W_value --> W
  X_value --> X
  Z_value --> Z
  b_value --> b
  caches_value --> caches
  params_value --> params
```

# Relationships between the variables in the neural network

For every X (input), there is a corresponding W (weight) and b (bias) that are used to compute Z (linear transformation). The output of the linear transformation Z is passed through an activation function to get the final output A.

Let's say we have a neural network with one input layer, one hidden layer, and one output layer. The input layer has two neurons, the hidden layer has two neurons, and the output layer has one neuron.

The input layer has two neurons, so the input vector X will have two elements. The weight matrix W connecting the input layer to the hidden layer will be a 2x2 matrix. The bias vector b for the hidden layer will be a 2x1 vector.

```mermaid
graph LR
  subgraph Input Layer
    A1[Neuron 1 - b1]
    A2[Neuron 2 - b2]
  end

  subgraph Hidden Layer
    B1[Neuron 1 - b1]
    B2[Neuron 2 - b2]
  end

  subgraph Output Layer
    C1[Neuron 1 - b1]
  end

  A1 -->|W11| B1
  A1 -->|W12| B2
  A2 -->|W21| B1
  A2 -->|W22| B2

  B1 -->|W31| C1
  B2 -->|W32| C1

  style A1 fill:#f9f,stroke:#333,stroke-width:2px,color:#000
  style A2 fill:#f9f,stroke:#333,stroke-width:2px,color:#000
  style B1 fill:#9f9,stroke:#333,stroke-width:2px,color:#000
  style B2 fill:#9f9,stroke:#333,stroke-width:2px,color:#000
  style C1 fill:#ff9,stroke:#333,stroke-width:2px,color:#000
```

The number of Weights paramas is given by the formula:


\[
\begin{equation}
\text{Number of Weights} = \text{Number of Neurons in Previous Layer} \times \text{Number of Neurons in Current Layer}
\end{equation}
\]

The number of Bias params is given by the formula:
\[
\begin{equation}
\text{Number of Biases} = \text{Number of Neurons in Current Layer}
\end{equation}
\]

In [28]:
X = np.array([1,2]) # Neural Network Input
params = init_params([2, 1]) # Neural Network Parameters
print(f'X: {X}\nParams: {params}')

A, caches = forward_prop(X, params)

print(f"\nA:\t{A}\nCaches:\t{caches}")

X: [1 2]
Params: {'W1': array([[0.01788628, 0.0043651 ]]), 'b1': array([[0.]])}
Length of params: 2, L: 1, Params: {'W1': array([[0.01788628, 0.0043651 ]]), 'b1': array([[0.]])}
Shape of W1): (1, 2)
Shape of A_prev: (2,)
Weight W1: [[0.01788628 0.0043651 ]], A_prev: [1 2], Bias: [[0.]]
Linear Cache: (array([1, 2]), array([[0.01788628, 0.0043651 ]]), array([[0.]]))
Sigmoid: [[0.50665373]], Activation Cache: [[0.02661648]]

A:	[[0.50665373]]
Caches:	[((array([1, 2]), array([[0.01788628, 0.0043651 ]]), array([[0.]])), array([[0.02661648]]))]


`X=[[1,2]]`

```yaml
    params={
        'W1': array([[0.2, 0.8], [0.5, 0.1]]), 
        'b1': array([[0.1], [0.2]]),
        'W2': array([[0.3, 0.7]]),
        'b2': array([[0.3]])
    }
```



```mermaid
graph TD
    subgraph Input Layer
        X1[1] -->|W1| Z1
        X2[2] -->|W1| Z1
        X3[3] -->|W1| Z2
        X4[4] -->|W1| Z2
    end

    subgraph Hidden Layer 1
        Z1[Z1 = 0.2*1 + 0.8*2] -->| b1| Z1b
        Z2[Z2 = 0.5*3 + 0.1*4] -->| b1| Z2b
        Z1b[Z1b = Z1 + 0.1] -->|Sigmoid| A1
        Z2b[Z2b = Z2 + 0.2] -->|Sigmoid| A2
    end

    subgraph Output Layer
        A1 -->|W2| Z3
        A2 -->|W2| Z3
        Z3[Z3 = 0.3*A1 + 0.7*A2] -->| b2| Z3b
        Z3b[Z3b = Z3 + 0.3] -->|Sigmoid| A3
    end

    A3 --> FinalOutput[Output A3]
```

```mermaid
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#ffcc00', 'edgeLabelBackground':'#ffffff', 'tertiaryColor': '#ffffff'}}}%%
graph LR
    subgraph Column1
        A[Input Vector/Matrix #40;X#41;] --> B[Weight Matrix #40;W#41;]
        A --> C[Bias Vector #40;b#41;]
        B --> D[Linear Transformation #40;Z = W * X + b#41;]
        C --> D
        D --> E[Activation Function #40;A = sigmoid#40;Z#41;#41;]
        E --> F[Output #40;A#41;]
    end

    subgraph Column2
        B_explanation[Weights applied to the input data]
        C_explanation[Biases added to the weighted sum]
        D_explanation[Linear combination of weights, inputs, and biases]
        E_explanation[Activation function applied to the linear transformation]
        F_explanation[Final output after applying the activation function]

        B -.-> B_explanation
        C -.-> C_explanation
        D -.-> D_explanation
        E -.-> E_explanation
        F -.-> F_explanation
    end
```

In [36]:
# Example with 2 layers
import numpy as np

# Example parameters
params = {
    'W1': np.array([[0.2, 0.8], [0.5, 0.1]]),
    'b1': np.array([[0.1], [0.2]]),
    'W2': np.array([[0.3, 0.7]]),
    'b2': np.array([[0.3]])
}

# Example input
X = np.array([[1, 2], [3, 4]])

# Perform forward propagation
A, caches = forward_prop(X, params)

print("Final Activation:", A)
print("Caches:", caches)

Length of params: 4, L: 2, Params: {'W1': array([[0.2, 0.8],
       [0.5, 0.1]]), 'b1': array([[0.1],
       [0.2]]), 'W2': array([[0.3, 0.7]]), 'b2': array([[0.3]])}
Weight W1: [[0.2 0.8]
 [0.5 0.1]], A_prev: [[1 2]
 [3 4]], Bias: [[0.1]
 [0.2]]
Linear Cache: (array([[1, 2],
       [3, 4]]), array([[0.2, 0.8],
       [0.5, 0.1]]), array([[0.1],
       [0.2]]))
Sigmoid: [[0.93702664 0.97587298]
 [0.73105858 0.83201839]], Activation Cache: [[2.7 3.7]
 [1.  1.6]]
Weight W2: [[0.3 0.7]], A_prev: [[0.93702664 0.97587298]
 [0.73105858 0.83201839]], Bias: [[0.3]]
Linear Cache: (array([[0.93702664, 0.97587298],
       [0.73105858, 0.83201839]]), array([[0.3, 0.7]]), array([[0.3]]))
Sigmoid: [[0.74891783 0.7640791 ]], Activation Cache: [[1.092849   1.17517476]]
Final Activation: [[0.74891783 0.7640791 ]]
Caches: [((array([[1, 2],
       [3, 4]]), array([[0.2, 0.8],
       [0.5, 0.1]]), array([[0.1],
       [0.2]])), array([[2.7, 3.7],
       [1. , 1.6]])), ((array([[0.93702664, 0.97587298],
  

# Let's now define our cost function.

The cost function is a measure of how wrong the model is in terms of its ability to estimate the relationship between X and Y. The cost function we will use is the mean squared error (MSE), which is defined as:

\[
\begin{equation}
J = \frac{1}{m} \sum_{i=1}^{m} (Y_{\text{pred}} - Y_{\text{true}})^2
\end{equation}
\]

Where:
- \( J \) is the cost function
- \( m \) is the number of data points in the dataset or batch
- \( $Y_{\text{pred}}$ \) is the predicted output
- \( $Y_{\text{true}}$ \) is the true output
- \( ${\sum}$ \) is the sum over all data points
- \( $(Y_{\text{pred}} - Y_{\text{true}})^2$ \) is the squared error
- \( ${\frac{1}{m} \sum_{i=1}^{m} (Y_{\text{pred}} - Y_{\text{true}})^2}$ \) is the mean squared error

This is the Gradient Descent algorithm. It is the process of minimizing the cost function by adjusting the weights and biases of the neural network. The algorithm works by taking the derivative of the cost function with respect to each weight and bias and updating them in the opposite direction of the gradient.

In [11]:
def cost_function(A, Y):
    """
    Compute the cost function for the neural network.
    
    Parameters:
    A - the final activation value
    Y - the true label
    
    Returns:
    cost - the cost function value
    """
    m = Y.shape[1]

    cost = (-1/m)*(np.dot(np.log(A), Y.T) + np.dot(np.log(1-A), 1-Y.T)) 

    return cost

In [37]:
A = np.array([[0.9, 0.0000001]])  # Example predicted output
Y = np.array([[1, 0]])      # Example true labels

cost = cost_function(A, Y)
print("Cost:", cost)

Cost: [[0.05268031]]


# The Backpropagation Algorithm

The backpropagation algorithm is a method for training neural networks. It works by calculating the gradient of the cost function with respect to the weights and biases of the network. This gradient is then used to update the weights and biases in the opposite direction of the gradient, thereby minimizing the cost function.

# Explanation of `one_layer_backward` Function

The `one_layer_backward` function computes the gradients for a single layer during the backpropagation process in a neural network. Here is a step-by-step explanation of what happens in the function:

1. **Extract Cache Values**:
   - The `cache` parameter contains two elements: `linear_cache` and `activation_cache`.
   - `linear_cache` contains the values `(A_prev, W, b)` from the forward pass.
   - `activation_cache` contains the value `Z` from the forward pass.

2. **Compute `dZ`**:
   - `dZ` is the gradient of the cost with respect to `Z`.
   - For the sigmoid activation function, the derivative is computed as:
     $$
     dZ = dA \cdot \sigma(Z) \cdot (1 - \sigma(Z))
     $$
   - Where:
     - \( dA \) is the gradient of the cost with respect to the activation of the current layer.
     - \( $\sigma(Z)$ \) is the sigmoid function applied to \( Z \).

3. **Extract Linear Cache Values**:
   - `A_prev`, `W`, and `b` are extracted from `linear_cache`.
   - `m` is the number of training examples.

4. **Compute Gradients**:
   - The gradient of the cost with respect to the weights \( W \) is computed as:
     $$
     dW = \frac{1}{m} \cdot dZ \cdot A_{\text{prev}}^T
     $$
   - The gradient of the cost with respect to the biases \( b \) is computed as:
     $$
     db = \frac{1}{m} \cdot \sum dZ
     $$
   - The gradient of the cost with respect to the activation of the previous layer \( $A_{\text{prev}}$ \)  is computed as:
  
     $$
     dA_{\text{prev}} = W^T \cdot dZ
     $$

5. **Return Gradients**:
   - The function returns the gradients `dA_prev`, `dW`, and `db`.

### Summary of Formulas

- Gradient of \( Z \):
  $$
  dZ = dA \cdot \sigma(Z) \cdot (1 - \sigma(Z))
  $$

- Gradient of Weights:
  $$
  dW = \frac{1}{m} \cdot dZ \cdot A_{\text{prev}}^T
  $$

- Gradient of Biases:
  $$
  db = \frac{1}{m} \cdot \sum dZ
  $$

- Gradient of Previous Layer Activation:
  $$
  dA_{\text{prev}} = W^T \cdot dZ
  $$

This function is essential for updating the parameters of the neural network during training, allowing the network to learn from the data.

In [74]:
def one_layer_backward_orig(dA, cache):
    linear_cache, activation_cache = cache

    Z = activation_cache
    
    print(f'---------------------------Z: {Z}')

    dZ = dA*sigmoid(Z)*(1-sigmoid(Z)) # The derivative of the sigmoid function

    A_prev, W, b = linear_cache
    m = A_prev.shape[1]

    dW = (1/m)*np.dot(dZ, A_prev.T)
    db = (1/m)*np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [73]:
def one_layer_backward(dA, cache):
    linear_cache, activation_cache = cache
    Z = activation_cache  # Ensure Z is correctly assigned from activation_cache
    
    dZ = dA * sigmoid_derivative(Z)  # Use the sigmoid derivative function
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T)
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

In [88]:
def backprop(AL, Y, caches):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    dAL = -(np.divide(Y, AL) - np.divide(1-Y, 1-AL))

    current_cache = caches[L-1]
    grads['dA'+str(L-1)], grads['dW'+str(L-1)], grads['db'+str(L-1)] = one_layer_backward(dAL, current_cache)

    for l in reversed(range(L-1)):

        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = one_layer_backward(grads["dA" + str(l+1)], current_cache)
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

```mermaid
graph TD
    A[Start Backpropagation] --> B[Initialize grads dictionary]
    B --> C[Calculate dAL]
    C --> D[Calculate gradients for last layer]
    D --> E[Iterate through layers backwards]
    E --> F[Calculate gradients for current layer]
    F --> G{More layers?}
    G -->|Yes| E
    G -->|No| H[Return grads dictionary]
    
    subgraph Inputs
        I1[AL: Last layer output]
        I2[Y: True labels]
        I3[caches: Cached values]
    end
    
    subgraph For each layer
        F1[Get cached values]
        F2[Calculate dA, dW, db]
        F3[Store gradients in grads]
        F1 --> F2 --> F3
    end
    
    I1 & I2 & I3 --> A
    E --> F1

This code implements the backpropagation algorithm for a neural network. Here's a breakdown of what it does:

1. The function backprop takes three parameters:

- AL: The output of the last layer (prediction)
- Y: The true labels
- caches: A list of cached values from forward propagation

2. It initializes an empty dictionary grads to store gradients.
3. L is set to the number of layers in the network.
4. m is the number of training examples.
5. Y is reshaped to match the shape of AL.
6. dAL is calculated as the derivative of the cost with respect to the final layer output.
7. The gradients for the last layer are calculated using one_layer_backward.
8. The function then iterates backwards through the remaining layers:

- For each layer, it calculates the gradients using one_layer_backward.
- The gradients (dA, dW, db) for each layer are stored in the grads dictionary.


Finally, it returns the grads dictionary containing all gradients.

In [104]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    print(f'Grads: {grads}')
    print(f'Parameters: {parameters}')
    for l in range(L-1):
        print(f'Param W{l+1}: {parameters["W"+str(l+1)]}')
        print(f'Grads dA{(l+1)}: {learning_rate*grads["dA"+str(l+1)]}')
        print(f'b{str(l+1)}: {parameters["b"+str(l+1)]}')
        print(f'learning_rate * b{str(l+1)}: {learning_rate*grads["db"+str(l+1)]}')
        # parameters["W"+str(l+1)] = parameters['W'+str(l+1)] - learning_rate*grads['dA'+str(l+1)]
        parameters["W"+str(l+1)] = parameters['W'+str(l+1)] - learning_rate*grads['dW'+str(l+1)]
        parameters['b'+str(l+1)] = parameters['b'+str(l+1)] - learning_rate*grads['db'+str(l+1)]

    return parameters

```mermaid
graph TD
    A[Start update_parameters] --> B[Calculate number of layers]
    B --> C[Iterate through layers]
    C --> D[Update weights W]
    D --> E[Update biases b]
    E --> F{More layers?}
    F -->|Yes| C
    F -->|No| G[Return updated parameters]

    subgraph Inputs
        I1[parameters]
        I2[grads]
        I3[learning_rate]
    end

    I1 & I2 & I3 --> A

1. Inputs (parameters, grads, learning_rate):

- These are the parameters of the update_parameters function.
  - parameters is a dictionary containing the current weights and biases.
  - grads is a dictionary containing the gradients for each parameter.
  - learning_rate is a scalar value determining the step size for updates.


2. Calculate number of layers:

  -L = len(parameters) // 2
   - This calculates the number of layers in the network. It's divided by 2 because each layer has both weights (W) and biases (b).


3. Iterate through layers:

- for l in range(L):
- This loop goes through each layer of the network.


4. Update weights W:

- parameters['W'+str(l+1)] = parameters['W'+str(l+1)] - learning_rate * grads['dW'+str(l+1)]
- For each layer, this updates the weights by subtracting the product of the learning rate and the corresponding gradient.


5. Update biases b:

- parameters['b'+str(l+1)] = parameters['b'+str(l+1)] - learning_rate * grads['db'+str(l+1)]
- Similarly, this updates the biases for each layer.


6. Return updated parameters:

- return parameters
- After processing all layers, the function returns the dictionary containing all updated parameters.



Key points about this function:

* It uses gradient descent to update the parameters of the neural network.
* The update is performed for both weights (W) and biases (b) of each layer.
* The learning rate determines how large of a step is taken in the direction of the negative gradient.
* The process is repeated for each layer in the network.
* The function modifies the parameters dictionary in-place and then returns it.

This update process is typically called after the backpropagation step in each iteration of training a neural network. It adjusts the parameters to minimize the loss function, gradually improving the network's performance on the training data.

In [41]:
def train(X, Y, layer_dims, epochs, lr):
    params = init_params(layer_dims)
    cost_history = []

    for i in range(epochs):
        Y_hat, caches = forward_prop(X, params)
        cost = cost_function(Y_hat, Y)
        cost_history.append(cost)
        grads = backprop(Y_hat, Y, caches)
        print(f'PARAMS BEFORE UPDATE: {params}')
        params = update_parameters(params, grads, lr)


    return params, cost_history

```mermaid
graph TD
    A[Start train] --> B[Initialize parameters]
    B --> C[Initialize cost history]
    C --> D[Start epoch loop]
    D --> E[Forward propagation]
    E --> F[Calculate cost]
    F --> G[Backpropagation]
    G --> H[Update parameters]
    H --> I{More epochs?}
    I -->|Yes| D
    I -->|No| J[Return parameters and cost history]

    subgraph Inputs
        I1[X: Input data]
        I2[Y: True labels]
        I3[layer_dims: Layer dimensions]
        I4[epochs: Number of epochs]
        I5[lr: Learning rate]
    end

    I1 & I2 & I3 & I4 & I5 --> A

Certainly! I'll create a simplified diagram for the `train` function and then provide an explanation.



```mermaid
graph TD
    A[Start train] --> B[Initialize parameters]
    B --> C[Initialize cost history]
    C --> D[Start epoch loop]
    D --> E[Forward propagation]
    E --> F[Calculate cost]
    F --> G[Backpropagation]
    G --> H[Update parameters]
    H --> I{More epochs?}
    I -->|Yes| D
    I -->|No| J[Return parameters and cost history]

    subgraph Inputs
        I1[X: Input data]
        I2[Y: True labels]
        I3[layer_dims: Layer dimensions]
        I4[epochs: Number of epochs]
        I5[lr: Learning rate]
    end

    I1 & I2 & I3 & I4 & I5 --> A

```

Now, let me explain this diagram and how it relates to the `train` function:

1. Inputs (X, Y, layer_dims, epochs, lr):
   - X: Input data
   - Y: True labels
   - layer_dims: Dimensions of each layer in the network
   - epochs: Number of training iterations
   - lr: Learning rate

2. Initialize parameters:
   - `params = init_params(layer_dims)`
   - This initializes the weights and biases for all layers of the network.

3. Initialize cost history:
   - `cost_history = []`
   - This list will store the cost (loss) for each epoch.

4. Start epoch loop:
   - `for i in range(epochs):`
   - This loop runs the training process for the specified number of epochs.

5. Forward propagation:
   - `Y_hat, caches = forward_prop(X, params)`
   - This step computes the predictions (Y_hat) and stores intermediate values (caches) for each layer.

6. Calculate cost:
   - `cost = cost_function(Y_hat, Y)`
   - This calculates the loss between the predictions and true labels.
   - `cost_history.append(cost)`
   - The cost is added to the cost history list.

7. Backpropagation:
   - `grads = backprop(Y_hat, Y, caches)`
   - This computes the gradients for all parameters.

8. Update parameters:
   - `params = update_parameters(params, grads, lr)`
   - This updates all weights and biases using the computed gradients.

9. Return parameters and cost history:
   - `return params, cost_history`
   - After all epochs, the function returns the final parameters and the cost history.

Key points about this function:

1. It encapsulates the entire training process for a neural network.
2. It uses the previously defined functions (forward_prop, backprop, update_parameters) to perform the key steps of training.
3. The process is repeated for the specified number of epochs.
4. It tracks the cost over time, which can be used to plot the learning curve.
5. The function returns both the trained parameters and the cost history, which can be used for evaluation and visualization.

This `train` function represents a complete training loop for a neural network. It repeatedly performs forward propagation to make predictions, calculates the cost, performs backpropagation to compute gradients, and updates the parameters. This process gradually improves the network's performance on the training data over multiple epochs.

In [39]:
import numpy as np

# Set the random seed for reproducibility
np.random.seed(1)

# Number of examples
m = 100  # You can change this to any number of examples you want

# Generate random input features
X = np.random.randn(2, m)  # 2 features, m examples

# Generate random labels (0 or 1)
Y = (np.sum(X, axis=0) > 0).astype(int).reshape(1, m)  # Simple rule: sum of features > 0 -> label 1, else 0

print("Input features (X):")
print(X)
print("Labels (Y):")
print(Y)

Input features (X):
[[ 1.62434536 -0.61175641 -0.52817175 -1.07296862  0.86540763 -2.3015387
   1.74481176 -0.7612069   0.3190391  -0.24937038  1.46210794 -2.06014071
  -0.3224172  -0.38405435  1.13376944 -1.09989127 -0.17242821 -0.87785842
   0.04221375  0.58281521 -1.10061918  1.14472371  0.90159072  0.50249434
   0.90085595 -0.68372786 -0.12289023 -0.93576943 -0.26788808  0.53035547
  -0.69166075 -0.39675353 -0.6871727  -0.84520564 -0.67124613 -0.0126646
  -1.11731035  0.2344157   1.65980218  0.74204416 -0.19183555 -0.88762896
  -0.74715829  1.6924546   0.05080775 -0.63699565  0.19091548  2.10025514
   0.12015895  0.61720311  0.30017032 -0.35224985 -1.1425182  -0.34934272
  -0.20889423  0.58662319  0.83898341  0.93110208  0.28558733  0.88514116
  -0.75439794  1.25286816  0.51292982 -0.29809284  0.48851815 -0.07557171
   1.13162939  1.51981682  2.18557541 -1.39649634 -1.44411381 -0.50446586
   0.16003707  0.87616892  0.31563495 -2.02220122 -0.30620401  0.82797464
   0.23009474  0.762

In [106]:
layer_dims = [2, 5, 1]
epochs = 100
lr = 0.01
params, cost_history = train(X, Y, layer_dims, epochs, lr)

print(f"Final parameters: {params}")
print(f"Final cost: {cost_history}")

Length of params: 4, L: 2, Params: {'W1': array([[ 0.01788628,  0.0043651 ],
       [ 0.00096497, -0.01863493],
       [-0.00277388, -0.00354759],
       [-0.00082741, -0.00627001],
       [-0.00043818, -0.00477218]]), 'b1': array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]]), 'W2': array([[-0.01313865,  0.00884622,  0.00881318,  0.01709573,  0.00050034]]), 'b2': array([[0.]])}
Shape of W1): (5, 2)
Shape of A_prev: (2, 100)
Weight W1: [[ 0.01788628  0.0043651 ]
 [ 0.00096497 -0.01863493]
 [-0.00277388 -0.00354759]
 [-0.00082741 -0.00627001]
 [-0.00043818 -0.00477218]], A_prev: [[ 1.62434536 -0.61175641 -0.52817175 -1.07296862  0.86540763 -2.3015387
   1.74481176 -0.7612069   0.3190391  -0.24937038  1.46210794 -2.06014071
  -0.3224172  -0.38405435  1.13376944 -1.09989127 -0.17242821 -0.87785842
   0.04221375  0.58281521 -1.10061918  1.14472371  0.90159072  0.50249434
   0.90085595 -0.68372786 -0.12289023 -0.93576943 -0.26788808  0.53035547
  -0.69166075 -0.39675353 -0.6871