# 5.2 Introduction to Deep Learning, Part 2

## 5.2.1 Multiclass Multiclassification Problem

* input has 4 samples and 5 features
* target contains 3 classes
* one hidden layer, two linear neural networks
* use the softmax activation function

In [1]:
import torch

# input data with 4  samples and 5 features
input = torch.tensor([
    [ 0.7550,  0.2580, -0.0376, -0.5695,  0.0454],
    [-0.6897,  1.4822,  0.7860, -2.0889, -0.4481],
    [ 0.4464,  0.7335, -0.9837,  1.7818, -0.9048],
    [-0.7997,  1.1157, -0.4644,  1.1144, -0.6903]
])

# target with class number per observation
mc_target = torch.tensor([2,2,1,0])

num_classes = 3

In [46]:
import torch.nn as nn
import torch.nn.functional as F

# simple neural network model with 1 hidden layer, two linear networks
torch.manual_seed(11052023)
net = nn.Sequential(
    nn.Linear(5,4),
    nn.Linear(4,3))

# compute the pre-activation output
mc_pre_activation = net(input)

# apply the softmax activation function
softmax = nn.Softmax(dim = -1)
mc_output = softmax(mc_pre_activation)

# predict the class per observation
mc_prediction = torch.argmax(mc_output,dim=-1).view(-1,1)

print(f"pre_activation: \n {mc_pre_activation}")
print(f"output: \n {mc_output}")
print("prediction: \n", mc_prediction)
print("target: \n", mc_target)

pre_activation: 
 tensor([[ 0.4926, -0.1480,  0.2304],
        [ 0.8385,  0.4200,  0.9452],
        [ 0.1639, -0.0355, -0.1235],
        [ 0.3077,  0.4059,  0.1616]], grad_fn=<AddmmBackward0>)
output: 
 tensor([[0.4355, 0.2295, 0.3350],
        [0.3609, 0.2375, 0.4016],
        [0.3892, 0.3188, 0.2920],
        [0.3370, 0.3718, 0.2912]], grad_fn=<SoftmaxBackward0>)
prediction: 
 tensor([[0],
        [2],
        [0],
        [1]])
target: 
 tensor([2, 2, 1, 0])


In [51]:
## 
print(mc_prediction.view(1, 4))
print(mc_prediction.view(1,-1))
print(mc_prediction.resize_(4,))

tensor([[0, 2, 0, 1]])
tensor([[0, 2, 0, 1]])
tensor([0, 2, 0, 1])


In [131]:
ce_loss = nn.CrossEntropyLoss()
print(f"output:\n {mc_output}")
print(f"\ntarget:\n {mc_target}")
print(f"\n loss: {ce_loss(mc_output, mc_target)}")

output:
 tensor([[0.4355, 0.2295, 0.3350],
        [0.3609, 0.2375, 0.4016],
        [0.3892, 0.3188, 0.2920],
        [0.3370, 0.3718, 0.2912]], grad_fn=<SoftmaxBackward0>)

target:
 tensor([2, 2, 1, 0])

 loss: 1.08565354347229


#### On Cross Entropy Loss

$$\text{CrossEntropyLoss} = -\sum{ \text{one-hot-target} \times\log({\text{probas})}}$$

where $\text{probas} = \text{softmax}(\text{output})$

In [135]:
probas = softmax(mc_output)
one_hot_target = F.one_hot(mc_target.clone().detach(), mc_output.shape[1])
print(f"probas:\n {probas}")
print(f"\ntarget: {mc_target}")
print(f"\none_hot_target:\n {one_hot_target}")
print(f"\n CrossEntropyLoss = {-torch.sum(one_hot_target * torch.log(probas))/4}")

probas:
 tensor([[0.3679, 0.2994, 0.3327],
        [0.3418, 0.3021, 0.3560],
        [0.3522, 0.3283, 0.3196],
        [0.3344, 0.3462, 0.3194]], grad_fn=<SoftmaxBackward0>)

target: tensor([2, 2, 1, 0])

one_hot_target:
 tensor([[0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0]])

 CrossEntropyLoss = 1.08565354347229


## 5.1.2 Regression Problem

* input has 4 observations and 5 features
* target is a numerical value.
* one hidden layer, two linear neural networks
* use the sigmoid activation function

In [58]:
# input data with 4  observations and 5 features
input = torch.tensor([
    [ 0.7550,  0.2580, -0.0376, -0.5695,  0.0454],
    [-0.6897,  1.4822,  0.7860, -2.0889, -0.4481],
    [ 0.4464,  0.7335, -0.9837,  1.7818, -0.9048],
    [-0.7997,  1.1157, -0.4644,  1.1144, -0.6903]
])
reg_target = torch.tensor(
    [[3.1000],
     [2.3000],
     [1.2000],
     [0.4000]]
)

In [60]:
import torch.nn as nn
import torch.nn.functional as F

# simple neural network model with 1 hidden layer
torch.manual_seed(11052023)
net = nn.Sequential(
    nn.Linear(5,4),
    nn.Linear(4,1))

# compute the output
reg_output = net(input)

print("output: \n", reg_output)
print("target: \n", reg_target)

output: 
 tensor([[-0.2958],
        [ 0.0502],
        [-0.6245],
        [-0.4806]], grad_fn=<AddmmBackward0>)
target: 
 tensor([[3.1000],
        [2.3000],
        [1.2000],
        [0.4000]])


In [61]:
mse_loss = nn.MSELoss()
reg_loss = mse_loss(reg_output, reg_target)
reg_loss

tensor(5.1743, grad_fn=<MseLossBackward0>)

### 5.1.3 Binary Classification Problem  as a Regression Problem

* input has 4 observations and 5 features
* target contains `0` or `1`
* two hidden layers
* use the sigmoid activation function

In [93]:
import torch.nn as nn
import torch.nn.functional as F

# input data with 4  observations and 5 features
input = torch.tensor([
    [ 0.7550,  0.2580, -0.0376, -0.5695,  0.0454],
    [-0.6897,  1.4822,  0.7860, -2.0889, -0.4481],
    [ 0.4464,  0.7335, -0.9837,  1.7818, -0.9048],
    [-0.7997,  1.1157, -0.4644,  1.1144, -0.6903]
])
bc_target = torch.tensor([0, 0, 1, 0]).view(-1,1)
  

# simple neural network model with 2 hidden layers
net = nn.Sequential(
    nn.Linear(5,4),
    nn.Linear(4,1))

# compute the pre-activation output
bc_pre_activation = net(input)

# apply the softmax activation function
#sigmoid = nn.Sigmoid()
bc_output = sigmoid(bc_pre_activation)

# predict the class per observation
bc_prediction = torch.FloatTensor((bc_output > 0.5).float())

print("prediction: \n", bc_prediction)
print("target: \n", bc_target)

prediction: 
 tensor([[1.],
        [1.],
        [0.],
        [0.]])
target: 
 tensor([[0],
        [0],
        [1],
        [0]])


In [94]:
bin_mse_loss = nn.MSELoss()
bin_mse_loss = mse_loss(bc_prediction, bc_target)
bin_mse_loss

tensor(0.7500)

## 5.3 Forward Pass

Generating a prediction  from the neural network models is is called **running a forward pass** through the network.

![](images/neural_network_v3.png)

*  Input is `X` of shape  `(n, m)`.

*  Number of classes is `num_classes = k`.
  
*  First neural network: `nn.Linear(m, p)`.
    *  weights: $w_0 = $`0.weight.t()` of shape `(m, p)`
      
    *  bias: $b_0 = $0.bias of shape `(1, p)`
      
*  Second neural network: `nn.Linear(p, k)`.
    *  weights: $w_1 = $`1.weight.t()` of shape `(p, k)`
      
    *  bias: $b_1 = $1.bias of shape `(1, k)`

*  Softmax activation function of a variable $z$ computes the elements of its $i$th row of $z$.
    At the $i$th row,
   
\begin{align*}
   S(x_i) &= S([x_{i0},x_{i1},\ldots,x_{i(m-1)}])\\
          &= \left[{S(x_{i0}), S(x_{i1}), \ldots, S(x_{i(m-1)})}\right]
\end{align*}
such that $ 0 \leq S(x_{ij}) = \dfrac{e^{x_{ij}}}{e^{x_{i0}}+e^{x_{i1}} + \ldots +e^{x_{i(m-1)}}} \leq 1$ 

for each $j = 0,1, \ldots, m-1$ 

and $1 = \displaystyle\sum_{j=1}^{m-1} S(x_{ij})$.

**Forward pass**

$$\left(z_0 = X\circ w_0 + b_0\right)\;  \Longrightarrow \left(z_1 = z_0 \circ w_1 + b_1\right) \; \Longrightarrow S(z_1) \;\Longrightarrow Loss(\hat{y}, y)$$

**For the first neural network: (hidden layer)** 
$z_0 = \begin{bmatrix} z_{0}^{(0)} \\
                 z_{1}^{(0)}  \\
                \end{bmatrix}$ such that
 
 \begin{align*} 
 \begin{bmatrix} z_{0}^{(0)} \\
                 z_{1}^{(0)}  \\            
 \end{bmatrix} &= 
 \begin{bmatrix} x_{0}^{(0)} & x_{1}^{(0)}  \\
  \end{bmatrix} \circ 
  \begin{bmatrix} w_{00}^{(0)} & w_{01}^{(0)}  \\
                  w_{10}^{(0)} & w_{11}^{(0)}  \\
  \end{bmatrix} +
  \begin{bmatrix} b_{0}^{(0)} \\
                  b_{1}^{(0)} \\
  \end{bmatrix} \\ 
  \begin{bmatrix} z_{0}^{(0)} \\
                  z_{1}^{(0)} \\      
  \end{bmatrix} &= 
 \begin{bmatrix} 
      x_{0}^{(0)} w_{00}^{(0)} + x_{1}^{(0)} w_{10}^{(0)} + b_{0}^{(0)} \\
      x_{0}^{(0)} w_{01}^{(0)} + x_{1}^{(0)} w_{11}^{(0)} + b_{1}^{(0)} \\ 
   \end{bmatrix}
\end{align*}

<br>

Thus,

$z_{0}^{(0)} = x_{0}^{(0)} w_{00}^{(0)} + x_{1}^{(0)} w_{10}^{(0)} + b_{0}^{(0)}$ and
 
$z_{1}^{(0)} = x_{0}^{(0)} w_{01}^{(0)} + x_{1}^{(0)} w_{11}^{(0)} + b_{1}^{(0)}$.
  


<br>

**For the second neural network: (pre-activation (output layer)** 
 $z_1=\begin{bmatrix} z_{0}^{(1)} \\
                 z_{1}^{(1)}  \\            
 \end{bmatrix}$ such that
 
\begin{align*} 
 \begin{bmatrix} z_{0}^{(1)} \\
                 z_{1}^{(1)}  \\            
 \end{bmatrix} &= 
 \begin{bmatrix} z_{0}^{(0)} & z_{1}^{(0)}  \\
  \end{bmatrix} \circ 
  \begin{bmatrix} w_{00}^{(1)} & w_{01}^{(1)}  \\
                  w_{10}^{(1)} & w_{11}^{(1)}  \\
  \end{bmatrix} +
  \begin{bmatrix} b_{0}^{(1)} \\
                  b_{1}^{(1)} \\
  \end{bmatrix} \\
  \begin{bmatrix} z_{0}^{(1)} \\
                  z_{1}^{(1)} \\            
 \end{bmatrix} &=
  \begin{bmatrix} 
      z_{0}^{(0)} w_{00}^{(1)} + z_{1}^{(0)} w_{10}^{(1)} + b_{0}^{(1)} \\
      z_{0}^{(0)} w_{01}^{(1)} + z_{1}^{(0)} w_{11}^{(1)} + b_{1}^{(1)} \\ 
   \end{bmatrix}
\end{align*}

<br>

Thus,

$z_{0}^{(1)} = z_{0}^{(0)} w_{00}^{(1)} + z_{1}^{(0)} w_{10}^{(1)} + b_{0}^{(1)}$ and

$z_{1}^{(1)} = z_{0}^{(0)} w_{01}^{(1)} + z_{1}^{(0)} w_{11}^{(1)} + b_{1}^{(1)}$


<br>

**Prediction using the Softmax activation function**: 
 $\hat{y} = S(z_1)$ such that
 
$\hat{y}= 
\begin{bmatrix}
\hat{y}_0 \\
\hat{y}_1
\end{bmatrix}$ with  

$\hat{y_0} = \dfrac{e^{z_0^{(1)}}}{e^{z_0^{(1)}}+e^{z_1^{(1)}}}$ and 

$\hat{y_1} = \dfrac{e^{z_1^{(1)}}}{e^{z_0^{(1)}}+e^{z_1^{(1)}}}$.




<br>

**For the loss function (cross entropy for multiclass classification)**:

$\text{Loss}(\hat{y}, y)$ where 

$\hat{y}$ is the prediction and 

$y$ is the target.

### 5.4 Backward pass

A **backward pass**, or **backpropagation**, is the process by which layer weights and biases are updated during training. 

All this is part of something called a **"training loop"**. 

---

This involves 

**propagating data forward**, **comparing outputs (predictions) to true values**, 

then **propagating backwards to improve each layer's weights and biases** using some handy math.

---

We repeat several times until the model is tuned with meaningful weights and biases. 

So during training, the backward pass is the complementary step to the forward pass.

## 5.4.1 Loss functions to evaluate model predictions

* In the created neural networks, the model is trained by the inputs and then predictions are returned as outputs. 

*  We'll now assess the differences between actual values and those predicted by the network using the loss function
    * the loss function tells us how good our model is at making predictions during training. 
    * the inputs of the loss function are  a model prediction as `y_hat`, and true label, or ground truth as `y`
    * the output is a floating number.
 
* Our goal in the backward pass is to minimize the loss. This can be done by updating the values of the weights and biases. Weights and biases are the trainable parameters of the model.

**Visualizing a loss function with a minimum value**

![](images/minimizing_loss.png)

This means that we will be using loss functions that are differentiable with respect to the weights and biases.

Note that the forward pass is $$\left(z_1 = X\circ w_0 + b_0\right)\;  \Longrightarrow \left(z_1 = z_0 \circ w_1 + b_1\right) \; \Longrightarrow \left( \hat{y} = S(z_1) \right)\; \Longrightarrow \;\text{loss} = \text{Loss}(\hat{y}, y)$$

To update the weights and biases:

*  *Updating the weight* : `weight = weight - lr * weight_grad`
*  *Updating thew bias* :   `bias = bias - lr * bias_grad`

where `lr` is the learning rate and `weight_grad` and `bias_grad` are the gradients of the weights and biases, respectively.


In our previous neural networks, we have 12 trainable parameters:
8 weights and 4 biases.

![](images/neural_network_v3.png)

<br>

Since the loss function is $\text{Loss}\left(\hat{y}, y\right)$ and

\begin{align*}
\hat{y} &= S(z_1) \\
\hat{y_0} &= \dfrac{e^{z_0^{1}}}{e^{z_0^{(1)}}+e^{z_1^{(1)}}}\\
\hat{y_1} &= \dfrac{e^{z_1^{1}}}{e^{z_0^{(1)}}+e^{z_1^{(1)}}}\\
z_{0}^{(1)} &= z_{0}^{(0)} w_{00}^{(1)} + z_{1}^{(0)} w_{10}^{(1)} + b_{0}^{(1)}\\
z_{1}^{(1)} &= z_{0}^{(0)} w_{01}^{(1)} + z_{1}^{(0)} w_{11}^{(1)} + b_{1}^{(1)}\\
z_{0}^{(0)} &= x_{0}^{(0)} w_{00}^{(0)} + x_{1}^{(0)} w_{10}^{(0)} + b_{0}^{(0)}\\
z_{1}^{(0)} &= x_{0}^{(0)} w_{01}^{(0)} + x_{1}^{(0)} w_{11}^{(0)} + b_{1}^{(0)}\\
\end{align*}

The gradients in the 2nd Linear network:

**(1) Gradient with respect to $w_{00}^{(1)}$**:

$\dfrac{\partial{\text{Loss}}}{\partial{w_{00}^{(1)}}} = 
\left(\dfrac{\partial{\text{loss}}}{\partial{\hat{y}_0}}\right)
\left(\dfrac{\partial{\hat{y}_0}}{\partial{z_0^{(1)}}}\right)
\left(\dfrac{\partial{z_0^{(1)}}}{\partial{w_{00}^{(1)}}}\right)= 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}}}\right) \cdot
 S(z_0^{(1)})\cdot S(z_1^{(1)}) \cdot z_{0}^{(0)}$

**(2) Gradient with respect to $w_{01}^{(1)}$**:

$\dfrac{\partial{\text{Loss}}}{\partial{w_{01}^{(1)}}} = 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}_1}}\right)
\left(\dfrac{\partial{\hat{y}_1}}{\partial{z_1^{(1)}}}\right)
\left(\dfrac{\partial{z_1^{(1)}}}{\partial{w_{01}^{(1)}}}\right)= 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}}}\right) \cdot
 S(z_0^{(1)})\cdot S(z_1^{(1)}) \cdot z_{0}^{(0)}$

**(3) Gradient with respect to $w_{10}^{(1)}$**:

 $\dfrac{\partial{\text{Loss}}}{\partial{w_{10}^{(1)}}} = 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}_0}}\right)
\left(\dfrac{\partial{\hat{y}_0}}{\partial{z_0^{(1)}}}\right)
\left(\dfrac{\partial{z_0^{(1)}}}{\partial{w_{10}^{(1)}}}\right)= 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}}}\right) \cdot
 S(z_0^{(1)})\cdot S(z_1^{(1)}) \cdot z_{1}^{(0)}$


**(4) Gradient with respect to $w_{11}^{(1)}$**:

 $\dfrac{\partial{\text{Loss}}}{\partial{w_{11}^{(1)}}} = 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}_1}}\right)
\left(\dfrac{\partial{\hat{y}_1}}{\partial{z_1^{(1)}}}\right)
\left(\dfrac{\partial{z_1^{(1)}}}{\partial{w_{11}^{(1)}}}\right)= 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}}}\right) \cdot
 S(z_0^{(1)})\cdot S(z_1^{(1)}) \cdot z_{1}^{(0)}$

**(5) Gradient with respect to $b_{0}^{(1)}$**:

$\dfrac{\partial{\text{Loss}}}{\partial{b_0^{(1)}}} = 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}_0}}\right)
\left(\dfrac{\partial{\hat{y}_0}}{\partial{z_0^{(1)}}}\right)
\left(\dfrac{\partial{z_0^{(1)}}}{\partial{b_{0}^{(1)}}}\right) =
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}}}\right) \cdot
 S(z_0^{(1)})\cdot S(z_1^{(1)})$

**(6) Gradient with respect to $b_{1}^{(1)}$**:

$\dfrac{\partial{\text{Loss}}}{\partial{b_1^{(1)}}} = 
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}_1}}\right)
\left(\dfrac{\partial{\hat{y}_1}}{\partial{z_1^{(1)}}}\right)
\left(\dfrac{\partial{z_1^{(1)}}}{\partial{b_{1}^{(1)}}}\right) =
\left(\dfrac{\partial{\text{Loss}}}{\partial{\hat{y}}}\right) \cdot
 S(z_0^{(1)})\cdot S(z_1^{(1)})$


The gradient of a weight is $\dfrac{\partial{\text{loss}}}{\partial{\text{weight}}}$
which is `weight.grad` in PyTorch.

Also, The gradient of a bias in PyTorch  is $\dfrac{\partial{\text{loss}}}{\partial{\text{bias}}}$
which is `bias.grad` in PyTorch.

The gradients for all the tensors that require gradient in the computational graph can be computed by calling `loss.backward()`. 

`Loss.backward()` uses the chain rule to propagate the gradients from the output tensor (usually the loss) to the input tensors (usually the model parameters).

To update the weights and biases:

*  *Updating the weight* : `weight = weight - lr * weight_grad`
*  *Updating thew bias* :   `bias = bias - lr * bias_grad`


 where
 * $\text{weight\_grad} = \dfrac{\partial\,{\text{loss}}}{\partial{w}}$
 * $\text{bias\_grad} = \dfrac{\partial\,{\text{loss}}}{\partial{b}}$
 * `lr` is the **learning rate**.

 In deep learning the learning  rate is a hyperparameter that determines the step size at which a neural network's model parameters are updated during training. 
It controls how much the model learns from each new data point.



### 5.4.3 Loss functions for classification and regression

**Binary cross entropy loss function (BCE)** for binary classification problem. In PyTorch we use (`nn.BCELoss()`)

*  it measures the difference between the predicted probabilities, `y_hat` and the true labels `y`. 

*  The formula for binary cross entropy loss is:
    
    $$L(y,p) = - [y \cdot \log{p} + (1-y)\log(1-p)]$$
    
where 

$L(y, p)$ is the binary cross-entropy loss, 

$y$ is the true binary label ($0$ or $1$), and 

$p$ is the predicted probability that the instance belongs to class 1 
with

*   `p = y_hat` if `y_hat > 0.5` and 
*  `1-p = y_hat` if `y_hat <= 0.5`.    
    
Its derivative with respect to $p$ is

$$\frac{dL}{dp} = -\dfrac{y}{p} + \dfrac{1 - y}{1 - p}$$


The **BCE** is usually combined with **sigmoid** activation function.


In PyTorch, the **cross-entropy loss function** is commonly used for multi-class classification problems and is typically referred to as "CrossEntropyLoss." (`nn.CrossEntropyLoss()`)

This loss function combines the `softmax function` (to convert raw scores into class probabilities) and the negative log-likelihood loss.


The formula forthe  **cross-entropy loss function**  in one observation is given by:

$$L(y, p) = - \sum_{i}^{n}  y_i  \cdot \log{p_i}$$

where $L(y, p)$ is the cross-entropy loss function,

$n$ is the number of classes

$y$  is the true multiclass label, which is a vector of one-hot encoded labels, where only the true class is $1$, and all other entries are $0$.

$p$  represents the predicted probability distribution over classes.

The `nn.CrossEntropyLoss` function automatically applies the **softmax** function to the output tensor and computes the **cross-entropy loss** between the predicted class probabilities and the true labels.

n PyTorch, for regression tasks where we want to predict continuous values rather than class labels, we typically use mean squared error (MSE) or mean absolute error (MAE) loss functions.

**Mean Squared Error (MSE) Loss (L2 Loss)**: `nn.MSELoss()`.

This loss function measures the average of the squared differences between the predicted values and the true target values.
Mean Absolute Error (MAE) Loss (L1 Loss):  nn.L1Loss().

This loss function measures the average of the absolute differences between the predicted values and the true target value
. It is also suitable for regression tasks, particularly iweou want your model to be less sensitive to outliers.

In [136]:
# Binary classification

import torch
import torch.nn as nn

# torch.manual_seed(12345)
input_tensor = torch.randn(1,9)
target = torch.tensor([[1., 0.]])

model = nn.Sequential(
    nn.Linear(9,2),
    nn.Sigmoid() # Sigmoid activation function                 
)

prediction = model(input_tensor)

criterion = nn.BCELoss()

# Calculate the loss
loss = criterion(prediction, target)

# Compute the gradients of the loss
loss.backward()

In [150]:
lr = 0.01
print(f"original weights:\n {model[0].weight.data}")
print(f"updated weights:\n  {model[0].weight - lr * model[0].weight.grad}")

original weights:
 tensor([[-0.1853,  0.2824, -0.1396, -0.0936,  0.2140, -0.1216,  0.0378, -0.1440,
         -0.2662],
        [-0.0854, -0.2092,  0.1421,  0.3006, -0.0994,  0.2449,  0.0683,  0.0028,
          0.0767]])
updated weights:
  tensor([[-0.1842,  0.2830, -0.1408, -0.0938,  0.2137, -0.1167,  0.0390, -0.1414,
         -0.2651],
        [-0.0864, -0.2097,  0.1432,  0.3007, -0.0992,  0.2405,  0.0672,  0.0005,
          0.0757]], grad_fn=<SubBackward0>)


<br>

In PyTorch, for regression tasks where we want to predict continuous values rather than class labels, we typically use **mean squared error (MSE)** or **mean absolute error (MAE) loss functions**. 

*   **Mean Squared Error (MSE) Loss (L2 Loss)**:  `nn.MSELoss()`.

    *   This loss function measures the average of the squared differences between the predicted values and the true target values. 

*   **Mean Absolute Error (MAE) Loss (L1 Loss)**: ` nn.L1Loss()`.
    *    
This loss function measures the average of the absolute differences between the predicted values and the true target value
    *    . It is also suitable for regression tasks, particularly iweou want your model to be less sensitive to outliers.

### 5.4.4 The need for an optimizer

*  Some functions have one minimum and one only, called the "global" minimum. These functions are "convex". 
*  Some "non-convex" functions have more than one "local" minimum.

![](images/convex_and_non_convex_functions.png)

At a local minimum, the function value is lowest compared to nearby points, but points further away may be even lower. 

When minimizing loss functions, our goal is to find the global minimum of the non-convex function, here, when x is approximately one.

Loss functions used in deep learning are not convex! 

To find global minima of non-convex functions, we use a mechanism called "gradient descent" which is called "optimizers" in PyTorch. 

In [152]:
import torch.optim as optim
# torch.manual_seed(908) # for 01
# torch.manual_seed(1) # for 11
torch.manual_seed(2)  # for 10
#torch.manual_seed(3)  # for 11
#torch.manual_seed(1000)  # for 00

input_tensor = torch.randn(1,9)
target = torch.tensor([[1., 0.]])

model = nn.Sequential(
    nn.Linear(9,2),
    nn.Sigmoid() # Sigmoid activation function                 
)

first_prediction = model(input_tensor)

criterion = nn.BCELoss()

# Calculate the loss
loss = criterion(first_prediction, target)

# Create the optimizer
optimizer = optim.SGD(model.parameters(), lr=1, momentum=1)

# Compute the gradients of the loss
#loss.backward(retain_graph=True)
loss.backward()

# Update the model's parameters using the optimizer
optimizer.step()


final_prediction = model(input_tensor)

print("first prediction = ", (first_prediction > 0.5).float())
print("final prediction = ", (final_prediction > 0.5).float())
print("target = ", target)

first prediction =  tensor([[0., 1.]])
final prediction =  tensor([[1., 0.]])
target =  tensor([[1., 0.]])


## 5.5 Writing a training loop

In scikit-learn, the whole training loop is contained in the `.fit()` method. 

In PyTorch, however, we have to implement the loop manually.

### 5.5.1 Training a neural network

1)  Create a model
    
2)  Choose a loss function

3)  Create a dataset

4)  Define an optimizer

5)  Run a training loop, where for each sample of the dataset, we repeat:

    *   Calculating loss (forward pass)
     
    *   Calculating local gradients
     
    *   Updating model parameters

In [154]:
import pandas as pd
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

### 5.5.2 Before the training loop

In [155]:
import pandas as pd
salaries = pd.read_csv("datasets/ds_salaries.csv", index_col = "Unnamed: 0")
target_df = salaries["salary_in_usd"]
features_df = salaries[["experience_level", "employment_type", "remote_ratio", "company_size"]]

In [158]:
print(salaries.columns)

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')


In [159]:
salaries.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [160]:
# transforming features' type to category 
features_df.loc[:, "company_size"] = features_df["company_size"].astype("category").cat.codes
features_df.loc[:, "experience_level"] = features_df["experience_level"].astype("category").cat.codes
features_df.loc[:, "employment_type"] = features_df["employment_type"].astype("category").cat.codes
features = features_df.to_numpy(dtype='float32')
features.shape

(607, 4)

In [161]:
features_df.head()

Unnamed: 0,experience_level,employment_type,remote_ratio,company_size
0,2,2,0,0
1,3,2,0,2
2,3,2,50,1
3,2,2,0,2
4,3,2,50,0


In [162]:
features[:5]

array([[ 2.,  2.,  0.,  0.],
       [ 3.,  2.,  0.,  2.],
       [ 3.,  2., 50.,  1.],
       [ 2.,  2.,  0.,  2.],
       [ 3.,  2., 50.,  0.]], dtype=float32)

In [163]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

`MinMaxScaler()` is a function in scikit-learn that transforms features by scaling each feature to a given range, such as between 0 and 1. 

It is often used as an alternative to zero mean, unit variance scaling, which is also known as standardization. 

`MinMaxScaler()` can be useful for data that has outliers or different scales, as it preserves the shape of the original distribution and does not reduce the importance of small variations.

The main idea behind normalization/standardization is always the same. 

Variables that are measured at different scales do not contribute equally to the model fitting & model learned function and might end up creating a bias. 

Thus, to deal with this potential problem feature-wise normalization such as `MinMax Scaling` is usually used prior to model fitting.

In [168]:
target_df.describe()

count       607.000000
mean     112297.869852
std       70957.259411
min        2859.000000
25%       62726.000000
50%      101570.000000
75%      150000.000000
max      600000.000000
Name: salary_in_usd, dtype: float64

In [173]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
target = scaler.fit_transform(target_df.values.reshape(-1, 1))
target = target.astype(np.float32)
target.shape

(607, 1)

In [182]:
pd.Series(target.reshape(607,)).describe()

count    607.000000
mean       0.183271
std        0.118828
min        0.000000
25%        0.100256
50%        0.165306
75%        0.246409
max        1.000000
dtype: float64

## 6. PyTorch's `dataset` and `dataloader`

PyTorch `dataset` and `dataloader` are two data primitives that allow us to use pre-loaded datasets as well as our own data for training deep learning models. 

A `dataset` is an object that stores the **samples** and their corresponding **labels**, and a `dataloader` is an object that wraps an **iterable** around the dataset to enable easy access to the samples in **batches**. 

The batch size parameter in `dataloader` in pytorch is the number of samples that will be loaded into memory at each iteration. 

This is controlled by the `batch_size` argument in the dataloader. 

By default, this is set to `1`. 

There are a few reasons why we might want to change the batch size:

*   If we are training on a GPU, we can increase the batch size to make better use of the GPU’s processing power.
*  
Ifwer’re training a deep neural network,weu may need to increase the batch size to avoid overfitting onyour training data.
*  
Iweou have a lot of data and wanweur training to run fasterweou can increase the batch size so that more data is processed per training iteratio

In deep learning, **epoch** and **batch size** are two key parameters that control the training of a model.

*Epoch** is the number of times the training dataset is passed through the neural network during training. 
One epoch is one complete pass through the training dataset.

**Batch size** is the number of training examples used in one iteration. The size of a batch must be more than or equal to one and less than or equal to the number of samples in the training dataset. The higher the batch size, the more memory space you’ll needn.

In [183]:
# Create the dataset and the dataloader
dataset = TensorDataset(torch.tensor(features), torch.tensor(target))
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Create the model
model = nn.Sequential(nn.Linear(4, 2),                       
                      nn.Linear(2, 1))

# Create the loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

### 6.1 The training loop

In [186]:
def show_results(model, dataloader):
    model.eval()
    total_loss = 0
    total_samples = 0
    
    with torch.no_grad():
        for data in dataloader:
            feature, target = data
            pred = model(feature)
            loss = criterion(pred, target)
            total_loss += loss.item() * feature.size(0)
            total_samples += feature.size(0)
            
            # Print ground truth and predicted salaries
            for i in range(len(target)):
                print("Ground truth salary: {:.3f}. Predicted salary: {:.3f}".format(target[i].item(), pred[i].item()))
    
    average_loss = total_loss / total_samples
    print("Average Loss: {:.4f}".format(average_loss))

In [187]:
# Combining the codes

import pandas as pd
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
torch.manual_seed(1) # for 01

salaries = pd.read_csv("datasets/ds_salaries.csv")
target_df = salaries["salary_in_usd"]
features_df = salaries[["experience_level", "employment_type", "remote_ratio", "company_size"]]
features_df.loc[:, "company_size"] = features_df["company_size"].astype("category").cat.codes
features_df.loc[:, "experience_level"] = features_df["experience_level"].astype("category").cat.codes
features_df.loc[:, "employment_type"] = features_df["employment_type"].astype("category").cat.codes
features = features_df.to_numpy(dtype='float32')
scaler = MinMaxScaler()
target = scaler.fit_transform(target_df.values.reshape(-1, 1))
target = target.astype(np.float32)

# Create the dataset and the dataloader
dataset = TensorDataset(torch.tensor(features), torch.tensor(target))
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Create the model
model = nn.Sequential(nn.Linear(4, 2),                       
                      nn.Linear(2, 1))

# Create the loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Loop over the number of epochs and the dataloader
num_epochs = 10
for epoch in range(num_epochs):
    for data in dataloader:
        # Set the gradients to zero
        optimizer.zero_grad()
        # Get feature and target from the data loader
        feature, target = data
        # Run a forward pass
        pred = model(feature)
        # Compute loss and gradients
        loss = criterion(pred, target)
        loss.backward()
        # Update the parameters
        optimizer.step()
        
show_results(model, dataloader)

Ground truth salary: 0.148. Predicted salary: 0.088
Ground truth salary: 0.253. Predicted salary: 0.208
Ground truth salary: 0.280. Predicted salary: 0.153
Ground truth salary: 0.205. Predicted salary: 0.154
Ground truth salary: 0.179. Predicted salary: 0.153
Ground truth salary: 0.213. Predicted salary: 0.165
Ground truth salary: 0.093. Predicted salary: 0.175
Ground truth salary: 0.213. Predicted salary: 0.208
Ground truth salary: 0.288. Predicted salary: 0.208
Ground truth salary: 0.096. Predicted salary: 0.262
Ground truth salary: 0.145. Predicted salary: 0.175
Ground truth salary: 0.330. Predicted salary: 0.153
Ground truth salary: 0.224. Predicted salary: 0.208
Ground truth salary: 0.167. Predicted salary: 0.185
Ground truth salary: 0.364. Predicted salary: 0.229
Ground truth salary: 0.087. Predicted salary: 0.219
Ground truth salary: 0.104. Predicted salary: 0.133
Ground truth salary: 0.339. Predicted salary: 0.251
Ground truth salary: 0.330. Predicted salary: 0.153
Ground truth

The inverse function of `scaler.fit_transform` in `scikit-learn` is `scaler.inverse_transform`, which scales the data back to the original representation. 

You can use this function to transform the predicted values into the original scale of the data. 

For example, if you have used StandardScaler to scale your data before fitting a model, you can use `StandardScaler.inverse_transform` to undo the scaling on the predictions. 

In [188]:
# from sklearn.preprocessing import StandardScaler.inverse_transform
# StandardScaler.inverse_transform(pred)
torch.mean(pred)

tensor(0.2061, grad_fn=<MeanBackward0>)

## 7. Activation functions between layers

![](images/sigmoid_function.png)


Some of the limitations of the sigmoid function. 

*  Bounded between 0 and 1.

*  Can be used anywhere in the network.

**Gradients**

*  Approach zero for low and high values of $x$

*  It will cause the function to saturate. 

* This can lead to vanishing gradients during backpropagation.


**This is also a problem for softmax**





### 5.6.1 Introduction ReLU, Leaky ReLU and ELU

![](images/relu.png)

**ReLU**  Rectified Linear Unit 

*  $f(x) = \max(x,0)$

*  for positive inputs, the output is equal to 
the input

*  for strictly negative inputs, the output is   equal to  zero

*  it oovercomes the vanishing gradients problem

*  however, it is not differentiable at $x=0$

*  The problem of “dead ReLU” or “dying ReLU” refers to a situation in neural networks where certain neurons using the Rectified Linear Unit (ReLU) activation function become inactive during training and never recover.

   *  Such neurons always output zero and do not contribute to the learning process
*  In pytorch, `relu = nn.ReLU()`






![](images/leaky_relu.png)


**Leaky ReLU**

* For positive inputs, it behaves similarly to ReLU

* For negative inputs, it multiplies the input by a small coefficient (defaulted to 0.01)

* The gradients for negative inputs are never null.

* In PyTorch, `leaky_relu = nn.LeakyReLU(negative_slope = 0.05)`



![](images/elu_and_its_derivative.png) 

**ELU**  Exponential Linear Uni

*  $f(x) = \begin{cases}x & \text{if $x > 0$}\\
                       \alpha(e^x-1) & \text{if $x \leq 0$}\end{cases}$

*  Tend to converge faster than ReLU (because mean ELU activations are closer to zero)

*  Fully continuous

*  Fully differentiable

*  Does not have a vanishing gradients problem

*  Does not have an exploding gradients problem

*  Does not have a dead relu problem.

*  In pytorch,  `F.elu(input, alpha=1.0, inplace=False)` which applies the ELU function to the input tensor with `alpha=1.0` and not operating `in-place`

In [250]:
# Create a ReLU function with PyTorch
relu_pytorch = nn.ReLU()

# Apply your ReLU function on x, and calculate gradients
x = torch.tensor(1.5, requires_grad=True)
y = relu_pytorch(x)

y.backward()

# Print the gradient of the ReLU function for x
gradient = x.grad
print(gradient)

tensor(1.)


In [251]:
# Create a ReLU function with PyTorch
leaky_relu = nn.LeakyReLU()

# Apply your ReLU function on input, and calculate gradients
# torch.manual_seed(0)

input = torch.tensor(-2.0, requires_grad=True)

output = leaky_relu(input)
output.backward()

# Print the gradient of the ReLU function for x
radient = input.grad

print(input)
print(output)

tensor(-2., requires_grad=True)
tensor(-0.0200, grad_fn=<LeakyReluBackward0>)


In [252]:
import torch.nn.functional as F
torch.manual_seed(0)

input = torch.randn(4) # create a random input tensor of size 4
output = F.elu(input, alpha=1.0, inplace=False) #

print(input)
print(output)

tensor([ 1.5410, -0.2934, -2.1788,  0.5684])
tensor([ 1.5410, -0.2543, -0.8868,  0.5684])


In [None]:
## 5.7 Counting the number of parameters

In [253]:
model = nn.Sequential(nn.Linear(16, 4),
                      nn.Linear(4, 2),
                      nn.Linear(2, 1))

total = 0

# Calculate the number of parameters in the model
for parameter in model.parameters():
  total += parameter.numel()
print(total)

81


# Create a neural network with exactly four linear layers and more than 120 parameters

which takes n_features as inputs and outputs n_classes.

In [255]:
def calculate_capacity(model):
  total = 0
  for p in model.parameters():
    total += p.numel()
  return total

In [256]:
n_features = 8
n_classes = 2

input_tensor = torch.Tensor([[3, 4, 6, 2, 3, 6, 8, 9]])

# Create a neural network with more than 120 parameters
model = nn.Sequential(
    nn.Linear(n_features, 8),
    nn.Linear(8, 4),
    nn.Linear(4, 4),
    nn.Linear(4, n_classes)
)

output = model(input_tensor)

print(calculate_capacity(model))

138


Using the TensorDataset class

In [257]:
import numpy as np
import torch
from torch.utils.data import TensorDataset

np_features = np.array(np.random.rand(12, 8))
np_target = np.array(np.random.rand(12, 1))

# Convert arrays to PyTorch tensors
torch_features = torch.tensor(np_features)
torch_target = torch.tensor(np_target)

# Create a TensorDataset from two tensors
dataset = TensorDataset(torch_features, torch_target)

# Return the last element of this dataset
print(dataset[-1])

(tensor([0.5978, 0.0183, 0.0100, 0.3260, 0.4076, 0.9300, 0.3318, 0.9228],
       dtype=torch.float64), tensor([0.5949], dtype=torch.float64))
