# HW1 PT2 Vyacheslav Stepanyan

# **Task 1**: Implementing and analyzing Custom Loss Functions in PyTorch

This task sequence introduces the development of custom loss functions in PyTorch, with a focus on applying theoretical knowledge to practical implementation. As an initial example, the L1 Loss (Mean Absolute Error) function is fully implemented, demonstrating how to extend PyTorch's nn.Module to create custom loss computations. Following this example **implement** additional loss functions, including L2 Loss (Mean Squared Error), Binary Cross-Entropy Loss, and Cross-Entropy Loss for multi-class classification.

In [1]:
import torch
import torch.nn as nn

class L1Loss(nn.Module):
    """
    Mean Absolute Error (MAE), i.e. the L1 loss, averaged over all elements.
    """
    def forward(self, y_pred, y_true):
        """
        Compute the mean of the element-wise absolute errors.

        :param y_pred: Predicted values (Tensor).
        :param y_true: Ground truth values (Tensor).
        :return: Scalar tensor holding mean(|y_pred - y_true|).
        """
        residual = y_pred - y_true
        return residual.abs().mean()


# Example usage
if __name__ == "__main__":
    # Sample predictions and targets. y_pred tracks gradients so that the
    # backward pass below can populate y_pred.grad.
    y_pred = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
    y_true = torch.tensor([1.5, 2.5, 3.0, 4.5])

    # Instantiate the custom L1Loss defined in this cell.
    criterion = L1Loss()

    # Compute the loss with the custom L1Loss, then with PyTorch's nn.L1Loss
    # as a reference; the two printed values are expected to agree.
    loss = criterion(y_pred, y_true)
    print(f"Loss: {loss}")
    loss2 = nn.L1Loss()
    out = loss2(y_pred,y_true)
    print(f"Loss: {out}")

    # Backpropagate only the custom loss; `out` is never backpropagated, so
    # y_pred.grad reflects the custom implementation alone.
    loss.backward()
    print(f"Gradients on y_pred: {y_pred.grad}")

Loss: 0.375
Loss: 0.375
Gradients on y_pred: tensor([-0.2500, -0.2500,  0.0000, -0.2500])


In [2]:
class L2Loss(nn.Module):
    """
    Mean Squared Error (MSE), i.e. the L2 loss, averaged over all elements.
    """
    def forward(self, y_pred, y_true):
        """
        Compute the mean of the element-wise squared errors.

        :param y_pred: Predicted values (Tensor).
        :param y_true: Ground truth values (Tensor).
        :return: Scalar tensor holding mean((y_pred - y_true)^2).
        """
        residual = y_pred - y_true
        return residual.pow(2).mean()

# Example usage
if __name__ == "__main__":
    # Sample predictions and targets. y_pred tracks gradients so that the
    # backward pass below can populate y_pred.grad.
    y_pred = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
    y_true = torch.tensor([1.5, 2.5, 3.0, 4.5])
    # Both y_pred and y_true are PyTorch tensors.

    # Instantiate the custom L2Loss defined in this cell.
    criterion = L2Loss()
    
    # Compute the loss with the custom L2Loss, then with PyTorch's nn.MSELoss
    # as a reference; the two printed values are expected to agree.
    loss = criterion(y_pred, y_true)
    print(f"Manual Loss: {loss}")
    loss2 = nn.MSELoss()
    out = loss2(y_pred,y_true)
    print(f"Torch Loss: {out}")

    # Backpropagate only the custom loss; `out` is never backpropagated, so
    # y_pred.grad reflects the custom implementation alone.
    loss.backward()
    print(f"Gradients on y_pred: {y_pred.grad}")

Manual Loss: 0.1875
Torch Loss: 0.1875
Gradients on y_pred: tensor([-0.2500, -0.2500,  0.0000, -0.2500])


In [3]:
class BCELoss(nn.Module):
    """
    Hand-written Binary Cross-Entropy (BCE) loss.
    PyTorch ships nn.BCELoss; this version exists to show the computation.
    """
    def forward(self, y_pred, y_true):
        """
        Compute the mean binary cross-entropy between probabilities and labels.

        :param y_pred: Predicted probabilities (Tensor) with values in range [0, 1].
        :param y_true: Ground truth values (Tensor) with binary values 0 or 1.
        :return: Scalar tensor representing the BCE loss.
        """
        # Keep probabilities strictly inside (0, 1) so neither logarithm below
        # is evaluated at 0.
        probs = y_pred.clamp(min=0.00000001, max=0.9999999)
        positive_term = y_true * probs.log()
        negative_term = (1 - y_true) * (1 - probs).log()
        return -(positive_term + negative_term).mean()

# Example usage
if __name__ == "__main__":
    # Sample predicted probabilities and binary targets. y_pred tracks
    # gradients so that the backward pass below can populate y_pred.grad.
    y_pred = torch.tensor([0.1, 0.7, 0.05, 0.15], requires_grad=True)
    y_true = torch.tensor([0., 1., 0., 0.])
    # Both y_pred and y_true are PyTorch tensors.

    # Instantiate the custom BCELoss defined in this cell.
    criterion = BCELoss()

    # Compute the loss with the custom BCELoss, then with torch.nn.BCELoss as
    # a reference; the two printed values are expected to agree.
    loss = criterion(y_pred, y_true)
    print(f"Manual Loss: {loss}")
    loss2 = torch.nn.BCELoss()
    out = loss2(y_pred,y_true)
    print(f"Torch Loss: {out}")

    # Backpropagate only the custom loss; `out` is never backpropagated, so
    # y_pred.grad reflects the custom implementation alone.
    loss.backward()
    print(f"Gradients on y_pred: {y_pred.grad}")

Manual Loss: 0.16896192729473114
Torch Loss: 0.16896192729473114
Gradients on y_pred: tensor([ 0.2778, -0.3571,  0.2632,  0.2941])


The function above does not reach a loss of exactly 0 due to clamping.

In [4]:
class CELoss(nn.Module):
    """
    Cross-Entropy loss for multi-class classification, implemented without
    PyTorch's log_softmax / nll_loss helpers.
    """
    def __init__(self):
        super().__init__()

    def forward(self, logits, targets):
        """
        Forward pass for Cross-Entropy loss.

        The log-probabilities are computed directly with the log-sum-exp
        identity, log_softmax(z) = (z - max z) - log(sum(exp(z - max z))),
        rather than as log(softmax(z)). Taking the logarithm of an already
        exponentiated value breaks when a probability underflows to 0:
        log(0) = -inf, which yields inf for a target class and nan
        (0 * -inf) for a non-target class.

        :param logits: Logits from the model (Tensor). Shape: [batch_size, num_classes].
        :param targets: Ground truth class indices (Tensor). Shape: [batch_size].
        :return: Scalar tensor representing the CE loss (mean over the batch).
        """
        # Shift by the row-wise maximum so that the largest exponent is
        # exp(0) = 1; this keeps the sum of exponentials in [1, num_classes].
        shifted = logits - logits.max(dim=1, keepdim=True)[0]
        log_normalizer = torch.log(torch.exp(shifted).sum(dim=1, keepdim=True))
        log_probs = shifted - log_normalizer

        # Pick the log-probability of the target class in every row. This is
        # equivalent to multiplying by a one-hot matrix and summing, but never
        # multiplies the non-target entries by zero.
        rows = torch.arange(logits.shape[0], device=logits.device)
        target_log_probs = log_probs[rows, targets]

        return -target_log_probs.sum() / (logits.shape[0])

# Example usage
if __name__ == "__main__":
    # Sample logits (3 samples x 3 classes) and integer class targets.
    # y_pred tracks gradients so that the backward pass below can populate
    # y_pred.grad.
    y_pred = torch.tensor([[0.8, 0.4, 0.9], [0.1, 0.3, 0.1], [0.7, 0.6, 0.2]], requires_grad=True)
    y_true = torch.tensor([0, 1, 0])

    # Instantiate the custom CELoss defined in this cell.
    criterion = CELoss()

    # Compute the loss with the custom CELoss and print it as a Python float.
    loss = criterion(y_pred, y_true)
    print("Cross-Entropy Loss:", loss.item())
    # Reference value from torch.nn.CrossEntropyLoss on the same inputs; the
    # two printed values are expected to agree.
    loss2 = torch.nn.CrossEntropyLoss()
    out = loss2(y_pred,y_true)
    print(f"Torch Loss: {out}")
    # Backpropagate only the custom loss; `out` is never backpropagated, so
    # y_pred.grad reflects the custom implementation alone.
    loss.backward()
    print("Gradients:", y_pred.grad)

Cross-Entropy Loss: 0.9704906940460205
Torch Loss: 0.9704906940460205
Gradients: tensor([[-0.2132,  0.0805,  0.1327],
        [ 0.1035, -0.2069,  0.1035],
        [-0.2006,  0.1201,  0.0805]])


# **Task 2:** Implementing Custom Activation Functions in PyTorch


This task involves developing a set of custom activation functions in PyTorch, understanding their roles in neural networks, and how they can be implemented from scratch. Activation functions are crucial for introducing non-linearity into the network, allowing for the learning of complex patterns in the data. You'll start with an example of the ReLU (Rectified Linear Unit) activation function and then proceed to **implement** additional activation functions such as Sigmoid, Tanh, and Softmax, followed by a **comparison** with PyTorch's built-in implementations.

**The backward calculation for the Softmax function is not straightforward; hence, you may rely solely on PyTorch's built-in functionality for the backward pass.**

In [5]:
class ReLU(nn.Module):
    """
    Implement the ReLU activation function with a hand-written backward pass.

    forward() caches which inputs were strictly positive; backward() uses that
    mask to route the upstream gradient, mirroring how Sigmoid and Tanh in
    this file cache their forward output.
    """
    def __init__(self):
        super(ReLU, self).__init__()
        # Boolean mask (x > 0) from the most recent forward call; None until
        # forward() has been called at least once.
        self.positive_mask = None

    def forward(self, x):
        """
        Forward pass for ReLU.
        :param x: Input tensor.
        :return: Output tensor where ReLU(x) = max(0, x).
        """
        # The derivative of ReLU depends on the sign of the *input*, so it has
        # to be remembered here for the backward pass.
        self.positive_mask = x > 0
        return torch.maximum(torch.zeros_like(x), x)

    def backward(self, grad_output):
        """
        Backward pass for custom ReLU.
        :param grad_output: Gradient tensor of the output (same shape as the
                            input of the last forward call).
        :return: Gradient tensor for the input.
        :raises RuntimeError: If called before any forward pass.
        """
        if self.positive_mask is None:
            raise RuntimeError("ReLU.backward() called before forward().")
        # d ReLU(x) / dx is 1 where x > 0 and 0 elsewhere (0 is also used at
        # x == 0, matching torch.relu). The upstream gradient therefore passes
        # through unchanged for positive inputs and is blocked otherwise.
        return torch.where(
            self.positive_mask, grad_output, torch.zeros_like(grad_output)
        )

# Example usage
if __name__ == "__main__":
    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Initialize the custom ReLU activation function.
    custom_relu = ReLU()

    # Compute the activation using the custom ReLU class.
    activated_x_custom = custom_relu(x)

    # Run the hand-written backward pass with an upstream gradient of ones,
    # the same upstream gradient that is fed to PyTorch's autograd below.
    gradients_custom = custom_relu.backward(torch.ones_like(activated_x_custom))

    # Print the outputs and gradients from the custom implementation.
    print("Custom ReLU output:", activated_x_custom)
    print("Custom ReLU gradients:", gradients_custom)

    # Clear any gradient stored on x so the autograd result below is not
    # accumulated onto it.
    x.grad = None

    # Compute the activation using PyTorch's built-in relu function.
    activated_x_torch = torch.relu(x)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(torch.ones_like(x))
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch ReLU output:", activated_x_torch)
    print("PyTorch ReLU gradients:", gradients_torch)

Custom ReLU output: tensor([0., 0., 1., 2.], grad_fn=<MaximumBackward0>)
Custom ReLU gradients: tensor([0., 0., 1., 1.], grad_fn=<IndexPutBackward0>)
PyTorch ReLU output: tensor([0., 0., 1., 2.], grad_fn=<ReluBackward0>)
PyTorch ReLU gradients: tensor([0., 0., 1., 1.])


In [6]:
class Sigmoid(nn.Module):
    """
    Sigmoid activation with a hand-written backward pass.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x):
        """
        Apply the logistic function element-wise and cache the result.
        :param x: Input tensor.
        :return: Output tensor where Sigmoid(x) = 1 / (1 + exp(-x)).
        """
        denominator = 1 + torch.exp(-x)
        # The activation is stored because backward() expresses the
        # derivative purely in terms of the forward output.
        self.sigm_x = 1 / denominator
        return self.sigm_x

    def backward(self, grad_output):
        """
        Propagate a gradient through the last forward call.
        :param grad_output: Gradient tensor of the output.
        :return: Gradient tensor for the input,
                 grad_output * sigmoid(x) * (1 - sigmoid(x)).
        """
        local_derivative = self.sigm_x * (1 - self.sigm_x)
        return grad_output * local_derivative

# Example usage
if __name__ == "__main__":
    # Compare the custom Sigmoid (forward and hand-written backward) against
    # torch.sigmoid with autograd.
    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Initialize the custom Sigmoid activation function.
    custom_sigmoid = Sigmoid()

    # Compute the activation using the custom sigmoid class.
    activated_x_custom = custom_sigmoid(x)

    # Run the hand-written backward pass with an upstream gradient of ones,
    # the same upstream gradient that is fed to PyTorch's autograd below.
    gradients_custom = custom_sigmoid.backward(torch.ones_like(activated_x_custom))

    # Print the outputs and gradients from the custom implementation.
    print("Custom Sigmoid output:", activated_x_custom)
    print("Custom Sigmoid gradients:", gradients_custom)

    # Clear any gradient stored on x so the autograd result below is not
    # accumulated onto it.
    x.grad = None

    # Compute the activation using PyTorch's built-in sigmoid function.
    activated_x_torch = torch.sigmoid(x)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(torch.ones_like(x))
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch sigmoid output:", activated_x_torch)
    print("PyTorch sigmoid gradients:", gradients_torch)

Custom Sigmoid output: tensor([0.2689, 0.5000, 0.7311, 0.8808], grad_fn=<MulBackward0>)
Custom Sigmoid gradients: tensor([0.1966, 0.2500, 0.1966, 0.1050], grad_fn=<MulBackward0>)
PyTorch sigmoid output: tensor([0.2689, 0.5000, 0.7311, 0.8808], grad_fn=<SigmoidBackward0>)
PyTorch sigmoid gradients: tensor([0.1966, 0.2500, 0.1966, 0.1050])


In [7]:
class Tanh(nn.Module):
    """
    Implement the Tanh activation function with a hand-written backward pass.
    """
    # For |x| >= 20, tanh(x) = 1 - 2*exp(-2|x|) + ... differs from +/-1 by
    # less than 1e-17, which rounds to exactly +/-1 in float32 and float64.
    # Clamping the input to this range therefore leaves the result unchanged
    # while keeping exp() far away from overflow.
    # NOTE(review): exp(20) still overflows float16; half precision would need
    # a smaller bound.
    _SATURATION = 20.0

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """
        Forward pass for Tanh.
        :param x: Input tensor.
        :return: Output tensor where Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)).
        """
        # Without the clamp, exp(x) overflows to inf for large |x| (around 88
        # in float32) and the quotient below becomes inf / inf = nan instead
        # of saturating at +/-1.
        x_safe = torch.clamp(x, min=-self._SATURATION, max=self._SATURATION)
        exp_pos = torch.exp(x_safe)
        exp_neg = torch.exp(-x_safe)
        tanh_x = (exp_pos - exp_neg) / (exp_pos + exp_neg)
        # Cached because backward() expresses the derivative in terms of the
        # forward output.
        self.tanh_x = tanh_x
        return tanh_x

    def backward(self, grad_output):
        """
        Backward pass for custom Tanh.
        :param grad_output: Gradient tensor of the output.
        :return: Gradient tensor for the input,
                 grad_output * (1 - tanh(x)^2).
        """
        d_x = grad_output * (1 - (self.tanh_x)**2)
        return d_x

# Example usage
if __name__ == "__main__":
    # Compare the custom Tanh (forward and hand-written backward) against
    # torch.tanh with autograd.
    tanh = Tanh()

    # Define a sample input tensor.
    x = torch.tensor([-1.0, 0.0, 1.0, 2.0], requires_grad=True)

    # Compute the activation using the custom Tanh class.
    activated_x_custom = tanh(x)

    # Upstream gradient of ones, same shape as the activation, so the result
    # of backward() is the element-wise derivative itself.
    grad_output_custom = torch.ones_like(activated_x_custom)

    # Run the hand-written backward pass of the custom implementation.
    gradients_custom = tanh.backward(grad_output_custom)

    # Print the outputs and gradients from the custom implementation.
    print("Custom Tanh output:", activated_x_custom)
    print("Custom Tanh gradients:", gradients_custom)

    # Clear any gradient stored on x so the autograd result below is not
    # accumulated onto it.
    x.grad = None

    # Compute the activation using PyTorch's built-in tanh function.
    activated_x_torch = torch.tanh(x)

    # Create a tensor with the same shape as activated_x_torch filled with ones.
    grad_output_torch = torch.ones_like(activated_x_torch)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    activated_x_torch.backward(grad_output_torch)
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch Tanh output:", activated_x_torch)
    print("PyTorch Tanh gradients:", gradients_torch)

Custom Tanh output: tensor([-0.7616,  0.0000,  0.7616,  0.9640], grad_fn=<DivBackward0>)
Custom Tanh gradients: tensor([0.4200, 1.0000, 0.4200, 0.0707], grad_fn=<MulBackward0>)
PyTorch Tanh output: tensor([-0.7616,  0.0000,  0.7616,  0.9640], grad_fn=<TanhBackward0>)
PyTorch Tanh gradients: tensor([0.4200, 1.0000, 0.4200, 0.0707])


In [8]:
class Softmax(nn.Module):
    """
    Softmax activation; the backward pass is left to PyTorch's autograd.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x, dim=1):
        """
        Normalise exponentials of the input along one dimension.
        :param x: Input tensor.
        :param dim: The dimension Softmax would be applied to.
        :return: Output tensor after applying Softmax.
        """
        # Subtracting the maximum along `dim` makes the largest exponent
        # exp(0) = 1, which avoids overflow and leaves the result unchanged.
        shifted = x - x.max(dim=dim, keepdim=True).values
        numerators = shifted.exp()
        return numerators / numerators.sum(dim=dim, keepdim=True)


# Example usage
if __name__ == "__main__":
    # Compare the custom Softmax forward pass against torch.softmax, and show
    # the autograd gradient of PyTorch's implementation.
    softmax = Softmax()

    # Example input tensor
    x = torch.tensor([[4.0, 3.0, 5.0],[3.0, 4.0, 7.0],[6.0, 8.0, 2.0]], requires_grad=True)

    # Forward pass (the custom forward uses its default dim=1).
    activated_x_custom = softmax(x)

    # Print the output of the custom implementation. No custom gradient is
    # computed or printed in this cell.
    print("Custom Softmax output:", activated_x_custom)

    # Clear any gradient stored on x so the autograd result below is not
    # accumulated onto it.
    x.grad = None

    # Compute the activation using PyTorch's built-in softmax function.
    activated_x_torch = torch.softmax(x, dim=1)

    # Create a tensor with the same shape as activated_x_torch filled with ones.
    grad_output_torch = torch.ones_like(activated_x_torch)

    # Perform a backward pass to compute gradients for PyTorch's implementation.
    # With an all-ones upstream gradient this differentiates the sum of each
    # softmax row, which is constant (1), so the resulting gradient is zero.
    activated_x_torch.backward(grad_output_torch)
    gradients_torch = x.grad

    # Print the outputs and gradients from PyTorch's implementation.
    print("PyTorch Softmax output:", activated_x_torch)
    print("PyTorch Softmax gradients:", gradients_torch)

Custom Softmax output: tensor([[0.2447, 0.0900, 0.6652],
        [0.0171, 0.0466, 0.9362],
        [0.1189, 0.8789, 0.0022]], grad_fn=<DivBackward0>)
PyTorch Softmax output: tensor([[0.2447, 0.0900, 0.6652],
        [0.0171, 0.0466, 0.9362],
        [0.1189, 0.8789, 0.0022]], grad_fn=<SoftmaxBackward0>)
PyTorch Softmax gradients: tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


# **Task 3**: Deriving and Understanding the Sigmoid Function

The sigmoid function is a widely used activation function in the field of machine learning, especially in logistic regression and neural networks. It maps any real-valued number into the range between 0 and 1.

1. Given the sigmoid function defined as $\sigma(x) = \frac{1}{1 + e^{-x}}$, **compute the derivative** $\frac{d\sigma(x)}{dx}$ **with respect to $x$**.

2. A special property of the sigmoid function is that its derivative can be expressed in terms of the sigmoid function itself. If we denote $y = \sigma(x)$, **show how the derivative you've computed can be re-written in terms of $y$**, where $y$ is the output of the sigmoid function.

   *Hint: Your answer should only depend on $y$.*


$$
\frac{d\sigma(x)}{dx} = \frac{d}{dx}\left(\frac{1}{1 + e^{-x}}\right)
$$

$$
\frac{d\sigma(x)}{dx} = \frac{0 - (-e^{-x})}{(1 + e^{-x})^2}
$$

$$
\frac{d\sigma(x)}{dx} = \frac{e^{-x}}{(1 + e^{-x})^2}
$$

$$
y = \sigma(x)
$$

$$
e^{-x} = \frac{1}{y} - 1
$$

$$
\frac{d\sigma(x)}{dx} = \frac{\frac{1}{y} - 1}{(1 + \frac{1}{y} - 1)^2}
$$

$$
\frac{d\sigma(x)}{dx} = \frac{\frac{1}{y} - 1}{\left(\frac{1}{y}\right)^2} = \frac{1 - y}{y} \cdot y^2 = y(1 - y)
$$

# **Task 4**: Connecting Sigmoid and Softmax Functions

The sigmoid and softmax functions are foundational to machine learning, particularly in classification tasks. While the sigmoid function is traditionally used for binary classification, the softmax function generalizes this concept to multi-class problems. The sigmoid function can be seen as a special case of the softmax function when the output space consists of two classes.

Consider a binary classification problem and the general form of the softmax function for an arbitrary vector $\mathbf{z}$ with components $z_i$ for $i = 1, \ldots, K$ classes. The softmax function is defined as:

$$
\text{softmax}(\mathbf{z})_i = \frac{e^{z_i}}{\sum_{j=1}^K e^{z_j}}
$$

Your task is to demonstrate that the softmax function simplifies to the sigmoid function in the context of binary classification.

1. **Express the Softmax Function for Two Classes:**
   Show the softmax function for a two-class system and define the components of the vector $\mathbf{z} $ as arbitrary logits without specifying any particular values.

2. **Derive the Sigmoid Function from Softmax:**
   Simplify the expression for the probability of the first class and show how it is equivalent to the sigmoid function for an arbitrary logit.

_Hint: Consider the nature of binary classification and how the probabilities must sum to one._


$$
K = 2
$$

$$
\text{softmax}(\mathbf{z})_1 = \frac{e^{z_1}}{e^{z_1} + e^{z_2}}
$$

$$
\text{softmax}(\mathbf{z})_2 = \frac{e^{z_2}}{e^{z_1} + e^{z_2}}
$$

$\mathbf{z_1}$ and $\mathbf{z_2}$ are logits for the two classes.

$$
\text{softmax}(\mathbf{z})_1 = \frac{e^{z_1}}{e^{z_1} + e^{z_2}}
$$

$$
\text{softmax}(\mathbf{z})_1 = \frac{e^{z_1}}{e^{z_1} + e^{z_2}} * \frac{e^{-z_1}}{e^{-z_1}}
$$

$$
\text{softmax}(\mathbf{z})_1 = \frac{1}{1 + e^{z_2 - z_1}}
$$

$$
x = z_1 - z_2
$$

$$
\text{softmax}(\mathbf{z})_1 = \frac{1}{1 + e^{-x}}
$$


# **Task 5:** Understanding Logits and Log Odds

In logistic regression and neural networks, the concepts of logits and log odds play a central role in modeling probabilities.

- **Logits:** The logit function is the inverse of the sigmoid function. It takes a probability value and maps it to the entire real number line, which can be interpreted as the log odds.

- **Log Odds:** This is the logarithm of the odds. For a probability $p$, the odds are $\frac{p}{1-p}$, and the log odds, or logit, is the natural logarithm of these odds: $\text{logit}(p) = \log\left(\frac{p}{1-p}\right)$. In logistic regression, we predict log odds with the linear combination of features, and then convert these predictions into probabilities using the sigmoid function.



**The Sigmoid Inverse:** The inverse of the sigmoid function, denoted as $(\sigma^{-1})$, is the logit function. Given the sigmoid function defined as:
$$
\sigma(x) = \frac{1}{1 + e^{-x}}
$$
**Derive its inverse, $(\sigma^{-1}(y))$,** which takes a probability and gives the corresponding log odds.

Hint: To find the inverse, set $y = \sigma(x)$, and solve for $x$ in terms of $y$. The result will give you the logit function.



$$
y = \frac{1}{1 + e^{-x}}
$$

$$
1 + e^{-x} = \frac{1}{y}
$$

$$
e^{-x} = \frac{1}{y} - 1
$$

$$
x = -\log\left(\frac{1}{y} - 1\right)
$$

$$
\sigma^{-1}(y) = -\log\left(\frac{1}{y} - 1\right) 
$$

$$
= -\log(\frac{1-y}{y}) 
$$

$$
= \log(\frac{y}{1-y})
$$

# Task 6: Understanding Backpropagation and the Chain Rule

Background

Backpropagation is an algorithm commonly used for training neural networks. It leverages the chain rule to calculate the gradient of the loss function with respect to each weight in the network. This gradient tells us how much the loss will change for a small change in the weights, and it's used to update the weights to minimize the loss.

The chain rule is a fundamental principle in calculus that is used to find the derivative of composite functions. If we have functions nested within each other, the chain rule allows us to take the derivative of the entire expression by multiplying the derivatives of the constituent functions.

1. **Chain Rule for Simple Composition**

   Given a function composed as $f(u(x))$, where $u$ is a function of $x$, use the chain rule to find the derivative of $f$ with respect to $x$.

   **Example Function:**

   Let $f(u) = e^u$ and $u(x) = 2x + 3$. Compute $\frac{df}{dx}$.

   **Solution:**

   First, find $\frac{du}{dx}$ where $u(x) = 2x + 3$. The derivative is $\frac{du}{dx} = 2$.

   Then, compute $\frac{df}{du}$ for $f(u) = e^u$. The derivative is $\frac{df}{du} = e^u$.

   Multiply $\frac{du}{dx}$ by $\frac{df}{du}$ to get $\frac{df}{dx} = 2e^{(2x+3)}$.

2. **Chain Rule for Nested Composition**

   For a nested function $f(g(u(x)))$, where $g$ is a function of $u(x)$, and $u$ is a function of $x$, apply the chain rule to compute the derivative of $f$ with respect to $x$.

   **Example Function:**

   Let $f(g) = \sin(g)$, $g(u) = u^2$, and $u(x) = 3x - 5$. Find $\frac{df}{dx}$.

   **Solution:**

   Start by finding $\frac{du}{dx}$ for $u(x) = 3x - 5$. The derivative is $\frac{du}{dx} = 3$.

   Next, find $\frac{dg}{du}$ for $g(u) = u^2$. The derivative is $\frac{dg}{du} = 2u$.

   Then, compute $\frac{df}{dg}$ for $f(g) = \sin(g)$. The derivative is $\frac{df}{dg} = \cos(g)$.

   By the chain rule, $\frac{df}{dx} = \frac{df}{dg} \cdot \frac{dg}{du} \cdot \frac{du}{dx} = \cos(u^2) \cdot 2u \cdot 3$.

   Substituting $u(x)$ into the derivative, we get $\frac{df}{dx} = \cos((3x - 5)^2) \cdot 2(3x - 5) \cdot 3$.


**Task:** Consider a neural network with a single neuron that takes two inputs $x_1$ and $x_2$, with weights $w_1$ and $w_2$ respectively, and a bias $b$. The output of the neuron is passed through a hyperbolic tangent activation function:

$$
f(x) = \tanh(w_1x_1 + w_2x_2 + b)
$$

The hyperbolic tangent function, $\tanh(x)$, is defined as:

$$
\tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
$$

**Compute the partial derivatives of the function $f(x)$ with respect to $w_1$.**

### Compute the Partial Derivative of $f(x)$ with respect to $w_1$

$$
u = w_1x_1 + w_2x_2 + b
$$

$$
v = \tanh(u)
$$

$$
\frac{\partial f}{\partial w_1} = \frac{\partial v}{\partial u} \cdot \frac{\partial u}{\partial w_1}
$$

### 1. Differentiate $v$ wrt $u$

$$
\tanh(u) = \frac{e^{u} - e^{-u}}{e^{u} + e^{-u}}
$$

$$
= \frac{(e^{u} - e^{-u})*e^{u}}{(e^{u} + e^{-u})*e^{u}} 
$$

$$
= \frac{e^{2u} - 1}{e^{2u} + 1}
$$

$$
\frac{d\tanh(u)}{du} = \frac{(2e^{2u})(e^{2u} + 1) - (e^{2u} - 1)(2e^{2u})}{(e^{2u} + 1)^2}
$$

$$
= \frac{4e^{2u}}{(e^{2u} + 1)^2}
$$

### 2. Differentiate $u$ wrt $w_1$

$$
\frac{\partial u}{\partial w_1} = x_1
$$

### 3. Apply the Chain Rule

$$
\frac{\partial f}{\partial w_1} = (\frac{4e^{2u}}{(e^{2u} + 1)^2}) \cdot x_1
$$

$$
= \frac{4e^{2u}*x_1}{(e^{2u} + 1)^2} 
$$

$$
= \frac{4e^{2*(w_1x_1 + w_2x_2 + b)}*x_1}{(e^{2*(w_1x_1 + w_2x_2 + b)} + 1)^2}
$$


# **Task 7 (Optional):** Implementing Custom Optimizers in PyTorch

In this task, you will delve into the mechanics of optimization algorithms in deep learning by creating custom optimizer classes in PyTorch. Optimizers are the engines that power the learning process, updating model weights based on gradients to minimize loss functions. You will start by understanding the foundational principles of the Gradient Descent optimizer. Following this, you will **implement** custom versions of more advanced optimizers such as Stochastic Gradient Descent (SGD), Momentum, and Adam, and **compare** their performance with PyTorch's built-in optimizers.

In [None]:
class GradientDescentOptimizer:
    """
    Vanilla gradient descent: each step moves every parameter against its
    gradient, scaled by a fixed learning rate.
    """
    def __init__(self, parameters, learning_rate):
        """
        Store the parameters to optimize and the step size.

        Args:
            parameters (iterable): Parameters to optimize. The iterable is
                materialized into a list so it can be traversed on every call.
            learning_rate (float): Factor applied to each gradient before it is
                subtracted from its parameter.
        """
        self.parameters = list(parameters)
        self.learning_rate = learning_rate

    def step(self):
        """
        Apply one update, ``param <- param - learning_rate * grad``, to every
        parameter that currently has a gradient.

        :return: None
        """
        ########################################################################
        # TODO: Implement the gradient descent update step.                    #
        ########################################################################
        lr = self.learning_rate
        # The update itself must not be tracked by autograd.
        with torch.no_grad():
            for tensor in self.parameters:
                grad = tensor.grad
                if grad is None:
                    # No gradient has been populated for this parameter.
                    continue
                tensor.data.sub_(lr * grad)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Reset, in place, every gradient that exists; parameters whose gradient
        is ``None`` are left as they are.
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for tensor in self.parameters:
            grad = tensor.grad
            if grad is None:
                continue
            grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################


# Example usage
if __name__ == "__main__":
    # Toy data: two samples whose targets are twice the inputs.
    input_data = torch.tensor([[1.0], [2.0]], requires_grad=True)
    target_data = torch.tensor([[2.0], [4.0]])

    # One-input / one-output linear model, scored with mean squared error.
    model = torch.nn.Linear(1, 1)
    loss_fn = torch.nn.MSELoss()

    # Hand the model's parameters to the custom optimizer.
    optimizer = GradientDescentOptimizer(model.parameters(), learning_rate=0.01)

    # Start from clean gradients, then run the forward pass and score it.
    optimizer.zero_grad()
    predicted_y = model(input_data)
    loss = loss_fn(predicted_y, target_data)

    # Backpropagate and apply a single gradient-descent update.
    loss.backward()
    optimizer.step()

    print(f"Updated model weights: {model.weight.data}")

Updated model weights: tensor([[-0.4420]])


In [None]:
class SGDOptimizer:
    """
    Custom implementation of the stochastic gradient descent optimization algorithm.

    The update rule is the plain gradient step ``param <- param - lr * grad``.
    It is "stochastic" when the caller computes the gradients on a mini-batch
    (or a single sample) before each call to ``step``.

    Attributes:
        parameters (list): Parameters being optimized.
        learning_rate (float): Learning rate for the optimizer.
    """
    def __init__(self, parameters, learning_rate):
        """
        Initializes the SGDOptimizer.

        Args:
            parameters (iterable): Iterable of parameters (tensors) to optimize.
            learning_rate (float): Learning rate for the optimizer.
        """
        ########################################################################
        # TODO: Initialize parameters and learning rate.                       #
        ########################################################################
        # Materialize the iterable: ``model.parameters()`` is a generator and
        # would be exhausted after the first pass over it.
        self.parameters = list(parameters)
        self.learning_rate = learning_rate
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def step(self):
        """
        Perform a single optimization step using SGD.

        Parameters whose ``grad`` is ``None`` are skipped.

        :return: None
        """
        ########################################################################
        # TODO: Implement the SGD update step.                                 #
        ########################################################################
        # The update must not be recorded by autograd.
        with torch.no_grad():
            for param in self.parameters:
                if param.grad is None:
                    continue
                param.sub_(param.grad, alpha=self.learning_rate)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Clear gradients of all optimized parameters.

        Existing gradients are zeroed in place; parameters without a gradient
        are left untouched.
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for param in self.parameters:
            if param.grad is not None:
                param.grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

# Example usage
if __name__ == "__main__":
    # Define a simple model and a sample loss function
    model = torch.nn.Linear(1, 1)
    loss_fn = torch.nn.MSELoss()

    # Define sample input and target data
    input_data = torch.tensor([[1.0], [2.0]])
    target_data = torch.tensor([[2.0], [4.0]])

    # Initialize the custom optimizer
    optimizer = SGDOptimizer(model.parameters(), learning_rate=0.01)

    # Stochastic updates: one optimization step per individual sample
    # instead of one step on the full batch.
    for sample, target in zip(input_data, target_data):
        optimizer.zero_grad()
        loss = loss_fn(model(sample), target)
        loss.backward()
        optimizer.step()

    print(f"Updated model weights: {model.weight.data}")

In [None]:
class MomentumOptimizer:
    """
    Custom implementation of the stochastic gradient descent optimization algorithm with momentum.

    Each parameter keeps a velocity buffer ``v`` and is updated as::

        v     <- momentum * v + grad
        param <- param - learning_rate * v

    This is the formulation used by ``torch.optim.SGD`` with ``dampening=0``.
    """
    def __init__(self, parameters, learning_rate, momentum=0.9):
        """
        Initializes the MomentumOptimizer.

        Args:
            parameters (iterable): Iterable of parameters (tensors) to optimize.
            learning_rate (float): Learning rate for the optimizer.
            momentum (float): Momentum factor (default: 0.9).
        """
        ########################################################################
        # TODO: Initialize parameters, learning rate, and momentum.            #
        ########################################################################
        # Materialize the iterable: ``model.parameters()`` is a generator and
        # would be exhausted after the first pass over it.
        self.parameters = list(parameters)
        self.learning_rate = learning_rate
        self.momentum = momentum
        # One velocity buffer per parameter. Starting at zero makes the first
        # update a plain gradient step.
        self.velocities = [torch.zeros_like(param) for param in self.parameters]
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def step(self):
        """
        Perform a single optimization step using SGD with momentum.

        Parameters whose ``grad`` is ``None`` are skipped and their velocity is
        left unchanged.

        :return: None
        """
        ########################################################################
        # TODO: Implement the momentum update step.                            #
        ########################################################################
        # The update must not be recorded by autograd.
        with torch.no_grad():
            for param, velocity in zip(self.parameters, self.velocities):
                if param.grad is None:
                    continue
                # v <- momentum * v + grad (updated in place so the buffer persists).
                velocity.mul_(self.momentum).add_(param.grad)
                param.sub_(velocity, alpha=self.learning_rate)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Clear gradients of all optimized parameters.

        Existing gradients are zeroed in place; velocity buffers are kept.
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for param in self.parameters:
            if param.grad is not None:
                param.grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################


# Example usage
if __name__ == "__main__":
    # Define a simple model and a sample loss function
    model = torch.nn.Linear(1, 1)
    loss_fn = torch.nn.MSELoss()

    # Define sample input and target data
    input_data = torch.tensor([[1.0], [2.0]])
    target_data = torch.tensor([[2.0], [4.0]])

    # Initialize the custom optimizer
    optimizer = MomentumOptimizer(model.parameters(), learning_rate=0.01, momentum=0.9)

    # Take several steps so the accumulated velocity actually affects the update.
    for _ in range(3):
        optimizer.zero_grad()
        loss = loss_fn(model(input_data), target_data)
        loss.backward()
        optimizer.step()

    print(f"Updated model weights: {model.weight.data}")

In [None]:
class AdamOptimizer:
    """
    Custom implementation of the Adam optimization algorithm.

    For every parameter the optimizer tracks exponential moving averages of the
    gradient (``m``) and of the squared gradient (``v``) and applies::

        m     <- beta1 * m + (1 - beta1) * grad
        v     <- beta2 * v + (1 - beta2) * grad^2
        m_hat  = m / (1 - beta1^t)
        v_hat  = v / (1 - beta2^t)
        param <- param - learning_rate * m_hat / (sqrt(v_hat) + eps)

    where ``t`` counts the updates applied to that parameter.
    """
    def __init__(self, parameters, learning_rate=0.001, betas=(0.9, 0.999), eps=1e-8):
        """
        Initializes the AdamOptimizer.

        Args:
            parameters (iterable): Iterable of parameters (tensors) to optimize.
            learning_rate (float): Learning rate for the optimizer.
            betas (Tuple[float, float]): Coefficients used for computing running averages of gradient and its square.
            eps (float): Term added to the denominator to improve numerical stability.
        """
        ########################################################################
        # TODO: Initialize parameters, learning rate, betas, and eps.          #
        ########################################################################
        # Materialize the iterable: ``model.parameters()`` is a generator and
        # would be exhausted after the first pass over it.
        self.parameters = list(parameters)
        self.learning_rate = learning_rate
        self.betas = betas
        self.beta1, self.beta2 = betas
        self.eps = eps
        # Per-parameter state: first/second moment estimates and update count.
        self.first_moments = [torch.zeros_like(param) for param in self.parameters]
        self.second_moments = [torch.zeros_like(param) for param in self.parameters]
        self.step_counts = [0] * len(self.parameters)
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def step(self):
        """
        Perform a single optimization step using Adam.

        Parameters whose ``grad`` is ``None`` are skipped; their moment
        estimates and step count are left unchanged.

        :return: None
        """
        ########################################################################
        # TODO: Implement the Adam update step.                                #
        ########################################################################
        # The update must not be recorded by autograd.
        with torch.no_grad():
            for index, param in enumerate(self.parameters):
                grad = param.grad
                if grad is None:
                    continue

                self.step_counts[index] += 1
                t = self.step_counts[index]
                m = self.first_moments[index]
                v = self.second_moments[index]

                # Update the running averages in place so the state persists.
                m.mul_(self.beta1).add_(grad, alpha=1 - self.beta1)
                v.mul_(self.beta2).addcmul_(grad, grad, value=1 - self.beta2)

                # Bias correction: both averages start at zero and are
                # therefore biased toward zero during the first steps.
                m_hat = m / (1 - self.beta1 ** t)
                v_hat = v / (1 - self.beta2 ** t)

                param.sub_(self.learning_rate * m_hat / (v_hat.sqrt() + self.eps))
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

    def zero_grad(self):
        """
        Clear gradients of all optimized parameters.

        Existing gradients are zeroed in place; the moment estimates are kept.
        """
        ########################################################################
        # TODO: Clear gradients of all parameters.                             #
        ########################################################################
        for param in self.parameters:
            if param.grad is not None:
                param.grad.zero_()
        ########################################################################
        #                           END OF YOUR CODE                           #
        ########################################################################

# Example usage
if __name__ == "__main__":
    # TODO: Repeat the process above for the Adam optimizer