<a href="https://colab.research.google.com/github/trian-gles/nyu-deep-learning-hw/blob/main/hw_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch


In [None]:
def _relu(t):
    """Return the element-wise maximum of ``t`` and zero."""
    # zeros_like keeps the comparison tensor on t's device and in t's dtype.
    return torch.fmax(t, torch.zeros_like(t))


def _relu_grad(t):
    """Return the ReLU derivative at ``t``: 1 where ``t > 0``, else 0."""
    return (t > 0).to(t.dtype)


def _sigmoid(t):
    """Return the logistic sigmoid ``1 / (1 + exp(-t))`` element-wise."""
    return (1 + torch.exp(-t)).pow(-1)


def _sigmoid_grad(t):
    """Return the sigmoid derivative ``s(t) * (1 - s(t))`` element-wise."""
    s = _sigmoid(t)  # evaluate the sigmoid once and reuse it
    return s * (1 - s)


class MLP:
    """Two-layer perceptron with hand-written forward and backward passes.

    The network computes ``g(f(x @ W1.T + b1) @ W2.T + b2)`` where ``f`` and
    ``g`` are selected by name from the activation tables below.  Gradients
    are produced by ``backward`` without using autograd and stored in
    ``self.grads``.
    """

    # Activation name -> forward function (tensor in, tensor out).
    _nonlinears_fwd = {
        'relu': _relu,
        'sigmoid': _sigmoid,
        'identity': lambda t: t,
    }

    # Activation name -> element-wise derivative evaluated at the
    # pre-activation input.
    _nonlinears_back = {
        'relu': _relu_grad,
        'sigmoid': _sigmoid_grad,
        'identity': torch.ones_like,
    }

    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            f_function: string for the f function: relu | sigmoid | identity
            g_function: string for the g function: relu | sigmoid | identity
        """
        self.f_function = f_function
        self.g_function = g_function

        # Weights are stored as (out_features, in_features) and drawn from a
        # standard normal distribution.
        self.parameters = dict(
            W1=torch.randn(linear_1_out_features, linear_1_in_features),
            b1=torch.randn(linear_1_out_features),
            W2=torch.randn(linear_2_out_features, linear_2_in_features),
            b2=torch.randn(linear_2_out_features),
        )
        # One gradient slot per parameter, with the parameter's shape.
        self.grads = dict(
            dJdW1=torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1=torch.zeros(linear_1_out_features),
            dJdW2=torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2=torch.zeros(linear_2_out_features),
        )

        # Intermediate values saved by forward() for use in backward().
        self.cache = dict()

    def forward(self, x):
        """Run the network on a batch and cache what backward() needs.

        Args:
            x: tensor shape (batch_size, linear_1_in_features)

        Returns:
            Tensor of shape (batch_size, linear_2_out_features).
        """
        W1, b1 = self.parameters["W1"], self.parameters["b1"]
        W2, b2 = self.parameters["W2"], self.parameters["b2"]

        self.cache['lin_1_in'] = x
        # [batch x lin_1_in] @ [lin_1_in x lin_1_out] + [lin_1_out]
        #   = [batch x lin_1_out]
        lin_1_out = torch.matmul(x, torch.t(W1)) + b1
        self.cache['lin_1_out'] = lin_1_out
        f_out = self._nonlinears_fwd[self.f_function](lin_1_out)

        self.cache['lin_2_in'] = f_out
        # [batch x lin_1_out] @ [lin_1_out x lin_2_out] + [lin_2_out]
        #   = [batch x lin_2_out]
        lin_2_out = torch.matmul(f_out, torch.t(W2)) + b2
        self.cache['lin_2_out'] = lin_2_out

        return self._nonlinears_fwd[self.g_function](lin_2_out)

    def backward(self, dJdy_hat):
        """Back-propagate a loss gradient and fill ``self.grads``.

        Must be called after forward(), which populates ``self.cache``.

        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
        """
        # Gradient at the input of g: [batch x lin_2_out].
        dJdgin = dJdy_hat * self._nonlinears_back[self.g_function](self.cache['lin_2_out'])

        # [lin_2_out x batch] @ [batch x lin_2_in] = [lin_2_out x lin_2_in]
        self.grads['dJdW2'] = torch.matmul(torch.t(dJdgin), self.cache['lin_2_in'])
        # Sum over the batch dimension: [lin_2_out].  A plain sum keeps the
        # dtype and device of the incoming gradient.
        self.grads['dJdb2'] = dJdgin.sum(dim=0)

        # [batch x lin_2_out] @ [lin_2_out x lin_2_in] = [batch x lin_2_in]
        dJdfout = torch.matmul(dJdgin, self.parameters['W2'])

        # Gradient at the input of f: [batch x lin_1_out].
        dJdfin = dJdfout * self._nonlinears_back[self.f_function](self.cache['lin_1_out'])

        # [lin_1_out x batch] @ [batch x lin_1_in] = [lin_1_out x lin_1_in]
        self.grads['dJdW1'] = torch.matmul(torch.t(dJdfin), self.cache['lin_1_in'])
        # Sum over the batch dimension: [lin_1_out].
        self.grads['dJdb1'] = dJdfin.sum(dim=0)

    def clear_grad_and_cache(self):
        """Zero every stored gradient in place and drop the forward cache."""
        for grad in self.grads.values():
            grad.zero_()
        self.cache = dict()



## Testing pieces

In [None]:
def compare(torch_func, my_forw, my_backw):
    """Print a hand-written activation next to its PyTorch counterpart.

    A single random 1x1 tensor is pushed through both forward functions;
    the hand-written derivative is then printed beside the gradient that
    autograd computes for the PyTorch function.

    Args:
        torch_func: reference callable whose output autograd differentiates
        my_forw: hand-written forward function
        my_backw: hand-written derivative, evaluated at the same input
    """
    sample = torch.randn(1, 1, requires_grad=True)

    manual_value = my_forw(sample)
    reference_value = torch_func(sample)
    manual_grad = my_backw(sample)

    # The input holds a single element, so the reference output is a valid
    # target for backward() without an explicit gradient argument.
    reference_value.backward()

    print(manual_value, reference_value)
    print(manual_grad, sample.grad)

In [None]:
# Check the identity activation and its derivative against torch.nn.Identity.
identity_forward = MLP._nonlinears_fwd['identity']
identity_backward = MLP._nonlinears_back['identity']
compare(torch.nn.Identity(), identity_forward, identity_backward)

tensor([[0.1367]], requires_grad=True) tensor([[0.1367]], requires_grad=True)
tensor([[1.]]) tensor([[1.]])


In [None]:
def mse_loss(y, y_hat):
    """Mean squared error and its gradient with respect to the prediction.

    The mean is taken over every element, so inputs of any shape are
    accepted (not only 2-D batches).

    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)

    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """
    diff = y_hat - y
    # Total element count; equals batch_size * features for 2-D inputs.
    n_elements = diff.numel()

    loss = torch.square(diff).sum() / n_elements
    # d/dy_hat of mean((y_hat - y)^2) = 2 * (y_hat - y) / N
    dJdy_hat = 2 * diff / n_elements

    return loss, dJdy_hat



In [None]:


# Side-by-side check of the hand-written MSE against torch.nn.MSELoss.
y_hat = torch.randn(3, 5, requires_grad=True)
y = torch.randn(3, 5)

# Hand-written result: a (loss, gradient) tuple.
manual_result = mse_loss(y, y_hat)
print(manual_result)

# Reference loss, followed by the gradient autograd stores on y_hat.
torch_mse_loss = torch.nn.MSELoss()
output = torch_mse_loss(y_hat, y)
print(output)

output.backward()
print(y_hat.grad)

(tensor(1.0971, grad_fn=<DivBackward0>), tensor([[ 0.1211,  0.2152, -0.1248,  0.0567, -0.0976],
        [-0.1469,  0.0817, -0.1453, -0.0113, -0.1360],
        [-0.2298, -0.1878,  0.0012, -0.2166,  0.0171]], grad_fn=<DivBackward0>))
tensor(1.0971, grad_fn=<MseLossBackward0>)
tensor([[ 0.1211,  0.2152, -0.1248,  0.0567, -0.0976],
        [-0.1469,  0.0817, -0.1453, -0.0113, -0.1360],
        [-0.2298, -0.1878,  0.0012, -0.2166,  0.0171]])


In [None]:
def bce_loss(y, y_hat):
    """Binary cross-entropy and its gradient with respect to the prediction.

    The mean is taken over every element, so inputs of any shape are
    accepted.  Predictions of exactly 0 or 1 no longer produce nan/inf.

    Args:
        y_hat: the prediction tensor, values in [0, 1]
        y: the label tensor, same shape as y_hat

    Return:
        loss: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """
    # Clamp the logs at -100, the bound documented for torch.nn.BCELoss, so
    # that a saturated prediction gives a finite loss instead of 0 * -inf.
    log_pos = torch.log(y_hat).clamp(min=-100.0)
    log_neg = torch.log(1 - y_hat).clamp(min=-100.0)
    loss = -(y * log_pos + (1 - y) * log_neg).mean()

    # d/dy_hat of the mean BCE = (y_hat - y) / (y_hat * (1 - y_hat) * N).
    # The small floor guards against division by zero at y_hat in {0, 1}.
    variance = (y_hat * (1 - y_hat)).clamp(min=1e-12)
    dJdy_hat = (y_hat - y) / (variance * y_hat.numel())

    return loss, dJdy_hat

In [None]:
# Side-by-side check of the hand-written BCE against torch.nn.BCELoss.
# Predictions and labels are both drawn uniformly from [0, 1).
y_hat = torch.rand(3, 5, requires_grad=True)
y = torch.rand(3, 5)

# Hand-written loss first; its gradient is printed further down.
my_loss, my_d = bce_loss(y, y_hat)
print(my_loss)

# Reference loss from PyTorch.
torch_bce_loss = torch.nn.BCELoss()
output = torch_bce_loss(y_hat, y)
print(output)

# Hand-written gradient, then the one autograd stores on y_hat.
output.backward()
print(my_d)
print(y_hat.grad)

tensor(0.9840, grad_fn=<MulBackward0>)
tensor(0.9840, grad_fn=<BinaryCrossEntropyBackward0>)
tensor([[-0.1704,  0.2386, -0.2822, -0.1741,  0.1928],
        [-0.0993, -0.0063, -0.1474,  0.0866, -0.2359],
        [-0.1852, -0.0878,  0.1056,  0.1814, -0.2266]], grad_fn=<DivBackward0>)
tensor([[-0.1704,  0.2386, -0.2822, -0.1741,  0.1928],
        [-0.0993, -0.0063, -0.1474,  0.0866, -0.2359],
        [-0.1852, -0.0878,  0.1056,  0.1814, -0.2266]])


# Test 1

In [None]:
import torch.nn as nn
import torch.nn.functional as F

from collections import OrderedDict

In [None]:
# Hand-written network: 2 -> 20 with ReLU, then 20 -> 5 with identity output.
net = MLP(2, 20, 'relu', 20, 5, 'identity')
x = torch.randn(10, 2)
y = torch.randn(10, 5)

# Manual forward and backward pass using the MSE loss.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)
net.backward(dJdy_hat)

# Reference model with the same architecture, differentiated by autograd.
net_autograd = nn.Sequential(OrderedDict(
    linear1=nn.Linear(2, 20),
    relu=nn.ReLU(),
    linear2=nn.Linear(20, 5),
))

# Each row: reference layer, parameter keys, gradient keys.
layer_table = (
    (net_autograd.linear1, 'W1', 'b1', 'dJdW1', 'dJdb1'),
    (net_autograd.linear2, 'W2', 'b2', 'dJdW2', 'dJdb2'),
)

# Give the reference model the hand-written network's parameters.
for layer, w_key, b_key, _, _ in layer_table:
    layer.weight.data = net.parameters[w_key]
    layer.bias.data = net.parameters[b_key]

y_hat_autograd = net_autograd(x)
J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

# Each line prints tensor(True) when the two gradients agree to within 1e-3.
for layer, _, _, dw_key, db_key in layer_table:
    print((layer.weight.grad.data - net.grads[dw_key]).norm() < 1e-3)
    print((layer.bias.grad.data - net.grads[db_key]).norm() < 1e-3)

tensor(True)
tensor(True)
tensor(True)
tensor(True)


# Test 2

In [None]:
# Hand-written network: 2 -> 20 with sigmoid, then 20 -> 5 with sigmoid output.
net = MLP(2, 20, 'sigmoid', 20, 5, 'sigmoid')
x = torch.randn(10, 2)
# Binary labels as floats: 1.0 where the random draw is below 0.5, else 0.0.
y = (torch.randn(10, 5) < 0.5) * 1.0

# Manual forward and backward pass using the BCE loss.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = bce_loss(y, y_hat)
net.backward(dJdy_hat)

# Reference model with the same architecture, differentiated by autograd.
net_autograd = nn.Sequential(OrderedDict(
    linear1=nn.Linear(2, 20),
    sigmoid1=nn.Sigmoid(),
    linear2=nn.Linear(20, 5),
    sigmoid2=nn.Sigmoid(),
))

# Each row: reference layer, parameter keys, gradient keys.
layer_table = (
    (net_autograd.linear1, 'W1', 'b1', 'dJdW1', 'dJdb1'),
    (net_autograd.linear2, 'W2', 'b2', 'dJdW2', 'dJdb2'),
)

# Give the reference model the hand-written network's parameters.
for layer, w_key, b_key, _, _ in layer_table:
    layer.weight.data = net.parameters[w_key]
    layer.bias.data = net.parameters[b_key]

y_hat_autograd = net_autograd(x)
bce_criterion = torch.nn.BCELoss()
J_autograd = bce_criterion(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

# Each line prints tensor(True) when the two gradients agree to within 1e-3.
for layer, _, _, dw_key, db_key in layer_table:
    print((layer.weight.grad.data - net.grads[dw_key]).norm() < 1e-3)
    print((layer.bias.grad.data - net.grads[db_key]).norm() < 1e-3)

tensor(True)
tensor(True)
tensor(True)
tensor(True)


# Test 3

In [None]:
# Hand-written network: 2 -> 20 with ReLU, then 20 -> 22 with ReLU output.
net = MLP(2, 20, 'relu', 20, 22, 'relu')
x = torch.randn(10, 2)
y = torch.randn(10, 22)

# Manual forward and backward pass using the MSE loss.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)
net.backward(dJdy_hat)

# Reference model with the same architecture, differentiated by autograd.
net_autograd = nn.Sequential(OrderedDict(
    linear1=nn.Linear(2, 20),
    relu1=nn.ReLU(),
    linear2=nn.Linear(20, 22),
    relu2=nn.ReLU(),
))

# Each row: reference layer, parameter keys, gradient keys.
layer_table = (
    (net_autograd.linear1, 'W1', 'b1', 'dJdW1', 'dJdb1'),
    (net_autograd.linear2, 'W2', 'b2', 'dJdW2', 'dJdb2'),
)

# Give the reference model the hand-written network's parameters.
for layer, w_key, b_key, _, _ in layer_table:
    layer.weight.data = net.parameters[w_key]
    layer.bias.data = net.parameters[b_key]

y_hat_autograd = net_autograd(x)
J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

# Each line prints tensor(True) when the two gradients agree to within 1e-3.
for layer, _, _, dw_key, db_key in layer_table:
    print((layer.weight.grad.data - net.grads[dw_key]).norm() < 1e-3)
    print((layer.bias.grad.data - net.grads[db_key]).norm() < 1e-3)
#------------------------------------------------

tensor(True)
tensor(True)
tensor(True)
tensor(True)
