In [1]:
import torch

# Problem dimensions shared by the model definition and the synthetic batch.
BATCH_SIZE = 16     # number of samples in the synthetic batch
DIM_IN = 1000       # input features per sample
HIDDEN_SIZE = 100   # width of the hidden layer
DIM_OUT = 10        # output features per sample


class TinyModel(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        dim_in: Number of input features fed to the first linear layer.
        hidden_size: Number of units produced by the first linear layer.
        dim_out: Number of output features produced by the second linear layer.

    The defaults come from the module-level constants, so ``TinyModel()``
    builds a 1000 -> 100 -> 10 network.
    """

    def __init__(self, dim_in=DIM_IN, hidden_size=HIDDEN_SIZE, dim_out=DIM_OUT):
        # Zero-argument super() keeps working if this cell is re-run and the
        # class object is rebound while older instances are still alive.
        super().__init__()

        self.layer1 = torch.nn.Linear(dim_in, hidden_size)
        self.relu = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(hidden_size, dim_out)

    def forward(self, x):
        """Apply layer1, ReLU and layer2 in sequence and return the result."""
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)

        return x
    

# Synthetic batch: random inputs and random targets, neither tracked by autograd.
some_input = torch.randn((BATCH_SIZE, DIM_IN), requires_grad=False)
ideal_output = torch.randn((BATCH_SIZE, DIM_OUT), requires_grad=False)

# Built after the data so the random draws happen in the same order as before.
model = TinyModel()

In [20]:
# Inspect the layers. `model.parameters` is passed uncalled, so the last item
# printed is the bound method's repr, which embeds the module summary.
print(
    model.layer1.weight.shape,
    model.relu,
    model.layer2.weight.shape,
    model.parameters,
    sep="\n",
)

torch.Size([100, 1000])
ReLU()
torch.Size([10, 100])
<bound method Module.parameters of TinyModel(
  (layer1): Linear(in_features=1000, out_features=100, bias=True)
  (relu): ReLU()
  (layer2): Linear(in_features=100, out_features=10, bias=True)
)>


In [24]:
# Plain SGD over all of the model's parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

# Forward pass on the synthetic batch.
prediction = model(some_input)

# Summed squared error between the targets and the prediction.
loss = torch.sum((ideal_output - prediction) ** 2)

print(loss)

tensor(173.0572, grad_fn=<SumBackward0>)


In [25]:
# Backpropagate from the scalar loss; the next cell reads the resulting
# .grad on layer2's weight.
loss.backward()

In [32]:
# The gradient has the same shape as the weight it belongs to.
print(model.layer2.weight.shape, model.layer2.weight.grad.shape, sep="\n")

torch.Size([10, 100])
torch.Size([10, 100])


In [33]:
# Apply one SGD update (lr=0.001, as configured when the optimizer was built)
# to the model's parameters using the gradients currently stored on them.
optimizer.step()

In [34]:
# Show layer2's weight parameter.
# NOTE(review): this prints the whole 10x100 parameter; a slice such as
# model.layer2.weight[0][0:10] would keep the cell output readable.
print(model.layer2.weight)

Parameter containing:
tensor([[ 6.7683e-02, -3.4487e-02, -1.7559e-02, -4.2119e-02,  9.5270e-02,
         -6.9665e-03,  5.0232e-02,  1.7724e-02, -9.4259e-02,  4.5635e-02,
         -9.2800e-02, -4.1156e-02, -8.5544e-02,  8.7827e-03,  2.9855e-03,
         -3.3205e-02,  7.9451e-02, -2.6458e-02, -9.0868e-02, -1.0713e-02,
          8.9444e-02,  1.6475e-02,  7.5590e-02, -4.4829e-02, -5.6091e-02,
          2.3668e-02,  7.3946e-02, -6.4994e-02,  2.3310e-02, -1.3465e-02,
         -6.8868e-02, -8.2800e-03, -3.6021e-02,  9.1975e-02,  1.7672e-03,
          1.0087e-01, -2.2389e-02,  1.4291e-03,  2.2581e-02, -1.7147e-03,
         -6.1592e-02,  4.2058e-02,  1.4491e-02,  1.2379e-02,  7.0589e-02,
         -3.5654e-02,  3.1118e-03, -3.3720e-02, -5.0302e-02, -8.2787e-03,
         -2.5910e-02, -5.6826e-02,  9.6518e-02,  5.5030e-03, -6.9200e-02,
          4.5546e-03,  5.1975e-02, -4.6875e-02, -2.4766e-02,  8.6339e-02,
          7.0457e-04,  7.5280e-03,  6.4729e-02, -4.1605e-03,  3.9209e-02,
         -7.8789

In [36]:
# Run several forward/backward passes without clearing gradients in between:
# each backward() adds into .grad, on top of whatever was already stored there.
for i in range(5):
    prediction = model(some_input)
    loss = torch.sum((ideal_output - prediction) ** 2)
    loss.backward()

# First ten gradient entries of layer2's first row, before and after resetting.
print(model.layer2.weight.grad[0, :10])
# set_to_none=False keeps the .grad tensors and fills them with zeros.
optimizer.zero_grad(set_to_none=False)
print(model.layer2.weight.grad[0, :10])

tensor([ -1.6542,   3.2534,   2.6234,   8.3513,   2.3771,  -1.4415, -12.7955,
          1.5535,   4.8971, -35.5633])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
