In [1]:
import numpy as np
import torch
import torch.nn as nn

# Single batch training

In [2]:
# Shapes of the toy problem used throughout this notebook.
batch_size = 16     # number of samples in the single training batch
dim_in = 1000       # features per input sample
hidden_size = 100   # width of the hidden layer (same value Tinymodel uses)
dim_out = 10        # values per target sample

class Tinymodel(torch.nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        dim_in: Number of features in each input sample.
        hidden_size: Number of units produced by the first linear layer.
        dim_out: Number of values produced for each sample.

    The defaults (1000, 100, 10) reproduce the sizes that were previously
    hard-coded, so ``Tinymodel()`` builds the same network as before.
    """

    def __init__(self, dim_in=1000, hidden_size=100, dim_out=10):
        super().__init__()

        # Creation order matters for reproducibility: each Linear layer draws
        # its initial weights from the global RNG when it is constructed.
        self.layer1 = torch.nn.Linear(dim_in, hidden_size)
        self.relu = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(hidden_size, dim_out)

    def forward(self, x):
        """Map a ``(..., dim_in)`` tensor to a ``(..., dim_out)`` tensor."""
        hidden = self.relu(self.layer1(x))
        return self.layer2(hidden)

# Random stand-ins for a real dataset: one batch of inputs and the targets
# the model should learn to produce for them.
some_input = torch.randn((batch_size, dim_in), requires_grad=True)
ideal_output = torch.randn((batch_size, dim_out))  # requires_grad defaults to False

# Built after the data so the random draws happen in the same order as before.
model = Tinymodel()

Looking at the layers of the model, we can examine the values of the weights, but no gradients have been computed yet.

In [3]:
# First ten weights of layer2's first output row, as initialised.
print(model.layer2.weight[0][:10])
# No backward pass has run yet, so there is no gradient: this prints None.
print(model.layer2.weight.grad)

tensor([-0.0964, -0.0893, -0.0133,  0.0277, -0.0643, -0.0905,  0.0229,  0.0408,
         0.0910, -0.0329], grad_fn=<SliceBackward>)
None


# loss_func , prediction , optimizer

In [4]:
import torch.optim

In [5]:
# Plain stochastic gradient descent over every learnable parameter.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Forward pass, then the sum of squared errors against the targets.
prediction = model(some_input)
loss = torch.sum((ideal_output - prediction) ** 2)
print(loss)

tensor(166.9617, grad_fn=<SumBackward0>)


# loss.backward()

In [6]:
# Backpropagate the loss: this fills in .grad for every parameter.
loss.backward()
# The weights themselves have not moved yet; only the gradients now exist.
print(model.layer2.weight[0][:10])
print(model.layer2.weight.grad[0][:10])


tensor([-0.0964, -0.0893, -0.0133,  0.0277, -0.0643, -0.0905,  0.0229,  0.0408,
         0.0910, -0.0329], grad_fn=<SliceBackward>)
tensor([ 1.3326,  0.2995, -0.3225,  0.9774,  0.8257,  2.6286,  0.6513,  3.2136,
         0.5124, -0.8974])


The optimizer is responsible for updating the model weights based on the computed gradients.

In [8]:
# Apply one SGD update using the gradients stored by loss.backward().
optimizer.step()
# The weights have now changed...
print(model.layer2.weight[0][:10])
# ...while the stored gradients are left exactly as they were.
print(model.layer2.weight.grad[0][:10])


tensor([-0.0991, -0.0899, -0.0127,  0.0258, -0.0659, -0.0957,  0.0216,  0.0343,
         0.0900, -0.0311], grad_fn=<SliceBackward>)
tensor([ 1.3326,  0.2995, -0.3225,  0.9774,  0.8257,  2.6286,  0.6513,  3.2136,
         0.5124, -0.8974])


One important thing about the process: after calling optimizer.step(), you need to call optimizer.zero_grad(); otherwise, every time you call loss.backward(), the gradients on the learning weights will accumulate.

In [9]:
# Gradients still stored from the earlier backward() call.
print(model.layer2.weight.grad[0][0:10])

# Five more backward passes with no zero_grad() in between: each pass adds
# its gradients on top of whatever is already stored in .grad.
for i in range(5):
    prediction = model(some_input)
    loss = (ideal_output - prediction).pow(2).sum()
    loss.backward()
print(model.layer2.weight.grad[0][0:10])

# Reset the accumulated gradients. set_to_none=False keeps .grad as a tensor
# of zeros; recent PyTorch releases default to replacing .grad with None,
# which would make the indexing in the print below raise a TypeError.
optimizer.zero_grad(set_to_none=False)
print(model.layer2.weight.grad[0][0:10])

tensor([ 1.3326,  0.2995, -0.3225,  0.9774,  0.8257,  2.6286,  0.6513,  3.2136,
         0.5124, -0.8974])
tensor([ 25.4164,  -0.9029, -10.5874,  11.2315,   4.7047,   7.9834,   6.7588,
         15.1513,  -0.4925,   5.1818])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
