there is always a problem of how do we setup a loss function and optimize it. Choosing the right loss function increase the chances of model convergence.

In [34]:
import torch

In [35]:
torch.__version__

'1.12.1'

In [36]:
torch.tensor

<function torch._VariableFunctionsClass.tensor>

In [37]:
t_c = torch.tensor([0.,1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11])
t_u = torch.tensor([0.4,1.2,2.5,3.9,4.3,5.7,6.2,7.9,8.3,9.5,10.5,11.5])

In [38]:
t_c

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])

In [39]:
t_u

tensor([ 0.4000,  1.2000,  2.5000,  3.9000,  4.3000,  5.7000,  6.2000,  7.9000,
         8.3000,  9.5000, 10.5000, 11.5000])

define model, loss_fn

In [40]:
def model(t_u, w, b):
    return w * t_u + b

In [41]:
def loss_fn(t_p, t_c):
    squared_diffs = (t_p - t_c) ** 2
    return squared_diffs.mean()

In [42]:
w = torch.ones(1)
b = torch.zeros(1)

model infer of training set tensor(t_u)

In [43]:
t_p = model(t_u, w, b)
t_p

tensor([ 0.4000,  1.2000,  2.5000,  3.9000,  4.3000,  5.7000,  6.2000,  7.9000,
         8.3000,  9.5000, 10.5000, 11.5000])

In [44]:
# count loss function value
loss = loss_fn(t_p, t_c)
loss

tensor(0.2942)

The errors in the first round of iteration is backpropagated to reduce the error in the second round, for which the initial set of weight should be updated.

In [45]:
delta = 0.1
loss_with_delta = loss_fn(model(t_u, w+delta, b), t_c)
loss_before_delta = loss_fn(model(t_u, w, b), t_c)
loss_rate_of_change_w = (loss_with_delta - loss_before_delta)
loss_rate_of_change_w

tensor(1.0976)

In [46]:
learning_rate = 1e-2
w = w - learning_rate * loss_rate_of_change_w

# first, delta > 0, so, if loss_rate_of_change_w > 0, then we can realize that the optimization could go to a bad direction,
# and the weight(w) should avoid of this direction. 
# if loss_rate_of_change_w < 0, the w should grow, too.

In [48]:
delta = 0.1
learning_rate = 1e-2

loss_rate_of_change_w = (loss_fn(model(t_u, w+delta,b), t_c) - loss_fn(model(t_u, w-delta, b), t_c)) / (2.0*delta)
w = w - learning_rate * loss_rate_of_change_w

loss_rate_of_change_b =  (loss_fn(model(t_u, w,b+delta), t_c) - loss_fn(model(t_u, w, b-delta), t_c)) / (2.0*delta)
b = b - learning_rate * loss_rate_of_change_b


In [49]:
w,b

(tensor([0.9379]), tensor([-0.0024]))

In [50]:
from torch import nn
loss = nn.MSELoss()
input = torch.randn(10,5, requires_grad = True)
target = torch.randn(10,5)
output = loss(input,target)
output.backward()

In [51]:
output

tensor(1.5577, grad_fn=<MseLossBackward0>)

In [52]:
output.grad_fn

<MseLossBackward0 at 0x260da947220>

How do we estimate the derivative of a loss function?

In [53]:
def dloss_fn(t_p, t_c):
    dsq_diffs = 2 * (t_p - t_c)
    return dsq_diffs

def model(t_u, w, b):
    return w * t_u + b

def dmodel_dw(t_u, w, b):
    return t_u

def dmodel_db(t_u, w, b):
    return 1.0

def grad_fn(t_u, t_c, t_p, w, b):
    dloss_dw = dloss_fn(t_p, t_c) * dmodel_dw(t_u, w, b)
    dloss_db = dloss_fn(t_p, t_c) * dmodel_db(t_u, w, b)
    return torch.stack([dloss_dw.mean(), dloss_db.mean()])

In [54]:
params = torch.tensor([1.0,1.0])

nepochs = 100

learning_rate = 1e-2

for epoch in range(nepochs):
    w, b  = params
    t_p = model(t_u, w, b)
    loss = loss_fn(t_p, t_c)
    print(f'Epoch {epoch}: Loss {float(loss)}')

    grad = grad_fn(t_u, t_c, t_p, w, b)

    print(f"params: {params}, Grad: {grad}")

    params = params - learning_rate * grad

params

Epoch 0: Loss 2.2774999141693115
params: tensor([1., 1.]), Grad: tensor([18.1550,  2.9833])
Epoch 1: Loss 0.5418065786361694
params: tensor([0.8184, 0.9702]), Grad: tensor([0.3527, 0.7481])
Epoch 2: Loss 0.5359362959861755
params: tensor([0.8149, 0.9627]), Grad: tensor([-0.0759,  0.6909])
Epoch 3: Loss 0.5311182141304016
params: tensor([0.8157, 0.9558]), Grad: tensor([-0.0858,  0.6861])
Epoch 4: Loss 0.5263487100601196
params: tensor([0.8165, 0.9489]), Grad: tensor([-0.0856,  0.6827])
Epoch 5: Loss 0.5216265320777893
params: tensor([0.8174, 0.9421]), Grad: tensor([-0.0852,  0.6793])
Epoch 6: Loss 0.5169512629508972
params: tensor([0.8182, 0.9353]), Grad: tensor([-0.0847,  0.6759])
Epoch 7: Loss 0.5123223662376404
params: tensor([0.8191, 0.9285]), Grad: tensor([-0.0843,  0.6726])
Epoch 8: Loss 0.5077394843101501
params: tensor([0.8199, 0.9218]), Grad: tensor([-0.0839,  0.6692])
Epoch 9: Loss 0.5032020807266235
params: tensor([0.8208, 0.9151]), Grad: tensor([-0.0835,  0.6659])
Epoch 10: 

tensor([0.8820, 0.4269])