In [1]:
import torch
from torch.autograd import Variable
from torch.autograd import Function
import torch.nn.functional as F
import numpy as np

## Huber loss function
https://en.wikipedia.org/wiki/Huber_loss

In [2]:
# A loss function measures distance between a predicted and a target tensor
# An implementation of Huber loss function is given below
# We will make use of this loss function in gradient descent optimization
def Huber_Loss(input,delta):
  m = (torch.abs(input)<=delta).detach().float()
  output = torch.sum(0.5*m*input**2 + delta*(1.0-m)*(torch.abs(input)-0.5*delta))
  return output

# Test Huber loss with a couple of different examples

In [3]:
a = torch.tensor([[0.3, 2.0, -3.1],[0.5, 9.2, 0.1]])
print(a.numpy())
ha = Huber_Loss(a,1.0)
print(ha.numpy())

b = torch.tensor([0.3, 2.0])
print(b.numpy())
hb = Huber_Loss(b,1.0)
print(hb.numpy())

[[ 0.3  2.  -3.1]
 [ 0.5  9.2  0.1]]
12.975
[0.3 2. ]
1.545


In [4]:
def gradient_descent(var,optimizer,softmax,loss,target,nIter,nPrint):
  for i in range(nIter):
    z = softmax(var)
    f = loss(z-target,1.0)
    optimizer.zero_grad()
    f.backward()
    optimizer.step()
    if i%nPrint==0:
      with np.printoptions(precision=3, suppress=True):
        print("Iteration:",i,"Variable:", z.detach().numpy(),"Loss: %0.6f" % f.item())


In [5]:
y = torch.zeros(10)
y[2] = 1.0
print("Target 1-hot vector:",y.numpy())
x = Variable(torch.randn(y.shape),requires_grad=True)

optimizer = torch.optim.SGD([x], lr=1e-1, momentum=0.9) # create an optimizer that will do gradient descent optimization

gradient_descent(x,optimizer,F.softmax,Huber_Loss,y,1000,100)


Target 1-hot vector: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
Iteration: 0 Variable: [0.264 0.06  0.053 0.03  0.067 0.044 0.015 0.099 0.093 0.275] Loss: 0.536443
Iteration: 100 Variable: [0.008 0.006 0.945 0.004 0.007 0.005 0.002 0.008 0.008 0.008] Loss: 0.001678
Iteration: 200 Variable: [0.006 0.005 0.958 0.003 0.005 0.004 0.002 0.006 0.006 0.006] Loss: 0.000967


  This is separate from the ipykernel package so we can avoid doing imports until


Iteration: 300 Variable: [0.005 0.004 0.965 0.002 0.004 0.003 0.001 0.005 0.005 0.005] Loss: 0.000683
Iteration: 400 Variable: [0.004 0.003 0.969 0.002 0.004 0.003 0.001 0.004 0.004 0.004] Loss: 0.000527
Iteration: 500 Variable: [0.004 0.003 0.972 0.002 0.003 0.003 0.001 0.004 0.004 0.004] Loss: 0.000430
Iteration: 600 Variable: [0.004 0.003 0.975 0.002 0.003 0.002 0.001 0.004 0.004 0.004] Loss: 0.000362
Iteration: 700 Variable: [0.003 0.003 0.976 0.002 0.003 0.002 0.001 0.003 0.003 0.003] Loss: 0.000313
Iteration: 800 Variable: [0.003 0.003 0.978 0.002 0.003 0.002 0.001 0.003 0.003 0.003] Loss: 0.000276
Iteration: 900 Variable: [0.003 0.002 0.979 0.001 0.003 0.002 0.001 0.003 0.003 0.003] Loss: 0.000246


In [6]:
# Inherit from torch.autograd.Function
class My_f(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, x):
        f = torch.sum(x**2)
        ctx.save_for_backward(x,torch.tensor(2.0)) # note that the constant 2.0 is cast as a pytorch tensor before saving
        return f

    @staticmethod
    def backward(ctx, output_grad):
        # retrieve saved tensors and use them in derivative calculation
        x,two = ctx.saved_tensors
        # Return Jacobian-vector product (chain rule)
        input_grad = two*x*output_grad
                
        return input_grad

x = Variable(torch.tensor([3.0,-1.0,0.0,1.0]),requires_grad=True)
my_fval = My_f.apply(x)
print("When x is", x.data, "function value is",my_fval.item())

# compute gradient of f at x
g = torch.autograd.grad(my_fval,x)[0]

print("Gradient of f at x:",g.data)

When x is tensor([ 3., -1.,  0.,  1.]) function value is 11.0
Gradient of f at x: tensor([ 6., -2.,  0.,  2.])


In [7]:
# Inherit from torch.autograd.Function
class My_Huber_Loss(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, input, delta):
        m = (torch.abs(input)<=delta).float()
        ctx.save_for_backward(input,torch.tensor(m),torch.tensor(delta))
        output = torch.sum(0.5*m*input**2 + delta*(1.0-m)*(torch.abs(input)-0.5*delta))
        return output

    @staticmethod
    def backward(ctx, output_grad):
        # retrieve saved tensors and use them in derivative calculation
        input, m, delta = ctx.saved_tensors

        # Return Jacobian-vector producynt (chain rule)
        # For Huber loss function the Jacobian happens to be a diagonal matrix
        # Also, note that output_grad is a scalar, because forward function returns a scalar value

        # Take our gradient we derived before and multiply it by the output_grad
        input_grad = (m*input + delta*(1.0-m)*(input/torch.abs(input))) * output_grad

        # must return two gradients becuase forward function takes in two arguments
        return input_grad, None

# Gradient Descent on our Own Huber Loss

In [8]:
y = torch.zeros(10)
y[2] = 1.0
print("Target:",y.numpy())
x = Variable(torch.randn(y.shape),requires_grad=True)

optimizer = torch.optim.SGD([x], lr=1e-1, momentum=0.9) # create an optimizer that will do gradient descent optimization

gradient_descent(x,optimizer,F.softmax,My_Huber_Loss.apply,y,1000,100)


Target: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
Iteration: 0 Variable: [0.046 0.029 0.201 0.042 0.059 0.087 0.335 0.067 0.05  0.085] Loss: 0.390159
Iteration: 100 Variable: [0.005 0.003 0.948 0.004 0.006 0.007 0.008 0.006 0.005 0.007] Loss: 0.001521
Iteration: 200 Variable: [0.004 0.003 0.959 0.003 0.004 0.006 0.006 0.005 0.004 0.006]

  This is separate from the ipykernel package so we can avoid doing imports until
  


 Loss: 0.000919
Iteration: 300 Variable: [0.003 0.002 0.966 0.003 0.004 0.005 0.005 0.004 0.003 0.005] Loss: 0.000659
Iteration: 400 Variable: [0.003 0.002 0.97  0.003 0.003 0.004 0.005 0.004 0.003 0.004] Loss: 0.000514
Iteration: 500 Variable: [0.003 0.002 0.973 0.002 0.003 0.004 0.004 0.003 0.003 0.004] Loss: 0.000421
Iteration: 600 Variable: [0.002 0.002 0.975 0.002 0.003 0.003 0.004 0.003 0.002 0.003] Loss: 0.000356
Iteration: 700 Variable: [0.002 0.002 0.976 0.002 0.003 0.003 0.004 0.003 0.002 0.003] Loss: 0.000309
Iteration: 800 Variable: [0.002 0.001 0.978 0.002 0.002 0.003 0.003 0.003 0.002 0.003] Loss: 0.000272
Iteration: 900 Variable: [0.002 0.001 0.979 0.002 0.002 0.003 0.003 0.002 0.002 0.003] Loss: 0.000244


In [9]:
# Inherit from Function
class My_softmax(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, input):
        output = F.softmax(input,dim=0)
        ctx.save_for_backward(output) # this is the only tensor you will need to save for backward function
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, output_grad):
        # retrieve saved tensors and use them in derivative calculation
        output = ctx.saved_tensors[0]
        
        # Create an identity matrix to represent the i==j case
        eye = torch.eye(output.shape[0])

        # Subtract each S(1), S(2), .. from each row to account for both cases 
        eye.sub_(torch.reshape(output,(-1,1)))
    
        # Sum the product row by row 
        input_grad = torch.sum(output * eye * output_grad, dim=1)
        return input_grad

# Gradient Descent on our own Huber Loss and your own softmax

In [10]:
y = torch.zeros(10)
y[2] = 1.0
print(y)
x = Variable(torch.randn(y.shape),requires_grad=True)
print(x)

optimizer = torch.optim.SGD([x], lr=1e-1, momentum=0.9) # create an optimizer that will do gradient descent optimization

gradient_descent(x,optimizer,My_softmax.apply,My_Huber_Loss.apply,y,1000,100)


tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])
tensor([-2.1823, -0.4421,  0.2447,  1.6455, -1.3325,  0.7894, -1.2010, -1.5473,
        -0.3125, -0.5362], requires_grad=True)
Iteration: 0 Variable: [0.01  0.056 0.111 0.45  0.023 0.191 0.026 0.018 0.064 0.051] Loss: 0.520559
Iteration: 100 Variable: [0.001 0.007 0.951 0.008 0.003 0.01  0.004 0.003 0.007 0.006] Loss: 0.001366


  


Iteration: 200 Variable: [0.001 0.005 0.961 0.006 0.003 0.008 0.003 0.002 0.006 0.005] Loss: 0.000853
Iteration: 300 Variable: [0.001 0.005 0.967 0.005 0.002 0.007 0.002 0.002 0.005 0.004] Loss: 0.000621
Iteration: 400 Variable: [0.001 0.004 0.971 0.005 0.002 0.006 0.002 0.002 0.004 0.004] Loss: 0.000488
Iteration: 500 Variable: [0.001 0.004 0.973 0.004 0.002 0.005 0.002 0.001 0.004 0.003] Loss: 0.000402
Iteration: 600 Variable: [0.001 0.003 0.975 0.004 0.002 0.005 0.002 0.001 0.004 0.003] Loss: 0.000341
Iteration: 700 Variable: [0.001 0.003 0.977 0.004 0.002 0.005 0.002 0.001 0.003 0.003] Loss: 0.000297
Iteration: 800 Variable: [0.001 0.003 0.979 0.003 0.001 0.004 0.002 0.001 0.003 0.003] Loss: 0.000262
Iteration: 900 Variable: [0.001 0.003 0.98  0.003 0.001 0.004 0.002 0.001 0.003 0.003] Loss: 0.000235
