In [2]:
import torch
import numpy as np
from torch.optim import Optimizer
from torch.autograd import Variable

In [164]:
class hessian(Optimizer):
    def __init__(self, params, lr=1e-3, momentum = 0.9):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= momentum:
            raise ValueError("Invalid momentum parameter: {}".format(momentum))
        self.iteration = -1
        defaults = dict(lr=lr, momentum = momentum)
        super(hessian, self).__init__(params, defaults)
        #initialize variables 
        for group in self.param_groups:
             for p in group['params']:
                state = self.state[p]
                state['prev_param'] = torch.zeros_like(p)
                state['prev_g'] = torch.zeros_like(p)
                state['current_param'] = torch.zeros_like(p)
    
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()
        self.iteration += 1
        for group in self.param_groups:
            momentum = group['momentum']
            lr = group['lr']
            vector = []
            grads = []
            param = []
            for p in group['params']:
                if p.grad is None:
                    continue
                state = self.state[p]
                prev_param, prev_g, current_param = state['prev_param'], state['prev_g'], state['current_param']
                #print(state)
                with torch.no_grad():
                    if(self.iteration == 0):
                        prev_g.add_(p.grad)
                        prev_param.add_(p)
                        g_norm = torch.div(torch.norm(prev_g), prev_g)
                        current_param.add_(prev_param.add(g_norm, alpha = -group['lr']))
                        p.add_(g_norm, alpha = -group['lr'])
                        param.append(p)
                    else:
                        vector.append(current_param.add(prev_param, alpha = -1))
                        grads.append(p.grad)
                        param.append(p)
                    
            if(self.iteration > 0):
                dot_product = sum([(g * v).sum() for g, v in zip(grads, vector)])
                hvp = torch.autograd.grad(dot_product, param)
                with torch.no_grad():
                    i = 0
                    for p in group['params']:
                        state = self.state[p]
                        prev_param, prev_g, current_param = state['prev_param'], state['prev_g'], state['current_param']
                        prev_g.add_(hvp[i]).mul_(1-group['momentum']).add_(p.grad, alpha = group['momentum'])
                        prev_param.copy_(current_param)
                        g_norm = torch.div(torch.norm(prev_g), prev_g)
                        current_param.add_(g_norm, alpha = -group['lr'])
                        p.add_(g_norm, alpha = -group['lr'])
                        i += 1
                    
               
        return loss
                    
                    
                    
                    
                    
                
                

In [165]:
# solution here
import torch.nn as nn
class Neuron(nn.Module):
    def __init__(self, input_size, output_size):
        super(Neuron, self).__init__()
        self.fc1 = nn.Linear(input_size,output_size)
        self.sig = nn.Sigmoid()
    def forward(self, x):
        out1 = self.fc1(x)
        out2 = self.sig(out1)
        return out2
x = torch.tensor([-1.0, -2.0, 1.0], requires_grad = True)
net = Neuron(3,1)
print(net)


Neuron(
  (fc1): Linear(in_features=3, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [166]:
x = torch.tensor([[1.2, 1], [0.2, 1.4], [0.5, 0.5], 
                  [-1.5, -1.3], [0.2, -1.4], [-0.7, -0.5]])
y = torch.tensor([0, 0, 0, 1, 1, 1 ])

my_neuron = Neuron(2,1)
optimizer1 = hessian(my_neuron.parameters())
optimizer2 = torch.optim.SGD(my_neuron.parameters(), lr = 0.001, momentum = 0.9)
criterion = nn.CrossEntropyLoss()
total_loss = []
for i in range(20):
    optimizer1.zero_grad()
    out = my_neuron(x)
    loss = criterion(torch.cat((out, 1-out), axis=1), y)
    loss.backward(create_graph = True)
    para_list = [x for x in my_neuron.parameters()]
    optimizer1.step()
print(loss)



tensor(0.8038, grad_fn=<NllLossBackward>)
