Authors: Zhewei Yao <https://github.com/yaozhewei>, Amir Gholami <http://amirgholami.org/>


This tutorial shows how to compute the Hessian information using (randomized) numerical linear algebra for both explicit Hessian (the matrix is given) as well as implicit Hessian (the matrix is ungiven).

We'll start by doing the necessary imports:

In [1]:
import numpy as np
import torch 
from torchvision import datasets, transforms
from utils import * # get the dataset
from pyhessian import hessian
from pyhessian.hessian_with_activation import hessian_with_activation # Hessian computation
from density_plot import get_esd_plot # ESD plot
from pytorchcv.model_provider import get_model as ptcv_get_model # model
from pyhessian.utils import group_product

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# enable cuda devices
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# device = torch.device("cuda:0,1")

In [15]:
# get the model 
model = ptcv_get_model("resnet20_cifar10", pretrained=True)

# change the model to eval mode to disable running stats upate
model.eval()


# create loss function
criterion = torch.nn.CrossEntropyLoss()

# get dataset 
train_loader, test_loader = getData(train_bs=20, train_length=0.02)

model = model.cuda()

# to print batch size and the number of batch
for inputs, targets in train_loader:
    break;
print(len(train_loader))    
print(len(inputs))

# make hessian object
hessian_comp = hessian_with_activation(model, criterion, dataloader=train_loader, cuda=True)

Files already downloaded and verified
50
20


In [16]:
target = "weight"

if target == "input":
    # insert forward and backword hook in model
    hessian_comp.insert_hook("conv") # insert hook to module which name ends with "conv"

    # compute input hessian from full dataset
    act_trace = hessian_comp.trace_activ(maxIter=2, tol=1e-6)
    print(np.mean(act_trace, axis=0))
elif target == "weight":
    weight_trace = hessian_comp.trace(maxIter=1, param_name='conv.weight')
    print(np.mean(weight_trace, axis=0))



trace had not been converge
[ 1.65418892e+01  6.69527054e+01  1.22185196e+02  3.60268745e+01
 -3.12565994e+00  8.31134491e+01  1.61412659e+01  1.71913330e+02
  1.87712421e+01  2.60156670e+01  1.55261860e+01 -6.51743174e+00
  2.11751614e+01 -5.25889540e+00  6.81667328e+01  1.59252289e+02
  1.00606794e+01 -9.92853642e-02  2.15795059e+01  3.80902596e+01
 -3.12092743e+01]


In [None]:
# to check hooked layer, param size and grad size
hessian_comp.get_activ_rand_v(show_layer=True, dont_reset=True) # if you not remove remark of get_same_size_activ_grad, show_layer option not work

# print hooked layer , param size and grad size
for layer in hessian_comp.activation_grads.keys():
    has_grad = hessian_comp.activation_grads[layer][0] #.require_grad()
    # print(dir(has_grad))
    if (has_grad is not None):
        print(f"*** {layer} : {has_grad.grad_fn}")

append features.stage1.unit1.body.conv1.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage1.unit1.body.conv2.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage1.unit2.body.conv1.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage1.unit2.body.conv2.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage1.unit3.body.conv1.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage1.unit3.body.conv2.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage2.unit1.identity_conv.conv, active size : torch.Size([20, 16, 32, 32]), active grad size : torch.Size([20, 16, 32, 32])
append features.stage2.unit1.bo

In [None]:
# From this block, all codes are for debug
from pyhessian.utils import *

# to check grad and param in 1 batch backward
device = hessian_comp.device
hessian_comp.insert_hook_quant_module("quant_convbn")
for inputs, targets in hessian_comp.data:
    hessian_comp.model.zero_grad()

    hessian_comp.reset_reg_active()

    outputs = hessian_comp.model(inputs.to(device))
    loss = hessian_comp.criterion(outputs, targets.to(device))
    loss.backward(create_graph=True)
    break
param, grad = get_params_grad(hessian_comp.model)

for p, g in zip(param, grad):
    print(p.size(), g.size(), g.grad_fn)

In [7]:

# # print(hessian_comp.activations.keys())
# print((hessian_comp.activation_grads["features.stage1.unit1.body.conv1.conv"][0][0].size()))
# print((hessian_comp.activations["features.stage1.unit1.body.conv1.conv"][0][0].size()))
# # print((hessian_comp.activations["features.stage1.unit1.body.conv1.conv"][0][0].grad))
# print((hessian_comp.activations["features.stage1.unit1.body.conv1.conv"][0][0]))
# print((hessian_comp.activation_grads["features.stage1.unit1.body.conv1.conv"][0][0]))
# # print((hessian_comp.activation_grads["features.stage1.unit1.body.conv1.conv"][0][0].grad_fn.next_functions))

In [9]:
for layer in hessian_comp.activations.keys():
    input_size = torch.randint_like(hessian_comp.activations[layer][0][0], high=2, device="cuda").size()
    if hessian_comp.activation_grads[layer][0][0] is not None:
        grad_size = hessian_comp.activation_grads[layer][0][0].size()
    else:
        grad_size = 1
    # print(type(hessian_comp.activation_grads[layer][i][0]))
    if(input_size != grad_size):
        print(f"************* {layer} not equal!! ************")
        print( input_size, grad_size, "\n\n")
    else:
        print(f"************* {layer} ************")
        print( input_size, "\n\n")


************* features.init_block.conv ************
torch.Size([5, 3, 32, 32]) 1 


************* features.stage1.unit1.body.conv1.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage1.unit1.body.conv2.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage1.unit2.body.conv1.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage1.unit2.body.conv2.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage1.unit3.body.conv1.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage1.unit3.body.conv2.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage2.unit1.identity_conv.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.stage2.unit1.identity_conv ************
torch.Size([5, 16, 32, 32]) torch.Size([5, 32, 16, 16]) 


************* features.stage2.unit1.body.conv1.conv ************
torch.Size([5, 16, 32, 32]) 


************* features.

In [9]:
"""
hessian_comp.activations["features.stage3.unit2.activ"] are saved inputs of each layer
while executing trace and dataload_hv_product for loop. 
They listed by trace and dataload_hv_product for loop.

Dim 1 of hessian_comp.activations["features.stage3.unit2.activ"][0] means batch.
"""
# v = torch.randint_like(hessian_comp.activations["features.stage1.unit1.body.conv1.conv"][0][0], high=2, device="cuda")
rand_vs = []
activ_grads = []
activs = []

for layer in hessian_comp.activations.keys():
    for i in range(len(hessian_comp.activations[layer])):
        activ_element = hessian_comp.activations[layer][i][0]
        grad_element = hessian_comp.activation_grads[layer][i][0]

        if (grad_element is None):
            continue
        elif grad_element.size() != activ_element.size() :
            continue
        else:    
            rand_vs.append(torch.randint_like(hessian_comp.activations[layer][i][0], high=2, device="cuda"))
            activ_grads.append(hessian_comp.activation_grads[layer][i][0])
            activs.append(hessian_comp.activations[layer][i][0])


Hv_list = []
trace_list = []
for (v, grad, activ) in zip(rand_vs, activ_grads, activs):
    Hv = torch.autograd.grad(
        grad, 
        activ, 
        grad_outputs=v, only_inputs=True, retain_graph=True)
    Hv_list.append(Hv)
    trace_list.append(group_product(Hv, v).cpu().item())

print(trace_list)


# Hv = torch.autograd.grad(
#     hessian_comp.activation_grads["features.stage1.unit1.body.conv1.conv"][0][0], 
#     hessian_comp.activations["features.stage1.unit1.body.conv1.conv"][0][0], 
#     grad_outputs=v, only_inputs=True, retain_graph=False)



# a = torch.autograd.grad(hessian_comp.activation_grads["features.stage3.unit2.activ"][1][0], hessian_comp.activations["features.stage3.unit2.activ"][1]) 
# print(hessian_comp.activations["features.stage3.unit2.activ"][1].size())

[0.2398936152458191, 0.2883727252483368, 0.06399604678153992, 2.021277904510498, 0.024379881098866463, 2.3017399311065674, 0.05182919651269913, 0.07555367797613144, 0.4853941798210144, 0.007302280515432358, 0.8529448509216309, 0.2646264433860779, 21.458599090576172, 0.010562198236584663, 0.03980473428964615, 3.2707457542419434, 1.180624008178711, 5.310753345489502, 2.88645339012146, 5.8211669921875]


In [12]:
layer1 = "features.stage1.unit1.body.conv1.conv"

tmp_v = torch.randint_like(hessian_comp.activations[layer1][0][0], high=2, device="cuda")


In [19]:
layer2 = "features.stage1.unit1.body.conv1.conv"

Hv = torch.autograd.grad(
    hessian_comp.activation_grads[layer1][0][0], 
    hessian_comp.activations[layer2][0][0], 
    grad_outputs=tmp_v, only_inputs=True, retain_graph=True)



print(group_product(Hv, tmp_v).cpu().item())


0.6590860486030579
