grad_cov_utils.py
"""
utils for gradient covariance experiment
"""
import itertools
import tqdm
import torch


def initialize_grad_list(model):
    """Create per-parameter accumulators for gradient first and second moments."""
    grad_list = []
    for param in model.parameters():
        param_dict = {
            "sum": torch.zeros_like(param, requires_grad=False),
            "sq_sum": torch.zeros_like(param, requires_grad=False),
            "num_models": 0,
        }
        grad_list.append(param_dict)
    return grad_list


def collect_grads(model, model_grads_list):
    """Accumulate the current gradients (and their squares) into the running sums."""
    for (param, grad_dict) in zip(model.parameters(), model_grads_list):
        grad_dict["sum"] += param.grad.data
        grad_dict["sq_sum"] += param.grad.data ** 2.0
        grad_dict["num_models"] += 1


def compute_opt_lr(grad_list, momentum, dataset_size):
    """Estimate the suggested learning rate and gradient-noise trace from the accumulated statistics."""
    var_diag_sum = 0
    num_params = 0

    if grad_list[0]["num_models"] < 2:
        print("Fewer than two gradient samples stored; cannot estimate the variance yet")
        return None, None

    for grad_dict in grad_list:
        # (1/n sum x_i)^2
        first_moment_squared = (grad_dict["sum"] / grad_dict["num_models"]) ** 2
        # 1/n sum x_i^2
        second_moment = grad_dict["sq_sum"] / grad_dict["num_models"]
        # E(x^2) - E(x)^2
        var = second_moment - first_moment_squared
        var_diag_sum += var.sum()
        num_params += var.numel()

    # this is the noise up-scaled by a factor of S, so it cancels with the batch-size factor in the lr formula below
    grad_noise = var_diag_sum.item()

    # optimal lr is 2 * \mu * S / N * D / tr(C)
    return 2 * num_params / (dataset_size * grad_noise), grad_noise


def train_epoch(
    loader,
    model,
    criterion,
    optimizer,
    model_grads_list=None,
    cuda=True,
    regression=False,
    verbose=False,
    subset=None,
):
    """Train for one epoch, optionally accumulating gradient statistics along the way."""
    loss_sum = 0.0
    correct = 0.0
    verb_stage = 0

    num_objects_current = 0
    num_batches = len(loader)
    # record the full dataset size before the loader is (possibly) truncated below
    dataset_size = len(loader.dataset)

    model.train()

    if subset is not None:
        num_batches = int(num_batches * subset)
        loader = itertools.islice(loader, num_batches)

    if verbose:
        loader = tqdm.tqdm(loader, total=num_batches)

    for i, (input, target) in enumerate(loader):
        if cuda:
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        loss, output = criterion(model, input, target)

        optimizer.zero_grad()
        loss.backward()

        if model_grads_list is not None:
            # store gradients
            collect_grads(model, model_grads_list)

            # compute optimal learning rates
            # note that \mu = 1 - sgd['momentum'] bc of differences in pytorch's implementation
            lr, grad_noise = compute_opt_lr(
                model_grads_list,
                1 - optimizer.param_groups[0]["momentum"],
                dataset_size,
            )

        optimizer.step()

        loss_sum += loss.data.item() * input.size(0)

        if not regression:
            pred = output.data.argmax(1, keepdim=True)
            correct += pred.eq(target.data.view_as(pred)).sum().item()

        num_objects_current += input.size(0)

        if verbose and 10 * (i + 1) / num_batches >= verb_stage + 1:
            print(
                "Stage %d/10. Loss: %12.4f. Acc: %6.2f"
                % (
                    verb_stage + 1,
                    loss_sum / num_objects_current,
                    correct / num_objects_current * 100.0,
                )
            )
            if model_grads_list is not None and lr is not None:
                print("Learning Rate: %.3f. tr(V(\\hat{g})): %.3f" % (lr, grad_noise))
            verb_stage += 1

    return {
        "loss": loss_sum / num_objects_current,
        "accuracy": None if regression else correct / num_objects_current * 100.0,
    }
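

# ---------------------------------------------------------------------------
# Illustrative usage (a minimal sketch, not part of the original experiment):
# the toy model, random data, and cross-entropy criterion below are
# hypothetical placeholders, and the criterion is assumed to follow the
# (model, input, target) -> (loss, output) signature expected by train_epoch.
if __name__ == "__main__":
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset

    torch.manual_seed(0)
    X = torch.randn(256, 10)
    y = torch.randint(0, 2, (256,))
    loader = DataLoader(TensorDataset(X, y), batch_size=32)

    model = nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    def criterion(model, input, target):
        # returns (loss, output), matching the interface used by train_epoch
        output = model(input)
        return nn.functional.cross_entropy(output, target), output

    # accumulate per-parameter gradient statistics over one epoch
    grad_list = initialize_grad_list(model)
    stats = train_epoch(
        loader,
        model,
        criterion,
        optimizer,
        model_grads_list=grad_list,
        cuda=False,
    )

    # estimate the gradient-noise trace and the suggested learning rate
    lr, grad_noise = compute_opt_lr(
        grad_list,
        1 - optimizer.param_groups[0]["momentum"],
        len(loader.dataset),
    )
    print(stats)
    print("suggested lr: %.4f, gradient-noise trace: %.4f" % (lr, grad_noise))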