forked from scipy/scipy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
_hessian_update_strategy.py
430 lines (371 loc) · 15.6 KB
/
_hessian_update_strategy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
"""Hessian update strategies for quasi-Newton optimization methods."""
from __future__ import division, print_function, absolute_import
import numpy as np
from numpy.linalg import norm
from scipy.linalg import get_blas_funcs
from warnings import warn
__all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1']
class HessianUpdateStrategy(object):
"""Interface for implementing Hessian update strategies.
Many optimization methods make use of Hessian (or inverse Hessian)
approximations, such as the quasi-Newton methods BFGS, SR1, L-BFGS.
Some of these approximations, however, do not actually need to store
the entire matrix or can compute the internal matrix product with a
given vector in a very efficiently manner. This class serves as an
abstract interface between the optimization algorithm and the
quasi-Newton update strategies, giving freedom of implementation
to store and update the internal matrix as efficiently as possible.
Different choices of initialization and update procedure will result
in different quasi-Newton strategies.
Four methods should be implemented in derived classes: ``initialize``,
``update``, ``dot`` and ``get_matrix``.
Notes
-----
Any instance of a class that implements this interface,
can be accepted by the method ``minimize`` and used by
the compatible solvers to approximate the Hessian (or
inverse Hessian) used by the optimization algorithms.
"""
def initialize(self, n, approx_type):
"""Initialize internal matrix.
Allocate internal memory for storing and updating
the Hessian or its inverse.
Parameters
----------
n : int
Problem dimension.
approx_type : {'hess', 'inv_hess'}
Selects either the Hessian or the inverse Hessian.
When set to 'hess' the Hessian will be stored and updated.
When set to 'inv_hess' its inverse will be used instead.
"""
raise NotImplementedError("The method ``initialize(n, approx_type)``"
" is not implemented.")
def update(self, delta_x, delta_grad):
"""Update internal matrix.
Update Hessian matrix or its inverse (depending on how 'approx_type'
is defined) using information about the last evaluated points.
Parameters
----------
delta_x : ndarray
The difference between two points the gradient
function have been evaluated at: ``delta_x = x2 - x1``.
delta_grad : ndarray
The difference between the gradients:
``delta_grad = grad(x2) - grad(x1)``.
"""
raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
" is not implemented.")
def dot(self, p):
"""Compute the product of the internal matrix with the given vector.
Parameters
----------
p : array_like
1-d array representing a vector.
Returns
-------
Hp : array
1-d represents the result of multiplying the approximation matrix
by vector p.
"""
raise NotImplementedError("The method ``dot(p)``"
" is not implemented.")
def get_matrix(self):
"""Return current internal matrix.
Returns
-------
H : ndarray, shape (n, n)
Dense matrix containing either the Hessian
or its inverse (depending on how 'approx_type'
is defined).
"""
raise NotImplementedError("The method ``get_matrix(p)``"
" is not implemented.")
class FullHessianUpdateStrategy(HessianUpdateStrategy):
"""Hessian update strategy with full dimensional internal representation.
"""
_syr = get_blas_funcs('syr', dtype='d') # Symmetric rank 1 update
_syr2 = get_blas_funcs('syr2', dtype='d') # Symmetric rank 2 update
# Symmetric matrix-vector product
_symv = get_blas_funcs('symv', dtype='d')
def __init__(self, init_scale='auto'):
self.init_scale = init_scale
# Until initialize is called we can't really use the class,
# so it makes sense to set everything to None.
self.first_iteration = None
self.approx_type = None
self.B = None
self.H = None
def initialize(self, n, approx_type):
"""Initialize internal matrix.
Allocate internal memory for storing and updating
the Hessian or its inverse.
Parameters
----------
n : int
Problem dimension.
approx_type : {'hess', 'inv_hess'}
Selects either the Hessian or the inverse Hessian.
When set to 'hess' the Hessian will be stored and updated.
When set to 'inv_hess' its inverse will be used instead.
"""
self.first_iteration = True
self.n = n
self.approx_type = approx_type
if approx_type not in ('hess', 'inv_hess'):
raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
# Create matrix
if self.approx_type == 'hess':
self.B = np.eye(n, dtype=float)
else:
self.H = np.eye(n, dtype=float)
def _auto_scale(self, delta_x, delta_grad):
# Heuristic to scale matrix at first iteration.
# Described in Nocedal and Wright "Numerical Optimization"
# p.143 formula (6.20).
s_norm2 = np.dot(delta_x, delta_x)
y_norm2 = np.dot(delta_grad, delta_grad)
ys = np.abs(np.dot(delta_grad, delta_x))
if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
return 1
if self.approx_type == 'hess':
return y_norm2 / ys
else:
return ys / y_norm2
def _update_implementation(self, delta_x, delta_grad):
raise NotImplementedError("The method ``_update_implementation``"
" is not implemented.")
def update(self, delta_x, delta_grad):
"""Update internal matrix.
Update Hessian matrix or its inverse (depending on how 'approx_type'
is defined) using information about the last evaluated points.
Parameters
----------
delta_x : ndarray
The difference between two points the gradient
function have been evaluated at: ``delta_x = x2 - x1``.
delta_grad : ndarray
The difference between the gradients:
``delta_grad = grad(x2) - grad(x1)``.
"""
if np.all(delta_x == 0.0):
return
if np.all(delta_grad == 0.0):
warn('delta_grad == 0.0. Check if the approximated '
'function is linear. If the function is linear '
'better results can be obtained by defining the '
'Hessian as zero instead of using quasi-Newton '
'approximations.', UserWarning)
return
if self.first_iteration:
# Get user specific scale
if self.init_scale == "auto":
scale = self._auto_scale(delta_x, delta_grad)
else:
scale = float(self.init_scale)
# Scale initial matrix with ``scale * np.eye(n)``
if self.approx_type == 'hess':
self.B *= scale
else:
self.H *= scale
self.first_iteration = False
self._update_implementation(delta_x, delta_grad)
def dot(self, p):
"""Compute the product of the internal matrix with the given vector.
Parameters
----------
p : array_like
1-d array representing a vector.
Returns
-------
Hp : array
1-d represents the result of multiplying the approximation matrix
by vector p.
"""
if self.approx_type == 'hess':
return self._symv(1, self.B, p)
else:
return self._symv(1, self.H, p)
def get_matrix(self):
"""Return the current internal matrix.
Returns
-------
M : ndarray, shape (n, n)
Dense matrix containing either the Hessian or its inverse
(depending on how `approx_type` was defined).
"""
if self.approx_type == 'hess':
M = np.copy(self.B)
else:
M = np.copy(self.H)
li = np.tril_indices_from(M, k=-1)
M[li] = M.T[li]
return M
class BFGS(FullHessianUpdateStrategy):
"""Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.
Parameters
----------
exception_strategy : {'skip_update', 'damp_update'}, optional
Define how to proceed when the curvature condition is violated.
Set it to 'skip_update' to just skip the update. Or, alternatively,
set it to 'damp_update' to interpolate between the actual BFGS
result and the unmodified matrix. Both exceptions strategies
are explained in [1]_, p.536-537.
min_curvature : float
This number, scaled by a normalization factor, defines the
minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
unaffected by the exception strategy. By default is equal to
1e-8 when ``exception_strategy = 'skip_update'`` and equal
to 0.2 when ``exception_strategy = 'damp_update'``.
init_scale : {float, 'auto'}
Matrix scale at first iteration. At the first
iteration the Hessian matrix or its inverse will be initialized
with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
Set it to 'auto' in order to use an automatic heuristic for choosing
the initial scale. The heuristic is described in [1]_, p.143.
By default uses 'auto'.
Notes
-----
The update is based on the description in [1]_, p.140.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
Second Edition (2006).
"""
def __init__(self, exception_strategy='skip_update', min_curvature=None,
init_scale='auto'):
if exception_strategy == 'skip_update':
if min_curvature is not None:
self.min_curvature = min_curvature
else:
self.min_curvature = 1e-8
elif exception_strategy == 'damp_update':
if min_curvature is not None:
self.min_curvature = min_curvature
else:
self.min_curvature = 0.2
else:
raise ValueError("`exception_strategy` must be 'skip_update' "
"or 'damp_update'.")
super(BFGS, self).__init__(init_scale)
self.exception_strategy = exception_strategy
def _update_inverse_hessian(self, ys, Hy, yHy, s):
"""Update the inverse Hessian matrix.
BFGS update using the formula:
``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
- 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``
where ``s = delta_x`` and ``y = delta_grad``. This formula is
equivalent to (6.17) in [1]_ written in a more efficient way
for implementation.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
Second Edition (2006).
"""
self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
self.H = self._syr((ys+yHy)/ys**2, s, a=self.H)
def _update_hessian(self, ys, Bs, sBs, y):
"""Update the Hessian matrix.
BFGS update using the formula:
``B <- B - (B*s)*(B*s).T/s.T*(B*s) + y*y^T/s.T*y``
where ``s`` is short for ``delta_x`` and ``y`` is short
for ``delta_grad``. Formula (6.19) in [1]_.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
Second Edition (2006).
"""
self.B = self._syr(1.0 / ys, y, a=self.B)
self.B = self._syr(-1.0 / sBs, Bs, a=self.B)
def _update_implementation(self, delta_x, delta_grad):
# Auxiliary variables w and z
if self.approx_type == 'hess':
w = delta_x
z = delta_grad
else:
w = delta_grad
z = delta_x
# Do some common operations
wz = np.dot(w, z)
Mw = self.dot(w)
wMw = Mw.dot(w)
# Guarantee that wMw > 0 by reinitializing matrix.
# While this is always true in exact arithmetics,
# indefinite matrix may appear due to roundoff errors.
if wMw <= 0.0:
scale = self._auto_scale(delta_x, delta_grad)
# Reinitialize matrix
if self.approx_type == 'hess':
self.B = scale * np.eye(self.n, dtype=float)
else:
self.H = scale * np.eye(self.n, dtype=float)
# Do common operations for new matrix
Mw = self.dot(w)
wMw = Mw.dot(w)
# Check if curvature condition is violated
if wz <= self.min_curvature * wMw:
# If the option 'skip_update' is set
# we just skip the update when the condion
# is violated.
if self.exception_strategy == 'skip_update':
return
# If the option 'damp_update' is set we
# interpolate between the actual BFGS
# result and the unmodified matrix.
elif self.exception_strategy == 'damp_update':
update_factor = (1-self.min_curvature) / (1 - wz/wMw)
z = update_factor*z + (1-update_factor)*Mw
wz = np.dot(w, z)
# Update matrix
if self.approx_type == 'hess':
self._update_hessian(wz, Mw, wMw, z)
else:
self._update_inverse_hessian(wz, Mw, wMw, z)
class SR1(FullHessianUpdateStrategy):
"""Symmetric-rank-1 Hessian update strategy.
Parameters
----------
min_denominator : float
This number, scaled by a normalization factor,
defines the minimum denominator magnitude allowed
in the update. When the condition is violated we skip
the update. By default uses ``1e-8``.
init_scale : {float, 'auto'}, optional
Matrix scale at first iteration. At the first
iteration the Hessian matrix or its inverse will be initialized
with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
Set it to 'auto' in order to use an automatic heuristic for choosing
the initial scale. The heuristic is described in [1]_, p.143.
By default uses 'auto'.
Notes
-----
The update is based on the description in [1]_, p.144-146.
References
----------
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
Second Edition (2006).
"""
def __init__(self, min_denominator=1e-8, init_scale='auto'):
self.min_denominator = min_denominator
super(SR1, self).__init__(init_scale)
def _update_implementation(self, delta_x, delta_grad):
# Auxiliary variables w and z
if self.approx_type == 'hess':
w = delta_x
z = delta_grad
else:
w = delta_grad
z = delta_x
# Do some common operations
Mw = self.dot(w)
z_minus_Mw = z - Mw
denominator = np.dot(w, z_minus_Mw)
# If the denominator is too small
# we just skip the update.
if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
return
# Update matrix
if self.approx_type == 'hess':
self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
else:
self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)