# Copyright 2018 The TensorFlow Probability Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""SimpleStepSizeAdaptation TransitionKernel."""

import collections

import numpy as np
import tensorflow.compat.v2 as tf

from tensorflow_probability.python.internal import assert_util
from tensorflow_probability.python.internal import dtype_util
from tensorflow_probability.python.internal import prefer_static
from tensorflow_probability.python.internal import tensorshape_util
from tensorflow_probability.python.internal import unnest
from tensorflow_probability.python.math.generic import reduce_logmeanexp
from tensorflow_probability.python.mcmc import kernel as kernel_base
from tensorflow_probability.python.mcmc.internal import util as mcmc_util


def hmc_like_step_size_setter_fn(kernel_results, new_step_size):
  """Setter for `step_size` so it can be adapted."""
  return unnest.replace_innermost(kernel_results, step_size=new_step_size)


def hmc_like_step_size_getter_fn(kernel_results):
  """Getter for `step_size` so it can be inspected."""
  return unnest.get_innermost(kernel_results, 'step_size')


def hmc_like_log_accept_prob_getter_fn(kernel_results):
  """Getter for `log_accept_prob` so it can be inspected."""
  log_accept_ratio = unnest.get_innermost(kernel_results, 'log_accept_ratio')
  # Treat non-finite acceptance ratios (e.g. from diverged trajectories) as
  # certain rejections.
  safe_accept_ratio = tf.where(
      tf.math.is_finite(log_accept_ratio),
      log_accept_ratio,
      tf.constant(-np.inf, dtype=log_accept_ratio.dtype))
  # Acceptance probabilities are capped at 1, i.e. log probs are capped at 0.
  return tf.minimum(safe_accept_ratio, 0.)


def get_differing_dims(a, b):
  # Get the indices of dimensions where shapes of `a` and `b` differ.
  # `a` is allowed to have fewer dimensions than `b`.
  if (not tensorshape_util.is_fully_defined(a.shape) or
      not tensorshape_util.is_fully_defined(b.shape)):
    return tf.where(
        tf.not_equal(tf.shape(a), tf.shape(b)[:tf.rank(a)]))[:, 0]
  a_shape = np.int32(a.shape)
  b_shape = np.int32(b.shape)
  return np.where(a_shape != b_shape[:len(a_shape)])[0]


class SimpleStepSizeAdaptationResults(
    mcmc_util.PrettyNamedTupleMixin,
    collections.namedtuple(
        'SimpleStepSizeAdaptationResults',
        [
            'inner_results',
            'target_accept_prob',
            'adaptation_rate',
            'step',
            'new_step_size',
        ])):
  """Results of the SimpleStepSizeAdaptation TransitionKernel.

  Attributes:
    inner_results: Results of the inner kernel.
    target_accept_prob: Floating point scalar `Tensor`. Target accept
      probability.
    adaptation_rate: Floating point scalar `Tensor`. Fraction by which to
      adjust the step size during each step.
    step: Int32 scalar `Tensor`. The current step number as perceived by this
      kernel. Increases by 1 for every call to `one_step`.
    new_step_size: Floating point scalar `Tensor` or a list thereof (one for
      each `state_part`). Step size that will be passed to the inner kernel
      during the next step.
  """
  __slots__ = ()


class SimpleStepSizeAdaptation(kernel_base.TransitionKernel):
  """Adapts the inner kernel's `step_size` based on `log_accept_prob`.

  The simple policy multiplicatively increases or decreases the `step_size` of
  the inner kernel based on the value of `log_accept_prob`. It is based on
  [equation 19 of Andrieu and Thoms (2008)][1]. Given enough steps and a small
  enough `adaptation_rate`, the median of the distribution of the acceptance
  probability will converge to the `target_accept_prob`. A good target
  acceptance probability depends on the inner kernel. If the inner kernel is
  `HamiltonianMonteCarlo`, then 0.6-0.9 is a good range to aim for. For
  `RandomWalkMetropolis` this should be closer to 0.25. See the individual
  kernels' docstrings for guidance.
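
  In pseudocode, the policy amounts to the following update after each step (a
  sketch of the multiplicative rule described above, not the exact internals,
  which operate on log probabilities and handle multi-part step sizes):

  ```python
  if p_accept > target_accept_prob:
    step_size *= 1. + adaptation_rate
  else:
    step_size /= 1. + adaptation_rate
  ```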

  In general, adaptation prevents the chain from reaching a stationary
  distribution, so obtaining consistent samples requires `num_adaptation_steps`
  be set to a value [somewhat smaller][2] than the number of burnin steps.
  However, it may sometimes be helpful to set `num_adaptation_steps` to a
  larger value during development in order to inspect the behavior of the
  chain during adaptation.

  The step size is assumed to broadcast with the chain state, potentially
  having leading dimensions corresponding to multiple chains. When there are
  fewer of those leading dimensions than there are chain dimensions, the
  corresponding dimensions in `log_accept_prob` are averaged (in the direct
  space, rather than the log space) before being used to adjust the step size.
  This means that this kernel can do either cross-chain adaptation or
  per-chain step size adaptation, depending on the shape of the step size.

  For example, if your problem has a state with shape `[S]`, your chain state
  has shape `[C0, C1, S]` (meaning that there are `C0 * C1` total chains) and
  `log_accept_prob` has shape `[C0, C1]` (one acceptance probability per
  chain), then depending on the shape of the step size, the following will
  happen:

  - Step size has shape [], [S] or [1], the `log_accept_prob` will be averaged
    across its `C0` and `C1` dimensions. This means that you will learn a
    shared step size based on the mean acceptance probability across all
    chains. This can be useful if you don't have a lot of steps to adapt and
    want to average away the noise.

  - Step size has shape [C1, 1] or [C1, S], the `log_accept_prob` will be
    averaged across its `C0` dimension. This means that you will learn a
    shared step size based on the mean acceptance probability across chains
    that share the coordinate across the `C1` dimension. This can be useful
    when the `C1` dimension indexes different distributions, while `C0`
    indexes replicas of a single distribution, all sampled in parallel.

  - Step size has shape [C0, C1, 1] or [C0, C1, S], no averaging will happen.
    This means that each chain will learn its own step size (see the sketch
    after this list). This can be useful when all chains are sampling from
    different distributions. Even when all chains are for the same
    distribution, this can help during the initial warmup period.

  - Step size has shape [C0, 1, 1] or [C0, 1, S], the `log_accept_prob` will
    be averaged across its `C1` dimension. This means that you will learn a
    shared step size based on the mean acceptance probability across chains
    that share the coordinate across the `C0` dimension. This can be useful
    when the `C0` dimension indexes different distributions, while `C1`
    indexes replicas of a single distribution, all sampled in parallel.
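
  As a concrete sketch of the per-chain case above (hypothetical shapes: a
  chain state of shape `[C0, C1, S] = [4, 8, 3]`):

  ```python
  # One step size per chain; the trailing 1 broadcasts over the state
  # dimension `S`, so no averaging of `log_accept_prob` takes place.
  step_size = 0.1 * tf.ones([4, 8, 1])
  ```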

  By default, the averaging function used above is the arithmetic mean, which
  is not robust to stuck chains (e.g. the average of one chain with
  `p_accept = 0` and three chains with `p_accept = 1` is `p_accept = 0.75`,
  which will cause this kernel to keep the step size roughly the same rather
  than reducing it to unstick the stuck chain). A more robust choice would be
  to set the `reduce_fn` argument to `tfp.math.reduce_log_harmonic_mean_exp`
  [3]. Note, however, that the harmonic mean of a set of numbers is usually
  smaller than the arithmetic mean, so its use will typically produce smaller
  than optimal step sizes even for well-behaved target distributions.
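
  For instance, a minimal sketch of plugging in the more robust reduction
  (reusing the `kernel` construction from the example below):

  ```python
  kernel = tfp.mcmc.SimpleStepSizeAdaptation(
      inner_kernel=kernel,
      num_adaptation_steps=400,
      reduce_fn=tfp.math.reduce_log_harmonic_mean_exp)
  ```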

  #### Examples

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp

  tfd = tfp.distributions

  target_log_prob_fn = tfd.Normal(loc=0., scale=1.).log_prob
  num_burnin_steps = 500
  num_results = 500
  num_chains = 64
  step_size = 0.1
  # Or, if you want per-chain step size:
  # step_size = tf.fill([num_chains], step_size)

  kernel = tfp.mcmc.HamiltonianMonteCarlo(
      target_log_prob_fn=target_log_prob_fn,
      num_leapfrog_steps=2,
      step_size=step_size)
  kernel = tfp.mcmc.SimpleStepSizeAdaptation(
      inner_kernel=kernel, num_adaptation_steps=int(num_burnin_steps * 0.8))

  # The chain will be stepped for num_results + num_burnin_steps, adapting for
  # the first num_adaptation_steps.
  samples, [step_size, log_accept_ratio] = tfp.mcmc.sample_chain(
      num_results=num_results,
      num_burnin_steps=num_burnin_steps,
      current_state=tf.zeros(num_chains),
      kernel=kernel,
      trace_fn=lambda _, pkr: [pkr.inner_results.accepted_results.step_size,
                               pkr.inner_results.log_accept_ratio])

  # ~0.75
  p_accept = tf.math.exp(tfp.math.reduce_logmeanexp(
      tf.minimum(log_accept_ratio, 0.)))
  ```

  #### References

  [1]: Andrieu, Christophe, Thoms, Johannes. A tutorial on adaptive MCMC.
       _Statistics and Computing_, 2008.
       https://people.eecs.berkeley.edu/~jordan/sail/readings/andrieu-thoms.pdf

  [2]:
  http://andrewgelman.com/2017/12/15/burn-vs-warm-iterative-simulation-algorithms/#comment-627745

  [3]: Hoffman, M., Radul, A., & Sountsov, P. An Adaptive MCMC Scheme
       for Setting Trajectory Lengths in Hamiltonian Monte Carlo, 2020. In
       preparation.
  """

  def __init__(self,
               inner_kernel,
               num_adaptation_steps,
               target_accept_prob=0.75,
               adaptation_rate=0.01,
               step_size_setter_fn=hmc_like_step_size_setter_fn,
               step_size_getter_fn=hmc_like_step_size_getter_fn,
               log_accept_prob_getter_fn=hmc_like_log_accept_prob_getter_fn,
               reduce_fn=reduce_logmeanexp,
               experimental_reduce_chain_axis_names=None,
               validate_args=False,
               name=None):
"""Creates the step size adaptation kernel.
The default setter_fn and the getter_fn callbacks assume that the inner
kernel produces kernel results structurally the same as the
`HamiltonianMonteCarlo` kernel.

    Args:
      inner_kernel: `TransitionKernel`-like object.
      num_adaptation_steps: Scalar `int` `Tensor` number of initial steps
        during which to adjust the step size. This may be greater, less than,
        or equal to the number of burnin steps.
      target_accept_prob: A floating point `Tensor` representing desired
        acceptance probability. Must be a positive number less than 1. This
        can either be a scalar, or have shape [num_chains]. Default value:
        `0.75` (the [center of asymptotically optimal rate for HMC][1]).
      adaptation_rate: `Tensor` representing amount to scale the current
        `step_size`.
      step_size_setter_fn: A callable with the signature
        `(kernel_results, new_step_size) -> new_kernel_results` where
        `kernel_results` are the results of the `inner_kernel`,
        `new_step_size` is a `Tensor` or a nested collection of `Tensor`s with
        the same structure as returned by the `step_size_getter_fn`, and
        `new_kernel_results` are a copy of `kernel_results` with the step
        size(s) set.
      step_size_getter_fn: A callable with the signature
        `(kernel_results) -> step_size` where `kernel_results` are the results
        of the `inner_kernel`, and `step_size` is a floating point `Tensor` or
        a nested collection of such `Tensor`s.
      log_accept_prob_getter_fn: A callable with the signature
        `(kernel_results) -> log_accept_prob` where `kernel_results` are the
        results of the `inner_kernel`, and `log_accept_prob` is a floating
        point `Tensor`. `log_accept_prob` can either be a scalar, or have
        shape [num_chains]. If it's the latter, `step_size` should also have
        the same leading dimension.
      reduce_fn: A callable with signature `(input_tensor, axis, keepdims) ->
        tensor` that returns a log-reduction of `log_accept_prob`, typically
        some sort of mean. By default, this performs an arithmetic mean (in
        the direct space, via `reduce_logmeanexp`).
      experimental_reduce_chain_axis_names: A `str` or list of `str`s
        indicating the named axes that should additionally be reduced during
        the log-reduction of `log_accept_prob`.
      validate_args: Python `bool`. When `True` kernel parameters are checked
        for validity. When `False` invalid inputs may silently render
        incorrect outputs.
      name: Python `str` name prefixed to Ops created by this class. Default:
        'simple_step_size_adaptation'.
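
    For illustration, a hypothetical getter/setter pair for an inner kernel
    that stores its step size under a results field named, say, `my_step_size`
    (an assumed name, not any real kernel's field) could reuse the same
    `unnest` helpers as the defaults:

    ```python
    custom_getter = lambda kr: unnest.get_innermost(kr, 'my_step_size')
    custom_setter = (
        lambda kr, new_ss: unnest.replace_innermost(kr, my_step_size=new_ss))
    ```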

    #### References

    [1]: Betancourt, M. J., Byrne, S., & Girolami, M. (2014). _Optimizing The
         Integrator Step Size for Hamiltonian Monte Carlo_.
         http://arxiv.org/abs/1411.6669
    """
    inner_kernel = mcmc_util.enable_store_parameters_in_results(inner_kernel)

    with tf.name_scope(mcmc_util.make_name(
        name, 'simple_step_size_adaptation', '__init__')) as name:
      dtype = dtype_util.common_dtype([target_accept_prob, adaptation_rate],
                                      tf.float32)
      target_accept_prob = tf.convert_to_tensor(
          target_accept_prob, dtype=dtype, name='target_accept_prob')
      adaptation_rate = tf.convert_to_tensor(
          adaptation_rate, dtype=dtype, name='adaptation_rate')
      num_adaptation_steps = tf.convert_to_tensor(
          num_adaptation_steps,
          dtype=tf.int32,
          name='num_adaptation_steps')
      target_accept_prob = _maybe_validate_target_accept_prob(
          target_accept_prob, validate_args)

      self._parameters = dict(
          inner_kernel=inner_kernel,
          num_adaptation_steps=num_adaptation_steps,
          target_accept_prob=target_accept_prob,
          adaptation_rate=adaptation_rate,
          step_size_setter_fn=step_size_setter_fn,
          step_size_getter_fn=step_size_getter_fn,
          log_accept_prob_getter_fn=log_accept_prob_getter_fn,
          reduce_fn=reduce_fn,
          experimental_reduce_chain_axis_names=(
              experimental_reduce_chain_axis_names),
          name=name,
      )

  @property
  def inner_kernel(self):
    return self._parameters['inner_kernel']

  @property
  def name(self):
    return self._parameters['name']

  @property
  def num_adaptation_steps(self):
    return self._parameters['num_adaptation_steps']

  def step_size_setter_fn(self, kernel_results, new_step_size):
    return self._parameters['step_size_setter_fn'](kernel_results,
                                                   new_step_size)

  def step_size_getter_fn(self, kernel_results):
    return self._parameters['step_size_getter_fn'](kernel_results)

  def log_accept_prob_getter_fn(self, kernel_results):
    return self._parameters['log_accept_prob_getter_fn'](kernel_results)

  def reduce_fn(self, input_tensor, axis, keepdims,
                experimental_named_axis=None):
    if experimental_named_axis is None:
      return self._parameters['reduce_fn'](
          input_tensor, axis=axis, keepdims=keepdims)
    return self._parameters['reduce_fn'](
        input_tensor, axis=axis, keepdims=keepdims,
        experimental_named_axis=experimental_named_axis)

  @property
  def parameters(self):
    """Return `dict` of ``__init__`` arguments and their values."""
    return self._parameters

  def one_step(self, current_state, previous_kernel_results, seed=None):
    with tf.name_scope(mcmc_util.make_name(
        self.name, 'simple_step_size_adaptation', 'one_step')):
      # Set the step_size.
      inner_results = self.step_size_setter_fn(
          previous_kernel_results.inner_results,
          previous_kernel_results.new_step_size)

      # Step the inner kernel.
      inner_kwargs = {} if seed is None else dict(seed=seed)
      new_state, new_inner_results = self.inner_kernel.one_step(
          current_state, inner_results, **inner_kwargs)

      # Get the new step size.
      log_accept_prob = self.log_accept_prob_getter_fn(new_inner_results)
      log_target_accept_prob = tf.math.log(
          tf.cast(previous_kernel_results.target_accept_prob,
                  dtype=log_accept_prob.dtype))

      state_parts = tf.nest.flatten(current_state)
      step_size = self.step_size_getter_fn(new_inner_results)
      step_size_parts = tf.nest.flatten(step_size)
      log_accept_prob_rank = prefer_static.rank(log_accept_prob)

      new_step_size_parts = []
      for step_size_part, state_part in zip(step_size_parts, state_parts):
        # Compute new step sizes for each step size part. If step size part
        # has smaller rank than the corresponding state part, then the
        # difference is averaged away in the log accept prob.
        #
        # Example:
        #
        #   state_part has shape [2, 3, 4, 5]
        #   step_size_part has shape [1, 4, 1]
        #   log_accept_prob has shape [2, 3, 4]
        #
        # Since step size has 1 rank fewer than the state, we reduce away the
        # leading dimension of log_accept_prob to get a Tensor with shape
        # [3, 4]. Next, since log_accept_prob must broadcast into
        # step_size_part on the left, we reduce the dimensions where their
        # shapes differ, to get a Tensor with shape [1, 4], which now is
        # compatible with the leading dimensions of step_size_part.
        #
        # There is a subtlety here in that step_size_parts might be a length-1
        # list, which means that we'll be "structure-broadcasting" it for all
        # the state parts (see logic in, e.g., hmc.py). In this case we must
        # assume that the lone step size provided broadcasts with the event
        # dims of each state part. This means that either step size has no
        # dimensions corresponding to chain dimensions, or all states are of
        # the same shape. For the former, we want to reduce over all chain
        # dimensions. For the latter, we want to use the same logic as in the
        # non-structure-broadcasted case.
        #
        # It turns out we can compute the reduction dimensions for both cases
        # uniformly by taking the rank of any state part. This obviously works
        # in the second case (where all state ranks are the same). In the
        # first case, all state parts have the rank L + D_i + B, where L is
        # the rank of log_accept_prob, D_i is the non-shared dimensions
        # amongst all states, and B are the shared dimensions of all the
        # states, which are equal to the step size. When we subtract B, we
        # will always get a number >= L, which means we'll get the full
        # reduction we want.
        num_reduce_dims = prefer_static.minimum(
            log_accept_prob_rank,
            prefer_static.rank(state_part) -
            prefer_static.rank(step_size_part))
        reduced_log_accept_prob = self.reduce_fn(
            log_accept_prob,
            axis=prefer_static.range(num_reduce_dims),
            keepdims=False,
            experimental_named_axis=self.experimental_reduce_chain_axis_names)
        # reduced_log_accept_prob must broadcast into step_size_part on the
        # left, so we do an additional reduction over dimensions where their
        # shapes differ.
        reduce_indices = get_differing_dims(reduced_log_accept_prob,
                                            step_size_part)
        reduced_log_accept_prob = self.reduce_fn(
            reduced_log_accept_prob, axis=reduce_indices, keepdims=True)

        one_plus_adaptation_rate = 1. + tf.cast(
            previous_kernel_results.adaptation_rate,
            dtype=step_size_part.dtype)
        new_step_size_part = mcmc_util.choose(
            reduced_log_accept_prob > log_target_accept_prob,
            step_size_part * one_plus_adaptation_rate,
            step_size_part / one_plus_adaptation_rate)

        # Only adapt for the first `num_adaptation_steps` steps; after that,
        # the step size is held fixed.
        new_step_size_parts.append(
            tf.where(previous_kernel_results.step < self.num_adaptation_steps,
                     new_step_size_part,
                     step_size_part))
      new_step_size = tf.nest.pack_sequence_as(step_size, new_step_size_parts)

      return new_state, previous_kernel_results._replace(
          inner_results=new_inner_results,
          step=1 + previous_kernel_results.step,
          new_step_size=new_step_size)

  def bootstrap_results(self, init_state):
    with tf.name_scope(mcmc_util.make_name(
        self.name, 'simple_step_size_adaptation', 'bootstrap_results')):
      inner_results = self.inner_kernel.bootstrap_results(init_state)
      step_size = self.step_size_getter_fn(inner_results)
      return SimpleStepSizeAdaptationResults(
          inner_results=inner_results,
          step=tf.constant(0, dtype=tf.int32),
          target_accept_prob=self.parameters['target_accept_prob'],
          adaptation_rate=self.parameters['adaptation_rate'],
          new_step_size=step_size)

  @property
  def is_calibrated(self):
    return self.inner_kernel.is_calibrated

  @property
  def experimental_shard_axis_names(self):
    return self.inner_kernel.experimental_shard_axis_names

  def experimental_with_shard_axes(self, shard_axis_names):
    return self.copy(
        inner_kernel=self.inner_kernel.experimental_with_shard_axes(
            shard_axis_names))

  @property
  def experimental_reduce_chain_axis_names(self):
    return self._parameters['experimental_reduce_chain_axis_names']


def _maybe_validate_target_accept_prob(target_accept_prob, validate_args):
  """Validates that target_accept_prob is in (0, 1)."""
  if not validate_args:
    return target_accept_prob
  with tf.control_dependencies([
      assert_util.assert_positive(
          target_accept_prob, message='`target_accept_prob` must be > 0.'),
      assert_util.assert_less(
          target_accept_prob,
          tf.ones_like(target_accept_prob),
          message='`target_accept_prob` must be < 1.')
  ]):
    return tf.identity(target_accept_prob)