-
Notifications
You must be signed in to change notification settings - Fork 443
/
compute_dp_sgd_privacy_lib.py
596 lines (521 loc) · 22.1 KB
/
compute_dp_sgd_privacy_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library for computing privacy values for DP-SGD."""
import enum
import functools
import math
import textwrap
from typing import Optional
from absl import app
from absl import logging
import dp_accounting
from scipy import optimize
from scipy import stats
class UserLevelDPComputationError(Exception):
  """Raised when the user-level epsilon computation cannot be completed."""
def _logexpm1(x: float) -> float:
"""Returns log(exp(x) - 1)."""
return x + math.log(-math.expm1(-x))
class AccountantType(enum.Enum):
  """The privacy accountant used for DP-SGD accounting.

  RDP uses Renyi differential privacy; PLD uses privacy loss distributions.
  """

  RDP = 'RDP'
  PLD = 'PLD'

  def get_accountant(self) -> dp_accounting.PrivacyAccountant:
    """Returns a freshly constructed accountant of this type."""
    if self == AccountantType.PLD:
      return dp_accounting.pld.PLDAccountant()
    if self == AccountantType.RDP:
      return dp_accounting.rdp.RdpAccountant()
    # Unreachable for the two members above; guards against future additions.
    raise ValueError(f'Unsupported Accountant type {self.value}')
def _compute_dp_sgd_user_privacy(
    num_epochs: float,
    noise_multiplier: float,
    user_delta: float,
    max_examples_per_user: int,
    used_microbatching: bool = True,
    poisson_subsampling_probability: Optional[float] = None,
    accountant_type: AccountantType = AccountantType.RDP,
) -> float:
  """Computes add-or-remove-one-user DP epsilon via group privacy.

  When each user contributes a single example, or when no Poisson subsampling
  is used, the computation reduces to example-level accounting (in the
  no-subsampling case with the noise multiplier shrunk by a factor of
  max_examples_per_user). Otherwise the accountant-specific helpers
  _compute_dp_sgd_group_privacy_using_rdp and
  _compute_dp_sgd_group_privacy_using_pld carry out the group-privacy
  accounting.

  Args:
    num_epochs: The number of passes over the data. May be fractional.
    noise_multiplier: The ratio of the noise stddev to the l2 sensitivity.
    user_delta: The target user-level delta.
    max_examples_per_user: Upper bound on the number of examples per user.
    used_microbatching: If true, increases sensitivity by a factor of two.
    poisson_subsampling_probability: If not None, gives the probability that
      each record is chosen in a batch. If None, assumes no subsampling.
    accountant_type: The privacy accountant for computing epsilon.

  Returns:
    The add-or-remove-one-user DP epsilon value using group privacy.

  Raises:
    UserLevelDPComputationError: If line search for example-level delta fails.
  """
  # Validate all inputs up front, in the same order in which callers would
  # expect the errors to surface.
  if num_epochs <= 0:
    raise ValueError(f'num_epochs must be positive. Found {num_epochs}.')
  if noise_multiplier < 0:
    raise ValueError(
        f'noise_multiplier must be non-negative. Found {noise_multiplier}.'
    )
  if not 0 <= user_delta <= 1:
    raise ValueError(f'user_delta must be between 0 and 1. Found {user_delta}.')
  if max_examples_per_user <= 0:
    raise ValueError(
        'max_examples_per_user must be a positive integer. Found '
        f'{max_examples_per_user}.'
    )

  if max_examples_per_user == 1:
    # A single example per user means user-level and example-level adjacency
    # coincide, so skip the group-privacy inflation entirely.
    return _compute_dp_sgd_example_privacy(
        num_epochs,
        noise_multiplier,
        user_delta,
        used_microbatching,
        poisson_subsampling_probability,
        accountant_type,
    )

  if poisson_subsampling_probability is None:
    # Without subsampling, the worst case has all max_examples_per_user of a
    # user's examples in the same round (under microbatching, in distinct
    # microbatches of that round). This scales sensitivity up -- equivalently,
    # scales the noise multiplier down -- by max_examples_per_user.
    return _compute_dp_sgd_example_privacy(
        num_epochs,
        noise_multiplier / max_examples_per_user,
        user_delta,
        used_microbatching,
        poisson_subsampling_probability,
        accountant_type,
    )

  if accountant_type == AccountantType.RDP:
    return _compute_dp_sgd_group_privacy_using_rdp(
        num_epochs,
        noise_multiplier,
        user_delta,
        max_examples_per_user,
        used_microbatching,
        poisson_subsampling_probability,
    )

  if accountant_type == AccountantType.PLD:
    return _compute_dp_sgd_group_privacy_using_pld(
        num_epochs,
        noise_multiplier,
        user_delta,
        max_examples_per_user,
        poisson_subsampling_probability,
        used_microbatching,
    )

  raise ValueError(f'Unsupported accountant type: {accountant_type}')
def _compute_dp_sgd_example_privacy(
    num_epochs: float,
    noise_multiplier: float,
    example_delta: float,
    used_microbatching: bool = True,
    poisson_subsampling_probability: Optional[float] = None,
    accountant_type: AccountantType = AccountantType.RDP,
) -> float:
  """Computes add-or-remove-one-example DP epsilon for DP-SGD.

  The guarantee uses add-or-remove-one-example adjacency and covers the
  release of every model checkpoint, not only the final model.

  Args:
    num_epochs: The number of passes over the data.
    noise_multiplier: The ratio of the noise stddev to the l2 sensitivity.
    example_delta: The target delta.
    used_microbatching: If true, increases sensitivity by a factor of two.
    poisson_subsampling_probability: If not None, gives the probability that
      each record is chosen in a batch. If None, assumes no subsampling.
    accountant_type: The privacy accountant for computing epsilon.

  Returns:
    The epsilon value.
  """
  if num_epochs <= 0:
    raise ValueError(f'num_epochs must be positive. Found {num_epochs}.')
  if noise_multiplier < 0:
    raise ValueError(
        f'noise_multiplier must be non-negative. Found {noise_multiplier}.'
    )
  if not 0 <= example_delta <= 1:
    raise ValueError(f'delta must be between 0 and 1. Found {example_delta}.')

  # Microbatching doubles sensitivity; equivalently, halve the noise.
  # TODO(b/271462792)
  effective_noise = (
      noise_multiplier / 2 if used_microbatching else noise_multiplier
  )

  gaussian_event = dp_accounting.GaussianDpEvent(effective_noise)
  if poisson_subsampling_probability is None:
    per_round_event = gaussian_event
    num_rounds = int(math.ceil(num_epochs))
  else:
    per_round_event = dp_accounting.PoissonSampledDpEvent(
        sampling_probability=poisson_subsampling_probability,
        event=gaussian_event,
    )
    num_rounds = int(math.ceil(num_epochs / poisson_subsampling_probability))
  composed_event = dp_accounting.SelfComposedDpEvent(
      count=num_rounds, event=per_round_event
  )

  accountant = accountant_type.get_accountant()
  accountant.compose(composed_event)
  return accountant.get_epsilon(example_delta)
def _compute_dp_sgd_group_privacy_using_rdp(
    num_epochs: float,
    noise_multiplier: float,
    user_delta: float,
    max_examples_per_user: int,
    used_microbatching: bool = True,
    poisson_subsampling_probability: Optional[float] = None,
):
  """Computes add-or-remove-one-user DP epsilon using group privacy via RDP.

  This privacy guarantee uses add-or-remove-one-user adjacency, and protects
  release of all model checkpoints in addition to the final model.

  Uses Vadhan (2017) "The complexity of differential privacy" Lemma 2.2.

  # TODO(b/271330804): Consider using RDP to compute group privacy.

  We use a line search to identify an example-level delta which, when the lemma
  is applied, yields the requested user-level delta, then use it to compute the
  user-level epsilon.

  Args:
    num_epochs: The number of passes over the data. May be fractional.
    noise_multiplier: The ratio of the noise stddev to the l2 sensitivity.
    user_delta: The target user-level delta.
    max_examples_per_user: Upper bound on the number of examples per user.
    used_microbatching: If true, increases sensitivity by a factor of two.
    poisson_subsampling_probability: If not None, gives the probability that
      each record is chosen in a batch. If None, assumes no subsampling.

  Returns:
    The add-or-remove-one-user DP epsilon value using group privacy (math.inf
    if no finite upper bound can be computed).

  Raises:
    UserLevelDPComputationError: If line search for example-level delta fails.
  """
  # The computation below to estimate user_eps works as follows.
  # We have _compute_dp_sgd_example_privacy which maps
  #   F(example_delta) -> example_eps
  # Vadhan (2017) "The complexity of differential privacy" Lemma 2.2 gives us
  #   G(example_eps, example_delta) -> user_delta
  #   H(example_eps) -> user_eps.
  # We first identify an example_delta such that
  #   G(F(example_delta), example_delta) = user_delta
  # Specifically, we use a line search in log space to solve for
  #   log(G(F(example_delta), example_delta)) - log(user_delta) = 0
  # Then we can return user_eps = H(F(example_delta)).
  target_user_log_delta = math.log(user_delta)

  # Cache example privacy values, which can be expensive.
  @functools.cache
  def get_example_eps(example_log_delta):
    # F: maps (log of) an example-level delta to the example-level epsilon.
    return _compute_dp_sgd_example_privacy(
        num_epochs,
        noise_multiplier,
        math.exp(example_log_delta),
        used_microbatching,
        poisson_subsampling_probability,
    )

  def user_log_delta_gap(example_log_delta):
    # Root-finding objective: log(G(F(d), d)) - log(user_delta). Zero at the
    # example-level delta we are searching for.
    example_eps = get_example_eps(example_log_delta)
    # Estimate user_eps, user_log_delta using Vadhan Lemma 2.2, using a
    # tighter bound seen in the penultimate line of the proof, given as
    #   user_delta = (example_delta * (exp(k * example_eps) - 1)
    #                 / (exp(example_eps) - 1))
    user_eps = max_examples_per_user * example_eps
    user_log_delta = (
        example_log_delta + _logexpm1(user_eps) - _logexpm1(example_eps)
    )
    return user_log_delta - target_user_log_delta

  # We need bounds on the example-level delta. The supplied user-level delta
  # is an upper bound. Search exponentially toward zero for lower bound.
  example_log_delta_max = target_user_log_delta
  example_log_delta_min = example_log_delta_max - math.log(10)
  user_log_delta_gap_min = user_log_delta_gap(example_log_delta_min)
  while user_log_delta_gap_min > 0:
    # Assuming that _compute_dp_sgd_example_privacy is decreasing in
    # example_delta, it is not difficult to show that if user_delta_min
    # corresponding to example_delta_min is too large, then we must reduce
    # example_delta by at least a factor of (user_delta / user_delta_min).
    # In other words, if example_log_delta_min is an upper bound, then so is
    # example_log_delta_min - user_log_delta_gap_min.
    example_log_delta_max = example_log_delta_min - user_log_delta_gap_min
    example_log_delta_min = example_log_delta_max - math.log(10)
    user_log_delta_gap_min = user_log_delta_gap(example_log_delta_min)
    if not math.isfinite(user_log_delta_gap_min):
      # User-level (epsilon, delta) DP is not achievable. This can happen
      # because as example_delta decreases, example_eps increases. So it is
      # possible for user_delta (which increases in both example_delta and
      # example_eps) to diverge to infinity as example_delta goes to zero.
      logging.warning(
          (
              'No upper bound on user-level DP epsilon can be computed with'
              ' %s examples per user.'
          ),
          max_examples_per_user,
      )
      return math.inf

  # By the same logic, we can improve on the lower bound we just found, before
  # even starting the line search. We actually could do a custom line search
  # that makes use of this at each step, but brentq should be fast enough.
  example_log_delta_min -= user_log_delta_gap_min

  # Brent's method root-finding on the (now bracketed) gap function.
  example_log_delta, result = optimize.brentq(
      user_log_delta_gap,
      example_log_delta_min,
      example_log_delta_max,
      full_output=True,
  )
  if not result.converged:
    raise UserLevelDPComputationError(
        'Optimization failed trying to compute user-level DP epsilon.'
    )

  # Vadhan (2017) "The complexity of differential privacy" Lemma 2.2.
  return max_examples_per_user * get_example_eps(example_log_delta)
def _compute_dp_sgd_group_privacy_using_pld(
    num_epochs: float,
    noise_multiplier: float,
    user_delta: float,
    max_examples_per_user: int,
    poisson_subsampling_probability: float,
    used_microbatching: bool = True,
):
  """Computes add-or-remove-one-user DP epsilon via group privacy with PLDs.

  The guarantee uses add-or-remove-one-user adjacency and covers the release
  of every model checkpoint, not only the final model.

  Based on Ganesh (2024) "Tight Group-Level DP Guarantees for DP-SGD with
  Sampling via Mixture of Gaussians Mechanisms"
  (https://arxiv.org/abs/2401.10294), Theorem 3.1.

  Args:
    num_epochs: The number of passes over the data. May be fractional.
    noise_multiplier: The ratio of the noise stddev to the l2 sensitivity.
    user_delta: The target user-level delta.
    max_examples_per_user: Upper bound on the number of examples per user.
    poisson_subsampling_probability: Gives the probability that each record is
      chosen in a batch.
    used_microbatching: If true, increases sensitivity by a factor of two.

  Returns:
    The add-or-remove-one-user DP epsilon value using group privacy.
  """
  # With microbatching, a (loose) pessimistic assumption is that a user's
  # sampled examples all land in different microbatches, which reduces to the
  # non-microbatching analysis with sensitivity doubled (noise halved).
  effective_noise = (
      noise_multiplier / 2 if used_microbatching else noise_multiplier
  )

  # The per-round sensitivity equals the number of the user's examples that
  # are sampled, distributed Binomial(max_examples_per_user, p).
  possible_sensitivities = list(range(max_examples_per_user + 1))
  sensitivity_probs = stats.binom.pmf(
      possible_sensitivities,
      max_examples_per_user,
      poisson_subsampling_probability,
  )
  round_event = dp_accounting.dp_event.MixtureOfGaussiansDpEvent(
      effective_noise, possible_sensitivities, sensitivity_probs
  )

  num_rounds = int(math.ceil(num_epochs / poisson_subsampling_probability))
  accountant = dp_accounting.pld.PLDAccountant()
  accountant.compose(round_event, num_rounds)
  return accountant.get_epsilon(user_delta)
def compute_dp_sgd_privacy_statement(
    number_of_examples: int,
    batch_size: int,
    num_epochs: float,
    noise_multiplier: float,
    delta: float,
    used_microbatching: bool = True,
    max_examples_per_user: Optional[int] = None,
    accountant_type: AccountantType = AccountantType.RDP,
) -> str:
  """Produces a privacy report summarizing the DP guarantee.

  Args:
    number_of_examples: Total number of examples in the dataset. For DP-SGD, an
      "example" corresponds to one row in a minibatch. E.g., for sequence models
      this would be a sequence of maximum length.
    batch_size: The number of examples in a batch. This should be the number of
      examples in a batch, *regardless of whether/how they are grouped into
      microbatches*.
    num_epochs: The number of epochs of training. May be fractional.
    noise_multiplier: The ratio of the Gaussian noise stddev to the l2 clip norm
      at each round. It is assumed that the noise_multiplier is constant
      although the clip norm may be variable if, for example, adaptive clipping
      is used.
    delta: The target delta.
    used_microbatching: Whether microbatching was used (with microbatch size
      greater than one). Microbatching inflates sensitivity by a factor of two
      in add-or-remove-one adjacency DP. (See "How to DP-fy ML: A Practical
      Guide to Machine Learning with Differential Privacy",
      https://arxiv.org/abs/2303.00654, Sec 5.6.)
    max_examples_per_user: If the data set is constructed to cap the maximum
      number of examples each user contributes, provide this argument to also
      print a user-level DP guarantee.
    accountant_type: The privacy accountant for computing epsilon. Both RDP and
      PLD accounting are supported for the example-level and user-level
      guarantees below; user-level PLD accounting is delegated to
      _compute_dp_sgd_user_privacy. (See b/271341062 for history on user-level
      PLD accounting.)

  Returns:
    A str precisely articulating the privacy guarantee.
  """
  # Opening paragraph: restate the training configuration being analyzed.
  paragraph = f"""\
DP-SGD performed over {number_of_examples} examples with {batch_size} \
examples per iteration, noise multiplier {noise_multiplier} for {num_epochs} \
epochs {'with' if used_microbatching else 'without'} microbatching"""

  if max_examples_per_user is None:
    paragraph += ', and no bound on number of examples per user.'
  else:
    paragraph += f', and at most {max_examples_per_user} examples per user.'
  paragraphs = [textwrap.fill(paragraph, width=80)]

  paragraphs.append(
      textwrap.fill(
          """\
This privacy guarantee protects the release of all model checkpoints in \
addition to the final model.""",
          width=80,
      )
  )

  # Example-level guarantee: computed both without subsampling (each example
  # once per epoch) and assuming Poisson sampling at rate batch_size/n.
  paragraph = textwrap.fill(
      f"""\
Example-level DP with add-or-remove-one adjacency at delta = {delta} computed \
with {accountant_type.value} accounting:""",
      width=80,
  )
  example_eps_no_subsampling = _compute_dp_sgd_example_privacy(
      num_epochs,
      noise_multiplier,
      delta,
      used_microbatching,
      accountant_type=accountant_type,
  )
  example_eps_subsampling = _compute_dp_sgd_example_privacy(
      num_epochs,
      noise_multiplier,
      delta,
      used_microbatching,
      poisson_subsampling_probability=batch_size / number_of_examples,
      accountant_type=accountant_type,
  )
  paragraph += f"""
Epsilon with each example occurring once per epoch: \
{example_eps_no_subsampling:12.3f}
Epsilon assuming Poisson sampling (*): \
{example_eps_subsampling:12.3f}"""
  paragraphs.append(paragraph)

  # Tracks whether any user-level epsilon came out infinite, which triggers
  # the explanatory (**) footnote at the end of the report.
  inf_user_eps = False
  if max_examples_per_user is None:
    paragraphs.append(
        textwrap.fill(
            """\
No user-level privacy guarantee is possible without a bound on the number of \
examples per user.""",
            width=80,
        )
    )
  else:  # A per-user example bound is available: also report user-level DP.
    user_eps_no_subsampling = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
        accountant_type=accountant_type,
    )
    user_eps_subsampling = _compute_dp_sgd_user_privacy(
        num_epochs,
        noise_multiplier,
        delta,
        max_examples_per_user,
        used_microbatching,
        poisson_subsampling_probability=batch_size / number_of_examples,
        accountant_type=accountant_type,
    )
    if math.isinf(user_eps_no_subsampling):
      user_eps_no_subsampling_str = ' inf (**)'
      inf_user_eps = True
    else:
      user_eps_no_subsampling_str = f'{user_eps_no_subsampling:12.3f}'
    if math.isinf(user_eps_subsampling):
      user_eps_subsampling_str = ' inf (**)'
      inf_user_eps = True
    else:
      user_eps_subsampling_str = f'{user_eps_subsampling:12.3f}'
    paragraph = textwrap.fill(
        f"""\
User-level DP with add-or-remove-one adjacency at delta = {delta} computed \
using {accountant_type.value} accounting and group privacy:""",
        width=80,
    )
    paragraph += f"""
Epsilon with each example occurring once per epoch: \
{user_eps_no_subsampling_str}
Epsilon assuming Poisson sampling (*): \
{user_eps_subsampling_str}"""
    paragraphs.append(paragraph)

  paragraphs.append(
      textwrap.fill(
          """\
(*) Poisson sampling is not usually done in training pipelines, but assuming \
that the data was randomly shuffled, it is believed that the actual epsilon \
should be closer to this value than the conservative assumption of an \
arbitrary data order.""",
          width=80,
      )
  )

  if inf_user_eps:
    paragraphs.append(
        textwrap.fill(
            """\
(**) A finite example-level epsilon implies a finite user-level epsilon at any \
`max_examples_per_user`, but because conversion from example-level to user-\
level DP is not exact, it is possible for the upper bound on the user-level \
epsilon to still be infinite.""",
            width=80,
        )
    )

  return '\n\n'.join(paragraphs) + '\n'
def compute_dp_sgd_privacy(n, batch_size, noise_multiplier, epochs, delta):
  """Compute epsilon based on the given hyperparameters.

  Deprecated. This function does not account for the doubling of sensitivity
  under microbatching and assumes Poisson subsampling, which is rarely used in
  practice. (See "How to DP-fy ML: A Practical Guide to Machine Learning with
  Differential Privacy", https://arxiv.org/abs/2303.00654, Sec 5.6.) Most
  users should call `compute_dp_sgd_privacy_statement`, which provides
  appropriate context for the guarantee (see the reporting recommendations in
  "How to DP-fy ML", Sec 5.3). If a numeric epsilon under specific assumptions
  is needed, use the `dp_accounting` libraries directly with the precise and
  correct assumptions of your application.

  Args:
    n: Number of examples in the training data.
    batch_size: Batch size used in training.
    noise_multiplier: Noise multiplier used in training.
    epochs: Number of epochs in training.
    delta: Value of delta for which to compute epsilon.

  Returns:
    A 2-tuple containing the value of epsilon and the optimal RDP order.
  """
  logging.warning("""\
`compute_dp_sgd_privacy` is deprecated. It does not account for doubling of \
sensitivity with microbatching, and assumes Poisson subsampling, which is \
rarely used in practice. Please use `compute_dp_sgd_privacy_statement`, which \
provides appropriate context for the guarantee. To compute epsilon under \
different assumptions than those in `compute_dp_sgd_privacy_statement`, call \
the `dp_accounting` libraries directly.""")

  # The per-round Poisson sampling probability.
  sampling_probability = batch_size / n
  if sampling_probability > 1:
    raise app.UsageError('n must be larger than the batch size.')

  # RDP orders to evaluate: dense at low orders, sparse at high orders.
  orders = (
      [1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 3.5, 4.0, 4.5]
      + list(range(5, 64))
      + [128, 256, 512]
  )
  steps = int(math.ceil(epochs * n / batch_size))

  sampled_gaussian = dp_accounting.PoissonSampledDpEvent(
      sampling_probability=sampling_probability,
      event=dp_accounting.GaussianDpEvent(noise_multiplier),
  )
  composed = dp_accounting.SelfComposedDpEvent(sampled_gaussian, steps)

  accountant = dp_accounting.rdp.RdpAccountant(orders)
  accountant.compose(composed)
  return accountant.get_epsilon_and_optimal_order(delta)