topics_and_partitions.go (forked from twmb/franz-go)
package kgo

import (
	"fmt"
	"sort"
	"strings"
	"sync"
	"sync/atomic"

	"github.com/tdx/franz-go/pkg/kerr"
)
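
// dupmsi32 duplicates a map[string]int32.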
func dupmsi32(m map[string]int32) map[string]int32 {
d := make(map[string]int32, len(m))
for t, ps := range m {
d[t] = ps
}
return d
}
// "Atomic map of topic partitions", for lack of a better name at this point.
type amtps struct {
v atomic.Value
}
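
// read returns the currently stored map, or nil if nothing has been stored yet.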
func (a *amtps) read() map[string][]int32 {
v := a.v.Load()
if v == nil {
return nil
}
return v.(map[string][]int32)
}
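
// write clones the stored map, applies fn to the clone, and stores the result.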
func (a *amtps) write(fn func(map[string][]int32)) {
dup := a.clone()
fn(dup)
a.store(dup)
}
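
// clone returns a copy of the currently stored map.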
func (a *amtps) clone() map[string][]int32 {
orig := a.read()
dup := make(map[string][]int32, len(orig))
for t, ps := range orig {
dup[t] = append(dup[t], ps...)
}
return dup
}
func (a *amtps) store(m map[string][]int32) { a.v.Store(m) }
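
// A hypothetical usage sketch for amtps (illustrative only; the variable and
// topic names below are not from this file):
//
//	var a amtps
//	a.write(func(m map[string][]int32) {
//		m["foo"] = append(m["foo"], 0, 1)
//	})
//	_ = a.read() // lock-free read of the latest stored snapshot

// mtps is a map of topic names to partition numbers, with a sorted,
// human-readable String form.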
type mtps map[string][]int32
func (m mtps) String() string {
var sb strings.Builder
var topicsWritten int
ts := make([]string, 0, len(m))
var ps []int32
for t := range m {
ts = append(ts, t)
}
sort.Strings(ts)
for _, t := range ts {
ps = append(ps[:0], m[t]...)
sort.Slice(ps, func(i, j int) bool { return ps[i] < ps[j] })
topicsWritten++
fmt.Fprintf(&sb, "%s%v", t, ps)
if topicsWritten < len(m) {
sb.WriteString(", ")
}
}
return sb.String()
}
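
// pausedTopics tracks which topics, or which individual partitions within a
// topic, are currently paused.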
type pausedTopics map[string]pausedPartitions
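
// pausedPartitions records either that an entire topic is paused (all) or
// which of its individual partitions are paused (m).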
type pausedPartitions struct {
all bool
m map[int32]struct{}
}
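
// has returns whether the given partition, or its entire topic, is paused.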
func (m pausedTopics) has(topic string, partition int32) (paused bool) {
if len(m) == 0 {
return false
}
pps, exists := m[topic]
if !exists {
return false
}
if pps.all {
return true
}
_, exists = pps.m[partition]
return exists
}
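
// addTopics marks the given topics as entirely paused.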
func (m pausedTopics) addTopics(topics ...string) {
for _, topic := range topics {
pps, exists := m[topic]
if !exists {
pps = pausedPartitions{m: make(map[int32]struct{})}
}
pps.all = true
m[topic] = pps
}
}
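
// delTopics clears the all-paused flag for the given topics, deleting entries
// that no longer pause any partitions.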
func (m pausedTopics) delTopics(topics ...string) {
for _, topic := range topics {
pps, exists := m[topic]
if !exists {
continue
}
pps.all = false
if !pps.all && len(pps.m) == 0 {
delete(m, topic)
}
}
}
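
// addPartitions marks the given individual partitions as paused.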
func (m pausedTopics) addPartitions(topicPartitions map[string][]int32) {
for topic, partitions := range topicPartitions {
pps, exists := m[topic]
if !exists {
pps = pausedPartitions{m: make(map[int32]struct{})}
}
for _, partition := range partitions {
pps.m[partition] = struct{}{}
}
m[topic] = pps
}
}
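
// delPartitions unpauses the given individual partitions, deleting topic
// entries that no longer pause anything.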
func (m pausedTopics) delPartitions(topicPartitions map[string][]int32) {
for topic, partitions := range topicPartitions {
pps, exists := m[topic]
if !exists {
continue
}
for _, partition := range partitions {
delete(pps.m, partition)
}
if !pps.all && len(pps.m) == 0 {
delete(m, topic)
}
}
}
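
// pausedTopics returns all topics that are paused in their entirety.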
func (m pausedTopics) pausedTopics() []string {
var r []string
for topic, pps := range m {
if pps.all {
r = append(r, topic)
}
}
return r
}
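
// pausedPartitions returns all individually tracked paused partitions, keyed
// by topic.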
func (m pausedTopics) pausedPartitions() map[string][]int32 {
r := make(map[string][]int32)
for topic, pps := range m {
ps := make([]int32, 0, len(pps.m))
for partition := range pps.m {
ps = append(ps, partition)
}
r[topic] = ps
}
return r
}
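
// clone returns a deep copy of the paused topics and partitions.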
func (m pausedTopics) clone() pausedTopics {
dup := make(pausedTopics)
dup.addTopics(m.pausedTopics()...)
dup.addPartitions(m.pausedPartitions())
return dup
}
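
// newTopicPartitions returns a *topicPartitions with empty partition data
// stored.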
func newTopicPartitions() *topicPartitions {
parts := new(topicPartitions)
parts.v.Store(new(topicPartitionsData))
return parts
}
// Contains all information about a topic's partitions.
type topicPartitions struct {
v atomic.Value // *topicPartitionsData
partsMu sync.Mutex
partitioner TopicPartitioner
lb *leastBackupInput // for partitioning if the partitioner is a LoadTopicPartitioner
}
func (t *topicPartitions) load() *topicPartitionsData { return t.v.Load().(*topicPartitionsData) }
var noTopicsPartitions = newTopicsPartitions()
func newTopicsPartitions() *topicsPartitions {
var t topicsPartitions
t.v.Store(make(topicsPartitionsData))
return &t
}
// A helper type mapping topics to their partitions;
// this is the inner value of topicsPartitions.v.
type topicsPartitionsData map[string]*topicPartitions
func (d topicsPartitionsData) hasTopic(t string) bool { _, exists := d[t]; return exists }
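
// loadTopic returns the current partition data for a topic, or nil if the
// topic is not known.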
func (d topicsPartitionsData) loadTopic(t string) *topicPartitionsData {
tp, exists := d[t]
if !exists {
return nil
}
return tp.load()
}
// A helper type mapping topics to their partitions that can be updated
// atomically.
type topicsPartitions struct {
v atomic.Value // topicsPartitionsData (map[string]*topicPartitions)
}
func (t *topicsPartitions) load() topicsPartitionsData {
if t == nil {
return nil
}
return t.v.Load().(topicsPartitionsData)
}
func (t *topicsPartitions) storeData(d topicsPartitionsData) { t.v.Store(d) }
func (t *topicsPartitions) storeTopics(topics []string) { t.v.Store(t.ensureTopics(topics)) }
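
// clone returns a copy of the current topics map; the *topicPartitions values
// are shared, not copied.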
func (t *topicsPartitions) clone() topicsPartitionsData {
current := t.load()
clone := make(map[string]*topicPartitions, len(current))
for k, v := range current {
clone[k] = v
}
return clone
}
// Ensures that the topics exist in the returned map, but does not store the
// update. This can be used to update the data and store later, rather than
// storing immediately.
func (t *topicsPartitions) ensureTopics(topics []string) topicsPartitionsData {
var cloned bool
current := t.load()
for _, topic := range topics {
if _, exists := current[topic]; !exists {
if !cloned {
current = t.clone()
cloned = true
}
current[topic] = newTopicPartitions()
}
}
return current
}
// Opposite of ensureTopics, this purges the input topics and *does* store.
func (t *topicsPartitions) purgeTopics(topics []string) {
var cloned bool
current := t.load()
for _, topic := range topics {
if _, exists := current[topic]; exists {
if !cloned {
current = t.clone()
cloned = true
}
delete(current, topic)
}
}
if cloned {
t.storeData(current)
}
}
// Updates the topic partitions data atomic value.
//
// If this is the first time seeing partitions, we do processing of unknown
// partitions that may be buffered for producing.
func (cl *Client) storePartitionsUpdate(topic string, l *topicPartitions, lv *topicPartitionsData, hadPartitions bool) {
// If the topic already had partitions, then there would be no
// unknown topic waiting and we do not need to notify anything.
if hadPartitions {
l.v.Store(lv)
return
}
p := &cl.producer
p.unknownTopicsMu.Lock()
defer p.unknownTopicsMu.Unlock()
// If the topic did not have partitions, then we need to store the
// partition update BEFORE unlocking the mutex to guard against this
// sequence of events:
//
// - unlock waiters
// - delete waiter
// - new produce recreates waiter
// - we store update
// - we never notify the recreated waiter
//
// By storing before releasing the locks, we ensure that later
// partition loads for this topic under the mu will see our update.
defer l.v.Store(lv)
// If there are no unknown topics or this topic is not unknown, then we
// have nothing to do.
if len(p.unknownTopics) == 0 {
return
}
unknown, exists := p.unknownTopics[topic]
if !exists {
return
}
// If we loaded no partitions because of a retriable error, we signal
// the waiting goroutine that a try happened. It is possible the
// goroutine is quitting and will not be draining unknownWait, so we do
// not require the send.
if len(lv.partitions) == 0 && kerr.IsRetriable(lv.loadErr) {
select {
case unknown.wait <- lv.loadErr:
default:
}
return
}
// Either we have a fatal error or we can successfully partition.
//
// Even with a fatal error, if we loaded any partitions, we partition.
// If we only had a fatal error, we can finish promises in a goroutine.
// If we are partitioning, we have to do it under the unknownTopicsMu to
// ensure prior buffered records are produced in order before we
// release the mu.
delete(p.unknownTopics, topic)
close(unknown.wait) // allow waiting goroutine to quit
if len(lv.partitions) == 0 {
cl.producer.promiseBatch(batchPromise{
recs: unknown.buffered,
err: lv.loadErr,
})
} else {
for _, pr := range unknown.buffered {
cl.doPartitionRecord(l, lv, pr)
}
}
}
// If a metadata request fails after retrying (internally retrying, so only a
// few times), or the metadata request does not return topics that we requested
// (which can also happen when consuming via regex), then we need to
// bump errors for topics that were previously loaded, and bump errors for
// topics awaiting load.
//
// This has two modes of operation:
//
// 1. if no topics were missing, then the metadata request failed outright,
// and we need to bump errors on all stored topics and unknown topics.
//
// 2. if topics were missing, then the metadata request was successful but
// had missing data, and we need to bump errors only on what was missing.
func (cl *Client) bumpMetadataFailForTopics(requested map[string]*topicPartitions, err error, missingTopics ...string) {
p := &cl.producer
// mode 1
if len(missingTopics) == 0 {
for _, topic := range requested {
for _, topicPartition := range topic.load().partitions {
topicPartition.records.bumpRepeatedLoadErr(err)
}
}
}
// mode 2
var missing map[string]bool
for _, failTopic := range missingTopics {
if missing == nil {
missing = make(map[string]bool, len(missingTopics))
}
missing[failTopic] = true
if topic, exists := requested[failTopic]; exists {
for _, topicPartition := range topic.load().partitions {
topicPartition.records.bumpRepeatedLoadErr(err)
}
}
}
p.unknownTopicsMu.Lock()
defer p.unknownTopicsMu.Unlock()
for topic, unknown := range p.unknownTopics {
// if nil, mode 1 (req err), else mode 2 (missing resp)
if missing != nil && !missing[topic] {
continue
}
select {
case unknown.wait <- err:
default:
}
}
}
// topicPartitionsData is the data behind a topicPartitions' v.
//
// We keep this in an atomic because it is expected to be extremely read heavy,
// and if it were behind a lock, the lock would need to be held for a while.
type topicPartitionsData struct {
// NOTE if adding anything to this struct, be sure to fix meta merge.
loadErr error // could be auth, unknown, leader not avail, or creation err
isInternal bool
partitions []*topicPartition // partition num => partition
writablePartitions []*topicPartition // subset of above
}
// topicPartition contains all information from Kafka for a topic's partition,
// as well as what a client is producing to it or info about consuming from it.
type topicPartition struct {
// If we have a load error (leader/listener/replica not available), we
// keep the old topicPartition data and the new error.
loadErr error
// If, on metadata refresh, the leader epoch for this partition goes
// backwards, we ignore the metadata refresh and signal the metadata
// should be reloaded: the broker we requested is stale. However, the
// broker could get into a bad state through some weird cluster failure
// scenarios. If we see the epoch rewind repeatedly, we eventually keep
// the metadata refresh. This is not detrimental and at worst will lead
// to the broker telling us to update our metadata.
epochRewinds uint8
// If we do not have a load error, we determine if the new
// topicPartition is the same or different from the old based on
// whether the data changed (leader or leader epoch, etc.).
topicPartitionData
// If we do not have a load error, we copy the records and cursor
// pointers from the old after updating any necessary fields in them
// (see migrate functions below).
//
// Only one of records or cursor is non-nil.
records *recBuf
cursor *cursor
}
// Contains the per-partition data that changes on metadata update and that we
// copy into a cursor or recBuf.
type topicPartitionData struct {
// Our leader; if metadata sees this change, the metadata update
// migrates the cursor to a different source with the session stopped,
// and the recBuf to a different sink under a tight mutex.
leader int32
// What we believe to be the epoch of the leader for this partition.
//
// For cursors, for KIP-320, if a broker receives a fetch request where
// the current leader epoch does not match the broker's, either the
// broker is behind and returns UnknownLeaderEpoch, or we are behind
// and the broker returns FencedLeaderEpoch. For the former, we back
// off and retry. For the latter, we update our metadata.
leaderEpoch int32
}
// migrateProductionTo is called on metadata update if a topic partition's sink
// has changed. This moves record production from one sink to the other; this
// must be done such that records produced during migration follow those
// already buffered.
func (old *topicPartition) migrateProductionTo(new *topicPartition) {
// First, remove our record buffer from the old sink.
old.records.sink.removeRecBuf(old.records)
// Before this next lock, record producing will buffer to the
// in-migration-progress records and may trigger draining to
// the old sink. That is fine, the old sink no longer consumes
// from these records. We just have wasted drain triggers.
old.records.mu.Lock() // guard setting sink and topic partition data
old.records.sink = new.records.sink
old.records.topicPartitionData = new.topicPartitionData
old.records.mu.Unlock()
// After the unlock above, record buffering can trigger drains
// on the new sink, which is not yet consuming from these
// records. Again, just more wasted drain triggers.
old.records.sink.addRecBuf(old.records) // add our record source to the new sink
// At this point, the new sink will be draining our records. We lastly
// need to copy the records pointer to our new topicPartition.
new.records = old.records
}
// migrateCursorTo is called on metadata update if a topic partition's leader
// or leader epoch has changed.
//
// This is a little bit different from above, in that we do this logic only
// after stopping a consumer session. With the consumer session stopped, we
// have fewer concurrency issues to worry about.
func (old *topicPartition) migrateCursorTo(
new *topicPartition,
reloadOffsets *listOrEpochLoads,
stopConsumerSession func(),
) {
stopConsumerSession()
old.cursor.source.removeCursor(old.cursor)
// With the session stopped, we can update fields on the old cursor
// with no concurrency issue.
old.cursor.source = new.cursor.source
// KIP-320: if we had consumed some messages, we need to validate the
// leader epoch on the new broker to see if we experienced data loss
// before we can use this cursor.
//
// Metadata ensures that leaderEpoch is non-negative only if the broker
// supports KIP-320.
if new.leaderEpoch != -1 && old.cursor.lastConsumedEpoch >= 0 {
// Since the cursor consumed messages, it is definitely usable.
// We use it so that the epoch load can finish using it
// properly.
old.cursor.use()
reloadOffsets.addLoad(old.cursor.topic, old.cursor.partition, loadTypeEpoch, offsetLoad{
replica: -1,
Offset: Offset{
at: old.cursor.offset,
epoch: old.cursor.lastConsumedEpoch,
},
})
}
old.cursor.topicPartitionData = new.topicPartitionData
old.cursor.source.addCursor(old.cursor)
new.cursor = old.cursor
}