-
-
Notifications
You must be signed in to change notification settings - Fork 157
/
topics_and_partitions.go
341 lines (295 loc) · 10.9 KB
/
topics_and_partitions.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
package kgo
import (
"sync"
"sync/atomic"
"github.com/twmb/franz-go/pkg/kerr"
)
// newTopicPartitions allocates a topicPartitions with an empty
// *topicPartitionsData pre-stored, so that load() never observes a nil
// atomic value.
func newTopicPartitions() *topicPartitions {
	t := &topicPartitions{}
	t.v.Store(new(topicPartitionsData))
	return t
}
// Contains all information about a topic's partitions.
type topicPartitions struct {
	// v holds the current *topicPartitionsData snapshot; it is stored
	// non-nil at construction (newTopicPartitions), so load() is always
	// safe to call.
	v atomic.Value // *topicPartitionsData

	// partsMu presumably guards the partitioner/lpInput fields below;
	// NOTE(review): no writes to them are visible in this chunk — confirm
	// against the rest of the file.
	partsMu     sync.Mutex
	partitioner TopicPartitioner
	lpInput     *lpInput // for partitioning if the partitioner is a LoadTopicPartitioner
}
// load returns the current partitions snapshot for this topic.
func (t *topicPartitions) load() *topicPartitionsData {
	return t.v.Load().(*topicPartitionsData)
}
// noTopicsPartitions is a shared, permanently-empty topicsPartitions,
// usable wherever a non-nil but empty mapping is needed.
var noTopicsPartitions = newTopicsPartitions()
// newTopicsPartitions returns a topicsPartitions seeded with an empty
// map, guaranteeing that load() on the result never type-asserts a nil
// atomic value.
func newTopicsPartitions() *topicsPartitions {
	t := &topicsPartitions{}
	t.v.Store(make(topicsPartitionsData))
	return t
}
// A helper type mapping topics to their partitions;
// this is the inner value of topicsPartitions.v.
type topicsPartitionsData map[string]*topicPartitions
// hasTopic reports whether topic t is present in the mapping.
func (d topicsPartitionsData) hasTopic(t string) bool {
	_, ok := d[t]
	return ok
}
// loadTopic returns the current partitions snapshot for topic t, or nil
// if the topic is not in the mapping.
func (d topicsPartitionsData) loadTopic(t string) *topicPartitionsData {
	if tp, ok := d[t]; ok {
		return tp.load()
	}
	return nil
}
// A helper type mapping topics to their partitions that can be updated
// atomically.
type topicsPartitions struct {
	// v always holds a topicsPartitionsData (stored at construction by
	// newTopicsPartitions); updates replace the whole map, readers get
	// immutable snapshots via load().
	v atomic.Value // topicsPartitionsData (map[string]*topicPartitions)
}
// load returns the current topic=>partitions snapshot; a nil receiver
// yields a nil map, which is safe for reads.
func (t *topicsPartitions) load() topicsPartitionsData {
	if t != nil {
		return t.v.Load().(topicsPartitionsData)
	}
	return nil
}
// storeData atomically replaces the entire topic=>partitions mapping.
func (t *topicsPartitions) storeData(d topicsPartitionsData) {
	t.v.Store(d)
}
// storeTopics ensures all given topics exist in the mapping and
// atomically publishes the (possibly cloned) result.
func (t *topicsPartitions) storeTopics(topics []string) {
	t.v.Store(t.ensureTopics(topics))
}
// clone returns a shallow copy of the current mapping; the
// *topicPartitions values are shared with the original snapshot.
func (t *topicsPartitions) clone() topicsPartitionsData {
	src := t.load()
	dup := make(map[string]*topicPartitions, len(src))
	for topic, parts := range src {
		dup[topic] = parts
	}
	return dup
}
// Ensures that the topics exist in the returned map, but does not store the
// update. This can be used to update the data and store later, rather than
// storing immediately.
//
// The current snapshot is returned unmodified when every topic already
// exists; otherwise a clone is made (copy-on-write) and new topics are
// added to the clone.
func (t *topicsPartitions) ensureTopics(topics []string) topicsPartitionsData {
	data := t.load()
	cloned := false
	for _, topic := range topics {
		if _, ok := data[topic]; ok {
			continue
		}
		// First missing topic: switch to a private clone so we never
		// mutate the shared snapshot.
		if !cloned {
			data = t.clone()
			cloned = true
		}
		data[topic] = newTopicPartitions()
	}
	return data
}
// Updates the topic partitions data atomic value.
//
// If this is the first time seeing partitions, we do processing of unknown
// partitions that may be buffered for producing.
func (cl *Client) storePartitionsUpdate(topic string, l *topicPartitions, lv *topicPartitionsData, hadPartitions bool) {
	// If the topic already had partitions, then there would be no
	// unknown topic waiting and we do not need to notify anything.
	if hadPartitions {
		l.v.Store(lv)
		return
	}

	p := &cl.producer

	p.unknownTopicsMu.Lock()
	defer p.unknownTopicsMu.Unlock()

	// If the topic did not have partitions, then we need to store the
	// partition update BEFORE unlocking the mutex to guard against this
	// sequence of events:
	//
	//   - unlock waiters
	//   - delete waiter
	//   - new produce recreates waiter
	//   - we store update
	//   - we never notify the recreated waiter
	//
	// By storing before releasing the locks, we ensure that later
	// partition loads for this topic under the mu will see our update.
	// (Deferred Store runs before the deferred Unlock above: LIFO.)
	defer l.v.Store(lv)

	// If there are no unknown topics or this topic is not unknown, then we
	// have nothing to do.
	if len(p.unknownTopics) == 0 {
		return
	}
	unknown, exists := p.unknownTopics[topic]
	if !exists {
		return
	}

	// If we loaded no partitions because of a retriable error, we signal
	// the waiting goroutine that a try happened. It is possible the
	// goroutine is quitting and will not be draining unknownWait, so we do
	// not require the send (non-blocking select).
	if len(lv.partitions) == 0 && kerr.IsRetriable(lv.loadErr) {
		select {
		case unknown.wait <- lv.loadErr:
		default:
		}
		return
	}

	// Either we have a fatal error or we can successfully partition.
	//
	// Even with a fatal error, if we loaded any partitions, we partition.
	// If we only had a fatal error, we can finish promises in a goroutine.
	// If we are partitioning, we have to do it under the unknownMu to
	// ensure prior buffered records are produced in order before we
	// release the mu.
	delete(p.unknownTopics, topic)
	close(unknown.wait) // allow waiting goroutine to quit

	if len(lv.partitions) == 0 {
		cl.failUnknownTopicRecords(topic, unknown, lv.loadErr)
	} else {
		for _, pr := range unknown.buffered {
			cl.doPartitionRecord(l, lv, pr)
		}
	}
}
// If a metadata request fails after retrying (internally retrying, so only a
// few times), or the metadata request does not return topics that we requested
// (which may also happen when additionally consuming via regex), then we need
// to bump errors for topics that were previously loaded, and bump errors for
// topics awaiting load.
//
// This has two modes of operation:
//
//  1. if no topics were missing, then the metadata request failed outright,
//     and we need to bump errors on all stored topics and unknown topics.
//
//  2. if topics were missing, then the metadata request was successful but
//     had missing data, and we need to bump errors on only what was missing.
func (cl *Client) bumpMetadataFailForTopics(requested map[string]*topicPartitions, err error, missingTopics ...string) {
	p := &cl.producer

	// mode 1: the whole request failed; bump the repeated load error on
	// every partition of every requested topic.
	if len(missingTopics) == 0 {
		for _, topic := range requested {
			for _, topicPartition := range topic.load().partitions {
				topicPartition.records.bumpRepeatedLoadErr(err)
			}
		}
	}

	// mode 2: bump only the missing topics, and build the `missing` set
	// so that the unknown-topics loop below can filter on it.
	var missing map[string]bool
	for _, failTopic := range missingTopics {
		if missing == nil {
			missing = make(map[string]bool, len(missingTopics))
		}
		missing[failTopic] = true

		if topic, exists := requested[failTopic]; exists {
			for _, topicPartition := range topic.load().partitions {
				topicPartition.records.bumpRepeatedLoadErr(err)
			}
		}
	}

	p.unknownTopicsMu.Lock()
	defer p.unknownTopicsMu.Unlock()

	// Notify goroutines waiting on unknown topics that a failed load
	// attempt happened. The send is non-blocking because a waiter may be
	// quitting and no longer draining its channel.
	for topic, unknown := range p.unknownTopics {
		// if nil, mode 1 (notify all), else mode 2 (notify missing only)
		if missing != nil && !missing[topic] {
			continue
		}

		select {
		case unknown.wait <- err:
		default:
		}
	}
}
// topicPartitionsData is the data behind a topicPartitions' v.
//
// We keep this in an atomic because it is expected to be extremely read heavy,
// and if it were behind a lock, the lock would need to be held for a while.
type topicPartitionsData struct {
	// NOTE if adding anything to this struct, be sure to fix meta merge.
	loadErr            error             // could be auth, unknown, leader not avail, or creation err
	isInternal         bool              // whether metadata flagged the topic as internal
	partitions         []*topicPartition // partition num => partition
	writablePartitions []*topicPartition // subset of above
}
// topicPartition contains all information from Kafka for a topic's partition,
// as well as what a client is producing to it or info about consuming from it.
type topicPartition struct {
	// If we have a load error (leader/listener/replica not available), we
	// keep the old topicPartition data and the new error.
	loadErr error

	// If we do not have a load error, we determine if the new
	// topicPartition is the same or different from the old based on
	// whether the data changed (leader or leader epoch, etc.).
	topicPartitionData

	// If we do not have a load error, we copy the records and cursor
	// pointers from the old after updating any necessary fields in them
	// (see migrate functions below).
	//
	// Only one of records or cursor is non-nil: records for partitions
	// being produced to, cursor for partitions being consumed from.
	records *recBuf
	cursor  *cursor
}
// Contains stuff that changes on metadata update that we copy into a cursor or
// recBuf.
type topicPartitionData struct {
	// Our leader; if metadata sees this change, the metadata update
	// migrates the cursor to a different source with the session stopped,
	// and the recBuf to a different sink under a tight mutex.
	leader int32

	// What we believe to be the epoch of the leader for this partition.
	//
	// For cursors, for KIP-320, if a broker receives a fetch request where
	// the current leader epoch does not match the brokers, either the
	// broker is behind and returns UnknownLeaderEpoch, or we are behind
	// and the broker returns FencedLeaderEpoch. For the former, we back
	// off and retry. For the latter, we update our metadata.
	leaderEpoch int32
}
// migrateProductionTo is called on metadata update if a topic partition's sink
// has changed. This moves record production from one sink to the other; this
// must be done such that records produced during migration follow those
// already buffered.
func (old *topicPartition) migrateProductionTo(dst *topicPartition) {
	// Step 1: detach our record buffer from the sink it currently
	// belongs to.
	old.records.sink.removeRecBuf(old.records)

	// Between the removal above and the lock below, producing can still
	// buffer into these records and may trigger drains on the old sink.
	// That sink no longer consumes from this buffer, so those triggers
	// are merely wasted work — no records are lost or reordered.
	old.records.mu.Lock() // guard setting sink and topic partition data
	old.records.sink = dst.records.sink
	old.records.topicPartitionData = dst.topicPartitionData
	old.records.mu.Unlock()

	// After the unlock, buffering can trigger drains on the new sink
	// before it knows about this buffer — again just wasted triggers.
	old.records.sink.addRecBuf(old.records) // hand our buffer to the new sink

	// The new sink now drains this buffer; finally point the new
	// topicPartition at the carried-over records.
	dst.records = old.records
}
// migrateCursorTo is called on metadata update if a topic partition's leader
// or leader epoch has changed.
//
// This is a little bit different from above, in that we do this logic only
// after stopping a consumer session. With the consumer session stopped, we
// have fewer concurrency issues to worry about.
func (old *topicPartition) migrateCursorTo(
	new *topicPartition,
	reloadOffsets *listOrEpochLoads,
	stopConsumerSession func(),
) {
	stopConsumerSession()

	old.cursor.source.removeCursor(old.cursor)

	// With the session stopped, we can update fields on the old cursor
	// with no concurrency issue.
	old.cursor.source = new.cursor.source

	// KIP-320: if we had consumed some messages, we need to validate the
	// leader epoch on the new broker to see if we experienced data loss
	// before we can use this cursor.
	//
	// Metadata ensures that leaderEpoch is non-negative only if the broker
	// supports KIP-320.
	if new.leaderEpoch != -1 && old.cursor.lastConsumedEpoch >= 0 {
		// Since the cursor consumed messages, it is definitely usable.
		// We use it so that the epoch load can finish using it
		// properly.
		old.cursor.use()
		reloadOffsets.addLoad(old.cursor.topic, old.cursor.partition, loadTypeEpoch, offsetLoad{
			replica: -1,
			Offset: Offset{
				at:    old.cursor.offset,
				epoch: old.cursor.lastConsumedEpoch,
			},
		})
	}

	// Carry the updated leader/epoch into the cursor, reattach it to its
	// (possibly new) source, and hand it to the new topicPartition.
	old.cursor.topicPartitionData = new.topicPartitionData
	old.cursor.source.addCursor(old.cursor)
	new.cursor = old.cursor
}