forked from pingcap/tidb
/
reporter.go
333 lines (300 loc) · 11.9 KB
/
reporter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package reporter
import (
"context"
"time"
"github.com/pingcap/failpoint"
"github.com/twotigers93/tidb/util"
"github.com/twotigers93/tidb/util/logutil"
"github.com/twotigers93/tidb/util/topsql/collector"
reporter_metrics "github.com/twotigers93/tidb/util/topsql/reporter/metrics"
topsqlstate "github.com/twotigers93/tidb/util/topsql/state"
"github.com/twotigers93/tidb/util/topsql/stmtstats"
"go.uber.org/zap"
)
const (
	// reportTimeout is the per-report deadline used by doReport when
	// sending one batch of collected data to the registered data sinks.
	reportTimeout         = 40 * time.Second
	// collectChanBufferSize bounds the CPU-time and stmtstats collect
	// channels; when a buffer is full, incoming batches are dropped
	// (see Collect / CollectStmtStatsMap).
	collectChanBufferSize = 2
)
var nowFunc = time.Now
// TopSQLReporter collects Top SQL metrics.
//
// It embeds collector.Collector and stmtstats.Collector, so implementations
// receive both SQL CPU-time records and statement statistics.
type TopSQLReporter interface {
	collector.Collector
	stmtstats.Collector
	// Start uses to start the reporter.
	Start()
	// RegisterSQL registers a normalizedSQL with SQLDigest.
	//
	// Note that the normalized SQL string can be of >1M long.
	// This function should be thread-safe, which means concurrently calling it
	// in several goroutines should be fine. It should also return immediately,
	// and do any CPU-intensive job asynchronously.
	RegisterSQL(sqlDigest []byte, normalizedSQL string, isInternal bool)
	// RegisterPlan like RegisterSQL, but for normalized plan strings.
	// isLarge indicates the size of normalizedPlan is big.
	RegisterPlan(planDigest []byte, normalizedPlan string, isLarge bool)
	// Close uses to close and release the reporter resource.
	Close()
}
// Compile-time assertions that RemoteTopSQLReporter satisfies both the
// TopSQLReporter and DataSinkRegisterer interfaces.
var _ TopSQLReporter = &RemoteTopSQLReporter{}
var _ DataSinkRegisterer = &RemoteTopSQLReporter{}
// RemoteTopSQLReporter implements TopSQLReporter that sends data to a remote agent.
// This should be called periodically to collect TopSQL resource usage metrics.
type RemoteTopSQLReporter struct {
	// ctx/cancel bound the lifetime of the background goroutines
	// (collectWorker, reportWorker) started by Start.
	ctx                     context.Context
	// reportCollectedDataChan hands a taken snapshot of collected data
	// from collectWorker to reportWorker (capacity 1; drops when full).
	reportCollectedDataChan chan collectedData
	cancel                  context.CancelFunc
	// sqlCPUCollector feeds CPU-time records back into this reporter
	// via the Collect method.
	sqlCPUCollector         *collector.SQLCPUCollector
	collectCPUTimeChan      chan []collector.SQLCPUTimeRecord
	collectStmtStatsChan    chan stmtstats.StatementStatsMap
	// collecting accumulates the current round of records until they are
	// taken by takeDataAndSendToReportChan.
	collecting              *collecting
	normalizedSQLMap        *normalizedSQLMap
	normalizedPlanMap       *normalizedPlanMap
	stmtStatsBuffer         map[uint64]stmtstats.StatementStatsMap // timestamp => stmtstats.StatementStatsMap
	// calling decodePlan this can take a while, so should not block critical paths.
	decodePlan planBinaryDecodeFunc
	// Instead of dropping large plans, we compress it into encoded format and report
	compressPlan planBinaryCompressFunc
	DefaultDataSinkRegisterer
}
// NewRemoteTopSQLReporter creates a new RemoteTopSQLReporter.
//
// decodePlan is a decoding function which will be called asynchronously to
// decode the plan binary to string. compressPlan is used to report large
// plans in compressed form instead of dropping them.
func NewRemoteTopSQLReporter(decodePlan planBinaryDecodeFunc, compressPlan planBinaryCompressFunc) *RemoteTopSQLReporter {
	ctx, cancel := context.WithCancel(context.Background())
	reporter := &RemoteTopSQLReporter{
		DefaultDataSinkRegisterer: NewDefaultDataSinkRegisterer(ctx),
		ctx:                       ctx,
		cancel:                    cancel,
		collecting:                newCollecting(),
		normalizedSQLMap:          newNormalizedSQLMap(),
		normalizedPlanMap:         newNormalizedPlanMap(),
		stmtStatsBuffer:           make(map[uint64]stmtstats.StatementStatsMap),
		decodePlan:                decodePlan,
		compressPlan:              compressPlan,
	}
	// Channels are buffered so producers never block; overflow is dropped.
	reporter.collectCPUTimeChan = make(chan []collector.SQLCPUTimeRecord, collectChanBufferSize)
	reporter.collectStmtStatsChan = make(chan stmtstats.StatementStatsMap, collectChanBufferSize)
	reporter.reportCollectedDataChan = make(chan collectedData, 1)
	// The CPU collector reports back into this reporter via Collect.
	reporter.sqlCPUCollector = collector.NewSQLCPUCollector(reporter)
	return reporter
}
// Start implements the TopSQLReporter interface.
//
// It starts the SQL CPU-time collector and launches the two background
// goroutines: collectWorker (aggregation) and reportWorker (delivery).
// Both goroutines exit when Close cancels tsr.ctx.
func (tsr *RemoteTopSQLReporter) Start() {
	tsr.sqlCPUCollector.Start()
	go tsr.collectWorker()
	go tsr.reportWorker()
}
// Collect implements tracecpu.Collector.
//
// WARN: It will drop the DataRecords if the processing is not in time.
// This function is thread-safe and efficient.
func (tsr *RemoteTopSQLReporter) Collect(data []collector.SQLCPUTimeRecord) {
	if len(data) > 0 {
		select {
		case tsr.collectCPUTimeChan <- data:
		default:
			// The channel is full; drop this batch and count the drop.
			reporter_metrics.IgnoreCollectChannelFullCounter.Inc()
		}
	}
}
// CollectStmtStatsMap implements stmtstats.Collector.
//
// WARN: It will drop the DataRecords if the processing is not in time.
// This function is thread-safe and efficient.
func (tsr *RemoteTopSQLReporter) CollectStmtStatsMap(data stmtstats.StatementStatsMap) {
	if len(data) > 0 {
		select {
		case tsr.collectStmtStatsChan <- data:
		default:
			// The channel is full; drop this batch and count the drop.
			reporter_metrics.IgnoreCollectStmtChannelFullCounter.Inc()
		}
	}
}
// RegisterSQL implements TopSQLReporter.
//
// It records the mapping from sqlDigest to its normalized SQL text (and
// whether the statement is internal) by delegating to normalizedSQLMap.
// This function is thread-safe and efficient.
func (tsr *RemoteTopSQLReporter) RegisterSQL(sqlDigest []byte, normalizedSQL string, isInternal bool) {
	tsr.normalizedSQLMap.register(sqlDigest, normalizedSQL, isInternal)
}
// RegisterPlan implements TopSQLReporter.
//
// It records the mapping from planDigest to its normalized plan text by
// delegating to normalizedPlanMap. isLarge marks plans whose normalized
// form is big (see compressPlan for how those are reported).
// This function is thread-safe and efficient.
func (tsr *RemoteTopSQLReporter) RegisterPlan(planDigest []byte, normalizedPlan string, isLarge bool) {
	tsr.normalizedPlanMap.register(planDigest, normalizedPlan, isLarge)
}
// Close implements TopSQLReporter.
//
// Shutdown order matters: cancel the workers' context first, then stop
// the CPU collector, and finally notify all registered data sinks that
// the reporter is closing.
func (tsr *RemoteTopSQLReporter) Close() {
	tsr.cancel()
	tsr.sqlCPUCollector.Stop()
	tsr.onReporterClosing()
}
// collectWorker consumes and collects data from tracecpu.Collector/stmtstats.Collector.
//
// It runs until tsr.ctx is canceled. On every report tick it flushes the
// buffered statement stats, hands the collected snapshot to reportWorker,
// and re-reads the (dynamically configurable) report interval.
func (tsr *RemoteTopSQLReporter) collectWorker() {
	defer util.Recover("top-sql", "collectWorker", nil, false)
	interval := topsqlstate.GlobalState.ReportIntervalSeconds.Load()
	ticker := time.NewTicker(time.Duration(interval) * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-tsr.ctx.Done():
			return
		case records := <-tsr.collectCPUTimeChan:
			tsr.processCPUTimeData(uint64(nowFunc().Unix()), records)
		case stats := <-tsr.collectStmtStatsChan:
			// Buffer statement stats keyed by arrival second; they are
			// merged into the collection on the next report tick.
			tsr.stmtStatsBuffer[uint64(nowFunc().Unix())] = stats
		case <-ticker.C:
			tsr.processStmtStatsData()
			tsr.takeDataAndSendToReportChan()
			// Pick up any runtime change to the report interval.
			if latest := topsqlstate.GlobalState.ReportIntervalSeconds.Load(); latest != interval {
				interval = latest
				ticker.Reset(time.Duration(interval) * time.Second)
			}
		}
	}
}
// processCPUTimeData collects top N cpuRecords of each round into tsr.collecting,
// and evicts the data that is not in the top N. All evicted cpuRecords are
// summarized into the "others" bucket.
//
// SQL meta is never evicted here, since an evicted SQL may still appear in
// the TopN DataRecords of other components (e.g. TiKV).
func (tsr *RemoteTopSQLReporter) processCPUTimeData(timestamp uint64, data cpuRecords) {
	defer util.Recover("top-sql", "processCPUTimeData", nil, false)
	topN, evicted := data.topN(int(topsqlstate.GlobalState.MaxStatementCount.Load()))
	for _, record := range topN {
		tsr.collecting.getOrCreateRecord(record.SQLDigest, record.PlanDigest).appendCPUTime(timestamp, record.CPUTimeMs)
	}
	if len(evicted) == 0 {
		return
	}
	var evictedCPUTime uint32
	for _, record := range evicted {
		evictedCPUTime += record.CPUTimeMs
		// Remember which digests were evicted at this timestamp, so that
		// processStmtStatsData can route their stats to "others" as well.
		tsr.collecting.markAsEvicted(timestamp, record.SQLDigest, record.PlanDigest)
	}
	tsr.collecting.appendOthersCPUTime(timestamp, evictedCPUTime)
}
// processStmtStatsData drains tsr.stmtStatsBuffer into tsr.collecting.
// Items whose digest was evicted (low CPU time) at the same timestamp are
// summarized into the "others" bucket instead.
func (tsr *RemoteTopSQLReporter) processStmtStatsData() {
	defer util.Recover("top-sql", "processStmtStatsData", nil, false)
	for ts, statsMap := range tsr.stmtStatsBuffer {
		for digest, item := range statsMap {
			sqlDigest := []byte(digest.SQLDigest)
			planDigest := []byte(digest.PlanDigest)
			if tsr.collecting.hasEvicted(ts, sqlDigest, planDigest) {
				// The matching CPU time was evicted for this timestamp,
				// so fold the stats into "others".
				tsr.collecting.appendOthersStmtStatsItem(ts, *item)
				continue
			}
			tsr.collecting.getOrCreateRecord(sqlDigest, planDigest).appendStmtStatsItem(ts, *item)
		}
	}
	// Reset the buffer for the next report round.
	tsr.stmtStatsBuffer = make(map[uint64]stmtstats.StatementStatsMap)
}
// takeDataAndSendToReportChan takes records data and then sends it to the
// report channel for reporting. If the channel is full, the snapshot is
// dropped and a metric is incremented.
//
// Note: the collecting/SQL/plan maps are taken (reset) regardless of
// whether the send succeeds — identical to evaluating the send operand
// inside the select, per the Go spec.
func (tsr *RemoteTopSQLReporter) takeDataAndSendToReportChan() {
	payload := collectedData{
		collected:         tsr.collecting.take(),
		normalizedSQLMap:  tsr.normalizedSQLMap.take(),
		normalizedPlanMap: tsr.normalizedPlanMap.take(),
	}
	select {
	case tsr.reportCollectedDataChan <- payload:
	default:
		// ignore if chan blocked
		reporter_metrics.IgnoreReportChannelFullCounter.Inc()
	}
}
// reportWorker sends data to the gRPC endpoint from the `reportCollectedDataChan`
// one by one, until tsr.ctx is canceled.
func (tsr *RemoteTopSQLReporter) reportWorker() {
	defer util.Recover("top-sql", "reportWorker", nil, false)
	for {
		select {
		case <-tsr.ctx.Done():
			return
		case data := <-tsr.reportCollectedDataChan:
			// Concurrent RegisterSQL/RegisterPlan calls may still be writing
			// to the maps inside data; wait briefly so in-flight writes settle.
			time.Sleep(time.Millisecond * 100)
			records := data.collected.getReportRecords()
			// Convert everything to protobuf and push to the data sinks.
			report := &ReportData{
				DataRecords: records.toProto(),
				SQLMetas:    data.normalizedSQLMap.toProto(),
				PlanMetas:   data.normalizedPlanMap.toProto(tsr.decodePlan, tsr.compressPlan),
			}
			tsr.doReport(report)
		}
	}
}
// doReport sends ReportData to DataSinks.
//
// Empty payloads are skipped. The send deadline is reportTimeout, except
// under the "resetTimeoutForTest" failpoint, which shrinks the deadline to
// the report interval when that is shorter. Errors from trySend are
// intentionally ignored (trySend logs per-sink failures itself).
func (tsr *RemoteTopSQLReporter) doReport(data *ReportData) {
	defer util.Recover("top-sql", "doReport", nil, false)
	if !data.hasData() {
		return
	}
	timeout := reportTimeout
	failpoint.Inject("resetTimeoutForTest", func(val failpoint.Value) {
		if val.(bool) {
			interval := time.Duration(topsqlstate.GlobalState.ReportIntervalSeconds.Load()) * time.Second
			if interval < timeout {
				timeout = interval
			}
		}
	})
	_ = tsr.trySend(data, time.Now().Add(timeout))
}
// trySend sends ReportData to all internally registered DataSinks.
// Per-sink failures are logged and do not stop delivery to the remaining
// sinks; the function currently always returns nil.
func (tsr *RemoteTopSQLReporter) trySend(data *ReportData, deadline time.Time) error {
	// Snapshot the sink set under the lock, then send without holding it.
	tsr.DefaultDataSinkRegisterer.Lock()
	sinks := make([]DataSink, 0, len(tsr.dataSinks))
	for sink := range tsr.dataSinks {
		sinks = append(sinks, sink)
	}
	tsr.DefaultDataSinkRegisterer.Unlock()
	for _, sink := range sinks {
		err := sink.TrySend(data, deadline)
		if err != nil {
			logutil.BgLogger().Warn("[top-sql] failed to send data to datasink", zap.Error(err))
		}
	}
	return nil
}
// onReporterClosing calls the OnReporterClosing method of all internally
// registered DataSinks. The registry is emptied under the lock first, so
// the callbacks run without holding it.
func (tsr *RemoteTopSQLReporter) onReporterClosing() {
	tsr.DefaultDataSinkRegisterer.Lock()
	sinks := tsr.dataSinks
	tsr.dataSinks = make(map[DataSink]struct{})
	tsr.DefaultDataSinkRegisterer.Unlock()
	for sink := range sinks {
		sink.OnReporterClosing()
	}
}
// collectedData is used for transmission in the channel.
//
// It bundles one report round's snapshot: the aggregated records plus the
// SQL/plan meta maps that were taken (reset) at the same time.
type collectedData struct {
	collected         *collecting
	normalizedSQLMap  *normalizedSQLMap
	normalizedPlanMap *normalizedPlanMap
}