improve region request log for diagnose (#1300) (#1340)
* improve region request log for diagnose

Signed-off-by: crazycs520 <crazycs520@gmail.com>

* remove useless field

Signed-off-by: crazycs520 <crazycs520@gmail.com>

* use context logger as much as possible (#908)

Signed-off-by: crazycs520 <crazycs520@gmail.com>

* add leader peer id log

Signed-off-by: crazycs520 <crazycs520@gmail.com>

* *: fix panic in get cause error (#1344)

Signed-off-by: crazycs520 <crazycs520@gmail.com>

* address comment

Signed-off-by: crazycs520 <crazycs520@gmail.com>

---------

Signed-off-by: crazycs520 <crazycs520@gmail.com>
crazycs520 committed May 21, 2024
1 parent 18d0dab commit f9fbc4c
Showing 13 changed files with 527 additions and 178 deletions.
1 change: 1 addition & 0 deletions integration_tests/main_test.go
@@ -27,6 +27,7 @@ func TestMain(m *testing.M) {
opts := []goleak.Option{
goleak.IgnoreTopFunction("github.com/pingcap/goleveldb/leveldb.(*DB).mpoolDrain"),
goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/txnkv/transaction.keepAlive"), // TODO: fix ttlManager goroutine leak
goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/config/retry.(*Config).createBackoffFn.newBackoffFn.func2"),
}

goleak.VerifyTestMain(m, opts...)
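
The new entry tells goleak to ignore goroutines whose top frame is the retry backoff closure, so the leak detector no longer flags that background goroutine. For reference, the same ignore options can also be applied to an individual test with goleak.VerifyNone; a minimal sketch follows, with a hypothetical test and package name:

package example_test

import (
	"testing"

	"go.uber.org/goleak"
)

// TestNoGoroutineLeak is hypothetical: VerifyNone fails the test if any
// unexpected goroutine is still running at the end, using the same ignore
// option added above for the retry backoff goroutine.
func TestNoGoroutineLeak(t *testing.T) {
	defer goleak.VerifyNone(t,
		goleak.IgnoreTopFunction("github.com/tikv/client-go/v2/config/retry.(*Config).createBackoffFn.newBackoffFn.func2"),
	)
	// ... exercise code that may start background goroutines ...
}
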
11 changes: 6 additions & 5 deletions integration_tests/snapshot_test.go
@@ -287,12 +287,13 @@ func (s *testSnapshotSuite) TestSnapshotThreadSafe() {

func (s *testSnapshotSuite) TestSnapshotRuntimeStats() {
reqStats := tikv.NewRegionRequestRuntimeStats()
-tikv.RecordRegionRequestRuntimeStats(reqStats.Stats, tikvrpc.CmdGet, time.Second)
-tikv.RecordRegionRequestRuntimeStats(reqStats.Stats, tikvrpc.CmdGet, time.Millisecond)
+reqStats.RecordRPCRuntimeStats(tikvrpc.CmdGet, time.Second)
+reqStats.RecordRPCRuntimeStats(tikvrpc.CmdGet, time.Millisecond)
snapshot := s.store.GetSnapshot(0)
-snapshot.SetRuntimeStats(&txnkv.SnapshotRuntimeStats{})
-snapshot.MergeRegionRequestStats(reqStats.Stats)
-snapshot.MergeRegionRequestStats(reqStats.Stats)
+runtimeStats := &txnkv.SnapshotRuntimeStats{}
+snapshot.SetRuntimeStats(runtimeStats)
+snapshot.MergeRegionRequestStats(reqStats)
+snapshot.MergeRegionRequestStats(reqStats)
bo := tikv.NewBackofferWithVars(context.Background(), 2000, nil)
err := bo.BackoffWithMaxSleepTxnLockFast(5, errors.New("test"))
s.Nil(err)
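
The updated test reflects the new runtime-stats API: RPC durations are recorded through methods on the stats object itself, and the stats object (rather than its former Stats field) is merged into the snapshot. A minimal sketch of the flow, mirroring the test above; the function name and package are placeholders, and an already-initialized *tikv.KVStore is assumed:

package example

import (
	"time"

	"github.com/tikv/client-go/v2/tikv"
	"github.com/tikv/client-go/v2/tikvrpc"
	"github.com/tikv/client-go/v2/txnkv"
)

func recordAndMergeStats(store *tikv.KVStore) {
	// Record per-RPC durations directly on the runtime-stats object.
	reqStats := tikv.NewRegionRequestRuntimeStats()
	reqStats.RecordRPCRuntimeStats(tikvrpc.CmdGet, time.Second)
	reqStats.RecordRPCRuntimeStats(tikvrpc.CmdGet, time.Millisecond)

	// Attach snapshot-level runtime stats and merge the RPC stats into them.
	snapshot := store.GetSnapshot(0)
	snapshot.SetRuntimeStats(&txnkv.SnapshotRuntimeStats{})
	snapshot.MergeRegionRequestStats(reqStats)
}
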
4 changes: 2 additions & 2 deletions internal/client/client_batch.go
@@ -791,7 +791,7 @@ func sendBatchRequest(
select {
case batchConn.batchCommandsCh <- entry:
case <-ctx.Done():
logutil.BgLogger().Debug("send request is cancelled",
logutil.Logger(ctx).Debug("send request is cancelled",
zap.String("to", addr), zap.String("cause", ctx.Err().Error()))
return nil, errors.WithStack(ctx.Err())
case <-timer.C:
@@ -807,7 +807,7 @@
return tikvrpc.FromBatchCommandsResponse(res)
case <-ctx.Done():
atomic.StoreInt32(&entry.canceled, 1)
logutil.BgLogger().Debug("wait response is cancelled",
logutil.Logger(ctx).Debug("wait response is cancelled",
zap.String("to", addr), zap.String("cause", ctx.Err().Error()))
return nil, errors.WithStack(ctx.Err())
case <-timer.C:
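
These call sites switch from the global background logger to the context-bound one, so any logger or fields attached to the request's context by the caller show up on the cancellation log lines. A minimal sketch of the pattern, assuming code inside the client-go module (internal/logutil is an internal package), a hypothetical helper function, and that logutil.Logger(ctx) returns the logger carried in ctx with a fallback to the background logger:

package client

import (
	"context"

	"github.com/tikv/client-go/v2/internal/logutil"
	"go.uber.org/zap"
)

// waitOrCancel is a hypothetical helper: it logs through logutil.Logger(ctx)
// so that cancellation messages keep any request-level fields carried by ctx.
func waitOrCancel(ctx context.Context, addr string, done <-chan struct{}) error {
	select {
	case <-done:
		return nil
	case <-ctx.Done():
		logutil.Logger(ctx).Debug("wait response is cancelled",
			zap.String("to", addr), zap.String("cause", ctx.Err().Error()))
		return ctx.Err()
	}
}
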
6 changes: 3 additions & 3 deletions internal/locate/region_cache.go
@@ -674,7 +674,7 @@ func (c *RegionCache) GetTiKVRPCContext(bo *retry.Backoffer, id RegionVerID, rep
storeFailEpoch := atomic.LoadUint32(&store.epoch)
if storeFailEpoch != regionStore.storeEpochs[storeIdx] {
cachedRegion.invalidate(Other)
logutil.BgLogger().Info("invalidate current region, because others failed on same store",
logutil.Logger(bo.GetCtx()).Info("invalidate current region, because others failed on same store",
zap.Uint64("region", id.GetID()),
zap.String("store", store.addr))
return nil, nil
@@ -789,7 +789,7 @@ func (c *RegionCache) GetTiFlashRPCContext(bo *retry.Backoffer, id RegionVerID,
storeFailEpoch := atomic.LoadUint32(&store.epoch)
if storeFailEpoch != regionStore.storeEpochs[storeIdx] {
cachedRegion.invalidate(Other)
logutil.BgLogger().Info("invalidate current region, because others failed on same store",
logutil.Logger(bo.GetCtx()).Info("invalidate current region, because others failed on same store",
zap.Uint64("region", id.GetID()),
zap.String("store", store.addr))
// TiFlash will always try to find out a valid peer, avoiding to retry too many times.
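
The check surrounding these two log lines compares the store's current failure epoch with the epoch recorded in the cached region's store list: a mismatch means the store has been reported as failed since the entry was cached, so the region is invalidated and reloaded rather than trusted. A minimal, self-contained sketch of that pattern; all types and names here are hypothetical, not client-go's internals:

package main

import (
	"fmt"
	"sync/atomic"
)

// store carries a failure epoch that is bumped each time the store is
// reported as failed.
type store struct {
	epoch atomic.Uint32
}

// cachedRegion remembers the store epoch it observed when it was cached.
type cachedRegion struct {
	seenEpoch uint32
	valid     bool
}

// stillValid mirrors the epoch comparison above: if the store has failed
// since the region entry was built, invalidate the entry so it is reloaded.
func stillValid(r *cachedRegion, s *store) bool {
	if s.epoch.Load() != r.seenEpoch {
		r.valid = false
		return false
	}
	return true
}

func main() {
	s := &store{}
	r := &cachedRegion{seenEpoch: s.epoch.Load(), valid: true}
	s.epoch.Add(1) // another request reported a failure on this store
	fmt.Println(stillValid(r, s)) // false: the cached region must be refreshed
}
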
@@ -1826,7 +1826,7 @@ func (c *RegionCache) OnRegionEpochNotMatch(bo *retry.Backoffer, ctx *RPCContext
(meta.GetRegionEpoch().GetConfVer() < ctx.Region.confVer ||
meta.GetRegionEpoch().GetVersion() < ctx.Region.ver) {
err := errors.Errorf("region epoch is ahead of tikv. rpc ctx: %+v, currentRegions: %+v", ctx, currentRegions)
logutil.BgLogger().Info("region epoch is ahead of tikv", zap.Error(err))
logutil.Logger(bo.GetCtx()).Info("region epoch is ahead of tikv", zap.Error(err))
return true, bo.Backoff(retry.BoRegionMiss, err)
}
}
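
In this last hunk, the client detects that the region epoch it has cached is newer than the one TiKV just returned, and responds by backing off with BoRegionMiss and retrying instead of adopting the stale metadata. A minimal sketch of that comparison; the types and values are hypothetical, only the inequality mirrors the condition above:

package main

import "fmt"

// regionEpoch mirrors the two counters compared above: confVer changes on
// membership changes, version on region splits and merges.
type regionEpoch struct {
	confVer uint64
	version uint64
}

// epochIsAhead reports whether the client's cached epoch is newer than the
// epoch returned by TiKV, in which case the client backs off and retries.
func epochIsAhead(cached, fromTiKV regionEpoch) bool {
	return fromTiKV.confVer < cached.confVer || fromTiKV.version < cached.version
}

func main() {
	cached := regionEpoch{confVer: 5, version: 10}
	stale := regionEpoch{confVer: 5, version: 9}
	fmt.Println(epochIsAhead(cached, stale)) // true: back off and retry
}
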
[Diffs for the remaining 9 changed files were not loaded on this page.]
