From d82f4fab6cf37cd1eca9c3574984e12a7ae27c42 Mon Sep 17 00:00:00 2001
From: Ti Chi Robot
Date: Wed, 1 Jun 2022 20:02:27 +0800
Subject: [PATCH] unsafe recovery: Fix force leader stage's infinite retry
 (#5088) (#5090)

close tikv/pd#5085, ref tikv/pd#5088

Fix unsafe recovery infinite retry on force leader stage

Signed-off-by: Connor1996

Co-authored-by: Connor1996
---
 server/cluster/unsafe_recovery_controller.go  | 12 ++++
 .../unsafe_recovery_controller_test.go        | 57 ++++++++++++++-----
 2 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/server/cluster/unsafe_recovery_controller.go b/server/cluster/unsafe_recovery_controller.go
index 2a84535624c..9782d1a20d8 100644
--- a/server/cluster/unsafe_recovery_controller.go
+++ b/server/cluster/unsafe_recovery_controller.go
@@ -454,6 +454,7 @@ func (u *unsafeRecoveryController) changeStage(stage unsafeRecoveryStage) {
 				stores += ", "
 			}
 		}
+		// TODO: clean up existing operators
 		output.Info = fmt.Sprintf("Unsafe recovery enters collect report stage: failed stores %s", stores)
 	case tombstoneTiFlashLearner:
 		output.Info = "Unsafe recovery enters tombstone TiFlash learner stage"
@@ -967,6 +968,17 @@ func (u *unsafeRecoveryController) generateForceLeaderPlan(newestRegionTree *reg
 		return true
 	})
 
+	if hasPlan {
+		for storeID := range u.storeReports {
+			plan := u.getRecoveryPlan(storeID)
+			if plan.ForceLeader == nil {
+				// Fill an empty force leader plan to the stores that doesn't have any force leader plan
+				// to avoid exiting existing force leaders.
+				plan.ForceLeader = &pdpb.ForceLeader{}
+			}
+		}
+	}
+
 	return hasPlan
 }
 
diff --git a/server/cluster/unsafe_recovery_controller_test.go b/server/cluster/unsafe_recovery_controller_test.go
index 8b70b4cd0b6..edd6bf9c187 100644
--- a/server/cluster/unsafe_recovery_controller_test.go
+++ b/server/cluster/unsafe_recovery_controller_test.go
@@ -328,14 +328,14 @@ func (s *testUnsafeRecoverySuite) TestForceLeaderFail(c *C) {
 	cluster := newTestRaftCluster(s.ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster())
 	cluster.coordinator = newCoordinator(s.ctx, cluster, hbstream.NewTestHeartbeatStreams(s.ctx, cluster.meta.GetId(), cluster, true))
 	cluster.coordinator.run()
-	for _, store := range newTestStores(3, "6.0.0") {
+	for _, store := range newTestStores(4, "6.0.0") {
 		c.Assert(cluster.PutStore(store.GetMeta()), IsNil)
 	}
 	recoveryController := newUnsafeRecoveryController(cluster)
 	c.Assert(recoveryController.RemoveFailedStores(map[uint64]struct{}{
-		2: {},
 		3: {},
-	}, 1), IsNil)
+		4: {},
+	}, 60), IsNil)
 
 	reports := map[uint64]*pdpb.StoreReport{
 		1: {
 			PeerReports: []*pdpb.PeerReport{
 				{
 					RaftState: &raft_serverpb.RaftLocalState{LastIndex: 10, HardState: &eraftpb.HardState{Term: 1, Commit: 10}},
 					RegionState: &raft_serverpb.RegionLocalState{
 						Region: &metapb.Region{
 							Id: 1001,
+							StartKey: []byte(""),
+							EndKey: []byte("x"),
 							RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1},
 							Peers: []*metapb.Peer{
-								{Id: 11, StoreId: 1}, {Id: 21, StoreId: 2}, {Id: 31, StoreId: 3}}}}},
+								{Id: 11, StoreId: 1}, {Id: 21, StoreId: 3}, {Id: 31, StoreId: 4}}}}},
+			},
+		},
+		2: {
+			PeerReports: []*pdpb.PeerReport{
+				{
+					RaftState: &raft_serverpb.RaftLocalState{LastIndex: 10, HardState: &eraftpb.HardState{Term: 1, Commit: 10}},
+					RegionState: &raft_serverpb.RegionLocalState{
+						Region: &metapb.Region{
+							Id: 1002,
+							StartKey: []byte("x"),
+							EndKey: []byte(""),
+							RegionEpoch: &metapb.RegionEpoch{ConfVer: 10, Version: 1},
+							Peers: []*metapb.Peer{
+								{Id: 12, StoreId: 2}, {Id: 22, StoreId: 3}, {Id: 32, StoreId: 4}}}}},
 			},
 		},
 	}
-	req := newStoreHeartbeat(1, reports[1])
-	resp := &pdpb.StoreHeartbeatResponse{}
-	req.StoreReport.Step = 1
-	recoveryController.HandleStoreHeartbeat(req, resp)
+	req1 := newStoreHeartbeat(1, reports[1])
+	resp1 := &pdpb.StoreHeartbeatResponse{}
+	req1.StoreReport.Step = 1
+	recoveryController.HandleStoreHeartbeat(req1, resp1)
+	req2 := newStoreHeartbeat(2, reports[2])
+	resp2 := &pdpb.StoreHeartbeatResponse{}
+	req2.StoreReport.Step = 1
+	recoveryController.HandleStoreHeartbeat(req2, resp2)
 	c.Assert(recoveryController.GetStage(), Equals, forceLeader)
+	recoveryController.HandleStoreHeartbeat(req1, resp1)
 
-	applyRecoveryPlan(c, 1, reports, resp)
-	// force leader doesn't succeed
-	reports[1].PeerReports[0].IsForceLeader = false
-	recoveryController.HandleStoreHeartbeat(req, resp)
+	// force leader on store 1 succeed
+	applyRecoveryPlan(c, 1, reports, resp1)
+	applyRecoveryPlan(c, 2, reports, resp2)
+	// force leader on store 2 doesn't succeed
+	reports[2].PeerReports[0].IsForceLeader = false
+
+	// force leader should retry on store 2
+	recoveryController.HandleStoreHeartbeat(req1, resp1)
+	recoveryController.HandleStoreHeartbeat(req2, resp2)
 	c.Assert(recoveryController.GetStage(), Equals, forceLeader)
+	recoveryController.HandleStoreHeartbeat(req1, resp1)
 
 	// force leader succeed this time
-	applyRecoveryPlan(c, 1, reports, resp)
-	recoveryController.HandleStoreHeartbeat(req, resp)
+	applyRecoveryPlan(c, 1, reports, resp1)
+	applyRecoveryPlan(c, 2, reports, resp2)
+	recoveryController.HandleStoreHeartbeat(req1, resp1)
+	recoveryController.HandleStoreHeartbeat(req2, resp2)
 	c.Assert(recoveryController.GetStage(), Equals, demoteFailedVoter)
 }
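
Reviewer note (not part of the patch): the infinite retry came from stores that had no force-leader target of their own. Their recovery plan carried a nil ForceLeader, and, per the comment added in generateForceLeaderPlan, a store without any force leader plan exits the force leaders it already holds, so the force leader stage could never complete and PD kept re-sending the same plan. The fix sends an explicitly empty pdpb.ForceLeader{} to every reporting store once any real force-leader plan exists. Below is a minimal Go sketch of that invariant, written as a hypothetical standalone helper; the names u.storeReports, u.getRecoveryPlan, and pdpb.ForceLeader come from the diff above, the helper itself is illustrative only.

	// fillEmptyForceLeaderPlans is an illustrative helper, not code from the patch.
	// Once any store has a real force-leader target (hasPlan == true), every
	// reporting store must carry at least an empty ForceLeader plan so that it
	// keeps, rather than exits, the force leaders it already holds.
	func fillEmptyForceLeaderPlans(u *unsafeRecoveryController, hasPlan bool) {
		if !hasPlan {
			return
		}
		for storeID := range u.storeReports {
			plan := u.getRecoveryPlan(storeID)
			if plan.ForceLeader == nil {
				plan.ForceLeader = &pdpb.ForceLeader{} // empty plan: keep existing force leaders
			}
		}
	}

The patch inlines this logic at the end of generateForceLeaderPlan instead of adding a helper, and TestForceLeaderFail exercises it with two reporting stores, where only store 2's force leader initially fails.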