Skip to content

Commit

Permalink
unsafe recovery: Fix force leader stage's infinite retry (#5088) (#5090)
Browse files Browse the repository at this point in the history
close #5085, ref #5088

Fix unsafe recovery infinite retry on force leader stage

Signed-off-by: Connor1996 <zbk602423539@gmail.com>

Co-authored-by: Connor1996 <zbk602423539@gmail.com>
  • Loading branch information
ti-chi-bot and Connor1996 committed Jun 1, 2022
1 parent 1c3aa15 commit d82f4fa
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 14 deletions.
12 changes: 12 additions & 0 deletions server/cluster/unsafe_recovery_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ func (u *unsafeRecoveryController) changeStage(stage unsafeRecoveryStage) {
stores += ", "
}
}
// TODO: clean up existing operators
output.Info = fmt.Sprintf("Unsafe recovery enters collect report stage: failed stores %s", stores)
case tombstoneTiFlashLearner:
output.Info = "Unsafe recovery enters tombstone TiFlash learner stage"
Expand Down Expand Up @@ -967,6 +968,17 @@ func (u *unsafeRecoveryController) generateForceLeaderPlan(newestRegionTree *reg
return true
})

if hasPlan {
for storeID := range u.storeReports {
plan := u.getRecoveryPlan(storeID)
if plan.ForceLeader == nil {
// Fill an empty force leader plan for the stores that don't have any force leader plan,
// to avoid exiting existing force leaders.
plan.ForceLeader = &pdpb.ForceLeader{}
}
}
}

return hasPlan
}

Expand Down
57 changes: 43 additions & 14 deletions server/cluster/unsafe_recovery_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,14 +328,14 @@ func (s *testUnsafeRecoverySuite) TestForceLeaderFail(c *C) {
cluster := newTestRaftCluster(s.ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster())
cluster.coordinator = newCoordinator(s.ctx, cluster, hbstream.NewTestHeartbeatStreams(s.ctx, cluster.meta.GetId(), cluster, true))
cluster.coordinator.run()
for _, store := range newTestStores(3, "6.0.0") {
for _, store := range newTestStores(4, "6.0.0") {
c.Assert(cluster.PutStore(store.GetMeta()), IsNil)
}
recoveryController := newUnsafeRecoveryController(cluster)
c.Assert(recoveryController.RemoveFailedStores(map[uint64]struct{}{
2: {},
3: {},
}, 1), IsNil)
4: {},
}, 60), IsNil)

reports := map[uint64]*pdpb.StoreReport{
1: {
Expand All @@ -345,28 +345,57 @@ func (s *testUnsafeRecoverySuite) TestForceLeaderFail(c *C) {
RegionState: &raft_serverpb.RegionLocalState{
Region: &metapb.Region{
Id: 1001,
StartKey: []byte(""),
EndKey: []byte("x"),
RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1},
Peers: []*metapb.Peer{
{Id: 11, StoreId: 1}, {Id: 21, StoreId: 2}, {Id: 31, StoreId: 3}}}}},
{Id: 11, StoreId: 1}, {Id: 21, StoreId: 3}, {Id: 31, StoreId: 4}}}}},
},
},
2: {
PeerReports: []*pdpb.PeerReport{
{
RaftState: &raft_serverpb.RaftLocalState{LastIndex: 10, HardState: &eraftpb.HardState{Term: 1, Commit: 10}},
RegionState: &raft_serverpb.RegionLocalState{
Region: &metapb.Region{
Id: 1002,
StartKey: []byte("x"),
EndKey: []byte(""),
RegionEpoch: &metapb.RegionEpoch{ConfVer: 10, Version: 1},
Peers: []*metapb.Peer{
{Id: 12, StoreId: 2}, {Id: 22, StoreId: 3}, {Id: 32, StoreId: 4}}}}},
},
},
}

req := newStoreHeartbeat(1, reports[1])
resp := &pdpb.StoreHeartbeatResponse{}
req.StoreReport.Step = 1
recoveryController.HandleStoreHeartbeat(req, resp)
req1 := newStoreHeartbeat(1, reports[1])
resp1 := &pdpb.StoreHeartbeatResponse{}
req1.StoreReport.Step = 1
recoveryController.HandleStoreHeartbeat(req1, resp1)
req2 := newStoreHeartbeat(2, reports[2])
resp2 := &pdpb.StoreHeartbeatResponse{}
req2.StoreReport.Step = 1
recoveryController.HandleStoreHeartbeat(req2, resp2)
c.Assert(recoveryController.GetStage(), Equals, forceLeader)
recoveryController.HandleStoreHeartbeat(req1, resp1)

applyRecoveryPlan(c, 1, reports, resp)
// force leader doesn't succeed
reports[1].PeerReports[0].IsForceLeader = false
recoveryController.HandleStoreHeartbeat(req, resp)
// force leader on store 1 succeed
applyRecoveryPlan(c, 1, reports, resp1)
applyRecoveryPlan(c, 2, reports, resp2)
// force leader on store 2 doesn't succeed
reports[2].PeerReports[0].IsForceLeader = false

// force leader should retry on store 2
recoveryController.HandleStoreHeartbeat(req1, resp1)
recoveryController.HandleStoreHeartbeat(req2, resp2)
c.Assert(recoveryController.GetStage(), Equals, forceLeader)
recoveryController.HandleStoreHeartbeat(req1, resp1)

// force leader succeed this time
applyRecoveryPlan(c, 1, reports, resp)
recoveryController.HandleStoreHeartbeat(req, resp)
applyRecoveryPlan(c, 1, reports, resp1)
applyRecoveryPlan(c, 2, reports, resp2)
recoveryController.HandleStoreHeartbeat(req1, resp1)
recoveryController.HandleStoreHeartbeat(req2, resp2)
c.Assert(recoveryController.GetStage(), Equals, demoteFailedVoter)
}

Expand Down

0 comments on commit d82f4fa

Please sign in to comment.