Skip to content

Commit

Permalink
Revert "raftstore: improve the remove peer check (tikv#16467) (tikv#1…
Browse files Browse the repository at this point in the history
…6470)"

This reverts commit 13bbe32.

Signed-off-by: tonyxuqqi <tonyxuqi@outlook.com>
  • Loading branch information
tonyxuqqi committed Feb 1, 2024
1 parent 13bbe32 commit c57eb86
Showing 1 changed file with 21 additions and 48 deletions.
69 changes: 21 additions & 48 deletions components/raftstore/src/store/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,13 @@ pub fn check_conf_change(
return Err(box_err!("multiple changes that only effect learner"));
}

check_remove_or_demote_voter(region, cfg, change_peers, leader.get_id(), peer_heartbeat)?;
check_remove_or_demote_voter(
region.get_id(),
cfg,
change_peers,
leader.get_id(),
peer_heartbeat,
)?;
if !ignore_safety {
let promoted_commit_index = after_progress.maximal_committed_index().0;
let first_index = node.raft.raft_log.first_index();
Expand Down Expand Up @@ -1021,7 +1027,7 @@ pub fn check_conf_change(
}

fn check_remove_or_demote_voter(
region: &metapb::Region,
region_id: u64,
cfg: &Config,
change_peers: &[ChangePeerRequest],
leader_id: u64,
Expand All @@ -1030,24 +1036,16 @@ fn check_remove_or_demote_voter(
let mut slow_peer_count = 0;
let mut normal_peer_count = 0;
// Here we assume if the last beartbeat is within 2 election timeout, the peer
// is healthy. When a region is hibernate, we expect all its peers are *slow*
// and it would still allow the operation
// is healthy. This is to be tolerant to some slightly slow peers when
// the leader is in hibernate mode.
let slow_peer_threshold =
2 * cfg.raft_base_tick_interval.0 * cfg.raft_max_election_timeout_ticks as u32;
for (id, last_heartbeat) in peer_heartbeat {
// for slow and normal peer calculation, we only count voter role
if region
.get_peers()
.iter()
.find(|p| p.get_id() == *id)
.map_or(false, |p| p.role == PeerRole::Voter)
{
// leader itself is not a slow peer
if *id == leader_id || last_heartbeat.elapsed() <= slow_peer_threshold {
normal_peer_count += 1;
} else {
slow_peer_count += 1;
}
// leader itself is not a slow peer
if *id == leader_id || last_heartbeat.elapsed() <= slow_peer_threshold {
normal_peer_count += 1;
} else {
slow_peer_count += 1;
}
}

Expand All @@ -1057,16 +1055,10 @@ fn check_remove_or_demote_voter(
if change_type == ConfChangeType::RemoveNode
|| change_type == ConfChangeType::AddLearnerNode
{
let is_voter = region
.get_peers()
.iter()
.find(|p| p.get_id() == peer.get_id())
.map_or(false, |p| p.role == PeerRole::Voter);

// If the change_type is AddLearnerNode and the last heartbeat is found, it
// means it's a demote from voter as AddLearnerNode on existing learner node is
// not allowed.
if is_voter && let Some(last_heartbeat) = peer_heartbeat.get(&peer.get_id()) {
if let Some(last_heartbeat) = peer_heartbeat.get(&peer.get_id()) {
// peer itself is *not* slow peer, but current slow peer is >= total peers/2
if last_heartbeat.elapsed() <= slow_peer_threshold {
normal_peer_count -= 1;
Expand All @@ -1087,7 +1079,7 @@ fn check_remove_or_demote_voter(
{
return Err(box_err!(
"Ignore conf change command on region {} because RemoveNode or Demote a voter on peers {:?} may lead to unavailability. There're {} slow peers and {} normal peers",
region.get_id(),
region_id,
&normal_peers_to_remove,
slow_peer_count,
normal_peer_count
Expand Down Expand Up @@ -2379,13 +2371,6 @@ mod tests {
},
];

let mut region = Region::default();
for i in 1..4 {
region.mut_peers().push(metapb::Peer {
id: i,
..Default::default()
});
}
for i in 0..change_peers.len() {
// Call the function under test and assert that the function returns failed
let mut cp = vec![change_peers[i].clone()];
Expand All @@ -2403,15 +2388,15 @@ mod tests {
std::time::Instant::now() - std::time::Duration::from_secs(1),
);
// Call the function under test and assert that the function returns Ok
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();
check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat).unwrap();

// now make one peer slow
if let Some(peer_heartbeat) = peer_heartbeat.get_mut(&3) {
*peer_heartbeat = std::time::Instant::now() - std::time::Duration::from_secs(100);
}

// Call the function under test
let result = check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat);
let result = check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat);
// Assert that the function returns failed
assert!(result.is_err());

Expand All @@ -2422,19 +2407,7 @@ mod tests {
})
.into();
// Call the function under test
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();

// make peer to learner and remove the peer 2
region.mut_peers()[1].set_role(metapb::PeerRole::Learner);
cp[0].peer = Some(metapb::Peer {
id: 2,
..Default::default()
})
.into();
// Call the function under test
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();
// set peer 2 voter again
region.mut_peers()[1].set_role(metapb::PeerRole::Voter);
check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat).unwrap();

// there's no remove node, it's fine with slow peers.
cp[0] = ChangePeerRequest {
Expand All @@ -2447,7 +2420,7 @@ mod tests {
..Default::default()
};
// Call the function under test
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();
check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat).unwrap();
}
}
}

0 comments on commit c57eb86

Please sign in to comment.