Skip to content

Commit

Permalink
improve the remove peer check. Only check when the updating role is v…
Browse files Browse the repository at this point in the history
…oter

Signed-off-by: tonyxuqqi <tonyxuqi@outlook.com>
  • Loading branch information
tonyxuqqi authored and ti-chi-bot committed Jan 31, 2024
1 parent cde0928 commit ad096d3
Showing 1 changed file with 48 additions and 21 deletions.
69 changes: 48 additions & 21 deletions components/raftstore/src/store/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -993,13 +993,7 @@ pub fn check_conf_change(
return Err(box_err!("multiple changes that only effect learner"));
}

check_remove_or_demote_voter(
region.get_id(),
cfg,
change_peers,
leader.get_id(),
peer_heartbeat,
)?;
check_remove_or_demote_voter(region, cfg, change_peers, leader.get_id(), peer_heartbeat)?;
if !ignore_safety {
let promoted_commit_index = after_progress.maximal_committed_index().0;
let first_index = node.raft.raft_log.first_index();
Expand Down Expand Up @@ -1027,7 +1021,7 @@ pub fn check_conf_change(
}

fn check_remove_or_demote_voter(
region_id: u64,
region: &metapb::Region,
cfg: &Config,
change_peers: &[ChangePeerRequest],
leader_id: u64,
Expand All @@ -1036,16 +1030,24 @@ fn check_remove_or_demote_voter(
let mut slow_peer_count = 0;
let mut normal_peer_count = 0;
// Here we assume if the last beartbeat is within 2 election timeout, the peer
// is healthy. This is to be tolerant to some slightly slow peers when
// the leader is in hibernate mode.
// is healthy. When a region is hibernate, we expect all its peers are *slow*
// and it would still allow the operation
let slow_peer_threshold =
2 * cfg.raft_base_tick_interval.0 * cfg.raft_max_election_timeout_ticks as u32;
for (id, last_heartbeat) in peer_heartbeat {
// leader itself is not a slow peer
if *id == leader_id || last_heartbeat.elapsed() <= slow_peer_threshold {
normal_peer_count += 1;
} else {
slow_peer_count += 1;
// for slow and normal peer calculation, we only count voter role
if region
.get_peers()
.iter()
.find(|p| p.get_id() == *id)
.map_or(false, |p| p.role == PeerRole::Voter)
{
// leader itself is not a slow peer
if *id == leader_id || last_heartbeat.elapsed() <= slow_peer_threshold {
normal_peer_count += 1;
} else {
slow_peer_count += 1;
}
}
}

Expand All @@ -1055,10 +1057,16 @@ fn check_remove_or_demote_voter(
if change_type == ConfChangeType::RemoveNode
|| change_type == ConfChangeType::AddLearnerNode
{
let is_voter = region
.get_peers()
.iter()
.find(|p| p.get_id() == peer.get_id())
.map_or(false, |p| p.role == PeerRole::Voter);

// If the change_type is AddLearnerNode and the last heartbeat is found, it
// means it's a demote from voter as AddLearnerNode on existing learner node is
// not allowed.
if let Some(last_heartbeat) = peer_heartbeat.get(&peer.get_id()) {
if is_voter && let Some(last_heartbeat) = peer_heartbeat.get(&peer.get_id()) {
// peer itself is *not* slow peer, but current slow peer is >= total peers/2
if last_heartbeat.elapsed() <= slow_peer_threshold {
normal_peer_count -= 1;
Expand All @@ -1079,7 +1087,7 @@ fn check_remove_or_demote_voter(
{
return Err(box_err!(
"Ignore conf change command on region {} because RemoveNode or Demote a voter on peers {:?} may lead to unavailability. There're {} slow peers and {} normal peers",
region_id,
region.get_id(),
&normal_peers_to_remove,
slow_peer_count,
normal_peer_count
Expand Down Expand Up @@ -2371,6 +2379,13 @@ mod tests {
},
];

let mut region = Region::default();
for i in 1..4 {
region.mut_peers().push(metapb::Peer {
id: i,
..Default::default()
});
}
for i in 0..change_peers.len() {
// Call the function under test and assert that the function returns failed
let mut cp = vec![change_peers[i].clone()];
Expand All @@ -2388,15 +2403,15 @@ mod tests {
std::time::Instant::now() - std::time::Duration::from_secs(1),
);
// Call the function under test and assert that the function returns Ok
check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat).unwrap();
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();

// now make one peer slow
if let Some(peer_heartbeat) = peer_heartbeat.get_mut(&3) {
*peer_heartbeat = std::time::Instant::now() - std::time::Duration::from_secs(100);
}

// Call the function under test
let result = check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat);
let result = check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat);
// Assert that the function returns failed
assert!(result.is_err());

Expand All @@ -2407,7 +2422,19 @@ mod tests {
})
.into();
// Call the function under test
check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat).unwrap();
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();

// make peer to learner and remove the peer 2
region.mut_peers()[1].set_role(metapb::PeerRole::Learner);
cp[0].peer = Some(metapb::Peer {
id: 2,
..Default::default()
})
.into();
// Call the function under test
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();
// set peer 2 voter again
region.mut_peers()[1].set_role(metapb::PeerRole::Voter);

// there's no remove node, it's fine with slow peers.
cp[0] = ChangePeerRequest {
Expand All @@ -2420,7 +2447,7 @@ mod tests {
..Default::default()
};
// Call the function under test
check_remove_or_demote_voter(1, &cfg, &cp, 1, &peer_heartbeat).unwrap();
check_remove_or_demote_voter(&region, &cfg, &cp, 1, &peer_heartbeat).unwrap();
}
}
}

0 comments on commit ad096d3

Please sign in to comment.