Skip to content

Commit

Permalink
raftstore-v2: optimize slowness detection in the I/O hang scenario (
Browse files Browse the repository at this point in the history
#15071)

ref #15070

This pr includes:
* Polish the comments in `raftstore-v2/src/worker/pd/slowness.rs`.
* Optimize slowness detection and the statistics-reporting interval for the I/O hang case.
  • Loading branch information
LykxSassinator committed Jul 10, 2023
1 parent 993eb2f commit 20afd17
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 20 deletions.
44 changes: 25 additions & 19 deletions components/raftstore-v2/src/worker/pd/slowness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,30 +82,36 @@ where
}

/// Periodic slowness-statistics tick.
///
/// Post-change behavior as shown in this diff: if the previous tick did not
/// finish (or the `mock_slowness_last_tick_unfinished` fail point fires), a
/// sample equal to the store heartbeat interval (in microseconds) is recorded
/// into `slow_cause`, and a fake store heartbeat is sent when
/// `is_store_heartbeat_delayed()` is true. Otherwise a 100ms "white noise"
/// sample is recorded. In both cases `last_tick_finished` is reset to `false`.
///
/// NOTE(review): this block is a rendered diff without `+`/`-` markers, so
/// removed and added lines are interleaved and the braces do not balance as
/// shown. The notes below mark which side each run of lines appears to be on.
pub fn handle_slowness_stats_tick(&mut self) {
// NOTE(review): appears to be pre-change code — the 100ms sample was recorded
// unconditionally on every tick, before the timeout check.
// The following code records a periodic "white noise", which helps
// mitigate any minor fluctuations in disk I/O or network I/O latency.
// After conducting extensive e2e testing, "100ms" has been determined
// to be the most suitable choice for it.
self.slowness_stats
.slow_cause
.record(100_000, Instant::now()); // 100ms

// Handle timeout if last tick is not finished as expected.
// Fail-point hook: yields `true` when the `mock_slowness_last_tick_unfinished`
// fail point is triggered, and `false` otherwise.
let mock_slowness_last_tick_unfinished = || {
fail_point!("mock_slowness_last_tick_unfinished", |_| { true });
false
};
// NOTE(review): appears to be the pre-change condition — the timeout path was
// only taken when the store heartbeat was also delayed.
if mock_slowness_last_tick_unfinished()
|| (!self.slowness_stats.last_tick_finished && self.is_store_heartbeat_delayed())
{
// If the last slowness tick already reached abnormal state and was delayed for
// reporting by `store-heartbeat` to PD, we should report it here manually as a
// FAKE `store-heartbeat`. It's an assurance that the heartbeat to
// PD is not lost. Normally, this case rarely happens in
// raftstore-v2.
self.handle_fake_store_heartbeat();
// NOTE(review): the post-change code appears to start here.
// Handle timeout if the last tick is not finished as expected.
if mock_slowness_last_tick_unfinished() || !self.slowness_stats.last_tick_finished {
// Record a sufficiently large interval to indicate potential write progress
// hanging on I/O. We use the store heartbeat interval as the default value.
// The value is in microseconds, matching the `100_000` (100ms) sample in the
// `else` branch. NOTE(review): if this is a std `Duration`, `as_micros()`
// returns `u128` and the `as u64` cast truncates silently — confirm the
// interval can never be large enough for that to matter.
self.slowness_stats.slow_cause.record(
self.store_heartbeat_interval.as_micros() as u64,
Instant::now(),
);

// If the last slowness tick already reached an abnormal state and was delayed
// for reporting by `store-heartbeat` to PD, we should manually report it here
// as a FAKE `store-heartbeat`. This ensures that the heartbeat to PD is not
// lost. Normally, this case rarely happens in raftstore-v2.
if self.is_store_heartbeat_delayed() {
self.handle_fake_store_heartbeat();
}
} else {
// The following code records a periodic "white noise", which helps mitigate any
// minor fluctuations in disk I/O or network I/O latency. After
// extensive e2e testing, a duration of "100ms" has been determined
// to be the most suitable choice.
self.slowness_stats
.slow_cause
.record(100_000, Instant::now()); // 100ms
}
// Move to next tick.
// Move to the next tick.
// Resetting the flag here means the next tick takes the timeout branch unless
// something sets `last_tick_finished` back to `true` in between (that setter
// is not visible in this view).
self.slowness_stats.last_tick_finished = false;
}

Expand Down
4 changes: 3 additions & 1 deletion components/raftstore-v2/src/worker/pd/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,10 @@ where
/// Returns whether the store heartbeat is considered delayed.
///
/// Post-change behavior as shown in this diff: `true` only when the number of
/// seconds since `store_stat.last_report_ts` is at least one heartbeat
/// interval, at most `STORE_HEARTBEAT_DELAY_LIMIT`, and an exact multiple of
/// the heartbeat interval.
///
/// NOTE(review): this block is a rendered diff without `+`/`-` markers; the
/// header reports 3 additions and 1 deletion for this file.
pub fn is_store_heartbeat_delayed(&self) -> bool {
let now = UnixSecs::now();
// Seconds between now and `store_stat.last_report_ts`.
// NOTE(review): if `into_inner()` yields an unsigned integer, this subtraction
// would underflow should `last_report_ts` ever be ahead of `now` — confirm.
let interval_second = now.into_inner() - self.store_stat.last_report_ts.into_inner();
// NOTE(review): the next line appears to be the removed pre-change condition,
// superseded by the clamped comparison below.
(interval_second >= self.store_heartbeat_interval.as_secs())
// Clamp to at least 1 so the modulo below can never divide by zero when the
// configured interval is shorter than one second.
let store_heartbeat_interval = std::cmp::max(self.store_heartbeat_interval.as_secs(), 1);
(interval_second >= store_heartbeat_interval)
&& (interval_second <= STORE_HEARTBEAT_DELAY_LIMIT)
// Only true on exact multiples of the interval, so callers polling more often
// than the heartbeat interval do not see `true` on every poll.
&& (interval_second % store_heartbeat_interval == 0)
}

pub fn handle_inspect_latency(&self, send_time: TiInstant, inspector: LatencyInspector) {
Expand Down

0 comments on commit 20afd17

Please sign in to comment.