Skip to content

Commit

Permalink
raftstore: record metrics of proposal send wait duration (#12837)
Browse files Browse the repository at this point in the history
ref #12362

Our raft implementation has its flow control mechanism to limit the inflight
message number. But we're not able to know whether and when we are
throttled. Then, it's hard for us to know whether we should adjust the
max-inflight-msgs config.

This commit tries to improve this case. It's complex to add hooks to raft-rs
to know how long a message is throttled, but we can do it in the raftstore.
We record the propose time, and consume it and record it in the histogram
when the messages is send through the transport. If flow control takes effect,
the ready will be smaller than all under-replicated logs. So, we can know if
flow control takes effect.

Signed-off-by: Yilin Chen <sticnarf@gmail.com>
  • Loading branch information
sticnarf committed Jun 20, 2022
1 parent cead3f5 commit 6bc2492
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 2 deletions.
3 changes: 3 additions & 0 deletions components/raftstore/src/store/local_metrics.rs
Expand Up @@ -415,6 +415,7 @@ pub struct RaftMetrics {
pub wf_persist_log: LocalHistogram,
pub wf_commit_log: LocalHistogram,
pub wf_commit_not_persist_log: LocalHistogram,
pub proposal_send_wait: LocalHistogram,
pub raft_log_gc_skipped: RaftLogGcSkippedMetrics,
}

Expand All @@ -439,6 +440,7 @@ impl RaftMetrics {
wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(),
wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(),
wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(),
proposal_send_wait: PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM.local(),
raft_log_gc_skipped: RaftLogGcSkippedMetrics::default(),
}
}
Expand All @@ -461,6 +463,7 @@ impl RaftMetrics {
self.wf_persist_log.flush();
self.wf_commit_log.flush();
self.wf_commit_not_persist_log.flush();
self.proposal_send_wait.flush();
}
let mut missing = self.leader_missing.lock().unwrap();
LEADER_MISSING.set(missing.len() as i64);
Expand Down
6 changes: 6 additions & 0 deletions components/raftstore/src/store/metrics.rs
Expand Up @@ -360,6 +360,12 @@ lazy_static! {
"Bucketed histogram of proposals' commit but not persist duration",
exponential_buckets(0.00001, 2.0, 26).unwrap()
).unwrap();
pub static ref PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM: Histogram =
register_histogram!(
"tikv_raftstore_proposal_send_wait_duration_seconds",
"Bucketed histogram of proposals' send wait duration",
exponential_buckets(1e-6, 2.0, 26).unwrap()
).unwrap();

pub static ref PEER_PROPOSAL_COUNTER_VEC: IntCounterVec =
register_int_counter_vec!(
Expand Down
29 changes: 29 additions & 0 deletions components/raftstore/src/store/peer.rs
Expand Up @@ -725,6 +725,9 @@ where
#[getset(get = "pub")]
leader_lease: Lease,
pending_reads: ReadIndexQueue<EK::Snapshot>,
/// Record the propose instants to calculate the wait duration before
/// the proposal is sent through the Raft client.
pending_propose_instants: VecDeque<(u64, Instant)>,

/// If it fails to send messages to leader.
pub leader_unreachable: bool,
Expand Down Expand Up @@ -925,6 +928,7 @@ where
raft_max_inflight_msgs: cfg.raft_max_inflight_msgs,
proposals: ProposalQueue::new(tag.clone()),
pending_reads: Default::default(),
pending_propose_instants: Default::default(),
peer_cache: RefCell::new(HashMap::default()),
peer_heartbeats: HashMap::default(),
peers_start_pending_time: vec![],
Expand Down Expand Up @@ -1571,6 +1575,7 @@ where
ctx: &mut PollContext<EK, ER, T>,
msgs: Vec<RaftMessage>,
) {
let now = Instant::now();
for msg in msgs {
let msg_type = msg.get_message().get_msg_type();
if msg_type == MessageType::MsgTimeoutNow && self.is_leader() {
Expand All @@ -1596,6 +1601,26 @@ where
"disk_usage" => ?msg.get_disk_usage(),
);

for index in msg
.get_message()
.get_entries()
.iter()
.map(|e| e.get_index())
{
while let Some((propose_idx, instant)) = self.pending_propose_instants.front() {
if index == *propose_idx {
ctx.raft_metrics
.proposal_send_wait
.observe(now.saturating_duration_since(*instant).as_secs_f64());
}
if index >= *propose_idx {
self.pending_propose_instants.pop_front();
} else {
break;
}
}
}

if let Err(e) = ctx.trans.send(msg) {
// We use metrics to observe failure on production.
debug!(
Expand Down Expand Up @@ -2048,6 +2073,7 @@ where
self.mut_store().cancel_generating_snap(None);
self.clear_disk_full_peers(ctx);
self.clear_in_memory_pessimistic_locks();
self.pending_propose_instants.clear();
}
_ => {}
}
Expand Down Expand Up @@ -4270,6 +4296,9 @@ where
}
}

self.pending_propose_instants
.push_back((propose_index, Instant::now()));

Ok(Either::Left(propose_index))
}

Expand Down
184 changes: 182 additions & 2 deletions metrics/grafana/tikv_details.json
Expand Up @@ -13588,6 +13588,186 @@
"yBucketNumber": null,
"yBucketSize": null
},
{
"cards": {
"cardPadding": null,
"cardRound": null
},
"color": {
"cardColor": "#b4ff00",
"colorScale": "sqrt",
"colorScheme": "interpolateSpectral",
"exponent": 0.5,
"mode": "spectrum"
},
"dataFormat": "tsbuckets",
"datasource": "${DS_TEST-CLUSTER}",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 47
},
"heatmap": {},
"hideZeroBuckets": true,
"highlightCards": true,
"id": 23763572784,
"legend": {
"show": false
},
"links": [],
"reverseYBuckets": false,
"targets": [
{
"exemplar": true,
"expr": "sum(delta(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)",
"format": "heatmap",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{le}}",
"metric": "tikv_raftstore_request_wait_time_duration_secs_bucket",
"refId": "A",
"step": 4
}
],
"timeFrom": null,
"timeShift": null,
"title": "Proposal send wait duration",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"xBucketNumber": null,
"xBucketSize": null,
"yAxis": {
"decimals": 0,
"format": "s",
"logBase": 1,
"max": null,
"min": null,
"show": true,
"splitFactor": null
},
"yBucketBound": "upper",
"yBucketNumber": null,
"yBucketSize": null
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 47
},
"hiddenSeries": false,
"id": 23763572783,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.11",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "99% Proposal send wait duration per server",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:106",
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:107",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
Expand All @@ -13605,7 +13785,7 @@
"h": 7,
"w": 12,
"x": 0,
"y": 46
"y": 54
},
"hiddenSeries": false,
"id": 1975,
Expand Down Expand Up @@ -13708,7 +13888,7 @@
"h": 7,
"w": 12,
"x": 12,
"y": 46
"y": 54
},
"hiddenSeries": false,
"id": 1976,
Expand Down

0 comments on commit 6bc2492

Please sign in to comment.