diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index d6e6dc265bc..aa23f22bc2c 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -415,6 +415,7 @@ pub struct RaftMetrics { pub wf_persist_log: LocalHistogram, pub wf_commit_log: LocalHistogram, pub wf_commit_not_persist_log: LocalHistogram, + pub proposal_send_wait: LocalHistogram, pub raft_log_gc_skipped: RaftLogGcSkippedMetrics, } @@ -439,6 +440,7 @@ impl RaftMetrics { wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), + proposal_send_wait: PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM.local(), raft_log_gc_skipped: RaftLogGcSkippedMetrics::default(), } } @@ -461,6 +463,7 @@ impl RaftMetrics { self.wf_persist_log.flush(); self.wf_commit_log.flush(); self.wf_commit_not_persist_log.flush(); + self.proposal_send_wait.flush(); } let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 3a4426fcbcb..e3d3a23e389 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -360,6 +360,12 @@ lazy_static! { "Bucketed histogram of proposals' commit but not persist duration", exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_proposal_send_wait_duration_seconds", + "Bucketed histogram of proposals' send wait duration", + exponential_buckets(1e-6, 2.0, 26).unwrap() + ).unwrap(); pub static ref PEER_PROPOSAL_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 73e1a6ecb50..ad63e3b1b34 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -725,6 +725,9 @@ where #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, + /// Record the propose instants to calculate the wait duration before + /// the proposal is sent through the Raft client. + pending_propose_instants: VecDeque<(u64, Instant)>, /// If it fails to send messages to leader. pub leader_unreachable: bool, @@ -925,6 +928,7 @@ where raft_max_inflight_msgs: cfg.raft_max_inflight_msgs, proposals: ProposalQueue::new(tag.clone()), pending_reads: Default::default(), + pending_propose_instants: Default::default(), peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), peers_start_pending_time: vec![], @@ -1571,6 +1575,7 @@ where ctx: &mut PollContext, msgs: Vec, ) { + let now = Instant::now(); for msg in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgTimeoutNow && self.is_leader() { @@ -1596,6 +1601,26 @@ where "disk_usage" => ?msg.get_disk_usage(), ); + for index in msg + .get_message() + .get_entries() + .iter() + .map(|e| e.get_index()) + { + while let Some((propose_idx, instant)) = self.pending_propose_instants.front() { + if index == *propose_idx { + ctx.raft_metrics + .proposal_send_wait + .observe(now.saturating_duration_since(*instant).as_secs_f64()); + } + if index >= *propose_idx { + self.pending_propose_instants.pop_front(); + } else { + break; + } + } + } + if let Err(e) = ctx.trans.send(msg) { // We use metrics to observe failure on production. debug!( @@ -2048,6 +2073,7 @@ where self.mut_store().cancel_generating_snap(None); self.clear_disk_full_peers(ctx); self.clear_in_memory_pessimistic_locks(); + self.pending_propose_instants.clear(); } _ => {} } @@ -4270,6 +4296,9 @@ where } } + self.pending_propose_instants + .push_back((propose_index, Instant::now())); + Ok(Either::Left(propose_index)) } diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 15dfa8c684b..46d72775cb6 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -13588,6 +13588,186 @@ "yBucketNumber": null, "yBucketSize": null }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 47 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572784, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Proposal send wait duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 47 + }, + "hiddenSeries": false, + "id": 23763572783, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "99% Proposal send wait duration per server", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:106", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:107", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -13605,7 +13785,7 @@ "h": 7, "w": 12, "x": 0, - "y": 46 + "y": 54 }, "hiddenSeries": false, "id": 1975, @@ -13708,7 +13888,7 @@ "h": 7, "w": 12, "x": 12, - "y": 46 + "y": 54 }, "hiddenSeries": false, "id": 1976,