Skip to content

Commit

Permalink
server, metrics: let server TSO handle duration including failed requests (#8282)
Browse files Browse the repository at this point in the history

ref #8281

- Remove the 99.999% percentile series because it has no counterpart on the TiDB side and lies so far in the tail that it can easily mislead.
- Emphasize PD server/client in the panel titles.
- Add the corresponding 90%/99%/99.9% percentile series to the client handle duration panel for easier comparison.
- The PD server TSO handle duration now includes failed requests, so TSO HA anomalies are directly reflected in the monitoring data.

Signed-off-by: JmPotato <ghzpotato@gmail.com>
  • Loading branch information
JmPotato authored Jun 13, 2024
1 parent c75e98b commit e52f5be
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 25 deletions.
50 changes: 26 additions & 24 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -10633,20 +10633,13 @@
"refId": "C",
"step": 2
},
{
"expr": "histogram_quantile(0.99999, sum(rate(pd_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "99.999% tso",
"refId": "D"
},
{
"expr": "histogram_quantile(0.90, sum(rate(tso_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
"legendFormat": "90% tso",
"refId": "E",
"refId": "D",
"step": 2
},
{
Expand All @@ -10655,7 +10648,7 @@
"hide": false,
"intervalFactor": 2,
"legendFormat": "99% tso",
"refId": "F",
"refId": "E",
"step": 2
},
{
Expand All @@ -10664,22 +10657,15 @@
"hide": false,
"intervalFactor": 2,
"legendFormat": "99.9% tso",
"refId": "G",
"refId": "F",
"step": 2
},
{
"expr": "histogram_quantile(0.99999, sum(rate(tso_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "99.999% tso",
"refId": "H"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "PD server TSO handle time",
"title": "PD server TSO handle duration",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -10766,26 +10752,42 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.98, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"hide": false,
"expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type)",
"intervalFactor": 2,
"legendFormat": "{{type}} 98th percentile",
"legendFormat": "avg {{type}}",
"refId": "A",
"step": 2
},
{
"expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type)",
"expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{type}} average",
"legendFormat": "90% {{type}}",
"refId": "B",
"step": 2
},
{
"expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "99% {{type}}",
"refId": "C",
"step": 2
},
{
"expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "99.9% {{type}}",
"refId": "D",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Handle requests duration",
"title": "PD client requests handle duration",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down
2 changes: 1 addition & 1 deletion server/grpc_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,10 +578,10 @@ func (s *GrpcServer) Tso(stream pdpb.PD_TsoServer) error {
ctx, task := trace.NewTask(ctx, "tso")
ts, err := s.tsoAllocatorManager.HandleRequest(ctx, request.GetDcLocation(), count)
task.End()
tsoHandleDuration.Observe(time.Since(start).Seconds())
if err != nil {
return status.Errorf(codes.Unknown, err.Error())
}
tsoHandleDuration.Observe(time.Since(start).Seconds())
response := &pdpb.TsoResponse{
Header: s.header(),
Timestamp: &ts,
Expand Down

0 comments on commit e52f5be

Please sign in to comment.