Skip to content

Commit

Permalink
Merge pull request #6231 from douglascamata/improve-error-dashboards
Browse files Browse the repository at this point in the history
mixins: Add code/grpc-code dimension to error widgets
  • Loading branch information
fpetkovski committed Mar 31, 2023
2 parents 15dcfdb + 19d14c3 commit b5560c1
Show file tree
Hide file tree
Showing 11 changed files with 45 additions and 44 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -32,6 +32,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#6212](https://github.com/thanos-io/thanos/pull/6212) Query-Frontend: Disable scalar for vertical sharding.
- [#6107](https://github.com/thanos-io/thanos/pull/6107) Change default user id in container image from 0(root) to 1001
- [#6228](https://github.com/thanos-io/thanos/pull/6228) Conditionally generate debug messages in ProxyStore to avoid memory bloat.
- [#6231](https://github.com/thanos-io/thanos/pull/6231) mixins: Add code/grpc-code dimension to error widgets.

### Removed

Expand Down
15 changes: 5 additions & 10 deletions examples/dashboards/overview.json
Expand Up @@ -161,10 +161,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -466,10 +465,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -823,10 +821,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1180,10 +1177,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1485,10 +1481,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
3 changes: 1 addition & 2 deletions examples/dashboards/query-frontend.json
Expand Up @@ -242,10 +242,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
12 changes: 4 additions & 8 deletions examples/dashboards/query.json
Expand Up @@ -145,10 +145,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -450,10 +449,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -807,10 +805,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1164,10 +1161,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
12 changes: 4 additions & 8 deletions examples/dashboards/receive.json
Expand Up @@ -145,10 +145,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1632,10 +1631,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1989,10 +1987,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -2346,10 +2343,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
6 changes: 2 additions & 4 deletions examples/dashboards/rule.json
Expand Up @@ -966,10 +966,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1323,10 +1322,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
6 changes: 2 additions & 4 deletions examples/dashboards/sidecar.json
Expand Up @@ -197,10 +197,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -553,10 +552,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
6 changes: 2 additions & 4 deletions examples/dashboards/store.json
Expand Up @@ -197,10 +197,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -554,10 +553,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
18 changes: 18 additions & 0 deletions mixin/lib/thanos-grafana-builder/builder.libsonnet
Expand Up @@ -78,6 +78,24 @@ local utils = import '../utils.libsonnet';
],
},

// Panel fields for an error-ratio graph that is split by one extra label.
//
// selectorErr:   metric selector matching only the failed requests.
// selectorTotal: metric selector matching all requests.
// dimensions:    comma-separated labels that both sides are grouped by.
// perLabel:      additional label (e.g. 'code' or 'grpc_code') kept on the
//                error side so that one series is produced per label value.
//
// The resulting query divides each per-label error rate by the total rate of
// the same `dimensions`; `ignoring (perLabel) group_left()` lets the many
// error series match the single total series. The y-axis is built with the
// 'percentunit' format, and the fields of `$.stack` are merged on top
// (NOTE(review): `$.stack` is defined outside this view).
qpsErrTotalPerLabelPanel(selectorErr, selectorTotal, dimensions, perLabel):: {
  // Both sides of the division share this template; only the grouping labels
  // and the selector differ.
  local sumRate(groupBy, selector) = std.format('sum by (%s) (rate(%s[$interval]))', [groupBy, selector]),
  local numerator = sumRate(std.format('%s, %s', [dimensions, perLabel]), selectorErr),
  local denominator = sumRate(dimensions, selectorTotal),
  local ratioTarget = {
    expr: std.format('%s / ignoring (%s) group_left() %s', [numerator, perLabel, denominator]),
    format: 'time_series',
    intervalFactor: 2,
    step: 10,
  },

  // Colour override applied to a series whose name is exactly 'error'.
  aliasColors: { 'error': '#E24D42' },
  targets: [ratioTarget],
  yaxes: $.yaxes({ format: 'percentunit' }),
} + $.stack,

qpsErrTotalPanel(selectorErr, selectorTotal, dimensions):: {
local expr(selector) = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selector],

Expand Down
5 changes: 3 additions & 2 deletions mixin/lib/thanos-grafana-builder/grpc.libsonnet
Expand Up @@ -37,9 +37,10 @@ local utils = import '../utils.libsonnet';
} + $.stack,

grpcErrorsPanel(metric, selector, dimensions)::
$.qpsErrTotalPanel(
$.qpsErrTotalPerLabelPanel(
'%s{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss",%s}' % [metric, selector],
'%s{%s}' % [metric, selector],
dimensions
dimensions,
'grpc_code',
),
}
5 changes: 3 additions & 2 deletions mixin/lib/thanos-grafana-builder/http.libsonnet
Expand Up @@ -25,9 +25,10 @@ local utils = import '../utils.libsonnet';
} + $.stack,

httpErrPanel(metric, selector, dimensions)::
$.qpsErrTotalPanel(
$.qpsErrTotalPerLabelPanel(
'%s{%s,code=~"5.."}' % [metric, selector],
'%s{%s}' % [metric, selector],
dimensions
dimensions,
'code',
),
}

0 comments on commit b5560c1

Please sign in to comment.