Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mixins: Add code/grpc-code dimension to error widgets #6231

Merged
merged 12 commits into from Mar 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -32,6 +32,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#6212](https://github.com/thanos-io/thanos/pull/6212) Query-Frontend: Disable scalar for vertical sharding.
- [#6107](https://github.com/thanos-io/thanos/pull/6107) Change default user id in container image from 0(root) to 1001
- [#6228](https://github.com/thanos-io/thanos/pull/6228) Conditionally generate debug messages in ProxyStore to avoid memory bloat.
- [#6231](https://github.com/thanos-io/thanos/pull/6231) mixins: Add code/grpc-code dimension to error widgets.

### Removed

Expand Down
15 changes: 5 additions & 10 deletions examples/dashboards/overview.json
Expand Up @@ -161,10 +161,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -466,10 +465,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -823,10 +821,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1180,10 +1177,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1485,10 +1481,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
3 changes: 1 addition & 2 deletions examples/dashboards/query-frontend.json
Expand Up @@ -242,10 +242,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
12 changes: 4 additions & 8 deletions examples/dashboards/query.json
Expand Up @@ -145,10 +145,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -450,10 +449,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -807,10 +805,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1164,10 +1161,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
12 changes: 4 additions & 8 deletions examples/dashboards/receive.json
Expand Up @@ -145,10 +145,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\"}[$interval]))",
"expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1632,10 +1631,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1989,10 +1987,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -2346,10 +2343,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
6 changes: 2 additions & 4 deletions examples/dashboards/rule.json
Expand Up @@ -966,10 +966,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -1323,10 +1322,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
6 changes: 2 additions & 4 deletions examples/dashboards/sidecar.json
Expand Up @@ -197,10 +197,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -553,10 +552,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
6 changes: 2 additions & 4 deletions examples/dashboards/store.json
Expand Up @@ -197,10 +197,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down Expand Up @@ -554,10 +553,9 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",
"step": 10
}
],
Expand Down
18 changes: 18 additions & 0 deletions mixin/lib/thanos-grafana-builder/builder.libsonnet
Expand Up @@ -78,6 +78,24 @@ local utils = import '../utils.libsonnet';
],
},

qpsErrTotalPerLabelPanel(selectorErr, selectorTotal, dimensions, perLabel):: {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documenting a decision: I created a separate qpsErrTotalPerLabelPanel function because qpsErrTotalPanel is used in many places for generic error charts (e.g. in the Compactor and Ruler dashboards).

As future work we could investigate whether those widgets have some sort of code/reason/message field that is worth surfacing in the widget's legend.

local errExpr = 'sum by (%s, %s) (rate(%s[$interval]))' % [dimensions, perLabel, selectorErr],
local totalExpr = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selectorTotal],

aliasColors: {
'error': '#E24D42',
},
targets: [
{
expr: '%s / ignoring (%s) group_left() %s' % [errExpr, perLabel, totalExpr],
format: 'time_series',
intervalFactor: 2,
step: 10,
},
],
yaxes: $.yaxes({ format: 'percentunit' }),
} + $.stack,

qpsErrTotalPanel(selectorErr, selectorTotal, dimensions):: {
local expr(selector) = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selector],

Expand Down
5 changes: 3 additions & 2 deletions mixin/lib/thanos-grafana-builder/grpc.libsonnet
Expand Up @@ -37,9 +37,10 @@ local utils = import '../utils.libsonnet';
} + $.stack,

grpcErrorsPanel(metric, selector, dimensions)::
$.qpsErrTotalPanel(
$.qpsErrTotalPerLabelPanel(
'%s{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss",%s}' % [metric, selector],
'%s{%s}' % [metric, selector],
dimensions
dimensions,
'grpc_code',
),
}
5 changes: 3 additions & 2 deletions mixin/lib/thanos-grafana-builder/http.libsonnet
Expand Up @@ -25,9 +25,10 @@ local utils = import '../utils.libsonnet';
} + $.stack,

httpErrPanel(metric, selector, dimensions)::
$.qpsErrTotalPanel(
$.qpsErrTotalPerLabelPanel(
'%s{%s,code=~"5.."}' % [metric, selector],
'%s{%s}' % [metric, selector],
dimensions
dimensions,
'code',
),
}