Skip to content

Commit

Permalink
Merge branch 'main' into improve-error-dashboards
Browse files Browse the repository at this point in the history
  • Loading branch information
douglascamata committed Mar 31, 2023
2 parents 9ec46ab + 15dcfdb commit 19d14c3
Show file tree
Hide file tree
Showing 15 changed files with 38 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -24,6 +24,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#6207](https://github.com/thanos-io/thanos/pull/6207) Receive: Remove the shipper once a tenant has been pruned.
- [#6216](https://github.com/thanos-io/thanos/pull/6216) Receiver: removed hard-coded value of EnableExemplarStorage flag and set it according to max-exemplar value.
- [#6222](https://github.com/thanos-io/thanos/pull/6222) mixin(Receive): Fix tenant series received charts.
- [#6218](https://github.com/thanos-io/thanos/pull/6218) mixin(Store): handle ResourceExhausted as a non-server error. As a consequence, this error won't contribute to Store's gRPC error alerts.

### Changed
- [#6168](https://github.com/thanos-io/thanos/pull/6168) Receiver: Make ketama hashring fail early when configured with number of nodes lower than the replication factor.
Expand Down
2 changes: 1 addition & 1 deletion cmd/thanos/config.go
Expand Up @@ -40,7 +40,7 @@ func (gc *grpcConfig) registerFlag(cmd extkingpin.FlagClause) *grpcConfig {
"TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert)").
Default("").StringVar(&gc.tlsSrvClientCA)
cmd.Flag("grpc-server-max-connection-age", "The grpc server max connection age. This controls how often to re-establish connections and redo TLS handshakes.").
Default("0s").DurationVar(&gc.maxConnectionAge)
Default("60m").DurationVar(&gc.maxConnectionAge)
cmd.Flag("grpc-grace-period",
"Time to wait after an interrupt received for GRPC Server.").
Default("2m").DurationVar(&gc.gracePeriod)
Expand Down
2 changes: 1 addition & 1 deletion docs/components/query.md
Expand Up @@ -317,7 +317,7 @@ Flags:
to other clients. Must be one of: snappy, none
--grpc-grace-period=2m Time to wait after an interrupt received for
GRPC Server.
--grpc-server-max-connection-age=0s
--grpc-server-max-connection-age=60m
The grpc server max connection age. This
controls how often to re-establish connections
and redo TLS handshakes.
Expand Down
2 changes: 1 addition & 1 deletion docs/components/receive.md
Expand Up @@ -221,7 +221,7 @@ Flags:
from other components.
--grpc-grace-period=2m Time to wait after an interrupt received for
GRPC Server.
--grpc-server-max-connection-age=0s
--grpc-server-max-connection-age=60m
The grpc server max connection age. This
controls how often to re-establish connections
and redo TLS handshakes.
Expand Down
2 changes: 1 addition & 1 deletion docs/components/rule.md
Expand Up @@ -323,7 +323,7 @@ Flags:
from other components.
--grpc-grace-period=2m Time to wait after an interrupt received for
GRPC Server.
--grpc-server-max-connection-age=0s
--grpc-server-max-connection-age=60m
The grpc server max connection age. This
controls how often to re-establish connections
and redo TLS handshakes.
Expand Down
2 changes: 1 addition & 1 deletion docs/components/sidecar.md
Expand Up @@ -82,7 +82,7 @@ Flags:
from other components.
--grpc-grace-period=2m Time to wait after an interrupt received for
GRPC Server.
--grpc-server-max-connection-age=0s
--grpc-server-max-connection-age=60m
The grpc server max connection age. This
controls how often to re-establish connections
and redo TLS handshakes.
Expand Down
2 changes: 1 addition & 1 deletion docs/components/store.md
Expand Up @@ -66,7 +66,7 @@ Flags:
from other components.
--grpc-grace-period=2m Time to wait after an interrupt received for
GRPC Server.
--grpc-server-max-connection-age=0s
--grpc-server-max-connection-age=60m
The grpc server max connection age. This
controls how often to re-establish connections
and redo TLS handshakes.
Expand Down
4 changes: 2 additions & 2 deletions examples/alerts/alerts.md
Expand Up @@ -220,10 +220,10 @@ rules:
annotations:
description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate
summary: Thanos Store is failing to handle qrpcd requests.
summary: Thanos Store is failing to handle gRPC requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
/
sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))
* 100 > 5
Expand Down
4 changes: 2 additions & 2 deletions examples/alerts/alerts.yaml
Expand Up @@ -304,10 +304,10 @@ groups:
annotations:
description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate
summary: Thanos Store is failing to handle qrpcd requests.
summary: Thanos Store is failing to handle gRPC requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
/
sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))
* 100 > 5
Expand Down
4 changes: 2 additions & 2 deletions examples/alerts/rules.yaml
Expand Up @@ -91,14 +91,14 @@ groups:
rules:
- expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="unary"}[5m]))
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="unary"}[5m]))
/
sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="unary"}[5m]))
)
record: :grpc_server_failures_per_unary:sum_rate
- expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
/
sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
)
Expand Down
4 changes: 2 additions & 2 deletions mixin/alerts/store.libsonnet
Expand Up @@ -19,11 +19,11 @@
alert: 'ThanosStoreGrpcErrorRate',
annotations: {
description: 'Thanos Store {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,
summary: 'Thanos Store is failing to handle qrpcd requests.',
summary: 'Thanos Store is failing to handle gRPC requests.',
},
expr: |||
(
sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
/
sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
* 100 > %(grpcErrorThreshold)s
Expand Down
4 changes: 2 additions & 2 deletions mixin/rules/store.libsonnet
Expand Up @@ -13,7 +13,7 @@
record: ':grpc_server_failures_per_unary:sum_rate',
expr: |||
(
sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m]))
sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m]))
/
sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m]))
)
Expand All @@ -23,7 +23,7 @@
record: ':grpc_server_failures_per_stream:sum_rate',
expr: |||
(
sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m]))
sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m]))
/
sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m]))
)
Expand Down
2 changes: 1 addition & 1 deletion mixin/runbook.md
Expand Up @@ -91,7 +91,7 @@

|Name|Summary|Description|Severity|Runbook|
|---|---|---|---|---|
|ThanosStoreGrpcErrorRate|Thanos Store is failing to handle qrpcd requests.|Thanos Store {{$labels.job}} is failing to handle {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate)|
|ThanosStoreGrpcErrorRate|Thanos Store is failing to handle gRPC requests.|Thanos Store {{$labels.job}} is failing to handle {{$value humanize}}% of requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate)|
|ThanosStoreSeriesGateLatencyHigh|Thanos Store has high latency for store series gate requests.|Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh)|
|ThanosStoreBucketHighOperationFailures|Thanos Store Bucket is failing to execute operations.|Thanos Store {{$labels.job}} Bucket is failing to execute {{$value humanize}}% of operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures)|
|ThanosStoreObjstoreOperationLatencyHigh|Thanos Store is having high latency for bucket operations.|Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh)|
3 changes: 1 addition & 2 deletions test/e2e/compact_test.go
Expand Up @@ -77,7 +77,6 @@ func TestCompactWithStoreGateway(t *testing.T) {
}

func TestCompactWithStoreGatewayWithPenaltyDedup(t *testing.T) {
t.Skip("Flaky test, needs deeper investigation before re-enabling, details are in https://github.com/thanos-io/thanos/issues/4866")
testCompactWithStoreGateway(t, true)
}

Expand Down Expand Up @@ -724,7 +723,7 @@ func testCompactWithStoreGateway(t *testing.T, penaltyDedup bool) {
operationMatcher, err := matchers.NewMatcher(matchers.MatchEqual, "operation", "get")
testutil.Ok(t, err)
testutil.Ok(t, c.WaitSumMetricsWithOptions(
e2emon.Equals(573),
e2ethanos.Between(0, 1000),
[]string{"thanos_objstore_bucket_operations_total"}, e2emon.WithLabelMatchers(
bucketMatcher,
operationMatcher,
Expand Down
19 changes: 19 additions & 0 deletions test/e2e/e2ethanos/custom_test_matchers.go
@@ -0,0 +1,19 @@
// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package e2ethanos

import (
e2emon "github.com/efficientgo/e2e/monitoring"
)

// Between builds a MetricValueExpectation, for use with WaitSumMetrics, that
// accepts exactly one sum and reports whether it lies strictly inside the open
// interval (lower, upper), i.e. `lower < x < upper`. Both bounds are excluded.
//
// The returned function panics when it is invoked with anything other than a
// single value.
func Between(lower, upper float64) e2emon.MetricValueExpectation {
	return func(sums ...float64) bool {
		if len(sums) == 1 {
			got := sums[0]
			// Strict comparisons on both sides: a value equal to either bound is rejected.
			return lower < got && got < upper
		}
		panic("between: expected one value")
	}
}

0 comments on commit 19d14c3

Please sign in to comment.