From 5d695e9226e4360c450a2dbf3076f79835829dd1 Mon Sep 17 00:00:00 2001 From: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> Date: Mon, 26 Jun 2023 09:03:41 +0200 Subject: [PATCH] add alert for tenant reaching head series limit (#6467) Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> --- CHANGELOG.md | 1 + examples/alerts/alerts.md | 9 +++++++++ examples/alerts/alerts.yaml | 9 +++++++++ mixin/alerts/receive.libsonnet | 12 ++++++++++++ mixin/runbook.md | 1 + pkg/rules/rules_test.go | 2 +- 6 files changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 020dedcef3..2cb23eb680 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#6420](https://github.com/thanos-io/thanos/pull/6420) Index Cache: Cache expanded postings. - [#6441](https://github.com/thanos-io/thanos/pull/6441) Compact: Compactor will set `index_stats` in `meta.json` file with max series and chunk size information. - [#6466](https://github.com/thanos-io/thanos/pull/6466) Mixin (Receive): add limits alerting for configuration reload and meta-monitoring. +- [#6467](https://github.com/thanos-io/thanos/pull/6467) Mixin (Receive): add alert for tenant reaching head series limit. ### Fixed - [#6456](https://github.com/thanos-io/thanos/pull/6456) Store: fix crash when computing set matches from regex pattern diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 1f56461073..caa89ad0ef 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -548,6 +548,15 @@ rules: for: 5m labels: severity: warning +- alert: ThanosReceiveTenantLimitedByHeadSeries + annotations: + description: Thanos Receive tenant {{$labels.tenant}} is limited by head series. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetenantlimitedbyheadseries + summary: A Thanos Receive tenant is limited by head series. + expr: sum by(job, tenant) (increase(thanos_receive_head_series_limited_requests_total{job=~".*thanos-receive.*"}[5m])) > 0 + for: 5m + labels: + severity: warning ``` ## Replicate diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 55cb140419..0b1c0422b7 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -292,6 +292,15 @@ groups: for: 5m labels: severity: warning + - alert: ThanosReceiveTenantLimitedByHeadSeries + annotations: + description: Thanos Receive tenant {{$labels.tenant}} is limited by head series. + runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetenantlimitedbyheadseries + summary: A Thanos Receive tenant is limited by head series. + expr: sum by(job, tenant) (increase(thanos_receive_head_series_limited_requests_total{job=~".*thanos-receive.*"}[5m])) > 0 + for: 5m + labels: + severity: warning - name: thanos-sidecar rules: - alert: ThanosSidecarBucketOperationsFailed diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index 528827a2c2..1315766763 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -170,6 +170,18 @@ severity: 'warning', }, }, + { + alert: 'ThanosReceiveTenantLimitedByHeadSeries', + annotations: { + description: 'Thanos Receive tenant {{$labels.tenant}}%s is limited by head series.' % location, + summary: 'A Thanos Receive tenant is limited by head series.', + }, + expr: 'sum by(%(dimensions)s, tenant) (increase(thanos_receive_head_series_limited_requests_total{%(selector)s}[5m])) > 0' % thanos.receive, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, ], }, ], diff --git a/mixin/runbook.md b/mixin/runbook.md index ba2131cd91..48c9a5ada0 100755 --- a/mixin/runbook.md +++ b/mixin/runbook.md @@ -65,6 +65,7 @@ |ThanosReceiveNoUpload|Thanos Receive has not uploaded latest data to object storage.|Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.|critical|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload)| |ThanosReceiveLimitsConfigReloadFailure|Thanos Receive has not been able to reload the limits configuration.|Thanos Receive {{$labels.job}} has not been able to reload the limits configuration.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure)| |ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate|Thanos Receive has not been able to update the number of head series.|Thanos Receive {{$labels.job}} is failing for {{$value humanize}}% of meta monitoring queries.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate)| +|ThanosReceiveTenantLimitedByHeadSeries|A Thanos Receive tenant is limited by head series.|Thanos Receive tenant {{$labels.tenant}} is limited by head series.|warning|[https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetenantlimitedbyheadseries](https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetenantlimitedbyheadseries)| ## thanos-rule diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go index b5ee5dacd2..721da26c10 100644 --- a/pkg/rules/rules_test.go +++ b/pkg/rules/rules_test.go @@ -69,7 +69,7 @@ func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServ Name: "thanos-receive", File: filepath.Join(dir, "alerts.yaml"), Rules: []*rulespb.Rule{ - someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, + someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, someAlert, }, Interval: 60, PartialResponseStrategy: storepb.PartialResponseStrategy_ABORT,