From 8f5021166d21005f9a63c3c44fd876113fd0c05d Mon Sep 17 00:00:00 2001 From: Arunprasad Rajkumar Date: Wed, 23 Jun 2021 16:22:51 +0530 Subject: [PATCH] fix(mixin): ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy (#4342) * Revert "mixin: Use sidecar's metric timestamp for healthcheck (#3204) (#3979)" This reverts commit 5139e339eca62787e3a8dc38af5d4a9bf3ea39c4. Signed-off-by: Arunprasad Rajkumar * fix(mixin): ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy Signed-off-by: Arunprasad Rajkumar --- CHANGELOG.md | 2 +- examples/alerts/alerts.md | 3 +- examples/alerts/alerts.yaml | 3 +- examples/alerts/tests.yaml | 142 ++++++++++++++++++++++----------- mixin/alerts/sidecar.libsonnet | 3 +- 5 files changed, 102 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5654d7ebae..491817f3c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Fixed -- +- [#4342](https://github.com/thanos-io/thanos/pull/4342) ThanosSidecarUnhealthy doesn't fire if the sidecar is never healthy ### Changed diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 7209920a60..21b5438564 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -327,7 +327,8 @@ rules: runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 240 + for: 5m labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 7c9f08bdc7..8c0d7d7340 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -328,7 +328,8 @@ groups: runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy summary: Thanos Sidecar is unhealthy. expr: | - time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) >= 240 + time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"}) >= 240 + for: 5m labels: severity: critical - name: thanos-store diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 951dcec9b4..64207c46f7 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -8,9 +8,9 @@ tests: - interval: 1m input_series: - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-0"}' - values: '5 10 43 17 11 _x5 0x10' + values: '5 10 43 17 11 0 0 0' - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-1"}' - values: '4 9 42 15 10 _x5 0x10' + values: '4 9 42 15 10 0 0 0' promql_expr_test: - expr: time() eval_time: 1m @@ -22,64 +22,112 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 5m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 2m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 60 + value: 43 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 60 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 6m + value: 42 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 10m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 120 + value: 0 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 120 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 7m + value: 0 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 11m + exp_samples: + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' + value: 0 + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' + value: 0 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 10m + exp_samples: + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' + value: 600 + - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) + eval_time: 11m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 180 + value: 660 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 180 - - expr: time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~".*thanos-sidecar.*"})) - eval_time: 8m + value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) >= 600 + eval_time: 12m exp_samples: - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}' - value: 240 + value: 720 - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}' - value: 240 + value: 720 alert_rule_test: - - eval_time: 1m - alertname: ThanosSidecarUnhealthy - - eval_time: 2m - alertname: ThanosSidecarUnhealthy - - eval_time: 3m - alertname: ThanosSidecarUnhealthy - - eval_time: 5m - alertname: ThanosSidecarUnhealthy - - eval_time: 8m - alertname: ThanosSidecarUnhealthy - exp_alerts: - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-0 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 240 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - exp_labels: - severity: critical - job: thanos-sidecar - instance: thanos-sidecar-1 - exp_annotations: - description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 240 seconds.' - runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' - summary: 'Thanos Sidecar is unhealthy.' - - eval_time: 10m - alertname: ThanosSidecarUnhealthy + - eval_time: 1m + alertname: ThanosSidecarUnhealthy + - eval_time: 2m + alertname: ThanosSidecarUnhealthy + - eval_time: 3m + alertname: ThanosSidecarUnhealthy + - eval_time: 10m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 600 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 600 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 11m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 660 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 660 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - eval_time: 12m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-0 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for more than 720 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' + - exp_labels: + severity: critical + job: thanos-sidecar + instance: thanos-sidecar-1 + exp_annotations: + description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for more than 720 seconds.' + runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy' + summary: 'Thanos Sidecar is unhealthy.' - interval: 1m input_series: - series: 'prometheus_rule_evaluations_total{namespace="production", job="thanos-ruler", instance="thanos-ruler-0"}' diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index 4e21f0785a..b468210619 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -45,8 +45,9 @@ summary: 'Thanos Sidecar is unhealthy.', }, expr: ||| - time() - max by (%(dimensions)s) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s})) >= 240 + time() - max by (%(dimensions)s) (thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) >= 240 ||| % thanos.sidecar, + 'for': '5m', labels: { severity: 'critical', },