thanos-io · bwplotka · Mar 19, 2021 · Mar 2, 2021 · Mar 5, 2021 · Mar 5, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,10 +15,13 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re
 ### Added
 
 ### Fixed
+
 - [#3204](https://github.com/thanos-io/thanos/pull/3204) Mixin: Use sidecar's metric timestamp for healthcheck.
 
 ### Changed
 
+- [#3856](https://github.com/thanos-io/thanos/pull/3856) Mixin: _breaking :warning:_ Introduce flexible multi-cluster/namespace mode for alerts and dashboards. Removes the `jobPrefix` config option. Removes `namespace` by default.
+
 ### Removed
 
 ## [v0.19.0 - <in progress>](https://github.com/thanos-io/thanos/tree/release-0.19)
@@ -32,7 +35,6 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re
 - [#3792](https://github.com/thanos-io/thanos/pull/3792) Receiver: Added `--tsdb.allow-overlapping-blocks` flag to allow overlapping tsdb blocks and enable vertical compaction
 - [#3031](https://github.com/thanos-io/thanos/pull/3031) Compact/Sidecar/other writers: added `--hash-func`. If some function has been specified, writers calculate hashes using that function of each file in a block before uploading them. If those hashes exist in the `meta.json` file then Compact does not download the files if they already exist on disk and with the same hash. This also means that the data directory passed to Thanos Compact is only *cleared once at boot* or *if everything succeeds*. So, if you, for example, use persistent volumes on k8s and your Thanos Compact crashes or fails to make an iteration properly then the last downloaded files are not wiped from the disk. The directories that were created the last time are only wiped again after a successful iteration or if the previously picked up blocks have disappeared.
 - [#3686](https://github.com/thanos-io/thanos/pull/3686) Query: Added federated metric metadata support.
-
 ### Fixed
 
 - [#3773](https://github.com/thanos-io/thanos/pull/3773) Compact: Pad compaction planner size check

diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md
diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml
diff --git a/examples/alerts/rules.yaml b/examples/alerts/rules.yaml
@@ -3,116 +3,116 @@ groups:
   rules:
   - expr: |
       (
-        sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
+        sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
       /
-        sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
+        sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
       )
     record: :grpc_client_failures_per_unary:sum_rate
   - expr: |
       (
-        sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
+        sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
       /
-        sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
+        sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
       )
     record: :grpc_client_failures_per_stream:sum_rate
   - expr: |
       (
-        sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
+        sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
       /
-        sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
+        sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
       )
     record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
   - expr: |
       histogram_quantile(0.99,
-        sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
+        sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))
       )
     labels:
       quantile: "0.99"
     record: :query_duration_seconds:histogram_quantile
   - expr: |
       histogram_quantile(0.99,
-        sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le)
+        sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))
       )
     labels:
       quantile: "0.99"
     record: :api_range_query_duration_seconds:histogram_quantile
 - name: thanos-receive.rules
   rules:
   - expr: |
-      sum(
-        rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m])
+      (
+        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m]))
       /
-        rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m])
+        sum by (job) (rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m]))
       )
     record: :grpc_server_failures_per_unary:sum_rate
   - expr: |
-      sum(
-        rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
+      (
+        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m]))
       /
-        rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
+        sum by (job) (rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m]))
       )
     record: :grpc_server_failures_per_stream:sum_rate
   - expr: |
-      sum(
-        rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m])
+      (
+        sum by (job) (rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code=~"5.."}[5m]))
       /
-        rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m])
+        sum by (job) (rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m]))
       )
     record: :http_failure_per_request:sum_rate
   - expr: |
       histogram_quantile(0.99,
-        sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le)
+        sum by (job, le) (rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m]))
       )
     labels:
       quantile: "0.99"
     record: :http_request_duration_seconds:histogram_quantile
   - expr: |
       (
-        sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
+        sum by (job) (rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
       /
-        sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
+        sum by (job) (rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
       )
     record: :thanos_receive_replication_failure_per_requests:sum_rate
   - expr: |
       (
-        sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
+        sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
       /
-        sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
+        sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
       )
     record: :thanos_receive_forward_failure_per_requests:sum_rate
   - expr: |
       (
-        sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
+        sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
       /
-        sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
+        sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
       )
     record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate
 - name: thanos-store.rules
   rules:
   - expr: |
       (
-        sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m]))
+        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m]))
       /
-        sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m]))
+        sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m]))
       )
     record: :grpc_server_failures_per_unary:sum_rate
   - expr: |
       (
-        sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
+        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
       /
-        sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
+        sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
       )
     record: :grpc_server_failures_per_stream:sum_rate
   - expr: |
       (
-        sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
+        sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
       /
-        sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
+        sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
       )
     record: :thanos_objstore_bucket_failures_per_operation:sum_rate
   - expr: |
       histogram_quantile(0.99,
-        sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le)
+        sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))
       )
     labels:
       quantile: "0.99"

diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml
@@ -8,9 +8,9 @@ tests:
 - interval: 1m
   input_series:
   - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-0"}'
-    values: '5 10 43 17 11 _x5 0x10'
+    values: '5 10 43 17 11 0 0 0'
   - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", instance="thanos-sidecar-1"}'
-    values: '4 9 42 15 10 _x5 0x10'
+    values: '4 9 42 15 10 0 0 0'
   promql_expr_test:
     - expr: time()
       eval_time: 1m
@@ -22,61 +22,109 @@ tests:
       exp_samples:
         - labels: '{}'
           value: 120
-    - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance)
-      eval_time: 5m
+    - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
+      eval_time: 2m
       exp_samples:
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
-          value: 60
+          value: 43
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
-          value: 60
-    - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance)
-      eval_time: 6m
+          value: 42
+    - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
+      eval_time: 10m
       exp_samples:
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
-          value: 120
+          value: 0
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
-          value: 120
-    - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance)
-      eval_time: 7m
+          value: 0
+    - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
+      eval_time: 11m
+      exp_samples:
+        - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
+          value: 0
+        - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
+          value: 0
+    - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
+      eval_time: 10m
+      exp_samples:
+        - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
+          value: 600
+        - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
+          value: 600
+    - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance)
+      eval_time: 11m
       exp_samples:
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
-          value: 180
+          value: 660
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
-          value: 180
-    - expr: time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"})) by (job, instance)
-      eval_time: 8m
+          value: 660
+    - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, instance) >= 600
+      eval_time: 12m
       exp_samples:
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-0"}'
-          value: 240
+          value: 720
         - labels: '{job="thanos-sidecar", instance="thanos-sidecar-1"}'
-          value: 240
+          value: 720
   alert_rule_test:
-    - eval_time: 1m
-      alertname: ThanosSidecarUnhealthy
-    - eval_time: 2m
-      alertname: ThanosSidecarUnhealthy
-    - eval_time: 3m
-      alertname: ThanosSidecarUnhealthy
-    - eval_time: 5m
-      alertname: ThanosSidecarUnhealthy
-    - eval_time: 8m
-      alertname: ThanosSidecarUnhealthy
-      exp_alerts:
-      - exp_labels:
-          severity: critical
-          job: thanos-sidecar
-          instance: thanos-sidecar-0
-        exp_annotations:
-          description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-0 is unhealthy for more than 240 seconds.'
-          runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
-          summary: 'Thanos Sidecar is unhealthy.'
-      - exp_labels:
-          severity: critical
-          job: thanos-sidecar
-          instance: thanos-sidecar-1
-        exp_annotations:
-          description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-1 is unhealthy for more than 240 seconds.'
-          runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
-          summary: 'Thanos Sidecar is unhealthy.'
-    - eval_time: 10m
-      alertname: ThanosSidecarUnhealthy
+  - eval_time: 1m
+    alertname: ThanosSidecarUnhealthy
+  - eval_time: 2m
+    alertname: ThanosSidecarUnhealthy
+  - eval_time: 3m
+    alertname: ThanosSidecarUnhealthy
+  - eval_time: 10m
+    alertname: ThanosSidecarUnhealthy
+    exp_alerts:
+    - exp_labels:
+        severity: critical
+        job: thanos-sidecar
+        instance: thanos-sidecar-0
+      exp_annotations:
+        description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 600 seconds.'
+        runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
+        summary: 'Thanos Sidecar is unhealthy.'
+    - exp_labels:
+        severity: critical
+        job: thanos-sidecar
+        instance: thanos-sidecar-1
+      exp_annotations:
+        description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 600 seconds.'
+        runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
+        summary: 'Thanos Sidecar is unhealthy.'
+  - eval_time: 11m
+    alertname: ThanosSidecarUnhealthy
+    exp_alerts:
+    - exp_labels:
+        severity: critical
+        job: thanos-sidecar
+        instance: thanos-sidecar-0
+      exp_annotations:
+        description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 660 seconds.'
+        runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
+        summary: 'Thanos Sidecar is unhealthy.'
+    - exp_labels:
+        severity: critical
+        job: thanos-sidecar
+        instance: thanos-sidecar-1
+      exp_annotations:
+        description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 660 seconds.'
+        runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
+        summary: 'Thanos Sidecar is unhealthy.'
+  - eval_time: 12m
+    alertname: ThanosSidecarUnhealthy
+    exp_alerts:
+    - exp_labels:
+        severity: critical
+        job: thanos-sidecar
+        instance: thanos-sidecar-0
+      exp_annotations:
+        description: 'Thanos Sidecar thanos-sidecar-0 is unhealthy for 720 seconds.'
+        runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
+        summary: 'Thanos Sidecar is unhealthy.'
+    - exp_labels:
+        severity: critical
+        job: thanos-sidecar
+        instance: thanos-sidecar-1
+      exp_annotations:
+        description: 'Thanos Sidecar thanos-sidecar-1 is unhealthy for 720 seconds.'
+        runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
+        summary: 'Thanos Sidecar is unhealthy.'