-
Notifications
You must be signed in to change notification settings - Fork 0
/
probes.yaml
84 lines (77 loc) · 4.65 KB
/
probes.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# deploy-tag: ops
# Some of these alerts (probedown, ssl cert expiry) target all probes in jobs starting with probes/.
# Similar alerts are also defined/deployed by Puppet via
# prometheus::blackbox::check::http to allow users some customisation in terms of
# severity/team directly in puppet.
# When making changes, remember to update modules/prometheus/manifests/blackbox/check/http.pp as well to keep the alert definitions in sync.
groups:
  - name: probes
    rules:
      # ProbeDown (warning): one-minute availability of any http/tcp probe in a
      # probes/ job dropped below 75%. There is no `for:` clause on this rule.
      # This rule is also the &probe_down template that the critical and page
      # variants below merge from.
      - &probe_down
        alert: ProbeDown
        # Probes are sent out every 15s, i.e. roughly four samples per 1m window.
        # A single failed probe still leaves availability at 75% or above, so the
        # alert triggers once two or more probes in the window have failed.
        expr: avg_over_time(probe_success{job=~"probes/.*", module=~"(http|tcp).*"}[1m]) * 100 < 75
        labels:
          severity: warning
          team: sre
        annotations:
          description: '{{ $labels.instance }} failed when probed by {{ $labels.module }} from {{ $externalLabels.site }}. Availability is {{ $value }}%.'
          summary: 'Service {{ $labels.instance }} has failed probes ({{ $labels.module }})'
          dashboard: 'https://grafana.wikimedia.org/d/O0nHhdhnz/network-probes-overview?var-job={{ $labels.job }}&var-module=All'
          logs: 'https://logstash.wikimedia.org/app/dashboards#/view/f3e709c0-a5f8-11ec-bf8e-43f1807d5bc2?_g=(filters:!((query:(match_phrase:(service.name:{{ $labels.module }})))))'
          runbook: 'https://wikitech.wikimedia.org/wiki/Runbook#{{ $labels.instance }}'

      # ProbeDown (critical): same 75% threshold, but only for instances that
      # also have service_catalog_page == 1, and only after the condition has
      # held for 2m.
      # NOTE(review): service_catalog_page presumably marks services flagged as
      # paging in the service catalog - confirm.
      # `annotations` is intentionally not repeated here: it is inherited
      # unchanged from &probe_down through the merge key.
      - <<: *probe_down
        expr: (avg_over_time(probe_success{job=~"probes/.*", module=~"(http|tcp).*"}[1m]) and on (instance) service_catalog_page == 1) * 100 < 75
        for: 2m
        labels:
          severity: critical
          team: sre

      # ProbeDown (page): paging alert. Requires low availability (<10%) for one
      # minute to avoid false positives and flaps.
      # The merge key is shallow, so the whole `annotations` map has to be
      # restated here even though only `summary` differs (it gains "#page").
      - <<: *probe_down
        expr: (avg_over_time(probe_success{job=~"probes/.*", module=~"(http|tcp).*"}[1m]) and on (instance) service_catalog_page == 1) * 100 < 10
        for: 1m
        labels:
          severity: page
          team: sre
        annotations:
          description: '{{ $labels.instance }} failed when probed by {{ $labels.module }} from {{ $externalLabels.site }}. Availability is {{ $value }}%.'
          summary: 'Service {{ $labels.instance }} has failed probes ({{ $labels.module }}) #page'
          dashboard: 'https://grafana.wikimedia.org/d/O0nHhdhnz/network-probes-overview?var-job={{ $labels.job }}&var-module=All'
          logs: 'https://logstash.wikimedia.org/app/dashboards#/view/f3e709c0-a5f8-11ec-bf8e-43f1807d5bc2?_g=(filters:!((query:(match_phrase:(service.name:{{ $labels.module }})))))'
          runbook: 'https://wikitech.wikimedia.org/wiki/Runbook#{{ $labels.instance }}'

      # CertAlmostExpired (warning): the earliest-expiring certificate presented
      # to a probe in a probes/ job has fewer than 9 days (86400s * 9) left, and
      # that has been true for 5m. Also the &cert_expired template for the
      # critical variant below.
      - &cert_expired
        alert: CertAlmostExpired
        expr: probe_ssl_earliest_cert_expiry{job=~"probes/.*"} - time() < (86400 * 9)
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          description: 'The certificate presented by service {{ $labels.instance }} is going to expire in {{ $value | humanizeDuration }}'
          summary: 'Certificate for service {{ $labels.instance }} is about to expire'
          runbook: 'https://wikitech.wikimedia.org/wiki/TLS/Runbook#{{ $labels.instance }}'
          # NOTE(review): placeholder value - no dashboard link is defined yet.
          dashboard: TODO

      # CertAlmostExpired (critical): fewer than 7 days left. `for: 5m` and the
      # annotations are inherited from &cert_expired through the merge key.
      - <<: *cert_expired
        expr: probe_ssl_earliest_cert_expiry{job=~"probes/.*"} - time() < (86400 * 7)
        labels:
          severity: critical
          team: sre

      # ProbeSlow (warning): an http/tcp probe took more than 1 second, sustained
      # for 2m. The duration is multiplied by probe_success so that failed probes
      # (probe_success == 0) evaluate to 0 and never cross the threshold; only
      # successful-but-slow probes alert.
      - alert: ProbeSlow
        expr: probe_duration_seconds{job=~"probes/.*", module=~"(http|tcp).*"} * probe_success > 1
        for: 2m
        labels:
          severity: warning
          team: sre
        annotations:
          description: '{{ $labels.instance }} took {{ $value | humanizeDuration }} to reply successfully to probe {{ $labels.module }}'
          summary: 'Slow probes for service {{ $labels.instance }}'
          dashboard: 'https://grafana.wikimedia.org/d/O0nHhdhnz/network-probes-overview?var-job={{ $labels.job }}&var-module={{ $labels.module }}'
          logs: 'https://logstash.wikimedia.org/app/dashboards#/view/f3e709c0-a5f8-11ec-bf8e-43f1807d5bc2?_g=(filters:!((query:(match_phrase:(service.name:{{ $labels.module }})))))'
          # NOTE(review): placeholder value - no runbook link is defined yet.
          runbook: TODO