# prometheus-k8s-rules.yaml
---
# PrometheusRule custom resource (monitoring.coreos.com/v1) declaring alerting
# rule groups for core Kubernetes components: API server, scheduler, kubelet
# and node readiness. Every rule carries the label `severity: 严重`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these labels are presumably what the Prometheus resource's
  # ruleSelector matches on — verify against the Prometheus CR before changing.
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
spec:
  groups:
    # Fires when no series with job="apiserver" has up == 1 for 1 minute.
    - name: APIServerMonitoring
      rules:
        - alert: APIServerDown
          annotations:
            detail: "APIServer Down"
            summary: "APIServer has disappeared from Prometheus target discovery."
          expr: |
            absent(up{job="apiserver"} == 1)
          for: 1m
          labels:
            severity: 严重

    # Fires when no series with job="kube-scheduler" has up == 1 for 1 minute.
    - name: KubeSchedulerMonitoring
      rules:
        - alert: KubeSchedulerDown
          annotations:
            detail: "KubeSchedulerDown"
            summary: "KubeScheduler has disappeared from Prometheus target discovery."
          expr: |
            absent(up{job="kube-scheduler"} == 1)
          for: 1m
          labels:
            severity: 严重

    # Disabled rule group for kube-controller-manager, kept commented out.
    # NOTE(review): the reason it is disabled is not recorded here — confirm
    # before re-enabling.
    # - name: KubeControllerManagerMonitoring
    #   rules:
    #     - alert: KubeControllerManagerDown
    #       annotations:
    #         detail: "KubeControllerManagerDown"
    #         summary: "KubeControllerManager has disappeared from Prometheus target discovery."
    #       expr: |
    #         absent(up{job="kube-controller-manager"} == 1)
    #       for: 1m
    #       labels:
    #         severity: 严重

    # Fires per matching series when a kubelet scrape target reports up == 0
    # for 1 minute. Unlike the absent() rules above, this does not fire when
    # the series is missing altogether.
    - name: KubeletMonitorings
      rules:
        - alert: KubeletDown
          annotations:
            detail: "你有一台NodeName为:{{$labels.node}}, 实例IP端口:{{$labels.instance}},Kubelet Down (当前值: {{ $value }})"
            summary: "{{$labels.instance}}: Kubelet Down"
          expr: |
            up{endpoint="https-metrics",job="kubelet",namespace="kube-system",service="kubelet"} == 0
          for: 1m
          labels:
            severity: 严重

    # Fires when a node's Ready condition (status="true" series) has been 0
    # for 1 hour.
    # NOTE(review): "NoReady" in the summary looks like a typo for "NotReady";
    # left unchanged because it is runtime text.
    - name: KubeNodeNotReady
      rules:
        - alert: KubeNodeDown
          annotations:
            detail: "{{ $labels.node }} has been unready for more than an hour."
            summary: "有一台Node状态是NoReady了,请管理员尽快检查!"
          expr: |
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          for: 1h
          labels:
            severity: 严重