# prometheus-k8s-pod-rules.yaml
---
# PrometheusRule defining two pod-health alerts built on kube-state-metrics
# series: one for containers that keep restarting and one for pods stuck in a
# Pending/Unknown phase.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these labels are presumably what the Prometheus resource's
  # ruleSelector matches on -- confirm against the Prometheus object in use.
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-pod-rules
  namespace: monitoring
spec:
  groups:
    - name: KubePodCrashLooping
      rules:
        - alert: KubePodCrashLooping
          annotations:
            detail: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.'
            summary: "POD五分钟内发生重启次数"
          # rate() yields restarts per second averaged over a 15m window;
          # multiplying by 60 * 5 rescales that to restarts per 5 minutes,
          # which is the unit quoted in the `detail` annotation. Any value
          # above zero means at least one restart was observed in the window.
          expr: |
            rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
          # The expression must stay true for a full hour before the alert fires.
          for: 1h
          labels:
            # NOTE(review): non-English severity value; any Alertmanager route
            # or inhibit rule must match this exact string -- confirm.
            severity: 严重
    - name: KubePodNotReadyMonitoring
      rules:
        - alert: KubePodNotReady
          annotations:
            detail: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour."
            summary: "POD状态异常,或者没有准备,在一小时内"
          # Sums the phase series per (namespace, pod), keeping only the
          # Pending and Unknown phases; a result above zero means the pod is
          # currently reported in one of those phases.
          expr: |
            sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
          # Matches the "longer than an hour" wording in the `detail` annotation.
          for: 1h
          labels:
            # NOTE(review): non-English severity value; any Alertmanager route
            # or inhibit rule must match this exact string -- confirm.
            severity: 严重