Skip to content

Commit

Permalink
feat(platform,monitor): add more metrics for k8s node
Browse files (browse the repository at this point in the history)
Signed-off-by: Feng Kun <fengkun32@gmail.com>
  • Loading branch information
kevinfeng authored and tke-robot committed Sep 30, 2020
1 parent 34d9355 commit b62fc41
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 8 deletions.
6 changes: 3 additions & 3 deletions pkg/monitor/controller/prometheus/controller.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -1171,7 +1171,7 @@ func createPrometheusCRD(components images.Components, prometheus *v1.Prometheus
rw.WriteRelabelConfigs = []monitoringv1.RelabelConfig{
{
SourceLabels: []string{"__name__"},
Regex: "k8s_(.*)|apiserver_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)",
Regex: "k8s_(.*)|apiserver_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)|up",
Action: "keep",
},
}
Expand All @@ -1187,7 +1187,7 @@ func createPrometheusCRD(components images.Components, prometheus *v1.Prometheus
rw.WriteRelabelConfigs = []monitoringv1.RelabelConfig{
{
SourceLabels: []string{"__name__"},
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)",
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)|up",
Action: "keep",
},
}
Expand All @@ -1202,7 +1202,7 @@ func createPrometheusCRD(components images.Components, prometheus *v1.Prometheus
rw.WriteRelabelConfigs = []monitoringv1.RelabelConfig{
{
SourceLabels: []string{"__name__"},
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)",
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)|up",
Action: "keep",
},
}
Expand Down
14 changes: 13 additions & 1 deletion pkg/monitor/controller/prometheus/yamls.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -129,7 +129,7 @@ func scrapeConfigForPrometheus() string {
replacement: $1:$2
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: 'container_gpu_utilization|container_request_gpu_utilization|container_gpu_memory_total|container_request_gpu_memory|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_job_status_failed|kube_statefulset_status_replicas_ready|kube_statefulset_replicas|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_pod_labels|kube_pod_info|kube_pod_status_ready|kube_pod_container_status_restarts_total|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kube_node_status_condition|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_replicaset_owner|kube_namespace_labels|kube_node_spec_taint|kube_node_info|kube_node_spec_unschedulable|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_updated|kube_deployment_status_replicas_available|kube_daemonset_status_number_ready|kube_daemonset_status_desired_number_scheduled|kube_pod_status_phase|kube_pod_container_status_running|kube_pod_container_status_waiting|kube_pod_container_status_terminated|kube_pod_container_status_last_terminated_reason|kube_job_status_succeeded|kube_job_status_active|kube_cronjob_spec_suspend|kube_persistentvolume_status_phase|kube_resourcequota|kube_service_created'
regex: 'container_gpu_utilization|container_request_gpu_utilization|container_gpu_memory_total|container_request_gpu_memory|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_job_status_failed|kube_statefulset_status_replicas_ready|kube_statefulset_replicas|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_pod_labels|kube_pod_info|kube_pod_status_ready|kube_pod_container_status_restarts_total|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kube_node_status_condition|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_replicaset_owner|kube_namespace_labels|kube_node_spec_taint|kube_node_info|kube_node_spec_unschedulable|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_updated|kube_deployment_status_replicas_available|kube_daemonset_status_number_ready|kube_daemonset_status_desired_number_scheduled|kube_pod_status_phase|kube_pod_container_status_running|kube_pod_container_status_waiting|kube_pod_container_status_terminated|kube_pod_container_status_last_terminated_reason|kube_job_status_succeeded|kube_job_status_active|kube_cronjob_spec_suspend|kube_persistentvolume_status_phase|kube_resourcequota|kube_service_created|kube_node_created'
action: keep
- source_labels: [created_by_kind]
action: replace
Expand Down Expand Up @@ -671,6 +671,9 @@ groups:
- record: k8s_pod_restart_total
expr: sum(idelta(kube_pod_container_status_restarts_total [2m])) by (namespace,pod_name) * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

- record: k8s_pod_restart_total_number
expr: sum(kube_pod_container_status_restarts_total) by (namespace,pod_name) * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

- record: k8s_pod_status_phase
expr: kube_pod_status_phase * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

Expand Down Expand Up @@ -698,6 +701,9 @@ groups:
- record: k8s_node_status_allocatable
expr: kube_node_status_allocatable * on (node) group_left(node_role, device_type) kube_node_labels

- record: k8s_node_info
expr: kube_node_info * on (node) group_left(node_role, device_type) kube_node_labels

- record: k8s_node_cpu_usage
expr: (100 - sum (irate(node_cpu_seconds_total{mode="idle"}[5m])) by (node) / count (irate(node_cpu_seconds_total{mode="idle"}[5m])) by (node) * 100) * on(node) group_left(node_role, device_type) kube_node_labels

Expand Down Expand Up @@ -737,6 +743,12 @@ groups:
- record: k8s_node_filesystem_size_bytes
expr: node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs"} *on(node) group_left(node_role, device_type) kube_node_labels

- record: k8s_node_filesystem_usage
expr: (1 - k8s_node_filesystem_avail_bytes / k8s_node_filesystem_size_bytes) * 100

- record: k8s_node_age
expr: time() - kube_node_created

- record: k8s_node_network_receive_bytes_bw
expr: (sum by (node) (irate(node_network_receive_bytes_total{device!~"lo|veth(.*)|virb(.*)|docker(.*)|tunl(.*)|v-h(.*)|flannel(.*)"}[5m])))*on(node) group_left(node_role, device_type) kube_node_labels

Expand Down
6 changes: 3 additions & 3 deletions pkg/platform/controller/addon/prometheus/controller.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -1168,7 +1168,7 @@ func createPrometheusCRD(components images.Components, prometheus *v1.Prometheus
rw.WriteRelabelConfigs = []monitoringv1.RelabelConfig{
{
SourceLabels: []string{"__name__"},
Regex: "k8s_(.*)|apiserver_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)",
Regex: "k8s_(.*)|apiserver_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)|up",
Action: "keep",
},
}
Expand All @@ -1184,7 +1184,7 @@ func createPrometheusCRD(components images.Components, prometheus *v1.Prometheus
rw.WriteRelabelConfigs = []monitoringv1.RelabelConfig{
{
SourceLabels: []string{"__name__"},
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)",
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)|up",
Action: "keep",
},
}
Expand All @@ -1199,7 +1199,7 @@ func createPrometheusCRD(components images.Components, prometheus *v1.Prometheus
rw.WriteRelabelConfigs = []monitoringv1.RelabelConfig{
{
SourceLabels: []string{"__name__"},
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)",
Regex: "project_(.*)|apiserver_(.*)|k8s_(.*)|kube_pod_labels|kube_node_labels|kube_namespace_labels|etcd_(.*)|grpc_(.*)|process_(.*)|scheduler_(.*)|workqueue_(.*)|rest_client_requests_(.*)|go_goroutines|kubelet_(.*)|volume_manager_(.*)|storage_operation_(.*)|coredns_(.*)|up",
Action: "keep",
},
}
Expand Down
14 changes: 13 additions & 1 deletion pkg/platform/controller/addon/prometheus/yamls.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -129,7 +129,7 @@ func scrapeConfigForPrometheus() string {
replacement: $1:$2
metric_relabel_configs:
- source_labels: [ __name__ ]
regex: 'container_gpu_utilization|container_request_gpu_utilization|container_gpu_memory_total|container_request_gpu_memory|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_job_status_failed|kube_statefulset_status_replicas_ready|kube_statefulset_replicas|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_pod_labels|kube_pod_info|kube_pod_status_ready|kube_pod_container_status_restarts_total|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kube_node_status_condition|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_replicaset_owner|kube_namespace_labels|kube_node_spec_taint|kube_node_info|kube_node_spec_unschedulable|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_updated|kube_deployment_status_replicas_available|kube_daemonset_status_number_ready|kube_daemonset_status_desired_number_scheduled|kube_pod_status_phase|kube_pod_container_status_running|kube_pod_container_status_waiting|kube_pod_container_status_terminated|kube_pod_container_status_last_terminated_reason|kube_job_status_succeeded|kube_job_status_active|kube_cronjob_spec_suspend|kube_persistentvolume_status_phase|kube_resourcequota|kube_service_created'
regex: 'container_gpu_utilization|container_request_gpu_utilization|container_gpu_memory_total|container_request_gpu_memory|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_allocatable_cpu_cores|kube_node_status_allocatable_memory_bytes|kube_job_status_failed|kube_statefulset_status_replicas_ready|kube_statefulset_replicas|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_pod_labels|kube_pod_info|kube_pod_status_ready|kube_pod_container_status_restarts_total|kube_pod_container_resource_requests|kube_pod_container_resource_limits|kube_node_status_condition|kube_node_status_capacity_cpu_cores|kube_node_status_capacity_memory_bytes|kube_replicaset_owner|kube_namespace_labels|kube_node_spec_taint|kube_node_info|kube_node_spec_unschedulable|kube_deployment_spec_replicas|kube_deployment_status_replicas|kube_deployment_status_replicas_updated|kube_deployment_status_replicas_available|kube_daemonset_status_number_ready|kube_daemonset_status_desired_number_scheduled|kube_pod_status_phase|kube_pod_container_status_running|kube_pod_container_status_waiting|kube_pod_container_status_terminated|kube_pod_container_status_last_terminated_reason|kube_job_status_succeeded|kube_job_status_active|kube_cronjob_spec_suspend|kube_persistentvolume_status_phase|kube_resourcequota|kube_service_created|kube_node_created'
action: keep
- source_labels: [created_by_kind]
action: replace
Expand Down Expand Up @@ -671,6 +671,9 @@ groups:
- record: k8s_pod_restart_total
expr: sum(idelta(kube_pod_container_status_restarts_total [2m])) by (namespace,pod_name) * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

- record: k8s_pod_restart_total_number
expr: sum(kube_pod_container_status_restarts_total) by (namespace,pod_name) * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

- record: k8s_pod_status_phase
expr: kube_pod_status_phase * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

Expand Down Expand Up @@ -698,6 +701,9 @@ groups:
- record: k8s_node_status_allocatable
expr: kube_node_status_allocatable * on (node) group_left(node_role, device_type) kube_node_labels

- record: k8s_node_info
expr: kube_node_info * on (node) group_left(node_role, device_type) kube_node_labels

- record: k8s_node_cpu_usage
expr: (100 - sum (irate(node_cpu_seconds_total{mode="idle"}[5m])) by (node) / count (irate(node_cpu_seconds_total{mode="idle"}[5m])) by (node) * 100) * on(node) group_left(node_role, device_type) kube_node_labels

Expand Down Expand Up @@ -737,6 +743,12 @@ groups:
- record: k8s_node_filesystem_size_bytes
expr: node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs"} *on(node) group_left(node_role, device_type) kube_node_labels

- record: k8s_node_filesystem_usage
expr: (1 - k8s_node_filesystem_avail_bytes / k8s_node_filesystem_size_bytes) * 100

- record: k8s_node_age
expr: time() - kube_node_created

- record: k8s_node_network_receive_bytes_bw
expr: (sum by (node) (irate(node_network_receive_bytes_total{device!~"lo|veth(.*)|virb(.*)|docker(.*)|tunl(.*)|v-h(.*)|flannel(.*)"}[5m])))*on(node) group_left(node_role, device_type) kube_node_labels

Expand Down

0 comments on commit b62fc41

Please sign in to comment.