Skip to content

Reduce prometheus memory usage by dropping more labels #2378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions dev/prometheus.md
Original file line number Diff line number Diff line change
@@ -51,21 +51,16 @@ The following is a list of metrics that are currently in use.
#### Kubelet metrics
1. container_cpu_usage_seconds_total with the following labels:
1. pod
1. container
1. name
1. container_memory_working_set_bytes with the following labels:
1. pod
1. name
1. container

#### Kube-state-metrics metrics

1. kube_pod_container_resource_requests with the following labels:
1. exported_pod
1. pod
1. resource
1. exported_container (required for not dropping the values for each container of each pod)
1. kube_pod_info with the following labels:
1. exported_pod
1. pod
1. kube_deployment_status_replicas_available with the following labels:
1. deployment
1. kube_job_status_active with the following labels:
@@ -74,11 +69,11 @@ The following is a list of metrics that are currently in use.
#### DCGM metrics

1. DCGM_FI_DEV_GPU_UTIL with the following labels:
1. exported_pod
1. pod
1. DCGM_FI_DEV_FB_USED with the following labels:
1. exported_pod
1. pod
1. DCGM_FI_DEV_FB_FREE with the following labels:
1. exported_pod
1. pod

#### Node metrics

32 changes: 16 additions & 16 deletions manager/manifests/grafana/grafana-dashboard-async.yaml
Original file line number Diff line number Diff line change
@@ -1086,7 +1086,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1095,7 +1095,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})",
"hide": false,
"interval": "",
"legendFormat": "Total CPU Request",
@@ -1190,7 +1190,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1199,7 +1199,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
"hide": false,
"interval": "",
"legendFormat": "Total Memory Request",
@@ -1294,14 +1294,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Capacity",
@@ -1395,15 +1395,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1515,7 +1515,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1524,7 +1524,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg CPU Request",
@@ -1621,7 +1621,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1630,7 +1630,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Memory Request",
@@ -1726,14 +1726,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))",
"hide": false,
"interval": "",
"legendFormat": "Avg GPU Capacity",
@@ -1829,15 +1829,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
32 changes: 16 additions & 16 deletions manager/manifests/grafana/grafana-dashboard-batch.yaml
Original file line number Diff line number Diff line change
@@ -522,7 +522,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
@@ -531,7 +531,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})",
"hide": false,
"interval": "",
"legendFormat": "Total CPU Request",
@@ -628,7 +628,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"format": "time_series",
"instant": false,
"interval": "",
@@ -637,7 +637,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
"hide": false,
"interval": "",
"legendFormat": "Total Memory Request",
@@ -734,14 +734,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Capacity",
@@ -837,15 +837,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -963,7 +963,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -972,7 +972,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg CPU Request",
@@ -1071,7 +1071,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1080,7 +1080,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Memory Request",
@@ -1179,15 +1179,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
"hide": false,
"instant": false,
"interval": "",
"legendFormat": "Avg GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))",
"hide": false,
"instant": false,
"interval": "",
@@ -1286,15 +1286,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
2 changes: 1 addition & 1 deletion manager/manifests/grafana/grafana-dashboard-cluster.yaml
Original file line number Diff line number Diff line change
@@ -213,7 +213,7 @@ data:
"targets": [
{
"exemplar": true,
"expr": "sum(kube_pod_info{exported_pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
"expr": "sum(kube_pod_info{pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Loading
Oops, something went wrong.