Skip to content

Reduce prometheus memory usage by dropping more labels #2378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions dev/prometheus.md
Original file line number Diff line number Diff line change
@@ -51,21 +51,16 @@ The following is a list of metrics that are currently in use.
#### Kubelet metrics
1. container_cpu_usage_seconds_total with the following labels:
1. pod
1. container
1. name
1. container_memory_working_set_bytes with the following labels:
1. pod
1. name
1. container

#### Kube-state-metrics metrics

1. kube_pod_container_resource_requests with the following labels:
1. exported_pod
1. pod
1. resource
1. exported_container (required for not dropping the values for each container of each pod)
1. kube_pod_info with the following labels:
1. exported_pod
1. pod
1. kube_deployment_status_replicas_available with the following labels:
1. deployment
1. kube_job_status_active with the following labels:
@@ -74,11 +69,11 @@ The following is a list of metrics that are currently in use.
#### DCGM metrics

1. DCGM_FI_DEV_GPU_UTIL with the following labels:
1. exported_pod
1. pod
1. DCGM_FI_DEV_FB_USED with the following labels:
1. exported_pod
1. pod
1. DCGM_FI_DEV_FB_FREE with the following labels:
1. exported_pod
1. pod

#### Node metrics

32 changes: 16 additions & 16 deletions manager/manifests/grafana/grafana-dashboard-async.yaml
Original file line number Diff line number Diff line change
@@ -1086,7 +1086,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1095,7 +1095,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})",
"hide": false,
"interval": "",
"legendFormat": "Total CPU Request",
@@ -1190,7 +1190,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m])) /\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1199,7 +1199,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2",
"hide": false,
"interval": "",
"legendFormat": "Total Memory Request",
@@ -1294,14 +1294,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Capacity",
@@ -1395,15 +1395,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1515,7 +1515,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1524,7 +1524,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg CPU Request",
@@ -1621,7 +1621,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1630,7 +1630,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"api-$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Memory Request",
@@ -1726,14 +1726,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"api-$api_name.+\"}) by (pod))",
"hide": false,
"interval": "",
"legendFormat": "Avg GPU Capacity",
@@ -1829,15 +1829,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
"expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"api-$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
32 changes: 16 additions & 16 deletions manager/manifests/grafana/grafana-dashboard-batch.yaml
Original file line number Diff line number Diff line change
@@ -522,7 +522,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))",
"format": "time_series",
"instant": false,
"interval": "",
@@ -531,7 +531,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})",
"hide": false,
"interval": "",
"legendFormat": "Total CPU Request",
@@ -628,7 +628,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
"format": "time_series",
"instant": false,
"interval": "",
@@ -637,7 +637,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2",
"hide": false,
"interval": "",
"legendFormat": "Total Memory Request",
@@ -734,14 +734,14 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total GPU Capacity",
@@ -837,15 +837,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Total Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -963,7 +963,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\"}[1m]))\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -972,7 +972,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource_requests{exported_pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"cpu\"})\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg CPU Request",
@@ -1071,7 +1071,7 @@ data:
"targets": [
{
"exemplar": false,
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\"}[1m]))\n/\navg(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
@@ -1080,7 +1080,7 @@ data:
},
{
"exemplar": true,
"expr": "sum(kube_pod_container_resource{exported_pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
"expr": "sum(kube_pod_container_resource_requests{pod=~\"$api_name.+\", resource=\"memory\"}) / 1024^2\n/\nsum(kube_pod_info{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Memory Request",
@@ -1179,15 +1179,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})",
"hide": false,
"instant": false,
"interval": "",
"legendFormat": "Avg GPU Usage",
"refId": "GPU Usage"
},
{
"expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
"expr": "count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{pod=~\"$api_name.+\"}) by (pod))",
"hide": false,
"instant": false,
"interval": "",
@@ -1286,15 +1286,15 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"hide": false,
"interval": "",
"legendFormat": "Avg Used GPU Memory",
"refId": "GPU Used Memory"
},
{
"exemplar": false,
"expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
"expr": "(sum(DCGM_FI_DEV_FB_FREE{pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{pod=~\"$api_name.+\"})",
"format": "time_series",
"instant": false,
"interval": "",
2 changes: 1 addition & 1 deletion manager/manifests/grafana/grafana-dashboard-cluster.yaml
Original file line number Diff line number Diff line change
@@ -213,7 +213,7 @@ data:
"targets": [
{
"exemplar": true,
"expr": "sum(kube_pod_info{exported_pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
"expr": "sum(kube_pod_info{pod!~\"(aws-node|grafana|autoscaler|cluster-autoscaler|coredns|event-exporter|fluent-bit|kube-proxy|k8s-neuron-scheduler|kube-state-metrics|metrics-server|node-exporter|operator|operator-controller-manager|prometheus-operator|prometheus-prometheus|prometheus-statsd-exporter|dcgm-exporter|ingressgateway|istiod|activator|enqueuer|gateway|nvidia-device-plugin-daemonset|neuron-device-plugin-daemonset)-(.+)\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Loading
Oops, something went wrong.