VPA: v1 - first version of vertical pod autoscaler

ppalucki · ppalucki · commit 8edb9dc30f10 · 2024-05-22T14:49:18.000-01:00
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+/deployment
diff --git a/.gitignore b/.gitignore
@@ -36,3 +36,4 @@ src/simdjson
 /deployment/pcm/smarter-device-manager/
 /deployment/pcm/nri/
 /deployment/pcm/kind-with-registry.sh
+/deployment/pcm/autoscaler
diff --git a/deployment/pcm/.helmignore b/deployment/pcm/.helmignore
@@ -23,3 +23,4 @@
 .vscode/
 smarter-device-manager/
 nri/
+autoscaler/
diff --git a/deployment/pcm/README.md b/deployment/pcm/README.md
@@ -4,13 +4,14 @@ Helm chart instructions
 
 ### Features:
 
-- Configurable as non-privileged container (value: `privileged=false`, default) and privileged container,
+- Configurable as non-privileged container (value: `privileged`, default: false) and privileged container,
 - Support for bare-metal and VM host configurations (files: [values-metal.yaml](values-metal.yaml), [values-vm.yaml](values-vm.yaml)),
 - Ability to deploy multiple releases alongside configured differently to handle different kinds of machines (bare-metal, VM) at the [same time](#heterogeneous-mixed-vmmetal-instances-cluster),
 - Linux Watchdog handling (controlled with `PCM_KEEP_NMI_WATCHDOG`, `PCM_NO_AWS_WORKAROUND`, `nmiWatchdogMount` values).
 - Deploy to own namespace with "helm install ... **-n pcm --create-namespace**".
-- Silent mode (value: `silent=false`, default).
-- Backward compatbile with older Linux kernels (<5.8) - (value: cap_perfmon).
+- Silent mode (value: `silent`, default: false).
+- Backward compatible with older Linux kernels (<5.8) - (value: cap_perfmon, default: false).
+- VerticalPodAutoscaler (value: `verticalPodAutoscaler.enabled`, default: false).
 
 Here are available methods in this chart of metrics collection w.r.t interfaces and required access:
 
@@ -47,13 +48,12 @@ helm install pcm .
 helm install pcm . -f values-direct-privileged.yaml
 ```
 
-#### Node-feature-discovery + Prometheus podMonitor 
+#### All opt-in features: Node-feature-discovery + Prometheus podMonitor + VerticalPodAutoscaler
 
 ```
-helm install ... --set nfd=true --set podMonitor=true 
+helm install ... --set nfd=true --set podMonitor=true --set verticalPodAutoscaler.enabled=true
 ```
 
-
 ### Requirements
 
 - Full set of metrics (uncore/UPI, RDT, energy) requires bare-metal or .metal cloud instance.
@@ -180,7 +180,27 @@ kubectl get sts prometheus-prometheus-kube-prometheus-prometheus
 Note: `podMonitorSelectorNilUsesHelmValues` is disabled (set to false) so Prometheus operator will be able to handle PCM podMonitor deployed without extra `podMonitorLabels` or otherwise pcm need to be deployed like this:
 `helm install pcm . --set podMonitor=true --set podMonitorLabels.release=prometheus` (assuming Prometheus operator was deployed as "prometheus")
 
-#### 5) Deploy PCM helm chart
+
+#### 5) (Optionally) Deploy metrics-server and vertical-pod-autoscaler
+
+Note: this is not required for pcm-sensor-server functionality, but is useful for observing the pcm pod's CPU/memory usage:
+
+a) metrics-server
+
+```
+helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
+helm repo update
+helm upgrade --install --set args={--kubelet-insecure-tls} metrics-server metrics-server/metrics-server --namespace kube-system
+```
+
+b) vertical pod autoscaler
+
+```
+git clone https://github.com/kubernetes/autoscaler
+./autoscaler/vertical-pod-autoscaler/hack/vpa-up.sh
+```
+
+#### 6) Deploy PCM helm chart
 
 ```
 # a) Deploy to current namespace with defaults
@@ -194,7 +214,7 @@ helm install pcm . --set nfd=true
 helm install pcm . --namespace pcm 
 ```
 
-#### 6) Check metrics are exported
+#### 7) Check metrics are exported
 
 Run proxy in background:
 ```
diff --git a/deployment/pcm/templates/verticalpodautoscaler.yaml b/deployment/pcm/templates/verticalpodautoscaler.yaml
@@ -0,0 +1,40 @@
+{{- if and (.Capabilities.APIVersions.Has "autoscaling.k8s.io/v1") (.Values.verticalPodAutoscaler.enabled) }}
+apiVersion: autoscaling.k8s.io/v1
+kind: VerticalPodAutoscaler
+metadata:
+  name: {{ include "pcm.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "pcm.labels" . | nindent 4 }}
+spec:
+  {{- with .Values.verticalPodAutoscaler.recommenders }}
+  recommenders:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  resourcePolicy:
+    containerPolicies:
+    - containerName: pcm
+      {{- with .Values.verticalPodAutoscaler.controlledResources }}
+      controlledResources:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.verticalPodAutoscaler.controlledValues }}
+      controlledValues: {{ . }}
+      {{- end }}
+      {{- with .Values.verticalPodAutoscaler.maxAllowed }}
+      maxAllowed:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.verticalPodAutoscaler.minAllowed }}
+      minAllowed:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+  targetRef:
+    apiVersion: apps/v1
+    kind: DaemonSet
+    name: {{ include "pcm.fullname" . }}
+  {{- with .Values.verticalPodAutoscaler.updatePolicy }}
+  updatePolicy:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+{{- end }}
diff --git a/deployment/pcm/values.yaml b/deployment/pcm/values.yaml
@@ -131,3 +131,35 @@ nfdBaremetalAffinity: false
 # feature.node.kubernetes.io/cpu-rdt.RDTMON=true
 nfdRDTAffinity: false
 
+
+### -------------- verticalPodAutoscaler ------------------
+# Enable vertical pod autoscaler support for pcm-sensor-server
+verticalPodAutoscaler:
+  enabled: false
+
+  # Recommender responsible for generating recommendation for the object.
+  # List should be empty (then the default recommender will generate the recommendation)
+  # or contain exactly one recommender.
+  # recommenders:
+  # - name: custom-recommender-performance
+
+  # List of resources that the vertical pod autoscaler can control. Defaults to cpu and memory
+  controlledResources: []
+  # Specifies which resource values should be controlled: RequestsOnly or RequestsAndLimits.
+  # controlledValues: RequestsAndLimits
+
+  # Define the max allowed resources for the pod
+  maxAllowed: {}
+  # cpu: 200m
+  # memory: 100Mi
+  # Define the min allowed resources for the pod
+  minAllowed: {}
+  # cpu: 200m
+  # memory: 100Mi
+
+  # updatePolicy:
+    # Specifies the minimum number of replicas that need to be alive for the VPA Updater to attempt pod eviction
+    # minReplicas: 1
+    # Specifies whether recommended updates are applied when a Pod is started and whether recommended updates
+    # are applied during the life of a Pod. Possible values are "Off", "Initial", "Recreate", and "Auto".
+    # updateMode: Auto