Skip to content

Commit

Permalink
add patch record
Browse files Browse the repository at this point in the history
  • Loading branch information
zoe.chou committed Jun 26, 2024
1 parent 0fd721a commit 34bfb52
Showing 1 changed file with 214 additions and 16 deletions.
230 changes: 214 additions & 16 deletions patch/1.27.patch
Original file line number Diff line number Diff line change
Expand Up @@ -122,20 +122,19 @@ index 000000000..6b4ae3a2a
+CMD ["./cluster-autoscaler"]
\ No newline at end of file
diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go
index f45716359..d489a11cc 100644
index f45716359..8c5ae0e70 100644
--- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go
+++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go
@@ -39,6 +39,9 @@ import (
@@ -39,6 +39,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/aws/aws-sdk-go/service/eks"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
+ "k8s.io/autoscaler/cluster-autoscaler/utils/gpumemory"
+ "k8s.io/autoscaler/cluster-autoscaler/utils/mpscontext"
+ klog "k8s.io/klog/v2"
)

const (
@@ -270,6 +273,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap
@@ -270,6 +272,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap
node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(template.InstanceType.VCPU, resource.DecimalSI)
node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(template.InstanceType.GPU, resource.DecimalSI)
node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(template.InstanceType.MemoryMb*1024*1024, resource.DecimalSI)
Expand All @@ -144,6 +143,19 @@ index f45716359..d489a11cc 100644

m.updateCapacityWithRequirementsOverrides(&node.Status.Capacity, asg.MixedInstancesPolicy)

diff --git a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
index 53399e6d2..74bd812fa 100644
--- a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
+++ b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
@@ -25,6 +25,8 @@ type InstanceType struct {
MemoryMb int64
GPU int64
Architecture string
+ GPUMemory int64
+ MPSContext int64
}

// StaticListLastUpdateTime is a string declaring the last time the static list was updated.
diff --git a/cluster-autoscaler/cloudprovider/builder/builder_all.go b/cluster-autoscaler/cloudprovider/builder/builder_all.go
index c8a2677ac..dddf6578d 100644
--- a/cluster-autoscaler/cloudprovider/builder/builder_all.go
Expand Down Expand Up @@ -188,7 +200,7 @@ index 5b482857c..34f96f871 100644
// GpuConfig contains the label, type and the resource name for a GPU.
diff --git a/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go
new file mode 100644
index 000000000..20034e389
index 000000000..38cd9e0a3
--- /dev/null
+++ b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go
@@ -0,0 +1,4540 @@
Expand All @@ -213,7 +225,7 @@ index 000000000..20034e389
+package spotinst
+
+// InstanceType is spec of EC2 instance
+type InstanceType struct {
+type instanceType struct {
+ InstanceType string
+ VCPU int64
+ MemoryMb int64
Expand Down Expand Up @@ -6363,13 +6375,72 @@ index 000000000..22d9a6668
\ No newline at end of file
diff --git a/cluster-autoscaler/visenze.md b/cluster-autoscaler/visenze.md
new file mode 100644
index 000000000..e69de29bb
index 000000000..1d362f5c1
--- /dev/null
+++ b/cluster-autoscaler/visenze.md
@@ -0,0 +1,55 @@
+# Upgrade workflow
+Suppose that we want to upgrade from 1.26 to 1.27
+1. Pull the latest code from the open source repository
+ - git pull upstream
+2. Create a new branch `release-1.27-eks` based on the open source branch `cluster-autoscaler-release-1.27`
+3. Find the patch file `patch/1.26.patch` in the branch `release-1.27-eks` and try to apply it to the current branch
+ ```
+ export GO111MODULE=on
+ # generate go.sum
+ go mod tidy
+ # generate vendor
+ go mod vendor
+ ```
+1. After making modifications, push the code to our repository. This triggers the build at https://jenkins.visenze.com/job/kubernetes-cluster-autoscaler/
+1. Then we can test it in the staging environment.
+
+
+# How to test that the cluster autoscaler works
+1. Test whether GPU-related resources can trigger scaling up and scaling down with this pod definition.
+
+```
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: gpu-pod
+ labels:
+ app: gpu-pod
+spec:
+ replicas: 0
+ selector:
+ matchLabels:
+ app: gpu-pod
+ template:
+ metadata:
+ labels:
+ app: gpu-pod
+ spec:
+ nodeSelector:
+ visenze.component: search
+ visenze.gpu: "true"
+ containers:
+ - name: digits-container
+ image: nvcr.io/nvidia/digits:20.12-tensorflow-py3
+ #image: banst/awscli
+ resources:
+ limits:
+ visenze.com/nvidia-gpu-memory: 8988051968
+ # visenze.com/nvidia-mps-context: 20
+ # nvidia.com/gpu: 1
+```
+2. Or you can use the files `test-ca.sh` and `gpu-deploy-tmpl.yaml` in the [scripts](scripts) folder to test it automatically
+
+# Note
+* If it works, generate and commit a new patch for the next version upgrade. The command to generate the patch:
+ `git diff [commit before applying the patch] ':(exclude)cluster-autoscaler/go.sum' ':(exclude)cluster-autoscaler/vendor' > patch/1.27.patch`
\ No newline at end of file
diff --git a/patch/1.27.patch b/patch/1.27.patch
new file mode 100644
index 000000000..9801fafa0
index 000000000..e0109ba48
--- /dev/null
+++ b/patch/1.27.patch
@@ -0,0 +1,6260 @@
@@ -0,0 +1,6387 @@
+diff --git a/Jenkinsfile b/Jenkinsfile
+new file mode 100644
+index 000000000..39890731c
Expand Down Expand Up @@ -6494,20 +6565,19 @@ index 000000000..9801fafa0
++CMD ["./cluster-autoscaler"]
+\ No newline at end of file
+diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go
+index f45716359..d489a11cc 100644
+index f45716359..8c5ae0e70 100644
+--- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go
++++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go
+@@ -39,6 +39,9 @@ import (
+@@ -39,6 +39,8 @@ import (
+ "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/aws/aws-sdk-go/service/eks"
+ "k8s.io/autoscaler/cluster-autoscaler/config"
+ "k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
++ "k8s.io/autoscaler/cluster-autoscaler/utils/gpumemory"
++ "k8s.io/autoscaler/cluster-autoscaler/utils/mpscontext"
++ klog "k8s.io/klog/v2"
+ )
+
+ const (
+@@ -270,6 +273,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap
+@@ -270,6 +272,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap
+ node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(template.InstanceType.VCPU, resource.DecimalSI)
+ node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(template.InstanceType.GPU, resource.DecimalSI)
+ node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(template.InstanceType.MemoryMb*1024*1024, resource.DecimalSI)
Expand All @@ -6516,6 +6586,19 @@ index 000000000..9801fafa0
+
+ m.updateCapacityWithRequirementsOverrides(&node.Status.Capacity, asg.MixedInstancesPolicy)
+
+diff --git a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
+index 53399e6d2..74bd812fa 100644
+--- a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
++++ b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go
+@@ -25,6 +25,8 @@ type InstanceType struct {
+ MemoryMb int64
+ GPU int64
+ Architecture string
++ GPUMemory int64
++ MPSContext int64
+ }
+
+ // StaticListLastUpdateTime is a string declaring the last time the static list was updated.
+diff --git a/cluster-autoscaler/cloudprovider/builder/builder_all.go b/cluster-autoscaler/cloudprovider/builder/builder_all.go
+index c8a2677ac..dddf6578d 100644
+--- a/cluster-autoscaler/cloudprovider/builder/builder_all.go
Expand Down Expand Up @@ -6560,7 +6643,7 @@ index 000000000..9801fafa0
+ // GpuConfig contains the label, type and the resource name for a GPU.
+diff --git a/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go
+new file mode 100644
+index 000000000..20034e389
+index 000000000..38cd9e0a3
+--- /dev/null
++++ b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go
+@@ -0,0 +1,4540 @@
Expand All @@ -6585,7 +6668,7 @@ index 000000000..9801fafa0
++package spotinst
++
++// InstanceType is spec of EC2 instance
++type InstanceType struct {
++type instanceType struct {
++ InstanceType string
++ VCPU int64
++ MemoryMb int64
Expand Down Expand Up @@ -12629,5 +12712,120 @@ index 000000000..9801fafa0
++ TotalMemory resource.Quantity
++ Pods []*apiv1.Pod
++}
+\ No
+\ No newline at end of file
+diff --git a/cluster-autoscaler/utils/gpumemory/gpumemory_test.go b/cluster-autoscaler/utils/gpumemory/gpumemory_test.go
+new file mode 100644
+index 000000000..14507cf51
+--- /dev/null
++++ b/cluster-autoscaler/utils/gpumemory/gpumemory_test.go
+@@ -0,0 +1,83 @@
++package gpumemory
++
++import (
++ "testing"
++
++ "github.com/stretchr/testify/assert"
++ apiv1 "k8s.io/api/core/v1"
++ "k8s.io/apimachinery/pkg/api/resource"
++ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
++)
++
++func TestNodeHasGpuMemory(t *testing.T) {
++ gpuLabels := map[string]string{
++ GPULabel: "nvidia-tesla-k80",
++ }
++ nodeGpuReady := &apiv1.Node{
++ ObjectMeta: metav1.ObjectMeta{
++ Name: "nodeGpuReady",
++ Labels: gpuLabels,
++ },
++ Status: apiv1.NodeStatus{
++ Capacity: apiv1.ResourceList{},
++ Allocatable: apiv1.ResourceList{},
++ },
++ }
++ nodeGpuReady.Status.Allocatable[ResourceVisenzeGPUMemory] = *resource.NewQuantity(8e9, resource.DecimalSI)
++ nodeGpuReady.Status.Capacity[ResourceVisenzeGPUMemory] = *resource.NewQuantity(8e9, resource.DecimalSI)
++ assert.True(t, NodeHasGpuMemory(nodeGpuReady))
++
++ nodeGpuUnready := &apiv1.Node{
++ ObjectMeta: metav1.ObjectMeta{
++ Name: "nodeGpuUnready",
++ Labels: gpuLabels,
++ },
++ Status: apiv1.NodeStatus{
++ Capacity: apiv1.ResourceList{},
++ Allocatable: apiv1.ResourceList{},
++ },
++ }
++ assert.True(t, NodeHasGpuMemory(nodeGpuUnready))
++
++ nodeNoGpu := &apiv1.Node{
++ ObjectMeta: metav1.ObjectMeta{
++ Name: "nodeNoGpu",
++ Labels: map[string]string{},
++ },
++ Status: apiv1.NodeStatus{
++ Capacity: apiv1.ResourceList{},
++ Allocatable: apiv1.ResourceList{},
++ },
++ }
++ assert.False(t, NodeHasGpuMemory(nodeNoGpu))
++}
++
++func TestPodRequestsGpuMemory(t *testing.T) {
++ podNoGpu := &apiv1.Pod{
++ Spec: apiv1.PodSpec{
++ Containers: []apiv1.Container{
++ apiv1.Container{
++ Resources: apiv1.ResourceRequirements{
++ Requests: apiv1.ResourceList{
++ apiv1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI),
++ },
++ },
++ },
++ },
++ },
++ }
++ podWithGpu := &apiv1.Pod{Spec: apiv1.PodSpec{Containers: []apiv1.Container{
++ apiv1.Container{
++ Resources: apiv1.ResourceRequirements{
++ Requests: apiv1.ResourceList{
++ apiv1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI),
++ ResourceVisenzeGPUMemory: *resource.NewQuantity(1, resource.DecimalSI),
++ },
++ },
++ },
++ }}}
++ podWithGpu.Spec.Containers[0].Resources.Requests[ResourceVisenzeGPUMemory] = *resource.NewQuantity(1, resource.DecimalSI)
++
++ assert.False(t, PodRequestsGpuMemory(podNoGpu))
++ assert.True(t, PodRequestsGpuMemory(podWithGpu))
++}
+\ No newline at end of file
+diff --git a/cluster-autoscaler/utils/mpscontext/mpscontext.go b/cluster-autoscaler/utils/mpscontext/mpscontext.go
+new file mode 100644
+index 000000000..22d9a6668
+--- /dev/null
++++ b/cluster-autoscaler/utils/mpscontext/mpscontext.go
+@@ -0,0 +1,6 @@
++package mpscontext
++
++// Custom resource for NVIDIA MPS context
++const (
++ ResourceVisenzeMPSContext = "visenze.com/nvidia-mps-context"
++)
+\ No newline at end of file
+diff --git a/cluster-autoscaler/visenze.md b/cluster-autoscaler/visenze.md
+new file mode 100644
+index 000000000..1d362f5c1
+--- /dev/null
++++ b/cluster-autoscaler/visenze.md
+@@ -0,0 +1,55 @@
++# Upgrade workflow
++Suppose that we want to upgrade from 1.26 to 1.27
++1. Pull the latest code from the open source repository
++ - git pull upstream
++2. Create a new branch `release-1.27-eks` based on the open source branch `cluster-autoscaler-release-1.27`
++3. Find the patch file `patch/1.26.
\ No newline at end of file

0 comments on commit 34bfb52

Please sign in to comment.