diff --git a/patch/1.27.patch b/patch/1.27.patch index 62b3d6856296..39b6a5af8a16 100644 --- a/patch/1.27.patch +++ b/patch/1.27.patch @@ -122,20 +122,19 @@ index 000000000..6b4ae3a2a +CMD ["./cluster-autoscaler"] \ No newline at end of file diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go -index f45716359..d489a11cc 100644 +index f45716359..8c5ae0e70 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go -@@ -39,6 +39,9 @@ import ( +@@ -39,6 +39,8 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/aws/aws-sdk-go/service/eks" "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" + "k8s.io/autoscaler/cluster-autoscaler/utils/gpumemory" + "k8s.io/autoscaler/cluster-autoscaler/utils/mpscontext" -+ klog "k8s.io/klog/v2" ) const ( -@@ -270,6 +273,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap +@@ -270,6 +272,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(template.InstanceType.VCPU, resource.DecimalSI) node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(template.InstanceType.GPU, resource.DecimalSI) node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(template.InstanceType.MemoryMb*1024*1024, resource.DecimalSI) @@ -144,6 +143,19 @@ index f45716359..d489a11cc 100644 m.updateCapacityWithRequirementsOverrides(&node.Status.Capacity, asg.MixedInstancesPolicy) +diff --git a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go +index 53399e6d2..74bd812fa 100644 +--- a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go ++++ b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go +@@ -25,6 +25,8 @@ type InstanceType struct { + MemoryMb int64 + GPU int64 + Architecture string ++ GPUMemory int64 ++ MPSContext int64 + } + + // StaticListLastUpdateTime is a string declaring the last time the static list was updated. diff --git a/cluster-autoscaler/cloudprovider/builder/builder_all.go b/cluster-autoscaler/cloudprovider/builder/builder_all.go index c8a2677ac..dddf6578d 100644 --- a/cluster-autoscaler/cloudprovider/builder/builder_all.go @@ -188,7 +200,7 @@ index 5b482857c..34f96f871 100644 // GpuConfig contains the label, type and the resource name for a GPU. diff --git a/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go new file mode 100644 -index 000000000..20034e389 +index 000000000..38cd9e0a3 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go @@ -0,0 +1,4540 @@ @@ -213,7 +225,7 @@ index 000000000..20034e389 +package spotinst + +// InstanceType is spec of EC2 instance -+type InstanceType struct { ++type instanceType struct { + InstanceType string + VCPU int64 + MemoryMb int64 @@ -6363,13 +6375,72 @@ index 000000000..22d9a6668 \ No newline at end of file diff --git a/cluster-autoscaler/visenze.md b/cluster-autoscaler/visenze.md new file mode 100644 -index 000000000..e69de29bb +index 000000000..1d362f5c1 +--- /dev/null ++++ b/cluster-autoscaler/visenze.md +@@ -0,0 +1,55 @@ ++#Upgrade workflow ++Suppose that we want to upgrade from 1.27 to 1.28 ++1. Pull the latest cost from the open source repository ++ - git pull upstream ++2. 
Create a new branch `release-1.27-eks` based on the open source branch `cluster-autoscaler-release-1.27` ++3. Find the patch file `patch/1.26.patch` in the branch `release-1.27-eks`, try to use it apply to the current branch ++ ``` ++ export GO111MODULE=on ++ # generate go.sum ++ go mod tidy ++ # generate vendor ++ go mod vendor ++ ``` ++1. After modifications, push the code to our repository. Then it will trigger the build https://jenkins.visenze.com/job/kubernetes-cluster-autoscaler/ ++1. Then we can test it in staging environment. ++ ++ ++#How to test the cluster autoscaler work ++1. Test if gpu related resources can trigger the scaling up and scaling down with this pod definition. ++ ++``` ++apiVersion: apps/v1 ++kind: Deployment ++metadata: ++ name: gpu-pod ++ labels: ++ app: gpu-pod ++spec: ++ replicas: 0 ++ selector: ++ matchLabels: ++ app: gpu-pod ++ template: ++ metadata: ++ labels: ++ app: gpu-pod ++ spec: ++ nodeSelector: ++ visenze.component: search ++ visenze.gpu: "true" ++ containers: ++ - name: digits-container ++ image: nvcr.io/nvidia/digits:20.12-tensorflow-py3 ++ #image: banst/awscli ++ resources: ++ limits: ++ visenze.com/nvidia-gpu-memory: 8988051968 ++ # visenze.com/nvidia-mps-context: 20 ++ # nvidia.com/gpu: 1 ++``` ++2. Or you can use the the files `test-ca.sh` and `gpu-deploy-tmpl.yaml` in [scripts](scripts) folder to test it automatically ++ ++#Note ++* If it can work, then generate and commit a new patch for the next version upgrade. The command to generate the patch: ++ `git diff [commit that before applying the patch] ':(exclude)cluster-autoscaler/go.sum' ':(exclude)cluster-autoscaler/vendor' > patch/1.27.patch` +\ No newline at end of file diff --git a/patch/1.27.patch b/patch/1.27.patch new file mode 100644 -index 000000000..9801fafa0 +index 000000000..e0109ba48 --- /dev/null +++ b/patch/1.27.patch -@@ -0,0 +1,6260 @@ +@@ -0,0 +1,6387 @@ +diff --git a/Jenkinsfile b/Jenkinsfile +new file mode 100644 +index 000000000..39890731c @@ -6494,20 +6565,19 @@ index 000000000..9801fafa0 ++CMD ["./cluster-autoscaler"] +\ No newline at end of file +diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go -+index f45716359..d489a11cc 100644 ++index f45716359..8c5ae0e70 100644 +--- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go ++++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go -+@@ -39,6 +39,9 @@ import ( ++@@ -39,6 +39,8 @@ import ( + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/aws/aws-sdk-go/service/eks" + "k8s.io/autoscaler/cluster-autoscaler/config" + "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" ++ "k8s.io/autoscaler/cluster-autoscaler/utils/gpumemory" ++ "k8s.io/autoscaler/cluster-autoscaler/utils/mpscontext" -++ klog "k8s.io/klog/v2" + ) + + const ( -+@@ -270,6 +273,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap ++@@ -270,6 +272,8 @@ func (m *AwsManager) buildNodeFromTemplate(asg *asg, template *asgTemplate) (*ap + node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(template.InstanceType.VCPU, resource.DecimalSI) + node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(template.InstanceType.GPU, resource.DecimalSI) + node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(template.InstanceType.MemoryMb*1024*1024, resource.DecimalSI) @@ -6516,6 +6586,19 @@ index 000000000..9801fafa0 + + m.updateCapacityWithRequirementsOverrides(&node.Status.Capacity, asg.MixedInstancesPolicy) + ++diff --git 
a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go ++index 53399e6d2..74bd812fa 100644 ++--- a/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go +++++ b/cluster-autoscaler/cloudprovider/aws/ec2_instance_types.go ++@@ -25,6 +25,8 @@ type InstanceType struct { ++ MemoryMb int64 ++ GPU int64 ++ Architecture string +++ GPUMemory int64 +++ MPSContext int64 ++ } ++ ++ // StaticListLastUpdateTime is a string declaring the last time the static list was updated. +diff --git a/cluster-autoscaler/cloudprovider/builder/builder_all.go b/cluster-autoscaler/cloudprovider/builder/builder_all.go +index c8a2677ac..dddf6578d 100644 +--- a/cluster-autoscaler/cloudprovider/builder/builder_all.go @@ -6560,7 +6643,7 @@ index 000000000..9801fafa0 + // GpuConfig contains the label, type and the resource name for a GPU. +diff --git a/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go +new file mode 100644 -+index 000000000..20034e389 ++index 000000000..38cd9e0a3 +--- /dev/null ++++ b/cluster-autoscaler/cloudprovider/spotinst/aws_ec2_instance_types.go +@@ -0,0 +1,4540 @@ @@ -6585,7 +6668,7 @@ index 000000000..9801fafa0 ++package spotinst ++ ++// InstanceType is spec of EC2 instance -++type InstanceType struct { +++type instanceType struct { ++ InstanceType string ++ VCPU int64 ++ MemoryMb int64 @@ -12629,5 +12712,120 @@ index 000000000..9801fafa0 ++ TotalMemory resource.Quantity ++ Pods []*apiv1.Pod ++} -+\ No ++\ No newline at end of file ++diff --git a/cluster-autoscaler/utils/gpumemory/gpumemory_test.go b/cluster-autoscaler/utils/gpumemory/gpumemory_test.go ++new file mode 100644 ++index 000000000..14507cf51 ++--- /dev/null +++++ b/cluster-autoscaler/utils/gpumemory/gpumemory_test.go ++@@ -0,0 +1,83 @@ +++package gpumemory +++ +++import ( +++ "testing" +++ +++ "github.com/stretchr/testify/assert" +++ apiv1 "k8s.io/api/core/v1" +++ "k8s.io/apimachinery/pkg/api/resource" +++ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +++) +++ +++func TestNodeHasGpuMemory(t *testing.T) { +++ gpuLabels := map[string]string{ +++ GPULabel: "nvidia-tesla-k80", +++ } +++ nodeGpuReady := &apiv1.Node{ +++ ObjectMeta: metav1.ObjectMeta{ +++ Name: "nodeGpuReady", +++ Labels: gpuLabels, +++ }, +++ Status: apiv1.NodeStatus{ +++ Capacity: apiv1.ResourceList{}, +++ Allocatable: apiv1.ResourceList{}, +++ }, +++ } +++ nodeGpuReady.Status.Allocatable[ResourceVisenzeGPUMemory] = *resource.NewQuantity(8e9, resource.DecimalSI) +++ nodeGpuReady.Status.Capacity[ResourceVisenzeGPUMemory] = *resource.NewQuantity(8e9, resource.DecimalSI) +++ assert.True(t, NodeHasGpuMemory(nodeGpuReady)) +++ +++ nodeGpuUnready := &apiv1.Node{ +++ ObjectMeta: metav1.ObjectMeta{ +++ Name: "nodeGpuUnready", +++ Labels: gpuLabels, +++ }, +++ Status: apiv1.NodeStatus{ +++ Capacity: apiv1.ResourceList{}, +++ Allocatable: apiv1.ResourceList{}, +++ }, +++ } +++ assert.True(t, NodeHasGpuMemory(nodeGpuUnready)) +++ +++ nodeNoGpu := &apiv1.Node{ +++ ObjectMeta: metav1.ObjectMeta{ +++ Name: "nodeNoGpu", +++ Labels: map[string]string{}, +++ }, +++ Status: apiv1.NodeStatus{ +++ Capacity: apiv1.ResourceList{}, +++ Allocatable: apiv1.ResourceList{}, +++ }, +++ } +++ assert.False(t, NodeHasGpuMemory(nodeNoGpu)) +++} +++ +++func TestPodRequestsGpuMemory(t *testing.T) { +++ podNoGpu := &apiv1.Pod{ +++ Spec: apiv1.PodSpec{ +++ Containers: []apiv1.Container{ +++ apiv1.Container{ +++ Resources: apiv1.ResourceRequirements{ +++ 
Requests: apiv1.ResourceList{
+++ apiv1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI),
+++ },
+++ },
+++ },
+++ },
+++ },
+++ }
+++ podWithGpu := &apiv1.Pod{Spec: apiv1.PodSpec{Containers: []apiv1.Container{
+++ apiv1.Container{
+++ Resources: apiv1.ResourceRequirements{
+++ Requests: apiv1.ResourceList{
+++ apiv1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI),
+++ ResourceVisenzeGPUMemory: *resource.NewQuantity(1, resource.DecimalSI),
+++ },
+++ },
+++ },
+++ }}}
+++ podWithGpu.Spec.Containers[0].Resources.Requests[ResourceVisenzeGPUMemory] = *resource.NewQuantity(1, resource.DecimalSI)
+++
+++ assert.False(t, PodRequestsGpuMemory(podNoGpu))
+++ assert.True(t, PodRequestsGpuMemory(podWithGpu))
+++}
++\ No newline at end of file
++diff --git a/cluster-autoscaler/utils/mpscontext/mpscontext.go b/cluster-autoscaler/utils/mpscontext/mpscontext.go
++new file mode 100644
++index 000000000..22d9a6668
++--- /dev/null
+++++ b/cluster-autoscaler/utils/mpscontext/mpscontext.go
++@@ -0,0 +1,6 @@
+++package mpscontext
+++
+++// Custom resource for NVIDIA MPS context
+++const (
+++ ResourceVisenzeMPSContext = "visenze.com/nvidia-mps-context"
+++)
++\ No newline at end of file
++diff --git a/cluster-autoscaler/visenze.md b/cluster-autoscaler/visenze.md
++new file mode 100644
++index 000000000..1d362f5c1
++--- /dev/null
+++++ b/cluster-autoscaler/visenze.md
++@@ -0,0 +1,55 @@
+++# Upgrade workflow
+++Suppose that we want to upgrade from 1.27 to 1.28
+++1. Pull the latest code from the open source repository
+++ - git pull upstream
+++2. Create a new branch `release-1.27-eks` based on the open source branch `cluster-autoscaler-release-1.27`
+++3. Find the patch file `patch/1.26.