Skip to content

Commit 1d428ea

Browse files
committed
kuberay glue
1 parent ddff964 commit 1d428ea

File tree

1 file changed: +201 −0 lines changed
Original file line numberDiff line numberDiff line change
---
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler
  namespace: default
spec:
  priority: 9
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 2
        requests:
          cpu: 3
          memory: 16G
          nvidia.com/gpu: 1
        limits:
          cpu: 3
          memory: 16G
          nvidia.com/gpu: 1
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: glue-cluster
          # finalizers:
          # - kubernetes
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '1.12.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: false
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
              num-gpus: '0'
            # pod template
            template:
              spec:
                containers:
                # The Ray head pod
                - name: ray-head
                  image: projectcodeflare/codeflare-glue:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "2"
                      memory: "16G"
                      nvidia.com/gpu: "0"
                    requests:
                      cpu: "2"
                      memory: "16G"
                      nvidia.com/gpu: "0"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 1
            minReplicas: 1
            maxReplicas: 1
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            # scaleStrategy:
            #   workersToDelete:
            #   - raycluster-complete-worker-small-group-bdtwh
            #   - raycluster-complete-worker-small-group-hv457
            #   - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
              num-gpus: '1'
            # pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
                # finalizers:
                # - kubernetes
              spec:
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
                  image: projectcodeflare/codeflare-glue:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  # environment variables to set in the container. Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "4"
                      memory: "16G"
                      nvidia.com/gpu: "1"
                    requests:
                      cpu: "4"
                      memory: "16G"
                      nvidia.com/gpu: "1"

0 commit comments

Comments
 (0)