From 87a12010f0355c91cb62724bc67505c4dbdb8bc5 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@jaroslaws-mbp.lan> Date: Tue, 15 Aug 2023 13:41:05 -0400 Subject: [PATCH 01/18] using 1.32.0 tag --- deployment/mcad-controller/templates/deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/mcad-controller/templates/deployment.yaml b/deployment/mcad-controller/templates/deployment.yaml index 56a7445d0..7e2e54896 100644 --- a/deployment/mcad-controller/templates/deployment.yaml +++ b/deployment/mcad-controller/templates/deployment.yaml @@ -33,7 +33,7 @@ spec: #{{ end }} --- #{{ if (eq .Values.configMap.multiCluster true) }} -apiVersion: apiregistration.k8s.io/v1beta1 +apiVersion: apiregistration.k8s.io/v1 kind: APIService metadata: name: v1beta1.external.metrics.k8s.io @@ -47,7 +47,7 @@ spec: groupPriorityMinimum: 100 versionPriority: 100 --- -apiVersion: apiregistration.k8s.io/v1beta1 +apiVersion: apiregistration.k8s.io/v1 kind: APIService metadata: name: v1beta1.custom.metrics.k8s.io From 7715679e7709f645f14b36455552a685aad6cfee Mon Sep 17 00:00:00 2001 From: dmatch01 <darroyo@us.ibm.com> Date: Wed, 16 Aug 2023 06:29:04 -0700 Subject: [PATCH 02/18] Added stanze def. for clusterscheduling. --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 71 ++++++++++++++++++- .../bases/mcad.ibm.com_schedulingspecs.yaml | 65 +++++++++++++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 71 ++++++++++++++++++- .../crds/mcad.ibm.com_schedulingspecs.yaml | 65 +++++++++++++++++ 4 files changed, 266 insertions(+), 6 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index b43d6d3cf..65cc7e1e9 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -62,10 +62,10 @@ spec: completionstatus: description: Optional field that drives completion status of appwrapper. This field within an item of an appwrapper determines the full state of appwrapper. - The completionstatus field contains a list of conditions that make the associate item considered + The completionstatus field contains a list of conditions that make the associate item considered completed, for instance :- completion conditions could be "Complete" or "Failed". - The associated item's level .status.conditions[].type field is monitored for any one of these conditions. Once all items with this - option is set and the conditionstatus is met the entire appwrapper state will be changed to one of the valid appwrapper completion state. Note :- this is an AND + The associated item's level .status.conditions[].type field is monitored for any one of these conditions. Once all items with this + option is set and the conditionstatus is met the entire appwrapper state will be changed to one of the valid appwrapper completion state. Note :- this is an AND operation for all items where this option is set. See the list of appwrapper states for a list of valid complete states. type: string @@ -299,6 +299,71 @@ spec: type: integer default: 0 type: object + clusterScheduling: + description: Stanza to define cluster allocation restrictions. + Used only in multiCluster mode. + properties: + clusters: + description: Specifies cluster IDs for allocation + restrictions. Used only in + multiCluster mode. + items: + properties: + name: + description: Cluster ID name. Used only in + multiCluster mode. + type: string + type: object + type: array + clusterSelector: + description: (UNSUPPORTED FIELD) A label selector is a label + query over a set of cluster objects. The result of + matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector + matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds properties: diff --git a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml index 3de2af124..d6125adfd 100644 --- a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml +++ b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml @@ -74,6 +74,71 @@ spec: type: integer default: 0 type: object + clusterScheduling: + description: Stanza to define cluster allocation restrictions. + Used only in multiCluster mode. + properties: + clusters: + description: Specifies cluster IDs for allocation + restrictions. Used only in + multiCluster mode. + items: + properties: + name: + description: Cluster ID name. Used only in + multiCluster mode. + type: string + type: object + type: array + clusterSelector: + description: (UNSUPPORTED FIELD) A label selector is a label + query over a set of cluster objects. The result of + matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector + matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds properties: diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index b43d6d3cf..65cc7e1e9 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -62,10 +62,10 @@ spec: completionstatus: description: Optional field that drives completion status of appwrapper. This field within an item of an appwrapper determines the full state of appwrapper. - The completionstatus field contains a list of conditions that make the associate item considered + The completionstatus field contains a list of conditions that make the associate item considered completed, for instance :- completion conditions could be "Complete" or "Failed". - The associated item's level .status.conditions[].type field is monitored for any one of these conditions. Once all items with this - option is set and the conditionstatus is met the entire appwrapper state will be changed to one of the valid appwrapper completion state. Note :- this is an AND + The associated item's level .status.conditions[].type field is monitored for any one of these conditions. Once all items with this + option is set and the conditionstatus is met the entire appwrapper state will be changed to one of the valid appwrapper completion state. Note :- this is an AND operation for all items where this option is set. See the list of appwrapper states for a list of valid complete states. type: string @@ -299,6 +299,71 @@ spec: type: integer default: 0 type: object + clusterScheduling: + description: Stanza to define cluster allocation restrictions. + Used only in multiCluster mode. + properties: + clusters: + description: Specifies cluster IDs for allocation + restrictions. Used only in + multiCluster mode. + items: + properties: + name: + description: Cluster ID name. Used only in + multiCluster mode. + type: string + type: object + type: array + clusterSelector: + description: (UNSUPPORTED FIELD) A label selector is a label + query over a set of cluster objects. The result of + matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector + matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds properties: diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml index 3de2af124..d6125adfd 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml @@ -74,6 +74,71 @@ spec: type: integer default: 0 type: object + clusterScheduling: + description: Stanza to define cluster allocation restrictions. + Used only in multiCluster mode. + properties: + clusters: + description: Specifies cluster IDs for allocation + restrictions. Used only in + multiCluster mode. + items: + properties: + name: + description: Cluster ID name. Used only in + multiCluster mode. + type: string + type: object + type: array + clusterSelector: + description: (UNSUPPORTED FIELD) A label selector is a label + query over a set of cluster objects. The result of + matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector + matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds properties: From bcd41271d2690152d7b0560ca7b0c4b91d6d9c45 Mon Sep 17 00:00:00 2001 From: dmatch01 <darroyo@us.ibm.com> Date: Wed, 16 Aug 2023 07:43:06 -0700 Subject: [PATCH 03/18] Added and fixed example. Signed-off-by: dmatch01 <darroyo@us.ibm.com> --- doc/usage/examples/aw-1-k8s-job1.yaml | 2 +- .../aw-1-k8s-job4-with-cluster-sched.yaml | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 doc/usage/examples/aw-1-k8s-job4-with-cluster-sched.yaml diff --git a/doc/usage/examples/aw-1-k8s-job1.yaml b/doc/usage/examples/aw-1-k8s-job1.yaml index 35e3b372e..02fa69f59 100644 --- a/doc/usage/examples/aw-1-k8s-job1.yaml +++ b/doc/usage/examples/aw-1-k8s-job1.yaml @@ -8,7 +8,7 @@ spec: minAvailable: 2 resources: GenericItems: - replicas: 1 + - replicas: 1 metadata: name: aw-generic-statefulset-2 namespace: test1 diff --git a/doc/usage/examples/aw-1-k8s-job4-with-cluster-sched.yaml b/doc/usage/examples/aw-1-k8s-job4-with-cluster-sched.yaml new file mode 100644 index 000000000..53ef6b25c --- /dev/null +++ b/doc/usage/examples/aw-1-k8s-job4-with-cluster-sched.yaml @@ -0,0 +1,41 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: aw-generic-ss-cluster-sched-2 + namespace: test1 +spec: + schedulingSpec: + minAvailable: 2 + clusterScheduling: + clusters: + - name: helloworld + resources: + GenericItems: + - replicas: 1 + metadata: + name: aw-generic-ss-cluster-sched-2 + namespace: test1 + generictemplate: + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: aw-generic-ss-cluster-sched-2 + namespace: test1 + labels: + app: aw-generic-ss-cluster-sched-2 + spec: + replicas: 2 + selector: + matchLabels: + app: aw-generic-ss-cluster-sched-2 + template: + metadata: + labels: + app: aw-generic-ss-cluster-sched-2 + spec: + containers: + - name: aw-generic-ss-cluster-sched-2 + image: k8s.gcr.io/echoserver:1.4 + imagePullPolicy: Never + ports: + - containerPort: 80 From f8c7031a95270fb0167a031e8b0074316b22092f Mon Sep 17 00:00:00 2001 From: dmatch01 <darroyo@us.ibm.com> Date: Tue, 22 Aug 2023 12:21:18 -0700 Subject: [PATCH 04/18] Added api field for targetClusterName Signed-off-by: dmatch01 <darroyo@us.ibm.com> --- config/crd/bases/mcad.ibm.com_appwrappers.yaml | 2 ++ deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml | 2 ++ go.sum | 2 ++ pkg/apis/controller/v1beta1/appwrapper.go | 3 +++ 4 files changed, 9 insertions(+) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 65cc7e1e9..838c12184 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -860,6 +860,8 @@ spec: type: boolean message: type: string + targetClusterName: + type: string pending: description: The number of pending pods. format: int32 diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 65cc7e1e9..838c12184 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -860,6 +860,8 @@ spec: type: boolean message: type: string + targetClusterName: + type: string pending: description: The number of pending pods. format: int32 diff --git a/go.sum b/go.sum index 84487dcf2..ab1361640 100644 --- a/go.sum +++ b/go.sum @@ -406,6 +406,7 @@ github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijb github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0= @@ -547,6 +548,7 @@ golang.org/x/sys v0.0.0-20201112073958-5cba982894dd/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad h1:ntjMns5wyP/fN65tdBD4g8J5w8n015+iIIs9rtjXkY0= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index f5302b3d7..066a16b5b 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -263,6 +263,9 @@ type AppWrapperStatus struct { // Represents the latest available observations of pods under appwrapper PendingPodConditions []PendingPodSpec `json:"pendingpodconditions"` + + // Represents multi-cluster observatioins + TargetClusterName string `json:"targetClusterName"` } type AppWrapperState string From 0afc21fb4818fcd9da4e3581c20f2c42aa0bdb85 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-56-143.wecm.ibm.com> Date: Wed, 23 Aug 2023 20:22:25 -0400 Subject: [PATCH 05/18] added support for setting targetClusterName in status --- cmd/kar-controllers/app/options/options.go | 9 +++++++++ .../queuejob/queuejob_controller_ex.go | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/cmd/kar-controllers/app/options/options.go b/cmd/kar-controllers/app/options/options.go index ada825490..2bfb45d1d 100644 --- a/cmd/kar-controllers/app/options/options.go +++ b/cmd/kar-controllers/app/options/options.go @@ -56,6 +56,7 @@ type ServerOption struct { QuotaRestURL string HealthProbeListenAddr string DispatchResourceReservationTimeout int64 + ExternalDispatch bool // if true, will use external plugin to dispatch workloads } // NewServerOption creates a new CMServer with a default config. @@ -83,6 +84,8 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) { fs.IntVar(&s.SecurePort, "secure-port", 6443, "The port on which to serve secured, authenticated access for metrics.") fs.StringVar(&s.HealthProbeListenAddr, "healthProbeListenAddr", ":8081", "Listen address for health probes. Defaults to ':8081'") fs.Int64Var(&s.DispatchResourceReservationTimeout, "dispatchResourceReservationTimeout", s.DispatchResourceReservationTimeout, "Resource reservation timeout for pods to be created once AppWrapper is dispatched, in millisecond. Defaults to '300000', 5 minutes") + fs.BoolVar(&s.ExternalDispatch,"externalDispatch", s.ExternalDispatch,"Use external workload dispatch plugin. Default is false.") + flag.Parse() klog.V(4).Infof("[AddFlags] Controller configuration: %#v", s) } @@ -147,6 +150,12 @@ func (s *ServerOption) loadDefaultsFromEnvVars() { s.DispatchResourceReservationTimeout = to } } + externalDispatch, envVarExists := os.LookupEnv("EXTERNAL_DISPATCH") + s.ExternalDispatch = false + if envVarExists && strings.EqualFold(externalDispatch, "true") { + s.ExternalDispatch = true + } + } func (s *ServerOption) CheckOptionOrDie() { diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 009192998..7571a9960 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -1922,7 +1922,6 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool defer func() { klog.V(10).Infof("[worker-manageQJ] Ending %s manageQJ time=%s &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(startTime), qj, qj.ResourceVersion, qj.Status) }() - if !cc.isDispatcher { // Agent Mode if qj.DeletionTimestamp != nil { @@ -2222,7 +2221,20 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool // if agentId!=nil { if agentId, ok := cc.dispatchMap[queuejobKey]; ok { klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) - cc.agentMap[agentId].CreateJob(qj) + if cc.serverOption.ExternalDispatch { + clusterList := qj.Spec.SchedSpec.ClusterScheduling.Clusters + if len(clusterList) == 0 { + klog.Errorf("[Dispatcher Controller] AppWrapper %s does not include a list of clusterIds in Spec.SchedSpec.ClusterScheduling.Clusters.", qj.Name) + return nil + } else { + // choose target clusterId at random + clusterId := clusterList[rand.Int()%len(clusterList)] + klog.V(1).Infof("ClusterId %s is chosen randomly\n", clusterId) + qj.Status.TargetClusterName = clusterId.Name + } + } else { + cc.agentMap[agentId].CreateJob(qj) + } qj.Status.IsDispatched = true } else { klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name) From 0cbb7fb3456e9659c3a3f0f18b3d08768c11e1d4 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-142-143.wecm.ibm.com> Date: Thu, 24 Aug 2023 15:28:29 -0400 Subject: [PATCH 06/18] added support to set target cluster id and delete appwrapper when no longer needed --- .../queuejob/queuejob_controller_ex.go | 66 ++++++++++++------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 7571a9960..361346bda 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -1011,6 +1011,24 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust } func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { + + if qjm.serverOption.ExternalDispatch { + clusterList := qj.Spec.SchedSpec.ClusterScheduling.Clusters + var clusterId = "" + // target clusters no defined by the submitter of workload. Just pick a target + // from a known list of clusters provided in serverOption.AgentConfigs + if len(clusterList) == 0 { + clusterId = qjm.agentList[rand.Int()%len(qjm.agentList)] + klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided by mcad\n", clusterId) + } else { + // choose target clusterId at random + clusterId = clusterList[rand.Int()%len(clusterList)].Name + klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided in Spec.SchedSpec.ClusterScheduling.Clusters: %s\n", clusterId, clusterList) + //qj.Status.TargetClusterName = + //qj.Status.TargetClusterName = clusterList[rand.Int()%len(clusterList)].Name + } + return clusterId; + } qjAggrResources := qjm.GetAggregatedResources(qj) klog.V(2).Infof("[chooseAgent] Aggregated Resources of XQJ %s: %v\n", qj.Name, qjAggrResources) @@ -2215,30 +2233,21 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool klog.V(10).Infof("[worker-manageQJ] XQJ %s has Overhead Before Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) klog.V(10).Infof("[TTime] %s, %s: WorkerBeforeDispatch", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time)) } - - queuejobKey, _ := GetQueueJobKey(qj) - // agentId:=cc.dispatchMap[queuejobKey] - // if agentId!=nil { - if agentId, ok := cc.dispatchMap[queuejobKey]; ok { - klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) - if cc.serverOption.ExternalDispatch { - clusterList := qj.Spec.SchedSpec.ClusterScheduling.Clusters - if len(clusterList) == 0 { - klog.Errorf("[Dispatcher Controller] AppWrapper %s does not include a list of clusterIds in Spec.SchedSpec.ClusterScheduling.Clusters.", qj.Name) - return nil - } else { - // choose target clusterId at random - clusterId := clusterList[rand.Int()%len(clusterList)] - klog.V(1).Infof("ClusterId %s is chosen randomly\n", clusterId) - qj.Status.TargetClusterName = clusterId.Name - } - } else { - cc.agentMap[agentId].CreateJob(qj) - } + + if cc.serverOption.ExternalDispatch { + qj.Status.TargetClusterName = cc.chooseAgent(qj) qj.Status.IsDispatched = true } else { - klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name) + queuejobKey, _ := GetQueueJobKey(qj) + if agentId, ok := cc.dispatchMap[queuejobKey]; ok { + klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) + cc.agentMap[agentId].CreateJob(qj) + qj.Status.IsDispatched = true + } else { + klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name) + } } + if klog.V(10).Enabled() { current_time := time.Now() klog.V(10).Infof("[Dispatcher Controller] XQJ %s has Overhead After Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) @@ -2287,9 +2296,18 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error { // klog.Infof("[Dispatcher] Cleanup: State=%s\n", appwrapper.Status.State) //if ! appwrapper.Status.CanRun && appwrapper.Status.IsDispatched { if appwrapper.Status.IsDispatched { - queuejobKey, _ := GetQueueJobKey(appwrapper) - if obj, ok := cc.dispatchMap[queuejobKey]; ok { - cc.agentMap[obj].DeleteJob(appwrapper) + + if cc.serverOption.ExternalDispatch { + if err := cc.arbclients.ArbV1().AppWrappers(appwrapper.Namespace).Delete(appwrapper.Name, &metav1.DeleteOptions{}); err != nil { + klog.Errorf("Failed to delete AppWrapper %v/%v: %v", + appwrapper.Namespace, appwrapper.Name, err) + return err + } + } else { + queuejobKey, _ := GetQueueJobKey(appwrapper) + if obj, ok := cc.dispatchMap[queuejobKey]; ok { + cc.agentMap[obj].DeleteJob(appwrapper) + } } appwrapper.Status.IsDispatched = false } From 9e1821aaa5081bbb6fadc42978f8e1d2d1961347 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@jaroslaws-mbp.lan> Date: Thu, 24 Aug 2023 18:04:42 -0400 Subject: [PATCH 07/18] fixed assignment for targetClusterName and modified cleanup() --- .../queuejob/queuejob_controller_ex.go | 39 +++++++------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 361346bda..3087ce380 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -2232,22 +2232,19 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool current_time := time.Now() klog.V(10).Infof("[worker-manageQJ] XQJ %s has Overhead Before Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) klog.V(10).Infof("[TTime] %s, %s: WorkerBeforeDispatch", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time)) - } - - if cc.serverOption.ExternalDispatch { - qj.Status.TargetClusterName = cc.chooseAgent(qj) - qj.Status.IsDispatched = true - } else { - queuejobKey, _ := GetQueueJobKey(qj) - if agentId, ok := cc.dispatchMap[queuejobKey]; ok { - klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) - cc.agentMap[agentId].CreateJob(qj) - qj.Status.IsDispatched = true + } + queuejobKey, _ := GetQueueJobKey(qj) + if agentId, ok := cc.dispatchMap[queuejobKey]; ok { + klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) + if cc.serverOption.ExternalDispatch { + qj.Status.TargetClusterName = agentId } else { - klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name) + cc.agentMap[agentId].CreateJob(qj) } - } - + qj.Status.IsDispatched = true + } else { + klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name) + } if klog.V(10).Enabled() { current_time := time.Now() klog.V(10).Infof("[Dispatcher Controller] XQJ %s has Overhead After Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) @@ -2296,18 +2293,12 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error { // klog.Infof("[Dispatcher] Cleanup: State=%s\n", appwrapper.Status.State) //if ! appwrapper.Status.CanRun && appwrapper.Status.IsDispatched { if appwrapper.Status.IsDispatched { - - if cc.serverOption.ExternalDispatch { - if err := cc.arbclients.ArbV1().AppWrappers(appwrapper.Namespace).Delete(appwrapper.Name, &metav1.DeleteOptions{}); err != nil { - klog.Errorf("Failed to delete AppWrapper %v/%v: %v", - appwrapper.Namespace, appwrapper.Name, err) - return err - } - } else { - queuejobKey, _ := GetQueueJobKey(appwrapper) - if obj, ok := cc.dispatchMap[queuejobKey]; ok { + queuejobKey, _ := GetQueueJobKey(appwrapper) + if obj, ok := cc.dispatchMap[queuejobKey]; ok { + if !cc.serverOption.ExternalDispatch { cc.agentMap[obj].DeleteJob(appwrapper) } + delete(cc.dispatchMap,queuejobKey) } appwrapper.Status.IsDispatched = false } From 40b121faeb3562cd5a88ba18ec956fb92296d886 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@jaroslaws-mbp.lan> Date: Thu, 24 Aug 2023 18:54:20 -0400 Subject: [PATCH 08/18] modified helm related resources to support new option externalDispatch --- deployment/mcad-controller/templates/configmap.yaml | 1 + deployment/mcad-controller/values.yaml | 1 + .../bundle/manifests/mcad-operator.clusterserviceversion.yaml | 2 +- .../deploy/crds/mcad.ibm.com_v1beta1_mcadhelmconfig_cr.yaml | 1 + .../manifests/mcad-operator.clusterserviceversion.yaml | 2 +- .../helm-charts/mcad-controller/templates/configmap.yaml | 1 + .../mcad-operator/helm-charts/mcad-controller/values.yaml | 1 + doc/deploy/deployment.md | 1 + 8 files changed, 8 insertions(+), 2 deletions(-) diff --git a/deployment/mcad-controller/templates/configmap.yaml b/deployment/mcad-controller/templates/configmap.yaml index b25f71f77..6ab34efd9 100644 --- a/deployment/mcad-controller/templates/configmap.yaml +++ b/deployment/mcad-controller/templates/configmap.yaml @@ -7,6 +7,7 @@ metadata: data: QUOTA_ENABLED: {{ .Values.configMap.quotaEnabled }} DISPATCHER_MODE: {{ .Values.configMap.dispatcherMode }} + EXTERNAL_DISPATCH: {{.Values.configMap.externalDispatch }} {{ if .Values.configMap.agentConfigs }}DISPATCHER_AGENT_CONFIGS: {{ .Values.configMap.agentConfigs }}{{ end }} PREEMPTION: {{ .Values.configMap.preemptionEnabled }} {{ if .Values.configMap.quotaRestUrl }}QUOTA_REST_URL: {{ .Values.configMap.quotaRestUrl }}{{ end }} diff --git a/deployment/mcad-controller/values.yaml b/deployment/mcad-controller/values.yaml index 1d289f2f8..741985b82 100644 --- a/deployment/mcad-controller/values.yaml +++ b/deployment/mcad-controller/values.yaml @@ -47,6 +47,7 @@ configMap: quotaEnabled: '"false"' multiCluster: false dispatcherMode: '"false"' + externalDispatch: '"false"' preemptionEnabled: '"false"' agentConfigs: "" quotaRestUrl: "" diff --git a/deployment/mcad-operator/bundle/manifests/mcad-operator.clusterserviceversion.yaml b/deployment/mcad-operator/bundle/manifests/mcad-operator.clusterserviceversion.yaml index a82f6f43d..7166e0460 100644 --- a/deployment/mcad-operator/bundle/manifests/mcad-operator.clusterserviceversion.yaml +++ b/deployment/mcad-operator/bundle/manifests/mcad-operator.clusterserviceversion.yaml @@ -3,7 +3,7 @@ kind: ClusterServiceVersion metadata: annotations: alm-examples: >- - [{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}] + [{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","externalDispatch":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}] capabilities: Basic Install description: A Kubernetes Native Holistic Lifecycle Resource Manager for Applications name: mcad-operator.v0.1.9 diff --git a/deployment/mcad-operator/deploy/crds/mcad.ibm.com_v1beta1_mcadhelmconfig_cr.yaml b/deployment/mcad-operator/deploy/crds/mcad.ibm.com_v1beta1_mcadhelmconfig_cr.yaml index 00fa8b085..ea01e1c11 100644 --- a/deployment/mcad-operator/deploy/crds/mcad.ibm.com_v1beta1_mcadhelmconfig_cr.yaml +++ b/deployment/mcad-operator/deploy/crds/mcad.ibm.com_v1beta1_mcadhelmconfig_cr.yaml @@ -8,6 +8,7 @@ spec: configMap: agentConfigs: null dispatcherMode: "false" + externalDispatch: "false" name: null deploymentName: xqueuejob-controller image: diff --git a/deployment/mcad-operator/deploy/olm-catalog/mcad-operator/manifests/mcad-operator.clusterserviceversion.yaml b/deployment/mcad-operator/deploy/olm-catalog/mcad-operator/manifests/mcad-operator.clusterserviceversion.yaml index 58afc99f6..7dba6e615 100644 --- a/deployment/mcad-operator/deploy/olm-catalog/mcad-operator/manifests/mcad-operator.clusterserviceversion.yaml +++ b/deployment/mcad-operator/deploy/olm-catalog/mcad-operator/manifests/mcad-operator.clusterserviceversion.yaml @@ -3,7 +3,7 @@ kind: ClusterServiceVersion metadata: annotations: alm-examples: >- - [{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}] + [{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","externalDispatch":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}] capabilities: Basic Install description: A Kubernetes Native Holistic Lifecycle Resource Manager for Applications name: mcad-operator.v0.1.9 diff --git a/deployment/mcad-operator/helm-charts/mcad-controller/templates/configmap.yaml b/deployment/mcad-operator/helm-charts/mcad-controller/templates/configmap.yaml index 451bd3337..5fc7df1c6 100644 --- a/deployment/mcad-operator/helm-charts/mcad-controller/templates/configmap.yaml +++ b/deployment/mcad-operator/helm-charts/mcad-controller/templates/configmap.yaml @@ -6,5 +6,6 @@ metadata: namespace: kube-system data: DISPATCHER_MODE: {{ .Values.configMap.dispatcherMode }} + EXTERNAL_DISPATCH: {{.Values.configMap.externalDispatch }} DISPATCHER_AGENT_CONFIGS: {{ .Values.configMap.agentConfigs }} #{{ end }} diff --git a/deployment/mcad-operator/helm-charts/mcad-controller/values.yaml b/deployment/mcad-operator/helm-charts/mcad-controller/values.yaml index c2031f7cd..8a5cbac5c 100644 --- a/deployment/mcad-operator/helm-charts/mcad-controller/values.yaml +++ b/deployment/mcad-operator/helm-charts/mcad-controller/values.yaml @@ -32,6 +32,7 @@ configMap: name: multiCluster: false dispatcherMode: "false" + externalDispatch: "false" agentConfigs: volumes: diff --git a/doc/deploy/deployment.md b/doc/deploy/deployment.md index af81b4f7b..61cc07a3a 100644 --- a/doc/deploy/deployment.md +++ b/doc/deploy/deployment.md @@ -150,6 +150,7 @@ The following table lists the configurable parameters of the helm chart and thei | ----------------------- | ------------------------------------ | ------------- | ------------------------------------------------ | | `configMap.agentConfigs` | *For Every Agent Cluster separated by commas(,):* Name of *agent* config file _:_ Set the dispatching mode for the _*Agent Cluster*_. Note:For the dispatching mode `uncordon`, indicating _MCAD_ controller is allowed to dispatched jobs to the _*Agent Cluster*_, is only supported. | <_No default for agent config file_>:`uncordon` | `agent101config:uncordon,agent110config:uncordon` | | `configMap.dispatcherMode` | Whether the _MCAD_ Controller should be launched in Dispatcher mode or not | `false` | `true` | +| `configMap.externalDispatch` | Whether the _MCAD_ Controller should use external plugin to dispatch workloads or not | `false` | `true` | | `configMap.name` | Name of the Kubernetes *ConfigMap* resource to configure the _MCAD_ Controller | | `mcad-deployer` | | `deploymentName` | Name of _MCAD_ Controller Deployment Object | `mcad-controller` | `my-mcad-controller` | | `image.pullPolicy` | Policy that dictates when the specified image is pulled | `Always` | `Never` | From 7d97f3eb977362172690af4937b4b52fc471a5f5 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@jaroslaws-mbp.lan> Date: Fri, 25 Aug 2023 15:50:43 -0400 Subject: [PATCH 09/18] fixed wrong assignment of targetClusterName --- pkg/controller/queuejob/queuejob_controller_ex.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 3087ce380..07589259d 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -2237,7 +2237,9 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if agentId, ok := cc.dispatchMap[queuejobKey]; ok { klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) if cc.serverOption.ExternalDispatch { - qj.Status.TargetClusterName = agentId + values := strings.Split(agentId,"/") + klog.V(10).Infof("[Dispatcher Controller] Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1]) + qj.Status.TargetClusterName = values[len(values)-1] //agentId } else { cc.agentMap[agentId].CreateJob(qj) } From fd0d436fac92ee559fddd41a8ac8e8dc2c4035ae Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@jaroslaws-mbp.lan> Date: Fri, 25 Aug 2023 16:39:54 -0400 Subject: [PATCH 10/18] more fixes to support setting targetClusterName --- .../queuejob/queuejob_controller_ex.go | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 07589259d..5aec7bf11 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -34,6 +34,7 @@ import ( "fmt" "math" "math/rand" + "path" "reflect" "runtime/debug" "sort" @@ -1013,21 +1014,28 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { if qjm.serverOption.ExternalDispatch { - clusterList := qj.Spec.SchedSpec.ClusterScheduling.Clusters - var clusterId = "" + clusters := qj.Spec.SchedSpec.ClusterScheduling.Clusters + var agentId = "" + apath := path.Dir(qjm.agentList[0]) + var agentIdList = make([]string, len(clusters)) + clustersProvided := false // assume clusters not provided + for _, clusterRef := range clusters { + if clusterRef.Name != "" { + clustersProvided = true + agentIdList = append(agentIdList, apath+"/"+clusterRef.Name ) + } + } // target clusters no defined by the submitter of workload. Just pick a target // from a known list of clusters provided in serverOption.AgentConfigs - if len(clusterList) == 0 { - clusterId = qjm.agentList[rand.Int()%len(qjm.agentList)] - klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided by mcad\n", clusterId) + if !clustersProvided { + agentId = qjm.agentList[rand.Int()%len(qjm.agentList)] + klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided by mcad\n", agentId) } else { // choose target clusterId at random - clusterId = clusterList[rand.Int()%len(clusterList)].Name - klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided in Spec.SchedSpec.ClusterScheduling.Clusters: %s\n", clusterId, clusterList) - //qj.Status.TargetClusterName = - //qj.Status.TargetClusterName = clusterList[rand.Int()%len(clusterList)].Name + agentId = agentIdList[rand.Int()%len(agentIdList)] + klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided in Spec.SchedSpec.ClusterScheduling.Clusters: %s\n", agentId, agentIdList) } - return clusterId; + return agentId; } qjAggrResources := qjm.GetAggregatedResources(qj) From e972e07bd215233f99a436cf14086527e3ac8311 Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-140-153.wecm.ibm.com> Date: Thu, 31 Aug 2023 16:42:37 -0400 Subject: [PATCH 11/18] fixes supporting status --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 18 +++++ .../crds/mcad.ibm.com_appwrappers.yaml | 18 +++++ pkg/apis/controller/v1beta1/appwrapper.go | 16 ++++- .../queuejob/queuejob_controller_ex.go | 68 +++++++++++++------ .../genericresource/genericresource.go | 21 +++--- 5 files changed, 109 insertions(+), 32 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 838c12184..927445011 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -889,6 +889,24 @@ spec: (is this different from the MinAvailable from JobStatus) format: int32 type: integer + itemCompletionStatus: + properties: + GenericItems: + items: + properties: + condition: + type: string + name: + type: string + namespace: + type: string + required: + - condition + - name + - namespace + type: object + type: array + type: object type: object required: - spec diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 838c12184..927445011 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -889,6 +889,24 @@ spec: (is this different from the MinAvailable from JobStatus) format: int32 type: integer + itemCompletionStatus: + properties: + GenericItems: + items: + properties: + condition: + type: string + name: + type: string + namespace: + type: string + required: + - condition + - name + - namespace + type: object + type: array + type: object type: object required: - spec diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 066a16b5b..54e016f42 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -101,7 +101,7 @@ type AppWrapperService struct { } // AppWrapperResource is App Wrapper aggregation resource -//todo: To be depricated +// todo: To be depricated type AppWrapperResource struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata"` @@ -266,11 +266,23 @@ type AppWrapperStatus struct { // Represents multi-cluster observatioins TargetClusterName string `json:"targetClusterName"` + + ItemCompletionStatus GenericItemCompletionStatus `json:"itemCompletionStatus,omitempty"` +} + +type GenericItemCompletionStatus struct { + GenericItems [] GenericItem `json:"GenericItems,omitempty"` +} + +type GenericItem struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + Condition string `json:"condition"` } type AppWrapperState string -//enqueued, active, deleting, succeeded, failed +// enqueued, active, deleting, succeeded, failed const ( AppWrapperStateEnqueued AppWrapperState = "Pending" AppWrapperStateActive AppWrapperState = "Running" diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 5aec7bf11..d645620bd 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -680,8 +680,22 @@ func (qjm *XController) GetAggregatedResourcesPerGenericItem(cqj *arbv1.AppWrapp } // Gets all objects owned by AW from API server, check user supplied status and set whole AW status -func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arbv1.AppWrapperState { +func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (arbv1.AppWrapperState, arbv1.GenericItemCompletionStatus) { +/* + statusField := reflect.ValueOf(caw.Status) + if statusField.Kind() != reflect.Struct { + klog.Errorf("[getAppWrapperCompletionStatus] Error reflecting Status, %#v", statusField) + return caw.Status.State, arbv1.GenericItemCompletionStatus{} + } + completionStatus := caw.Status.ItemCompletionStatus + itemCompletionStatusField := statusField.FieldByName("ItemCompletionStatus") + if !itemCompletionStatusField.IsValid() { + completionStatus = arbv1.GenericItemCompletionStatus{} + } +*/ + completionStatus := arbv1.GenericItemCompletionStatus{} + _ = completionStatus // Get all pods and related resources countCompletionRequired := 0 for _, genericItem := range caw.Spec.AggrResources.GenericItems { @@ -706,10 +720,18 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb } klog.Infof("[getAppWrapperCompletionStatus] Checking items completed for appwrapper: %v in namespace: %v", caw.Name, caw.Namespace) - status := qjm.genericresources.IsItemCompleted(&genericItem, caw.Namespace, caw.Name, name) + status, condition := qjm.genericresources.IsItemCompleted(&genericItem, caw.Namespace, caw.Name, name) if !status { //early termination because a required item is not completed - return caw.Status.State + return caw.Status.State, completionStatus + } else { + _ = condition + genItemCompletionStatus := arbv1.GenericItem { + Name: caw.Name, + Namespace: caw.Namespace, + Condition: condition, + } + completionStatus.GenericItems = append(completionStatus.GenericItems, genItemCompletionStatus) } //only consider count completion required for valid items @@ -722,15 +744,15 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb //Set new status only when completion required flag is present in genericitems array if countCompletionRequired > 0 { if caw.Status.Running == 0 && caw.Status.Pending == 0 { - return arbv1.AppWrapperStateCompleted + return arbv1.AppWrapperStateCompleted, completionStatus } if caw.Status.Pending > 0 || caw.Status.Running > 0 { - return arbv1.AppWrapperStateRunningHoldCompletion + return arbv1.AppWrapperStateRunningHoldCompletion, completionStatus } } //return previous condition - return caw.Status.State + return caw.Status.State, completionStatus } func (qjm *XController) GetAggregatedResources(cqj *arbv1.AppWrapper) *clusterstateapi.Resource { @@ -1012,17 +1034,17 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust } func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { - + if qjm.serverOption.ExternalDispatch { clusters := qj.Spec.SchedSpec.ClusterScheduling.Clusters var agentId = "" - apath := path.Dir(qjm.agentList[0]) + apath := path.Dir(qjm.agentList[0]) var agentIdList = make([]string, len(clusters)) - clustersProvided := false // assume clusters not provided + clustersProvided := false // assume clusters not provided for _, clusterRef := range clusters { if clusterRef.Name != "" { clustersProvided = true - agentIdList = append(agentIdList, apath+"/"+clusterRef.Name ) + agentIdList = append(agentIdList, apath+"/"+clusterRef.Name) } } // target clusters no defined by the submitter of workload. Just pick a target @@ -1031,12 +1053,12 @@ func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { agentId = qjm.agentList[rand.Int()%len(qjm.agentList)] klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided by mcad\n", agentId) } else { - // choose target clusterId at random - agentId = agentIdList[rand.Int()%len(agentIdList)] - klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided in Spec.SchedSpec.ClusterScheduling.Clusters: %s\n", agentId, agentIdList) + // choose target clusterId at random + agentId = agentIdList[rand.Int()%len(agentIdList)] + klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided in Spec.SchedSpec.ClusterScheduling.Clusters: %s\n", agentId, agentIdList) } - return agentId; - } + return agentId + } qjAggrResources := qjm.GetAggregatedResources(qj) klog.V(2).Infof("[chooseAgent] Aggregated Resources of XQJ %s: %v\n", qj.Name, qjAggrResources) @@ -2123,12 +2145,14 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } else if qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateActive { //set appwrapper status to Complete or RunningHoldCompletion - derivedAwStatus := cc.getAppWrapperCompletionStatus(qj) - + derivedAwStatus, genericItemsCompletionStatus := cc.getAppWrapperCompletionStatus(qj) + //Set Appwrapper state to complete if all items in Appwrapper //are completed if derivedAwStatus == arbv1.AppWrapperStateRunningHoldCompletion { qj.Status.State = derivedAwStatus + klog.V(1).Infof("[>>>>>>>>>>] Setting ItemCompletionStatus 1") + qj.Status.ItemCompletionStatus = genericItemsCompletionStatus var updateQj *arbv1.AppWrapper index := getIndexOfMatchedCondition(qj, arbv1.AppWrapperCondRunningHoldCompletion, "SomeItemsCompleted") if index < 0 { @@ -2146,6 +2170,8 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } //Set appwrapper status to complete if derivedAwStatus == arbv1.AppWrapperStateCompleted { + klog.V(1).Infof("[>>>>>>>>>>] Setting ItemCompletionStatus 2") + qj.Status.ItemCompletionStatus = genericItemsCompletionStatus qj.Status.State = derivedAwStatus qj.Status.CanRun = false var updateQj *arbv1.AppWrapper @@ -2240,21 +2266,21 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool current_time := time.Now() klog.V(10).Infof("[worker-manageQJ] XQJ %s has Overhead Before Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) klog.V(10).Infof("[TTime] %s, %s: WorkerBeforeDispatch", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time)) - } + } queuejobKey, _ := GetQueueJobKey(qj) if agentId, ok := cc.dispatchMap[queuejobKey]; ok { klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) if cc.serverOption.ExternalDispatch { - values := strings.Split(agentId,"/") + values := strings.Split(agentId, "/") klog.V(10).Infof("[Dispatcher Controller] Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1]) - qj.Status.TargetClusterName = values[len(values)-1] //agentId + qj.Status.TargetClusterName = values[len(values)-1] //agentId } else { cc.agentMap[agentId].CreateJob(qj) } qj.Status.IsDispatched = true } else { klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name) - } + } if klog.V(10).Enabled() { current_time := time.Now() klog.V(10).Infof("[Dispatcher Controller] XQJ %s has Overhead After Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) diff --git a/pkg/controller/queuejobresources/genericresource/genericresource.go b/pkg/controller/queuejobresources/genericresource/genericresource.go index 4b2879305..21a6d498c 100644 --- a/pkg/controller/queuejobresources/genericresource/genericresource.go +++ b/pkg/controller/queuejobresources/genericresource/genericresource.go @@ -598,24 +598,25 @@ func getContainerResources(container v1.Container, replicas float64) *clustersta } //returns status of an item present in etcd -func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, namespace string, appwrapperName string, genericItemName string) (completed bool) { +func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, namespace string, appwrapperName string, genericItemName string) (completed bool, condition string) { dd := gr.clients.Discovery() + klog.V(8).Infof("[IsItemCompleted] - checking status !!!!!!!") apigroups, err := restmapper.GetAPIGroupResources(dd) if err != nil { klog.Errorf("[IsItemCompleted] Error getting API resources, err=%#v", err) - return false + return false, "" } restmapper := restmapper.NewDiscoveryRESTMapper(apigroups) _, gvk, err := unstructured.UnstructuredJSONScheme.Decode(awgr.GenericTemplate.Raw, nil, nil) if err != nil { klog.Errorf("[IsItemCompleted] Decoding error, please check your CR! Aborting handling the resource creation, err: `%v`", err) - return false + return false, "" } mapping, err := restmapper.RESTMapping(gvk.GroupKind(), gvk.Version) if err != nil { klog.Errorf("[IsItemCompleted] mapping error from raw object: `%v`", err) - return false + return false, "" } restconfig := gr.kubeClientConfig restconfig.GroupVersion = &schema.GroupVersion{ @@ -626,14 +627,14 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc dclient, err := dynamic.NewForConfig(restconfig) if err != nil { klog.Errorf("[IsItemCompleted] Error creating new dynamic client, err %v", err) - return false + return false, "" } labelSelector := fmt.Sprintf("%s=%s", appwrapperJobName, appwrapperName) inEtcd, err := dclient.Resource(rsrc).Namespace(namespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector}) if err != nil { klog.Errorf("[IsItemCompleted] Error listing object: %v", err) - return false + return false, "" } for _, job := range inEtcd.Items { @@ -660,7 +661,7 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc if jobMap == nil { continue } - + if job.Object["status"] != nil { status := job.Object["status"].(map[string]interface{}) if status["conditions"] != nil { @@ -676,7 +677,9 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc userSpecfiedCompletionConditions := strings.Split(awgr.CompletionStatus, ",") for _, condition := range userSpecfiedCompletionConditions { if strings.Contains(strings.ToLower(completionType), strings.ToLower(condition)) { - return true + klog.V(8).Infof("[IsItemCompleted] condition `%v`.\n", condition) + + return true, condition } } } @@ -685,5 +688,5 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc klog.Errorf("[IsItemCompleted] Found item with name %v that has status nil in namespace %v with labels %v", job.GetName(), job.GetNamespace(), job.GetLabels()) } } - return false + return false, "" } From 9f67d0fa2afd7caffcae6c63137a453db7b9b75f Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@jaroslaws-mbp.lan> Date: Wed, 6 Sep 2023 08:49:25 -0400 Subject: [PATCH 12/18] add completion status to every item in a pipeline --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 14 +++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 14 +++++++ pkg/apis/controller/v1beta1/appwrapper.go | 7 +++- .../queuejob/queuejob_controller_ex.go | 39 ++++++++++--------- 4 files changed, 55 insertions(+), 19 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 927445011..9109ce57d 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -896,6 +896,20 @@ spec: properties: condition: type: string + itemGVK: + description: Group Version Kind + properties: + group: + type: string + kind: + type: string + version: + type: string + required: + - group + - kind + - version + type: object name: type: string namespace: diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 927445011..9109ce57d 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -896,6 +896,20 @@ spec: properties: condition: type: string + itemGVK: + description: Group Version Kind + properties: + group: + type: string + kind: + type: string + version: + type: string + required: + - group + - kind + - version + type: object name: type: string namespace: diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 54e016f42..206dc143f 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -275,11 +275,16 @@ type GenericItemCompletionStatus struct { } type GenericItem struct { + ItemGVK ItemGVK `json:"itemGVK"` Name string `json:"name"` Namespace string `json:"namespace"` Condition string `json:"condition"` } - +type ItemGVK struct { + Group string `json:"group"` + Version string `json:"version"` + Kind string `json:"kind"` +} type AppWrapperState string // enqueued, active, deleting, succeeded, failed diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index d645620bd..4b5107cd0 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -681,20 +681,8 @@ func (qjm *XController) GetAggregatedResourcesPerGenericItem(cqj *arbv1.AppWrapp // Gets all objects owned by AW from API server, check user supplied status and set whole AW status func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (arbv1.AppWrapperState, arbv1.GenericItemCompletionStatus) { -/* - statusField := reflect.ValueOf(caw.Status) - if statusField.Kind() != reflect.Struct { - klog.Errorf("[getAppWrapperCompletionStatus] Error reflecting Status, %#v", statusField) - return caw.Status.State, arbv1.GenericItemCompletionStatus{} - } - completionStatus := caw.Status.ItemCompletionStatus - itemCompletionStatusField := statusField.FieldByName("ItemCompletionStatus") - if !itemCompletionStatusField.IsValid() { - completionStatus = arbv1.GenericItemCompletionStatus{} - } -*/ - completionStatus := arbv1.GenericItemCompletionStatus{} + completionStatus := arbv1.GenericItemCompletionStatus{} _ = completionStatus // Get all pods and related resources countCompletionRequired := 0 @@ -709,12 +697,21 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (ar } unstruct.Object = blob.(map[string]interface{}) //set object to the content of the blob after Unmarshalling name := "" + namespace := "" if md, ok := unstruct.Object["metadata"]; ok { metadata := md.(map[string]interface{}) if objectName, ok := metadata["name"]; ok { name = objectName.(string) } + if objectName, ok := metadata["namespace"]; ok { + namespace = objectName.(string) + } + } + _, gvk, err := unstructured.UnstructuredJSONScheme.Decode(objectName.Raw, nil, nil) + if err != nil { + klog.Errorf("[getAppWrapperCompletionStatus] Error unmarshalling, err=%#v", err) } + if len(name) == 0 { klog.Warningf("[getAppWrapperCompletionStatus] object name not present for appwrapper: %v in namespace: %v", caw.Name, caw.Namespace) } @@ -726,10 +723,16 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (ar return caw.Status.State, completionStatus } else { _ = condition - genItemCompletionStatus := arbv1.GenericItem { - Name: caw.Name, - Namespace: caw.Namespace, - Condition: condition, + + genItemCompletionStatus := arbv1.GenericItem{ + ItemGVK: arbv1.ItemGVK{ + Group: gvk.Group, + Version: gvk.Version, + Kind: gvk.Kind, + }, + Name: name, + Namespace: namespace, + Condition: condition, } completionStatus.GenericItems = append(completionStatus.GenericItems, genItemCompletionStatus) } @@ -2146,7 +2149,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } else if qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateActive { //set appwrapper status to Complete or RunningHoldCompletion derivedAwStatus, genericItemsCompletionStatus := cc.getAppWrapperCompletionStatus(qj) - + //Set Appwrapper state to complete if all items in Appwrapper //are completed if derivedAwStatus == arbv1.AppWrapperStateRunningHoldCompletion { From 0bbda5ab61f8a18bc8809a99a49486e8ba8159be Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-152-81.wecm.ibm.com> Date: Thu, 7 Sep 2023 14:13:21 -0400 Subject: [PATCH 13/18] simplified implementation of gvk in completion status for each item in the workload --- config/crd/bases/mcad.ibm.com_appwrappers.yaml | 16 ++++++++-------- .../crds/mcad.ibm.com_appwrappers.yaml | 16 ++++++++-------- pkg/apis/controller/v1beta1/appwrapper.go | 11 ++++------- .../queuejob/queuejob_controller_ex.go | 5 +++-- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 9109ce57d..1d36b3ef8 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -891,24 +891,24 @@ spec: type: integer itemCompletionStatus: properties: - GenericItems: + genericItems: items: properties: condition: type: string - itemGVK: + gvk: description: Group Version Kind properties: - group: + Group: type: string - kind: + Kind: type: string - version: + Version: type: string required: - - group - - kind - - version + - Group + - Kind + - Version type: object name: type: string diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 9109ce57d..1d36b3ef8 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -891,24 +891,24 @@ spec: type: integer itemCompletionStatus: properties: - GenericItems: + genericItems: items: properties: condition: type: string - itemGVK: + gvk: description: Group Version Kind properties: - group: + Group: type: string - kind: + Kind: type: string - version: + Version: type: string required: - - group - - kind - - version + - Group + - Kind + - Version type: object name: type: string diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 206dc143f..43915bd6f 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -34,6 +34,7 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" ) const AppWrapperPlural string = "appwrappers" @@ -271,20 +272,16 @@ type AppWrapperStatus struct { } type GenericItemCompletionStatus struct { - GenericItems [] GenericItem `json:"GenericItems,omitempty"` + GenericItems [] GenericItem `json:"genericItems,omitempty"` } type GenericItem struct { - ItemGVK ItemGVK `json:"itemGVK"` + schema.GroupVersionKind `json:"gvk"` Name string `json:"name"` Namespace string `json:"namespace"` Condition string `json:"condition"` } -type ItemGVK struct { - Group string `json:"group"` - Version string `json:"version"` - Kind string `json:"kind"` -} + type AppWrapperState string // enqueued, active, deleting, succeeded, failed diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 4b5107cd0..7d73008c2 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -725,11 +725,12 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (ar _ = condition genItemCompletionStatus := arbv1.GenericItem{ - ItemGVK: arbv1.ItemGVK{ - Group: gvk.Group, + GroupVersionKind: schema.GroupVersionKind { + Group: gvk.Group, Version: gvk.Version, Kind: gvk.Kind, }, + Name: name, Namespace: namespace, Condition: condition, From c795e2a44b81684ce0ae83c8ca580126f2a44bde Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-134-145.wecm.ibm.com> Date: Fri, 8 Sep 2023 17:56:10 -0400 Subject: [PATCH 14/18] redone gvk. Modified dispatcher so that it handled job completion status --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 7 +- .../crds/mcad.ibm.com_appwrappers.yaml | 7 +- .../queuejob/queuejob_controller_ex.go | 119 +++++++++--------- 3 files changed, 64 insertions(+), 69 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 1d36b3ef8..62bb5c612 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -901,14 +901,14 @@ spec: properties: Group: type: string - Kind: - type: string Version: type: string + Kind: + type: string required: - Group - - Kind - Version + - Kind type: object name: type: string @@ -916,6 +916,7 @@ spec: type: string required: - condition + - gvk - name - namespace type: object diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 1d36b3ef8..62bb5c612 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -901,14 +901,14 @@ spec: properties: Group: type: string - Kind: - type: string Version: type: string + Kind: + type: string required: - Group - - Kind - Version + - Kind type: object name: type: string @@ -916,6 +916,7 @@ spec: type: string required: - condition + - gvk - name - namespace type: object diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 7d73008c2..174fb9516 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -683,7 +683,6 @@ func (qjm *XController) GetAggregatedResourcesPerGenericItem(cqj *arbv1.AppWrapp func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (arbv1.AppWrapperState, arbv1.GenericItemCompletionStatus) { completionStatus := arbv1.GenericItemCompletionStatus{} - _ = completionStatus // Get all pods and related resources countCompletionRequired := 0 for _, genericItem := range caw.Spec.AggrResources.GenericItems { @@ -703,15 +702,27 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (ar if objectName, ok := metadata["name"]; ok { name = objectName.(string) } - if objectName, ok := metadata["namespace"]; ok { - namespace = objectName.(string) + if objectNamespace, ok := metadata["namespace"]; ok { + if objectNamespace != nil { + namespace = objectNamespace.(string) + } + } + } + var gvk = schema.GroupVersionKind{} + _, groupversionkind, err := unstructured.UnstructuredJSONScheme.Decode(objectName.Raw, nil, nil) + if err != nil || groupversionkind == nil { + klog.Errorf("[getAppWrapperCompletionStatus] Error unmarshalling gvk, err=%#v", err) + unknown := "Unknown" + gvk = schema.GroupVersionKind { + Group: unknown, + Version: unknown, + Kind: unknown, } } - _, gvk, err := unstructured.UnstructuredJSONScheme.Decode(objectName.Raw, nil, nil) - if err != nil { - klog.Errorf("[getAppWrapperCompletionStatus] Error unmarshalling, err=%#v", err) + if groupversionkind != nil { + gvk = *groupversionkind } - + if len(name) == 0 { klog.Warningf("[getAppWrapperCompletionStatus] object name not present for appwrapper: %v in namespace: %v", caw.Name, caw.Namespace) } @@ -722,8 +733,6 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) (ar //early termination because a required item is not completed return caw.Status.State, completionStatus } else { - _ = condition - genItemCompletionStatus := arbv1.GenericItem{ GroupVersionKind: schema.GroupVersionKind { Group: gvk.Group, @@ -1974,40 +1983,41 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool defer func() { klog.V(10).Infof("[worker-manageQJ] Ending %s manageQJ time=%s &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(startTime), qj, qj.ResourceVersion, qj.Status) }() - if !cc.isDispatcher { // Agent Mode - if qj.DeletionTimestamp != nil { + if qj.DeletionTimestamp != nil { - klog.V(4).Infof("[manageQueueJob] AW job=%s/%s set for deletion.", qj.Name, qj.Namespace) + klog.V(4).Infof("[manageQueueJob] AW job=%s/%s set for deletion.", qj.Name, qj.Namespace) - // cleanup resources for running job - err = cc.Cleanup(qj) - if err != nil { - return err - } - //empty finalizers and delete the queuejob again - accessor, err := meta.Accessor(qj) - if err != nil { - return err - } - accessor.SetFinalizers(nil) + // cleanup resources for running job + err = cc.Cleanup(qj) + if err != nil { + return err + } + //empty finalizers and delete the queuejob again + accessor, err := meta.Accessor(qj) + if err != nil { + return err + } + accessor.SetFinalizers(nil) - // we delete the job from the queue if it is there - cc.qjqueue.Delete(qj) + // we delete the job from the queue if it is there + cc.qjqueue.Delete(qj) - return nil - } - //Job is Complete only update pods if needed. - if qj.Status.State == arbv1.AppWrapperStateCompleted || qj.Status.State == arbv1.AppWrapperStateRunningHoldCompletion { - if podPhaseChanges { - // Only update etcd if AW status has changed. This can happen for periodic - // updates of pod phase counts done in caller of this function. - if err := cc.updateEtcd(qj, "manageQueueJob - podPhaseChanges"); err != nil { - klog.Errorf("[manageQueueJob] Error updating etc for AW job=%s Status=%+v err=%+v", qj.Name, qj.Status, err) - } + return nil + } + //Job is Complete only update pods if needed. + if qj.Status.State == arbv1.AppWrapperStateCompleted || qj.Status.State == arbv1.AppWrapperStateRunningHoldCompletion { + if podPhaseChanges { + // Only update etcd if AW status has changed. This can happen for periodic + // updates of pod phase counts done in caller of this function. + if err := cc.updateEtcd(qj, "manageQueueJob - podPhaseChanges"); err != nil { + klog.Errorf("[manageQueueJob] Error updating etc for AW job=%s Status=%+v err=%+v", qj.Name, qj.Status, err) } - return nil } + return nil + } + + if !cc.isDispatcher { // Agent Mode // First execution of qj to set Status.State = Enqueued if !qj.Status.CanRun && (qj.Status.State != arbv1.AppWrapperStateEnqueued && qj.Status.State != arbv1.AppWrapperStateDeleted) { @@ -2172,6 +2182,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } cc.updateEtcd(updateQj, "[syncQueueJob] setRunningHoldCompletion") } + //Set appwrapper status to complete if derivedAwStatus == arbv1.AppWrapperStateCompleted { klog.V(1).Infof("[>>>>>>>>>>] Setting ItemCompletionStatus 2") @@ -2195,6 +2206,10 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool cc.quotaManager.Release(updateQj) } + if derivedAwStatus == arbv1.AppWrapperStateCompleted { + qj.Status.ItemCompletionStatus = genericItemsCompletionStatus + qj.Status.State = derivedAwStatus + } // Bugfix to eliminate performance problem of overloading the event queue. } else if podPhaseChanges { // Continued bug fix // Only update etcd if AW status has changed. This can happen for periodic @@ -2202,28 +2217,12 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if err := cc.updateEtcd(qj, "manageQueueJob - podPhaseChanges"); err != nil { klog.Errorf("[manageQueueJob] Error updating etc for AW job=%s Status=%+v err=%+v", qj.Name, qj.Status, err) } + // Finish adding qj to Etcd for dispatch + // return nil } - // Finish adding qj to Etcd for dispatch - } else { // Dispatcher Mode - - if qj.DeletionTimestamp != nil { - // cleanup resources for running job - err = cc.Cleanup(qj) - if err != nil { - return err - } - //empty finalizers and delete the queuejob again - accessor, err := meta.Accessor(qj) - if err != nil { - return err - } - accessor.SetFinalizers(nil) - - cc.qjqueue.Delete(qj) - - return nil - } + + } else { // Dispatcher Mode if !qj.Status.CanRun && (qj.Status.State != arbv1.AppWrapperStateEnqueued && qj.Status.State != arbv1.AppWrapperStateDeleted) { // if there are running resources for this job then delete them because the job was put in @@ -2251,21 +2250,15 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) } } - - //_, err = cc.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(qj) - //if err != nil { - // return err - //} return nil } - // if !qj.Status.CanRun && qj.Status.State == arbv1.QueueJobStateEnqueued { if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued { cc.qjqueue.AddIfNotPresent(qj) return nil } - if qj.Status.CanRun && !qj.Status.IsDispatched { + if qj.Status.CanRun && !qj.Status.IsDispatched { //&& qj.Status.State != arbv1.AppWrapperStateCompleted { if klog.V(10).Enabled() { current_time := time.Now() klog.V(10).Infof("[worker-manageQJ] XQJ %s has Overhead Before Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) From bc0966d13b1ec2fa88298a48820bfa3f26ebe72c Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-72-29.wecm.ibm.com> Date: Sun, 17 Sep 2023 16:26:56 -0400 Subject: [PATCH 15/18] add new status-check test suite --- .../queuejob/queuejob_controller_ex.go | 6 +- test/e2e-kuttl/status-check/00-assert.yaml | 12 ++++ test/e2e-kuttl/status-check/01-assert.yaml | 4 ++ test/e2e-kuttl/status-check/01-install.yaml | 4 ++ test/e2e-kuttl/status-check/02-assert.yaml | 16 +++++ test/e2e-kuttl/status-check/02-install.yaml | 63 +++++++++++++++++++ 6 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 test/e2e-kuttl/status-check/00-assert.yaml create mode 100644 test/e2e-kuttl/status-check/01-assert.yaml create mode 100644 test/e2e-kuttl/status-check/01-install.yaml create mode 100644 test/e2e-kuttl/status-check/02-assert.yaml create mode 100644 test/e2e-kuttl/status-check/02-install.yaml diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 174fb9516..fb6193665 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -2165,7 +2165,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool //are completed if derivedAwStatus == arbv1.AppWrapperStateRunningHoldCompletion { qj.Status.State = derivedAwStatus - klog.V(1).Infof("[>>>>>>>>>>] Setting ItemCompletionStatus 1") + klog.V(1).Infof("[manageQueueJob] Setting ItemCompletionStatus 1") qj.Status.ItemCompletionStatus = genericItemsCompletionStatus var updateQj *arbv1.AppWrapper index := getIndexOfMatchedCondition(qj, arbv1.AppWrapperCondRunningHoldCompletion, "SomeItemsCompleted") @@ -2185,7 +2185,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool //Set appwrapper status to complete if derivedAwStatus == arbv1.AppWrapperStateCompleted { - klog.V(1).Infof("[>>>>>>>>>>] Setting ItemCompletionStatus 2") + klog.V(1).Infof("[manageQueueJob] Setting ItemCompletionStatus 2") qj.Status.ItemCompletionStatus = genericItemsCompletionStatus qj.Status.State = derivedAwStatus qj.Status.CanRun = false @@ -2218,7 +2218,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool klog.Errorf("[manageQueueJob] Error updating etc for AW job=%s Status=%+v err=%+v", qj.Name, qj.Status, err) } // Finish adding qj to Etcd for dispatch - // return nil + return nil } diff --git a/test/e2e-kuttl/status-check/00-assert.yaml b/test/e2e-kuttl/status-check/00-assert.yaml new file mode 100644 index 000000000..f0edec999 --- /dev/null +++ b/test/e2e-kuttl/status-check/00-assert.yaml @@ -0,0 +1,12 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: appwrappers.mcad.ibm.com +status: + acceptedNames: + kind: AppWrapper + listKind: AppWrapperList + plural: appwrappers + singular: appwrapper + storedVersions: + - v1beta1 \ No newline at end of file diff --git a/test/e2e-kuttl/status-check/01-assert.yaml b/test/e2e-kuttl/status-check/01-assert.yaml new file mode 100644 index 000000000..5e0105d10 --- /dev/null +++ b/test/e2e-kuttl/status-check/01-assert.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test \ No newline at end of file diff --git a/test/e2e-kuttl/status-check/01-install.yaml b/test/e2e-kuttl/status-check/01-install.yaml new file mode 100644 index 000000000..7c265c019 --- /dev/null +++ b/test/e2e-kuttl/status-check/01-install.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test diff --git a/test/e2e-kuttl/status-check/02-assert.yaml b/test/e2e-kuttl/status-check/02-assert.yaml new file mode 100644 index 000000000..a31d7743a --- /dev/null +++ b/test/e2e-kuttl/status-check/02-assert.yaml @@ -0,0 +1,16 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: my-job-1 + namespace: test +status: + state: Completed + itemCompletionStatus: + genericItems: + - condition: Complete + gvk: + Group: batch + Kind: Job + Version: v1 + name: my-job-2 + namespace: test \ No newline at end of file diff --git a/test/e2e-kuttl/status-check/02-install.yaml b/test/e2e-kuttl/status-check/02-install.yaml new file mode 100644 index 000000000..538aaf4cb --- /dev/null +++ b/test/e2e-kuttl/status-check/02-install.yaml @@ -0,0 +1,63 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: my-job-1 + namespace: test + labels: + quota_context: bronze + quota_service: service-root +spec: + schedulingSpec: + minAvailable: 1 + resources: + GenericItems: + - replicas: 1 + completionstatus: Complete + custompodresources: + - replicas: 1 + requests: + cpu: 900m + nvidia.com/gpu: 0 + memory: 300Mi + limits: + cpu: 900m + nvidia.com/gpu: 0 + memory: 300Mi + generictemplate: + apiVersion: batch/v1 + kind: Job + metadata: + name: my-job-2 + namespace: test + labels: + appwrapper.mcad.ibm.com: my-job-2 + spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: my-job-2 + namespace: test + labels: + appwrapper.mcad.ibm.com: my-job-2 + spec: + terminationGracePeriodSeconds: 1 + restartPolicy: Never + containers: + - name: ubuntu + image: ubuntu:latest + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + sleep 30 + resources: + requests: + cpu: 900m + nvidia.com/gpu: 0 + memory: 300Mi + limits: + cpu: 900m + nvidia.com/gpu: 0 + memory: 300Mi \ No newline at end of file From f3aea37cdf70a162037c337d2d9172bbe8a6b60e Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-27-136.wecm.ibm.com> Date: Mon, 18 Sep 2023 12:29:40 -0400 Subject: [PATCH 16/18] moved cluster assignment to spec --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 2 -- .../bases/mcad.ibm.com_schedulingspecs.yaml | 27 +++++++++++++++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 2 -- .../crds/mcad.ibm.com_schedulingspecs.yaml | 27 +++++++++++++++++++ pkg/apis/controller/v1beta1/appwrapper.go | 3 --- pkg/apis/controller/v1beta1/schedulingspec.go | 15 +++++++++++ .../queuejob/queuejob_controller_ex.go | 16 +++++++++-- 7 files changed, 83 insertions(+), 9 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 62bb5c612..ce400ec09 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -860,8 +860,6 @@ spec: type: boolean message: type: string - targetClusterName: - type: string pending: description: The number of pending pods. format: int32 diff --git a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml index d6125adfd..a0ae2b0ce 100644 --- a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml +++ b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml @@ -138,6 +138,33 @@ spec: are ANDed. type: object type: object + policyResult: + properties: + policySource: + items: + properties: + lastUpdateMicroTime: + description: The latest time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the cluster decision. + type: string + name: + description: ID/Name of the policy decision maker. Most + often this will be MCAD but design can support alternatives + type: string + type: object + type: array + targetCluster: + properties: + name: + type: string + required: + - name + type: object + type: object type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 62bb5c612..ce400ec09 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -860,8 +860,6 @@ spec: type: boolean message: type: string - targetClusterName: - type: string pending: description: The number of pending pods. format: int32 diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml index d6125adfd..a0ae2b0ce 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml @@ -138,6 +138,33 @@ spec: are ANDed. type: object type: object + policyResult: + properties: + policySource: + items: + properties: + lastUpdateMicroTime: + description: The latest time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the cluster decision. + type: string + name: + description: ID/Name of the policy decision maker. Most + often this will be MCAD but design can support alternatives + type: string + type: object + type: array + targetCluster: + properties: + name: + type: string + required: + - name + type: object + type: object type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 43915bd6f..e2af9cbab 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -265,9 +265,6 @@ type AppWrapperStatus struct { // Represents the latest available observations of pods under appwrapper PendingPodConditions []PendingPodSpec `json:"pendingpodconditions"` - // Represents multi-cluster observatioins - TargetClusterName string `json:"targetClusterName"` - ItemCompletionStatus GenericItemCompletionStatus `json:"itemCompletionStatus,omitempty"` } diff --git a/pkg/apis/controller/v1beta1/schedulingspec.go b/pkg/apis/controller/v1beta1/schedulingspec.go index 457cd1540..1caa7d582 100644 --- a/pkg/apis/controller/v1beta1/schedulingspec.go +++ b/pkg/apis/controller/v1beta1/schedulingspec.go @@ -80,6 +80,21 @@ type ClusterReference struct { type ClusterSchedulingSpec struct { Clusters []ClusterReference `json:"clusters,omitempty"` ClusterSelector *metav1.LabelSelector `json:"clusterSelector,omitempty"` + PolicyResult ClusterDecision `json:"policyResult,omitempty"` +} + +type ClusterDecision struct { + TargetCluster ClusterReference `json:"targetCluster,omitempty"` + PolicySource []PolicySourceReference `json:"policySource,omitempty"` +} + +type PolicySourceReference struct { + // ID/Name of the policy decision maker. Most often this will be MCAD but design can support alternatives + Name string `json:"name,omitempty"` + // The latest time this condition was updated. + LastUpdateMicroTime metav1.MicroTime `json:"lastUpdateMicroTime,omitempty"` + // A human readable message indicating details about the cluster decision. + Message string `json:"message,omitempty"` } type ScheduleTimeSpec struct { diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index fb6193665..fe2877b10 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -2269,8 +2269,19 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) if cc.serverOption.ExternalDispatch { values := strings.Split(agentId, "/") - klog.V(10).Infof("[Dispatcher Controller] Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1]) - qj.Status.TargetClusterName = values[len(values)-1] //agentId + klog.V(10).Infof("[Dispatcher Controller] ... Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1]) + qj.Spec.SchedSpec.ClusterScheduling.PolicyResult = arbv1.ClusterDecision { + TargetCluster: arbv1.ClusterReference{ + Name: values[len(values)-1], + }, + PolicySource : []arbv1.PolicySourceReference { + { + Name: "MCAD", + LastUpdateMicroTime: metav1.MicroTime{Time: time.Now()}, + Message: "Assigned by MCAD", + }, + }, + } } else { cc.agentMap[agentId].CreateJob(qj) } @@ -2289,6 +2300,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool qj.Namespace, qj.Name, err) return err } + klog.V(10).Infof("[Dispatcher Controller] Update Successfull - AppWrapper %s .", qj.Name) } } From c192f7ef5620790a335cf40742442f9f7b3d371b Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-27-136.wecm.ibm.com> Date: Tue, 19 Sep 2023 08:45:52 -0400 Subject: [PATCH 17/18] added clusterDecision to appwrappers crd --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 27 +++++++++++++++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 27 +++++++++++++++++++ .../queuejob/queuejob_controller_ex.go | 8 +++--- 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index ce400ec09..3f09fc279 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -363,6 +363,33 @@ spec: are ANDed. type: object type: object + policyResult: + properties: + policySource: + items: + properties: + lastUpdateMicroTime: + description: The latest time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the cluster decision. + type: string + name: + description: ID/Name of the policy decision maker. Most + often this will be MCAD but design can support alternatives + type: string + type: object + type: array + targetCluster: + properties: + name: + type: string + required: + - name + type: object + type: object type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index ce400ec09..3f09fc279 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -363,6 +363,33 @@ spec: are ANDed. type: object type: object + policyResult: + properties: + policySource: + items: + properties: + lastUpdateMicroTime: + description: The latest time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the cluster decision. + type: string + name: + description: ID/Name of the policy decision maker. Most + often this will be MCAD but design can support alternatives + type: string + type: object + type: array + targetCluster: + properties: + name: + type: string + required: + - name + type: object + type: object type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index fe2877b10..7f45b6bc9 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -2258,7 +2258,8 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool return nil } - if qj.Status.CanRun && !qj.Status.IsDispatched { //&& qj.Status.State != arbv1.AppWrapperStateCompleted { + if qj.Status.CanRun && !qj.Status.IsDispatched { + if klog.V(10).Enabled() { current_time := time.Now() klog.V(10).Infof("[worker-manageQJ] XQJ %s has Overhead Before Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) @@ -2294,15 +2295,14 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool klog.V(10).Infof("[Dispatcher Controller] XQJ %s has Overhead After Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time)) klog.V(10).Infof("[TTime] %s, %s: WorkerAfterDispatch", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time)) } - + if _, err := cc.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(qj); err != nil { klog.Errorf("Failed to update status of AppWrapper %v/%v: %v", qj.Namespace, qj.Name, err) return err } - klog.V(10).Infof("[Dispatcher Controller] Update Successfull - AppWrapper %s .", qj.Name) + klog.V(10).Infof("[Dispatcher Controller] Update Successfull - AppWrapper %s .", qj.Name) } - } return err } From 4d7ab51a4b7774ae18c7884c94b67e9841cf1c9d Mon Sep 17 00:00:00 2001 From: Jaroslaw Cwiklik <cwiklik@wecm-9-67-132-200.wecm.ibm.com> Date: Sun, 24 Sep 2023 10:13:47 -0400 Subject: [PATCH 18/18] added TODO --- pkg/controller/queuejob/queuejob_controller_ex.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 7f45b6bc9..93fb32dd2 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -2270,9 +2270,10 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId) if cc.serverOption.ExternalDispatch { values := strings.Split(agentId, "/") - klog.V(10).Infof("[Dispatcher Controller] ... Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1]) + klog.V(10).Infof("[Dispatcher Controller] Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1]) qj.Spec.SchedSpec.ClusterScheduling.PolicyResult = arbv1.ClusterDecision { TargetCluster: arbv1.ClusterReference{ + // TODO check if values len > 0. Name: values[len(values)-1], }, PolicySource : []arbv1.PolicySourceReference {