update affinity assistant creation implementation #6596

Merged (1 commit) on May 24, 2023
4 changes: 4 additions & 0 deletions config/200-clusterrole.yaml
@@ -25,6 +25,10 @@ rules:
    # Controller needs to watch Pods created by TaskRuns to see them progress.
    resources: ["pods"]
    verbs: ["list", "watch"]
  - apiGroups: [""]
    # Controller needs to get the list of cordoned nodes over the course of a single run
    resources: ["nodes"]
    verbs: ["list"]
  # Controller needs cluster access to all of the CRDs that it is responsible for
  # managing.
  - apiGroups: ["tekton.dev"]
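A quick way to sanity-check this new permission after applying the updated `ClusterRole` is `kubectl auth can-i`; the namespace and service account below are the usual install defaults and may differ in your cluster:

```shell
# Check whether the Tekton controller's service account may list nodes.
# Assumes the default namespace (tekton-pipelines) and service account
# (tekton-pipelines-controller); adjust for your installation.
kubectl auth can-i list nodes \
  --as=system:serviceaccount:tekton-pipelines:tekton-pipelines-controller
```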
6 changes: 6 additions & 0 deletions docs/additional-configs.md
@@ -30,6 +30,7 @@ installation.
- [Verify signatures using `cosign`](#verify-signatures-using-cosign)
- [Verify the transparency logs using `rekor-cli`](#verify-the-transparency-logs-using-rekor-cli)
- [Verify Tekton Resources](#verify-tekton-resources)
- [PipelineRuns with Affinity Assistant](#pipelineruns-with-affinity-assistant)
- [Next steps](#next-steps)


@@ -554,6 +555,11 @@ Trusted Resources is a feature to verify Tekton Tasks and Pipelines. The current
version of the feature supports `v1beta1` `Task` and `Pipeline`. For more details
please take a look at [Trusted Resources](./trusted-resources.md).

## PipelineRuns with Affinity Assistant

Cluster operators can review the [guidelines](developers/affinity-assistant.md) for `cordon`ing a node in a cluster
where the Tekton controller is running and the affinity assistant is enabled.

## Next steps

To get started with Tekton check the [Introductory tutorials][quickstarts],
1 change: 1 addition & 0 deletions docs/developers/README.md
@@ -21,3 +21,4 @@ channel for training and tutorials on Tekton!
- How specific features are implemented:
  - [PipelineResources (deprecated)](./pipelineresources.md)
  - [Results](./results-lifecycle.md)
  - [Affinity Assistant](./affinity-assistant.md)
125 changes: 125 additions & 0 deletions docs/developers/affinity-assistant.md
@@ -0,0 +1,125 @@
# Affinity Assistant


[Specifying `Workspaces` in a `Pipeline`](../workspaces.md#specifying-workspace-order-in-a-pipeline-and-affinity-assistants) explains
how an affinity assistant is created when a `persistentVolumeClaim` is used as a volume source for a `workspace` in a `pipelineRun`.
Please refer to the same section for more details on the affinity assistant.

This section gives an overview of how the affinity assistant stays resilient to cluster maintenance without losing
the running `pipelineRun`. (See https://github.com/tektoncd/pipeline/issues/6586 for more details.)

When a list of `tasks` shares a single workspace, the affinity assistant pod is created on a `node` along with all the
`taskRun` pods. It is very common for a `pipeline` author to design long-running tasks that share a single workspace.
With such long-running tasks, the `node` on which these pods are scheduled can be cordoned while the `pipelineRun` is
still running. The Tekton controller migrates the affinity assistant pod to an available `node` in the cluster, and
the rest of the `taskRun` pods sharing the same workspace follow it.

Let's understand this with a sample `pipelineRun`:

```yaml
apiVersion: tekton.dev/v1
kind: PipelineRun
metadata:
  generateName: pipeline-run-
spec:
  workspaces:
    - name: source
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 10Mi
  pipelineSpec:
    workspaces:
      - name: source
    tasks:
      - name: first-task
        taskSpec:
          workspaces:
            - name: source
          steps:
            - image: alpine
              script: |
                echo $(workspaces.source.path)
                sleep 60
        workspaces:
          - name: source
      - name: last-task
        taskSpec:
          workspaces:
            - name: source
          steps:
            - image: alpine
              script: |
                echo $(workspaces.source.path)
                sleep 60
        runAfter: ["first-task"]
        workspaces:
          - name: source
```

This `pipelineRun` has two long-running tasks, `first-task` and `last-task`. Both tasks share a single volume with the
access mode set to `ReadWriteOnce`, which means the volume can be mounted on a single `node` at any given point in
time.
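A minimal way to submit this manifest, assuming it is saved locally as `pipeline-run.yaml` (a hypothetical filename), is `kubectl create`; note that `kubectl apply` does not work with `generateName`:

```shell
# Create the pipelineRun; generateName produces a unique name such as pipeline-run-r2c7k.
kubectl create -f pipeline-run.yaml
```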

After creating the `pipelineRun`, determine which `node` the affinity assistant pod is scheduled on:

```shell
kubectl get pods -l app.kubernetes.io/component=affinity-assistant -o wide -w
NAME                              READY   STATUS              RESTARTS   AGE   IP             NODE                     NOMINATED NODE   READINESS GATES
affinity-assistant-c7b485007a-0   0/1     Pending             0          0s    <none>         <none>                   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     Pending             0          0s    <none>         kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     ContainerCreating   0          0s    <none>         kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     ContainerCreating   0          1s    <none>         kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   1/1     Running             0          5s    10.244.1.144   kind-multinode-worker1   <none>           <none>
```

Now, `cordon` that node to mark it unschedulable for any new pods:

```shell
kubectl cordon kind-multinode-worker1
node/kind-multinode-worker1 cordoned
```

The node is cordoned:

```shell
kubectl get node
NAME                           STATUS                     ROLES           AGE   VERSION
kind-multinode-control-plane   Ready                      control-plane   13d   v1.26.3
kind-multinode-worker1         Ready,SchedulingDisabled   <none>          13d   v1.26.3
kind-multinode-worker2         Ready                      <none>          13d   v1.26.3
```

Now, watch the affinity assistant pod being transferred to the other available node, `kind-multinode-worker2`:

```shell
kubectl get pods -l app.kubernetes.io/component=affinity-assistant -o wide -w
NAME                              READY   STATUS              RESTARTS   AGE   IP             NODE                     NOMINATED NODE   READINESS GATES
affinity-assistant-c7b485007a-0   1/1     Running             0          49s   10.244.1.144   kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   1/1     Terminating         0          70s   10.244.1.144   kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   1/1     Terminating         0          70s   10.244.1.144   kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     Terminating         0          70s   10.244.1.144   kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     Terminating         0          70s   10.244.1.144   kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     Terminating         0          70s   10.244.1.144   kind-multinode-worker1   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     Pending             0          0s    <none>         <none>                   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     Pending             0          1s    <none>         kind-multinode-worker2   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     ContainerCreating   0          1s    <none>         kind-multinode-worker2   <none>           <none>
affinity-assistant-c7b485007a-0   0/1     ContainerCreating   0          2s    <none>         kind-multinode-worker2   <none>           <none>
affinity-assistant-c7b485007a-0   1/1     Running             0          4s    10.244.2.144   kind-multinode-worker2   <none>           <none>
```
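Note that cordoning only disables new scheduling; any `taskRun` pod that was already running on `kind-multinode-worker1` keeps running there until it completes, while subsequent `taskRun` pods follow the affinity assistant to the new node. A rough way to watch this, assuming the standard `tekton.dev/pipelineRun` label on `taskRun` pods (replace the name with your generated `pipelineRun` name):

```shell
# Show where the taskRun pods of this pipelineRun are scheduled.
kubectl get pods -l tekton.dev/pipelineRun=pipeline-run-r2c7k -o wide -w
```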

> **Reviewer (Member):** I think there's something missing here. For the pipeline run to succeed, the operator
> performing the cluster upgrade needs to wait until there is no Tekton pod running on the node before draining it.
> Since the affinity assistant moved to a new node, new tasks will start on different nodes, but running tasks must
> complete before the update may continue, or the pipeline run will fail.
>
> **Author (Member):** Draining is disruptive whereas cordoning is not. We are emphasizing in this doc that the
> affinity assistant mechanism is capable of handling the node being cordoned (scheduling is disabled).

And the `pipelineRun` runs to completion:
```shell
kubectl get pr
NAME                 SUCCEEDED   REASON      STARTTIME   COMPLETIONTIME
pipeline-run-r2c7k   True        Succeeded   4m22s       2m1s

kubectl get tr
NAME                            SUCCEEDED   REASON      STARTTIME   COMPLETIONTIME
pipeline-run-r2c7k-first-task   True        Succeeded   5m16s       4m7s
pipeline-run-r2c7k-last-task    True        Succeeded   4m6s        2m56s
```
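Once the maintenance on the node is finished, it can be made schedulable again; this is ordinary cluster housekeeping and is not required for the `pipelineRun` above to succeed:

```shell
kubectl uncordon kind-multinode-worker1
```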
6 changes: 6 additions & 0 deletions docs/workspaces.md
@@ -381,6 +381,12 @@ significantly. We do not recommend using them in clusters larger than several hu
node in the cluster must have an appropriate label matching `topologyKey`. If some or all nodes
are missing the specified `topologyKey` label, it can lead to unintended behavior.

**Note:** At any point during the execution of a `pipelineRun`, if the node hosting the placeholder Affinity Assistant pod and
the `taskRun` pods sharing a `workspace` is `cordoned` or otherwise disabled for scheduling anything new (`tainted`), the
`pipelineRun` controller deletes the placeholder pod. The `taskRun` pods on the `cordoned` node continue running
until completion. The deletion of the placeholder pod triggers the creation of a new placeholder pod on an available node
so that the rest of the `pipelineRun` can continue without disruption until it finishes.
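A rough way to observe this behavior while a `pipelineRun` is executing, using the `app.kubernetes.io/component=affinity-assistant` label that the placeholder pods carry:

```shell
# Watch the placeholder pod get deleted on the cordoned node and recreated on another node.
kubectl get pods -l app.kubernetes.io/component=affinity-assistant -o wide -w
```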

#### Specifying `Workspaces` in `PipelineRuns`

For a `PipelineRun` to execute a `Pipeline` that includes one or more `Workspaces`, it needs to
49 changes: 43 additions & 6 deletions pkg/reconciler/pipelinerun/affinity_assistant.go
@@ -33,32 +33,35 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
errorutils "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"knative.dev/pkg/kmeta"
"knative.dev/pkg/logging"
)

const (
	// ReasonCouldntCreateAffinityAssistantStatefulSet indicates that a PipelineRun uses workspaces with PersistentVolumeClaim
	// ReasonCouldntCreateOrUpdateAffinityAssistantStatefulSet indicates that a PipelineRun uses workspaces with PersistentVolumeClaim
	// as a volume source and expect an Assistant StatefulSet, but couldn't create a StatefulSet.
	ReasonCouldntCreateAffinityAssistantStatefulSet = "CouldntCreateAffinityAssistantStatefulSet"
	ReasonCouldntCreateOrUpdateAffinityAssistantStatefulSet = "CouldntCreateOrUpdateAffinityAssistantstatefulSet"

	featureFlagDisableAffinityAssistantKey = "disable-affinity-assistant"
)

// createAffinityAssistants creates an Affinity Assistant StatefulSet for every workspace in the PipelineRun that
// createOrUpdateAffinityAssistants creates an Affinity Assistant StatefulSet for every workspace in the PipelineRun that
// use a PersistentVolumeClaim volume. This is done to achieve Node Affinity for all TaskRuns that
// share the workspace volume and make it possible for the tasks to execute parallel while sharing volume.
func (c *Reconciler) createAffinityAssistants(ctx context.Context, wb []v1beta1.WorkspaceBinding, pr *v1beta1.PipelineRun, namespace string) error {
func (c *Reconciler) createOrUpdateAffinityAssistants(ctx context.Context, wb []v1beta1.WorkspaceBinding, pr *v1beta1.PipelineRun, namespace string) error {
	logger := logging.FromContext(ctx)
	cfg := config.FromContextOrDefaults(ctx)

	var errs []error
	var unschedulableNodes sets.Set[string] = nil
	for _, w := range wb {
		if w.PersistentVolumeClaim != nil || w.VolumeClaimTemplate != nil {
			affinityAssistantName := getAffinityAssistantName(w.Name, pr.Name)
			_, err := c.KubeClientSet.AppsV1().StatefulSets(namespace).Get(ctx, affinityAssistantName, metav1.GetOptions{})
			a, err := c.KubeClientSet.AppsV1().StatefulSets(namespace).Get(ctx, affinityAssistantName, metav1.GetOptions{})
			claimName := getClaimName(w, *kmeta.NewControllerRef(pr))
			switch {
			// check whether the affinity assistant (StatefulSet) exists or not, create one if it does not exist
			case apierrors.IsNotFound(err):
				affinityAssistantStatefulSet := affinityAssistantStatefulSet(affinityAssistantName, pr, claimName, c.Images.NopImage, cfg.Defaults.DefaultAAPodTemplate)
				_, err := c.KubeClientSet.AppsV1().StatefulSets(namespace).Create(ctx, affinityAssistantStatefulSet, metav1.CreateOptions{})
@@ -68,6 +71,40 @@ func (c *Reconciler) createAffinityAssistants(ctx context.Context, wb []v1beta1.
				if err == nil {
					logger.Infof("Created StatefulSet %s in namespace %s", affinityAssistantName, namespace)
				}
			// check whether the affinity assistant (StatefulSet) exists and the affinity assistant pod is created
			// this check requires the StatefulSet to have the readyReplicas set to 1 to allow for any delay between the StatefulSet creation
			// and the necessary pod creation, the delay can be caused by any dependency on PVCs and PVs creation
			// this case addresses issues specified in https://github.com/tektoncd/pipeline/issues/6586
			case err == nil && a != nil && a.Status.ReadyReplicas == 1:
				if unschedulableNodes == nil {
					ns, err := c.KubeClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{
						FieldSelector: "spec.unschedulable=true",
					})
					if err != nil {
						errs = append(errs, fmt.Errorf("could not get the list of nodes, err: %w", err))
					}
					unschedulableNodes = sets.Set[string]{}
					// maintain the list of nodes which are unschedulable
					for _, n := range ns.Items {
						unschedulableNodes.Insert(n.Name)
					}
				}
				if unschedulableNodes.Len() > 0 {
					// get the pod created for a given StatefulSet, pod is assigned ordinal of 0 with the replicas set to 1
					p, err := c.KubeClientSet.CoreV1().Pods(pr.Namespace).Get(ctx, a.Name+"-0", metav1.GetOptions{})
					// ignore instead of failing if the affinity assistant pod was not found
					if err != nil && !apierrors.IsNotFound(err) {
						errs = append(errs, fmt.Errorf("could not get the affinity assistant pod for StatefulSet %s: %w", a.Name, err))
					}
					// check the node which hosts the affinity assistant pod if it is unschedulable or cordoned
					if p != nil && unschedulableNodes.Has(p.Spec.NodeName) {
						// if the node is unschedulable, delete the affinity assistant pod such that a StatefulSet can recreate the same pod on a different node
						err = c.KubeClientSet.CoreV1().Pods(p.Namespace).Delete(ctx, p.Name, metav1.DeleteOptions{})
						if err != nil {
							errs = append(errs, fmt.Errorf("error deleting affinity assistant pod %s in ns %s: %w", p.Name, p.Namespace, err))
						}
					}
				}
			case err != nil:
				errs = append(errs, fmt.Errorf("failed to retrieve StatefulSet %s: %w", affinityAssistantName, err))
			}
@@ -107,7 +144,7 @@ func (c *Reconciler) cleanupAffinityAssistants(ctx context.Context, pr *v1beta1.
func getAffinityAssistantName(pipelineWorkspaceName string, pipelineRunName string) string {
	hashBytes := sha256.Sum256([]byte(pipelineWorkspaceName + pipelineRunName))
	hashString := fmt.Sprintf("%x", hashBytes)
	return fmt.Sprintf("%s-%s", "affinity-assistant", hashString[:10])
	return fmt.Sprintf("%s-%s", workspace.ComponentNameAffinityAssistant, hashString[:10])
}

func getStatefulSetLabels(pr *v1beta1.PipelineRun, affinityAssistantName string) map[string]string {