From 95785b813c64e6f5a6ddb01e18afdefac0f6b675 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Thu, 23 Dec 2021 12:01:56 +0100 Subject: [PATCH 1/7] include toleration in statefulset comparison --- e2e/tests/test_e2e.py | 1 - pkg/cluster/cluster.go | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 606abc95f..4ed52374c 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -1413,7 +1413,6 @@ def test_statefulset_annotation_propagation(self): self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing") @timeout_decorator.timeout(TEST_TIMEOUT_SEC) - @unittest.skip("Skipping this test until fixed") def test_taint_based_eviction(self): ''' Add taint "postgres=:NoExecute" to node with master. This must cause a failover. diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 967f9d530..9895d6ba4 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -406,6 +406,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa needsRollUpdate = true reasons = append(reasons, "new statefulset's pod affinity does not match the current one") } + if len(c.Statefulset.Spec.Template.Spec.Tolerations) != len(statefulSet.Spec.Template.Spec.Tolerations) { + needsReplace = true + needsRollUpdate = true + reasons = append(reasons, "new statefulset's pod tolerations does not match the current one") + } // Some generated fields like creationTimestamp make it not possible to use DeepCompare on Spec.Template.ObjectMeta if !reflect.DeepEqual(c.Statefulset.Spec.Template.Labels, statefulSet.Spec.Template.Labels) { @@ -427,13 +432,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa } if !reflect.DeepEqual(c.Statefulset.Spec.Template.Annotations, statefulSet.Spec.Template.Annotations) { - match = false needsReplace = true needsRollUpdate = true reasons = append(reasons, "new statefulset's pod template metadata annotations does not match the current one") } if !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.SecurityContext, statefulSet.Spec.Template.Spec.SecurityContext) { - match = false needsReplace = true needsRollUpdate = true reasons = append(reasons, "new statefulset's pod template security context in spec does not match the current one") @@ -469,7 +472,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa // we assume any change in priority happens by rolling out a new priority class // changing the priority value in an existing class is not supproted if c.Statefulset.Spec.Template.Spec.PriorityClassName != statefulSet.Spec.Template.Spec.PriorityClassName { - match = false needsReplace = true needsRollUpdate = true reasons = append(reasons, "new statefulset's pod priority class in spec does not match the current one") From ced0eae14a6a799e3c6c4b986af65f1d3face70e Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Thu, 23 Dec 2021 13:47:09 +0100 Subject: [PATCH 2/7] include toleration diff in stateful set sync make taint e2e test more robust change merge behavior of nodeSelectorTerms for node readiness label --- e2e/tests/k8s_api.py | 2 +- e2e/tests/test_e2e.py | 138 +++++++++++++++++++++++------------------- 2 files changed, 77 insertions(+), 63 deletions(-) diff --git a/e2e/tests/k8s_api.py b/e2e/tests/k8s_api.py index c3ad1c999..f58e16b50 100644 --- a/e2e/tests/k8s_api.py +++ b/e2e/tests/k8s_api.py @@ -53,7 +53,7 @@ def get_pg_nodes(self, pg_cluster_name, 
namespace='default'): return master_pod_node, replica_pod_nodes - def get_cluster_nodes(self, cluster_labels='cluster-name=acid-minimal-cluster', namespace='default'): + def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'): m = [] r = [] podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels) diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 4ed52374c..eb82dbb63 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -883,9 +883,9 @@ def test_node_affinity(self): self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") # get nodes of master and replica(s) - master_node, replica_nodes = k8s.get_pg_nodes(cluster_label) + master_nodes, replica_nodes = k8s.get_cluster_nodes() - self.assertNotEqual(master_node, []) + self.assertNotEqual(master_nodes, []) self.assertNotEqual(replica_nodes, []) # label node with environment=postgres @@ -898,8 +898,8 @@ def test_node_affinity(self): } try: - # patch current master node with the label - k8s.api.core_v1.patch_node(master_node, node_label_body) + # patch master node with the label + k8s.api.core_v1.patch_node(master_nodes[0], node_label_body) # add node affinity to cluster patch_node_affinity_config = { @@ -923,7 +923,6 @@ def test_node_affinity(self): } } } - k8s.api.custom_objects_api.patch_namespaced_custom_object( group="acid.zalan.do", version="v1", @@ -934,14 +933,17 @@ def test_node_affinity(self): self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement - k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label) + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + # master pod needs to be replaced as well to finish the rolling update + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label) for pod in podsList.items: if pod.metadata.labels.get('spilo-role') == 'replica': - self.assertEqual(master_node, pod.spec.node_name, - "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name)) + self.assertEqual(master_nodes[0], pod.spec.node_name, + "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_nodes[0], pod.spec.node_name)) # check that pod has correct node affinity key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key @@ -966,7 +968,7 @@ def test_node_affinity(self): self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") # node affinity change should cause another rolling update and relocation of replica - k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label) + k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) except timeout_decorator.TimeoutError: @@ -974,7 +976,6 @@ def test_node_affinity(self): raise @timeout_decorator.timeout(TEST_TIMEOUT_SEC) - 
@unittest.skip("Skipping this test until fixed") def test_node_readiness_label(self): ''' Remove node readiness label from master node. This must cause a failover. @@ -984,12 +985,19 @@ def test_node_readiness_label(self): readiness_label = 'lifecycle-status' readiness_value = 'ready' - try: - # get nodes of master and replica(s) (expected target of new master) - current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label) - num_replicas = len(current_replica_nodes) - failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes) + # verify we are in good state from potential previous tests + self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") + self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") + + # get nodes of master and replica(s) (expected target of new master) + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) + self.assertNotEqual(replica_nodes, []) + + num_replicas = len(replica_nodes) + failover_targets = self.get_failover_targets(master_nodes[0], replica_nodes) + try: # add node_readiness_label to potential failover nodes patch_readiness_label = { "metadata": { @@ -998,7 +1006,6 @@ def test_node_readiness_label(self): } } } - self.assertTrue(len(failover_targets) > 0, "No failover targets available") for failover_target in failover_targets: k8s.api.core_v1.patch_node(failover_target, patch_readiness_label) @@ -1009,19 +1016,31 @@ def test_node_readiness_label(self): } } k8s.update_config(patch_readiness_label_config, "setting readiness label") - new_master_node, new_replica_nodes = self.assert_failover( - current_master_node, num_replicas, failover_targets, cluster_label) + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - # patch also node where master ran before - k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label) + # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement + k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label) - # toggle pod anti affinity to move replica away from master node - self.eventuallyTrue(lambda: self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label), "Pods are redistributed") + # the replica however will not start due to a volume node affinity conflict + # only if the pvc and pod are deleted it can be scheduled + replica = k8s.get_cluster_replica_pod() + if replica.status.phase == 'Pending': + k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default') + k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default') + + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + + # patch also node where master ran before + k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label) except timeout_decorator.TimeoutError: print('Operator log: {}'.format(k8s.get_operator_log())) raise + # toggle pod anti affinity to move replica away from master node + self.eventuallyTrue(lambda: self.assert_distributed_pods(master_nodes), "Pods are redistributed") + + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_overwrite_pooler_deployment(self): k8s = self.k8s @@ -1426,7 +1445,6 @@ def test_taint_based_eviction(self): # get nodes of master and replica(s) (expected target of new master) master_nodes, 
replica_nodes = k8s.get_cluster_nodes() - self.assertNotEqual(master_nodes, []) self.assertNotEqual(replica_nodes, []) @@ -1441,10 +1459,7 @@ def test_taint_based_eviction(self): ] } } - k8s.api.core_v1.patch_node(master_nodes[0], body) - self.eventuallyTrue(lambda: k8s.get_cluster_nodes()[0], replica_nodes) - self.assertNotEqual(lambda: k8s.get_cluster_nodes()[0], master_nodes) # add toleration to pods patch_toleration_config = { @@ -1453,15 +1468,20 @@ def test_taint_based_eviction(self): } } - k8s.update_config(patch_toleration_config, step="allow tainted nodes") + try: + k8s.update_config(patch_toleration_config, step="allow tainted nodes") + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, + "Operator does not get in sync") - self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") - self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") + self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") + self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") + + except timeout_decorator.TimeoutError: + print('Operator log: {}'.format(k8s.get_operator_log())) + raise # toggle pod anti affinity to move replica away from master node - nm, new_replica_nodes = k8s.get_cluster_nodes() - new_master_node = nm[0] - self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label) + self.assert_distributed_pods(replica_nodes) @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_zz_cluster_deletion(self): @@ -1564,23 +1584,6 @@ def get_failover_targets(self, master_node, replica_nodes): return failover_targets - def assert_failover(self, current_master_node, num_replicas, failover_targets, cluster_label): - ''' - Check if master is failing over. The replica should move first to be the switchover target - ''' - k8s = self.k8s - k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label) - k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - - new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label) - self.assertNotEqual(current_master_node, new_master_node, - "Master on {} did not fail over to one of {}".format(current_master_node, failover_targets)) - self.assertEqual(num_replicas, len(new_replica_nodes), - "Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes))) - self.assert_master_is_unique() - - return new_master_node, new_replica_nodes - def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"): ''' Check that there is a single pod in the k8s cluster with the label "spilo-role=master" @@ -1592,14 +1595,13 @@ def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal num_of_master_pods = k8s.count_pods_with_label(labels, namespace) self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods)) - def assert_distributed_pods(self, master_node, replica_nodes, cluster_label): + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) + def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=acid-minimal-cluster'): ''' Other tests can lead to the situation that master and replica are on the same node. Toggle pod anti affinty to distribute pods accross nodes (replica in particular). 
''' k8s = self.k8s - cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster' - failover_targets = self.get_failover_targets(master_node, replica_nodes) # enable pod anti affintiy in config map which should trigger movement of replica patch_enable_antiaffinity = { @@ -1607,18 +1609,30 @@ def assert_distributed_pods(self, master_node, replica_nodes, cluster_label): "enable_pod_antiaffinity": "true" } } - k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity") - self.assert_failover(master_node, len(replica_nodes), failover_targets, cluster_label) - # now disable pod anti affintiy again which will cause yet another failover - patch_disable_antiaffinity = { - "data": { - "enable_pod_antiaffinity": "false" + try: + k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity") + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") + + k8s.wait_for_pod_failover(target_nodes, 'spilo-role=replica,' + cluster_labels) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels) + + # now disable pod anti affintiy again which will cause yet another failover + patch_disable_antiaffinity = { + "data": { + "enable_pod_antiaffinity": "false" + } } - } - k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity") - k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) - k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity") + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") + + k8s.wait_for_pod_start('spilo-role=master,' + cluster_labels) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels) + + except timeout_decorator.TimeoutError: + print('Operator log: {}'.format(k8s.get_operator_log())) + raise + return True def list_databases(self, pod_name): From 541a484264f4c25cca619682f4a7e98346aa1ee9 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Wed, 5 Jan 2022 16:53:23 +0100 Subject: [PATCH 3/7] change merging nodeAffinity expression --- docs/reference/operator_parameters.md | 9 ++++-- docs/user.md | 4 +++ e2e/tests/test_e2e.py | 46 ++++++++++++++++++++------- pkg/cluster/k8sres.go | 18 +++++++++-- 4 files changed, 61 insertions(+), 16 deletions(-) diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 00febcf89..de3b370f3 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -343,8 +343,13 @@ configuration they are grouped under the `kubernetes` key. considered `ready`. The operator uses values of those labels to detect the start of the Kubernetes cluster upgrade procedure and move master pods off the nodes to be decommissioned. When the set is not empty, the operator also - assigns the `Affinity` clause to the Postgres pods to be scheduled only on - `ready` nodes. The default is empty. + assigns the `nodeAffinity` clause to the Postgres pods to be scheduled only + on `ready` nodes. If a `nodeAffinity` is specified in the postgres cluster + manifest as well the `nodeSelectorTerms` will get merged. If the + `nodeAffinity` of the manifest contains only one `matchExpressions` slice + the node readiniess label expressions will be moved there (AND condition). + When multiple selector expressions are defined in the manifest an extra + `matchExpressions` section is appended (OR condition). The default is empty. 
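
To make the merge rule above concrete, here is a minimal standalone Go sketch of the described behavior: the configured label pairs become `In` match expressions; with a single `matchExpressions` slice in the manifest they are appended to it (AND), while multiple terms get an extra readiness term (OR). It uses the standard `k8s.io/api/core/v1` types; the helper names `readinessExpressions` and `mergeReadiness` are illustrative and not part of the operator.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// readinessExpressions turns node_readiness_label pairs into
// "key In value" requirements.
func readinessExpressions(labels map[string]string) []v1.NodeSelectorRequirement {
	exprs := make([]v1.NodeSelectorRequirement, 0, len(labels))
	for key, value := range labels {
		exprs = append(exprs, v1.NodeSelectorRequirement{
			Key:      key,
			Operator: v1.NodeSelectorOpIn,
			Values:   []string{value},
		})
	}
	return exprs
}

// mergeReadiness applies the rule described above: a single manifest term gets
// the readiness expressions appended (AND); several terms get an extra term (OR).
func mergeReadiness(manifestTerms []v1.NodeSelectorTerm, readiness []v1.NodeSelectorRequirement) []v1.NodeSelectorTerm {
	switch {
	case len(manifestTerms) == 0:
		return []v1.NodeSelectorTerm{{MatchExpressions: readiness}}
	case len(manifestTerms) == 1:
		manifestTerms[0].MatchExpressions = append(manifestTerms[0].MatchExpressions, readiness...)
		return manifestTerms
	default:
		return append(manifestTerms, v1.NodeSelectorTerm{MatchExpressions: readiness})
	}
}

func main() {
	manifest := []v1.NodeSelectorTerm{
		{MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "environment", Operator: v1.NodeSelectorOpIn, Values: []string{"pci"}},
		}},
	}
	merged := mergeReadiness(manifest, readinessExpressions(map[string]string{"lifecycle-status": "ready"}))
	// single manifest term -> both expressions land in the same term (AND)
	fmt.Printf("terms: %d, expressions in first term: %d\n", len(merged), len(merged[0].MatchExpressions))
}
```
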
* **toleration** a dictionary that should contain `key`, `operator`, `value` and diff --git a/docs/user.md b/docs/user.md index 572d832ab..052c4c618 100644 --- a/docs/user.md +++ b/docs/user.md @@ -703,6 +703,10 @@ spec: - pci ``` +If you need to define a `nodeAffinity` for all your Postgres clusters use the +`node_readiness_label` configuration option, which allows you to define a list +of key-value pairs. + ## In-place major version upgrade Starting with Spilo 13, operator supports in-place major version upgrade to a diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index eb82dbb63..1878d93b6 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -880,11 +880,9 @@ def test_node_affinity(self): # verify we are in good state from potential previous tests self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") - self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") # get nodes of master and replica(s) master_nodes, replica_nodes = k8s.get_cluster_nodes() - self.assertNotEqual(master_nodes, []) self.assertNotEqual(replica_nodes, []) @@ -975,6 +973,9 @@ def test_node_affinity(self): print('Operator log: {}'.format(k8s.get_operator_log())) raise + # toggle pod anti affinity to make sure replica and master run on separate nodes + self.assert_distributed_pods(replica_nodes) + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_node_readiness_label(self): ''' @@ -987,7 +988,6 @@ def test_node_readiness_label(self): # verify we are in good state from potential previous tests self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") - self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") # get nodes of master and replica(s) (expected target of new master) master_nodes, replica_nodes = k8s.get_cluster_nodes() @@ -1009,7 +1009,7 @@ def test_node_readiness_label(self): for failover_target in failover_targets: k8s.api.core_v1.patch_node(failover_target, patch_readiness_label) - # define node_readiness_label in config map which should trigger a failover of the master + # define node_readiness_label in config map which should trigger a rolling update patch_readiness_label_config = { "data": { "node_readiness_label": readiness_label + ':' + readiness_value, @@ -1018,17 +1018,19 @@ def test_node_readiness_label(self): k8s.update_config(patch_readiness_label_config, "setting readiness label") self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement + # first replica will be replaced and get the new affinity + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + + # next switchover of the master k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label) - # the replica however will not start due to a volume node affinity conflict + # the old master is replaced. 
However it might not start due to a volume node affinity conflict # only if the pvc and pod are deleted it can be scheduled replica = k8s.get_cluster_replica_pod() if replica.status.phase == 'Pending': k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default') k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default') - - k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) # patch also node where master ran before k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label) @@ -1038,7 +1040,7 @@ def test_node_readiness_label(self): raise # toggle pod anti affinity to move replica away from master node - self.eventuallyTrue(lambda: self.assert_distributed_pods(master_nodes), "Pods are redistributed") + self.assert_distributed_pods(master_nodes) @timeout_decorator.timeout(TEST_TIMEOUT_SEC) @@ -1481,7 +1483,7 @@ def test_taint_based_eviction(self): raise # toggle pod anti affinity to move replica away from master node - self.assert_distributed_pods(replica_nodes) + self.assert_distributed_pods(master_nodes) @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_zz_cluster_deletion(self): @@ -1602,6 +1604,16 @@ def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=aci Toggle pod anti affinty to distribute pods accross nodes (replica in particular). ''' k8s = self.k8s + cluster_labels = 'application=spilo,cluster-name=acid-minimal-cluster' + + # get nodes of master and replica(s) + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) + self.assertNotEqual(replica_nodes, []) + + # if nodes are different we can quit here + if master_nodes[0] not in replica_nodes: + return True # enable pod anti affintiy in config map which should trigger movement of replica patch_enable_antiaffinity = { @@ -1614,8 +1626,8 @@ def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=aci k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity") self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - k8s.wait_for_pod_failover(target_nodes, 'spilo-role=replica,' + cluster_labels) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels) + k8s.wait_for_running_pods(cluster_labels, 2) # now disable pod anti affintiy again which will cause yet another failover patch_disable_antiaffinity = { @@ -1626,8 +1638,18 @@ def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=aci k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity") self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - k8s.wait_for_pod_start('spilo-role=master,' + cluster_labels) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels) + k8s.wait_for_running_pods(cluster_labels, 2) + + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) + self.assertNotEqual(replica_nodes, []) + + # if nodes are different we can quit here + for target_node in target_nodes: + if (target_node not in master_nodes or target_node not in replica_nodes) and master_nodes[0] in replica_nodes: + print('Pods run on the same node') + return False except timeout_decorator.TimeoutError: print('Operator log: {}'.format(k8s.get_operator_log())) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index e7d8ea376..ed6af9229 100644 --- a/pkg/cluster/k8sres.go +++ 
b/pkg/cluster/k8sres.go @@ -352,8 +352,22 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff }, } } else { - nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ - NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm), + // if there are multiple node selector terms specified, append the node readiness label expressions (OR condition) + if len(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) > 1 { + manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms + manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm) + nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ + NodeSelectorTerms: manifestTerms, + } + // if there's just one term defined merge it with the readiness label term (AND condition) + } else { + manifestExpressions := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions + manifestExpressions = append(manifestExpressions, matchExpressions...) + nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + v1.NodeSelectorTerm{MatchExpressions: manifestExpressions}, + }, + } } } } From ea531ff1405d52109c97c1e41c06c305c0b8f54a Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Thu, 6 Jan 2022 12:12:58 +0100 Subject: [PATCH 4/7] extend docs and reflect review feedback --- docs/administrator.md | 72 +++++++++++++++++++++++++++ docs/reference/operator_parameters.md | 15 ++---- docs/user.md | 7 +-- pkg/cluster/cluster.go | 1 - pkg/cluster/k8sres.go | 4 +- 5 files changed, 83 insertions(+), 16 deletions(-) diff --git a/docs/administrator.md b/docs/administrator.md index 551ee5523..c15b324c7 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -339,6 +339,78 @@ master pods from being evicted by the K8s runtime. To prevent eviction completely, specify the toleration by leaving out the `tolerationSeconds` value (similar to how Kubernetes' own DaemonSets are configured) +## Node readiness labels + +The operator can watch on certain node labels to detect e.g. the start of a +Kubernetes cluster upgrade procedure and move master pods off the nodes to be +decommissioned. Key-value pairs for these node readiness labels can be +specified in the configuration (option name is in singular form): + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-operator +data: + node_readiness_label: "status1:ready,status2:ready" +``` + +```yaml +apiVersion: "acid.zalan.do/v1" +kind: OperatorConfiguration +metadata: + name: postgresql-configuration +configuration: + kubernetes: + node_readiness_label: + status1: ready + status2: ready +``` + +The operator will create a `nodeAffinity` on the pods. This makes the +`node_readiness_label` option the global configuration for defining node +affinities for all Postgres clusters. You can have both, cluster-specific and +global affinity, defined and they will get merged on the pods (AND condition). + +```yaml + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: environment + operator: In + values: + - pci + - key: status1 + operator: In + values: + - ready + - key: status2 + ... 
+``` + +If multiple `matchExpressions` are defined in the manifest (OR condition) the +readiness label configuration will be appended with its own expressions block: + +```yaml + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: environment + ... + - matchExpressions: + - key: storage + ... + - matchExpressions: + - key: status1 + ... + - key: status2 + ... +``` + ## Enable pod anti affinity To ensure Postgres pods are running on different topologies, you can use diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index de3b370f3..5bcbdef97 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -340,16 +340,11 @@ configuration they are grouped under the `kubernetes` key. * **node_readiness_label** a set of labels that a running and active node should possess to be - considered `ready`. The operator uses values of those labels to detect the - start of the Kubernetes cluster upgrade procedure and move master pods off - the nodes to be decommissioned. When the set is not empty, the operator also - assigns the `nodeAffinity` clause to the Postgres pods to be scheduled only - on `ready` nodes. If a `nodeAffinity` is specified in the postgres cluster - manifest as well the `nodeSelectorTerms` will get merged. If the - `nodeAffinity` of the manifest contains only one `matchExpressions` slice - the node readiniess label expressions will be moved there (AND condition). - When multiple selector expressions are defined in the manifest an extra - `matchExpressions` section is appended (OR condition). The default is empty. + considered `ready`. When the set is not empty, the operator assigns the + `nodeAffinity` clause to the Postgres pods to be scheduled only on `ready` + nodes. If a `nodeAffinity` is also specified in the postgres cluster + manifest both affinities will get merged on the pods. See [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes) + for more details. The default is empty. * **toleration** a dictionary that should contain `key`, `operator`, `value` and diff --git a/docs/user.md b/docs/user.md index 052c4c618..20db45979 100644 --- a/docs/user.md +++ b/docs/user.md @@ -671,7 +671,9 @@ configured [default requests](reference/operator_parameters.md#kubernetes-resour To ensure Postgres pods are running on nodes without any other application pods, you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/) -and configure the required toleration in the manifest. +and configure the required toleration in the manifest. Tolerations can also be +defined in the [operator config](administrator.md#use-taints-and-tolerations-for-dedicated-postgresql-nodes) +to apply for all Postgres clusters. ```yaml spec: @@ -704,8 +706,7 @@ spec: ``` If you need to define a `nodeAffinity` for all your Postgres clusters use the -`node_readiness_label` configuration option, which allows you to define a list -of key-value pairs. +`node_readiness_label` [configuration](administrator.md#node-readiness-labels). 
## In-place major version upgrade diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 9895d6ba4..ca58c10a0 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -375,7 +375,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa reasons = append(reasons, "new statefulset's number of replicas does not match the current one") } if !reflect.DeepEqual(c.Statefulset.Annotations, statefulSet.Annotations) { - match = false needsReplace = true reasons = append(reasons, "new statefulset's annotations do not match the current one") } diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index ed6af9229..f67a89a71 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -352,15 +352,15 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff }, } } else { - // if there are multiple node selector terms specified, append the node readiness label expressions (OR condition) if len(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) > 1 { + // if there are multiple node selector terms specified, append the node readiness label expressions (OR condition) manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm) nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ NodeSelectorTerms: manifestTerms, } - // if there's just one term defined merge it with the readiness label term (AND condition) } else { + // if there is just one term defined merge it with the readiness label term (AND condition) manifestExpressions := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions manifestExpressions = append(manifestExpressions, matchExpressions...) 
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ From 92db09182e093861ec1338d220963a3cdf17e8a2 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Thu, 6 Jan 2022 17:43:00 +0100 Subject: [PATCH 5/7] more e2e test cleanup --- e2e/tests/test_e2e.py | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 1878d93b6..88cad63a0 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -286,7 +286,7 @@ def test_additional_teams_and_members(self): # revert config change revert_resync = { "data": { - "resync_period": "30m", + "resync_period": "4m", }, } k8s.update_config(revert_resync) @@ -933,7 +933,7 @@ def test_node_affinity(self): # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - # master pod needs to be replaced as well to finish the rolling update + # next master will be switched over and pod needs to be replaced as well to finish the rolling update k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) @@ -994,9 +994,6 @@ def test_node_readiness_label(self): self.assertNotEqual(master_nodes, []) self.assertNotEqual(replica_nodes, []) - num_replicas = len(replica_nodes) - failover_targets = self.get_failover_targets(master_nodes[0], replica_nodes) - try: # add node_readiness_label to potential failover nodes patch_readiness_label = { @@ -1006,8 +1003,8 @@ def test_node_readiness_label(self): } } } - for failover_target in failover_targets: - k8s.api.core_v1.patch_node(failover_target, patch_readiness_label) + for replica_node in replica_nodes: + k8s.api.core_v1.patch_node(replica_node, patch_readiness_label) # define node_readiness_label in config map which should trigger a rolling update patch_readiness_label_config = { @@ -1019,19 +1016,18 @@ def test_node_readiness_label(self): self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") # first replica will be replaced and get the new affinity - k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - - # next switchover of the master - k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label) - - # the old master is replaced. 
However it might not start due to a volume node affinity conflict - # only if the pvc and pod are deleted it can be scheduled + # however, it might not start due to a volume node affinity conflict + # in this case only if the pvc and pod are deleted it can be scheduled replica = k8s.get_cluster_replica_pod() if replica.status.phase == 'Pending': k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default') k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default') k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + # next master will be switched over and pod needs to be replaced as well to finish the rolling update + k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + # patch also node where master ran before k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label) @@ -1330,7 +1326,7 @@ def test_rolling_update_label_timeout(self): patch_resync_config = { "data": { "pod_label_wait_timeout": "10m", - "resync_period": "30m", + "resync_period": "4m", } } k8s.update_config(patch_resync_config, "revert resync interval and pod_label_wait_timeout") @@ -1570,22 +1566,6 @@ def test_zz_cluster_deletion(self): } k8s.update_config(patch_delete_annotations) - def get_failover_targets(self, master_node, replica_nodes): - ''' - If all pods live on the same node, failover will happen to other worker(s) - ''' - k8s = self.k8s - k8s_master_exclusion = 'kubernetes.io/hostname!=postgres-operator-e2e-tests-control-plane' - - failover_targets = [x for x in replica_nodes if x != master_node] - if len(failover_targets) == 0: - nodes = k8s.api.core_v1.list_node(label_selector=k8s_master_exclusion) - for n in nodes.items: - if n.metadata.name != master_node: - failover_targets.append(n.metadata.name) - - return failover_targets - def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"): ''' Check that there is a single pod in the k8s cluster with the label "spilo-role=master" From 596ad5375d6a7deeac573f39cbf99e1141c86182 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Wed, 12 Jan 2022 13:21:19 +0100 Subject: [PATCH 6/7] add config option to change affinity merge behavior --- .../crds/operatorconfigurations.yaml | 5 +++++ charts/postgres-operator/values.yaml | 3 +++ docs/administrator.md | 9 ++++++--- docs/reference/operator_parameters.md | 11 ++++++++--- e2e/tests/test_e2e.py | 1 + manifests/configmap.yaml | 3 ++- manifests/operatorconfiguration.crd.yaml | 5 +++++ ...tgresql-operator-default-configuration.yaml | 1 + pkg/apis/acid.zalan.do/v1/crds.go | 11 +++++++++++ .../v1/operator_configuration_type.go | 1 + pkg/cluster/k8sres.go | 18 +++++++----------- pkg/controller/operator_config.go | 1 + pkg/util/config/config.go | 1 + 13 files changed, 52 insertions(+), 18 deletions(-) diff --git a/charts/postgres-operator/crds/operatorconfigurations.yaml b/charts/postgres-operator/crds/operatorconfigurations.yaml index 043129516..e99dce3b9 100644 --- a/charts/postgres-operator/crds/operatorconfigurations.yaml +++ b/charts/postgres-operator/crds/operatorconfigurations.yaml @@ -230,6 +230,11 @@ spec: type: object additionalProperties: type: string + node_readiness_label_merge: + type: string + enum: + - "AND" + - "OR" oauth_token_secret_name: type: string default: "postgresql-operator" diff --git a/charts/postgres-operator/values.yaml b/charts/postgres-operator/values.yaml index 65619845a..595009bac 100644 --- 
a/charts/postgres-operator/values.yaml +++ b/charts/postgres-operator/values.yaml @@ -130,6 +130,9 @@ configKubernetes: # node_readiness_label: # status: ready + # defines how nodeAffinity from manifest should be merged with node_readiness_label + # node_readiness_label_merge: "OR" + # namespaced name of the secret containing the OAuth2 token to pass to the teams API # oauth_token_secret_name: postgresql-operator diff --git a/docs/administrator.md b/docs/administrator.md index c15b324c7..879b677b9 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -370,7 +370,10 @@ configuration: The operator will create a `nodeAffinity` on the pods. This makes the `node_readiness_label` option the global configuration for defining node affinities for all Postgres clusters. You can have both, cluster-specific and -global affinity, defined and they will get merged on the pods (AND condition). +global affinity, defined and they will get merged on the pods. If +`node_readiness_label_merge` is configured to `"AND"` the node readiness +affinity will end up under the same `matchExpressions` section(s) from the +manifest affinity. ```yaml affinity: @@ -390,8 +393,8 @@ global affinity, defined and they will get merged on the pods (AND condition). ... ``` -If multiple `matchExpressions` are defined in the manifest (OR condition) the -readiness label configuration will be appended with its own expressions block: +If `node_readiness_label_merge` is set to `"OR"` (default) the readiness label +affinty will be appended with its own expressions block: ```yaml affinity: diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 5bcbdef97..e6a0f19b8 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -342,9 +342,14 @@ configuration they are grouped under the `kubernetes` key. a set of labels that a running and active node should possess to be considered `ready`. When the set is not empty, the operator assigns the `nodeAffinity` clause to the Postgres pods to be scheduled only on `ready` - nodes. If a `nodeAffinity` is also specified in the postgres cluster - manifest both affinities will get merged on the pods. See [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes) - for more details. The default is empty. + nodes. The default is empty. + +* **node_readiness_label_merge** + If a `nodeAffinity` is also specified in the postgres cluster manifest + it will get merged with the `node_readiness_label` affinity on the pods. + The merge strategy can be configured - it can either be "AND" or "OR". + See [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes) + for more details. Default is "OR". 
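
A brief sketch of how the two strategies differ once the manifest already defines node selector terms. This is a standalone illustration using `k8s.io/api/core/v1` types; the local `mergeMode` variable stands in for the `node_readiness_label_merge` setting rather than the operator's actual `OpConfig` field.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// expressions derived from node_readiness_label, e.g. "status1: ready"
	readiness := []v1.NodeSelectorRequirement{
		{Key: "status1", Operator: v1.NodeSelectorOpIn, Values: []string{"ready"}},
	}

	// node selector terms taken from the cluster manifest's nodeAffinity
	manifestTerms := []v1.NodeSelectorTerm{
		{MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "environment", Operator: v1.NodeSelectorOpIn, Values: []string{"pci"}},
		}},
		{MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "storage", Operator: v1.NodeSelectorOpIn, Values: []string{"ssd"}},
		}},
	}

	mergeMode := "AND" // stands in for node_readiness_label_merge

	switch mergeMode {
	case "OR":
		// readiness labels form an additional term; a node matching any
		// single term is eligible for scheduling
		manifestTerms = append(manifestTerms, v1.NodeSelectorTerm{MatchExpressions: readiness})
	case "AND":
		// readiness expressions are folded into every manifest term, so each
		// term must also match the readiness labels
		for i := range manifestTerms {
			manifestTerms[i].MatchExpressions = append(manifestTerms[i].MatchExpressions, readiness...)
		}
	}

	for i, term := range manifestTerms {
		fmt.Printf("term %d: %d expression(s)\n", i, len(term.MatchExpressions))
	}
}
```

With `"AND"` every term from the manifest must additionally satisfy the readiness labels; with `"OR"` (the default) a node matching only the readiness labels is also eligible, since `nodeSelectorTerms` are evaluated as alternatives.
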
* **toleration** a dictionary that should contain `key`, `operator`, `value` and diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 88cad63a0..3047de259 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -1010,6 +1010,7 @@ def test_node_readiness_label(self): patch_readiness_label_config = { "data": { "node_readiness_label": readiness_label + ':' + readiness_value, + "node_readiness_label_merge": "AND", } } k8s.update_config(patch_readiness_label_config, "setting readiness label") diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index 7d3e14ce3..332c184ed 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -85,7 +85,8 @@ data: # min_cpu_limit: 250m # min_memory_limit: 250Mi # minimal_major_version: "9.6" - # node_readiness_label: "" + # node_readiness_label: "status:ready" + # node_readiness_label_merge: "OR" # oauth_token_secret_name: postgresql-operator # pam_configuration: | # https://info.example.com/oauth2/tokeninfo?access_token= uid realm=/employees diff --git a/manifests/operatorconfiguration.crd.yaml b/manifests/operatorconfiguration.crd.yaml index bb64995ab..84a8024e9 100644 --- a/manifests/operatorconfiguration.crd.yaml +++ b/manifests/operatorconfiguration.crd.yaml @@ -225,6 +225,11 @@ spec: type: object additionalProperties: type: string + node_readiness_label_merge: + type: string + enum: + - "AND" + - "OR" oauth_token_secret_name: type: string default: "postgresql-operator" diff --git a/manifests/postgresql-operator-default-configuration.yaml b/manifests/postgresql-operator-default-configuration.yaml index 02d558543..a82226394 100644 --- a/manifests/postgresql-operator-default-configuration.yaml +++ b/manifests/postgresql-operator-default-configuration.yaml @@ -69,6 +69,7 @@ configuration: master_pod_move_timeout: 20m # node_readiness_label: # status: ready + # node_readiness_label_merge: "OR" oauth_token_secret_name: postgresql-operator pdb_name_format: "postgres-{cluster}-pdb" pod_antiaffinity_topology_key: "kubernetes.io/hostname" diff --git a/pkg/apis/acid.zalan.do/v1/crds.go b/pkg/apis/acid.zalan.do/v1/crds.go index fae5a09f2..61efdcd0f 100644 --- a/pkg/apis/acid.zalan.do/v1/crds.go +++ b/pkg/apis/acid.zalan.do/v1/crds.go @@ -1164,6 +1164,17 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{ }, }, }, + "node_readiness_label_merge": { + Type: "string", + Enum: []apiextv1.JSON{ + { + Raw: []byte(`"AND"`), + }, + { + Raw: []byte(`"OR"`), + }, + }, + }, "oauth_token_secret_name": { Type: "string", }, diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index f8eb5b5d1..a11cba0a5 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -82,6 +82,7 @@ type KubernetesMetaConfiguration struct { DeleteAnnotationDateKey string `json:"delete_annotation_date_key,omitempty"` DeleteAnnotationNameKey string `json:"delete_annotation_name_key,omitempty"` NodeReadinessLabel map[string]string `json:"node_readiness_label,omitempty"` + NodeReadinessLabelMerge string `json:"node_readiness_label_merge,omitempty"` CustomPodAnnotations map[string]string `json:"custom_pod_annotations,omitempty"` // TODO: use a proper toleration structure? 
PodToleration map[string]string `json:"toleration,omitempty"` diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index f67a89a71..0749dc691 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -327,7 +327,7 @@ func generateCapabilities(capabilities []string) *v1.Capabilities { return nil } -func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity { +func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity { if len(nodeReadinessLabel) == 0 && nodeAffinity == nil { return nil } @@ -352,21 +352,17 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff }, } } else { - if len(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) > 1 { - // if there are multiple node selector terms specified, append the node readiness label expressions (OR condition) + if c.OpConfig.NodeReadinessLabelMerge == "OR" { manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm) nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ NodeSelectorTerms: manifestTerms, } - } else { - // if there is just one term defined merge it with the readiness label term (AND condition) - manifestExpressions := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions - manifestExpressions = append(manifestExpressions, matchExpressions...) - nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - v1.NodeSelectorTerm{MatchExpressions: manifestExpressions}, - }, + } else if c.OpConfig.NodeReadinessLabelMerge == "AND" { + for i, nodeSelectorTerm := range nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms { + manifestExpressions := nodeSelectorTerm.MatchExpressions + manifestExpressions = append(manifestExpressions, matchExpressions...) 
+ nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i] = v1.NodeSelectorTerm{MatchExpressions: manifestExpressions} } } } diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index 275898d8e..f94f3b96c 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -109,6 +109,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.DeleteAnnotationDateKey = fromCRD.Kubernetes.DeleteAnnotationDateKey result.DeleteAnnotationNameKey = fromCRD.Kubernetes.DeleteAnnotationNameKey result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel + result.NodeReadinessLabelMerge = fromCRD.Kubernetes.NodeReadinessLabelMerge result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName result.PodManagementPolicy = util.Coalesce(fromCRD.Kubernetes.PodManagementPolicy, "ordered_ready") result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m") diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 71bf406e4..e676a3c1e 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -54,6 +54,7 @@ type Resources struct { PodEnvironmentConfigMap spec.NamespacedName `name:"pod_environment_configmap"` PodEnvironmentSecret string `name:"pod_environment_secret"` NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""` + NodeReadinessLabelMerge string `name:"node_readiness_label_merge" default:"OR"` MaxInstances int32 `name:"max_instances" default:"-1"` MinInstances int32 `name:"min_instances" default:"-1"` ShmVolume *bool `name:"enable_shm_volume" default:"true"` From 133106bcc22c30ffeb187135065eba155b04aad6 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Wed, 12 Jan 2022 15:37:23 +0100 Subject: [PATCH 7/7] nodeAffinity cluster method --- pkg/cluster/connection_pooler.go | 2 +- pkg/cluster/k8sres.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/cluster/connection_pooler.go b/pkg/cluster/connection_pooler.go index c5c55350f..ec9fe291d 100644 --- a/pkg/cluster/connection_pooler.go +++ b/pkg/cluster/connection_pooler.go @@ -309,7 +309,7 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) ( }, } - nodeAffinity := nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity) + nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity) if c.OpConfig.EnablePodAntiAffinity { labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels) podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 0749dc691..fda192df8 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -1270,7 +1270,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef effectiveRunAsUser, effectiveRunAsGroup, effectiveFSGroup, - nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity), + c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity), spec.SchedulerName, int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), c.OpConfig.PodServiceAccountName, @@ -2020,7 +2020,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) { nil, nil, nil, - nodeAffinity(c.OpConfig.NodeReadinessLabel, nil), + c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil), nil, int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), c.OpConfig.PodServiceAccountName,