diff --git a/charts/postgres-operator/crds/operatorconfigurations.yaml b/charts/postgres-operator/crds/operatorconfigurations.yaml index 043129516..e99dce3b9 100644 --- a/charts/postgres-operator/crds/operatorconfigurations.yaml +++ b/charts/postgres-operator/crds/operatorconfigurations.yaml @@ -230,6 +230,11 @@ spec: type: object additionalProperties: type: string + node_readiness_label_merge: + type: string + enum: + - "AND" + - "OR" oauth_token_secret_name: type: string default: "postgresql-operator" diff --git a/charts/postgres-operator/values.yaml b/charts/postgres-operator/values.yaml index 65619845a..595009bac 100644 --- a/charts/postgres-operator/values.yaml +++ b/charts/postgres-operator/values.yaml @@ -130,6 +130,9 @@ configKubernetes: # node_readiness_label: # status: ready + # defines how nodeAffinity from the manifest should be merged with node_readiness_label + # node_readiness_label_merge: "OR" + # namespaced name of the secret containing the OAuth2 token to pass to the teams API # oauth_token_secret_name: postgresql-operator diff --git a/docs/administrator.md b/docs/administrator.md index 551ee5523..879b677b9 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -339,6 +339,81 @@ master pods from being evicted by the K8s runtime. To prevent eviction completely, specify the toleration by leaving out the `tolerationSeconds` value (similar to how Kubernetes' own DaemonSets are configured) +## Node readiness labels + +The operator can watch certain node labels to detect e.g. the start of a +Kubernetes cluster upgrade procedure and move master pods off the nodes to be +decommissioned. Key-value pairs for these node readiness labels can be +specified in the configuration (option name is in singular form): + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-operator +data: + node_readiness_label: "status1:ready,status2:ready" +``` + +```yaml +apiVersion: "acid.zalan.do/v1" +kind: OperatorConfiguration +metadata: + name: postgresql-configuration +configuration: + kubernetes: + node_readiness_label: + status1: ready + status2: ready +``` + +The operator will create a `nodeAffinity` on the pods. This makes the +`node_readiness_label` option the global configuration for defining node +affinities for all Postgres clusters. You can have both cluster-specific and +global affinity defined, and they will get merged on the pods. If +`node_readiness_label_merge` is configured to `"AND"`, the node readiness +expressions will be added to the same `matchExpressions` section(s) defined in +the manifest affinity. + +```yaml + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: environment + operator: In + values: + - pci + - key: status1 + operator: In + values: + - ready + - key: status2 + ... +``` + +If `node_readiness_label_merge` is set to `"OR"` (default), the readiness label +affinity will be appended as its own expressions block: + +```yaml + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: environment + ... + - matchExpressions: + - key: storage + ... + - matchExpressions: + - key: status1 + ... + - key: status2 + ...
+``` + ## Enable pod anti affinity To ensure Postgres pods are running on different topologies, you can use diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 00febcf89..e6a0f19b8 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -340,11 +340,16 @@ configuration they are grouped under the `kubernetes` key. * **node_readiness_label** a set of labels that a running and active node should possess to be - considered `ready`. The operator uses values of those labels to detect the - start of the Kubernetes cluster upgrade procedure and move master pods off - the nodes to be decommissioned. When the set is not empty, the operator also - assigns the `Affinity` clause to the Postgres pods to be scheduled only on - `ready` nodes. The default is empty. + considered `ready`. When the set is not empty, the operator assigns the + `nodeAffinity` clause to the Postgres pods to be scheduled only on `ready` + nodes. The default is empty. + +* **node_readiness_label_merge** + If a `nodeAffinity` is also specified in the Postgres cluster manifest, + it will get merged with the `node_readiness_label` affinity on the pods. + The merge strategy can be configured to either "AND" or "OR". + See [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes) + for more details. The default is "OR". * **toleration** a dictionary that should contain `key`, `operator`, `value` and diff --git a/docs/user.md b/docs/user.md index 572d832ab..20db45979 100644 --- a/docs/user.md +++ b/docs/user.md @@ -671,7 +671,9 @@ configured [default requests](reference/operator_parameters.md#kubernetes-resour To ensure Postgres pods are running on nodes without any other application pods, you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/) -and configure the required toleration in the manifest. +and configure the required toleration in the manifest. Tolerations can also be +defined in the [operator config](administrator.md#use-taints-and-tolerations-for-dedicated-postgresql-nodes) +to apply to all Postgres clusters. ```yaml spec: @@ -703,6 +705,9 @@ spec: - pci ``` +If you need to define a `nodeAffinity` for all your Postgres clusters, use the +`node_readiness_label` [configuration](administrator.md#node-readiness-labels).
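To make the two merge strategies described above concrete, the following minimal Go sketch (illustrative only, not part of this change) builds the resulting `requiredDuringSchedulingIgnoredDuringExecution` selector for a manifest affinity requiring `environment=pci` combined with a readiness label `status:ready`. With `"AND"` the readiness expression joins the manifest's `matchExpressions`, so a node must satisfy both requirements; with `"OR"` it becomes a separate `nodeSelectorTerm`, so a node may satisfy either. The helper name `manifestTerm` is an assumption made for the example.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// manifestTerm mimics the nodeAffinity term from a Postgres cluster manifest
// requiring environment=pci.
func manifestTerm() v1.NodeSelectorTerm {
	return v1.NodeSelectorTerm{
		MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "environment", Operator: v1.NodeSelectorOpIn, Values: []string{"pci"}},
		},
	}
}

func main() {
	// expression derived from node_readiness_label "status:ready"
	readiness := v1.NodeSelectorRequirement{
		Key: "status", Operator: v1.NodeSelectorOpIn, Values: []string{"ready"},
	}

	// "AND": the readiness expression joins the manifest term -> node must match both
	andTerm := manifestTerm()
	andTerm.MatchExpressions = append(andTerm.MatchExpressions, readiness)
	andSelector := v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{andTerm}}

	// "OR": the readiness expression forms its own term -> node may match either
	orSelector := v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{
		manifestTerm(),
		{MatchExpressions: []v1.NodeSelectorRequirement{readiness}},
	}}

	fmt.Printf("AND: %d term(s), OR: %d term(s)\n",
		len(andSelector.NodeSelectorTerms), len(orSelector.NodeSelectorTerms))
}
```

In scheduling terms, `"AND"` narrows placement to nodes that carry both the manifest labels and the readiness labels, while `"OR"` widens it to nodes that carry either set.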
+ ## In-place major version upgrade Starting with Spilo 13, operator supports in-place major version upgrade to a diff --git a/e2e/tests/k8s_api.py b/e2e/tests/k8s_api.py index c3ad1c999..f58e16b50 100644 --- a/e2e/tests/k8s_api.py +++ b/e2e/tests/k8s_api.py @@ -53,7 +53,7 @@ def get_pg_nodes(self, pg_cluster_name, namespace='default'): return master_pod_node, replica_pod_nodes - def get_cluster_nodes(self, cluster_labels='cluster-name=acid-minimal-cluster', namespace='default'): + def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'): m = [] r = [] podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels) diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 606abc95f..3047de259 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -286,7 +286,7 @@ def test_additional_teams_and_members(self): # revert config change revert_resync = { "data": { - "resync_period": "30m", + "resync_period": "4m", }, } k8s.update_config(revert_resync) @@ -880,12 +880,10 @@ def test_node_affinity(self): # verify we are in good state from potential previous tests self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") - self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") # get nodes of master and replica(s) - master_node, replica_nodes = k8s.get_pg_nodes(cluster_label) - - self.assertNotEqual(master_node, []) + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) self.assertNotEqual(replica_nodes, []) # label node with environment=postgres @@ -898,8 +896,8 @@ def test_node_affinity(self): } try: - # patch current master node with the label - k8s.api.core_v1.patch_node(master_node, node_label_body) + # patch master node with the label + k8s.api.core_v1.patch_node(master_nodes[0], node_label_body) # add node affinity to cluster patch_node_affinity_config = { @@ -923,7 +921,6 @@ def test_node_affinity(self): } } } - k8s.api.custom_objects_api.patch_namespaced_custom_object( group="acid.zalan.do", version="v1", @@ -934,14 +931,17 @@ def test_node_affinity(self): self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement - k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label) + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + # next master will be switched over and pod needs to be replaced as well to finish the rolling update + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label) for pod in podsList.items: if pod.metadata.labels.get('spilo-role') == 'replica': - self.assertEqual(master_node, pod.spec.node_name, - "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name)) + self.assertEqual(master_nodes[0], pod.spec.node_name, + "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_nodes[0], pod.spec.node_name)) # check that pod has correct node affinity key = 
pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key @@ -966,15 +966,17 @@ def test_node_affinity(self): self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") # node affinity change should cause another rolling update and relocation of replica - k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label) + k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) except timeout_decorator.TimeoutError: print('Operator log: {}'.format(k8s.get_operator_log())) raise + # toggle pod anti affinity to make sure replica and master run on separate nodes + self.assert_distributed_pods(replica_nodes) + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) - @unittest.skip("Skipping this test until fixed") def test_node_readiness_label(self): ''' Remove node readiness label from master node. This must cause a failover. @@ -984,12 +986,15 @@ def test_node_readiness_label(self): readiness_label = 'lifecycle-status' readiness_value = 'ready' - try: - # get nodes of master and replica(s) (expected target of new master) - current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label) - num_replicas = len(current_replica_nodes) - failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes) + # verify we are in good state from potential previous tests + self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") + # get nodes of master and replica(s) (expected target of new master) + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) + self.assertNotEqual(replica_nodes, []) + + try: # add node_readiness_label to potential failover nodes patch_readiness_label = { "metadata": { @@ -998,30 +1003,43 @@ def test_node_readiness_label(self): } } } - self.assertTrue(len(failover_targets) > 0, "No failover targets available") - for failover_target in failover_targets: - k8s.api.core_v1.patch_node(failover_target, patch_readiness_label) + for replica_node in replica_nodes: + k8s.api.core_v1.patch_node(replica_node, patch_readiness_label) - # define node_readiness_label in config map which should trigger a failover of the master + # define node_readiness_label in config map which should trigger a rolling update patch_readiness_label_config = { "data": { "node_readiness_label": readiness_label + ':' + readiness_value, + "node_readiness_label_merge": "AND", } } k8s.update_config(patch_readiness_label_config, "setting readiness label") - new_master_node, new_replica_nodes = self.assert_failover( - current_master_node, num_replicas, failover_targets, cluster_label) + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - # patch also node where master ran before - k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label) + # first replica will be replaced and get the new affinity + # however, it might not start due to a volume node affinity conflict + # in this case only if the pvc and pod are deleted it can be scheduled + replica = k8s.get_cluster_replica_pod() + if replica.status.phase == 'Pending': + k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default') + k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default') + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) 
+ + # next master will be switched over and pod needs to be replaced as well to finish the rolling update + k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - # toggle pod anti affinity to move replica away from master node - self.eventuallyTrue(lambda: self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label), "Pods are redistributed") + # patch also node where master ran before + k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label) except timeout_decorator.TimeoutError: print('Operator log: {}'.format(k8s.get_operator_log())) raise + # toggle pod anti affinity to move replica away from master node + self.assert_distributed_pods(master_nodes) + + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_overwrite_pooler_deployment(self): k8s = self.k8s @@ -1309,7 +1327,7 @@ def test_rolling_update_label_timeout(self): patch_resync_config = { "data": { "pod_label_wait_timeout": "10m", - "resync_period": "30m", + "resync_period": "4m", } } k8s.update_config(patch_resync_config, "revert resync interval and pod_label_wait_timeout") @@ -1413,7 +1431,6 @@ def test_statefulset_annotation_propagation(self): self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing") @timeout_decorator.timeout(TEST_TIMEOUT_SEC) - @unittest.skip("Skipping this test until fixed") def test_taint_based_eviction(self): ''' Add taint "postgres=:NoExecute" to node with master. This must cause a failover. @@ -1427,7 +1444,6 @@ def test_taint_based_eviction(self): # get nodes of master and replica(s) (expected target of new master) master_nodes, replica_nodes = k8s.get_cluster_nodes() - self.assertNotEqual(master_nodes, []) self.assertNotEqual(replica_nodes, []) @@ -1442,10 +1458,7 @@ def test_taint_based_eviction(self): ] } } - k8s.api.core_v1.patch_node(master_nodes[0], body) - self.eventuallyTrue(lambda: k8s.get_cluster_nodes()[0], replica_nodes) - self.assertNotEqual(lambda: k8s.get_cluster_nodes()[0], master_nodes) # add toleration to pods patch_toleration_config = { @@ -1454,15 +1467,20 @@ def test_taint_based_eviction(self): } } - k8s.update_config(patch_toleration_config, step="allow tainted nodes") + try: + k8s.update_config(patch_toleration_config, step="allow tainted nodes") + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, + "Operator does not get in sync") - self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") - self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") + self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running") + self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running") + + except timeout_decorator.TimeoutError: + print('Operator log: {}'.format(k8s.get_operator_log())) + raise # toggle pod anti affinity to move replica away from master node - nm, new_replica_nodes = k8s.get_cluster_nodes() - new_master_node = nm[0] - self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label) + self.assert_distributed_pods(master_nodes) @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_zz_cluster_deletion(self): @@ -1549,39 +1567,6 @@ def test_zz_cluster_deletion(self): } k8s.update_config(patch_delete_annotations) - def get_failover_targets(self, master_node, replica_nodes): - ''' - If 
all pods live on the same node, failover will happen to other worker(s) - ''' - k8s = self.k8s - k8s_master_exclusion = 'kubernetes.io/hostname!=postgres-operator-e2e-tests-control-plane' - - failover_targets = [x for x in replica_nodes if x != master_node] - if len(failover_targets) == 0: - nodes = k8s.api.core_v1.list_node(label_selector=k8s_master_exclusion) - for n in nodes.items: - if n.metadata.name != master_node: - failover_targets.append(n.metadata.name) - - return failover_targets - - def assert_failover(self, current_master_node, num_replicas, failover_targets, cluster_label): - ''' - Check if master is failing over. The replica should move first to be the switchover target - ''' - k8s = self.k8s - k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label) - k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - - new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label) - self.assertNotEqual(current_master_node, new_master_node, - "Master on {} did not fail over to one of {}".format(current_master_node, failover_targets)) - self.assertEqual(num_replicas, len(new_replica_nodes), - "Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes))) - self.assert_master_is_unique() - - return new_master_node, new_replica_nodes - def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"): ''' Check that there is a single pod in the k8s cluster with the label "spilo-role=master" @@ -1593,14 +1578,23 @@ def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal num_of_master_pods = k8s.count_pods_with_label(labels, namespace) self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods)) - def assert_distributed_pods(self, master_node, replica_nodes, cluster_label): + @timeout_decorator.timeout(TEST_TIMEOUT_SEC) + def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=acid-minimal-cluster'): ''' Other tests can lead to the situation that master and replica are on the same node. Toggle pod anti affinty to distribute pods accross nodes (replica in particular). 
''' k8s = self.k8s - cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster' - failover_targets = self.get_failover_targets(master_node, replica_nodes) + cluster_labels = 'application=spilo,cluster-name=acid-minimal-cluster' + + # get nodes of master and replica(s) + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) + self.assertNotEqual(replica_nodes, []) + + # if nodes are different we can quit here + if master_nodes[0] not in replica_nodes: + return True # enable pod anti affintiy in config map which should trigger movement of replica patch_enable_antiaffinity = { @@ -1608,18 +1602,40 @@ def assert_distributed_pods(self, master_node, replica_nodes, cluster_label): "enable_pod_antiaffinity": "true" } } - k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity") - self.assert_failover(master_node, len(replica_nodes), failover_targets, cluster_label) - # now disable pod anti affintiy again which will cause yet another failover - patch_disable_antiaffinity = { - "data": { - "enable_pod_antiaffinity": "false" + try: + k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity") + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") + + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels) + k8s.wait_for_running_pods(cluster_labels, 2) + + # now disable pod anti affintiy again which will cause yet another failover + patch_disable_antiaffinity = { + "data": { + "enable_pod_antiaffinity": "false" + } } - } - k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity") - k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) - k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity") + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") + + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels) + k8s.wait_for_running_pods(cluster_labels, 2) + + master_nodes, replica_nodes = k8s.get_cluster_nodes() + self.assertNotEqual(master_nodes, []) + self.assertNotEqual(replica_nodes, []) + + # if nodes are different we can quit here + for target_node in target_nodes: + if (target_node not in master_nodes or target_node not in replica_nodes) and master_nodes[0] in replica_nodes: + print('Pods run on the same node') + return False + + except timeout_decorator.TimeoutError: + print('Operator log: {}'.format(k8s.get_operator_log())) + raise + return True def list_databases(self, pod_name): diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index 7d3e14ce3..332c184ed 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -85,7 +85,8 @@ data: # min_cpu_limit: 250m # min_memory_limit: 250Mi # minimal_major_version: "9.6" - # node_readiness_label: "" + # node_readiness_label: "status:ready" + # node_readiness_label_merge: "OR" # oauth_token_secret_name: postgresql-operator # pam_configuration: | # https://info.example.com/oauth2/tokeninfo?access_token= uid realm=/employees diff --git a/manifests/operatorconfiguration.crd.yaml b/manifests/operatorconfiguration.crd.yaml index bb64995ab..84a8024e9 100644 --- a/manifests/operatorconfiguration.crd.yaml +++ b/manifests/operatorconfiguration.crd.yaml @@ -225,6 +225,11 @@ spec: type: object additionalProperties: type: string + node_readiness_label_merge: + type: string + enum: + - "AND" + - "OR" oauth_token_secret_name: type: string default: "postgresql-operator" 
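Note that the enum above only guards the CRD-based `OperatorConfiguration`; when the operator is configured through a ConfigMap (as in `manifests/configmap.yaml`), `node_readiness_label_merge` arrives as a plain string. A hypothetical helper along the following lines could normalize such a value and fall back to the documented `"OR"` default; this is only a sketch and not code added by this change.

```go
package config

import (
	"fmt"
	"strings"
)

// normalizeReadinessLabelMerge validates a node_readiness_label_merge value
// read from a ConfigMap and falls back to the documented default "OR".
// Hypothetical helper for illustration only.
func normalizeReadinessLabelMerge(value string) (string, error) {
	switch strings.ToUpper(strings.TrimSpace(value)) {
	case "", "OR":
		return "OR", nil
	case "AND":
		return "AND", nil
	default:
		return "", fmt.Errorf("node_readiness_label_merge must be AND or OR, got %q", value)
	}
}
```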
diff --git a/manifests/postgresql-operator-default-configuration.yaml b/manifests/postgresql-operator-default-configuration.yaml index 02d558543..a82226394 100644 --- a/manifests/postgresql-operator-default-configuration.yaml +++ b/manifests/postgresql-operator-default-configuration.yaml @@ -69,6 +69,7 @@ configuration: master_pod_move_timeout: 20m # node_readiness_label: # status: ready + # node_readiness_label_merge: "OR" oauth_token_secret_name: postgresql-operator pdb_name_format: "postgres-{cluster}-pdb" pod_antiaffinity_topology_key: "kubernetes.io/hostname" diff --git a/pkg/apis/acid.zalan.do/v1/crds.go b/pkg/apis/acid.zalan.do/v1/crds.go index fae5a09f2..61efdcd0f 100644 --- a/pkg/apis/acid.zalan.do/v1/crds.go +++ b/pkg/apis/acid.zalan.do/v1/crds.go @@ -1164,6 +1164,17 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{ }, }, }, + "node_readiness_label_merge": { + Type: "string", + Enum: []apiextv1.JSON{ + { + Raw: []byte(`"AND"`), + }, + { + Raw: []byte(`"OR"`), + }, + }, + }, "oauth_token_secret_name": { Type: "string", }, diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index f8eb5b5d1..a11cba0a5 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -82,6 +82,7 @@ type KubernetesMetaConfiguration struct { DeleteAnnotationDateKey string `json:"delete_annotation_date_key,omitempty"` DeleteAnnotationNameKey string `json:"delete_annotation_name_key,omitempty"` NodeReadinessLabel map[string]string `json:"node_readiness_label,omitempty"` + NodeReadinessLabelMerge string `json:"node_readiness_label_merge,omitempty"` CustomPodAnnotations map[string]string `json:"custom_pod_annotations,omitempty"` // TODO: use a proper toleration structure? 
PodToleration map[string]string `json:"toleration,omitempty"` diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 967f9d530..ca58c10a0 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -375,7 +375,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa reasons = append(reasons, "new statefulset's number of replicas does not match the current one") } if !reflect.DeepEqual(c.Statefulset.Annotations, statefulSet.Annotations) { - match = false needsReplace = true reasons = append(reasons, "new statefulset's annotations do not match the current one") } @@ -406,6 +405,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa needsRollUpdate = true reasons = append(reasons, "new statefulset's pod affinity does not match the current one") } + if len(c.Statefulset.Spec.Template.Spec.Tolerations) != len(statefulSet.Spec.Template.Spec.Tolerations) { + needsReplace = true + needsRollUpdate = true + reasons = append(reasons, "new statefulset's pod tolerations does not match the current one") + } // Some generated fields like creationTimestamp make it not possible to use DeepCompare on Spec.Template.ObjectMeta if !reflect.DeepEqual(c.Statefulset.Spec.Template.Labels, statefulSet.Spec.Template.Labels) { @@ -427,13 +431,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa } if !reflect.DeepEqual(c.Statefulset.Spec.Template.Annotations, statefulSet.Spec.Template.Annotations) { - match = false needsReplace = true needsRollUpdate = true reasons = append(reasons, "new statefulset's pod template metadata annotations does not match the current one") } if !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.SecurityContext, statefulSet.Spec.Template.Spec.SecurityContext) { - match = false needsReplace = true needsRollUpdate = true reasons = append(reasons, "new statefulset's pod template security context in spec does not match the current one") @@ -469,7 +471,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa // we assume any change in priority happens by rolling out a new priority class // changing the priority value in an existing class is not supproted if c.Statefulset.Spec.Template.Spec.PriorityClassName != statefulSet.Spec.Template.Spec.PriorityClassName { - match = false needsReplace = true needsRollUpdate = true reasons = append(reasons, "new statefulset's pod priority class in spec does not match the current one") diff --git a/pkg/cluster/connection_pooler.go b/pkg/cluster/connection_pooler.go index c5c55350f..ec9fe291d 100644 --- a/pkg/cluster/connection_pooler.go +++ b/pkg/cluster/connection_pooler.go @@ -309,7 +309,7 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) ( }, } - nodeAffinity := nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity) + nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity) if c.OpConfig.EnablePodAntiAffinity { labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels) podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index e7d8ea376..fda192df8 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -327,7 +327,7 @@ func generateCapabilities(capabilities []string) *v1.Capabilities { return nil } -func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity { +func (c *Cluster) 
nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity { if len(nodeReadinessLabel) == 0 && nodeAffinity == nil { return nil } @@ -352,8 +352,18 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff }, } } else { - nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ - NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm), + if c.OpConfig.NodeReadinessLabelMerge == "OR" { + manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms + manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm) + nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ + NodeSelectorTerms: manifestTerms, + } + } else if c.OpConfig.NodeReadinessLabelMerge == "AND" { + for i, nodeSelectorTerm := range nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms { + manifestExpressions := nodeSelectorTerm.MatchExpressions + manifestExpressions = append(manifestExpressions, matchExpressions...) + nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i] = v1.NodeSelectorTerm{MatchExpressions: manifestExpressions} + } } } } @@ -1260,7 +1270,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef effectiveRunAsUser, effectiveRunAsGroup, effectiveFSGroup, - nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity), + c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity), spec.SchedulerName, int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), c.OpConfig.PodServiceAccountName, @@ -2010,7 +2020,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) { nil, nil, nil, - nodeAffinity(c.OpConfig.NodeReadinessLabel, nil), + c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil), nil, int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), c.OpConfig.PodServiceAccountName, diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index 275898d8e..f94f3b96c 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -109,6 +109,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.DeleteAnnotationDateKey = fromCRD.Kubernetes.DeleteAnnotationDateKey result.DeleteAnnotationNameKey = fromCRD.Kubernetes.DeleteAnnotationNameKey result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel + result.NodeReadinessLabelMerge = fromCRD.Kubernetes.NodeReadinessLabelMerge result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName result.PodManagementPolicy = util.Coalesce(fromCRD.Kubernetes.PodManagementPolicy, "ordered_ready") result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m") diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 71bf406e4..e676a3c1e 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -54,6 +54,7 @@ type Resources struct { PodEnvironmentConfigMap spec.NamespacedName `name:"pod_environment_configmap"` PodEnvironmentSecret string `name:"pod_environment_secret"` NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""` + NodeReadinessLabelMerge string `name:"node_readiness_label_merge" default:"OR"` MaxInstances int32 `name:"max_instances" default:"-1"` MinInstances int32 `name:"min_instances" default:"-1"` ShmVolume *bool 
`name:"enable_shm_volume" default:"true"`