feat: add blue/green upgrade strategy settings

ProgEsteves committed Jan 31, 2023
1 parent 968b024 commit 944d4ed
Showing 21 changed files with 638 additions and 50 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -275,8 +275,13 @@ The node_pools variable takes the following parameters:
| min_cpu_platform | Minimum CPU platform to be used by the nodes in the pool. The nodes may be scheduled on the specified or newer CPU platform. | " " | Optional |
| max_count | Maximum number of nodes in the NodePool. Must be >= min_count | 100 | Optional |
| max_pods_per_node | The maximum number of pods per node in this cluster | null | Optional |
| strategy | The upgrade strategy to use when upgrading the nodes. Valid values are `SURGE` or `BLUE_GREEN` (see the example below) | "SURGE" | Optional |
| max_surge | The number of additional nodes that can be added to the node pool during an upgrade. Increasing max_surge raises the number of nodes that can be upgraded simultaneously. Can be set to 0 or greater. | 1 | Optional |
| max_unavailable | The number of nodes that can be simultaneously unavailable during an upgrade. Increasing max_unavailable raises the number of nodes that can be upgraded in parallel. Can be set to 0 or greater. | 0 | Optional |
| node_pool_soak_duration | Time needed after draining the entire blue pool. After this period, the blue pool will be cleaned up. By default, it is set to one hour (3600 seconds). The maximum length of the soak time is 7 days (604,800 seconds). | "3600s" | Optional |
| batch_soak_duration | Soak time after each batch gets drained, with the default being zero seconds. | "0s" | Optional |
| batch_node_count | Absolute number of nodes to drain in a batch. If it is set to zero, this phase will be skipped. | null | Optional |
| batch_percentage | Percentage of nodes to drain in a batch. Must be in the range of [0.0, 1.0]. If it is set to zero, this phase will be skipped. | null | Optional |
| min_count | Minimum number of nodes in the NodePool. Must be >=0 and <= max_count. Should be used when autoscaling is true | 1 | Optional |
| name | The name of the node pool | | Required |
| node_count | The number of nodes in the nodepool when autoscaling is false. Otherwise defaults to 1. Only valid for non-autoscaling clusters | | Required |
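For illustration, a minimal sketch of `node_pools` entries for each strategy; the pool names and values here are hypothetical, and any parameter left out falls back to the defaults in the table above:

```hcl
node_pools = [
  {
    # Rolling (surge) upgrade: add up to 2 extra nodes during the upgrade,
    # never take an existing node offline.
    name            = "surge-pool"
    strategy        = "SURGE"
    max_surge       = 2
    max_unavailable = 0
  },
  {
    # Blue/green upgrade: drain 25% of the blue pool per batch, soak 5 minutes
    # after each batch, then keep the drained blue pool for 1 hour before cleanup.
    name                    = "blue-green-pool"
    strategy                = "BLUE_GREEN"
    batch_percentage        = 0.25
    batch_soak_duration     = "300s"
    node_pool_soak_duration = "3600s"
  },
]
```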
5 changes: 5 additions & 0 deletions autogen/main/README.md
@@ -212,8 +212,13 @@ The node_pools variable takes the following parameters:
| min_cpu_platform | Minimum CPU platform to be used by the nodes in the pool. The nodes may be scheduled on the specified or newer CPU platform. | " " | Optional |
| max_count | Maximum number of nodes in the NodePool. Must be >= min_count | 100 | Optional |
| max_pods_per_node | The maximum number of pods per node in this cluster | null | Optional |
| strategy | The upgrade strategy to use when upgrading the nodes. Valid values are `SURGE` or `BLUE_GREEN` | "SURGE" | Optional |
| max_surge | The number of additional nodes that can be added to the node pool during an upgrade. Increasing max_surge raises the number of nodes that can be upgraded simultaneously. Can be set to 0 or greater. | 1 | Optional |
| max_unavailable | The number of nodes that can be simultaneously unavailable during an upgrade. Increasing max_unavailable raises the number of nodes that can be upgraded in parallel. Can be set to 0 or greater. | 0 | Optional |
| node_pool_soak_duration | Time needed after draining the entire blue pool. After this period, the blue pool will be cleaned up. By default, it is set to one hour (3600 seconds). The maximum length of the soak time is 7 days (604,800 seconds). | "3600s" | Optional |
| batch_soak_duration | Soak time after each batch gets drained, with the default being zero seconds. | "0s" | Optional |
| batch_node_count | Absolute number of nodes to drain in a batch. If it is set to zero, this phase will be skipped. | null | Optional |
| batch_percentage | Percentage of nodes to drain in a batch. Must be in the range of [0.0, 1.0]. If it is set to zero, this phase will be skipped. | null | Optional |
| min_count | Minimum number of nodes in the NodePool. Must be >=0 and <= max_count. Should be used when autoscaling is true | 1 | Optional |
| name | The name of the node pool | | Required |
{% if beta_cluster %}
27 changes: 24 additions & 3 deletions autogen/main/cluster.tf.tmpl
@@ -676,9 +676,30 @@ resource "google_container_node_pool" "windows_pools" {
auto_upgrade = lookup(each.value, "auto_upgrade", local.default_auto_upgrade)
}

upgrade_settings {
max_surge = lookup(each.value, "max_surge", 1)
max_unavailable = lookup(each.value, "max_unavailable", 0)
dynamic "upgrade_settings" {
for_each = lookup(each.value, "strategy", var.strategy) == "SURGE" ? [each.value] : []
content {
strategy = lookup(each.value, "strategy", "SURGE")
max_surge = lookup(each.value, "max_surge", 1)
max_unavailable = lookup(each.value, "max_unavailable", 0)
}
}

dynamic "upgrade_settings" {
for_each = lookup(each.value, "strategy", var.strategy) == "BLUE_GREEN" ? [each.value] : []
content {
strategy = lookup(each.value, "strategy", "BLUE_GREEN")

blue_green_settings {
node_pool_soak_duration = lookup(each.value, "node_pool_soak_duration", "3600s")

standard_rollout_policy {
batch_soak_duration = lookup(each.value, "batch_soak_duration", "0s")
batch_percentage = lookup(each.value, "batch_percentage", null)
batch_node_count = lookup(each.value, "batch_node_count", null)
}
}
}
}

node_config {
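Because both `dynamic "upgrade_settings"` blocks key off the same `strategy` lookup, exactly one of them is rendered for any given pool. As a sketch, a hypothetical pool map of `{ name = "pool-01", strategy = "BLUE_GREEN", batch_percentage = 0.5 }` would resolve to roughly:

```hcl
upgrade_settings {
  strategy = "BLUE_GREEN"

  blue_green_settings {
    node_pool_soak_duration = "3600s" # lookup fallback; the pool map did not set it

    standard_rollout_policy {
      batch_soak_duration = "0s" # lookup fallback
      batch_percentage    = 0.5  # taken from the pool map
      batch_node_count    = null # unset, so the percentage drives the rollout
    }
  }
}
```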
43 changes: 42 additions & 1 deletion autogen/main/variables.tf.tmpl
@@ -725,7 +725,6 @@ variable "enable_pod_security_policy" {
default = false
}


variable "enable_l4_ilb_subsetting" {
type = bool
description = "Enable L4 ILB Subsetting on the cluster"
@@ -749,5 +748,47 @@ variable "enable_identity_service" {
description = "Enable the Identity Service component, which allows customers to use external identity providers with the K8S API."
default = false
}

variable "strategy" {
type = string
description = "The upgrade stragey to be used for upgrading the nodes. Valid values of state are: `SURGE`; `BLUE_GREEN`. By default strategy is `SURGE` (Optional)"
default = "SURGE"
}

variable "max_surge" {
type = number
description = "The number of additional nodes that can be added to the node pool during an upgrade. Increasing max_surge raises the number of nodes that can be upgraded simultaneously. Can be set to 0 or greater (Optional)"
default = null
}

variable "max_unavailable" {
type = number
description = "The number of nodes that can be simultaneously unavailable during an upgrade. Increasing max_unavailable raises the number of nodes that can be upgraded in parallel. Can be set to 0 or greater (Optional)"
default = null
}

variable "node_pool_soak_duration" {
type = string
description = "Time needed after draining the entire blue pool. After this period, the blue pool will be cleaned up (Optional)"
default = "3600s"
}

variable "batch_soak_duration" {
type = string
description = "Soak time after each batch gets drained (Optionial)"
default = "0s"
}

variable "batch_percentage" {
type = string
description = "Percentage of the blue pool nodes to drain in a batch (Optional)"
default = null
}
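# Hypothetical guard (not part of this commit): the documented [0.0, 1.0]
# range for batch_percentage could be enforced at plan time by adding a
# validation block inside the variable, e.g.:
#
#   validation {
#     condition     = var.batch_percentage == null ? true : (tonumber(var.batch_percentage) >= 0 && tonumber(var.batch_percentage) <= 1)
#     error_message = "The batch_percentage value must be between 0.0 and 1.0."
#   }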

variable "batch_node_count" {
type = number
description = "The number of blue nodes to drain in a batch (Optional)"
default = null
}
{% endif %}
{% endif %}
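Of these new variables, `strategy` is the one consulted as the module-wide fallback in the pool `lookup`s above, so setting it changes the default for every pool at once while per-pool settings still win. A hedged sketch of module-level usage (the source is this module's registry address; other required arguments such as `project_id`, `name`, `network`, and `subnetwork` are omitted):

```hcl
module "gke" {
  source = "terraform-google-modules/kubernetes-engine/google"
  # ... project_id, name, region, network, subnetwork, and the secondary
  # IP range arguments required by the module go here ...

  # Pools that do not set "strategy" themselves now default to blue/green.
  strategy = "BLUE_GREEN"

  node_pools = [
    { name = "default-node-pool" },              # inherits BLUE_GREEN via var.strategy
    { name = "batch-pool", strategy = "SURGE" }, # per-pool override wins
  ]
}
```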
54 changes: 48 additions & 6 deletions cluster.tf
@@ -399,9 +399,30 @@ resource "google_container_node_pool" "pools" {
auto_upgrade = lookup(each.value, "auto_upgrade", local.default_auto_upgrade)
}

upgrade_settings {
max_surge = lookup(each.value, "max_surge", 1)
max_unavailable = lookup(each.value, "max_unavailable", 0)
dynamic "upgrade_settings" {
for_each = lookup(each.value, "strategy", var.strategy) == "SURGE" ? [each.value] : []
content {
strategy = lookup(each.value, "strategy", "SURGE")
max_surge = lookup(each.value, "max_surge", 1)
max_unavailable = lookup(each.value, "max_unavailable", 0)
}
}

dynamic "upgrade_settings" {
for_each = lookup(each.value, "strategy", var.strategy) == "BLUE_GREEN" ? [each.value] : []
content {
strategy = lookup(each.value, "strategy", "BLUE_GREEN")

blue_green_settings {
node_pool_soak_duration = lookup(each.value, "node_pool_soak_duration", "3600s")

standard_rollout_policy {
batch_soak_duration = lookup(each.value, "batch_soak_duration", "0s")
batch_percentage = lookup(each.value, "batch_percentage", null)
batch_node_count = lookup(each.value, "batch_node_count", null)
}
}
}
}

node_config {
@@ -557,9 +578,30 @@ resource "google_container_node_pool" "windows_pools" {
auto_upgrade = lookup(each.value, "auto_upgrade", local.default_auto_upgrade)
}

upgrade_settings {
max_surge = lookup(each.value, "max_surge", 1)
max_unavailable = lookup(each.value, "max_unavailable", 0)
dynamic "upgrade_settings" {
for_each = lookup(each.value, "strategy", var.strategy) == "SURGE" ? [each.value] : []
content {
strategy = lookup(each.value, "strategy", "SURGE")
max_surge = lookup(each.value, "max_surge", 1)
max_unavailable = lookup(each.value, "max_unavailable", 0)
}
}

dynamic "upgrade_settings" {
for_each = lookup(each.value, "strategy", var.strategy) == "BLUE_GREEN" ? [each.value] : []
content {
strategy = lookup(each.value, "strategy", "BLUE_GREEN")

blue_green_settings {
node_pool_soak_duration = lookup(each.value, "node_pool_soak_duration", "3600s")

standard_rollout_policy {
batch_soak_duration = lookup(each.value, "batch_soak_duration", "0s")
batch_percentage = lookup(each.value, "batch_percentage", null)
batch_node_count = lookup(each.value, "batch_node_count", null)
}
}
}
}

node_config {
12 changes: 12 additions & 0 deletions modules/beta-private-cluster-update-variant/README.md
@@ -163,6 +163,9 @@ Then perform the following commands on the root folder:
| add\_master\_webhook\_firewall\_rules | Create master\_webhook firewall rules for ports defined in `firewall_inbound_ports` | `bool` | `false` | no |
| add\_shadow\_firewall\_rules | Create GKE shadow firewall (the same as default firewall rules with firewall logs enabled). | `bool` | `false` | no |
| authenticator\_security\_group | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no |
| batch\_node\_count | The number of blue nodes to drain in a batch (Optional) | `number` | `null` | no |
| batch\_percentage | Percentage of the blue pool nodes to drain in a batch (Optional) | `string` | `null` | no |
| batch\_soak\_duration | Soak time after each batch gets drained (Optional) | `string` | `"0s"` | no |
| cloudrun | (Beta) Enable CloudRun addon | `bool` | `false` | no |
| cloudrun\_load\_balancer\_type | (Beta) Configure the Cloud Run load balancer type. External by default. Set to `LOAD_BALANCER_TYPE_INTERNAL` to configure as an internal load balancer. | `string` | `""` | no |
| cluster\_autoscaling | Cluster autoscaling configuration. See [more details](https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1beta1/projects.locations.clusters#clusterautoscaling) | <pre>object({<br> enabled = bool<br> autoscaling_profile = string<br> min_cpu_cores = number<br> max_cpu_cores = number<br> min_memory_gb = number<br> max_memory_gb = number<br> gpu_resources = list(object({ resource_type = string, minimum = number, maximum = number }))<br> })</pre> | <pre>{<br> "autoscaling_profile": "BALANCED",<br> "enabled": false,<br> "gpu_resources": [],<br> "max_cpu_cores": 0,<br> "max_memory_gb": 0,<br> "min_cpu_cores": 0,<br> "min_memory_gb": 0<br>}</pre> | no |
@@ -227,6 +230,8 @@ Then perform the following commands on the root folder:
| master\_authorized\_networks | List of master authorized networks. If none are provided, disallow external access (except the cluster node IPs, which GKE automatically whitelists). | `list(object({ cidr_block = string, display_name = string }))` | `[]` | no |
| master\_global\_access\_enabled | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `true` | no |
| master\_ipv4\_cidr\_block | (Beta) The IP range in CIDR notation to use for the hosted master network | `string` | `"10.0.0.0/28"` | no |
| max\_surge | The number of additional nodes that can be added to the node pool during an upgrade. Increasing max\_surge raises the number of nodes that can be upgraded simultaneously. Can be set to 0 or greater (Optional) | `number` | `null` | no |
| max\_unavailable | The number of nodes that can be simultaneously unavailable during an upgrade. Increasing max\_unavailable raises the number of nodes that can be upgraded in parallel. Can be set to 0 or greater (Optional) | `number` | `null` | no |
| monitoring\_enable\_managed\_prometheus | Configuration for Managed Service for Prometheus. Whether or not the managed collection is enabled. | `bool` | `false` | no |
| monitoring\_enabled\_components | List of services to monitor: SYSTEM\_COMPONENTS, WORKLOADS (provider version >= 3.89.0). Empty list is default GKE configuration. | `list(string)` | `[]` | no |
| monitoring\_service | The monitoring service that the cluster should write metrics to. Automatically send metrics from pods in the cluster to the Google Cloud Monitoring API. VM metrics will be collected by Google Compute Engine regardless of this setting Available options include monitoring.googleapis.com, monitoring.googleapis.com/kubernetes (beta) and none | `string` | `"monitoring.googleapis.com/kubernetes"` | no |
@@ -236,6 +241,7 @@ Then perform the following commands on the root folder:
| network\_policy\_provider | The network policy provider. | `string` | `"CALICO"` | no |
| network\_project\_id | The project ID of the shared VPC's host (for shared vpc support) | `string` | `""` | no |
| node\_metadata | Specifies how node metadata is exposed to the workload running on the node | `string` | `"GKE_METADATA"` | no |
| node\_pool\_soak\_duration | Time needed after draining the entire blue pool. After this period, the blue pool will be cleaned up (Optional) | `string` | `"3600s"` | no |
| node\_pools | List of maps containing node pools | `list(map(any))` | <pre>[<br> {<br> "name": "default-node-pool"<br> }<br>]</pre> | no |
| node\_pools\_labels | Map of maps containing node labels by node-pool name | `map(map(string))` | <pre>{<br> "all": {},<br> "default-node-pool": {}<br>}</pre> | no |
| node\_pools\_linux\_node\_configs\_sysctls | Map of maps containing linux node config sysctls by node-pool name | `map(map(string))` | <pre>{<br> "all": {},<br> "default-node-pool": {}<br>}</pre> | no |
Expand All @@ -259,6 +265,7 @@ Then perform the following commands on the root folder:
| shadow\_firewall\_rules\_log\_config | The log\_config for shadow firewall rules. You can set this variable to `null` to disable logging. | <pre>object({<br> metadata = string<br> })</pre> | <pre>{<br> "metadata": "INCLUDE_ALL_METADATA"<br>}</pre> | no |
| shadow\_firewall\_rules\_priority | The firewall priority of GKE shadow firewall rules. The priority should be less than default firewall, which is 1000. | `number` | `999` | no |
| skip\_provisioners | Flag to skip all local-exec provisioners. It breaks `stub_domains` and `upstream_nameservers` variables functionality. | `bool` | `false` | no |
| strategy | The upgrade strategy to use when upgrading the nodes. Valid values are `SURGE` or `BLUE_GREEN`. Defaults to `SURGE` (Optional) | `string` | `"SURGE"` | no |
| stub\_domains | Map of stub domains and their resolvers to forward DNS queries for a certain domain to an external DNS server | `map(list(string))` | `{}` | no |
| subnetwork | The subnetwork to host the cluster in (required) | `string` | n/a | yes |
| timeouts | Timeout for cluster operations. | `map(string)` | `{}` | no |
@@ -341,8 +348,13 @@ The node_pools variable takes the following parameters:
| min_cpu_platform | Minimum CPU platform to be used by the nodes in the pool. The nodes may be scheduled on the specified or newer CPU platform. | " " | Optional |
| max_count | Maximum number of nodes in the NodePool. Must be >= min_count | 100 | Optional |
| max_pods_per_node | The maximum number of pods per node in this cluster | null | Optional |
| strategy | The upgrade strategy to use when upgrading the nodes. Valid values are `SURGE` or `BLUE_GREEN` | "SURGE" | Optional |
| max_surge | The number of additional nodes that can be added to the node pool during an upgrade. Increasing max_surge raises the number of nodes that can be upgraded simultaneously. Can be set to 0 or greater. | 1 | Optional |
| max_unavailable | The number of nodes that can be simultaneously unavailable during an upgrade. Increasing max_unavailable raises the number of nodes that can be upgraded in parallel. Can be set to 0 or greater. | 0 | Optional |
| node_pool_soak_duration | Time needed after draining the entire blue pool. After this period, the blue pool will be cleaned up. By default, it is set to one hour (3600 seconds). The maximum length of the soak time is 7 days (604,800 seconds). | "3600s" | Optional |
| batch_soak_duration | Soak time after each batch gets drained, with the default being zero seconds. | "0s" | Optional |
| batch_node_count | Absolute number of nodes to drain in a batch. If it is set to zero, this phase will be skipped. | null | Optional |
| batch_percentage | Percentage of nodes to drain in a batch. Must be in the range of [0.0, 1.0]. If it is set to zero, this phase will be skipped. | null | Optional |
| min_count | Minimum number of nodes in the NodePool. Must be >=0 and <= max_count. Should be used when autoscaling is true | 1 | Optional |
| name | The name of the node pool | | Required |
| placement_policy | Placement type to set for nodes in a node pool. Can be set as [COMPACT](https://cloud.google.com/kubernetes-engine/docs/how-to/compact-placement#overview) if desired | | Optional |