From 8101b0e6a2a0c96acf0183191b16ab42afce85ee Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Mon, 19 Feb 2024 16:00:31 -0600 Subject: [PATCH 01/20] implement volume snapshot backups --- charts/tembo-operator/Chart.yaml | 4 +- charts/tembo-operator/templates/crd.yaml | 17 +++ tembo-operator/Cargo.lock | 2 +- tembo-operator/Cargo.toml | 2 +- tembo-operator/src/apis/coredb_types.rs | 31 ++++- tembo-operator/src/cloudnativepg/clusters.rs | 6 +- tembo-operator/src/cloudnativepg/cnpg.rs | 127 +++++++++++++++++- .../src/cloudnativepg/scheduledbackups.rs | 2 +- tembo-operator/src/defaults.rs | 14 +- tembo-operator/tests/integration_tests.rs | 6 + .../yaml/sample-machine-learning-backup.yaml | 2 + .../yaml/sample-machine-learning-restore.yaml | 2 + .../yaml/sample-standard-backup.yaml | 2 + .../yaml/sample-standard-restore.yaml | 2 + 14 files changed, 203 insertions(+), 16 deletions(-) diff --git a/charts/tembo-operator/Chart.yaml b/charts/tembo-operator/Chart.yaml index 44b92eb73..c28437861 100644 --- a/charts/tembo-operator/Chart.yaml +++ b/charts/tembo-operator/Chart.yaml @@ -3,10 +3,10 @@ name: tembo-operator description: 'Helm chart to deploy the tembo-operator' type: application icon: https://cloud.tembo.io/images/TemboElephant.png -version: 0.3.0 +version: 0.3.1 home: https://tembo.io sources: - - https://github.com/tembo-io/tembo-stacks + - https://github.com/tembo-io/tembo - https://github.com/cloudnative-pg/cloudnative-pg keywords: - postgresql diff --git a/charts/tembo-operator/templates/crd.yaml b/charts/tembo-operator/templates/crd.yaml index d6d9c16ec..bcd8ed8cf 100644 --- a/charts/tembo-operator/templates/crd.yaml +++ b/charts/tembo-operator/templates/crd.yaml @@ -1344,6 +1344,8 @@ spec: endpointURL: null s3Credentials: inheritFromIAMRole: true + volumeSnapshot: + enabled: true description: |- The backup configuration for the CoreDB instance to facilitate database backups and WAL archive uploads to an S3 compatible object store. 
@@ -1432,6 +1434,21 @@ spec: description: The backup schedule set with cron syntax nullable: true type: string + volumeSnapshot: + default: + enabled: true + description: Enable using Volume Snapshots for backups instead of Object Storage + nullable: true + properties: + enabled: + default: true + description: Enable the volume snapshots for backups + type: boolean + snapshotClass: + description: The reference to the snapshot class + nullable: true + type: string + type: object type: object connectionPooler: default: diff --git a/tembo-operator/Cargo.lock b/tembo-operator/Cargo.lock index df08074ee..c1000ed2d 100644 --- a/tembo-operator/Cargo.lock +++ b/tembo-operator/Cargo.lock @@ -494,7 +494,7 @@ dependencies = [ [[package]] name = "controller" -version = "0.35.4" +version = "0.36.0" dependencies = [ "actix-web", "anyhow", diff --git a/tembo-operator/Cargo.toml b/tembo-operator/Cargo.toml index 30a01373e..d76d1fa26 100644 --- a/tembo-operator/Cargo.toml +++ b/tembo-operator/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "controller" description = "Tembo Operator for Postgres" -version = "0.35.4" +version = "0.36.0" edition = "2021" default-run = "controller" license = "Apache-2.0" diff --git a/tembo-operator/src/apis/coredb_types.rs b/tembo-operator/src/apis/coredb_types.rs index 8554c412e..e5c247790 100644 --- a/tembo-operator/src/apis/coredb_types.rs +++ b/tembo-operator/src/apis/coredb_types.rs @@ -157,11 +157,30 @@ pub struct S3CredentialsSessionToken { pub name: String, } +/// VolumeSnapshots is the type for the configuration of the volume snapshots +/// to be used for backups instead of object storage +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshot { + /// Enable the volume snapshots for backups + #[serde(default = "defaults::default_volume_snapshot_enabled")] + pub enabled: bool, + + /// The reference to the snapshot class + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = 
"snapshotClass" + )] + pub snapshot_class: Option, +} + /// CoreDB Backup configuration /// The backup configuration for the CoreDB instance to facilitate database -/// backups and WAL archive uploads to an S3 compatible object store. +/// backups uploads to an S3 compatible object store or using Volume Snapshots +/// For WAL archive uploads utilite an S3 compatible object store. /// /// **Example**: A typical S3 backup configuration using IAM Role for authentication +/// with Volume Snapshots enabled /// /// See `ServiceAccountTemplate` for to map the IAM role ARN to a Kubernetes service account. /// @@ -178,6 +197,9 @@ pub struct S3CredentialsSessionToken { /// s3Credentials: /// inheritFromIAMRole: true /// schedule: "0 0 * * *" #every day at midnight +/// volumeSnapshots: +/// enabled: true +/// snapshotClass: my-snapshot-class-name /// ``` #[derive(Deserialize, Serialize, Clone, Debug, Default, JsonSchema)] #[allow(non_snake_case)] @@ -205,6 +227,13 @@ pub struct Backup { /// The S3 credentials to use for backups (if not using IAM Role) #[serde(default = "defaults::default_s3_credentials", rename = "s3Credentials")] pub s3_credentials: Option, + + /// Enable using Volume Snapshots for backups instead of Object Storage + #[serde( + default = "defaults::default_volume_snapshot", + rename = "volumeSnapshot" + )] + pub volume_snapshot: Option, } /// Restore configuration provides a way to restore a database from a backup diff --git a/tembo-operator/src/cloudnativepg/clusters.rs b/tembo-operator/src/cloudnativepg/clusters.rs index ae814d07a..a4240c6b1 100644 --- a/tembo-operator/src/cloudnativepg/clusters.rs +++ b/tembo-operator/src/cloudnativepg/clusters.rs @@ -1157,7 +1157,7 @@ pub enum ClusterBackupTarget { } /// VolumeSnapshot provides the configuration for the execution of volume snapshot backups. 
-#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema, PartialEq)] pub struct ClusterBackupVolumeSnapshot { /// Annotations key-value pairs that will be added to .metadata.annotations snapshot resources. #[serde(default, skip_serializing_if = "Option::is_none")] @@ -1202,7 +1202,7 @@ pub struct ClusterBackupVolumeSnapshot { } /// Configuration parameters to control the online/hot backup with volume snapshots -#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema, PartialEq)] pub struct ClusterBackupVolumeSnapshotOnlineConfiguration { /// Control whether the I/O workload for the backup initial checkpoint will be limited, according to the `checkpoint_completion_target` setting on the PostgreSQL server. If set to true, an immediate checkpoint will be used, meaning PostgreSQL will complete the checkpoint as soon as possible. `false` by default. #[serde( @@ -1221,7 +1221,7 @@ pub struct ClusterBackupVolumeSnapshotOnlineConfiguration { } /// VolumeSnapshot provides the configuration for the execution of volume snapshot backups. 
-#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema)] +#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema, PartialEq)] pub enum ClusterBackupVolumeSnapshotSnapshotOwnerReference { #[serde(rename = "none")] None, diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index af25b64eb..7343fc078 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -15,9 +15,12 @@ use crate::{ ClusterBackupBarmanObjectStoreS3CredentialsSecretAccessKey, ClusterBackupBarmanObjectStoreS3CredentialsSessionToken, ClusterBackupBarmanObjectStoreWal, ClusterBackupBarmanObjectStoreWalCompression, - ClusterBackupBarmanObjectStoreWalEncryption, ClusterBootstrap, ClusterBootstrapInitdb, - ClusterBootstrapRecovery, ClusterBootstrapRecoveryRecoveryTarget, ClusterCertificates, - ClusterExternalClusters, ClusterExternalClustersBarmanObjectStore, + ClusterBackupBarmanObjectStoreWalEncryption, ClusterBackupVolumeSnapshot, + ClusterBackupVolumeSnapshotOnlineConfiguration, + ClusterBackupVolumeSnapshotSnapshotOwnerReference, ClusterBootstrap, + ClusterBootstrapInitdb, ClusterBootstrapRecovery, + ClusterBootstrapRecoveryRecoveryTarget, ClusterCertificates, ClusterExternalClusters, + ClusterExternalClustersBarmanObjectStore, ClusterExternalClustersBarmanObjectStoreS3Credentials, ClusterExternalClustersBarmanObjectStoreS3CredentialsAccessKeyId, ClusterExternalClustersBarmanObjectStoreS3CredentialsRegion, @@ -41,7 +44,7 @@ use crate::{ }, scheduledbackups::{ ScheduledBackup, ScheduledBackupBackupOwnerReference, ScheduledBackupCluster, - ScheduledBackupSpec, + ScheduledBackupMethod, ScheduledBackupSpec, }, }, config::Config, @@ -66,6 +69,8 @@ use std::{collections::BTreeMap, sync::Arc}; use tokio::time::Duration; use tracing::{debug, error, info, instrument, warn}; +const VOLUME_SNAPSHOT_CLASS_NAME: &str = "cnpg-snapshot-class"; + pub struct PostgresConfig { pub postgres_parameters: Option>, pub 
shared_preload_libraries: Option>, @@ -103,7 +108,7 @@ fn create_cluster_backup_barman_wal(cdb: &CoreDB) -> Option Option { } } +fn create_cluster_backup_volume_snapshot(cdb: &CoreDB) -> ClusterBackupVolumeSnapshot { + let class_name = cdb + .spec + .backup + .volume_snapshot + .as_ref() + .and_then(|vs| vs.snapshot_class.as_ref()) + .cloned() // Directly clone the Option if present + .unwrap_or_else(|| VOLUME_SNAPSHOT_CLASS_NAME.to_string()); + + ClusterBackupVolumeSnapshot { + class_name: Some(class_name), + online: Some(true), + online_configuration: Some(ClusterBackupVolumeSnapshotOnlineConfiguration { + wait_for_archive: Some(true), + immediate_checkpoint: Some(true), + }), + snapshot_owner_reference: Some(ClusterBackupVolumeSnapshotSnapshotOwnerReference::Cluster), + ..ClusterBackupVolumeSnapshot::default() + } +} + fn create_cluster_backup( cdb: &CoreDB, endpoint_url: &str, @@ -171,6 +198,14 @@ fn create_cluster_backup( }, }; + let volume_snapshot = cdb.spec.backup.volume_snapshot.as_ref().and_then(|vs| { + if vs.enabled { + Some(create_cluster_backup_volume_snapshot(cdb)) + } else { + None + } + }); + Some(ClusterBackup { barman_object_store: Some(create_cluster_backup_barman_object_store( cdb, @@ -178,7 +213,8 @@ fn create_cluster_backup( backup_path, s3_credentials, )), - retention_policy: Some(retention_days), // Adjust as needed + retention_policy: Some(retention_days), + volume_snapshot, ..ClusterBackup::default() }) } @@ -1331,6 +1367,13 @@ fn schedule_expression_from_cdb(cdb: &CoreDB) -> String { fn cnpg_scheduled_backup(cdb: &CoreDB) -> ScheduledBackup { let name = cdb.name_any(); let namespace = cdb.namespace().unwrap(); + let method = cdb.spec.backup.volume_snapshot.as_ref().map(|vs| { + if vs.enabled { + ScheduledBackupMethod::VolumeSnapshot + } else { + ScheduledBackupMethod::BarmanObjectStore + } + }); ScheduledBackup { metadata: ObjectMeta { @@ -1344,6 +1387,7 @@ fn cnpg_scheduled_backup(cdb: &CoreDB) -> ScheduledBackup { immediate: 
Some(true), schedule: schedule_expression_from_cdb(cdb), suspend: Some(false), + method, ..ScheduledBackupSpec::default() }, status: None, @@ -2292,6 +2336,8 @@ mod tests { encryption: AES256 retentionPolicy: "45" schedule: 55 7 * * * + volumeSnapshot: + enabled: false image: quay.io/tembo/tembo-pg-cnpg:15.3.0-5-48d489e port: 5432 postgresExporterEnabled: true @@ -2323,6 +2369,11 @@ mod tests { "45d".to_string() ); + assert_eq!( + scheduled_backup.spec.method, + Some(ScheduledBackupMethod::BarmanObjectStore) + ); + // Assert to make sure that backup destination path is set assert_eq!( backup @@ -2634,4 +2685,68 @@ mod tests { let cdb_no_storage_class: CoreDB = from_str(cdb_no_storage_class_yaml).unwrap(); assert_eq!(cnpg_cluster_storage_class(&cdb_no_storage_class), None); } + + #[test] + fn test_cnpg_cluster_volume_snapshot() { + let cdb_yaml = r#" + apiVersion: coredb.io/v1alpha1 + kind: CoreDB + metadata: + name: test + namespace: default + spec: + backup: + destinationPath: s3://tembo-backup/sample-standard-backup + encryption: "" + retentionPolicy: "30" + schedule: 17 9 * * * + endpointURL: http://minio:9000 + volumeSnapshot: + enabled: true + snapshotClass: csi-vsc + image: quay.io/tembo/tembo-pg-cnpg:15.3.0-5-48d489e + port: 5432 + replicas: 1 + resources: + limits: + cpu: "1" + memory: 0.5Gi + serviceAccountTemplate: + metadata: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::012345678901:role/aws-iam-role-iam + sharedirStorage: 1Gi + stop: false + storage: 1Gi + storageClass: "gp3-enc" + uid: 999 + "#; + + let cdb: CoreDB = serde_yaml::from_str(cdb_yaml).expect("Failed to parse YAML"); + let snapshot = create_cluster_backup_volume_snapshot(&cdb); + let scheduled_backup = cnpg_scheduled_backup(&cdb); + + // Set an expected ClusterBackupVolumeSnapshot object + let expected_snapshot = ClusterBackupVolumeSnapshot { + class_name: Some("csi-vsc".to_string()), // Expected to match the YAML input + online: Some(true), + online_configuration: 
Some(ClusterBackupVolumeSnapshotOnlineConfiguration { + wait_for_archive: Some(true), + immediate_checkpoint: Some(true), + }), + snapshot_owner_reference: Some( + ClusterBackupVolumeSnapshotSnapshotOwnerReference::Cluster, + ), + ..ClusterBackupVolumeSnapshot::default() + }; + + // Assert to make sure that the snapshot.snapshot_class and expected_snapshot.snapshot_class are the same + assert_eq!(snapshot, expected_snapshot); + + // Assert to make sure that the ScheduledBackup method is set to VolumeSnapshot + assert_eq!( + scheduled_backup.spec.method, + Some(ScheduledBackupMethod::VolumeSnapshot) + ); + } } diff --git a/tembo-operator/src/cloudnativepg/scheduledbackups.rs b/tembo-operator/src/cloudnativepg/scheduledbackups.rs index d6cd3196d..0117c7445 100644 --- a/tembo-operator/src/cloudnativepg/scheduledbackups.rs +++ b/tembo-operator/src/cloudnativepg/scheduledbackups.rs @@ -71,7 +71,7 @@ pub struct ScheduledBackupCluster { } /// Specification of the desired behavior of the ScheduledBackup. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status -#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema)] +#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema, PartialEq)] pub enum ScheduledBackupMethod { #[serde(rename = "barmanObjectStore")] BarmanObjectStore, diff --git a/tembo-operator/src/defaults.rs b/tembo-operator/src/defaults.rs index b5e5fed73..c77522879 100644 --- a/tembo-operator/src/defaults.rs +++ b/tembo-operator/src/defaults.rs @@ -6,7 +6,7 @@ use std::collections::BTreeMap; use crate::{ apis::coredb_types::{ - Backup, ConnectionPooler, PgBouncer, S3Credentials, ServiceAccountTemplate, + Backup, ConnectionPooler, PgBouncer, S3Credentials, ServiceAccountTemplate, VolumeSnapshot, }, cloudnativepg::poolers::{PoolerPgbouncerPoolMode, PoolerTemplateSpecContainersResources}, extensions::types::{Extension, TrunkInstall}, @@ -139,6 +139,7 @@ pub fn default_backup() -> Backup { retentionPolicy: default_retention_policy(), schedule: default_backup_schedule(), s3_credentials: default_s3_credentials(), + volume_snapshot: default_volume_snapshot(), ..Default::default() } } @@ -210,3 +211,14 @@ pub fn default_s3_credentials() -> Option { ..Default::default() }) } + +pub fn default_volume_snapshot() -> Option { + Some(VolumeSnapshot { + enabled: default_volume_snapshot_enabled(), + ..Default::default() + }) +} + +pub fn default_volume_snapshot_enabled() -> bool { + true +} diff --git a/tembo-operator/tests/integration_tests.rs b/tembo-operator/tests/integration_tests.rs index 46e04c5f0..2f51cdef3 100644 --- a/tembo-operator/tests/integration_tests.rs +++ b/tembo-operator/tests/integration_tests.rs @@ -4433,6 +4433,9 @@ CREATE EVENT TRIGGER pgrst_watch "name": "s3creds", "key": "MINIO_SECRET_KEY" } + }, + "volumeSnapshot": { + "enabled": false, } }, "trunk_installs": [ @@ -4601,6 +4604,9 @@ CREATE EVENT TRIGGER pgrst_watch "name": "s3creds", "key": "MINIO_SECRET_KEY" } + }, + "volumeSnapshot": 
{ + "enabled": false, } }, "restore": { diff --git a/tembo-operator/yaml/sample-machine-learning-backup.yaml b/tembo-operator/yaml/sample-machine-learning-backup.yaml index 4514bced9..460f3f5c2 100644 --- a/tembo-operator/yaml/sample-machine-learning-backup.yaml +++ b/tembo-operator/yaml/sample-machine-learning-backup.yaml @@ -17,6 +17,8 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY + volumeSnaphot: + enabled: false replicas: 2 stop: false stack: diff --git a/tembo-operator/yaml/sample-machine-learning-restore.yaml b/tembo-operator/yaml/sample-machine-learning-restore.yaml index d469c2e88..d35290e15 100644 --- a/tembo-operator/yaml/sample-machine-learning-restore.yaml +++ b/tembo-operator/yaml/sample-machine-learning-restore.yaml @@ -18,6 +18,8 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY + volumeSnaphot: + enabled: false restore: serverName: sample-machine-learning-backup endpointURL: http://minio.minio.svc.cluster.local:9000 diff --git a/tembo-operator/yaml/sample-standard-backup.yaml b/tembo-operator/yaml/sample-standard-backup.yaml index ef9f7bd60..a174baada 100644 --- a/tembo-operator/yaml/sample-standard-backup.yaml +++ b/tembo-operator/yaml/sample-standard-backup.yaml @@ -17,6 +17,8 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY + volumeSnaphot: + enabled: false stop: false stack: name: Standard diff --git a/tembo-operator/yaml/sample-standard-restore.yaml b/tembo-operator/yaml/sample-standard-restore.yaml index 425181078..0bf0816f2 100644 --- a/tembo-operator/yaml/sample-standard-restore.yaml +++ b/tembo-operator/yaml/sample-standard-restore.yaml @@ -18,6 +18,8 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY + volumeSnaphot: + enabled: false restore: serverName: sample-standard-backup endpointURL: http://minio.minio.svc.cluster.local:9000 From 9ffcd12b812eab099c621b1e02e8c1163dbd7472 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Mon, 19 Feb 2024 17:39:05 -0600 Subject: [PATCH 02/20] make 
volumeSnapshot disabled by default --- charts/tembo-operator/templates/crd.yaml | 7 ++++--- tembo-operator/src/apis/coredb_types.rs | 1 - tembo-operator/src/cloudnativepg/cnpg.rs | 6 ++---- tembo-operator/src/defaults.rs | 8 ++------ tembo-operator/yaml/sample-machine-learning-backup.yaml | 2 -- tembo-operator/yaml/sample-machine-learning-restore.yaml | 2 -- tembo-operator/yaml/sample-standard-backup.yaml | 2 -- tembo-operator/yaml/sample-standard-restore.yaml | 2 -- 8 files changed, 8 insertions(+), 22 deletions(-) diff --git a/charts/tembo-operator/templates/crd.yaml b/charts/tembo-operator/templates/crd.yaml index bcd8ed8cf..92a81f46b 100644 --- a/charts/tembo-operator/templates/crd.yaml +++ b/charts/tembo-operator/templates/crd.yaml @@ -1345,7 +1345,7 @@ spec: s3Credentials: inheritFromIAMRole: true volumeSnapshot: - enabled: true + enabled: false description: |- The backup configuration for the CoreDB instance to facilitate database backups and WAL archive uploads to an S3 compatible object store. 
@@ -1436,18 +1436,19 @@ spec: type: string volumeSnapshot: default: - enabled: true + enabled: false description: Enable using Volume Snapshots for backups instead of Object Storage nullable: true properties: enabled: - default: true description: Enable the volume snapshots for backups type: boolean snapshotClass: description: The reference to the snapshot class nullable: true type: string + required: + - enabled type: object type: object connectionPooler: diff --git a/tembo-operator/src/apis/coredb_types.rs b/tembo-operator/src/apis/coredb_types.rs index e5c247790..4c8f6b0f7 100644 --- a/tembo-operator/src/apis/coredb_types.rs +++ b/tembo-operator/src/apis/coredb_types.rs @@ -162,7 +162,6 @@ pub struct S3CredentialsSessionToken { #[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] pub struct VolumeSnapshot { /// Enable the volume snapshots for backups - #[serde(default = "defaults::default_volume_snapshot_enabled")] pub enabled: bool, /// The reference to the snapshot class diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index 7343fc078..5e52825c0 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -164,7 +164,7 @@ fn create_cluster_backup_volume_snapshot(cdb: &CoreDB) -> ClusterBackupVolumeSna .volume_snapshot .as_ref() .and_then(|vs| vs.snapshot_class.as_ref()) - .cloned() // Directly clone the Option if present + .cloned() .unwrap_or_else(|| VOLUME_SNAPSHOT_CLASS_NAME.to_string()); ClusterBackupVolumeSnapshot { @@ -2701,9 +2701,6 @@ mod tests { retentionPolicy: "30" schedule: 17 9 * * * endpointURL: http://minio:9000 - volumeSnapshot: - enabled: true - snapshotClass: csi-vsc image: quay.io/tembo/tembo-pg-cnpg:15.3.0-5-48d489e port: 5432 replicas: 1 @@ -2723,6 +2720,7 @@ mod tests { "#; let cdb: CoreDB = serde_yaml::from_str(cdb_yaml).expect("Failed to parse YAML"); + println!("{:?}", cdb); let snapshot = 
create_cluster_backup_volume_snapshot(&cdb); let scheduled_backup = cnpg_scheduled_backup(&cdb); diff --git a/tembo-operator/src/defaults.rs b/tembo-operator/src/defaults.rs index c77522879..3d76fe1a6 100644 --- a/tembo-operator/src/defaults.rs +++ b/tembo-operator/src/defaults.rs @@ -214,11 +214,7 @@ pub fn default_s3_credentials() -> Option { pub fn default_volume_snapshot() -> Option { Some(VolumeSnapshot { - enabled: default_volume_snapshot_enabled(), - ..Default::default() + enabled: false, + snapshot_class: None, }) } - -pub fn default_volume_snapshot_enabled() -> bool { - true -} diff --git a/tembo-operator/yaml/sample-machine-learning-backup.yaml b/tembo-operator/yaml/sample-machine-learning-backup.yaml index 460f3f5c2..4514bced9 100644 --- a/tembo-operator/yaml/sample-machine-learning-backup.yaml +++ b/tembo-operator/yaml/sample-machine-learning-backup.yaml @@ -17,8 +17,6 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY - volumeSnaphot: - enabled: false replicas: 2 stop: false stack: diff --git a/tembo-operator/yaml/sample-machine-learning-restore.yaml b/tembo-operator/yaml/sample-machine-learning-restore.yaml index d35290e15..d469c2e88 100644 --- a/tembo-operator/yaml/sample-machine-learning-restore.yaml +++ b/tembo-operator/yaml/sample-machine-learning-restore.yaml @@ -18,8 +18,6 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY - volumeSnaphot: - enabled: false restore: serverName: sample-machine-learning-backup endpointURL: http://minio.minio.svc.cluster.local:9000 diff --git a/tembo-operator/yaml/sample-standard-backup.yaml b/tembo-operator/yaml/sample-standard-backup.yaml index a174baada..ef9f7bd60 100644 --- a/tembo-operator/yaml/sample-standard-backup.yaml +++ b/tembo-operator/yaml/sample-standard-backup.yaml @@ -17,8 +17,6 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY - volumeSnaphot: - enabled: false stop: false stack: name: Standard diff --git a/tembo-operator/yaml/sample-standard-restore.yaml 
b/tembo-operator/yaml/sample-standard-restore.yaml index 0bf0816f2..425181078 100644 --- a/tembo-operator/yaml/sample-standard-restore.yaml +++ b/tembo-operator/yaml/sample-standard-restore.yaml @@ -18,8 +18,6 @@ spec: secretAccessKey: name: s3creds key: MINIO_SECRET_KEY - volumeSnaphot: - enabled: false restore: serverName: sample-standard-backup endpointURL: http://minio.minio.svc.cluster.local:9000 From c4cc9bbb2295dc7ba9c74e40d90c1f7d50a0ef42 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Mon, 19 Feb 2024 17:43:58 -0600 Subject: [PATCH 03/20] fix test --- tembo-operator/src/cloudnativepg/cnpg.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index 5e52825c0..dc73970c8 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -2701,6 +2701,9 @@ mod tests { retentionPolicy: "30" schedule: 17 9 * * * endpointURL: http://minio:9000 + volumeSnapshot: + enabled: true + snapshotClass: "csi-vsc" image: quay.io/tembo/tembo-pg-cnpg:15.3.0-5-48d489e port: 5432 replicas: 1 @@ -2720,7 +2723,6 @@ mod tests { "#; let cdb: CoreDB = serde_yaml::from_str(cdb_yaml).expect("Failed to parse YAML"); - println!("{:?}", cdb); let snapshot = create_cluster_backup_volume_snapshot(&cdb); let scheduled_backup = cnpg_scheduled_backup(&cdb); From 2901554756abaa00553fe63ad8bc358796c9f012 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Tue, 20 Feb 2024 16:06:01 -0600 Subject: [PATCH 04/20] add scheduled backups for both object store and snapshots if snapshots are enabled --- tembo-operator/src/cloudnativepg/cnpg.rs | 138 ++++++++++++++++++----- 1 file changed, 108 insertions(+), 30 deletions(-) diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index dc73970c8..fcbc89ece 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -1364,34 +1364,73 @@ 
fn schedule_expression_from_cdb(cdb: &CoreDB) -> String { } // Generate a ScheduledBackup -fn cnpg_scheduled_backup(cdb: &CoreDB) -> ScheduledBackup { - let name = cdb.name_any(); - let namespace = cdb.namespace().unwrap(); - let method = cdb.spec.backup.volume_snapshot.as_ref().map(|vs| { - if vs.enabled { - ScheduledBackupMethod::VolumeSnapshot - } else { - ScheduledBackupMethod::BarmanObjectStore - } - }); +fn cnpg_scheduled_backup( + cdb: &CoreDB, +) -> Result)>, &'static str> { + let namespace = match cdb.namespace() { + Some(ns) => ns, + None => return Err("Namespace is required but not found"), + }; - ScheduledBackup { + let name_ref = cdb.metadata.name.as_ref(); + let name = match name_ref { + Some(n) => n, + None => return Err("Name is required but not found"), + }; + + // Set a ScheduledBackup to backup to object store + let s3_scheduled_backup = ScheduledBackup { metadata: ObjectMeta { - name: Some(name.clone()), - namespace: Some(namespace), + name: Some(name.to_string()), + namespace: Some(namespace.to_string()), ..ObjectMeta::default() }, spec: ScheduledBackupSpec { backup_owner_reference: Some(ScheduledBackupBackupOwnerReference::Cluster), - cluster: ScheduledBackupCluster { name }, + cluster: ScheduledBackupCluster { + name: name.to_string(), + }, immediate: Some(true), schedule: schedule_expression_from_cdb(cdb), suspend: Some(false), - method, + method: Some(ScheduledBackupMethod::BarmanObjectStore), ..ScheduledBackupSpec::default() }, status: None, - } + }; + + // Set a ScheduledBackup to backup to volume snapshot if enabled + let volume_snapshot_scheduled_backup = cdb + .spec + .backup + .volume_snapshot + .as_ref() + .filter(|vs| vs.enabled) + .map(|_| ScheduledBackup { + metadata: ObjectMeta { + name: Some(name.to_string() + "-snapshot"), + namespace: Some(namespace), + ..ObjectMeta::default() + }, + spec: ScheduledBackupSpec { + backup_owner_reference: Some(ScheduledBackupBackupOwnerReference::Cluster), + cluster: ScheduledBackupCluster { + 
name: name.to_string(), + }, + immediate: Some(true), + schedule: schedule_expression_from_cdb(cdb), + suspend: Some(false), + method: Some(ScheduledBackupMethod::VolumeSnapshot), + ..ScheduledBackupSpec::default() + }, + status: None, + }); + + // Return the ScheduledBackup objects + Ok(vec![( + s3_scheduled_backup, + volume_snapshot_scheduled_backup, + )]) } // Reconcile a SheduledBackup @@ -1406,30 +1445,57 @@ pub async fn reconcile_cnpg_scheduled_backup( return Err(Action::requeue(Duration::from_secs(30))); } - let scheduledbackup = cnpg_scheduled_backup(cdb); let client = ctx.client.clone(); - let name = scheduledbackup + let scheduled_backups_result = cnpg_scheduled_backup(cdb); + let scheduled_backups = match scheduled_backups_result { + Ok(backups) => backups, + Err(e) => { + error!("Failed to generate scheduled backups: {}", e); + return Err(Action::requeue(Duration::from_secs(300))); + } + }; + + for (s3_backup, volume_snapshot_backup) in scheduled_backups { + // Always apply the s3_backup if backups are enabled + apply_scheduled_backup(&s3_backup, &client).await?; + + // Conditionally apply the volume_snapshot_backup if it exists + if let Some(vs_backup) = volume_snapshot_backup { + apply_scheduled_backup(&vs_backup, &client).await?; + } + } + + Ok(()) +} + +#[instrument(skip(client), fields(trace_id, scheduled_backup))] +async fn apply_scheduled_backup( + scheduled_backup: &ScheduledBackup, + client: &kube::Client, +) -> Result<(), Action> { + let name = scheduled_backup .metadata .name .clone() .expect("ScheduledBackup should always have a name"); - let namespace = scheduledbackup + let namespace = scheduled_backup .metadata .namespace .clone() .expect("ScheduledBackup should always have a namespace"); - let backup_api: Api = Api::namespaced(client.clone(), namespace.as_str()); + let backup_api: Api = Api::namespaced(client.clone(), &namespace); - debug!("Patching ScheduledBackup"); + debug!("Patching ScheduledBackup: {}", name); let ps = 
PatchParams::apply("cntrlr").force(); - let _o = backup_api - .patch(&name, &ps, &Patch::Apply(&scheduledbackup)) + backup_api + .patch(&name, &ps, &Patch::Apply(scheduled_backup)) .await .map_err(|e| { error!("Error patching ScheduledBackup: {}", e); Action::requeue(Duration::from_secs(300)) })?; - debug!("Applied ScheduledBackup"); + + debug!("Applied ScheduledBackup: {}", name); Ok(()) } @@ -2359,18 +2425,19 @@ mod tests { let cdb: CoreDB = from_str(cdb_yaml).unwrap(); let cfg = Config::default(); - let scheduled_backup: ScheduledBackup = cnpg_scheduled_backup(&cdb); let (backup, service_account_template) = cnpg_backup_configuration(&cdb, &cfg); + let backups_result = cnpg_scheduled_backup(&cdb).unwrap(); + let (s3_backup, _volume_snapshot_backup) = &backups_result[0]; // Assert to make sure that backup schedule is set - assert_eq!(scheduled_backup.spec.schedule, "0 55 7 * * *".to_string()); + assert_eq!(s3_backup.spec.schedule, "0 55 7 * * *".to_string()); assert_eq!( backup.clone().unwrap().retention_policy.unwrap(), "45d".to_string() ); assert_eq!( - scheduled_backup.spec.method, + s3_backup.spec.method, Some(ScheduledBackupMethod::BarmanObjectStore) ); @@ -2724,7 +2791,8 @@ mod tests { let cdb: CoreDB = serde_yaml::from_str(cdb_yaml).expect("Failed to parse YAML"); let snapshot = create_cluster_backup_volume_snapshot(&cdb); - let scheduled_backup = cnpg_scheduled_backup(&cdb); + let backups_result = cnpg_scheduled_backup(&cdb).unwrap(); + let (s3_backup, volume_snapshot_backup) = &backups_result[0]; // Set an expected ClusterBackupVolumeSnapshot object let expected_snapshot = ClusterBackupVolumeSnapshot { @@ -2744,9 +2812,19 @@ mod tests { assert_eq!(snapshot, expected_snapshot); // Assert to make sure that the ScheduledBackup method is set to VolumeSnapshot + if let Some(volume_snapshot_backup) = volume_snapshot_backup { + assert_eq!( + volume_snapshot_backup.spec.method, + Some(ScheduledBackupMethod::VolumeSnapshot) + ); + } else { + panic!("Expected 
volume snapshot backup to be Some, but was None"); + } + + // Assert to make sure that the ScheduledBackup method is set to BarmanObjectStore assert_eq!( - scheduled_backup.spec.method, - Some(ScheduledBackupMethod::VolumeSnapshot) + s3_backup.spec.method, + Some(ScheduledBackupMethod::BarmanObjectStore) ); } } From 830830f6516eaf98440bd3c927e0a61f39f2a1d8 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Tue, 20 Feb 2024 22:49:31 -0600 Subject: [PATCH 05/20] implement volume snapshot restores --- tembo-operator/src/apis/coredb_types.rs | 4 + tembo-operator/src/cloudnativepg/cnpg.rs | 134 ++++-- tembo-operator/src/lib.rs | 1 + tembo-operator/src/snapshots/mod.rs | 3 + .../snapshots/volumesnapshotcontents_crd.rs | 158 +++++++ .../src/snapshots/volumesnapshots.rs | 416 ++++++++++++++++++ .../src/snapshots/volumesnapshots_crd.rs | 66 +++ 7 files changed, 736 insertions(+), 46 deletions(-) create mode 100644 tembo-operator/src/snapshots/mod.rs create mode 100644 tembo-operator/src/snapshots/volumesnapshotcontents_crd.rs create mode 100644 tembo-operator/src/snapshots/volumesnapshots.rs create mode 100644 tembo-operator/src/snapshots/volumesnapshots_crd.rs diff --git a/tembo-operator/src/apis/coredb_types.rs b/tembo-operator/src/apis/coredb_types.rs index 4c8f6b0f7..830ab0d4b 100644 --- a/tembo-operator/src/apis/coredb_types.rs +++ b/tembo-operator/src/apis/coredb_types.rs @@ -277,6 +277,10 @@ pub struct Restore { /// s3Credentials is the S3 credentials to use for backups. 
#[serde(rename = "s3Credentials")] pub s3_credentials: Option, + + /// volumeSnapshot is a boolean to enable restoring from a Volume Snapshot + #[serde(rename = "volumeSnapshot")] + pub volume_snapshot: Option, } /// A connection pooler is a tool used to manage database connections, sitting diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index fcbc89ece..126670ca0 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -19,8 +19,9 @@ use crate::{ ClusterBackupVolumeSnapshotOnlineConfiguration, ClusterBackupVolumeSnapshotSnapshotOwnerReference, ClusterBootstrap, ClusterBootstrapInitdb, ClusterBootstrapRecovery, - ClusterBootstrapRecoveryRecoveryTarget, ClusterCertificates, ClusterExternalClusters, - ClusterExternalClustersBarmanObjectStore, + ClusterBootstrapRecoveryRecoveryTarget, ClusterBootstrapRecoveryVolumeSnapshots, + ClusterBootstrapRecoveryVolumeSnapshotsStorage, ClusterCertificates, + ClusterExternalClusters, ClusterExternalClustersBarmanObjectStore, ClusterExternalClustersBarmanObjectStoreS3Credentials, ClusterExternalClustersBarmanObjectStoreS3CredentialsAccessKeyId, ClusterExternalClustersBarmanObjectStoreS3CredentialsRegion, @@ -54,6 +55,7 @@ use crate::{ is_postgres_ready, patch_cdb_status_merge, postgres_exporter::EXPORTER_CONFIGMAP_PREFIX, psql::PsqlOutput, + snapshots::volumesnapshots::reconcile_volume_snapshot_restore, trunk::extensions_that_require_load, Context, RESTARTED_AT, }; @@ -390,49 +392,10 @@ pub fn cnpg_cluster_bootstrap_from_cdb( Option>, Option, ) { - // parse_target_time returns the parsed target_time which is used for point-in-time-recovery - // todo: Somehow turn this into a requeue action, so that we can retry when the target_time is not in the correct format. 
- // for now we just log the error and return None, which will disable point-in-time-recovery, but allow for a full recovery - let parsed_target_time = cdb.spec.restore.as_ref().and_then(|restore| { - restore.recovery_target_time.as_ref().and_then(|time_str| { - match parse_target_time(Some(time_str)) { - Ok(Some(parsed_time)) => Some(parsed_time), - Ok(None) => None, - Err(err) => { - error!( - "Failed to parse target_time for instance: {}, {}", - cdb.name_any(), - err - ); - None - } - } - }) - }); - - let cluster_bootstrap = if let Some(_restore) = &cdb.spec.restore { - ClusterBootstrap { - recovery: Some(ClusterBootstrapRecovery { - source: Some("tembo-recovery".to_string()), - database: Some("app".to_string()), - owner: Some("app".to_string()), - recovery_target: parsed_target_time.map(|target_time| { - ClusterBootstrapRecoveryRecoveryTarget { - target_time: Some(target_time), - ..ClusterBootstrapRecoveryRecoveryTarget::default() - } - }), - ..ClusterBootstrapRecovery::default() - }), - ..ClusterBootstrap::default() - } + let cluster_bootstrap = if cdb.spec.restore.is_some() { + cnpg_cluster_bootstrap(cdb, true) } else { - ClusterBootstrap { - initdb: Some(ClusterBootstrapInitdb { - ..ClusterBootstrapInitdb::default() - }), - ..ClusterBootstrap::default() - } + cnpg_cluster_bootstrap(cdb, false) }; let cluster_name = cdb.name_any(); @@ -448,7 +411,7 @@ pub fn cnpg_cluster_bootstrap_from_cdb( // Find destination_path from Backup to generate the restore destination path let restore_destination_path = match &cdb.spec.backup.destinationPath { Some(path) => generate_restore_destination_path(path), - None => "".to_string(), // or any other default value you'd like + None => "".to_string(), }; ClusterExternalClusters { name: "tembo-recovery".to_string(), @@ -457,7 +420,7 @@ pub fn cnpg_cluster_bootstrap_from_cdb( endpoint_url: restore.endpoint_url.clone(), s3_credentials: Some(s3_credentials), wal: Some(ClusterExternalClustersBarmanObjectStoreWal { - max_parallel: 
Some(5), + max_parallel: Some(8), encryption: Some(ClusterExternalClustersBarmanObjectStoreWalEncryption::Aes256), compression: Some( ClusterExternalClustersBarmanObjectStoreWalCompression::Snappy, @@ -494,6 +457,74 @@ pub fn cnpg_cluster_bootstrap_from_cdb( ) } +fn cnpg_cluster_bootstrap(cdb: &CoreDB, restore: bool) -> ClusterBootstrap { + // parse_target_time returns the parsed target_time which is used for point-in-time-recovery + // todo: Somehow turn this into a requeue action, so that we can retry when the target_time is not in the correct format. + // for now we just log the error and return None, which will disable point-in-time-recovery, but allow for a full recovery + let parsed_target_time = cdb.spec.restore.as_ref().and_then(|restore| { + restore.recovery_target_time.as_ref().and_then(|time_str| { + match parse_target_time(Some(time_str)) { + Ok(Some(parsed_time)) => Some(parsed_time), + Ok(None) => None, + Err(err) => { + error!( + "Failed to parse target_time for instance: {}, {}", + cdb.name_any(), + err + ); + None + } + } + }) + }); + + if restore { + ClusterBootstrap { + recovery: Some(ClusterBootstrapRecovery { + source: Some("tembo-recovery".to_string()), + database: Some("app".to_string()), + owner: Some("app".to_string()), + recovery_target: parsed_target_time.map(|target_time| { + ClusterBootstrapRecoveryRecoveryTarget { + target_time: Some(target_time), + ..ClusterBootstrapRecoveryRecoveryTarget::default() + } + }), + volume_snapshots: cnpg_cluster_bootstrap_recovery_volume_snapshots(cdb), + ..ClusterBootstrapRecovery::default() + }), + ..ClusterBootstrap::default() + } + } else { + ClusterBootstrap { + initdb: Some(ClusterBootstrapInitdb { + ..ClusterBootstrapInitdb::default() + }), + ..ClusterBootstrap::default() + } + } +} + +fn cnpg_cluster_bootstrap_recovery_volume_snapshots( + cdb: &CoreDB, +) -> Option { + if let Some(restore) = &cdb.spec.restore { + if restore.volume_snapshot.is_some() { + return 
Some(ClusterBootstrapRecoveryVolumeSnapshots { + storage: ClusterBootstrapRecoveryVolumeSnapshotsStorage { + // todo: Work on getting this from the VolumeSnapshot we created + // during the restore process + name: format!("{}-restore", cdb.name_any()), + kind: "VolumeSnapshot".to_string(), + api_group: Some("snapshot.storage.k8s.io".to_string()), + }, + ..ClusterBootstrapRecoveryVolumeSnapshots::default() + }); + } + } + None +} + // Get PGConfig from CoreDB and convert it to a postgres_parameters and shared_preload_libraries fn cnpg_postgres_config( cdb: &CoreDB, @@ -982,6 +1013,17 @@ pub async fn reconcile_cnpg(cdb: &CoreDB, ctx: Arc) -> Result<(), Actio extensions_that_require_load(ctx.client.clone(), &cdb.metadata.namespace.clone().unwrap()) .await?; + // If we are restoring and have volume snapshots enabled, make sure we setup + // the VolumeSnapshotContent and VolumeSnapshot so that the Cluster will have + // something to restore from. + if let Some(restore) = &cdb.spec.restore { + if restore.volume_snapshot.is_some() { + debug!("Reconciling VolumeSnapshotContent and VolumeSnapshot for restore"); + reconcile_volume_snapshot_restore(cdb, ctx.clone()).await?; + } + } + debug!("Setting up VolumeSnapshotContent and VolumeSnapshot for restore"); + debug!("Generating CNPG spec"); let mut cluster = cnpg_cluster_from_cdb(cdb, Some(pods_to_fence), requires_load); diff --git a/tembo-operator/src/lib.rs b/tembo-operator/src/lib.rs index 95c01b27c..753650546 100644 --- a/tembo-operator/src/lib.rs +++ b/tembo-operator/src/lib.rs @@ -34,6 +34,7 @@ pub mod psql; mod rbac; mod secret; mod service; +pub mod snapshots; mod trunk; pub const RESTARTED_AT: &str = "kubectl.kubernetes.io/restartedAt"; diff --git a/tembo-operator/src/snapshots/mod.rs b/tembo-operator/src/snapshots/mod.rs new file mode 100644 index 000000000..8a31e6740 --- /dev/null +++ b/tembo-operator/src/snapshots/mod.rs @@ -0,0 +1,3 @@ +mod volumesnapshotcontents_crd; +pub mod volumesnapshots; +pub mod 
volumesnapshots_crd; diff --git a/tembo-operator/src/snapshots/volumesnapshotcontents_crd.rs b/tembo-operator/src/snapshots/volumesnapshotcontents_crd.rs new file mode 100644 index 000000000..23173aced --- /dev/null +++ b/tembo-operator/src/snapshots/volumesnapshotcontents_crd.rs @@ -0,0 +1,158 @@ +// WARNING: generated by kopium - manual changes will be overwritten +// kopium command: kopium -D Default volumesnapshotcontents.snapshot.storage.k8s.io -A +// kopium version: 0.16.5 + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// spec defines properties of a VolumeSnapshotContent created by the underlying storage system. Required. +#[derive(CustomResource, Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +#[kube( + group = "snapshot.storage.k8s.io", + version = "v1", + kind = "VolumeSnapshotContent", + plural = "volumesnapshotcontents" +)] +#[kube(status = "VolumeSnapshotContentStatus")] +pub struct VolumeSnapshotContentSpec { + /// deletionPolicy determines whether this VolumeSnapshotContent and its physical snapshot on the underlying storage system should be deleted when its bound VolumeSnapshot is deleted. Supported values are "Retain" and "Delete". "Retain" means that the VolumeSnapshotContent and its physical snapshot on underlying storage system are kept. "Delete" means that the VolumeSnapshotContent and its physical snapshot on underlying storage system are deleted. For dynamically provisioned snapshots, this field will automatically be filled in by the CSI snapshotter sidecar with the "DeletionPolicy" field defined in the corresponding VolumeSnapshotClass. For pre-existing snapshots, users MUST specify this field when creating the VolumeSnapshotContent object. Required. + #[serde(rename = "deletionPolicy")] + pub deletion_policy: VolumeSnapshotContentDeletionPolicy, + /// driver is the name of the CSI driver used to create the physical snapshot on the underlying storage system. 
This MUST be the same as the name returned by the CSI GetPluginName() call for that driver. Required. + pub driver: String, + /// source specifies whether the snapshot is (or should be) dynamically provisioned or already exists, and just requires a Kubernetes object representation. This field is immutable after creation. Required. + pub source: VolumeSnapshotContentSource, + /// SourceVolumeMode is the mode of the volume whose snapshot is taken. Can be either “Filesystem” or “Block”. If not specified, it indicates the source volume's mode is unknown. This field is immutable. This field is an alpha field. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "sourceVolumeMode" + )] + pub source_volume_mode: Option, + /// name of the VolumeSnapshotClass from which this snapshot was (or will be) created. Note that after provisioning, the VolumeSnapshotClass may be deleted or recreated with different set of values, and as such, should not be referenced post-snapshot creation. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "volumeSnapshotClassName" + )] + pub volume_snapshot_class_name: Option, + /// volumeSnapshotRef specifies the VolumeSnapshot object to which this VolumeSnapshotContent object is bound. VolumeSnapshot.Spec.VolumeSnapshotContentName field must reference to this VolumeSnapshotContent's name for the bidirectional binding to be valid. For a pre-existing VolumeSnapshotContent object, name and namespace of the VolumeSnapshot object MUST be provided for binding to happen. This field is immutable after creation. Required. + #[serde(rename = "volumeSnapshotRef")] + pub volume_snapshot_ref: VolumeSnapshotContentVolumeSnapshotRef, +} + +/// spec defines properties of a VolumeSnapshotContent created by the underlying storage system. Required. 
+#[derive(Serialize, Deserialize, Clone, Default, Debug, JsonSchema)] +pub enum VolumeSnapshotContentDeletionPolicy { + Delete, + #[default] + Retain, +} + +/// source specifies whether the snapshot is (or should be) dynamically provisioned or already exists, and just requires a Kubernetes object representation. This field is immutable after creation. Required. +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotContentSource { + /// snapshotHandle specifies the CSI "snapshot_id" of a pre-existing snapshot on the underlying storage system for which a Kubernetes object representation was (or should be) created. This field is immutable. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "snapshotHandle" + )] + pub snapshot_handle: Option, + /// volumeHandle specifies the CSI "volume_id" of the volume from which a snapshot should be dynamically taken from. This field is immutable. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "volumeHandle" + )] + pub volume_handle: Option, +} + +/// volumeSnapshotRef specifies the VolumeSnapshot object to which this VolumeSnapshotContent object is bound. VolumeSnapshot.Spec.VolumeSnapshotContentName field must reference to this VolumeSnapshotContent's name for the bidirectional binding to be valid. For a pre-existing VolumeSnapshotContent object, name and namespace of the VolumeSnapshot object MUST be provided for binding to happen. This field is immutable after creation. Required. +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotContentVolumeSnapshotRef { + /// API version of the referent. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "apiVersion" + )] + pub api_version: Option, + /// If referring to a piece of an object instead of an entire object, this string should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. 
For example, if the object reference is to a container within a pod, this would take on a value like: "spec.containers{name}" (where "name" refers to the name of the container that triggered the event) or if no container name is specified "spec.containers[2]" (container with index 2 in this pod). This syntax is chosen only to have some well-defined way of referencing a part of an object. TODO: this design is not final and this field is subject to change in the future. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "fieldPath")] + pub field_path: Option, + /// Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + #[serde(default, skip_serializing_if = "Option::is_none")] + pub kind: Option, + /// Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + /// Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + #[serde(default, skip_serializing_if = "Option::is_none")] + pub namespace: Option, + /// Specific resourceVersion to which this reference is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "resourceVersion" + )] + pub resource_version: Option, + /// UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + #[serde(default, skip_serializing_if = "Option::is_none")] + pub uid: Option, +} + +/// status represents the current information of a snapshot. 
+#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotContentStatus { + /// creationTime is the timestamp when the point-in-time snapshot is taken by the underlying storage system. In dynamic snapshot creation case, this field will be filled in by the CSI snapshotter sidecar with the "creation_time" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "creation_time" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it. If not specified, it indicates the creation time is unknown. The format of this field is a Unix nanoseconds time encoded as an int64. On Unix, the command `date +%s%N` returns the current time in nanoseconds since 1970-01-01 00:00:00 UTC. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "creationTime" + )] + pub creation_time: Option, + /// error is the last observed error during snapshot creation, if any. Upon success after retry, this error field will be cleared. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub error: Option, + /// readyToUse indicates if a snapshot is ready to be used to restore a volume. In dynamic snapshot creation case, this field will be filled in by the CSI snapshotter sidecar with the "ready_to_use" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "ready_to_use" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it, otherwise, this field will be set to "True". If not specified, it means the readiness of a snapshot is unknown. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "readyToUse" + )] + pub ready_to_use: Option, + /// restoreSize represents the complete size of the snapshot in bytes. 
In dynamic snapshot creation case, this field will be filled in by the CSI snapshotter sidecar with the "size_bytes" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "size_bytes" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it. When restoring a volume from this snapshot, the size of the volume MUST NOT be smaller than the restoreSize if it is specified, otherwise the restoration will fail. If not specified, it indicates that the size is unknown. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "restoreSize" + )] + pub restore_size: Option, + /// snapshotHandle is the CSI "snapshot_id" of a snapshot on the underlying storage system. If not specified, it indicates that dynamic snapshot creation has either failed or it is still in progress. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "snapshotHandle" + )] + pub snapshot_handle: Option, + /// VolumeGroupSnapshotContentName is the name of the VolumeGroupSnapshotContent of which this VolumeSnapshotContent is a part of. + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "volumeGroupSnapshotContentName" + )] + pub volume_group_snapshot_content_name: Option, +} + +/// error is the last observed error during snapshot creation, if any. Upon success after retry, this error field will be cleared. +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotContentStatusError { + /// message is a string detailing the encountered error during snapshot creation if specified. NOTE: message may be logged, and it should not contain sensitive information. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub message: Option, + /// time is the timestamp when the error was encountered. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub time: Option, +} diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs new file mode 100644 index 000000000..cff809e0b --- /dev/null +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -0,0 +1,416 @@ +use crate::{ + apis::coredb_types::CoreDB, + snapshots::{ + volumesnapshotcontents_crd::{ + VolumeSnapshotContent, VolumeSnapshotContentDeletionPolicy, + VolumeSnapshotContentSource, VolumeSnapshotContentSpec, + VolumeSnapshotContentVolumeSnapshotRef, + }, + volumesnapshots_crd::{VolumeSnapshot, VolumeSnapshotSource, VolumeSnapshotSpec}, + }, + Context, +}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use kube::{ + api::{ListParams, Patch, PatchParams}, + client::Client, + runtime::controller::Action, + Api, ResourceExt, +}; +use std::sync::Arc; +use tracing::{debug, error}; + +// Main function to reconcile the VolumeSnapshotContent and VolumeSnapshot +pub async fn reconcile_volume_snapshot_restore( + cdb: &CoreDB, + ctx: Arc, +) -> Result { + let client = ctx.client.clone(); + // Lookup the VolumeSnapshot of the original instance + let ogvs = lookup_volume_snapshot(cdb, &client).await?; + let ogvsc = lookup_volume_snapshot_content(&client, ogvs).await?; + + let vsc = generate_volume_snapshot_content(cdb, &ogvsc)?; + let vs = generate_volume_snapshot(cdb, &vsc)?; + + // Apply the VolumeSnapshotContent and VolumeSnapshot + apply_volume_snapshot_content(cdb, &client, &vsc).await?; + + // Apply the VolumeSnapshot + apply_volume_snapshot(cdb, &client, &vs).await?; + + Ok(vs) +} + +async fn apply_volume_snapshot( + cdb: &CoreDB, + client: &Client, + volume_snapshot: &VolumeSnapshot, +) -> Result<(), Action> { + let name = cdb.name_any(); + + // Namespace for the VolumeSnapshot + let namespace = volume_snapshot + .metadata + .namespace + .as_deref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + + 
// Apply VolumeSnapshot (Namespaced) + let vs_api: Api = Api::namespaced(client.clone(), namespace); + debug!("Patching VolumeSnapshot for instance: {}", name); + let ps = PatchParams::apply("cntrlr").force(); + + match vs_api + .patch(&name, &ps, &Patch::Apply(volume_snapshot)) + .await + { + Ok(_) => debug!("VolumeSnapshot created successfully for {}.", name), + Err(e) => { + error!("Failed to create VolumeSnapshot: {}", e); + return Err(Action::requeue(tokio::time::Duration::from_secs(300))); + } + } + + Ok(()) +} + +async fn apply_volume_snapshot_content( + cdb: &CoreDB, + client: &Client, + volume_snapshot_content: &VolumeSnapshotContent, +) -> Result<(), Action> { + let name = cdb.name_any(); + + // Apply VolumeSnapshotContent (All Namespaces) + let vs_api: Api = Api::all(client.clone()); + debug!("Patching VolumeSnapshotContent for instance: {}", name); + let ps = PatchParams::apply("cntrlr").force(); + + match vs_api + .patch(&name, &ps, &Patch::Apply(volume_snapshot_content)) + .await + { + Ok(_) => debug!("VolumeSnapshotContent created successfully for {}.", name), + Err(e) => { + error!("Failed to create VolumeSnapshotContent: {}", e); + return Err(Action::requeue(tokio::time::Duration::from_secs(300))); + } + } + + Ok(()) +} + +// generate_volume_snapshot_content function generates the VolumeSnapshotContent object +// to map the VolumeSnapshot for the restore +fn generate_volume_snapshot_content( + cdb: &CoreDB, + snapshot_content: &VolumeSnapshotContent, +) -> Result { + let name = cdb.name_any(); + let namespace = cdb + .namespace() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + + let volume_handle = &snapshot_content.spec.source.volume_handle; + let driver = &snapshot_content.spec.driver; + let volume_snapshot_class_name = snapshot_content + .spec + .volume_snapshot_class_name + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + let snapshot = format!("{}-restore", name); + + let vsc = 
VolumeSnapshotContent { + metadata: ObjectMeta { + name: Some(format!("{}-restore", name)), + namespace: Some(namespace.clone()), + ..ObjectMeta::default() + }, + spec: VolumeSnapshotContentSpec { + deletion_policy: VolumeSnapshotContentDeletionPolicy::Retain, + driver: driver.to_string(), + source: VolumeSnapshotContentSource { + snapshot_handle: volume_handle.clone(), + ..VolumeSnapshotContentSource::default() + }, + volume_snapshot_class_name: Some(volume_snapshot_class_name.to_string()), + volume_snapshot_ref: VolumeSnapshotContentVolumeSnapshotRef { + api_version: Some("snapshot.storage.k8s.io/v1".to_string()), + kind: Some("VolumeSnapshot".to_string()), + name: Some(snapshot), + namespace: Some(namespace.clone()), + ..VolumeSnapshotContentVolumeSnapshotRef::default() + }, + ..VolumeSnapshotContentSpec::default() + }, + status: None, + }; + + Ok(vsc) +} + +// generate_volume_snapshot function generates the VolumeSnapshot object and ties +// it back to the VolumeSnapshotContent +fn generate_volume_snapshot( + cdb: &CoreDB, + snapshot_content: &VolumeSnapshotContent, +) -> Result { + let volume_snapshot_content_name = snapshot_content + .metadata + .name + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + let volume_snapshot_class_name = snapshot_content + .spec + .volume_snapshot_class_name + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + + let vs = VolumeSnapshot { + metadata: ObjectMeta { + name: Some(format!("{}-restore", cdb.name_any())), + namespace: cdb.namespace(), + ..ObjectMeta::default() + }, + spec: VolumeSnapshotSpec { + source: VolumeSnapshotSource { + volume_snapshot_content_name: Some(volume_snapshot_content_name.to_string()), + ..VolumeSnapshotSource::default() + }, + volume_snapshot_class_name: Some(volume_snapshot_class_name.to_string()), + }, + status: None, + }; + Ok(vs) +} + +// lookup_volume_snapshot function looks up the VolumeSnapshot object from the +// 
 original instance you are restoring from
+async fn lookup_volume_snapshot(cdb: &CoreDB, client: &Client) -> Result {
+ // name will be the name of the original instance
+ let name = cdb
+ .spec
+ .restore
+ .as_ref()
+ .map(|r| r.server_name.clone())
+ .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?;
+ let namespace = cdb
+ .namespace()
+ .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?;
+
+ let volume_snapshot_api: Api = Api::namespaced(client.clone(), &namespace);
+
+ let label_selector = format!("cnpg.io/cluster={}", name);
+ let lp = ListParams::default().labels(&label_selector);
+ let backup_result = volume_snapshot_api.list(&lp).await.map_err(|e| {
+ error!("Error listing VolumeSnapshots: {}", e);
+ Action::requeue(tokio::time::Duration::from_secs(300))
+ })?;
+
+ // Filter snapshots that are ready to use and sort them by creation timestamp in descending order
+ let mut snapshots: Vec = backup_result
+ .items
+ .into_iter()
+ .filter(|vs| {
+ vs.status
+ .as_ref()
+ .map(|s| s.ready_to_use.unwrap_or(false))
+ .unwrap_or(false)
+ })
+ .collect();
+
+ if snapshots.is_empty() {
+ return Err(Action::requeue(tokio::time::Duration::from_secs(300)));
+ }
+
+ snapshots.sort_by(|a, b| {
+ b.metadata
+ .creation_timestamp
+ .cmp(&a.metadata.creation_timestamp)
+ });
+
+ snapshots
+ .first()
+ .cloned()
+ .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))
+}
+
+async fn lookup_volume_snapshot_content(
+ client: &Client,
+ snapshot: VolumeSnapshot,
+) -> Result {
+ // The name of the VolumeSnapshotContent is in the status.boundVolumeSnapshotContentName field
+ // in the VolumeSnapshot
+ let name = snapshot
+ .status
+ .as_ref()
+ .and_then(|s| s.bound_volume_snapshot_content_name.clone())
+ .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?;
+
+ // Lookup the VolumeSnapshotContent object, since it's not namespaced we will
+ // need to filter on all objects in the cluster
+ let 
volume_snapshot_content_api: Api = Api::all(client.clone()); + match volume_snapshot_content_api.get(&name).await { + Ok(vsc) => Ok(vsc), + Err(e) => { + error!("Failed to get VolumeSnapshotContent: {}", e); + Err(Action::requeue(tokio::time::Duration::from_secs(300))) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + apis::coredb_types::CoreDB, + snapshots::volumesnapshotcontents_crd::{ + VolumeSnapshotContent, VolumeSnapshotContentSource, VolumeSnapshotContentSpec, + }, + }; + use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; + + #[test] + fn test_generate_volume_snapshot_content() { + let cdb_yaml = r#" + apiVersion: coredb.io/v1alpha1 + kind: CoreDB + metadata: + name: test + namespace: default + spec: + backup: + destinationPath: s3://tembo-backup/sample-standard-backup + encryption: "" + retentionPolicy: "30" + schedule: 17 9 * * * + endpointURL: http://minio:9000 + volumeSnapshot: + enabled: true + snapshotClass: "csi-vsc" + image: quay.io/tembo/tembo-pg-cnpg:15.3.0-5-48d489e + port: 5432 + replicas: 1 + resources: + limits: + cpu: "1" + memory: 0.5Gi + serviceAccountTemplate: + metadata: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::012345678901:role/aws-iam-role-iam + sharedirStorage: 1Gi + stop: false + storage: 1Gi + storageClass: "gp3-enc" + uid: 999 + "#; + let cdb: CoreDB = serde_yaml::from_str(cdb_yaml).expect("Failed to parse YAML"); + + let snapshot_content = VolumeSnapshotContent { + metadata: ObjectMeta { + name: Some("test-snapshot-content".to_string()), + namespace: cdb.namespace(), + ..ObjectMeta::default() + }, + spec: VolumeSnapshotContentSpec { + source: VolumeSnapshotContentSource { + volume_handle: Some("test-volume-handle".to_string()), + ..VolumeSnapshotContentSource::default() + }, + driver: "test-driver".to_string(), + volume_snapshot_class_name: Some("test-class".to_string()), + ..VolumeSnapshotContentSpec::default() + }, + status: None, + }; + + let result = 
generate_volume_snapshot_content(&cdb, &snapshot_content).unwrap(); + + assert_eq!(result.spec.driver, "test-driver"); + assert_eq!( + result.spec.source.snapshot_handle, + Some("test-volume-handle".to_string()) + ); + assert_eq!( + result.spec.volume_snapshot_class_name, + Some("test-class".to_string()) + ); + } + + #[test] + fn test_generate_volume_snapshot() { + let cdb_yaml = r#" + apiVersion: coredb.io/v1alpha1 + kind: CoreDB + metadata: + name: test + namespace: default + spec: + backup: + destinationPath: s3://tembo-backup/sample-standard-backup + encryption: "" + retentionPolicy: "30" + schedule: 17 9 * * * + endpointURL: http://minio:9000 + volumeSnapshot: + enabled: true + snapshotClass: "csi-vsc" + image: quay.io/tembo/tembo-pg-cnpg:15.3.0-5-48d489e + port: 5432 + replicas: 1 + resources: + limits: + cpu: "1" + memory: 0.5Gi + serviceAccountTemplate: + metadata: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::012345678901:role/aws-iam-role-iam + sharedirStorage: 1Gi + stop: false + storage: 1Gi + storageClass: "gp3-enc" + uid: 999 + "#; + let cdb: CoreDB = serde_yaml::from_str(cdb_yaml).expect("Failed to parse YAML"); + + let snapshot_content = VolumeSnapshotContent { + metadata: ObjectMeta { + name: Some("test-snapshot-content".to_string()), + namespace: Some("default".to_string()), // Ensure namespace matches CoreDB for the test's purpose + ..ObjectMeta::default() + }, + spec: VolumeSnapshotContentSpec { + source: VolumeSnapshotContentSource { + volume_handle: Some("test-volume-handle".to_string()), // This might not be relevant for this test + ..VolumeSnapshotContentSource::default() + }, + driver: "test-driver".to_string(), // Not directly relevant for this test + volume_snapshot_class_name: Some("test-class".to_string()), + ..VolumeSnapshotContentSpec::default() + }, + status: None, + }; + + // Execute the function under test + let result = generate_volume_snapshot(&cdb, &snapshot_content).unwrap(); + + // Assertions + assert_eq!( + 
result.metadata.name.unwrap(), + format!("{}-restore", cdb.name_any()) + ); + assert_eq!( + result.spec.source.volume_snapshot_content_name, + Some("test-snapshot-content".to_string()) + ); + assert_eq!( + result.spec.volume_snapshot_class_name, + Some("test-class".to_string()) + ); + // The namespace of the generated VolumeSnapshot should match the namespace of the CoreDB + assert_eq!(result.metadata.namespace.unwrap(), "default"); + } +} diff --git a/tembo-operator/src/snapshots/volumesnapshots_crd.rs b/tembo-operator/src/snapshots/volumesnapshots_crd.rs new file mode 100644 index 000000000..fe853fe56 --- /dev/null +++ b/tembo-operator/src/snapshots/volumesnapshots_crd.rs @@ -0,0 +1,66 @@ +// WARNING: generated by kopium - manual changes will be overwritten +// kopium command: kopium -D Default volumesnapshots.snapshot.storage.k8s.io -A +// kopium version: 0.16.5 + +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Serialize, Deserialize}; + +/// spec defines the desired characteristics of a snapshot requested by a user. More info: https://kubernetes.io/docs/concepts/storage/volume-snapshots#volumesnapshots Required. +#[derive(CustomResource, Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +#[kube(group = "snapshot.storage.k8s.io", version = "v1", kind = "VolumeSnapshot", plural = "volumesnapshots")] +#[kube(namespaced)] +#[kube(status = "VolumeSnapshotStatus")] +pub struct VolumeSnapshotSpec { + /// source specifies where a snapshot will be created from. This field is immutable after creation. Required. + pub source: VolumeSnapshotSource, + /// VolumeSnapshotClassName is the name of the VolumeSnapshotClass requested by the VolumeSnapshot. VolumeSnapshotClassName may be left nil to indicate that the default SnapshotClass should be used. A given cluster may have multiple default Volume SnapshotClasses: one default per CSI Driver. 
If a VolumeSnapshot does not specify a SnapshotClass, VolumeSnapshotSource will be checked to figure out what the associated CSI Driver is, and the default VolumeSnapshotClass associated with that CSI Driver will be used. If more than one VolumeSnapshotClass exist for a given CSI Driver and more than one have been marked as default, CreateSnapshot will fail and generate an event. Empty string is not allowed for this field. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "volumeSnapshotClassName")] + pub volume_snapshot_class_name: Option, +} + +/// source specifies where a snapshot will be created from. This field is immutable after creation. Required. +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotSource { + /// persistentVolumeClaimName specifies the name of the PersistentVolumeClaim object representing the volume from which a snapshot should be created. This PVC is assumed to be in the same namespace as the VolumeSnapshot object. This field should be set if the snapshot does not exists, and needs to be created. This field is immutable. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "persistentVolumeClaimName")] + pub persistent_volume_claim_name: Option, + /// volumeSnapshotContentName specifies the name of a pre-existing VolumeSnapshotContent object representing an existing volume snapshot. This field should be set if the snapshot already exists and only needs a representation in Kubernetes. This field is immutable. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "volumeSnapshotContentName")] + pub volume_snapshot_content_name: Option, +} + +/// status represents the current information of a snapshot. Consumers must verify binding between VolumeSnapshot and VolumeSnapshotContent objects is successful (by validating that both VolumeSnapshot and VolumeSnapshotContent point at each other) before using this object. 
+#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotStatus { + /// boundVolumeSnapshotContentName is the name of the VolumeSnapshotContent object to which this VolumeSnapshot object intends to bind to. If not specified, it indicates that the VolumeSnapshot object has not been successfully bound to a VolumeSnapshotContent object yet. NOTE: To avoid possible security issues, consumers must verify binding between VolumeSnapshot and VolumeSnapshotContent objects is successful (by validating that both VolumeSnapshot and VolumeSnapshotContent point at each other) before using this object. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "boundVolumeSnapshotContentName")] + pub bound_volume_snapshot_content_name: Option, + /// creationTime is the timestamp when the point-in-time snapshot is taken by the underlying storage system. In dynamic snapshot creation case, this field will be filled in by the snapshot controller with the "creation_time" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "creation_time" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it. If not specified, it may indicate that the creation time of the snapshot is unknown. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "creationTime")] + pub creation_time: Option, + /// error is the last observed error during snapshot creation, if any. This field could be helpful to upper level controllers(i.e., application controller) to decide whether they should continue on waiting for the snapshot to be created based on the type of error reported. The snapshot controller will keep retrying when an error occurs during the snapshot creation. Upon success, this error field will be cleared. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub error: Option, + /// readyToUse indicates if the snapshot is ready to be used to restore a volume. In dynamic snapshot creation case, this field will be filled in by the snapshot controller with the "ready_to_use" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "ready_to_use" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it, otherwise, this field will be set to "True". If not specified, it means the readiness of a snapshot is unknown. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "readyToUse")] + pub ready_to_use: Option, + /// restoreSize represents the minimum size of volume required to create a volume from this snapshot. In dynamic snapshot creation case, this field will be filled in by the snapshot controller with the "size_bytes" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "size_bytes" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it. When restoring a volume from this snapshot, the size of the volume MUST NOT be smaller than the restoreSize if it is specified, otherwise the restoration will fail. If not specified, it indicates that the size is unknown. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "restoreSize")] + pub restore_size: Option, + /// VolumeGroupSnapshotName is the name of the VolumeGroupSnapshot of which this VolumeSnapshot is a part of. + #[serde(default, skip_serializing_if = "Option::is_none", rename = "volumeGroupSnapshotName")] + pub volume_group_snapshot_name: Option, +} + +/// error is the last observed error during snapshot creation, if any. 
This field could be helpful to upper level controllers(i.e., application controller) to decide whether they should continue on waiting for the snapshot to be created based on the type of error reported. The snapshot controller will keep retrying when an error occurs during the snapshot creation. Upon success, this error field will be cleared. +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +pub struct VolumeSnapshotStatusError { + /// message is a string detailing the encountered error during snapshot creation if specified. NOTE: message may be logged, and it should not contain sensitive information. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub message: Option, + /// time is the timestamp when the error was encountered. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub time: Option, +} + From 9f19ff52fbed93d3579776eea01fe379a631c603 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Tue, 20 Feb 2024 22:50:51 -0600 Subject: [PATCH 06/20] update crd manifest --- charts/tembo-operator/templates/crd.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/charts/tembo-operator/templates/crd.yaml b/charts/tembo-operator/templates/crd.yaml index 92a81f46b..8cc99612c 100644 --- a/charts/tembo-operator/templates/crd.yaml +++ b/charts/tembo-operator/templates/crd.yaml @@ -1821,6 +1821,10 @@ spec: This assumes you are keeping the backups in the new instance in the same root bucket path of `s3://my-bucket/`. 
type: string + volumeSnapshot: + description: volumeSnapshot is a boolean to enable restoring from a Volume Snapshot + nullable: true + type: boolean required: - serverName type: object From 88a7926d0a7c88b5f6adbf98c3eb9fea53e2ca32 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Tue, 20 Feb 2024 22:54:59 -0600 Subject: [PATCH 07/20] fix fmt --- .../src/snapshots/volumesnapshots_crd.rs | 58 +++++++++++++++---- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots_crd.rs b/tembo-operator/src/snapshots/volumesnapshots_crd.rs index fe853fe56..533529a12 100644 --- a/tembo-operator/src/snapshots/volumesnapshots_crd.rs +++ b/tembo-operator/src/snapshots/volumesnapshots_crd.rs @@ -4,18 +4,27 @@ use kube::CustomResource; use schemars::JsonSchema; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; /// spec defines the desired characteristics of a snapshot requested by a user. More info: https://kubernetes.io/docs/concepts/storage/volume-snapshots#volumesnapshots Required. #[derive(CustomResource, Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] -#[kube(group = "snapshot.storage.k8s.io", version = "v1", kind = "VolumeSnapshot", plural = "volumesnapshots")] +#[kube( + group = "snapshot.storage.k8s.io", + version = "v1", + kind = "VolumeSnapshot", + plural = "volumesnapshots" +)] #[kube(namespaced)] #[kube(status = "VolumeSnapshotStatus")] pub struct VolumeSnapshotSpec { /// source specifies where a snapshot will be created from. This field is immutable after creation. Required. pub source: VolumeSnapshotSource, /// VolumeSnapshotClassName is the name of the VolumeSnapshotClass requested by the VolumeSnapshot. VolumeSnapshotClassName may be left nil to indicate that the default SnapshotClass should be used. A given cluster may have multiple default Volume SnapshotClasses: one default per CSI Driver. 
If a VolumeSnapshot does not specify a SnapshotClass, VolumeSnapshotSource will be checked to figure out what the associated CSI Driver is, and the default VolumeSnapshotClass associated with that CSI Driver will be used. If more than one VolumeSnapshotClass exist for a given CSI Driver and more than one have been marked as default, CreateSnapshot will fail and generate an event. Empty string is not allowed for this field. - #[serde(default, skip_serializing_if = "Option::is_none", rename = "volumeSnapshotClassName")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "volumeSnapshotClassName" + )] pub volume_snapshot_class_name: Option, } @@ -23,10 +32,18 @@ pub struct VolumeSnapshotSpec { #[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] pub struct VolumeSnapshotSource { /// persistentVolumeClaimName specifies the name of the PersistentVolumeClaim object representing the volume from which a snapshot should be created. This PVC is assumed to be in the same namespace as the VolumeSnapshot object. This field should be set if the snapshot does not exists, and needs to be created. This field is immutable. - #[serde(default, skip_serializing_if = "Option::is_none", rename = "persistentVolumeClaimName")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "persistentVolumeClaimName" + )] pub persistent_volume_claim_name: Option, /// volumeSnapshotContentName specifies the name of a pre-existing VolumeSnapshotContent object representing an existing volume snapshot. This field should be set if the snapshot already exists and only needs a representation in Kubernetes. This field is immutable. 
- #[serde(default, skip_serializing_if = "Option::is_none", rename = "volumeSnapshotContentName")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "volumeSnapshotContentName" + )] pub volume_snapshot_content_name: Option, } @@ -34,22 +51,42 @@ pub struct VolumeSnapshotSource { #[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] pub struct VolumeSnapshotStatus { /// boundVolumeSnapshotContentName is the name of the VolumeSnapshotContent object to which this VolumeSnapshot object intends to bind to. If not specified, it indicates that the VolumeSnapshot object has not been successfully bound to a VolumeSnapshotContent object yet. NOTE: To avoid possible security issues, consumers must verify binding between VolumeSnapshot and VolumeSnapshotContent objects is successful (by validating that both VolumeSnapshot and VolumeSnapshotContent point at each other) before using this object. - #[serde(default, skip_serializing_if = "Option::is_none", rename = "boundVolumeSnapshotContentName")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "boundVolumeSnapshotContentName" + )] pub bound_volume_snapshot_content_name: Option, /// creationTime is the timestamp when the point-in-time snapshot is taken by the underlying storage system. In dynamic snapshot creation case, this field will be filled in by the snapshot controller with the "creation_time" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "creation_time" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it. If not specified, it may indicate that the creation time of the snapshot is unknown. 
- #[serde(default, skip_serializing_if = "Option::is_none", rename = "creationTime")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "creationTime" + )] pub creation_time: Option, /// error is the last observed error during snapshot creation, if any. This field could be helpful to upper level controllers(i.e., application controller) to decide whether they should continue on waiting for the snapshot to be created based on the type of error reported. The snapshot controller will keep retrying when an error occurs during the snapshot creation. Upon success, this error field will be cleared. #[serde(default, skip_serializing_if = "Option::is_none")] pub error: Option, /// readyToUse indicates if the snapshot is ready to be used to restore a volume. In dynamic snapshot creation case, this field will be filled in by the snapshot controller with the "ready_to_use" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "ready_to_use" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it, otherwise, this field will be set to "True". If not specified, it means the readiness of a snapshot is unknown. - #[serde(default, skip_serializing_if = "Option::is_none", rename = "readyToUse")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "readyToUse" + )] pub ready_to_use: Option, /// restoreSize represents the minimum size of volume required to create a volume from this snapshot. In dynamic snapshot creation case, this field will be filled in by the snapshot controller with the "size_bytes" value returned from CSI "CreateSnapshot" gRPC call. For a pre-existing snapshot, this field will be filled with the "size_bytes" value returned from the CSI "ListSnapshots" gRPC call if the driver supports it. 
When restoring a volume from this snapshot, the size of the volume MUST NOT be smaller than the restoreSize if it is specified, otherwise the restoration will fail. If not specified, it indicates that the size is unknown. - #[serde(default, skip_serializing_if = "Option::is_none", rename = "restoreSize")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "restoreSize" + )] pub restore_size: Option, /// VolumeGroupSnapshotName is the name of the VolumeGroupSnapshot of which this VolumeSnapshot is a part of. - #[serde(default, skip_serializing_if = "Option::is_none", rename = "volumeGroupSnapshotName")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + rename = "volumeGroupSnapshotName" + )] pub volume_group_snapshot_name: Option, } @@ -63,4 +100,3 @@ pub struct VolumeSnapshotStatusError { #[serde(default, skip_serializing_if = "Option::is_none")] pub time: Option, } - From 2fecb6ba50001f226e06eb0ec1c7a98bc6ff20a0 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Wed, 21 Feb 2024 15:49:07 -0600 Subject: [PATCH 08/20] fix namespace lookup for volumesnapshot, update rbac --- charts/tembo-operator/templates/rbac-operator.yaml | 3 +++ tembo-operator/src/snapshots/volumesnapshots.rs | 13 +++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/charts/tembo-operator/templates/rbac-operator.yaml b/charts/tembo-operator/templates/rbac-operator.yaml index 6246955e9..d211c54f3 100644 --- a/charts/tembo-operator/templates/rbac-operator.yaml +++ b/charts/tembo-operator/templates/rbac-operator.yaml @@ -57,5 +57,8 @@ rules: - apiGroups: [""] resources: ["namespaces"] verbs: ["get", "list", "watch"] + - apiGroups: ["snapshot.storage.k8s.io"] + resources: ["volumesnapshots", "volumesnapshotcontents"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] {{- end }} {{- end }} diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 
cff809e0b..4afe31bc0 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -195,11 +195,14 @@ async fn lookup_volume_snapshot(cdb: &CoreDB, client: &Client) -> Result = Api::namespaced(client.clone(), &namespace); + // todo: This is a temporary fix to get the VolumeSnapshot from the same namespace as the + // instance you are attempting to restore from. We need to figure out a better way of + // doing this in case someone wants to name a namespace differently than the instance name. + let volume_snapshot_api: Api = Api::namespaced(client.clone(), &name); let label_selector = format!("cnpg.io/cluster={}", name); let lp = ListParams::default().labels(&label_selector); @@ -220,6 +223,8 @@ async fn lookup_volume_snapshot(cdb: &CoreDB, client: &Client) -> Result Date: Wed, 21 Feb 2024 16:01:35 -0600 Subject: [PATCH 09/20] patch/create a volumesnapshotcontent --- tembo-operator/src/snapshots/volumesnapshots.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 4afe31bc0..deaaadc92 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -83,7 +83,7 @@ async fn apply_volume_snapshot_content( let name = cdb.name_any(); // Apply VolumeSnapshotContent (All Namespaces) - let vs_api: Api = Api::all(client.clone()); + let vs_api: Api = Api::all(client.clone()); debug!("Patching VolumeSnapshotContent for instance: {}", name); let ps = PatchParams::apply("cntrlr").force(); From bb3e6dd46bd41cbb9d74840bac43bc094b6f3466 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Wed, 21 Feb 2024 17:21:18 -0600 Subject: [PATCH 10/20] make sure when patching to supply the correct name --- tembo-operator/src/snapshots/volumesnapshots.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git 
a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index deaaadc92..ca1847695 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -48,6 +48,11 @@ async fn apply_volume_snapshot( volume_snapshot: &VolumeSnapshot, ) -> Result<(), Action> { let name = cdb.name_any(); + let vs_name = volume_snapshot + .metadata + .name + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; // Namespace for the VolumeSnapshot let namespace = volume_snapshot @@ -62,7 +67,7 @@ async fn apply_volume_snapshot( let ps = PatchParams::apply("cntrlr").force(); match vs_api - .patch(&name, &ps, &Patch::Apply(volume_snapshot)) + .patch(vs_name, &ps, &Patch::Apply(volume_snapshot)) .await { Ok(_) => debug!("VolumeSnapshot created successfully for {}.", name), @@ -81,6 +86,11 @@ async fn apply_volume_snapshot_content( volume_snapshot_content: &VolumeSnapshotContent, ) -> Result<(), Action> { let name = cdb.name_any(); + let vsc_name = volume_snapshot_content + .metadata + .name + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; // Apply VolumeSnapshotContent (All Namespaces) let vs_api: Api = Api::all(client.clone()); @@ -88,7 +98,7 @@ async fn apply_volume_snapshot_content( let ps = PatchParams::apply("cntrlr").force(); match vs_api - .patch(&name, &ps, &Patch::Apply(volume_snapshot_content)) + .patch(vsc_name, &ps, &Patch::Apply(volume_snapshot_content)) .await { Ok(_) => debug!("VolumeSnapshotContent created successfully for {}.", name), From 745a25f672a29b26899f7830e3ffa4ee79b970e7 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Wed, 21 Feb 2024 18:11:50 -0600 Subject: [PATCH 11/20] make sure to use snapshotHandle instead of volumeHandle --- tembo-operator/src/snapshots/volumesnapshots.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs 
b/tembo-operator/src/snapshots/volumesnapshots.rs index ca1847695..d9bfdbf65 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -122,7 +122,12 @@ fn generate_volume_snapshot_content( .namespace() .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; - let volume_handle = &snapshot_content.spec.source.volume_handle; + let snapshot_handle = snapshot_content + .status + .as_ref() + .and_then(|status| status.snapshot_handle.as_ref()) + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))? + .to_string(); let driver = &snapshot_content.spec.driver; let volume_snapshot_class_name = snapshot_content .spec @@ -141,7 +146,7 @@ fn generate_volume_snapshot_content( deletion_policy: VolumeSnapshotContentDeletionPolicy::Retain, driver: driver.to_string(), source: VolumeSnapshotContentSource { - snapshot_handle: volume_handle.clone(), + snapshot_handle: Some(snapshot_handle), ..VolumeSnapshotContentSource::default() }, volume_snapshot_class_name: Some(volume_snapshot_class_name.to_string()), From 8aef2c6bfc455eae00a00da02916dd33be3d09ff Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Wed, 21 Feb 2024 18:36:02 -0600 Subject: [PATCH 12/20] fix tests --- tembo-operator/src/snapshots/volumesnapshots.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index d9bfdbf65..b234e6222 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -287,6 +287,7 @@ mod tests { apis::coredb_types::CoreDB, snapshots::volumesnapshotcontents_crd::{ VolumeSnapshotContent, VolumeSnapshotContentSource, VolumeSnapshotContentSpec, + VolumeSnapshotContentStatus, }, }; use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; @@ -343,7 +344,13 @@ mod tests { volume_snapshot_class_name: 
Some("test-class".to_string()), ..VolumeSnapshotContentSpec::default() }, - status: None, + status: Some(VolumeSnapshotContentStatus { + creation_time: Some(1708542600948000000), + ready_to_use: Some(true), + restore_size: Some(10737418240), + snapshot_handle: Some("snap-01234567abcdef890".to_string()), + ..VolumeSnapshotContentStatus::default() + }), }; let result = generate_volume_snapshot_content(&cdb, &snapshot_content).unwrap(); @@ -351,7 +358,7 @@ mod tests { assert_eq!(result.spec.driver, "test-driver"); assert_eq!( result.spec.source.snapshot_handle, - Some("test-volume-handle".to_string()) + Some("snap-01234567abcdef890".to_string()) ); assert_eq!( result.spec.volume_snapshot_class_name, From 4d4f925d7e2463ff2b1b2b10cb931795946e3b80 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Wed, 21 Feb 2024 19:33:22 -0600 Subject: [PATCH 13/20] clean up snapshot stuff --- tembo-operator/src/snapshots/volumesnapshots.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index b234e6222..c3f3d26f7 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -134,11 +134,11 @@ fn generate_volume_snapshot_content( .volume_snapshot_class_name .as_ref() .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; - let snapshot = format!("{}-restore", name); + let snapshot = format!("{}-restore-vs", name); let vsc = VolumeSnapshotContent { metadata: ObjectMeta { - name: Some(format!("{}-restore", name)), + name: Some(format!("{}-restore-vsc", name)), namespace: Some(namespace.clone()), ..ObjectMeta::default() }, @@ -171,6 +171,10 @@ fn generate_volume_snapshot( cdb: &CoreDB, snapshot_content: &VolumeSnapshotContent, ) -> Result { + let name = cdb.name_any(); + let namespace = cdb + .namespace() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; let 
volume_snapshot_content_name = snapshot_content .metadata .name @@ -184,8 +188,8 @@ fn generate_volume_snapshot( let vs = VolumeSnapshot { metadata: ObjectMeta { - name: Some(format!("{}-restore", cdb.name_any())), - namespace: cdb.namespace(), + name: Some(format!("{}-restore-vs", name)), + namespace: Some(namespace), ..ObjectMeta::default() }, spec: VolumeSnapshotSpec { From 86a544de43b4cc67d192597116391dc76fe5e132 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 22 Feb 2024 12:50:05 -0600 Subject: [PATCH 14/20] fix snapshot test --- tembo-operator/src/snapshots/volumesnapshots.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index c3f3d26f7..9395484d9 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -431,7 +431,7 @@ mod tests { // Assertions assert_eq!( result.metadata.name.unwrap(), - format!("{}-restore", cdb.name_any()) + format!("{}-restore-vs", cdb.name_any()) ); assert_eq!( result.spec.source.volume_snapshot_content_name, From 9d8b5efafd0ea401a6286e04a56e23afd7e453ea Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 22 Feb 2024 14:28:23 -0600 Subject: [PATCH 15/20] adding check to make sure volumesnapshot is ready prior to creating the cluster --- .../src/snapshots/volumesnapshots.rs | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 9395484d9..5d62aa896 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -39,9 +39,52 @@ pub async fn reconcile_volume_snapshot_restore( // Apply the VolumeSnapshot apply_volume_snapshot(cdb, &client, &vs).await?; + // We need to wait for the snapshot to become ready before we can proceed + is_snapshot_ready(&client, &vs).await?; + Ok(vs) } +async 
fn is_snapshot_ready(client: &Client, vs: &VolumeSnapshot) -> Result<(), Action> { + let name = vs + .metadata + .name + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + let namespace = vs + .metadata + .namespace + .as_ref() + .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + + let vs_api: Api = Api::namespaced(client.clone(), namespace); + let lp = ListParams::default().fields(&format!("metadata.name={}", name)); + let mut ready = false; + let mut attempts = 0; + + while !ready && attempts < 10 { + let vs = vs_api.list(&lp).await.map_err(|e| { + error!("Error listing VolumeSnapshots: {}", e); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; + + if let Some(status) = vs.items.first().and_then(|vs| vs.status.as_ref()) { + ready = status.ready_to_use.unwrap_or(false); + } + + if !ready { + tokio::time::sleep(tokio::time::Duration::from_secs(30)).await; + attempts += 1; + } + } + + if !ready { + return Err(Action::requeue(tokio::time::Duration::from_secs(300))); + } + + Ok(()) +} + async fn apply_volume_snapshot( cdb: &CoreDB, client: &Client, From bf353351078fdb10c5a0c25f5b2988a64abb436d Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 22 Feb 2024 14:40:19 -0600 Subject: [PATCH 16/20] fix issue with pointing to the incorrect snapshot --- tembo-operator/src/cloudnativepg/cnpg.rs | 2 +- tembo-operator/src/snapshots/volumesnapshots.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index 126670ca0..5b8557175 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -514,7 +514,7 @@ fn cnpg_cluster_bootstrap_recovery_volume_snapshots( storage: ClusterBootstrapRecoveryVolumeSnapshotsStorage { // todo: Work on getting this from the VolumeSnapshot we created // during the restore process - name: format!("{}-restore", cdb.name_any()), + name: 
format!("{}-restore-vs", cdb.name_any()), kind: "VolumeSnapshot".to_string(), api_group: Some("snapshot.storage.k8s.io".to_string()), }, diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 5d62aa896..9cc1c9c08 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -82,6 +82,8 @@ async fn is_snapshot_ready(client: &Client, vs: &VolumeSnapshot) -> Result<(), A return Err(Action::requeue(tokio::time::Duration::from_secs(300))); } + info!("VolumeSnapshot {} is ready.", name); + Ok(()) } From 77d947953383a9ce0ad386026057e696bb267766 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 22 Feb 2024 14:44:31 -0600 Subject: [PATCH 17/20] add missing crate --- tembo-operator/src/snapshots/volumesnapshots.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 9cc1c9c08..07f218a25 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -18,7 +18,7 @@ use kube::{ Api, ResourceExt, }; use std::sync::Arc; -use tracing::{debug, error}; +use tracing::{debug, error, info}; // Main function to reconcile the VolumeSnapshotContent and VolumeSnapshot pub async fn reconcile_volume_snapshot_restore( From 053f3cfe42941bd4df7bbdb8fcb606a694d76a41 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 22 Feb 2024 15:16:51 -0600 Subject: [PATCH 18/20] make sure scheduledbackup job name is <=63 chars --- tembo-operator/src/cloudnativepg/cnpg.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index 5b8557175..afc46979f 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -1450,7 +1450,7 @@ fn cnpg_scheduled_backup( .filter(|vs| vs.enabled) 
.map(|_| ScheduledBackup { metadata: ObjectMeta { - name: Some(name.to_string() + "-snapshot"), + name: Some(name.to_string() + "-snap"), namespace: Some(namespace), ..ObjectMeta::default() }, From 9372806e22cbade221f3af266fe5865b8ba30801 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Fri, 23 Feb 2024 11:04:18 -0600 Subject: [PATCH 19/20] add better error logging --- tembo-operator/src/cloudnativepg/cnpg.rs | 6 +- .../src/snapshots/volumesnapshots.rs | 201 ++++++++++++------ 2 files changed, 136 insertions(+), 71 deletions(-) diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index afc46979f..3ce6072b4 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -1492,7 +1492,11 @@ pub async fn reconcile_cnpg_scheduled_backup( let scheduled_backups = match scheduled_backups_result { Ok(backups) => backups, Err(e) => { - error!("Failed to generate scheduled backups: {}", e); + error!( + "Failed to generate scheduled backups for {}: {}", + cdb.name_any(), + e + ); return Err(Action::requeue(Duration::from_secs(300))); } }; diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 07f218a25..a1d70d879 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -28,7 +28,7 @@ pub async fn reconcile_volume_snapshot_restore( let client = ctx.client.clone(); // Lookup the VolumeSnapshot of the original instance let ogvs = lookup_volume_snapshot(cdb, &client).await?; - let ogvsc = lookup_volume_snapshot_content(&client, ogvs).await?; + let ogvsc = lookup_volume_snapshot_content(cdb, &client, ogvs).await?; let vsc = generate_volume_snapshot_content(cdb, &ogvsc)?; let vs = generate_volume_snapshot(cdb, &vsc)?; @@ -40,51 +40,70 @@ pub async fn reconcile_volume_snapshot_restore( apply_volume_snapshot(cdb, &client, &vs).await?; // We need to wait for the snapshot to become 
ready before we can proceed - is_snapshot_ready(&client, &vs).await?; + is_snapshot_ready(cdb, &client, &vs).await?; Ok(vs) } -async fn is_snapshot_ready(client: &Client, vs: &VolumeSnapshot) -> Result<(), Action> { - let name = vs - .metadata - .name - .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; - let namespace = vs - .metadata - .namespace - .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; +async fn is_snapshot_ready( + cdb: &CoreDB, + client: &Client, + vs: &VolumeSnapshot, +) -> Result<(), Action> { + let name = vs.metadata.name.as_ref().ok_or_else(|| { + error!( + "VolumeSnapshot name is empty for instance: {}.", + cdb.name_any() + ); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; + let namespace = vs.metadata.namespace.as_ref().ok_or_else(|| { + error!( + "VolumeSnapshot namespace is empty for instance: {}.", + cdb.name_any() + ); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; let vs_api: Api = Api::namespaced(client.clone(), namespace); let lp = ListParams::default().fields(&format!("metadata.name={}", name)); - let mut ready = false; - let mut attempts = 0; - while !ready && attempts < 10 { - let vs = vs_api.list(&lp).await.map_err(|e| { - error!("Error listing VolumeSnapshots: {}", e); - Action::requeue(tokio::time::Duration::from_secs(300)) - })?; - - if let Some(status) = vs.items.first().and_then(|vs| vs.status.as_ref()) { - ready = status.ready_to_use.unwrap_or(false); - } + // Fetch the VolumeSnapshot to check its status + let res = vs_api.list(&lp).await.map_err(|e| { + error!( + "Error listing VolumeSnapshots {} for instance {}: {}", + name, + cdb.name_any(), + e, + ); + Action::requeue(tokio::time::Duration::from_secs(30)) + })?; - if !ready { - tokio::time::sleep(tokio::time::Duration::from_secs(30)).await; - attempts += 1; + // Check if the first VolumeSnapshot is ready + if let Some(status) = res.items.first().and_then(|vs| 
vs.status.as_ref()) { + if status.ready_to_use.unwrap_or(false) { + info!( + "VolumeSnapshot {} is ready for instance {}.", + name, + cdb.name_any() + ); + Ok(()) + } else { + error!( + "VolumeSnapshot {} is not ready yet for instance {}.", + name, + cdb.name_any() + ); + Err(Action::requeue(tokio::time::Duration::from_secs(10))) } + } else { + error!( + "VolumeSnapshot {} not found for instance {}", + name, + cdb.name_any() + ); + Err(Action::requeue(tokio::time::Duration::from_secs(30))) } - - if !ready { - return Err(Action::requeue(tokio::time::Duration::from_secs(300))); - } - - info!("VolumeSnapshot {} is ready.", name); - - Ok(()) } async fn apply_volume_snapshot( @@ -93,18 +112,26 @@ async fn apply_volume_snapshot( volume_snapshot: &VolumeSnapshot, ) -> Result<(), Action> { let name = cdb.name_any(); - let vs_name = volume_snapshot - .metadata - .name - .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + let vs_name = volume_snapshot.metadata.name.as_ref().ok_or_else(|| { + error!( + "VolumeSnapshot name is empty for instance: {}.", + cdb.name_any() + ); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; // Namespace for the VolumeSnapshot let namespace = volume_snapshot .metadata .namespace .as_deref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + .ok_or_else(|| { + error!( + "VolumeSnapshot namespace is empty for instance: {}.", + cdb.name_any() + ); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; // Apply VolumeSnapshot (Namespaced) let vs_api: Api = Api::namespaced(client.clone(), namespace); @@ -117,7 +144,10 @@ async fn apply_volume_snapshot( { Ok(_) => debug!("VolumeSnapshot created successfully for {}.", name), Err(e) => { - error!("Failed to create VolumeSnapshot: {}", e); + error!( + "Failed to create VolumeSnapshot for instance {}: {}", + name, e + ); return Err(Action::requeue(tokio::time::Duration::from_secs(300))); } } @@ -135,7 +165,13 @@ async fn 
apply_volume_snapshot_content( .metadata .name .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + .ok_or_else(|| { + error!( + "VolumeSnapshot name is empty for instance: {}.", + cdb.name_any() + ); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; // Apply VolumeSnapshotContent (All Namespaces) let vs_api: Api = Api::all(client.clone()); @@ -148,7 +184,10 @@ async fn apply_volume_snapshot_content( { Ok(_) => debug!("VolumeSnapshotContent created successfully for {}.", name), Err(e) => { - error!("Failed to create VolumeSnapshotContent: {}", e); + error!( + "Failed to create VolumeSnapshotContent for instance {}: {}", + name, e + ); return Err(Action::requeue(tokio::time::Duration::from_secs(300))); } } @@ -163,22 +202,28 @@ fn generate_volume_snapshot_content( snapshot_content: &VolumeSnapshotContent, ) -> Result { let name = cdb.name_any(); - let namespace = cdb - .namespace() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; - + let namespace = cdb.namespace().ok_or_else(|| { + error!("CoreDB namespace is empty for instance: {}.", name); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; let snapshot_handle = snapshot_content .status .as_ref() .and_then(|status| status.snapshot_handle.as_ref()) - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))? + .ok_or_else(|| { + error!("Snapshot Handle is empty for instance {}", name); + Action::requeue(tokio::time::Duration::from_secs(300)) + })? 
.to_string(); let driver = &snapshot_content.spec.driver; let volume_snapshot_class_name = snapshot_content .spec .volume_snapshot_class_name .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + .ok_or_else(|| { + error!("VolumeSnapshotClass name is empty for instance {}", name); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; let snapshot = format!("{}-restore-vs", name); let vsc = VolumeSnapshotContent { @@ -217,19 +262,23 @@ fn generate_volume_snapshot( snapshot_content: &VolumeSnapshotContent, ) -> Result { let name = cdb.name_any(); - let namespace = cdb - .namespace() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; - let volume_snapshot_content_name = snapshot_content - .metadata - .name - .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + let namespace = cdb.namespace().ok_or_else(|| { + error!("CoreDB namespace is empty for instance {}", name); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; + let volume_snapshot_content_name = + snapshot_content.metadata.name.as_ref().ok_or_else(|| { + error!("VolumeSnapshotContent name is empty for instance {}", name); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; let volume_snapshot_class_name = snapshot_content .spec .volume_snapshot_class_name .as_ref() - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + .ok_or_else(|| { + error!("VolumeSnapshotClass name is empty for instance {}", name); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; let vs = VolumeSnapshot { metadata: ObjectMeta { @@ -258,10 +307,13 @@ async fn lookup_volume_snapshot(cdb: &CoreDB, client: &Client) -> Result Result Result Result Result { @@ -315,7 +369,10 @@ async fn lookup_volume_snapshot_content( .status .as_ref() .and_then(|s| s.bound_volume_snapshot_content_name.clone()) - .ok_or_else(|| Action::requeue(tokio::time::Duration::from_secs(300)))?; + .ok_or_else(|| 
{ + error!("Snapshot status is empty for instance {}", cdb.name_any()); + Action::requeue(tokio::time::Duration::from_secs(300)) + })?; // Lookup the VolumeSnapshotContent object, since it's not namespaced we will // need to filter on all objects in the cluster @@ -323,7 +380,11 @@ async fn lookup_volume_snapshot_content( match volume_snapshot_content_api.get(&name).await { Ok(vsc) => Ok(vsc), Err(e) => { - error!("Failed to get VolumeSnapshotContent: {}", e); + error!( + "Failed to get VolumeSnapshotContent for instance {}: {}", + cdb.name_any(), + e + ); Err(Action::requeue(tokio::time::Duration::from_secs(300))) } } From 9d28f4559ee0b134b74125304a3534b38e445287 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Fri, 23 Feb 2024 11:09:36 -0600 Subject: [PATCH 20/20] better logging --- tembo-operator/src/snapshots/volumesnapshots.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index a1d70d879..b8384dc7c 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -18,7 +18,7 @@ use kube::{ Api, ResourceExt, }; use std::sync::Arc; -use tracing::{debug, error, info}; +use tracing::{debug, error, info, warn}; // Main function to reconcile the VolumeSnapshotContent and VolumeSnapshot pub async fn reconcile_volume_snapshot_restore( @@ -70,7 +70,7 @@ async fn is_snapshot_ready( // Fetch the VolumeSnapshot to check its status let res = vs_api.list(&lp).await.map_err(|e| { - error!( + warn!( "Error listing VolumeSnapshots {} for instance {}: {}", name, cdb.name_any(), @@ -89,7 +89,7 @@ async fn is_snapshot_ready( ); Ok(()) } else { - error!( + warn!( "VolumeSnapshot {} is not ready yet for instance {}.", name, cdb.name_any()