From b24dbeadeef3a71a4416132b08511fa45f3e3859 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 7 Mar 2024 15:35:06 -0600 Subject: [PATCH] add better handling of snapshot restores, better debug logging (#629) --- conductor/Cargo.lock | 2 +- conductor/src/main.rs | 13 ++-- tembo-operator/Cargo.lock | 2 +- tembo-operator/src/cloudnativepg/cnpg.rs | 2 +- .../src/snapshots/volumesnapshots.rs | 61 ++++++++++++++----- 5 files changed, 59 insertions(+), 21 deletions(-) diff --git a/conductor/Cargo.lock b/conductor/Cargo.lock index c485f2082..f5cb4045d 100644 --- a/conductor/Cargo.lock +++ b/conductor/Cargo.lock @@ -826,7 +826,7 @@ dependencies = [ [[package]] name = "controller" -version = "0.37.13" +version = "0.38.1" dependencies = [ "actix-web", "anyhow", diff --git a/conductor/src/main.rs b/conductor/src/main.rs index 144254320..d81a1c931 100644 --- a/conductor/src/main.rs +++ b/conductor/src/main.rs @@ -687,6 +687,11 @@ async fn init_cloud_perms( // use volume_snapshot_enabled feature flag to only enable for specific org_id's let volume_snapshot_enabled = is_volume_snapshot_enabled(read_msg, &restore_spec); + info!( + "Volume snapshot restore is {} for instance_id {} in org_id: {}", + volume_snapshot_enabled, read_msg.message.inst_id, read_msg.message.org_id + ); + // Ensure a restore spec exists, otherwise return an error let restore = coredb_spec @@ -732,8 +737,8 @@ fn is_volume_snapshot_enabled(msg: &Message, cdb_spec: &CoreDBSpec) - if orgs.contains(&msg.message.org_id.as_str()) { info!( - "Volume snapshot restore enabled for org_id: {}", - msg.message.org_id + "Volume snapshot restore enabled for instance_id {} in org_id: {}", + msg.message.inst_id, msg.message.org_id ); cdb_spec .backup @@ -742,8 +747,8 @@ fn is_volume_snapshot_enabled(msg: &Message, cdb_spec: &CoreDBSpec) - .map_or(false, |vs| vs.enabled) } else { info!( - "Volume snapshot restore disabled for org_id: {}", - msg.message.org_id + "Volume snapshot restore disabled for instance_id {} in org_id: {}", + msg.message.inst_id, msg.message.org_id ); false } diff --git a/tembo-operator/Cargo.lock b/tembo-operator/Cargo.lock index 42ba986e1..971cb2e02 100644 --- a/tembo-operator/Cargo.lock +++ b/tembo-operator/Cargo.lock @@ -494,7 +494,7 @@ dependencies = [ [[package]] name = "controller" -version = "0.38.0" +version = "0.38.1" dependencies = [ "actix-web", "anyhow", diff --git a/tembo-operator/src/cloudnativepg/cnpg.rs b/tembo-operator/src/cloudnativepg/cnpg.rs index e67b4ee55..415d096c6 100644 --- a/tembo-operator/src/cloudnativepg/cnpg.rs +++ b/tembo-operator/src/cloudnativepg/cnpg.rs @@ -1022,7 +1022,7 @@ pub async fn reconcile_cnpg(cdb: &CoreDB, ctx: Arc) -> Result<(), Actio // the VolumeSnapshotContent and VolumeSnapshot so that the Cluster will have // something to restore from. if let Some(restore) = &cdb.spec.restore { - if restore.volume_snapshot.is_some() { + if restore.volume_snapshot == Some(true) { debug!("Reconciling VolumeSnapshotContent and VolumeSnapshot for restore"); reconcile_volume_snapshot_restore(cdb, ctx.clone()).await?; } diff --git a/tembo-operator/src/snapshots/volumesnapshots.rs b/tembo-operator/src/snapshots/volumesnapshots.rs index 76d89578f..966bf02a4 100644 --- a/tembo-operator/src/snapshots/volumesnapshots.rs +++ b/tembo-operator/src/snapshots/volumesnapshots.rs @@ -412,6 +412,11 @@ async fn lookup_volume_snapshot(cdb: &CoreDB, client: &Client) -> Result Result Result let recovery_target_time: Option> = cdb .spec @@ -443,23 +449,50 @@ async fn lookup_volume_snapshot(cdb: &CoreDB, client: &Client) -> Result = result .into_iter() .filter(|vs| { vs.status .as_ref() - .map(|s| s.ready_to_use.unwrap_or(false)) - .unwrap_or(false) + .map_or(false, |s| s.ready_to_use.unwrap_or(false)) + }) + .filter(|vs| { + vs.metadata + .annotations + .as_ref() + .and_then(|ann| ann.get("cnpg.io/instanceRole")) + .map_or(false, |role| role == "primary") }) .collect(); - let closest_snapshot_to_recovery_time = find_closest_snapshot(snapshots, recovery_target_time); - - closest_snapshot_to_recovery_time.ok_or_else(|| { - error!("No VolumeSnapshot found for instance {}", og_instance_name); - Action::requeue(tokio::time::Duration::from_secs(300)) - }) + debug!( + "Filtered {} VolumeSnapshots for primary role and readiness", + snapshots.len() + ); + + match find_closest_snapshot(snapshots, recovery_target_time) { + Some(snapshot) => { + debug!( + "Selected VolumeSnapshot: {}", + snapshot + .metadata + .name + .as_deref() + .map_or("unknown", |name| name) + ); + Ok(snapshot) + } + None => { + error!( + "No suitable VolumeSnapshot found for instance {}", + og_instance_name + ); + Err(Action::requeue(tokio::time::Duration::from_secs(300))) + } + } } fn find_closest_snapshot(