resolve changes from merge conflict in upgrade changes
The upgrade changes for rook#2901 added a check that the ceph image is the
correct version before continuing with an upgrade. This commit refactors
that change to work with the new code path that validates the ceph version.

Signed-off-by: travisn <tnielsen@redhat.com>
travisn committed Jul 26, 2019
1 parent 2d9eec3 commit 012ff7d
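
The commit message describes a gate: before continuing with an upgrade, the operator checks the ceph image version against what is running and refuses to proceed if the cluster is unhealthy. As a simplified, hypothetical sketch of that decision (illustrative only, not Rook code, and canProceed is a made-up name):

package main

import (
    "errors"
    "fmt"
)

// canProceed captures the decision described in the commit message:
// block the orchestration only when the ceph image version changed
// AND the cluster is currently unhealthy.
func canProceed(imageChanged, cephHealthy bool) error {
    if imageChanged && !cephHealthy {
        return errors.New("ceph is not healthy, refusing to upgrade")
    }
    return nil
}

func main() {
    // A new image was injected while the cluster is unhealthy: refuse to upgrade.
    fmt.Println(canProceed(true, false))
    // Same image as the running daemons: nothing to upgrade, orchestration proceeds.
    fmt.Println(canProceed(false, false))
}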
Showing 2 changed files with 65 additions and 69 deletions.
46 changes: 46 additions & 0 deletions pkg/operator/ceph/cluster/cluster.go
@@ -133,6 +133,52 @@ func (c *cluster) validateCephVersion(version *cephver.CephVersion) error {
        return fmt.Errorf("allowUnsupported must be set to true to run with this version: %v", version)
    }

    // The following tries to determine whether the operator can proceed with an upgrade.
    // If the cluster was unhealthy and someone injected a new image version, an upgrade was triggered but failed because the cluster was not healthy.
    // If the operator then gets restarted, that state is lost, so the code below re-derives the state we are in and decides whether we should upgrade or not.

    // Try to load clusterInfo so we can compare the running version with the one from the spec image
    clusterInfo, _, _, err := mon.LoadClusterInfo(c.context, c.Namespace)
    if err == nil {
        // Write connection info (ceph config file and keyring) for ceph commands
        err = mon.WriteConnectionConfig(c.context, clusterInfo)
        if err != nil {
            logger.Errorf("failed to write config. Attempting to continue. %+v", err)
        }
    }

    if !clusterInfo.IsInitialized() {
        // If not initialized, this is likely a new cluster so there is nothing to do
        return nil
    }

    // Get cluster running versions
    versions, err := client.GetAllCephDaemonVersions(c.context, c.Namespace)
    if err != nil {
        logger.Errorf("failed to get ceph daemons versions. %+v", err)
        return nil
    }

    runningVersions := *versions
    differentImages, err := diffImageSpecAndClusterRunningVersion(*version, runningVersions)
    if err != nil {
        logger.Errorf("failed to determine if we should upgrade or not. %+v", err)
        // We shouldn't block the orchestration if we can't determine the version of the image spec, so we proceed anyway on a best-effort basis.
        // Without that version we can't tell whether there is an update or what to do about it, so we don't check the cluster status either.
        // This will happen if someone uses ceph/daemon:latest-master, for instance.
        return nil
    }

    if differentImages {
        // If the image version changed, make sure we can safely upgrade:
        // check ceph's status and fail if it is not healthy
        cephHealthy := client.IsCephHealthy(c.context, c.Namespace)
        if !cephHealthy {
            return fmt.Errorf("ceph status in namespace %s is not healthy, refusing to upgrade. fix the cluster and re-edit the cluster CR to trigger a new orchestration update", c.Namespace)
        }
    }

    return nil
}

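diffImageSpecAndClusterRunningVersion itself is not part of this diff. Conceptually, it compares the version parsed from the spec image with the versions reported by the running daemons, so the operator knows whether an upgrade is actually being requested. The self-contained sketch below illustrates that comparison with a stand-in type; it is illustrative only and is not Rook's implementation (which operates on cephver.CephVersion and the daemon version report):

package main

import "fmt"

// CephVersion is a stand-in type for this sketch; it loosely mirrors the idea of
// cephver.CephVersion (a parsed ceph release number).
type CephVersion struct {
    Major, Minor, Extra int
}

// imageDiffersFromRunning reports whether the version parsed from the spec image
// differs from any version currently reported by the running daemons.
func imageDiffersFromRunning(spec CephVersion, running []CephVersion) bool {
    for _, v := range running {
        if v != spec {
            return true
        }
    }
    return false
}

func main() {
    spec := CephVersion{Major: 14, Minor: 2, Extra: 2}
    running := []CephVersion{{Major: 14, Minor: 2, Extra: 1}, {Major: 14, Minor: 2, Extra: 1}}
    // The spec image differs from the running daemons, so this is an upgrade:
    // the operator would then require a healthy cluster before proceeding.
    fmt.Println(imageDiffersFromRunning(spec, running)) // true
}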
88 changes: 19 additions & 69 deletions pkg/operator/ceph/cluster/controller.go
@@ -250,86 +250,36 @@ func (c *ClusterController) initializeCluster(cluster *cluster, clusterObj *ceph
Before this commit:

        logger.Warningf("mon count is even (given: %d), should be uneven, continuing", cluster.Spec.Mon.Count)
    }

    // Try to load clusterInfo early so we can compare the running version with the one from the spec image
    cluster.Info, _, _, err = mon.LoadClusterInfo(c.context, cluster.Namespace)
    if err == nil {
        // Let's write connection info (ceph config file and keyring) to the operator for health checks
        err = mon.WriteConnectionConfig(cluster.context, cluster.Info)
        if err != nil {
            return
        }
    }

    // Start the Rook cluster components. Retry several times in case of failure.
    failedMessage := ""
    state := cephv1.ClusterStateError

    err := wait.Poll(clusterCreateInterval, clusterCreateTimeout, func() (bool, error) {
        cephVersion, canRetry, err := c.detectAndValidateCephVersion(cluster, cluster.Spec.CephVersion.Image)
        if err != nil {
            failedMessage = fmt.Sprintf("failed the ceph version check. %+v", err)
            logger.Errorf(failedMessage)
            if !canRetry {
                // it may seem strange to exit true but we don't want to retry if the version is not supported
                return true, nil
            }
            return false, nil
        }

        // This tries to determine whether the operator was restarted and we lost that state.
        // If the cluster was unhealthy and someone injected a new image version, an upgrade was triggered but failed because the cluster was not healthy.
        // If the operator then gets restarted, we can no longer fail on the unhealthy cluster, so the following re-derives the state we are in and whether we should upgrade or not.
        //
        // If not initialized, this is likely a new cluster so there is nothing to do
        if cluster.Info.IsInitialized() {
            imageVersion := *cephVersion

            // Get cluster running versions
            versions, err := client.GetAllCephDaemonVersions(c.context, cluster.Namespace)
            if err != nil {
                logger.Errorf("failed to get ceph daemons versions. %+v", err)
                return false, err
            }

            runningVersions := *versions
            updateOrNot, err := diffImageSpecAndClusterRunningVersion(imageVersion, runningVersions)
            if err != nil {
                logger.Errorf("failed to determine if we should upgrade or not. %+v", err)
                // We shouldn't block the orchestration if we can't determine the version of the image spec, so we proceed anyway on a best-effort basis.
                // Without that version we can't tell whether there is an update or what to do about it, so we don't check the cluster status either.
                // This will happen if someone uses ceph/daemon:latest-master, for instance.
                validOrchestration = false
                return true, nil
            }

            if updateOrNot {
                // If the image version changed, make sure we can safely upgrade:
                // check ceph's status and fail if it is not healthy
                cephStatus := client.IsCephHealthy(c.context, cluster.Namespace)
                if !cephStatus {
                    logger.Errorf("ceph status in namespace %s is not healthy, refusing to upgrade. fix the cluster and re-edit the cluster CR to trigger a new orchestration update", cluster.Namespace)
                    validOrchestration = false
                    return true, nil
                }
            }
        }

        c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreating, "")

        err = cluster.createInstance(c.rookImage, *cephVersion)
        if err != nil {
            logger.Errorf("failed to create cluster in namespace %s. %+v", cluster.Namespace, err)
            return false, nil
        }

        // cluster is created, update the cluster CRD status now
        c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreated, "")

        state = cephv1.ClusterStateCreated
        failedMessage = ""
        return true, nil
    })

    c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, state, failedMessage)

After this commit (the upgrade checks above now live in validateCephVersion in cluster.go):

        logger.Warningf("mon count is even (given: %d), should be uneven, continuing", cluster.Spec.Mon.Count)
    }

    // Start the Rook cluster components. Retry several times in case of failure.
    failedMessage := ""
    state := cephv1.ClusterStateError

    err := wait.Poll(clusterCreateInterval, clusterCreateTimeout,
        func() (bool, error) {
            cephVersion, canRetry, err := c.detectAndValidateCephVersion(cluster, cluster.Spec.CephVersion.Image)
            if err != nil {
                failedMessage = fmt.Sprintf("failed the ceph version check. %+v", err)
                logger.Errorf(failedMessage)
                if !canRetry {
                    // it may seem strange to exit true but we don't want to retry if the version is not supported
                    return true, nil
                }
                return false, nil
            }

            c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreating, "")

            err = cluster.createInstance(c.rookImage, *cephVersion)
            if err != nil {
                failedMessage = fmt.Sprintf("failed to create cluster in namespace %s. %+v", cluster.Namespace, err)
                logger.Errorf(failedMessage)
                return false, nil
            }

            state = cephv1.ClusterStateCreated
            failedMessage = ""
            return true, nil
        })

    c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, state, failedMessage)

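The retry behavior in initializeCluster comes from wait.Poll in k8s.io/apimachinery/pkg/util/wait: the condition function is called repeatedly until it returns true or a non-nil error, or the timeout expires. Returning (true, nil) stops polling, which is why a non-retryable version failure exits with true, while (false, nil) asks for another attempt. A minimal standalone illustration, with made-up interval, timeout, and attempt counts:

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    attempts := 0
    // Poll the condition every 100ms, for at most 1 second (made-up values).
    err := wait.Poll(100*time.Millisecond, time.Second,
        func() (bool, error) {
            attempts++
            if attempts < 3 {
                // Not done yet: (false, nil) tells Poll to try again after the interval.
                return false, nil
            }
            // Done: (true, nil) stops polling without an error.
            // A permanent failure would also return true here to stop retrying,
            // while recording the failure elsewhere, as the operator does with
            // failedMessage/state in the diff above.
            return true, nil
        })
    fmt.Println(attempts, err) // typically prints: 3 <nil>
}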
