resolve changes from merge conflict in upgrade changes
The upgrade changes for rook#2901 added a check that the ceph image is the
correct version before continuing with an upgrade. This commit refactors
that change to work with the new code path that validates the ceph version.

Signed-off-by: travisn <tnielsen@redhat.com>
travisn committed Jul 26, 2019
1 parent 2d9eec3 commit 012ff7d
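
The commit message describes a gate: before continuing with an upgrade, the operator checks the ceph image version against what is running and refuses to proceed if the cluster is unhealthy. As a simplified, hypothetical sketch of that decision (illustrative only, not Rook code, and canProceed is a made-up name):

package main

import (
    "errors"
    "fmt"
)

// canProceed captures the decision described in the commit message:
// block the orchestration only when the ceph image version changed
// AND the cluster is currently unhealthy.
func canProceed(imageChanged, cephHealthy bool) error {
    if imageChanged && !cephHealthy {
        return errors.New("ceph is not healthy, refusing to upgrade")
    }
    return nil
}

func main() {
    // A new image was injected while the cluster is unhealthy: refuse to upgrade.
    fmt.Println(canProceed(true, false))
    // Same image as the running daemons: nothing to upgrade, orchestration proceeds.
    fmt.Println(canProceed(false, false))
}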
Showing 2 changed files with 65 additions and 69 deletions.
46 changes: 46 additions & 0 deletions pkg/operator/ceph/cluster/cluster.go
@@ -133,6 +133,52 @@ func (c *cluster) validateCephVersion(version *cephver.CephVersion) error {
        return fmt.Errorf("allowUnsupported must be set to true to run with this version: %v", version)
    }

    // The following tries to determine whether the operator can proceed with an upgrade.
    // If the cluster was unhealthy and someone injected a new image version, an upgrade was triggered but failed because the cluster was not healthy.
    // If the operator then gets restarted, that state is lost, so the code below re-derives the state we are in and decides whether we should upgrade or not.

    // Try to load clusterInfo so we can compare the running version with the one from the spec image
    clusterInfo, _, _, err := mon.LoadClusterInfo(c.context, c.Namespace)
    if err == nil {
        // Write connection info (ceph config file and keyring) for ceph commands
        err = mon.WriteConnectionConfig(c.context, clusterInfo)
        if err != nil {
            logger.Errorf("failed to write config. Attempting to continue. %+v", err)
        }
    }

    if !clusterInfo.IsInitialized() {
        // If not initialized, this is likely a new cluster so there is nothing to do
        return nil
    }

    // Get cluster running versions
    versions, err := client.GetAllCephDaemonVersions(c.context, c.Namespace)
    if err != nil {
        logger.Errorf("failed to get ceph daemons versions. %+v", err)
        return nil
    }

    runningVersions := *versions
    differentImages, err := diffImageSpecAndClusterRunningVersion(*version, runningVersions)
    if err != nil {
        logger.Errorf("failed to determine if we should upgrade or not. %+v", err)
        // We shouldn't block the orchestration if we can't determine the version of the image spec, so we proceed anyway on a best-effort basis.
        // Without that version we can't tell whether there is an update or what to do about it, so we don't check the cluster status either.
        // This will happen if someone uses ceph/daemon:latest-master, for instance.
        return nil
    }

    if differentImages {
        // If the image version changed, make sure we can safely upgrade:
        // check ceph's status and fail if it is not healthy
        cephHealthy := client.IsCephHealthy(c.context, c.Namespace)
        if !cephHealthy {
            return fmt.Errorf("ceph status in namespace %s is not healthy, refusing to upgrade. fix the cluster and re-edit the cluster CR to trigger a new orchestration update", c.Namespace)
        }
    }

    return nil
}

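diffImageSpecAndClusterRunningVersion itself is not part of this diff. Conceptually, it compares the version parsed from the spec image with the versions reported by the running daemons, so the operator knows whether an upgrade is actually being requested. The self-contained sketch below illustrates that comparison with a stand-in type; it is illustrative only and is not Rook's implementation (which operates on cephver.CephVersion and the daemon version report):

package main

import "fmt"

// CephVersion is a stand-in type for this sketch; it loosely mirrors the idea of
// cephver.CephVersion (a parsed ceph release number).
type CephVersion struct {
    Major, Minor, Extra int
}

// imageDiffersFromRunning reports whether the version parsed from the spec image
// differs from any version currently reported by the running daemons.
func imageDiffersFromRunning(spec CephVersion, running []CephVersion) bool {
    for _, v := range running {
        if v != spec {
            return true
        }
    }
    return false
}

func main() {
    spec := CephVersion{Major: 14, Minor: 2, Extra: 2}
    running := []CephVersion{{Major: 14, Minor: 2, Extra: 1}, {Major: 14, Minor: 2, Extra: 1}}
    // The spec image differs from the running daemons, so this is an upgrade:
    // the operator would then require a healthy cluster before proceeding.
    fmt.Println(imageDiffersFromRunning(spec, running)) // true
}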
88 changes: 19 additions & 69 deletions pkg/operator/ceph/cluster/controller.go
@@ -250,86 +250,36 @@ func (c *ClusterController) initializeCluster(cluster *cluster, clusterObj *ceph
Before this commit:

        logger.Warningf("mon count is even (given: %d), should be uneven, continuing", cluster.Spec.Mon.Count)
    }

    // Try to load clusterInfo early so we can compare the running version with the one from the spec image
    cluster.Info, _, _, err = mon.LoadClusterInfo(c.context, cluster.Namespace)
    if err == nil {
        // Let's write connection info (ceph config file and keyring) to the operator for health checks
        err = mon.WriteConnectionConfig(cluster.context, cluster.Info)
        if err != nil {
            return
        }
    }

    // Start the Rook cluster components. Retry several times in case of failure.
    failedMessage := ""
    state := cephv1.ClusterStateError

    err := wait.Poll(clusterCreateInterval, clusterCreateTimeout, func() (bool, error) {
        cephVersion, canRetry, err := c.detectAndValidateCephVersion(cluster, cluster.Spec.CephVersion.Image)
        if err != nil {
            failedMessage = fmt.Sprintf("failed the ceph version check. %+v", err)
            logger.Errorf(failedMessage)
            if !canRetry {
                // it may seem strange to exit true but we don't want to retry if the version is not supported
                return true, nil
            }
            return false, nil
        }

        // This tries to determine whether the operator was restarted and we lost that state.
        // If the cluster was unhealthy and someone injected a new image version, an upgrade was triggered but failed because the cluster was not healthy.
        // If the operator then gets restarted, we can no longer fail on the unhealthy cluster, so the following re-derives the state we are in and whether we should upgrade or not.
        //
        // If not initialized, this is likely a new cluster so there is nothing to do
        if cluster.Info.IsInitialized() {
            imageVersion := *cephVersion

            // Get cluster running versions
            versions, err := client.GetAllCephDaemonVersions(c.context, cluster.Namespace)
            if err != nil {
                logger.Errorf("failed to get ceph daemons versions. %+v", err)
                return false, err
            }

            runningVersions := *versions
            updateOrNot, err := diffImageSpecAndClusterRunningVersion(imageVersion, runningVersions)
            if err != nil {
                logger.Errorf("failed to determine if we should upgrade or not. %+v", err)
                // We shouldn't block the orchestration if we can't determine the version of the image spec, so we proceed anyway on a best-effort basis.
                // Without that version we can't tell whether there is an update or what to do about it, so we don't check the cluster status either.
                // This will happen if someone uses ceph/daemon:latest-master, for instance.
                validOrchestration = false
                return true, nil
            }

            if updateOrNot {
                // If the image version changed, make sure we can safely upgrade:
                // check ceph's status and fail if it is not healthy
                cephStatus := client.IsCephHealthy(c.context, cluster.Namespace)
                if !cephStatus {
                    logger.Errorf("ceph status in namespace %s is not healthy, refusing to upgrade. fix the cluster and re-edit the cluster CR to trigger a new orchestration update", cluster.Namespace)
                    validOrchestration = false
                    return true, nil
                }
            }
        }

        c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreating, "")

        err = cluster.createInstance(c.rookImage, *cephVersion)
        if err != nil {
            logger.Errorf("failed to create cluster in namespace %s. %+v", cluster.Namespace, err)
            return false, nil
        }

        // cluster is created, update the cluster CRD status now
        c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreated, "")

        state = cephv1.ClusterStateCreated
        failedMessage = ""
        return true, nil
    })

    c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, state, failedMessage)

After this commit (the upgrade checks above now live in validateCephVersion in cluster.go):

        logger.Warningf("mon count is even (given: %d), should be uneven, continuing", cluster.Spec.Mon.Count)
    }

    // Start the Rook cluster components. Retry several times in case of failure.
    failedMessage := ""
    state := cephv1.ClusterStateError

    err := wait.Poll(clusterCreateInterval, clusterCreateTimeout,
        func() (bool, error) {
            cephVersion, canRetry, err := c.detectAndValidateCephVersion(cluster, cluster.Spec.CephVersion.Image)
            if err != nil {
                failedMessage = fmt.Sprintf("failed the ceph version check. %+v", err)
                logger.Errorf(failedMessage)
                if !canRetry {
                    // it may seem strange to exit true but we don't want to retry if the version is not supported
                    return true, nil
                }
                return false, nil
            }

            c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreating, "")

            err = cluster.createInstance(c.rookImage, *cephVersion)
            if err != nil {
                failedMessage = fmt.Sprintf("failed to create cluster in namespace %s. %+v", cluster.Namespace, err)
                logger.Errorf(failedMessage)
                return false, nil
            }

            state = cephv1.ClusterStateCreated
            failedMessage = ""
            return true, nil
        })

    c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, state, failedMessage)

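The retry behavior in initializeCluster comes from wait.Poll in k8s.io/apimachinery/pkg/util/wait: the condition function is called repeatedly until it returns true or a non-nil error, or the timeout expires. Returning (true, nil) stops polling, which is why a non-retryable version failure exits with true, while (false, nil) asks for another attempt. A minimal standalone illustration, with made-up interval, timeout, and attempt counts:

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    attempts := 0
    // Poll the condition every 100ms, for at most 1 second (made-up values).
    err := wait.Poll(100*time.Millisecond, time.Second,
        func() (bool, error) {
            attempts++
            if attempts < 3 {
                // Not done yet: (false, nil) tells Poll to try again after the interval.
                return false, nil
            }
            // Done: (true, nil) stops polling without an error.
            // A permanent failure would also return true here to stop retrying,
            // while recording the failure elsewhere, as the operator does with
            // failedMessage/state in the diff above.
            return true, nil
        })
    fmt.Println(attempts, err) // typically prints: 3 <nil>
}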
