Skip to content

Commit

Permalink
Rollback on ErrImageNeverPull, ErrImagePullBackOff and CrashLoopBackOff
Browse files Browse the repository at this point in the history
  • Loading branch information
Luiz Felipe Takakura committed Aug 15, 2019
1 parent 6780438 commit 30f0c0b
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 20 deletions.
5 changes: 4 additions & 1 deletion controller/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import "fmt"
// SchedulerVersions rolling update statuses
const (
timedoutStatus = "timed out"
erroredStatus = "error"
canceledStatus = "canceled"
inProgressStatus = "in progress"
deployedStatus = "deployed"
Expand All @@ -15,4 +14,8 @@ var (
rollbackStatus = func(version string) string {
return fmt.Sprintf("rolled back to %s", version)
}

erroredStatus = func(err string) string {
return fmt.Sprintf("error: %s", err)
}
)
2 changes: 1 addition & 1 deletion controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ func UpdateSchedulerConfig(

if errored != nil {
err = errored
scheduler.RollingUpdateStatus = erroredStatus
scheduler.RollingUpdateStatus = erroredStatus(err.Error())
}

updateErr := scheduler.UpdateVersionStatus(db)
Expand Down
37 changes: 22 additions & 15 deletions controller/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,11 @@ func createNewRemoveOldPod(

// wait for new pod to be created
timeout := willTimeoutAt.Sub(clock.Now())
timedout, canceled = waitCreatingPods(
timedout, canceled, err = waitCreatingPods(
logger, clientset, timeout, configYAML.Name,
[]v1.Pod{*newPod}, operationManager, mr)
if timedout || canceled {
return timedout, canceled, nil
if timedout || canceled || err != nil {
return timedout, canceled, err
}

// delete old pod
Expand Down Expand Up @@ -366,7 +366,7 @@ func waitCreatingPods(
createdPods []v1.Pod,
operationManager *models.OperationManager,
mr *models.MixedMetricsReporter,
) (timedout, wasCanceled bool) {
) (timedout, wasCanceled bool, err error) {
logger := l.WithFields(logrus.Fields{
"operation": "controller.waitCreatingPods",
"scheduler": namespace,
Expand All @@ -384,7 +384,7 @@ func waitCreatingPods(
// operationManger is nil when rolling back (rollback can't be canceled)
if operationManager != nil && operationManager.WasCanceled() {
logger.Warn("operation was canceled")
return false, true
return false, true, nil
}

for _, pod := range createdPods {
Expand All @@ -396,6 +396,7 @@ func waitCreatingPods(
)
return err
})

if err != nil && strings.Contains(err.Error(), "not found") {
exit = false
logger.
Expand Down Expand Up @@ -434,22 +435,20 @@ func waitCreatingPods(

if !models.IsPodReady(createdPod) {
logger.WithField("pod", createdPod.GetName()).Debug("pod not ready yet, waiting...")
exit = false
break
}
err = models.ValidatePodWaitingState(createdPod)

if err != nil {
logger.WithField("pod", pod.GetName()).WithError(err).Error("invalid pod waiting state")
return false, false, err
}

if err != nil && !strings.Contains(err.Error(), "not found") {
logger.
WithError(err).
WithField("pod", pod.GetName()).
Info("error getting pod")
exit = false
break
}
}
case <-timeoutTimer.C:
logger.Error("timeout waiting for rooms to be created")
return true, false
return true, false, nil
}

if exit {
Expand All @@ -458,7 +457,7 @@ func waitCreatingPods(
}
}

return false, false
return false, false, nil
}

// DeletePodAndRoom deletes the pod and removes the room from redis
Expand Down Expand Up @@ -996,6 +995,14 @@ func acquireLock(
lockTimeoutMS := config.GetInt("watcher.lockTimeoutMs")
timeoutDur := time.Duration(timeoutSec) * time.Second
ticker := time.NewTicker(2 * time.Second)

// guarantee that downScaling and config locks doesn't timeout before update times out.
// otherwise it can result in all pods dying during a rolling update that is destined to timeout
if lockKey == models.GetSchedulerDownScalingLockKey(config.GetString("watcher.lockKey"), schedulerName) ||
lockKey == models.GetSchedulerConfigLockKey(config.GetString("watcher.lockKey"), schedulerName) {
lockTimeoutMS = (timeoutSec + 1) * 1000
}

defer ticker.Stop()
timeout := time.NewTimer(timeoutDur)
defer timeout.Stop()
Expand Down
2 changes: 1 addition & 1 deletion metadata/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
package metadata

//Version of Maestro
var Version = "8.1.1"
var Version = "8.2.1"

//KubeVersion is the desired Kubernetes version
var KubeVersion = "v1.10.0"
7 changes: 7 additions & 0 deletions models/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,10 @@ const Global = "global"

// PodNotFitsHostPorts is a message when the pod's host port is no available in any node of the pool
const PodNotFitsHostPorts = "PodFitsHostPorts"

// InvalidPodWaitingStates are all the states that are not accepted in a waiting pod.
var InvalidPodWaitingStates = []string{
"ErrImageNeverPull",
"ErrImagePullBackOff",
"CrashLoopBackOff",
}
36 changes: 36 additions & 0 deletions models/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,42 @@ func IsPodReady(pod *v1.Pod) bool {
return false
}

// ValidatePodWaitingState returns nil if pod waiting reson is valid and error otherwise
// Errors checked:
// - ErrImageNeverPull
// - CrashLoopBackOff
// - ErrImagePullBackOff
func ValidatePodWaitingState(pod *v1.Pod) error {

for _, invalidState := range InvalidPodWaitingStates {
status := &pod.Status
if checkWaitingReason(status, invalidState) {
return fmt.Errorf("one or more containers in pod are in %s", invalidState)
}
}

return nil
}

func checkWaitingReason(status *v1.PodStatus, reason string) bool {
if status == nil {
return false
}

if status.Phase != v1.PodPending {
return false
}

for _, containerStatus := range status.ContainerStatuses {
state := containerStatus.State
if state.Waiting != nil && state.Waiting.Reason == reason {
return true
}
}

return false
}

// PodPending returns true if pod is with status Pending.
// In this case, also returns reason for being pending and message.
func PodPending(pod *v1.Pod) (isPending bool, reason, message string) {
Expand Down
2 changes: 1 addition & 1 deletion testing/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func MockRedisLock(
calls = NewCalls()

calls.Add(mockRedisClient.EXPECT().
SetNX(lockKey, gomock.Any(), time.Duration(lockTimeoutMs)*time.Millisecond).
SetNX(lockKey, gomock.Any(), gomock.Any()).
Return(goredis.NewBoolResult(lockResult, errLock)))

return calls
Expand Down
2 changes: 1 addition & 1 deletion watcher/watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ func (w *Watcher) EnsureCorrectRooms() error {
)

if downScalingBlocked {
logger.Infof("not able to ensure correct rooms. Rolling update in place for scheduler %s", scheduler.Name)
logger.Debugf("not able to ensure correct rooms. Rolling update in place for scheduler %s", scheduler.Name)
return nil
}

Expand Down

0 comments on commit 30f0c0b

Please sign in to comment.