Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rollback on ErrImageNeverPull, ErrImagePullBackOff and CrashLoopBackOff #70

Merged
merged 2 commits into from
Aug 15, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion controller/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import "fmt"
// SchedulerVersions rolling update statuses
const (
timedoutStatus = "timed out"
erroredStatus = "error"
canceledStatus = "canceled"
inProgressStatus = "in progress"
deployedStatus = "deployed"
Expand All @@ -15,4 +14,8 @@ var (
rollbackStatus = func(version string) string {
return fmt.Sprintf("rolled back to %s", version)
}

erroredStatus = func(err string) string {
return fmt.Sprintf("error: %s", err)
}
)
2 changes: 1 addition & 1 deletion controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ func UpdateSchedulerConfig(

if errored != nil {
err = errored
scheduler.RollingUpdateStatus = erroredStatus
scheduler.RollingUpdateStatus = erroredStatus(err.Error())
}

updateErr := scheduler.UpdateVersionStatus(db)
Expand Down
38 changes: 23 additions & 15 deletions controller/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,11 @@ func createNewRemoveOldPod(

// wait for new pod to be created
timeout := willTimeoutAt.Sub(clock.Now())
timedout, canceled = waitCreatingPods(
timedout, canceled, err = waitCreatingPods(
logger, clientset, timeout, configYAML.Name,
[]v1.Pod{*newPod}, operationManager, mr)
if timedout || canceled {
return timedout, canceled, nil
if timedout || canceled || err != nil {
return timedout, canceled, err
}

// delete old pod
Expand Down Expand Up @@ -366,7 +366,7 @@ func waitCreatingPods(
createdPods []v1.Pod,
operationManager *models.OperationManager,
mr *models.MixedMetricsReporter,
) (timedout, wasCanceled bool) {
) (timedout, wasCanceled bool, err error) {
logger := l.WithFields(logrus.Fields{
"operation": "controller.waitCreatingPods",
"scheduler": namespace,
Expand All @@ -384,7 +384,7 @@ func waitCreatingPods(
// operationManger is nil when rolling back (rollback can't be canceled)
if operationManager != nil && operationManager.WasCanceled() {
logger.Warn("operation was canceled")
return false, true
return false, true, nil
}

for _, pod := range createdPods {
Expand All @@ -396,6 +396,7 @@ func waitCreatingPods(
)
return err
})

if err != nil && strings.Contains(err.Error(), "not found") {
exit = false
logger.
Expand Down Expand Up @@ -434,22 +435,20 @@ func waitCreatingPods(

if !models.IsPodReady(createdPod) {
logger.WithField("pod", createdPod.GetName()).Debug("pod not ready yet, waiting...")
exit = false
break
}
err = models.ValidatePodWaitingState(createdPod)

if err != nil {
logger.WithField("pod", pod.GetName()).WithError(err).Error("invalid pod waiting state")
return false, false, err
}

if err != nil && !strings.Contains(err.Error(), "not found") {
logger.
WithError(err).
WithField("pod", pod.GetName()).
Info("error getting pod")
exit = false
break
}
}
case <-timeoutTimer.C:
logger.Error("timeout waiting for rooms to be created")
return true, false
return true, false, nil
}

if exit {
Expand All @@ -458,7 +457,7 @@ func waitCreatingPods(
}
}

return false, false
return false, false, nil
}

// DeletePodAndRoom deletes the pod and removes the room from redis
Expand Down Expand Up @@ -996,6 +995,14 @@ func acquireLock(
lockTimeoutMS := config.GetInt("watcher.lockTimeoutMs")
timeoutDur := time.Duration(timeoutSec) * time.Second
ticker := time.NewTicker(2 * time.Second)

// guarantee that downScaling and config locks doesn't timeout before update times out.
// otherwise it can result in all pods dying during a rolling update that is destined to timeout
if lockKey == models.GetSchedulerDownScalingLockKey(config.GetString("watcher.lockKey"), schedulerName) ||
lockKey == models.GetSchedulerConfigLockKey(config.GetString("watcher.lockKey"), schedulerName) {
lockTimeoutMS = (timeoutSec + 1) * 1000
}

defer ticker.Stop()
timeout := time.NewTimer(timeoutDur)
defer timeout.Stop()
Expand Down Expand Up @@ -1029,6 +1036,7 @@ func acquireLock(

if lock == nil {
l.Warnf("unable to get watcher %s lock, maybe some other process has it...", schedulerName)
break
}

if lock.IsLocked() {
Expand Down
2 changes: 1 addition & 1 deletion metadata/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
package metadata

//Version of Maestro
var Version = "8.1.1"
var Version = "8.2.1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not 8.2.0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

8.2.0 was released already. Forgot to change the metadata =P


//KubeVersion is the desired Kubernetes version
var KubeVersion = "v1.10.0"
7 changes: 7 additions & 0 deletions models/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,10 @@ const Global = "global"

// PodNotFitsHostPorts is a message when the pod's host port is no available in any node of the pool
const PodNotFitsHostPorts = "PodFitsHostPorts"

// InvalidPodWaitingStates are all the states that are not accepted in a waiting pod.
var InvalidPodWaitingStates = []string{
"ErrImageNeverPull",
"ErrImagePullBackOff",
"CrashLoopBackOff",
lftakakura marked this conversation as resolved.
Show resolved Hide resolved
}
32 changes: 32 additions & 0 deletions models/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,38 @@ func IsPodReady(pod *v1.Pod) bool {
return false
}

// ValidatePodWaitingState returns nil if pod waiting reson is valid and error otherwise
// Errors checked:
// - ErrImageNeverPull
// - CrashLoopBackOff
// - ErrImagePullBackOff
func ValidatePodWaitingState(pod *v1.Pod) error {

for _, invalidState := range InvalidPodWaitingStates {
status := &pod.Status
if checkWaitingReason(status, invalidState) {
return fmt.Errorf("one or more containers in pod are in %s", invalidState)
}
}

return nil
}

func checkWaitingReason(status *v1.PodStatus, reason string) bool {
if status == nil {
return false
}

for _, containerStatus := range status.ContainerStatuses {
state := containerStatus.State
if state.Waiting != nil && state.Waiting.Reason == reason {
return true
}
}

return false
}

// PodPending returns true if pod is with status Pending.
// In this case, also returns reason for being pending and message.
func PodPending(pod *v1.Pod) (isPending bool, reason, message string) {
Expand Down
2 changes: 1 addition & 1 deletion testing/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func MockRedisLock(
calls = NewCalls()

calls.Add(mockRedisClient.EXPECT().
SetNX(lockKey, gomock.Any(), time.Duration(lockTimeoutMs)*time.Millisecond).
SetNX(lockKey, gomock.Any(), gomock.Any()).
Return(goredis.NewBoolResult(lockResult, errLock)))

return calls
Expand Down
2 changes: 1 addition & 1 deletion watcher/watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ func (w *Watcher) EnsureCorrectRooms() error {
)

if downScalingBlocked {
logger.Infof("not able to ensure correct rooms. Rolling update in place for scheduler %s", scheduler.Name)
logger.Debugf("not able to ensure correct rooms. Rolling update in place for scheduler %s", scheduler.Name)
return nil
}

Expand Down