Skip to content

Commit

Permalink
Add wait period to allow image pull error recovery (#997)
Browse files Browse the repository at this point in the history
Previously, any image pull error was treated as final and caused the
plugin to be marked as failing. With this change, we allow a window of
5 minutes from when the pod started for the pod to recover from these
errors, as they may be transient.

Signed-off-by: Bridget McErlean <bmcerlean@vmware.com>
  • Loading branch information
zubron authored and johnSchnake committed Nov 15, 2019
1 parent 6f983e2 commit ad67ceb
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 3 deletions.
11 changes: 8 additions & 3 deletions pkg/plugin/driver/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (
// that we consider it a failure mode. This handles the situation where the plugin container
// exits without returning results.
terminatedContainerWindow = 5 * time.Minute

// maxWaitForImageTime is the amount of time, measured from the pod's start time, that we allow for pods to
// recover from failed image pulls. Image pull errors seen within this window are treated as transient and do not mark the plugin as failed.
maxWaitForImageTime = 5 * time.Minute
)

// GetSessionID generates a new session id.
Expand Down Expand Up @@ -62,10 +66,11 @@ func IsPodFailing(pod *v1.Pod) (bool, string) {
return true, errstr
}

// Check if it can't fetch its image
// Check if it can't fetch its image within the maximum wait time
if waiting := cstatus.State.Waiting; waiting != nil {
if waiting.Reason == "ImagePullBackOff" || waiting.Reason == "ErrImagePull" {
errstr := fmt.Sprintf("Container %v is in state %v", cstatus.Name, waiting.Reason)
elapsedPodTime := time.Now().Sub(pod.Status.StartTime.Time)
if elapsedPodTime > maxWaitForImageTime && (waiting.Reason == "ImagePullBackOff" || waiting.Reason == "ErrImagePull") {
errstr := fmt.Sprintf("Failed to pull image for container %v within %v. Container is in state %v", cstatus.Name, maxWaitForImageTime, waiting.Reason)
return true, errstr
}
}
Expand Down
65 changes: 65 additions & 0 deletions pkg/plugin/driver/utils/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,71 @@ func TestPodFailing(t *testing.T) {
}
return p
}),
}, {
desc: "ImagePullBackOff is not considered a failure if elapsed time within wait window",
expectFailing: false,
expectMsg: "",
pod: fromGoodPod(func(p *corev1.Pod) *corev1.Pod {
p.Status.StartTime = &metav1.Time{Time: time.Now().Add(maxWaitForImageTime / -2)}
p.Status.ContainerStatuses = []corev1.ContainerStatus{
{State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ImagePullBackOff",
},
}},
}
return p
}),
}, {
desc: "ImagePullBackOff is considered a failure if elapsed time greater than wait window",
expectFailing: true,
expectMsg: "Failed to pull image for container error-container within 5m0s. Container is in state ImagePullBackOff",
pod: fromGoodPod(func(p *corev1.Pod) *corev1.Pod {
p.Status.StartTime = &metav1.Time{Time: time.Now().Add(-maxWaitForImageTime)}
p.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: "error-container",
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ImagePullBackOff",
},
}},
}
return p
}),
}, {
desc: "ErrImagePull is considered a failure if elapsed time greater than wait window",
expectFailing: true,
expectMsg: "Failed to pull image for container error-container within 5m0s. Container is in state ErrImagePull",
pod: fromGoodPod(func(p *corev1.Pod) *corev1.Pod {
p.Status.StartTime = &metav1.Time{Time: time.Now().Add(-maxWaitForImageTime)}
p.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: "error-container",
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ErrImagePull",
},
}},
}
return p
}),
}, {
desc: "Other wait reason not considered a failure if elapsed time greater than wait window",
expectFailing: false,
expectMsg: "",
pod: fromGoodPod(func(p *corev1.Pod) *corev1.Pod {
p.Status.StartTime = &metav1.Time{Time: time.Now().Add(-maxWaitForImageTime)}
p.Status.ContainerStatuses = []corev1.ContainerStatus{
{
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ContainerCreating",
},
}},
}
return p
}),
},
}

Expand Down

0 comments on commit ad67ceb

Please sign in to comment.