Skip to content

Commit

Permalink
health check: add on-failure actions
Browse files Browse the repository at this point in the history
For systems that have extreme robustness requirements (edge devices,
particularly those in difficult to access environments), it is important
that applications continue running in all circumstances. When the
application fails, Podman must restart it automatically to provide this
robustness. Otherwise, these devices may require customer IT to
physically gain access to restart, which can be prohibitively difficult.

Add a new `--health-on-failure` flag that supports four actions:

- **none**: Take no action.

- **kill**: Kill the container.

- **restart**: Restart the container.  Do not combine the `restart`
               action with the `--restart` flag.  When running inside of
               a systemd unit, consider using the `kill` or `stop`
               action instead to make use of systemd's restart policy.

- **stop**: Stop the container.

To remain backwards compatible, **none** is the default action.

Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
  • Loading branch information
vrothberg committed Sep 9, 2022
1 parent 6d8bafe commit aad29e7
Show file tree
Hide file tree
Showing 19 changed files with 340 additions and 39 deletions.
5 changes: 5 additions & 0 deletions cmd/podman/common/completion.go
Expand Up @@ -1641,3 +1641,8 @@ func AutocompleteSSH(cmd *cobra.Command, args []string, toComplete string) ([]st
}
return []string{string(ssh.GolangMode), string(ssh.NativeMode)}, cobra.ShellCompDirectiveNoFileComp
}

// AutocompleteHealthOnFailure provides shell completion for the
// --health-on-failure flag: it offers the supported on-failure action names
// and disables file-name completion.
func AutocompleteHealthOnFailure(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
	actions := define.SupportedHealthCheckOnFailureActions
	return actions, cobra.ShellCompDirectiveNoFileComp
}
8 changes: 8 additions & 0 deletions cmd/podman/common/create.go
Expand Up @@ -208,6 +208,14 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)

healthOnFailureFlagName := "health-on-failure"
createFlags.StringVar(
&cf.HealthOnFailure,
healthOnFailureFlagName, "none",
"action to take once the container turns unhealthy",
)
_ = cmd.RegisterFlagCompletionFunc(healthOnFailureFlagName, AutocompleteHealthOnFailure)

createFlags.BoolVar(
&cf.HTTPProxy,
"http-proxy", containerConfig.Containers.HTTPProxy,
Expand Down
8 changes: 8 additions & 0 deletions docs/source/markdown/options/health-on-failure.md
@@ -0,0 +1,8 @@
#### **--health-on-failure**=*action*

Action to take once the container transitions to an unhealthy state. The default is **none**.

- **none**: Take no action.
- **kill**: Kill the container.
- **restart**: Restart the container. Do not combine the `restart` action with the `--restart` flag. When running inside of a systemd unit, consider using the `kill` or `stop` action instead to make use of systemd's restart policy.
- **stop**: Stop the container.
2 changes: 2 additions & 0 deletions docs/source/markdown/podman-create.1.md.in
Expand Up @@ -185,6 +185,8 @@ Read in a line delimited file of environment variables. See **Environment** note

@@option health-interval

@@option health-on-failure

@@option health-retries

@@option health-start-period
Expand Down
2 changes: 2 additions & 0 deletions docs/source/markdown/podman-run.1.md.in
Expand Up @@ -221,6 +221,8 @@ Read in a line delimited file of environment variables. See **Environment** note

@@option health-interval

@@option health-on-failure

@@option health-retries

@@option health-start-period
Expand Down
3 changes: 3 additions & 0 deletions libpod/container_config.go
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/containers/common/libnetwork/types"
"github.com/containers/common/pkg/secrets"
"github.com/containers/image/v5/manifest"
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/podman/v4/pkg/namespaces"
"github.com/containers/podman/v4/pkg/specgen"
"github.com/containers/storage"
Expand Down Expand Up @@ -392,6 +393,8 @@ type ContainerMiscConfig struct {
Systemd *bool `json:"systemd,omitempty"`
// HealthCheckConfig has the health check command and related timings
HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"`
// HealthCheckOnFailureAction defines an action to take once the container turns unhealthy.
HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"`
// PreserveFDs is a number of additional file descriptors (in addition
// to 0, 1, 2) that will be passed to the executed process. The total FDs
// passed will be 3 + PreserveFDs.
Expand Down
2 changes: 2 additions & 0 deletions libpod/container_inspect.go
Expand Up @@ -390,6 +390,8 @@ func (c *Container) generateInspectContainerConfig(spec *spec.Spec) *define.Insp
// leak.
ctrConfig.Healthcheck = c.config.HealthCheckConfig

ctrConfig.HealthcheckOnFailureAction = c.config.HealthCheckOnFailureAction.String()

ctrConfig.CreateCommand = c.config.CreateCommand

ctrConfig.Timezone = c.config.Timezone
Expand Down
4 changes: 4 additions & 0 deletions libpod/container_validate.go
Expand Up @@ -137,5 +137,9 @@ func (c *Container) validate() error {
if c.config.SdNotifyMode == define.SdNotifyModeIgnore && len(c.config.SdNotifySocket) > 0 {
return fmt.Errorf("cannot set sd-notify socket %q with sd-notify mode %q", c.config.SdNotifySocket, c.config.SdNotifyMode)
}

if c.config.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone && c.config.HealthCheckConfig == nil {
return fmt.Errorf("cannot set on-failure action to %s without a health check", c.config.HealthCheckOnFailureAction.String())
}
return nil
}
2 changes: 2 additions & 0 deletions libpod/define/container_inspect.go
Expand Up @@ -55,6 +55,8 @@ type InspectContainerConfig struct {
StopSignal uint `json:"StopSignal"`
// Configured healthcheck for the container
Healthcheck *manifest.Schema2HealthConfig `json:"Healthcheck,omitempty"`
// HealthcheckOnFailureAction defines an action to take once the container turns unhealthy.
HealthcheckOnFailureAction string `json:"HealthcheckOnFailureAction,omitempty"`
// CreateCommand is the full command plus arguments of the process the
// container has been created with.
CreateCommand []string `json:"CreateCommand,omitempty"`
Expand Down
74 changes: 74 additions & 0 deletions libpod/define/healthchecks.go
@@ -1,5 +1,10 @@
package define

import (
"fmt"
"strings"
)

const (
// HealthCheckHealthy describes a healthy container
HealthCheckHealthy string = "healthy"
Expand Down Expand Up @@ -57,3 +62,72 @@ const (
// HealthConfigTestCmdShell runs commands with the system's default shell
HealthConfigTestCmdShell = "CMD-SHELL"
)

// HealthCheckOnFailureAction is the action Podman takes once a container's
// health status turns unhealthy.
type HealthCheckOnFailureAction int

// Health-check on-failure actions. The values are assigned by iota in
// declaration order, so the order of the entries must not change.
const (
	// HealthCheckOnFailureActionNone instructs Podman to not react on an
	// unhealthy status. It must remain the first iota (zero) value for
	// backwards compatibility.
	HealthCheckOnFailureActionNone = iota
	// HealthCheckOnFailureActionInvalid denotes an invalid on-failure policy.
	HealthCheckOnFailureActionInvalid
	// HealthCheckOnFailureActionKill instructs Podman to kill the container
	// on an unhealthy status.
	HealthCheckOnFailureActionKill
	// HealthCheckOnFailureActionRestart instructs Podman to restart the
	// container on an unhealthy status.
	HealthCheckOnFailureActionRestart
	// HealthCheckOnFailureActionStop instructs Podman to stop the container
	// on an unhealthy status.
	HealthCheckOnFailureActionStop
)

// String representations for on-failure actions. These are the names that
// String returns and that ParseHealthCheckOnFailureAction accepts.
const (
// "none": take no action.
strHealthCheckOnFailureActionNone = "none"
// "invalid": returned by String for any value that is not a known action;
// it is not accepted by ParseHealthCheckOnFailureAction.
strHealthCheckOnFailureActionInvalid = "invalid"
// "kill": kill the container.
strHealthCheckOnFailureActionKill = "kill"
// "restart": restart the container.
strHealthCheckOnFailureActionRestart = "restart"
// "stop": stop the container.
strHealthCheckOnFailureActionStop = "stop"
)

// SupportedHealthCheckOnFailureActions lists the string names of all supported
// health-check on-failure actions. The "invalid" placeholder is not included.
var SupportedHealthCheckOnFailureActions = []string{
strHealthCheckOnFailureActionNone,
strHealthCheckOnFailureActionKill,
strHealthCheckOnFailureActionRestart,
strHealthCheckOnFailureActionStop,
}

// String returns the textual name of the on-failure action. Any value that is
// not one of the none, kill, restart or stop actions yields the "invalid"
// name.
func (h HealthCheckOnFailureAction) String() string {
	name := strHealthCheckOnFailureActionInvalid
	switch h {
	case HealthCheckOnFailureActionNone:
		name = strHealthCheckOnFailureActionNone
	case HealthCheckOnFailureActionKill:
		name = strHealthCheckOnFailureActionKill
	case HealthCheckOnFailureActionRestart:
		name = strHealthCheckOnFailureActionRestart
	case HealthCheckOnFailureActionStop:
		name = strHealthCheckOnFailureActionStop
	}
	return name
}

// ParseHealthCheckOnFailureAction converts the name of an on-failure action
// into its HealthCheckOnFailureAction value. The empty string is treated like
// "none". For any unrecognized name, HealthCheckOnFailureActionInvalid is
// returned together with an error listing the supported actions.
func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, error) {
	if s == "" || s == strHealthCheckOnFailureActionNone {
		return HealthCheckOnFailureActionNone, nil
	}
	if s == strHealthCheckOnFailureActionKill {
		return HealthCheckOnFailureActionKill, nil
	}
	if s == strHealthCheckOnFailureActionRestart {
		return HealthCheckOnFailureActionRestart, nil
	}
	if s == strHealthCheckOnFailureActionStop {
		return HealthCheckOnFailureActionStop, nil
	}

	supported := strings.Join(SupportedHealthCheckOnFailureActions, ",")
	return HealthCheckOnFailureActionInvalid, fmt.Errorf("invalid on-failure action %q for health check: supported actions are %s", s, supported)
}
41 changes: 40 additions & 1 deletion libpod/healthcheck.go
Expand Up @@ -2,6 +2,7 @@ package libpod

import (
"bufio"
"context"
"errors"
"fmt"
"io/ioutil"
Expand All @@ -12,6 +13,7 @@ import (

"github.com/containers/podman/v4/libpod/define"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

const (
Expand All @@ -29,9 +31,14 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) {
if err != nil {
return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err)
}

hcStatus, err := checkHealthCheckCanBeRun(container)
if err == nil {
return container.runHealthCheck()
hcStatus, err := container.runHealthCheck()
if err := container.processHealthCheckStatus(hcStatus); err != nil {
return hcStatus, err
}
return hcStatus, err
}
return hcStatus, err
}
Expand Down Expand Up @@ -127,13 +134,45 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
hcResult = define.HealthCheckFailure
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
}

hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil {
return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
}

return hcResult, hcErr
}

// processHealthCheckStatus runs the container's configured on-failure action
// in response to the given health-check status.
//
// A status of define.HealthCheckSuccess is a no-op. For any other status the
// action stored in c.config.HealthCheckOnFailureAction is executed: none (do
// nothing), kill (send SIGKILL), restart (using the container's configured
// stop timeout) or stop. An error is returned when the action itself fails or
// when the configured action is not one of the known values.
func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error {
	// NOTE(review): every non-success status triggers the action, not only an
	// explicit failure - confirm this is intended for all status values.
	if status == define.HealthCheckSuccess {
		return nil
	}

	switch c.config.HealthCheckOnFailureAction {
	case define.HealthCheckOnFailureActionNone: // Nothing to do

	case define.HealthCheckOnFailureActionKill:
		if err := c.Kill(uint(unix.SIGKILL)); err != nil {
			return fmt.Errorf("killing container after health-check turned unhealthy: %w", err)
		}

	case define.HealthCheckOnFailureActionRestart:
		if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil {
			return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err)
		}

	case define.HealthCheckOnFailureActionStop:
		if err := c.Stop(); err != nil {
			return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err)
		}

	default: // Should not happen but better be safe than sorry
		return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction)
	}

	return nil
}

func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
cstate, err := c.State()
if err != nil {
Expand Down
11 changes: 11 additions & 0 deletions libpod/options.go
Expand Up @@ -1473,6 +1473,17 @@ func WithHealthCheck(healthCheck *manifest.Schema2HealthConfig) CtrCreateOption
}
}

// WithHealthCheckOnFailureAction returns a container create option that stores
// the given on-failure action in the container's config. Applying the option
// fails with define.ErrCtrFinalized when the container is already marked
// valid.
func WithHealthCheckOnFailureAction(action define.HealthCheckOnFailureAction) CtrCreateOption {
	setAction := func(ctr *Container) error {
		if !ctr.valid {
			ctr.config.HealthCheckOnFailureAction = action
			return nil
		}
		return define.ErrCtrFinalized
	}
	return setAction
}

// WithPreserveFDs forwards from the process running Libpod into the container
// the given number of extra FDs (starting after the standard streams) to the created container
func WithPreserveFDs(fd uint) CtrCreateOption {
Expand Down
1 change: 1 addition & 0 deletions pkg/domain/entities/pods.go
Expand Up @@ -212,6 +212,7 @@ type ContainerCreateOptions struct {
HealthRetries uint
HealthStartPeriod string
HealthTimeout string
HealthOnFailure string
Hostname string `json:"hostname,omitempty"`
HTTPProxy bool
HostUsers []string
Expand Down
4 changes: 4 additions & 0 deletions pkg/specgen/generate/container_create.go
Expand Up @@ -515,6 +515,10 @@ func createContainerOptions(rt *libpod.Runtime, s *specgen.SpecGenerator, pod *l
logrus.Debugf("New container has a health check")
}

if s.ContainerHealthCheckConfig.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone {
options = append(options, libpod.WithHealthCheckOnFailureAction(s.ContainerHealthCheckConfig.HealthCheckOnFailureAction))
}

if len(s.Secrets) != 0 {
manager, err := rt.SecretsManager()
if err != nil {
Expand Down
4 changes: 3 additions & 1 deletion pkg/specgen/specgen.go
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/containers/common/libimage"
nettypes "github.com/containers/common/libnetwork/types"
"github.com/containers/image/v5/manifest"
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/storage/types"
spec "github.com/opencontainers/runtime-spec/specs-go"
)
Expand Down Expand Up @@ -533,7 +534,8 @@ type ContainerResourceConfig struct {
// ContainerHealthCheckConfig describes a container healthcheck with attributes
// like command, retries, interval, start period, and timeout.
type ContainerHealthCheckConfig struct {
HealthConfig *manifest.Schema2HealthConfig `json:"healthconfig,omitempty"`
HealthConfig *manifest.Schema2HealthConfig `json:"healthconfig,omitempty"`
HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"health_check_on_failure_action,omitempty"`
}

// SpecGenerator creates an OCI spec and Libpod configuration options to create
Expand Down
7 changes: 7 additions & 0 deletions pkg/specgenutil/specgen.go
Expand Up @@ -265,6 +265,13 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions
Test: []string{"NONE"},
}
}

onFailureAction, err := define.ParseHealthCheckOnFailureAction(c.HealthOnFailure)
if err != nil {
return err
}
s.HealthCheckOnFailureAction = onFailureAction

if err := setNamespaces(s, c); err != nil {
return err
}
Expand Down

0 comments on commit aad29e7

Please sign in to comment.