Skip to content

Commit

Permalink
fix: use controller-runtime standard healthz endpoints
Browse files Browse the repository at this point in the history
Fixes #717

`caps-controller-manager` has proper webhook support, so use that for
readiness/liveness checks (standard CAPI way).

`sidero-controller-manager` doesn't have webhooks (we should fix it
eventually!), so it uses an iPXE endpoint check instead.

Also:

* bump Talos to 0.14.1
* use Talos-provided default arguments for the agent environment (this
makes sense, as the agent runs the Talos kernel).

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Feb 1, 2022
1 parent c73d8e5 commit e44f350
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 66 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ MODULE := $(shell head -1 go.mod | cut -d' ' -f2)

ARTIFACTS := _out
TEST_PKGS ?= ./...
TALOS_RELEASE ?= v0.14.0-alpha.2
TALOS_RELEASE ?= v0.14.1
PREVIOUS_TALOS_RELEASE ?= v0.13.4
DEFAULT_K8S_VERSION ?= v1.22.3

Expand Down
12 changes: 12 additions & 0 deletions app/caps-controller-manager/config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,16 @@ spec:
requests:
cpu: 100m
memory: 128Mi
ports:
- containerPort: 9440
name: healthz
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
port: healthz
livenessProbe:
httpGet:
path: /healthz
port: healthz
terminationGracePeriodSeconds: 10
33 changes: 25 additions & 8 deletions app/caps-controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ func init() {
func main() {
var (
metricsAddr string
healthAddr string
enableLeaderElection bool
webhookPort int
)

flag.StringVar(&metricsAddr, "metrics-bind-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthAddr, "health-addr", ":9440", "The address the health endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", true,
"Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
flag.IntVar(&webhookPort, "webhook-port", 9443, "Webhook Server port, disabled by default. When enabled, the manager will only work as webhook server, no reconcilers are installed.")
Expand All @@ -82,12 +84,13 @@ func main() {
})

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "controller-leader-election-capm",
Port: webhookPort,
EventBroadcaster: broadcaster,
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "controller-leader-election-capm",
Port: webhookPort,
EventBroadcaster: broadcaster,
HealthProbeBindAddress: healthAddr,
})
if err != nil {
setupLog.Error(err, "unable to start manager")
Expand All @@ -110,7 +113,7 @@ func main() {
mgr.GetScheme(),
corev1.EventSource{Component: "caps-controller-manager"})

ctx := context.Background()
ctx := ctrl.SetupSignalHandler()

if err = (&controllers.MetalClusterReconciler{
Client: mgr.GetClient(),
Expand Down Expand Up @@ -162,10 +165,24 @@ func main() {
}
// +kubebuilder:scaffold:builder

setupChecks(mgr)

setupLog.Info("starting manager")

if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
if err := mgr.Start(ctx); err != nil {
setupLog.Error(err, "problem running manager")
os.Exit(1)
}
}

// setupChecks registers the manager's readiness and liveness checks, both
// named "webhook" and both backed by the webhook server's StartedChecker.
//
// If either registration fails, the error is logged and the process exits
// with status 1.
func setupChecks(mgr ctrl.Manager) {
	registrations := []struct {
		failure  string
		register func() error
	}{
		{
			failure: "unable to create ready check",
			register: func() error {
				return mgr.AddReadyzCheck("webhook", mgr.GetWebhookServer().StartedChecker())
			},
		},
		{
			failure: "unable to create health check",
			register: func() error {
				return mgr.AddHealthzCheck("webhook", mgr.GetWebhookServer().StartedChecker())
			},
		},
	}

	// Readiness is registered before liveness; the first failure aborts startup.
	for _, r := range registrations {
		if err := r.register(); err != nil {
			setupLog.Error(err, r.failure)
			os.Exit(1)
		}
	}
}
11 changes: 6 additions & 5 deletions app/sidero-controller-manager/config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ spec:
- name: http
containerPort: ${SIDERO_CONTROLLER_MANAGER_CONTAINER_API_PORT:=8081}
protocol: TCP
- containerPort: 9440
name: healthz
protocol: TCP
env:
- name: API_ENDPOINT
valueFrom:
Expand All @@ -96,14 +99,12 @@ spec:
memory: 128Mi
readinessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 15
path: /readyz
port: healthz
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 15
port: healthz
- command:
- /siderolink-manager
args:
Expand Down
19 changes: 0 additions & 19 deletions app/sidero-controller-manager/internal/healthz/healthz.go

This file was deleted.

48 changes: 37 additions & 11 deletions app/sidero-controller-manager/internal/ipxe/ipxe_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ import (
"context"
"errors"
"fmt"
"io"
"log"
"net"
"net/http"
"strconv"
"strings"
"text/template"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -23,9 +25,11 @@ import (
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"

"github.com/talos-systems/go-procfs/procfs"
talosconstants "github.com/talos-systems/talos/pkg/machinery/constants"
"github.com/talos-systems/talos/pkg/machinery/kernel"

infrav1 "github.com/talos-systems/sidero/app/caps-controller-manager/api/v1alpha3"
metalv1alpha1 "github.com/talos-systems/sidero/app/sidero-controller-manager/api/v1alpha1"
Expand Down Expand Up @@ -369,23 +373,15 @@ func newEnvironment(server *metalv1alpha1.Server, serverBinding *infrav1.ServerB
}

func newAgentEnvironment(arch string) *metalv1alpha1.Environment {
args := []string{
args := append([]string(nil), kernel.DefaultArgs...)
args = append(args,
"console=tty0",
"console=ttyS0",
"ima_appraise=fix",
"ima_hash=sha512",
"ima_template=ima-ng",
"initrd=initramfs.xz",
"ip=dhcp",
"page_poison=1",
"panic=30",
"printk.devkmsg=on",
"pti=on",
"random.trust_cpu=on",
"slab_nomerge=",
"slub_debug=P",
fmt.Sprintf("%s=%s:%d", constants.AgentEndpointArg, apiEndpoint, apiPort),
}
)

cmdline := procfs.NewCmdline(strings.Join(args, " "))
extra := procfs.NewCmdline(extraAgentKernelArgs)
Expand Down Expand Up @@ -513,3 +509,33 @@ func markAsPXEBooted(server *metalv1alpha1.Server) error {
Conditions: []clusterv1.ConditionType{metalv1alpha1.ConditionPXEBooted},
})
}

// Check returns a health checker that probes the iPXE HTTP server at addr
// (host:port) by issuing a GET request for /boot.ipxe.
//
// The returned checker reports success only when the request completes with
// 200 OK within 10 seconds. The probe inherits cancellation from the incoming
// health request, so an abandoned probe does not keep the outbound request
// running until the timeout. Any other outcome is returned as an error.
func Check(addr string) healthz.Checker {
	// The probed URL depends only on addr, so build it once rather than per check.
	endpoint := fmt.Sprintf("http://%s/boot.ipxe", addr)

	return func(req *http.Request) error {
		// Fall back to a background context if the checker is invoked without a request.
		parent := context.Background()
		if req != nil {
			parent = req.Context()
		}

		ctx, cancel := context.WithTimeout(parent, 10*time.Second)
		defer cancel()

		probe, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
		if err != nil {
			return fmt.Errorf("building iPXE probe request: %w", err)
		}

		resp, err := http.DefaultClient.Do(probe)
		if err != nil {
			return fmt.Errorf("probing iPXE endpoint: %w", err)
		}

		// Drain and close the body so the underlying connection can be reused;
		// the body is always non-nil when Do returns a nil error.
		defer func() {
			io.Copy(io.Discard, resp.Body) //nolint:errcheck
			resp.Body.Close()              //nolint:errcheck
		}()

		if resp.StatusCode != http.StatusOK {
			return fmt.Errorf("unexpected code %d", resp.StatusCode)
		}

		return nil
	}
}
42 changes: 27 additions & 15 deletions app/sidero-controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (
infrav1 "github.com/talos-systems/sidero/app/caps-controller-manager/api/v1alpha3"
metalv1alpha1 "github.com/talos-systems/sidero/app/sidero-controller-manager/api/v1alpha1"
"github.com/talos-systems/sidero/app/sidero-controller-manager/controllers"
"github.com/talos-systems/sidero/app/sidero-controller-manager/internal/healthz"
"github.com/talos-systems/sidero/app/sidero-controller-manager/internal/ipxe"
"github.com/talos-systems/sidero/app/sidero-controller-manager/internal/metadata"
"github.com/talos-systems/sidero/app/sidero-controller-manager/internal/power/api"
Expand Down Expand Up @@ -71,6 +70,7 @@ func init() {
func main() {
var (
metricsAddr string
healthAddr string
apiEndpoint string
apiPort int
httpPort int
Expand All @@ -91,6 +91,7 @@ func main() {
flag.IntVar(&apiPort, "api-port", 8081, "The TCP port Sidero components can be reached at from the servers.")
flag.IntVar(&httpPort, "http-port", 8081, "The TCP port Sidero controller manager HTTP server is running.")
flag.StringVar(&metricsAddr, "metrics-bind-addr", ":8081", "The address the metric endpoint binds to.")
flag.StringVar(&healthAddr, "health-addr", ":9440", "The address the health endpoint binds to.")
flag.StringVar(&extraAgentKernelArgs, "extra-agent-kernel-args", "", "A list of Linux kernel command line arguments to add to the agent environment kernel parameters (e.g. 'console=tty1 console=ttyS1').")
flag.StringVar(&bootFromDiskMethod, "boot-from-disk-method", string(ipxe.BootIPXEExit), "Default method to use to boot server from disk if it hits iPXE endpoint after install.")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", true, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
Expand Down Expand Up @@ -150,11 +151,12 @@ func main() {
api.DefaultDice = api.NewFailureDice(testPowerSimulatedExplicitFailureProb, testPowerSimulatedSilentFailureProb)

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "controller-leader-election-sidero-controller-manager",
Port: 9443,
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "controller-leader-election-sidero-controller-manager",
Port: 9443,
HealthProbeBindAddress: healthAddr,
})
if err != nil {
setupLog.Error(err, "unable to start manager")
Expand All @@ -177,7 +179,7 @@ func main() {
mgr.GetScheme(),
corev1.EventSource{Component: "sidero-controller-manager"})

ctx := context.Background()
ctx := ctrl.SetupSignalHandler()

if err = (&controllers.EnvironmentReconciler{
Client: mgr.GetClient(),
Expand Down Expand Up @@ -212,6 +214,9 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "ServerClass")
os.Exit(1)
}

setupChecks(mgr, httpPort)

// +kubebuilder:scaffold:builder

errCh := make(chan error)
Expand Down Expand Up @@ -242,13 +247,6 @@ func main() {
os.Exit(1)
}

setupLog.Info("starting healthz server")

if err := healthz.RegisterServer(httpMux); err != nil {
setupLog.Error(err, "unable to start healthz server", "controller", "Environment")
os.Exit(1)
}

setupLog.Info("starting internal API server")

apiRecorder := eventBroadcaster.NewRecorder(
Expand Down Expand Up @@ -283,7 +281,7 @@ func main() {
setupLog.Info("starting manager and HTTP server")

go func() {
err := mgr.Start(ctrl.SetupSignalHandler())
err := mgr.Start(ctx)
if err != nil {
setupLog.Error(err, "problem running manager")
}
Expand Down Expand Up @@ -326,3 +324,17 @@ func main() {
}
}
}

// setupChecks registers the iPXE endpoint probe as both the readiness and the
// liveness check of the manager.
//
// The probe targets the controller manager's own HTTP server on the loopback
// address at httpPort. If either registration fails, the error is logged and
// the process exits with status 1.
func setupChecks(mgr ctrl.Manager, httpPort int) {
	addr := fmt.Sprintf("127.0.0.1:%d", httpPort)

	if err := mgr.AddReadyzCheck("ipxe", ipxe.Check(addr)); err != nil {
		setupLog.Error(err, "unable to create ready check")
		os.Exit(1)
	}

	// Named "ipxe" (previously "webhook"): this check probes the iPXE
	// endpoint, not a webhook server, and the name should match the
	// readiness check above.
	if err := mgr.AddHealthzCheck("ipxe", ipxe.Check(addr)); err != nil {
		setupLog.Error(err, "unable to create health check")
		os.Exit(1)
	}
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ require (
github.com/talos-systems/grpc-proxy v0.2.0
github.com/talos-systems/net v0.3.1
github.com/talos-systems/siderolink v0.1.1-0.20211130121818-9902ad2774f0
github.com/talos-systems/talos/pkg/machinery v0.14.0
github.com/talos-systems/talos/pkg/machinery v0.14.1
go.uber.org/zap v1.20.0
golang.org/x/net v0.0.0-20220114011407-0dd24b26b47d
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
Expand Down
3 changes: 2 additions & 1 deletion go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -663,8 +663,9 @@ github.com/talos-systems/siderolink v0.1.1-0.20211130121818-9902ad2774f0/go.mod
github.com/talos-systems/talos/pkg/machinery v0.12.3/go.mod h1:qX77JMZawrDTQaJucqecdlFsHy+dbnZ9YL8Kw4qL7d4=
github.com/talos-systems/talos/pkg/machinery v0.13.0/go.mod h1:fQx1FlvFLSexSOYL1DSl0EjtazujlzNmVDCt2yRoLJ4=
github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211118180932-1ffa8e048008/go.mod h1:D8NT4Aj+X2OpA6yK6RAtpw1wcgkDS7oD23vqOQWRiP8=
github.com/talos-systems/talos/pkg/machinery v0.14.0 h1:UKk33z236rMWHsSMhu6ExlG1uB5dF7jws3qRDP+yycA=
github.com/talos-systems/talos/pkg/machinery v0.14.0/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc=
github.com/talos-systems/talos/pkg/machinery v0.14.1 h1:ecvzW8OMlWxfdGsiL6cVwtEOd4IwIYTIgRaEEFxyuTc=
github.com/talos-systems/talos/pkg/machinery v0.14.1/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
Expand Down
4 changes: 2 additions & 2 deletions sfyra/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ require (
github.com/talos-systems/go-retry v0.3.1
github.com/talos-systems/net v0.3.1
github.com/talos-systems/sidero v0.0.0-00010101000000-000000000000
github.com/talos-systems/talos v0.14.0
github.com/talos-systems/talos/pkg/machinery v0.14.0
github.com/talos-systems/talos v0.14.1
github.com/talos-systems/talos/pkg/machinery v0.14.1
google.golang.org/grpc v1.43.0
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
k8s.io/api v0.23.1
Expand Down
7 changes: 4 additions & 3 deletions sfyra/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1125,13 +1125,14 @@ github.com/talos-systems/net v0.3.1 h1:F9mlDgKE4XVfgpoRmacVUTEAMAeQ5xuOaeCl+A9a0
github.com/talos-systems/net v0.3.1/go.mod h1:zhcGixNJz9dgwFiUwc7gkkAqdVqXagU1SNNoIVXYKGo=
github.com/talos-systems/siderolink v0.1.0/go.mod h1:bEGwDYl9QgC3oZ4kdnJTuR2HX/XlUhxZjx/QAakKuBc=
github.com/talos-systems/siderolink v0.1.1-0.20211130121818-9902ad2774f0/go.mod h1:bEGwDYl9QgC3oZ4kdnJTuR2HX/XlUhxZjx/QAakKuBc=
github.com/talos-systems/talos v0.14.0 h1:3RyC7FgpQ5ZWjiyfz57u4qG/l0+pR52hDSJnGPxBgZg=
github.com/talos-systems/talos v0.14.0/go.mod h1:UWuLwoQGsUHEddTqwfvxNOyYTOzd2n9KiV+pyfIJT5M=
github.com/talos-systems/talos v0.14.1 h1:z6pr4mvs32DiG8M46tb88v2d9jBNLEBq6zxWgSlTC2U=
github.com/talos-systems/talos v0.14.1/go.mod h1:GavvnvQzEp4A00+R8ecpd26TpXWVroRhKLavIK3P2fI=
github.com/talos-systems/talos/pkg/machinery v0.12.3/go.mod h1:qX77JMZawrDTQaJucqecdlFsHy+dbnZ9YL8Kw4qL7d4=
github.com/talos-systems/talos/pkg/machinery v0.13.0/go.mod h1:fQx1FlvFLSexSOYL1DSl0EjtazujlzNmVDCt2yRoLJ4=
github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211118180932-1ffa8e048008/go.mod h1:D8NT4Aj+X2OpA6yK6RAtpw1wcgkDS7oD23vqOQWRiP8=
github.com/talos-systems/talos/pkg/machinery v0.14.0 h1:UKk33z236rMWHsSMhu6ExlG1uB5dF7jws3qRDP+yycA=
github.com/talos-systems/talos/pkg/machinery v0.14.0/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc=
github.com/talos-systems/talos/pkg/machinery v0.14.1 h1:ecvzW8OMlWxfdGsiL6cVwtEOd4IwIYTIgRaEEFxyuTc=
github.com/talos-systems/talos/pkg/machinery v0.14.1/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc=
github.com/tchap/go-patricia v2.2.6+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I=
github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
Expand Down

0 comments on commit e44f350

Please sign in to comment.