✨ Handle stuck servers that do not switch on
In HCloud, servers sometimes get stuck in status off, and even repeatedly
powering them on does not help. This commit introduces logic that ensures
such servers are marked as failed after a certain timeout.
janiskemper committed Apr 7, 2022
1 parent a48bce7 commit 483aafb
Showing 5 changed files with 150 additions and 9 deletions.
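At its core, the change records an InstanceReady=False condition with reason ServerOff the first time a server is seen in status off, keeps retrying to power the server on, and gives up once the condition's LastTransitionTime is older than the 10-minute serverOffTimeout. A condensed sketch of that timeout check follows; the helper name serverOffTimedOut is illustrative only, the real logic lives in handleServerStatusOff in pkg/services/hcloud/server/server.go below.

import (
    "time"

    infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
    corev1 "k8s.io/api/core/v1"
    "sigs.k8s.io/cluster-api/util/conditions"
)

// serverOffTimedOut reports whether the machine has been stuck in "server off"
// for longer than the given timeout. The timer starts when the InstanceReady
// condition is first marked False with reason ServerOff, since that is when
// its LastTransitionTime gets set.
func serverOffTimedOut(machine *infrav1.HCloudMachine, timeout time.Duration) bool {
    condition := conditions.Get(machine, infrav1.InstanceReadyCondition)
    if condition == nil ||
        condition.Status != corev1.ConditionFalse ||
        condition.Reason != infrav1.ServerOffReason {
        // The server has not been recorded as off yet, so the timer has not started.
        return false
    }
    return time.Now().After(condition.LastTransitionTime.Time.Add(timeout))
}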
4 changes: 2 additions & 2 deletions api/v1beta1/conditions_const.go
@@ -41,8 +41,8 @@ const (
InstanceTerminatedReason = "InstanceTerminated"
// InstanceHasNonExistingPlacementGroupReason instance has a placement group name that does not exist.
InstanceHasNonExistingPlacementGroupReason = "InstanceHasNonExistingPlacementGroup"
// InstanceHasNoValidSSHKeyReason instance has no valid ssh key.
InstanceHasNoValidSSHKeyReason = "InstanceHasNoValidSSHKey"
// ServerOffReason instance is off.
ServerOffReason = "ServerOff"
// InstanceAsControlPlaneUnreachableReason control plane is (not yet) reachable.
InstanceAsControlPlaneUnreachableReason = "InstanceAsControlPlaneUnreachable"
)
9 changes: 9 additions & 0 deletions pkg/scope/machine.go
@@ -26,6 +26,7 @@ import (
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/patch"
)
@@ -102,6 +103,14 @@ func (m *MachineScope) PatchObject(ctx context.Context) error {
return m.patchHelper.Patch(ctx, m.HCloudMachine)
}

// SetError sets the FailureMessage and FailureReason fields on the HCloudMachine
// status so that the machine is marked as permanently failed.
func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
m.HCloudMachine.Status.FailureMessage = &message
m.HCloudMachine.Status.FailureReason = &reason
}

// IsBootstrapDataReady checks the readiness of a capi machine's bootstrap data.
func (m *MachineScope) IsBootstrapDataReady(ctx context.Context) bool {
return m.Machine.Spec.Bootstrap.DataSecretName != nil
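The new SetError helper above is what turns a stuck server into a terminal failure: once FailureReason and FailureMessage are set on the HCloudMachine status, Cluster API treats the machine as permanently failed, so it can be remediated (for example by a MachineHealthCheck) rather than retried forever. A minimal usage sketch mirroring the call made from handleServerStatusOff below; the wrapper function markServerOffTimedOut is hypothetical and only illustrates the intended call.

import (
    "github.com/syself/cluster-api-provider-hetzner/pkg/scope"
    capierrors "sigs.k8s.io/cluster-api/errors"
)

// markServerOffTimedOut records a terminal failure on the machine once the
// server-off timeout has been reached.
func markServerOffTimedOut(m *scope.MachineScope) {
    m.SetError(
        "reached timeout of waiting for machines that are switched off",
        capierrors.CreateMachineError,
    )
}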
52 changes: 45 additions & 7 deletions pkg/services/hcloud/server/server.go
@@ -30,13 +30,15 @@ import (
"github.com/syself/cluster-api-provider-hetzner/pkg/utils"
corev1 "k8s.io/api/core/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

const maxShutDownTime = 2 * time.Minute
const serverOffTimeout = 10 * time.Minute

// Service defines struct with machine scope to reconcile HCloud machines.
type Service struct {
@@ -88,13 +90,13 @@ func (s *Service) Reconcile(ctx context.Context) (_ *ctrl.Result, err error) {

switch server.Status {
case hcloud.ServerStatusOff:
// Check if server is in ServerStatusOff and turn it on. This is to avoid a bug of Hetzner where
// sometimes machines are created and not turned on
// Don't do this when server is just created
if _, err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil {
return nil, errors.Wrap(err, "failed to power on server")
}

return s.handleServerStatusOff(ctx, server)
case hcloud.ServerStatusStarting:
// Requeue here so that server does not switch back and forth between off and starting.
// If we don't return here, the condition InstanceReady would get marked as true in this
// case. However, if the server is stuck and does not power on, we should not mark the
// condition InstanceReady as true to be able to remediate the server after a timeout.
return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
case hcloud.ServerStatusRunning: // Do nothing
default:
s.scope.HCloudMachine.Status.Ready = false
@@ -128,6 +130,42 @@ func (s *Service) Reconcile(ctx context.Context) (_ *ctrl.Result, err error) {
return nil, nil
}

func (s *Service) handleServerStatusOff(ctx context.Context, server *hcloud.Server) (*reconcile.Result, error) {
// Check if server is in ServerStatusOff and turn it on. This is to avoid a bug of Hetzner where
// sometimes machines are created and not turned on

condition := conditions.Get(s.scope.HCloudMachine, infrav1.InstanceReadyCondition)
if condition != nil &&
condition.Status == corev1.ConditionFalse &&
condition.Reason == infrav1.ServerOffReason {
if time.Now().Before(condition.LastTransitionTime.Time.Add(serverOffTimeout)) {
// Not yet timed out, try again to power on
if _, err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil {
return nil, errors.Wrap(err, "failed to power on server")
}
} else {
// Timed out. Set failure reason
s.scope.SetError("reached timeout of waiting for machines that are switched off", capierrors.CreateMachineError)
return nil, nil
}
} else {
// No condition set yet. Try to power server on.
if _, err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil {
return nil, errors.Wrap(err, "failed to power on server")
}
conditions.MarkFalse(
s.scope.HCloudMachine,
infrav1.InstanceReadyCondition,
infrav1.ServerOffReason,
clusterv1.ConditionSeverityInfo,
"server is switched off",
)
}

// Try again in 30 sec.
return &reconcile.Result{RequeueAfter: 30 * time.Second}, nil
}

func (s *Service) reconcileNetworkAttachment(ctx context.Context, server *hcloud.Server) error {
// If no network exists, then do nothing
if s.scope.HetznerCluster.Status.Network == nil {
14 changes: 14 additions & 0 deletions pkg/services/hcloud/server/server_suite_test.go
@@ -25,6 +25,9 @@ import (
"github.com/hetznercloud/hcloud-go/hcloud/schema"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
hcloudclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/client"
corev1 "k8s.io/api/core/v1"
)

@@ -205,3 +208,14 @@

server = hcloud.ServerFromSchema(serverSchema)
})

func newTestService(hcloudMachine *infrav1.HCloudMachine, hcloudClient hcloudclient.Client) *Service {
return &Service{
&scope.MachineScope{
HCloudMachine: hcloudMachine,
ClusterScope: scope.ClusterScope{
HCloudClient: hcloudClient,
},
},
}
}
80 changes: 80 additions & 0 deletions pkg/services/hcloud/server/server_test.go
@@ -17,11 +17,20 @@ limitations under the License.
package server

import (
"context"
"time"

"github.com/hetznercloud/hcloud-go/hcloud"
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
fakeclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/client/fake"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

var _ = Describe("setStatusFromAPI", func() {
@@ -156,3 +165,74 @@ var _ = Describe("getSSHKeys", func() {
Expect(err).To(HaveOccurred())
})
})

var _ = Describe("handleServerStatusOff", func() {
var hcloudMachine *infrav1.HCloudMachine
client := fakeclient.NewHCloudClientFactory().NewClient("")

res, err := client.CreateServer(context.Background(), hcloud.ServerCreateOpts{Name: "serverName"})
Expect(err).To(Succeed())
server := res.Server

BeforeEach(func() {
hcloudMachine = &infrav1.HCloudMachine{
ObjectMeta: metav1.ObjectMeta{
Name: "hcloudMachineName",
Namespace: "default",
},
Spec: infrav1.HCloudMachineSpec{
ImageName: "fedora-control-plane",
Type: "cpx31",
},
}

server.Status = hcloud.ServerStatusOff
})

It("sets a condition if none is previously set", func() {
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
Expect(res).Should(Equal(&reconcile.Result{RequeueAfter: 30 * time.Second}))
Expect(conditions.GetReason(hcloudMachine, infrav1.InstanceReadyCondition)).To(Equal(infrav1.ServerOffReason))
Expect(server.Status).To(Equal(hcloud.ServerStatusRunning))
})

It("tries to power on server again if it is not timed out", func() {
conditions.MarkFalse(hcloudMachine, infrav1.InstanceReadyCondition, infrav1.ServerOffReason, clusterv1.ConditionSeverityInfo, "")
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
Expect(res).Should(Equal(&reconcile.Result{RequeueAfter: 30 * time.Second}))
Expect(server.Status).To(Equal(hcloud.ServerStatusRunning))
Expect(conditions.GetReason(hcloudMachine, infrav1.InstanceReadyCondition)).To(Equal(infrav1.ServerOffReason))
})

It("sets a failure message if it timed out", func() {
conditions.MarkFalse(hcloudMachine, infrav1.InstanceReadyCondition, infrav1.ServerOffReason, clusterv1.ConditionSeverityInfo, "")
// manipulate lastTransitionTime
conditionsList := hcloudMachine.GetConditions()
for i, c := range conditionsList {
if c.Type == infrav1.InstanceReadyCondition {
conditionsList[i].LastTransitionTime = metav1.NewTime(time.Now().Add(-time.Hour))
}
}
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
var nilResult *reconcile.Result
Expect(res).Should(Equal(nilResult))
Expect(server.Status).To(Equal(hcloud.ServerStatusOff))
Expect(hcloudMachine.Status.FailureMessage).Should(Equal(pointer.String("reached timeout of waiting for machines that are switched off")))
})

It("tries to power on server and sets new condition if different one is set", func() {
conditions.MarkTrue(hcloudMachine, infrav1.InstanceReadyCondition)
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
Expect(res).Should(Equal(&reconcile.Result{RequeueAfter: 30 * time.Second}))
Expect(conditions.GetReason(hcloudMachine, infrav1.InstanceReadyCondition)).To(Equal(infrav1.ServerOffReason))
Expect(server.Status).To(Equal(hcloud.ServerStatusRunning))
})
})
