Merge pull request #143 from syself/bugfix/hcloud-server-off
✨ Handle stuck servers that do not switch on
janiskemper committed Apr 8, 2022
2 parents a48bce7 + c888ea9 commit 1a8228c
Showing 6 changed files with 151 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golangci-lint.yml
@@ -19,5 +19,5 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
- version: v1.44.0 # https://github.com/golangci/golangci-lint/releases
+ version: v1.45.2 # https://github.com/golangci/golangci-lint/releases
working-directory: ${{matrix.working-directory}}
4 changes: 2 additions & 2 deletions api/v1beta1/conditions_const.go
@@ -41,8 +41,8 @@ const (
InstanceTerminatedReason = "InstanceTerminated"
// InstanceHasNonExistingPlacementGroupReason instance has a placement group name that does not exist.
InstanceHasNonExistingPlacementGroupReason = "InstanceHasNonExistingPlacementGroup"
- // InstanceHasNoValidSSHKeyReason instance has no valid ssh key.
- InstanceHasNoValidSSHKeyReason = "InstanceHasNoValidSSHKey"
+ // ServerOffReason instance is off.
+ ServerOffReason = "ServerOff"
// InstanceAsControlPlaneUnreachableReason control plane is (not yet) reachable.
InstanceAsControlPlaneUnreachableReason = "InstanceAsControlPlaneUnreachable"
)
9 changes: 9 additions & 0 deletions pkg/scope/machine.go
@@ -26,6 +26,7 @@ import (
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/patch"
)
@@ -102,6 +103,14 @@ func (m *MachineScope) PatchObject(ctx context.Context) error {
return m.patchHelper.Patch(ctx, m.HCloudMachine)
}

// SetError sets the FailureMessage and FailureReason fields on the machine status,
// marking the machine as terminally failed in the Cluster API sense.
func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
m.HCloudMachine.Status.FailureMessage = &message
m.HCloudMachine.Status.FailureReason = &reason
}

// IsBootstrapDataReady checks the readiness of a capi machine's bootstrap data.
func (m *MachineScope) IsBootstrapDataReady(ctx context.Context) bool {
return m.Machine.Spec.Bootstrap.DataSecretName != nil
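The new SetError helper is how this fix surfaces a terminal failure: in Cluster API, populating FailureReason and FailureMessage on an infrastructure machine marks it as failed, so a MachineHealthCheck can remediate it. A minimal standalone sketch of the pattern — the status struct is a simplified stand-in for HCloudMachineStatus, not the real type; only the capierrors import and constant come from the actual library:

package main

import (
	"fmt"

	capierrors "sigs.k8s.io/cluster-api/errors"
)

// machineStatus is a simplified stand-in for HCloudMachineStatus; it models
// only the two fields that SetError touches.
type machineStatus struct {
	FailureMessage *string
	FailureReason  *capierrors.MachineStatusError
}

// setError mirrors MachineScope.SetError above: it stores the message and
// reason by pointer, marking the machine as terminally failed.
func (s *machineStatus) setError(message string, reason capierrors.MachineStatusError) {
	s.FailureMessage = &message
	s.FailureReason = &reason
}

func main() {
	var status machineStatus
	status.setError("reached timeout of waiting for machines that are switched off", capierrors.CreateMachineError)
	fmt.Println(*status.FailureMessage, *status.FailureReason) // prints the message and "CreateError"
}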
52 changes: 45 additions & 7 deletions pkg/services/hcloud/server/server.go
@@ -30,13 +30,15 @@ import (
"github.com/syself/cluster-api-provider-hetzner/pkg/utils"
corev1 "k8s.io/api/core/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

const maxShutDownTime = 2 * time.Minute
const serverOffTimeout = 10 * time.Minute

// Service defines struct with machine scope to reconcile HCloud machines.
type Service struct {
@@ -88,13 +90,13 @@ func (s *Service) Reconcile(ctx context.Context) (_ *ctrl.Result, err error) {

switch server.Status {
case hcloud.ServerStatusOff:
- // Check if server is in ServerStatusOff and turn it on. This is to avoid a bug of Hetzner where
- // sometimes machines are created and not turned on
- // Don't do this when server is just created
- if _, err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil {
- return nil, errors.Wrap(err, "failed to power on server")
- }
+ return s.handleServerStatusOff(ctx, server)
+ case hcloud.ServerStatusStarting:
+ // Requeue here so that the server does not switch back and forth between off and starting.
+ // If we didn't return here, the InstanceReady condition would get marked as true in this
+ // case. However, if the server is stuck and does not power on, we should not mark
+ // InstanceReady as true, so that the server can be remediated after a timeout.
+ return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
case hcloud.ServerStatusRunning: // Do nothing
default:
s.scope.HCloudMachine.Status.Ready = false
@@ -128,6 +130,42 @@ func (s *Service) Reconcile(ctx context.Context) (_ *ctrl.Result, err error) {
return nil, nil
}

func (s *Service) handleServerStatusOff(ctx context.Context, server *hcloud.Server) (*reconcile.Result, error) {
// Try to turn the server back on. This works around a Hetzner bug where machines
// are sometimes created but never powered on.

condition := conditions.Get(s.scope.HCloudMachine, infrav1.InstanceReadyCondition)
if condition != nil &&
condition.Status == corev1.ConditionFalse &&
condition.Reason == infrav1.ServerOffReason {
if time.Now().Before(condition.LastTransitionTime.Time.Add(serverOffTimeout)) {
// Not timed out yet; try to power the server on again.
if _, err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil {
return nil, errors.Wrap(err, "failed to power on server")
}
} else {
// Timed out. Set failure reason
s.scope.SetError("reached timeout of waiting for machines that are switched off", capierrors.CreateMachineError)
return nil, nil
}
} else {
// No condition is set yet. Try to power the server on.
if _, err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil {
return nil, errors.Wrap(err, "failed to power on server")
}
conditions.MarkFalse(
s.scope.HCloudMachine,
infrav1.InstanceReadyCondition,
infrav1.ServerOffReason,
clusterv1.ConditionSeverityInfo,
"server is switched off",
)
}

// Try again in 30 sec.
return &reconcile.Result{RequeueAfter: 30 * time.Second}, nil
}

func (s *Service) reconcileNetworkAttachment(ctx context.Context, server *hcloud.Server) error {
// If no network exists, then do nothing
if s.scope.HetznerCluster.Status.Network == nil {
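The heart of the change above is the timeout check against the condition's LastTransitionTime: the first reconcile that sees the server off records the ServerOff condition, and every later reconcile measures how long the condition has been false. A minimal, self-contained sketch of just that time arithmetic, using a plain time.Time in place of the conditions API:

package main

import (
	"fmt"
	"time"
)

const serverOffTimeout = 10 * time.Minute

// timedOut reports whether the server has been off for longer than
// serverOffTimeout, given when the ServerOff condition was first recorded.
// It is the negation of the time.Now().Before(...) check used above.
func timedOut(lastTransition, now time.Time) bool {
	return !now.Before(lastTransition.Add(serverOffTimeout))
}

func main() {
	firstSeenOff := time.Now().Add(-time.Hour)      // condition recorded an hour ago
	fmt.Println(timedOut(firstSeenOff, time.Now())) // true: well past the 10-minute timeout

	justSeenOff := time.Now().Add(-30 * time.Second) // condition recorded 30s ago
	fmt.Println(timedOut(justSeenOff, time.Now()))   // false: keep retrying power-on
}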
14 changes: 14 additions & 0 deletions pkg/services/hcloud/server/server_suite_test.go
@@ -25,6 +25,9 @@ import (
"github.com/hetznercloud/hcloud-go/hcloud/schema"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
hcloudclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/client"
corev1 "k8s.io/api/core/v1"
)

@@ -205,3 +208,14 @@ var _ = BeforeSuite(func() {

server = hcloud.ServerFromSchema(serverSchema)
})

func newTestService(hcloudMachine *infrav1.HCloudMachine, hcloudClient hcloudclient.Client) *Service {
return &Service{
&scope.MachineScope{
HCloudMachine: hcloudMachine,
ClusterScope: scope.ClusterScope{
HCloudClient: hcloudClient,
},
},
}
}
80 changes: 80 additions & 0 deletions pkg/services/hcloud/server/server_test.go
@@ -17,11 +17,20 @@ limitations under the License.
package server

import (
"context"
"time"

"github.com/hetznercloud/hcloud-go/hcloud"
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
fakeclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/client/fake"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

var _ = Describe("setStatusFromAPI", func() {
@@ -156,3 +165,74 @@ var _ = Describe("getSSHKeys", func() {
Expect(err).To(HaveOccurred())
})
})

var _ = Describe("handleServerStatusOff", func() {
var hcloudMachine *infrav1.HCloudMachine
client := fakeclient.NewHCloudClientFactory().NewClient("")

res, err := client.CreateServer(context.Background(), hcloud.ServerCreateOpts{Name: "serverName"})
Expect(err).To(Succeed())
server := res.Server

BeforeEach(func() {
hcloudMachine = &infrav1.HCloudMachine{
ObjectMeta: metav1.ObjectMeta{
Name: "hcloudMachineName",
Namespace: "default",
},
Spec: infrav1.HCloudMachineSpec{
ImageName: "fedora-control-plane",
Type: "cpx31",
},
}

server.Status = hcloud.ServerStatusOff
})

It("sets a condition if none is previously set", func() {
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
Expect(res).Should(Equal(&reconcile.Result{RequeueAfter: 30 * time.Second}))
Expect(conditions.GetReason(hcloudMachine, infrav1.InstanceReadyCondition)).To(Equal(infrav1.ServerOffReason))
Expect(server.Status).To(Equal(hcloud.ServerStatusRunning))
})

It("tries to power on server again if it is not timed out", func() {
conditions.MarkFalse(hcloudMachine, infrav1.InstanceReadyCondition, infrav1.ServerOffReason, clusterv1.ConditionSeverityInfo, "")
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
Expect(res).Should(Equal(&reconcile.Result{RequeueAfter: 30 * time.Second}))
Expect(server.Status).To(Equal(hcloud.ServerStatusRunning))
Expect(conditions.GetReason(hcloudMachine, infrav1.InstanceReadyCondition)).To(Equal(infrav1.ServerOffReason))
})

It("sets a failure message if it timed out", func() {
conditions.MarkFalse(hcloudMachine, infrav1.InstanceReadyCondition, infrav1.ServerOffReason, clusterv1.ConditionSeverityInfo, "")
// manipulate lastTransitionTime
conditionsList := hcloudMachine.GetConditions()
for i, c := range conditionsList {
if c.Type == infrav1.InstanceReadyCondition {
conditionsList[i].LastTransitionTime = metav1.NewTime(time.Now().Add(-time.Hour))
}
}
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
var nilResult *reconcile.Result
Expect(res).Should(Equal(nilResult))
Expect(server.Status).To(Equal(hcloud.ServerStatusOff))
Expect(hcloudMachine.Status.FailureMessage).Should(Equal(pointer.String("reached timeout of waiting for machines that are switched off")))
})

It("tries to power on server and sets new condition if different one is set", func() {
conditions.MarkTrue(hcloudMachine, infrav1.InstanceReadyCondition)
service := newTestService(hcloudMachine, client)
res, err := service.handleServerStatusOff(context.Background(), server)
Expect(err).To(Succeed())
Expect(res).Should(Equal(&reconcile.Result{RequeueAfter: 30 * time.Second}))
Expect(conditions.GetReason(hcloudMachine, infrav1.InstanceReadyCondition)).To(Equal(infrav1.ServerOffReason))
Expect(server.Status).To(Equal(hcloud.ServerStatusRunning))
})
})
