Skip to content

Commit

Permalink
worker: check cloudformation stack status
Browse files Browse the repository at this point in the history
A stack update may fail because of external changes, e.g. the removal of
a referenced security group.

This change reports certain stack statuses as problems, which prevents the
`last_sync_timestamp_seconds` metric from being updated; the stale metric
can then be used to monitor stack update failures.

Signed-off-by: Alexander Yastrebov <alexander.yastrebov@zalando.de>
  • Loading branch information
AlexanderYastrebov committed Apr 27, 2022
1 parent 566005b commit d151cae
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
13 changes: 13 additions & 0 deletions aws/cf.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,19 @@ func (s *Stack) ShouldDelete() bool {
return true
}

// Err returns nil or an error describing the stack state.
//
// A nil receiver yields nil. Statuses that CloudFormation reports during a
// normal create, update or delete lifecycle are treated as healthy; any
// other status (for example a failed or rolled-back operation) is returned
// as a "bad status" error carrying the raw status string.
func (s *Stack) Err() error {
	if s == nil {
		return nil
	}

	switch s.status {
	case cloudformation.StackStatusCreateInProgress,
		cloudformation.StackStatusCreateComplete,
		cloudformation.StackStatusUpdateInProgress,
		cloudformation.StackStatusUpdateCompleteCleanupInProgress,
		cloudformation.StackStatusUpdateComplete,
		cloudformation.StackStatusDeleteInProgress,
		cloudformation.StackStatusDeleteComplete:
		// Successful or still-running operations are not problems: a stack
		// that was created and never updated stays in CREATE_COMPLETE, so
		// accepting only UPDATE_COMPLETE would flag healthy stacks forever.
		return nil
	}
	return fmt.Errorf("bad status %s", s.status)
}

type stackOutput map[string]string

func newStackOutput(outputs []*cloudformation.Output) stackOutput {
Expand Down
6 changes: 6 additions & 0 deletions worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,12 @@ func doWork(
return problems.Add("failed to list managed stacks: %w", err)
}

for _, stack := range stacks {
if err := stack.Err(); err != nil {
problems.Add("stack %s error: %w", stack.Name, err)
}
}

err = awsAdapter.UpdateAutoScalingGroupsAndInstances()
if err != nil {
return problems.Add("failed to get instances from EC2: %w", err)
Expand Down

0 comments on commit d151cae

Please sign in to comment.