-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
POC: Add an HTTP endpoint for liveness checks #2382
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -307,18 +307,12 @@ func (s *server) run() error { | |
return err | ||
} | ||
|
||
if err := s.veleroResourcesExist(); err != nil { | ||
return err | ||
} | ||
|
||
if err := s.validateBackupStorageLocations(); err != nil { | ||
return err | ||
} | ||
|
||
if _, err := s.veleroClient.VeleroV1().BackupStorageLocations(s.namespace).Get(s.config.defaultBackupLocation, metav1.GetOptions{}); err != nil { | ||
s.logger.WithError(errors.WithStack(err)). | ||
Warnf("A backup storage location named %s has been specified for the server to use by default, but no corresponding backup storage location exists. Backups with a location not matching the default will need to explicitly specify an existing location", s.config.defaultBackupLocation) | ||
} | ||
// Anything that could be created after Velero has started will be | ||
// queried as part of the liveness check | ||
go func() { | ||
s.logger.Info("Starting liveness server at port 2000") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be nice to have the port number "2000" either as a constant or configurable via variable. Same for line 861. |
||
s.runLiveness() | ||
}() | ||
|
||
if err := s.initRestic(); err != nil { | ||
return err | ||
|
@@ -380,7 +374,7 @@ func (s *server) veleroResourcesExist() error { | |
} | ||
|
||
if veleroGroupVersion == nil { | ||
return errors.Errorf("Velero API group %s not found. Apply examples/common/00-prereqs.yaml to create it.", api.SchemeGroupVersion) | ||
return errors.Errorf("Velero API group %s not found. Create the CRDs via `velero install`, Helm, or applying YAML to proceed.", api.SchemeGroupVersion) | ||
} | ||
|
||
foundResources := sets.NewString() | ||
|
@@ -399,7 +393,7 @@ func (s *server) veleroResourcesExist() error { | |
} | ||
|
||
if len(errs) > 0 { | ||
errs = append(errs, errors.New("Velero custom resources not found - apply examples/common/00-prereqs.yaml to update the custom resource definitions")) | ||
errs = append(errs, errors.New("Velero custom resources not found. Create the CRDs via `velero install`, Helm, or applying YAML to proceed.")) | ||
return kubeerrs.NewAggregate(errs) | ||
} | ||
|
||
|
@@ -832,3 +826,39 @@ func (s *server) runProfiler() { | |
s.logger.WithError(errors.WithStack(err)).Error("error running profiler http server") | ||
} | ||
} | ||
|
||
// runLiveness starts an HTTP server to handle liveness checks for the velero server. | ||
// Note that liveness checks should have generous timeouts, since Velero will query | ||
// object storage, which is possibly a network call across the internet. | ||
func (s *server) runLiveness() { | ||
mux := http.NewServeMux() | ||
mux.HandleFunc("/livez", func(w http.ResponseWriter, r *http.Request) { | ||
// Refresh the discovery helper, since it may be stale if the velero API group was | ||
// created after the pod. | ||
s.discoveryHelper.Refresh() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. somewhat related, we already have a goroutine that refreshes discovery every 5min -- so we'll end up refrehsing a lot. Not sure yet where it makes sense to have it, just food for thought right now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah; I added it because it wasn't picking up the CRDs even after I posted them. 5 minutes seems like a long time for a liveness check, but maybe we remove this refresh and change the other one to match whatever the check period is on the pod? |
||
if err := s.veleroResourcesExist(); err != nil { | ||
s.logger.Warn(err) | ||
http.NotFound(w, r) | ||
return | ||
} | ||
|
||
if err := s.validateBackupStorageLocations(); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Current logic of validateBackupStorageLocations would return error if there is any invalid BackupStorageLocation. Should Velero continue function if there is at least 1 valid BackupStorageLocation? If so, validateBackupStorageLocations should be enhanced to return more than just an error. For example return number of valid BackupStorageLocation and at this point we decide Velero should continue work if some valid BackupStorageLocation present. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This validation will change after #2617. We will change the validation logic to only error if there are no valid BackupStorageLocations. Else it will be only warnings if there's any invalid BSL. |
||
s.logger.Warn(err) | ||
http.NotFound(w, r) | ||
return | ||
} | ||
if _, err := s.veleroClient.VeleroV1().BackupStorageLocations(s.namespace).Get(s.config.defaultBackupLocation, metav1.GetOptions{}); err != nil { | ||
s.logger.WithError(errors.WithStack(err)). | ||
Warnf("A backup storage location named %s has been specified for the server to use by default, but no corresponding backup storage location exists. Backups with a location not matching the default will need to explicitly specify an existing location", s.config.defaultBackupLocation) | ||
http.NotFound(w, r) | ||
return | ||
} | ||
|
||
// Happy path, all prerequisites are present. | ||
fmt.Fprintf(w, "Velero is live") | ||
}) | ||
|
||
if err := http.ListenAndServe(":2000", mux); err != nil { | ||
s.logger.WithError(errors.WithStack(err)).Error("error running liveness http server") | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it probably makes sense to repeat the checks I deleted here before the liveness endpoint starts, but maybe just log their errors rather than returning them and halting.