diff --git a/pkg/environment/config.go b/pkg/environment/config.go index 89b5f1bd..0993c476 100644 --- a/pkg/environment/config.go +++ b/pkg/environment/config.go @@ -31,6 +31,7 @@ type Config struct { RolloutUpgrade struct { TestFarms []uint32 `json:"test_farms"` } `json:"rollout_upgrade"` + RelaysURLs []string `json:"relays_urls"` } // Merge, updates current config with cfg merging and override config diff --git a/pkg/environment/environment.go b/pkg/environment/environment.go index 3b5b5895..37498c9d 100644 --- a/pkg/environment/environment.go +++ b/pkg/environment/environment.go @@ -7,6 +7,7 @@ import ( "sync" "github.com/pkg/errors" + "github.com/rs/zerolog/log" substrate "github.com/threefoldtech/tfchain/clients/tfchain-client-go" "github.com/threefoldtech/zosbase/pkg" @@ -47,7 +48,7 @@ type Environment struct { // IMPORTANT NOTICE: // SINCE RELAYS FOR A NODE IS STORED ON THE CHAIN IN A LIMITED SPACE // PLEASE MAKE SURE THAT ANY ENV HAS NO MORE THAN FOUR RELAYS CONFIGURED - RelayURL []string + relaysURLs []string ActivationURL []string GraphQL []string KycURL string @@ -112,7 +113,7 @@ var ( "wss://tfchain.dev.grid.tf/", "wss://tfchain.02.dev.grid.tf", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.dev.grid.tf", "wss://relay.02.dev.grid.tf", }, @@ -136,7 +137,7 @@ var ( "wss://tfchain.test.grid.tf/", "wss://tfchain.02.test.grid.tf", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.test.grid.tf", "wss://relay.02.test.grid.tf", }, @@ -160,7 +161,7 @@ var ( "wss://tfchain.qa.grid.tf/", "wss://tfchain.02.qa.grid.tf/", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.qa.grid.tf", "wss://relay.02.qa.grid.tf", }, @@ -187,9 +188,9 @@ var ( "wss://03.tfchain.grid.tf/", "wss://04.tfchain.grid.tf/", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.grid.tf", - "wss://relay.02.grid.tf", + // "wss://relay.02.grid.tf", }, ActivationURL: []string{ "https://activation.grid.tf/activation/activate", @@ -224,13 +225,22 @@ 
func Get() (Environment, error) { if err != nil { return Environment{}, err } - if params.IsV4() { - env.FlistURL = "redis://v4.hub.grid.tf:9940" - } return env, nil } +func GetRelaysURLs() []string { + config, err := GetConfig() + if err == nil && len(config.RelaysURLs) > 0 { + log.Debug().Msg("using relays urls from zos-config") + return config.RelaysURLs + } + + log.Debug().Msg("using relays urls from environment") + env := MustGet() + return env.relaysURLs +} + // GetSubstrate gets a client to subsrate blockchain func GetSubstrate() (substrate.Manager, error) { env, err := Get() @@ -281,7 +291,7 @@ func getEnvironmentFromParams(params kernel.Params) (Environment, error) { if relay, ok := params.Get("relay"); ok { if len(relay) > 0 { - env.RelayURL = relay + env.relaysURLs = relay } } @@ -368,5 +378,10 @@ func getEnvironmentFromParams(params kernel.Params) (Environment, error) { env.BinRepo = e } + // if the node is running v4, change FlistURL to use v4.hub.grid.tf + if params.IsV4() { + env.FlistURL = "redis://v4.hub.grid.tf:9940" + } + return env, nil } diff --git a/pkg/perf/healthcheck/healthcheck.go b/pkg/perf/healthcheck/healthcheck.go index 9cbcbf0f..725005e0 100644 --- a/pkg/perf/healthcheck/healthcheck.go +++ b/pkg/perf/healthcheck/healthcheck.go @@ -86,7 +86,7 @@ func (h *healthcheckTask) Run(ctx context.Context) (interface{}, error) { } if len(errors) != 0 { - return fmt.Errorf("failed health check") + return fmt.Errorf("failed health check %s", errorsToStrings(errors)) } return nil diff --git a/pkg/perf/healthcheck/network.go b/pkg/perf/healthcheck/network.go index a509fcb5..a7f9c1cf 100644 --- a/pkg/perf/healthcheck/network.go +++ b/pkg/perf/healthcheck/network.go @@ -5,6 +5,7 @@ import ( "fmt" "net" "net/url" + "strings" "sync" "time" @@ -13,64 +14,100 @@ import ( "github.com/threefoldtech/zosbase/pkg/environment" ) -const defaultRequestTimeout = 5 * time.Second +const defaultRequestTimeout = 10 * time.Second +// function: at least one instance of each
service should be reachable +// returns errors as a report for perf healthcheck +// a side effect: set/delete the not-reachable flag func networkCheck(ctx context.Context) []error { - env := environment.MustGet() - servicesUrl := []string{env.FlistURL} - - servicesUrl = append(append(servicesUrl, env.SubstrateURL...), env.RelayURL...) - servicesUrl = append(append(servicesUrl, env.ActivationURL...), env.GraphQL...) + var ( + wg sync.WaitGroup + errMu sync.Mutex + errors []error + ) - var errors []error + env := environment.MustGet() + services := map[string][]string{ + "substrate": env.SubstrateURL, + "activation": env.ActivationURL, + "relay": environment.GetRelaysURLs(), + "graphql": env.GraphQL, + "hub": {env.FlistURL}, + "kyc": {env.KycURL}, + } - var wg sync.WaitGroup - var mut sync.Mutex - for _, serviceUrl := range servicesUrl { + for service, instances := range services { wg.Add(1) - go func(serviceUrl string) { + go func(service string, instances []string) { defer wg.Done() - err := checkService(ctx, serviceUrl) - if err != nil { - mut.Lock() - defer mut.Unlock() - + if err := verifyAtLeastOneIsReachable(ctx, service, instances); err != nil { + errMu.Lock() errors = append(errors, err) + errMu.Unlock() } - }(serviceUrl) + + }(service, instances) } + wg.Wait() if len(errors) == 0 { + log.Debug().Msg("all network checks passed") if err := app.DeleteFlag(app.NotReachable); err != nil { - log.Error().Err(err).Msg("failed to delete readonly flag") + log.Error().Err(err).Msg("failed to delete not-reachable flag") + } + } else { + log.Warn().Int("failed_checks", len(errors)).Msg("some network checks failed") + if err := app.SetFlag(app.NotReachable); err != nil { + log.Error().Err(err).Msg("failed to set not-reachable flag") } } return errors } +func verifyAtLeastOneIsReachable(ctx context.Context, service string, instances []string) error { + if len(instances) == 0 { + return fmt.Errorf("no instances provided for service %s", service) + } + + var 
unreachableErrors []string + for _, instance := range instances { + if err := checkService(ctx, instance); err == nil { + return nil + } else { + unreachableErrors = append(unreachableErrors, err.Error()) + } + } + + return fmt.Errorf("all %s instances are unreachable: %s", service, strings.Join(unreachableErrors, "; ")) +} + func checkService(ctx context.Context, serviceUrl string) error { - ctx, cancel := context.WithTimeout(ctx, defaultRequestTimeout) + timeoutCtx, cancel := context.WithTimeout(ctx, defaultRequestTimeout) defer cancel() - address := parseUrl(serviceUrl) - err := isReachable(ctx, address) + address, err := parseUrl(serviceUrl) if err != nil { - if err := app.SetFlag(app.NotReachable); err != nil { - log.Error().Err(err).Msg("failed to set not reachable flag") - } + return fmt.Errorf("invalid URL %s: %w", serviceUrl, err) + } + + if err := isReachable(timeoutCtx, address); err != nil { return fmt.Errorf("%s is not reachable: %w", serviceUrl, err) } return nil } -func parseUrl(serviceUrl string) string { +func parseUrl(serviceUrl string) (string, error) { u, err := url.Parse(serviceUrl) if err != nil { - return "" + return "", fmt.Errorf("failed to parse URL: %w", err) + } + + if u.Host == "" { + return "", fmt.Errorf("missing hostname in URL") } port := ":80" @@ -82,11 +119,11 @@ func parseUrl(serviceUrl string) string { u.Host += port } - return u.Host + return u.Host, nil } func isReachable(ctx context.Context, address string) error { - d := net.Dialer{Timeout: defaultRequestTimeout} + var d net.Dialer conn, err := d.DialContext(ctx, "tcp", address) if err != nil { return fmt.Errorf("failed to connect: %w", err)