From 00bdde3424f0613aa601136d75d4f7c07b0e2797 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Sun, 27 Apr 2025 16:31:03 +0300 Subject: [PATCH 1/3] refactor network health check - require at least one instance of each service to be alive --- pkg/perf/healthcheck/network.go | 89 +++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/pkg/perf/healthcheck/network.go b/pkg/perf/healthcheck/network.go index a509fcb5..f8533b80 100644 --- a/pkg/perf/healthcheck/network.go +++ b/pkg/perf/healthcheck/network.go @@ -5,6 +5,7 @@ import ( "fmt" "net" "net/url" + "strings" "sync" "time" @@ -15,62 +16,98 @@ import ( const defaultRequestTimeout = 5 * time.Second +// function: at least one instance of each service should be reachable +// returns errors as a report for perf healthcheck +// a side effect: set/delete the not-reachable flag func networkCheck(ctx context.Context) []error { env := environment.MustGet() - servicesUrl := []string{env.FlistURL} - - servicesUrl = append(append(servicesUrl, env.SubstrateURL...), env.RelayURL...) - servicesUrl = append(append(servicesUrl, env.ActivationURL...), env.GraphQL...) + services := map[string][]string{ + "substrate": env.SubstrateURL, + "relay": env.RelayURL, + "activation": env.ActivationURL, + "graphql": env.GraphQL, + "hub": {env.FlistURL}, + "kyc": {env.KycURL}, + } - var errors []error + var ( + wg sync.WaitGroup + errMu sync.Mutex + errors []error + ) - var wg sync.WaitGroup - var mut sync.Mutex - for _, serviceUrl := range servicesUrl { + for service, instances := range services { wg.Add(1) - go func(serviceUrl string) { + go func(service string, instances []string) { defer wg.Done() - err := checkService(ctx, serviceUrl) - if err != nil { - mut.Lock() - defer mut.Unlock() - + if err := verifyAtLeastOneIsReachable(ctx, service, instances); err != nil { + errMu.Lock() errors = append(errors, err) + errMu.Unlock() } - }(serviceUrl) + + }(service, instances) } + wg.Wait() if len(errors) == 0 { + log.Debug().Msg("all network checks passed") if err := app.DeleteFlag(app.NotReachable); err != nil { - log.Error().Err(err).Msg("failed to delete readonly flag") + log.Error().Err(err).Msg("failed to delete not-reachable flag") + } + } else { + log.Warn().Int("failed_checks", len(errors)).Msg("some network checks failed") + if err := app.SetFlag(app.NotReachable); err != nil { + log.Error().Err(err).Msg("failed to set not-reachable flag") } } return errors } +func verifyAtLeastOneIsReachable(ctx context.Context, service string, instances []string) error { + if len(instances) == 0 { + return fmt.Errorf("no instances provided for service %s", service) + } + + var unreachableErrors []string + for _, instance := range instances { + if err := checkService(ctx, instance); err == nil { + return nil + } else { + unreachableErrors = append(unreachableErrors, err.Error()) + } + } + + return fmt.Errorf("all %s instances are unreachable: %s", service, strings.Join(unreachableErrors, "; ")) +} + func checkService(ctx context.Context, serviceUrl string) error { - ctx, cancel := context.WithTimeout(ctx, defaultRequestTimeout) + timeoutCtx, cancel := context.WithTimeout(ctx, defaultRequestTimeout) defer cancel() - address := parseUrl(serviceUrl) - err := isReachable(ctx, address) + address, err := parseUrl(serviceUrl) if err != nil { - if err := app.SetFlag(app.NotReachable); err != nil { - log.Error().Err(err).Msg("failed to set not reachable flag") - } + return fmt.Errorf("invalid URL %s: %w", serviceUrl, err) + } + + if err := isReachable(timeoutCtx, address); err != nil { return fmt.Errorf("%s is not reachable: %w", serviceUrl, err) } return nil } -func parseUrl(serviceUrl string) string { +func parseUrl(serviceUrl string) (string, error) { u, err := url.Parse(serviceUrl) if err != nil { - return "" + return "", fmt.Errorf("failed to parse URL: %w", err) + } + + if u.Host == "" { + return "", fmt.Errorf("missing hostname in URL") } port := ":80" @@ -82,11 +119,11 @@ func parseUrl(serviceUrl string) string { u.Host += port } - return u.Host + return u.Host, nil } func isReachable(ctx context.Context, address string) error { - d := net.Dialer{Timeout: defaultRequestTimeout} + var d net.Dialer conn, err := d.DialContext(ctx, "tcp", address) if err != nil { return fmt.Errorf("failed to connect: %w", err) From 46e6df26aad03e1423668ee289094b266bd066ea Mon Sep 17 00:00:00 2001 From: Ashraf Fouda Date: Wed, 16 Apr 2025 16:03:20 +0200 Subject: [PATCH 2/3] adds set relays urls from zos config Signed-off-by: Ashraf Fouda --- pkg/environment/config.go | 1 + pkg/environment/environment.go | 37 ++++++++++++++++++++++------- pkg/perf/healthcheck/healthcheck.go | 2 +- pkg/perf/healthcheck/network.go | 20 ++++++++++------ 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/pkg/environment/config.go b/pkg/environment/config.go index 89b5f1bd..0993c476 100644 --- a/pkg/environment/config.go +++ b/pkg/environment/config.go @@ -31,6 +31,7 @@ type Config struct { RolloutUpgrade struct { TestFarms []uint32 `json:"test_farms"` } `json:"rollout_upgrade"` + RelaysURLs []string `json:"relays_urls"` } // Merge, updates current config with cfg merging and override config diff --git a/pkg/environment/environment.go b/pkg/environment/environment.go index 3b5b5895..8eb9ceec 100644 --- a/pkg/environment/environment.go +++ b/pkg/environment/environment.go @@ -7,6 +7,7 @@ import ( "sync" "github.com/pkg/errors" + "github.com/rs/zerolog/log" substrate "github.com/threefoldtech/tfchain/clients/tfchain-client-go" "github.com/threefoldtech/zosbase/pkg" @@ -47,7 +48,7 @@ type Environment struct { // IMPORTANT NOTICE: // SINCE RELAYS FOR A NODE IS STORED ON THE CHAIN IN A LIMITED SPACE // PLEASE MAKE SURE THAT ANY ENV HAS NO MORE THAN FOUR RELAYS CONFIGURED - RelayURL []string + relaysURLs []string ActivationURL []string GraphQL []string KycURL string @@ -112,7 +113,7 @@ var ( "wss://tfchain.dev.grid.tf/", "wss://tfchain.02.dev.grid.tf", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.dev.grid.tf", "wss://relay.02.dev.grid.tf", }, @@ -136,7 +137,7 @@ var ( "wss://tfchain.test.grid.tf/", "wss://tfchain.02.test.grid.tf", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.test.grid.tf", "wss://relay.02.test.grid.tf", }, @@ -160,7 +161,7 @@ var ( "wss://tfchain.qa.grid.tf/", "wss://tfchain.02.qa.grid.tf/", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.qa.grid.tf", "wss://relay.02.qa.grid.tf", }, @@ -187,7 +188,7 @@ var ( "wss://03.tfchain.grid.tf/", "wss://04.tfchain.grid.tf/", }, - RelayURL: []string{ + relaysURLs: []string{ "wss://relay.grid.tf", "wss://relay.02.grid.tf", }, @@ -224,13 +225,26 @@ func Get() (Environment, error) { if err != nil { return Environment{}, err } - if params.IsV4() { - env.FlistURL = "redis://v4.hub.grid.tf:9940" - } return env, nil } +func GetRelaysURLs() (relaysUrls []string, err error) { + relaysUrls = MustGet().relaysURLs + + config, err := GetConfig() + // if error happened when getting config from github just ignore and use the one in the env + if err != nil { + log.Error().Err(err).Msg("failed to get relays urls from zos-config") + return + } + if len(config.RelaysURLs) > 0 { + log.Debug().Msg("found relays urls in zos-config") + return config.RelaysURLs, nil + } + return +} + // GetSubstrate gets a client to subsrate blockchain func GetSubstrate() (substrate.Manager, error) { env, err := Get() @@ -281,7 +295,7 @@ func getEnvironmentFromParams(params kernel.Params) (Environment, error) { if relay, ok := params.Get("relay"); ok { if len(relay) > 0 { - env.RelayURL = relay + env.relaysURLs = relay } } @@ -368,5 +382,10 @@ func getEnvironmentFromParams(params kernel.Params) (Environment, error) { env.BinRepo = e } + // if the node running v4 chage flisturl to use v4.hub.grid.tf + if params.IsV4() { + env.FlistURL = "redis://v4.hub.grid.tf:9940" + } + return env, nil } diff --git a/pkg/perf/healthcheck/healthcheck.go b/pkg/perf/healthcheck/healthcheck.go index 9cbcbf0f..725005e0 100644 --- a/pkg/perf/healthcheck/healthcheck.go +++ b/pkg/perf/healthcheck/healthcheck.go @@ -86,7 +86,7 @@ func (h *healthcheckTask) Run(ctx context.Context) (interface{}, error) { } if len(errors) != 0 { - return fmt.Errorf("failed health check") + return fmt.Errorf("failed health check %s", errorsToStrings(errors)) } return nil diff --git a/pkg/perf/healthcheck/network.go b/pkg/perf/healthcheck/network.go index f8533b80..79cd2afd 100644 --- a/pkg/perf/healthcheck/network.go +++ b/pkg/perf/healthcheck/network.go @@ -14,27 +14,33 @@ import ( "github.com/threefoldtech/zosbase/pkg/environment" ) -const defaultRequestTimeout = 5 * time.Second +const defaultRequestTimeout = 10 * time.Second // function: at least one instance of each service should be reachable // returns errors as a report for perf healthcheck // a side effect: set/delete the not-reachable flag func networkCheck(ctx context.Context) []error { + var ( + wg sync.WaitGroup + errMu sync.Mutex + errors []error + ) + env := environment.MustGet() services := map[string][]string{ "substrate": env.SubstrateURL, - "relay": env.RelayURL, "activation": env.ActivationURL, "graphql": env.GraphQL, "hub": {env.FlistURL}, "kyc": {env.KycURL}, } - var ( - wg sync.WaitGroup - errMu sync.Mutex - errors []error - ) + relays, err := environment.GetRelaysURLs() + if err != nil { + errors = append(errors, fmt.Errorf("failed to get relays urls %w", err)) + } else { + services["relays"] = relays + } for service, instances := range services { wg.Add(1) From 865f3a934f00bcd675795f8485f387d24abd3343 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Thu, 1 May 2025 16:22:43 +0300 Subject: [PATCH 3/3] modify GetRelaysUrls to not propagate error --- pkg/environment/environment.go | 22 +++++++++------------- pkg/perf/healthcheck/network.go | 8 +------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/pkg/environment/environment.go b/pkg/environment/environment.go index 8eb9ceec..37498c9d 100644 --- a/pkg/environment/environment.go +++ b/pkg/environment/environment.go @@ -190,7 +190,7 @@ var ( }, relaysURLs: []string{ "wss://relay.grid.tf", - "wss://relay.02.grid.tf", + // "wss://relay.02.grid.tf", }, ActivationURL: []string{ "https://activation.grid.tf/activation/activate", @@ -229,20 +229,16 @@ func Get() (Environment, error) { return env, nil } -func GetRelaysURLs() (relaysUrls []string, err error) { - relaysUrls = MustGet().relaysURLs - +func GetRelaysURLs() []string { config, err := GetConfig() - // if error happened when getting config from github just ignore and use the one in the env - if err != nil { - log.Error().Err(err).Msg("failed to get relays urls from zos-config") - return + if err == nil && len(config.RelaysURLs) > 0 { + log.Debug().Msg("using relays urls from zos-config") + return config.RelaysURLs } - if len(config.RelaysURLs) > 0 { - log.Debug().Msg("found relays urls in zos-config") - return config.RelaysURLs, nil - } - return + + log.Debug().Msg("using relays urls from environment") + env := MustGet() + return env.relaysURLs } // GetSubstrate gets a client to subsrate blockchain diff --git a/pkg/perf/healthcheck/network.go b/pkg/perf/healthcheck/network.go index 79cd2afd..a7f9c1cf 100644 --- a/pkg/perf/healthcheck/network.go +++ b/pkg/perf/healthcheck/network.go @@ -30,18 +30,12 @@ func networkCheck(ctx context.Context) []error { services := map[string][]string{ "substrate": env.SubstrateURL, "activation": env.ActivationURL, + "relay": environment.GetRelaysURLs(), "graphql": env.GraphQL, "hub": {env.FlistURL}, "kyc": {env.KycURL}, } - relays, err := environment.GetRelaysURLs() - if err != nil { - errors = append(errors, fmt.Errorf("failed to get relays urls %w", err)) - } else { - services["relays"] = relays - } - for service, instances := range services { wg.Add(1) go func(service string, instances []string) {