Skip to content

Commit

Permalink
Added max-unhealthy-endpoints-ratio cmdline parameter for PHC (#3081)
Browse files Browse the repository at this point in the history
Signed-off-by: Roman Zavodskikh <roman.zavodskikh@zalando.de>
Co-authored-by: Roman Zavodskikh <roman.zavodskikh@zalando.de>
  • Loading branch information
RomanZavodskikh and Roman Zavodskikh committed May 23, 2024
1 parent 0964a48 commit 07de308
Show file tree
Hide file tree
Showing 5 changed files with 270 additions and 76 deletions.
12 changes: 9 additions & 3 deletions docs/operation/operation.md
Original file line number Diff line number Diff line change
Expand Up @@ -904,15 +904,20 @@ the Skipper takes a look at previous period and if the amount of requests in the
and failed requests ratio is more than `min-drop-probability` for the given endpoints
then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio
in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC.
If the ratio of unhealthy endpoints is more than `max-unhealthy-endpoints-ratio` then PHC becomes fail-open. This effectively means
if there are too many unhealthy endpoints PHC does not try to mitigate them any more and requests are sent like there is no PHC at all.

Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead.

To enable this feature, you need to provide `-passive-health-check` option having forementioned parameters
(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined.
(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`, `max-unhealthy-endpoints-ratio`) defined.
`period`, `min-requests`, `max-drop-probability` are required parameters, it is not possible for PHC to be enabled without
them explicitly defined by user. `min-drop-probability` is implicitly defined as `0.0` if not explicitly set by user.
`max-unhealthy-endpoints-ratio` is defined as `1.0` if not explicitly set by user.
Valid examples of `-passive-health-check` are:

+ `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3`
+ `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3`
+ `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9`
+ `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9`

Expand All @@ -923,9 +928,10 @@ The parameters of `-passive-health-check` option are:

+ `period=<duration>` - the duration of stats reset period
+ `min-requests=<int>` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint
+ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint
+ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered
+ `min-drop-probabilty=[0.0 <= p < max-drop-probability)` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint
+ `max-drop-probabilty=(min-drop-probability < p <= 1.0]` - the maximum possible probability of unhealthy endpoint being not considered
while choosing the endpoint for the given request
+ `max-unhealthy-endpoints-ratio=[0.0 <= r <= 1.0]` - the maximum ratio of unhealthy endpoints for PHC to try to mitigate ongoing requests

### Metrics

Expand Down
12 changes: 10 additions & 2 deletions proxy/healthy_endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ import (
)

type healthyEndpoints struct {
rnd *rand.Rand
endpointRegistry *routing.EndpointRegistry
rnd *rand.Rand
endpointRegistry *routing.EndpointRegistry
maxUnhealthyEndpointsRatio float64
}

func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []routing.LBEndpoint, metrics metrics.Metrics) []routing.LBEndpoint {
Expand All @@ -19,16 +20,23 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout

p := h.rnd.Float64()

unhealthyEndpointsCount := 0
maxUnhealthyEndpointsCount := float64(len(endpoints)) * h.maxUnhealthyEndpointsRatio
filtered := make([]routing.LBEndpoint, 0, len(endpoints))
for _, e := range endpoints {
dropProbability := e.Metrics.HealthCheckDropProbability()
if p < dropProbability {
ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f",
e.Host, p, dropProbability)
metrics.IncCounter("passive-health-check.endpoints.dropped")
unhealthyEndpointsCount++
} else {
filtered = append(filtered, e)
}

if float64(unhealthyEndpointsCount) > maxUnhealthyEndpointsCount {
return endpoints
}
}

if len(filtered) == 0 {
Expand Down
102 changes: 97 additions & 5 deletions proxy/healthy_endpoints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,44 @@ func fireVegeta(t *testing.T, ps *httptest.Server, freq int, per time.Duration,
}

func setupProxy(t *testing.T, doc string) (*metricstest.MockMetrics, *httptest.Server) {
return setupProxyWithCustomEndpointRegisty(t, doc, defaultEndpointRegistry())
m := &metricstest.MockMetrics{}
endpointRegistry := defaultEndpointRegistry()
proxyParams := Params{
EnablePassiveHealthCheck: true,
EndpointRegistry: endpointRegistry,
Metrics: m,
PassiveHealthCheck: &PassiveHealthCheck{
MaxUnhealthyEndpointsRatio: 1.0,
},
}

return m, setupProxyWithCustomProxyParams(t, doc, proxyParams)
}

func setupProxyWithCustomEndpointRegisty(t *testing.T, doc string, endpointRegistry *routing.EndpointRegistry) (*metricstest.MockMetrics, *httptest.Server) {
m := &metricstest.MockMetrics{}

tp, err := newTestProxyWithParams(doc, Params{
proxyParams := Params{
EnablePassiveHealthCheck: true,
EndpointRegistry: endpointRegistry,
Metrics: m,
})
PassiveHealthCheck: &PassiveHealthCheck{
MaxUnhealthyEndpointsRatio: 1.0,
},
}

return m, setupProxyWithCustomProxyParams(t, doc, proxyParams)
}

func setupProxyWithCustomProxyParams(t *testing.T, doc string, proxyParams Params) *httptest.Server {
tp, err := newTestProxyWithParams(doc, proxyParams)
require.NoError(t, err)

ps := httptest.NewServer(tp.proxy)

t.Cleanup(tp.close)
t.Cleanup(ps.Close)

return m, ps
return ps
}

func setupServices(t *testing.T, healthy, unhealthy int) string {
Expand Down Expand Up @@ -138,6 +157,9 @@ func TestPHCForSingleHealthyEndpoint(t *testing.T) {
tp, err := newTestProxyWithParams(doc, Params{
EnablePassiveHealthCheck: true,
EndpointRegistry: endpointRegistry,
PassiveHealthCheck: &PassiveHealthCheck{
MaxUnhealthyEndpointsRatio: 1.0,
},
})
if err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -241,3 +263,73 @@ func TestPHC(t *testing.T) {
})
}
}

func TestPHCNoHealthyEndpoints(t *testing.T) {
const (
healthy = 0
unhealthy = 4
)

servicesString := setupServices(t, healthy, unhealthy)
mockMetrics, ps := setupProxy(t, fmt.Sprintf(`* -> backendTimeout("20ms") -> <random, %s>`,
servicesString))
va := fireVegeta(t, ps, 5000, 1*time.Second, 5*time.Second)

count200, ok := va.CountStatus(200)
assert.False(t, ok)

count504, ok := va.CountStatus(504)
assert.True(t, ok)

failedRequests := math.Abs(float64(va.TotalRequests())) - float64(count200)
t.Logf("total requests: %d, count200: %d, count504: %d, failedRequests: %f", va.TotalRequests(), count200, count504, failedRequests)

assert.InDelta(t, float64(count504), failedRequests, 5)
assert.InDelta(t, float64(va.TotalRequests()), float64(failedRequests), 0.1*float64(va.TotalRequests()))
mockMetrics.WithCounters(func(counters map[string]int64) {
assert.InDelta(t, float64(unhealthy)*float64(va.TotalRequests()), float64(counters["passive-health-check.endpoints.dropped"]), 0.3*float64(unhealthy)*float64(va.TotalRequests()))
assert.InDelta(t, 0.0, float64(counters["passive-health-check.requests.passed"]), 0.3*float64(nRequests)) // allow 30% error
})
}

func TestPHCMaxUnhealthyEndpointsRatioParam(t *testing.T) {
const (
healthy = 0
unhealthy = 4
maxUnhealthyEndpointsRatio = 0.49 // slightly less than 0.5 to avoid equality and looking up the third endpoint
)

servicesString := setupServices(t, healthy, unhealthy)
mockMetrics := &metricstest.MockMetrics{}
endpointRegistry := defaultEndpointRegistry()
proxyParams := Params{
EnablePassiveHealthCheck: true,
EndpointRegistry: endpointRegistry,
Metrics: mockMetrics,
PassiveHealthCheck: &PassiveHealthCheck{
MaxUnhealthyEndpointsRatio: maxUnhealthyEndpointsRatio,
},
}
ps := setupProxyWithCustomProxyParams(t, fmt.Sprintf(`* -> backendTimeout("20ms") -> <random, %s>`,
servicesString), proxyParams)
va := fireVegeta(t, ps, 5000, 1*time.Second, 5*time.Second)

count200, ok := va.CountStatus(200)
assert.False(t, ok)

count504, ok := va.CountStatus(504)
assert.True(t, ok)

failedRequests := math.Abs(float64(va.TotalRequests())) - float64(count200)
t.Logf("total requests: %d, count200: %d, count504: %d, failedRequests: %f", va.TotalRequests(), count200, count504, failedRequests)

assert.InDelta(t, float64(count504), failedRequests, 5)
assert.InDelta(t, float64(va.TotalRequests()), float64(failedRequests), 0.1*float64(va.TotalRequests()))
mockMetrics.WithCounters(func(counters map[string]int64) {
assert.InDelta(t, maxUnhealthyEndpointsRatio*float64(unhealthy)*float64(va.TotalRequests()),
float64(counters["passive-health-check.endpoints.dropped"]),
0.3*maxUnhealthyEndpointsRatio*float64(unhealthy)*float64(va.TotalRequests()),
)
assert.InDelta(t, 0.0, float64(counters["passive-health-check.requests.passed"]), 0.3*float64(nRequests)) // allow 30% error
})
}
23 changes: 20 additions & 3 deletions proxy/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,21 @@ type PassiveHealthCheck struct {

// The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request
MaxDropProbability float64

// MaxUnhealthyEndpointsRatio is the maximum ratio of unhealthy endpoints in the list of all endpoints PHC will check
// in case of all endpoints are unhealthy
MaxUnhealthyEndpointsRatio float64
}

func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, error) {
if len(o) == 0 {
return false, &PassiveHealthCheck{}, nil
}

result := &PassiveHealthCheck{}
result := &PassiveHealthCheck{
MinDropProbability: 0.0,
MaxUnhealthyEndpointsRatio: 1.0,
}
requiredParams := map[string]struct{}{
"period": {},
"max-drop-probability": {},
Expand Down Expand Up @@ -218,6 +225,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e
return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value)
}
result.MinDropProbability = minDropProbability
case "max-unhealthy-endpoints-ratio":
maxUnhealthyEndpointsRatio, err := strconv.ParseFloat(value, 64)
if err != nil {
return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value)
}
if maxUnhealthyEndpointsRatio < 0 || maxUnhealthyEndpointsRatio > 1 {
return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value)
}
result.MaxUnhealthyEndpointsRatio = maxUnhealthyEndpointsRatio
default:
return false, nil, fmt.Errorf("passive health check: invalid parameter: key=%s,value=%s", key, value)
}
Expand Down Expand Up @@ -824,8 +840,9 @@ func WithParams(p Params) *Proxy {
var healthyEndpointsChooser *healthyEndpoints
if p.EnablePassiveHealthCheck {
healthyEndpointsChooser = &healthyEndpoints{
rnd: rand.New(loadbalancer.NewLockedSource()),
endpointRegistry: p.EndpointRegistry,
rnd: rand.New(loadbalancer.NewLockedSource()),
endpointRegistry: p.EndpointRegistry,
maxUnhealthyEndpointsRatio: p.PassiveHealthCheck.MaxUnhealthyEndpointsRatio,
}
}
return &Proxy{
Expand Down
Loading

0 comments on commit 07de308

Please sign in to comment.