diff --git a/docs/operation/operation.md b/docs/operation/operation.md index 37c6211e1d..c7ee3e9c5a 100644 --- a/docs/operation/operation.md +++ b/docs/operation/operation.md @@ -904,15 +904,20 @@ the Skipper takes a look at previous period and if the amount of requests in the and failed requests ratio is more than `min-drop-probability` for the given endpoints then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC. +If the ratio of unhealthy endpoints is more than `max-unhealthy-endpoints-ratio` then PHC becomes fail-open. This effectively means +if there are too many unhealthy endpoints PHC does not try to mitigate them any more and requests are sent like there is no PHC at all. Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead. To enable this feature, you need to provide `-passive-health-check` option having forementioned parameters -(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined. +(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`, `max-unhealthy-endpoints-ratio`) defined. `period`, `min-requests`, `max-drop-probability` are required parameters, it is not possible for PHC to be enabled without them explicitly defined by user. `min-drop-probability` is implicitly defined as `0.0` if not explicitly set by user. +`max-unhealthy-endpoints-ratio` is defined as `1.0` if not explicitly set by user. Valid examples of `-passive-health-check` are: ++ `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3` ++ `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9,max-unhealthy-endpoints-ratio=0.3` + `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9` + `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9` @@ -923,9 +928,10 @@ The parameters of `-passive-health-check` option are: + `period=` - the duration of stats reset period + `min-requests=` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint -+ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint -+ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered ++ `min-drop-probabilty=[0.0 <= p < max-drop-probability)` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint ++ `max-drop-probabilty=(min-drop-probability < p <= 1.0]` - the maximum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request ++ `max-unhealthy-endpoints-ratio=[0.0 <= r <= 1.0]` - the maximum ratio of unhealthy endpoints for PHC to try to mitigate ongoing requests ### Metrics diff --git a/proxy/healthy_endpoints.go b/proxy/healthy_endpoints.go index 00e1ef5db8..d4818065d6 100644 --- a/proxy/healthy_endpoints.go +++ b/proxy/healthy_endpoints.go @@ -8,8 +8,9 @@ import ( ) type healthyEndpoints struct { - rnd *rand.Rand - endpointRegistry *routing.EndpointRegistry + rnd *rand.Rand + endpointRegistry *routing.EndpointRegistry + maxUnhealthyEndpointsRatio float64 } func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []routing.LBEndpoint, metrics metrics.Metrics) []routing.LBEndpoint { @@ -19,6 +20,8 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout p := h.rnd.Float64() + unhealthyEndpointsCount := 0 + maxUnhealthyEndpointsCount := float64(len(endpoints)) * h.maxUnhealthyEndpointsRatio filtered := make([]routing.LBEndpoint, 0, len(endpoints)) for _, e := range endpoints { dropProbability := e.Metrics.HealthCheckDropProbability() @@ -26,9 +29,14 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f", e.Host, p, dropProbability) metrics.IncCounter("passive-health-check.endpoints.dropped") + unhealthyEndpointsCount++ } else { filtered = append(filtered, e) } + + if float64(unhealthyEndpointsCount) > maxUnhealthyEndpointsCount { + return endpoints + } } if len(filtered) == 0 { diff --git a/proxy/healthy_endpoints_test.go b/proxy/healthy_endpoints_test.go index 01cea436b0..0e75e65cda 100644 --- a/proxy/healthy_endpoints_test.go +++ b/proxy/healthy_endpoints_test.go @@ -61,17 +61,36 @@ func fireVegeta(t *testing.T, ps *httptest.Server, freq int, per time.Duration, } func setupProxy(t *testing.T, doc string) (*metricstest.MockMetrics, *httptest.Server) { - return setupProxyWithCustomEndpointRegisty(t, doc, defaultEndpointRegistry()) + m := &metricstest.MockMetrics{} + endpointRegistry := defaultEndpointRegistry() + proxyParams := Params{ + EnablePassiveHealthCheck: true, + EndpointRegistry: endpointRegistry, + Metrics: m, + PassiveHealthCheck: &PassiveHealthCheck{ + MaxUnhealthyEndpointsRatio: 1.0, + }, + } + + return m, setupProxyWithCustomProxyParams(t, doc, proxyParams) } func setupProxyWithCustomEndpointRegisty(t *testing.T, doc string, endpointRegistry *routing.EndpointRegistry) (*metricstest.MockMetrics, *httptest.Server) { m := &metricstest.MockMetrics{} - - tp, err := newTestProxyWithParams(doc, Params{ + proxyParams := Params{ EnablePassiveHealthCheck: true, EndpointRegistry: endpointRegistry, Metrics: m, - }) + PassiveHealthCheck: &PassiveHealthCheck{ + MaxUnhealthyEndpointsRatio: 1.0, + }, + } + + return m, setupProxyWithCustomProxyParams(t, doc, proxyParams) +} + +func setupProxyWithCustomProxyParams(t *testing.T, doc string, proxyParams Params) *httptest.Server { + tp, err := newTestProxyWithParams(doc, proxyParams) require.NoError(t, err) ps := httptest.NewServer(tp.proxy) @@ -79,7 +98,7 @@ func setupProxyWithCustomEndpointRegisty(t *testing.T, doc string, endpointRegis t.Cleanup(tp.close) t.Cleanup(ps.Close) - return m, ps + return ps } func setupServices(t *testing.T, healthy, unhealthy int) string { @@ -138,6 +157,9 @@ func TestPHCForSingleHealthyEndpoint(t *testing.T) { tp, err := newTestProxyWithParams(doc, Params{ EnablePassiveHealthCheck: true, EndpointRegistry: endpointRegistry, + PassiveHealthCheck: &PassiveHealthCheck{ + MaxUnhealthyEndpointsRatio: 1.0, + }, }) if err != nil { t.Fatal(err) @@ -241,3 +263,73 @@ func TestPHC(t *testing.T) { }) } } + +func TestPHCNoHealthyEndpoints(t *testing.T) { + const ( + healthy = 0 + unhealthy = 4 + ) + + servicesString := setupServices(t, healthy, unhealthy) + mockMetrics, ps := setupProxy(t, fmt.Sprintf(`* -> backendTimeout("20ms") -> `, + servicesString)) + va := fireVegeta(t, ps, 5000, 1*time.Second, 5*time.Second) + + count200, ok := va.CountStatus(200) + assert.False(t, ok) + + count504, ok := va.CountStatus(504) + assert.True(t, ok) + + failedRequests := math.Abs(float64(va.TotalRequests())) - float64(count200) + t.Logf("total requests: %d, count200: %d, count504: %d, failedRequests: %f", va.TotalRequests(), count200, count504, failedRequests) + + assert.InDelta(t, float64(count504), failedRequests, 5) + assert.InDelta(t, float64(va.TotalRequests()), float64(failedRequests), 0.1*float64(va.TotalRequests())) + mockMetrics.WithCounters(func(counters map[string]int64) { + assert.InDelta(t, float64(unhealthy)*float64(va.TotalRequests()), float64(counters["passive-health-check.endpoints.dropped"]), 0.3*float64(unhealthy)*float64(va.TotalRequests())) + assert.InDelta(t, 0.0, float64(counters["passive-health-check.requests.passed"]), 0.3*float64(nRequests)) // allow 30% error + }) +} + +func TestPHCMaxUnhealthyEndpointsRatioParam(t *testing.T) { + const ( + healthy = 0 + unhealthy = 4 + maxUnhealthyEndpointsRatio = 0.49 // slightly less than 0.5 to avoid equality and looking up the third endpoint + ) + + servicesString := setupServices(t, healthy, unhealthy) + mockMetrics := &metricstest.MockMetrics{} + endpointRegistry := defaultEndpointRegistry() + proxyParams := Params{ + EnablePassiveHealthCheck: true, + EndpointRegistry: endpointRegistry, + Metrics: mockMetrics, + PassiveHealthCheck: &PassiveHealthCheck{ + MaxUnhealthyEndpointsRatio: maxUnhealthyEndpointsRatio, + }, + } + ps := setupProxyWithCustomProxyParams(t, fmt.Sprintf(`* -> backendTimeout("20ms") -> `, + servicesString), proxyParams) + va := fireVegeta(t, ps, 5000, 1*time.Second, 5*time.Second) + + count200, ok := va.CountStatus(200) + assert.False(t, ok) + + count504, ok := va.CountStatus(504) + assert.True(t, ok) + + failedRequests := math.Abs(float64(va.TotalRequests())) - float64(count200) + t.Logf("total requests: %d, count200: %d, count504: %d, failedRequests: %f", va.TotalRequests(), count200, count504, failedRequests) + + assert.InDelta(t, float64(count504), failedRequests, 5) + assert.InDelta(t, float64(va.TotalRequests()), float64(failedRequests), 0.1*float64(va.TotalRequests())) + mockMetrics.WithCounters(func(counters map[string]int64) { + assert.InDelta(t, maxUnhealthyEndpointsRatio*float64(unhealthy)*float64(va.TotalRequests()), + float64(counters["passive-health-check.endpoints.dropped"]), + 0.3*maxUnhealthyEndpointsRatio*float64(unhealthy)*float64(va.TotalRequests()), + ) + assert.InDelta(t, 0.0, float64(counters["passive-health-check.requests.passed"]), 0.3*float64(nRequests)) // allow 30% error + }) +} diff --git a/proxy/proxy.go b/proxy/proxy.go index cc5061954a..5553a2fdd5 100644 --- a/proxy/proxy.go +++ b/proxy/proxy.go @@ -162,6 +162,10 @@ type PassiveHealthCheck struct { // The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request MaxDropProbability float64 + + // MaxUnhealthyEndpointsRatio is the maximum ratio of unhealthy endpoints in the list of all endpoints PHC will check + // in case of all endpoints are unhealthy + MaxUnhealthyEndpointsRatio float64 } func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, error) { @@ -169,7 +173,10 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e return false, &PassiveHealthCheck{}, nil } - result := &PassiveHealthCheck{} + result := &PassiveHealthCheck{ + MinDropProbability: 0.0, + MaxUnhealthyEndpointsRatio: 1.0, + } requiredParams := map[string]struct{}{ "period": {}, "max-drop-probability": {}, @@ -218,6 +225,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value) } result.MinDropProbability = minDropProbability + case "max-unhealthy-endpoints-ratio": + maxUnhealthyEndpointsRatio, err := strconv.ParseFloat(value, 64) + if err != nil { + return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value) + } + if maxUnhealthyEndpointsRatio < 0 || maxUnhealthyEndpointsRatio > 1 { + return false, nil, fmt.Errorf("passive health check: invalid maxUnhealthyEndpointsRatio value: %q", value) + } + result.MaxUnhealthyEndpointsRatio = maxUnhealthyEndpointsRatio default: return false, nil, fmt.Errorf("passive health check: invalid parameter: key=%s,value=%s", key, value) } @@ -824,8 +840,9 @@ func WithParams(p Params) *Proxy { var healthyEndpointsChooser *healthyEndpoints if p.EnablePassiveHealthCheck { healthyEndpointsChooser = &healthyEndpoints{ - rnd: rand.New(loadbalancer.NewLockedSource()), - endpointRegistry: p.EndpointRegistry, + rnd: rand.New(loadbalancer.NewLockedSource()), + endpointRegistry: p.EndpointRegistry, + maxUnhealthyEndpointsRatio: p.PassiveHealthCheck.MaxUnhealthyEndpointsRatio, } } return &Proxy{ diff --git a/proxy/proxy_test.go b/proxy/proxy_test.go index 51780a4f5d..b9c080b188 100644 --- a/proxy/proxy_test.go +++ b/proxy/proxy_test.go @@ -2296,10 +2296,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "somethingInvalid", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "somethingInvalid", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2307,26 +2308,29 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: true, expectedParams: &PassiveHealthCheck{ - Period: 1 * time.Minute, - MinRequests: 10, - MaxDropProbability: 0.9, - MinDropProbability: 0.05, + Period: 1 * time.Minute, + MinRequests: 10, + MaxDropProbability: 0.9, + MinDropProbability: 0.05, + MaxUnhealthyEndpointsRatio: 0.3, }, expectedError: nil, }, { inputArg: map[string]string{ - "period": "-1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "-1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2334,10 +2338,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "somethingInvalid", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "somethingInvalid", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2345,10 +2350,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "-10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "-10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2356,10 +2362,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "somethingInvalid", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "somethingInvalid", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2367,10 +2374,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "-0.1", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "-0.1", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2378,10 +2386,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "3.1415", - "min-drop-probability": "0.05", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "3.1415", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2389,10 +2398,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "somethingInvalid", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "somethingInvalid", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2400,10 +2410,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "-0.1", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "-0.1", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2411,10 +2422,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "3.1415", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "3.1415", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2422,10 +2434,11 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.05", - "min-drop-probability": "0.9", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.05", + "min-drop-probability": "0.9", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2433,11 +2446,48 @@ func TestInitPassiveHealthChecker(t *testing.T) { }, { inputArg: map[string]string{ - "period": "1m", - "min-requests": "10", - "max-drop-probability": "0.9", - "min-drop-probability": "0.05", - "non-existing": "non-existing", + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "-0.1", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf(`passive health check: invalid maxUnhealthyEndpointsRatio value: "-0.1"`), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "3.1415", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf(`passive health check: invalid maxUnhealthyEndpointsRatio value: "3.1415"`), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "somethingInvalid", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf(`passive health check: invalid maxUnhealthyEndpointsRatio value: "somethingInvalid"`), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", + "non-existing": "non-existing", }, expectedEnabled: false, expectedParams: nil, @@ -2448,7 +2498,8 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", /* forgot max-drop-probability */ - "min-drop-probability": "0.05", + "min-drop-probability": "0.05", + "max-unhealthy-endpoints-ratio": "0.3", }, expectedEnabled: false, expectedParams: nil, @@ -2459,14 +2510,34 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "0.9", - /* using default max-drop-probability */ + /* using default min-drop-probability */ + "max-unhealthy-endpoints-ratio": "0.3", + }, + expectedEnabled: true, + expectedParams: &PassiveHealthCheck{ + Period: 1 * time.Minute, + MinRequests: 10, + MaxDropProbability: 0.9, + MinDropProbability: 0.0, + MaxUnhealthyEndpointsRatio: 0.3, + }, + expectedError: nil, + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", + /* using default max-unhealthy-endpoints-ratio */ }, expectedEnabled: true, expectedParams: &PassiveHealthCheck{ - Period: 1 * time.Minute, - MinRequests: 10, - MaxDropProbability: 0.9, - MinDropProbability: 0.0, + Period: 1 * time.Minute, + MinRequests: 10, + MaxDropProbability: 0.9, + MinDropProbability: 0.05, + MaxUnhealthyEndpointsRatio: 1.0, }, expectedError: nil, },