Skip to content

Commit

Permalink
Add -min-drop-probability cmdline option for the PHC (#3053)
Browse files Browse the repository at this point in the history
Signed-off-by: Roman Zavodskikh <roman.zavodskikh@zalando.de>
Co-authored-by: Roman Zavodskikh <roman.zavodskikh@zalando.de>
  • Loading branch information
RomanZavodskikh and Roman Zavodskikh committed Apr 30, 2024
1 parent 678fc5a commit 68cce2a
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 12 deletions.
13 changes: 7 additions & 6 deletions docs/operation/operation.md
Original file line number Diff line number Diff line change
Expand Up @@ -901,15 +901,15 @@ Passive Health Check(PHC).
PHC works the following way: the entire uptime is divided in chunks of `period`, per every period Skipper calculates
the total amount of requests and amount of requests failed per every endpoint. While next period is going on,
the Skipper takes a look at previous period and if the amount of requests in the previous period is more than `min-requests`
for the given endpoints then Skipper will send reduced (the more `max-drop-probability`
and failed requests ratio in previous period are, the stronger reduction is)
amount of requests compared to amount sent without PHC.
and failed requests ratio is more than `min-drop-probability` for the given endpoints
then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio
in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC.

Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead.

To enable this feature, you need to provide `-passive-health-check` option having all forementioned parameters
(`period`, `min-requests`, `max-drop-probability`) defined,
for instance: `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9`.
(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined,
for instance: `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9`.

You need to define all parameters on your side, there are no defaults, and skipper will not run if PHC params are passed only partially.

Expand All @@ -918,7 +918,8 @@ However, Skipper will run without this feature, if no `-passive-health-check` is
The parameters of `-passive-health-check` option are:
+ `period=<duration>` - the duration of stats reset period
+ `min-requests=<int>` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint
+ `max-drop-probabilty=<float more than/equal to 0 and less than/equal to 1>` - the maximum possible probability of unhealthy endpoint being not considered
+ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint
+ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered
while choosing the endpoint for the given request

### Metrics
Expand Down
1 change: 1 addition & 0 deletions metricsinit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func TestInitOrderAndDefault(t *testing.T) {
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
},
}

Expand Down
2 changes: 1 addition & 1 deletion proxy/healthy_endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout
filtered := make([]routing.LBEndpoint, 0, len(endpoints))
for _, e := range endpoints {
dropProbability := e.Metrics.HealthCheckDropProbability()
if dropProbability > 0.05 && p < dropProbability {
if p < dropProbability {
ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f",
e.Host, p, dropProbability)
metrics.IncCounter("passive-health-check.endpoints.dropped")
Expand Down
1 change: 1 addition & 0 deletions proxy/healthy_endpoints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ func defaultEndpointRegistry() *routing.EndpointRegistry {
StatsResetPeriod: period,
MinRequests: 10,
MaxHealthCheckDropProbability: 1.0,
MinHealthCheckDropProbability: 0.01,
})
}

Expand Down
19 changes: 18 additions & 1 deletion proxy/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ type PassiveHealthCheck struct {
// potentially opt out the endpoint from the list of healthy endpoints
MinRequests int64

// The minimal ratio of failed requests in a single period to potentially opt out the endpoint
// from the list of healthy endpoints. This ratio is equal to the minimal non-zero probability of
// dropping endpoint out from load balancing for every specific request
MinDropProbability float64

// The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request
MaxDropProbability float64
}
Expand Down Expand Up @@ -186,6 +191,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e
return false, nil, fmt.Errorf("passive health check: invalid minRequests value: %s", value)
}
result.MinRequests = int64(minRequests)
case "min-drop-probability":
minDropProbability, err := strconv.ParseFloat(value, 64)
if err != nil {
return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value)
}
if minDropProbability < 0 || minDropProbability > 1 {
return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value)
}
result.MinDropProbability = minDropProbability
case "max-drop-probability":
maxDropProbability, err := strconv.ParseFloat(value, 64)
if err != nil {
Expand All @@ -202,9 +216,12 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e
keysInitialized[key] = struct{}{}
}

if len(keysInitialized) != 3 {
if len(keysInitialized) != 4 {
return false, nil, fmt.Errorf("passive health check: missing required parameters")
}
if result.MinDropProbability >= result.MaxDropProbability {
return false, nil, fmt.Errorf("passive health check: minDropProbability should be less than maxDropProbability")
}
return true, result, nil
}

Expand Down
55 changes: 55 additions & 0 deletions proxy/proxy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2299,6 +2299,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "somethingInvalid",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2309,12 +2310,14 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
},
expectedEnabled: true,
expectedParams: &PassiveHealthCheck{
Period: 1 * time.Minute,
MinRequests: 10,
MaxDropProbability: 0.9,
MinDropProbability: 0.05,
},
expectedError: nil,
},
Expand All @@ -2323,6 +2326,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "-1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2333,6 +2337,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "somethingInvalid",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2343,6 +2348,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "-10",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2353,6 +2359,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "10",
"max-drop-probability": "somethingInvalid",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2363,6 +2370,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "10",
"max-drop-probability": "-0.1",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2373,6 +2381,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "10",
"max-drop-probability": "3.1415",
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand All @@ -2383,6 +2392,51 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "somethingInvalid",
},
expectedEnabled: false,
expectedParams: nil,
expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: somethingInvalid"),
},
{
inputArg: map[string]string{
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "-0.1",
},
expectedEnabled: false,
expectedParams: nil,
expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: -0.1"),
},
{
inputArg: map[string]string{
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "3.1415",
},
expectedEnabled: false,
expectedParams: nil,
expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: 3.1415"),
},
{
inputArg: map[string]string{
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.05",
"min-drop-probability": "0.9",
},
expectedEnabled: false,
expectedParams: nil,
expectedError: fmt.Errorf("passive health check: minDropProbability should be less than maxDropProbability"),
},
{
inputArg: map[string]string{
"period": "1m",
"min-requests": "10",
"max-drop-probability": "0.9",
"min-drop-probability": "0.05",
"non-existing": "non-existing",
},
expectedEnabled: false,
Expand All @@ -2394,6 +2448,7 @@ func TestInitPassiveHealthChecker(t *testing.T) {
"period": "1m",
"min-requests": "10",
/* forgot max-drop-probability */
"min-drop-probability": "0.05",
},
expectedEnabled: false,
expectedParams: nil,
Expand Down
11 changes: 7 additions & 4 deletions routing/endpointregistry.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ type EndpointRegistry struct {
lastSeenTimeout time.Duration
statsResetPeriod time.Duration
minRequests int64
minHealthCheckDropProbability float64
maxHealthCheckDropProbability float64

quit chan struct{}
Expand All @@ -113,6 +114,7 @@ type RegistryOptions struct {
PassiveHealthCheckEnabled bool
StatsResetPeriod time.Duration
MinRequests int64
MinHealthCheckDropProbability float64
MaxHealthCheckDropProbability float64
}

Expand Down Expand Up @@ -164,17 +166,17 @@ func (r *EndpointRegistry) updateStats() {
e.totalFailedRoundTrips[nextSlot].Store(0)
e.totalRequests[nextSlot].Store(0)

newDropProbability := 0.0
failed := e.totalFailedRoundTrips[curSlot].Load()
requests := e.totalRequests[curSlot].Load()
if requests > r.minRequests {
failedRoundTripsRatio := float64(failed) / float64(requests)
if failedRoundTripsRatio > 0.0 {
if failedRoundTripsRatio > r.minHealthCheckDropProbability {
log.Infof("Passive health check: marking %q as unhealthy due to failed round trips ratio: %0.2f", key, failedRoundTripsRatio)
newDropProbability = min(failedRoundTripsRatio, r.maxHealthCheckDropProbability)
}
e.healthCheckDropProbability.Store(min(failedRoundTripsRatio, r.maxHealthCheckDropProbability))
} else {
e.healthCheckDropProbability.Store(0.0)
}
e.healthCheckDropProbability.Store(newDropProbability)
e.curSlot.Store(nextSlot)

return true
Expand All @@ -201,6 +203,7 @@ func NewEndpointRegistry(o RegistryOptions) *EndpointRegistry {
lastSeenTimeout: o.LastSeenTimeout,
statsResetPeriod: o.StatsResetPeriod,
minRequests: o.MinRequests,
minHealthCheckDropProbability: o.MinHealthCheckDropProbability,
maxHealthCheckDropProbability: o.MaxHealthCheckDropProbability,

quit: make(chan struct{}),
Expand Down
1 change: 1 addition & 0 deletions skipper.go
Original file line number Diff line number Diff line change
Expand Up @@ -1945,6 +1945,7 @@ func run(o Options, sig chan os.Signal, idleConnsCH chan struct{}) error {
PassiveHealthCheckEnabled: passiveHealthCheckEnabled,
StatsResetPeriod: passiveHealthCheck.Period,
MinRequests: passiveHealthCheck.MinRequests,
MinHealthCheckDropProbability: passiveHealthCheck.MinDropProbability,
MaxHealthCheckDropProbability: passiveHealthCheck.MaxDropProbability,
})
ro := routing.Options{
Expand Down

0 comments on commit 68cce2a

Please sign in to comment.