-
Notifications
You must be signed in to change notification settings - Fork 66
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bishnu/ep level retry timeout http clients #823
Conversation
6ad31ac
to
bc66b21
Compare
90fd389
to
0601f99
Compare
0601f99
to
910cafa
Compare
910cafa
to
86a6b90
Compare
386c480
to
04df52c
Compare
codegen/templates/http_client.tmpl
Outdated
qpsLevels := map[string]string{ | ||
{{range $methodName, $qpsLevel := $QPSLevels -}} | ||
"{{$methodName}}": "{{$qpsLevel}}", | ||
{{end}} | ||
} | ||
if !circuitBreakerDisabled { | ||
for methodName := range methodNames { | ||
for methodName, methodTimeoutVal := range clientMethodTimeoutMapping{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Prefer methodTimeout
for brevity.
for methodName, methodTimeoutVal := range clientMethodTimeoutMapping{ | |
for methodName, methodTimeout := range clientMethodTimeoutMapping{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
make sense, addressed
codegen/templates/workflow.tmpl
Outdated
@@ -16,6 +16,9 @@ package workflow | |||
{{- $serviceMethod := printf "%s%s" (title .Method.ThriftService) (title .Method.Name) }} | |||
{{- $workflowInterface := printf "%sWorkflow" $serviceMethod }} | |||
{{- $workflowStruct := camel $workflowInterface }} | |||
{{- $endpointId := .Spec.EndpointID }} | |||
{{- $handleId := .Spec.HandleID }} | |||
{{- $handleIdDotEndpointId := printf "%s.%s" ($endpointId) ($handleId) }} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{{- $handleIdDotEndpointId := printf "%s.%s" ($endpointId) ($handleId) }} | |
{{- $handleIdDotEndpointIdFmt := printf "%s.%s" ($endpointId) ($handleId) }} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
codegen/templates/workflow.tmpl
Outdated
var backOffTimeAcrossRetries time.Duration | ||
var overAllTimeout time.Duration | ||
|
||
//when endpoint level timeout information provided, override it with client level config |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
//when endpoint level timeout information provided, override it with client level config | |
//when endpoint level timeout information is available, override it with client level config |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
codegen/templates/workflow.tmpl
Outdated
scaleFactor := w.defaultDeps.Config.MustGetFloat("endpoints.{{$handleIdDotEndpointId}}.scaleFactor") | ||
maxRetry = int(w.defaultDeps.Config.MustGetInt("endpoints.{{$handleIdDotEndpointId}}.retryCount")) | ||
|
||
backOffTimeAcrossRetriesConf := w.defaultDeps.Config.MustGetInt("endpoints.{{$handleIdDotEndpointId}}.backOffTimeAcrossRetries") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
backOffTimeAcrossRetriesConf := w.defaultDeps.Config.MustGetInt("endpoints.{{$handleIdDotEndpointId}}.backOffTimeAcrossRetries") | |
backOffTimeAcrossRetriesCfg := w.defaultDeps.Config.MustGetInt("endpoints.{{$handleIdDotEndpointId}}.backOffTimeAcrossRetries") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
codegen/templates/workflow.tmpl
Outdated
//add a scale factor to timeoutPerAttempt | ||
timeoutPerAttemptConf = int64(math.Ceil(float64(timeoutPerAttemptConf) * scaleFactor)) | ||
|
||
//convert timeoutPerAttemptConf & backOffTimeAcrossRetries to Duration |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Redundant comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved it to utils , redundant code for all workflow
codegen/templates/workflow.tmpl
Outdated
timeoutPerAttempt = time.Duration(timeoutPerAttemptConf) * time.Millisecond | ||
backOffTimeAcrossRetries = time.Duration(backOffTimeAcrossRetriesConf) * time.Millisecond | ||
|
||
overAllTimeout = time.Duration((timeoutPerAttemptConf + backOffTimeAcrossRetriesConf) * (int64(maxRetry-1)) + timeoutPerAttemptConf) * time.Millisecond |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need to have this logic in tmpl
This should be sourced from the runtime. tmpl should only be the invoker or contain branching at max. This makes our template bloated and increases code gen size.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved it to utils , redundant code for all workflow
HelloA: ServiceABack::hello | ||
HelloB: ServiceBBack::hello | ||
HelloA: ServiceABack::helloA | ||
HelloB: ServiceBBack::helloB |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't it change the test? Two clients map to the same procedure vs two clients mapping to different procedure
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it does a bit :). Have updated it to not alter the test case. Details -
In the new code I have updated the QPS level for both helloA and helloB as QPS level is mapped as "clientID-clientMehtod" but in this case we have multiple services for client multi, so the mapping breaks fundamentally.
To fix this completely, we have to name the circuit breaker name as "clientId-serviceName-serviceMethod" - Will have a backlog ticket for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it. Can you create a Github issue for this and reference this PR there? Otherwise we'll lose context.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done #841
|
||
import "time" | ||
|
||
// TimeoutAndRetryOptions specifies timeout and retries params for outbound calls |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice practice of adding an options here. However, we could explore a option/decorator pattern similar to what is being done in fx and glue:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initial review
6a0b9fa
to
ea4ffc5
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM assuming that the retry functionality has been extensively tested across sample/example gateways across both (ep level / client level) timeouts.
@@ -143,6 +143,17 @@ func {{$exportName}}(deps *module.Dependencies) Client { | |||
{{ end -}} | |||
} | |||
|
|||
//get mapping of client method and it's timeout | |||
//if mapping is not provided, use client's timeout for all the methods | |||
clientMethodTimeoutMapping := make(map[string]int64) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why did we decide to make the values int64
here? If we used an int
instead, we could get rid of the type conversion inside configureCircuitBreaker
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
int doesn't for deps.Default.Config.MustGetStruct("clients.{{$clientID}}.methodTimeoutMapping", &clientMethodTimeoutMapping)
statement, hence had to update it int64
runtime/client_http_request.go
Outdated
} | ||
req.ContextLogger.Debug(req.ctx, "ClientHTTPRequest definition", | ||
zap.String("clientId", req.ClientID), zap.String("methodName", req.MethodName), | ||
zap.String("req.RequestTimeoutPerAttemptInMs", string(rune(req.timeoutAndRetryOptions.RequestTimeoutPerAttemptInMs))), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why not use zap.Int
instead to avoid the type conversion?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make sense, Addressed
if err != nil { | ||
req.ContextLogger.ErrorZ(req.ctx, "Could not make outbound request", zap.Error(err)) | ||
return nil, err | ||
req.ContextLogger.ErrorZ(req.ctx, fmt.Sprintf("Could not make http outbound %s.%s request", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is the formatting in the error log message necessary? Shouldn't we get this information through some of the logger tags from the context logger?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ideally, it should have come through the context logger but it wasn't the case. As a quick fix I added it here instead debugging it
runtime/client_http_request.go
Outdated
req.ContextLogger.GetLogger().Info("errors while making http outbound request", | ||
zap.Error(err), | ||
zap.String("clientId", req.ClientID), zap.String("methodName", req.MethodName), | ||
zap.String("attempt", string(rune(retryCount))), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same question here, we can use zap.Int types?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
runtime/client_http_request.go
Outdated
shouldRetry = req.client.CheckRetry(ctx, req.timeoutAndRetryOptions, res, err) | ||
} | ||
|
||
req.ContextLogger.GetLogger().Info("errors while making http outbound request", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should be an error log?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also why did we expose the underlying logger instead of using the context logger directly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have updated it as Warn as retries could be pending which says it's not the final error.
Why not contextLogger? Each attempt may fail with diff reason and may override the message in contextLogger, what we want is individual logs for each retry. Other approach is to fix contextLogger to have same field with multiple values, like msg1, msg2..
runtime/utils.go
Outdated
) | ||
|
||
//GetTimeoutAndRetryConfig encapsulates timeout and retry configs in TimeoutAndRetryOptions struct | ||
func GetTimeoutAndRetryConfig(timeoutPerAttemptConf int, backOffTimeAcrossRetriesConf int, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: BuildTimeoutAndRetryConfig
might be a better name?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
:), addressed
270e8c6
to
0a47bbd
Compare
0a47bbd
to
5dd6816
Compare
ERD:
https://docs.google.com/document/d/1e6Nmhoa5ygsh2lXSzUyJdHqz2pXF0lc5ACr8DzwKBio/edit#
Summary:
fyi: The branch reads it as the changes are only for HTTP clients but its for both HTTP and TChannel