-
Notifications
You must be signed in to change notification settings - Fork 77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Stop returning tonic errors from polls; instead, log warnings and retry. #202
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,16 +36,29 @@ impl<SG> RetryGateway<SG> { | |
|
||
impl<SG: ServerGatewayApis + Send + Sync + 'static> RetryGateway<SG> { | ||
async fn call_with_retry<R, F, Fut>(&self, factory: F) -> Result<R> | ||
where | ||
F: Fn() -> Fut + Unpin, | ||
Fut: Future<Output = Result<R>>, | ||
{ | ||
self.call_type_with_retry(factory, CallType::Normal).await | ||
} | ||
|
||
async fn long_poll_call_with_retry<R, F, Fut>(&self, factory: F) -> Result<R> | ||
where | ||
F: Fn() -> Fut + Unpin, | ||
Fut: Future<Output = Result<R>>, | ||
{ | ||
self.call_type_with_retry(factory, CallType::LongPoll).await | ||
} | ||
|
||
async fn call_type_with_retry<R, F, Fut>(&self, factory: F, ct: CallType) -> Result<R> | ||
where | ||
F: Fn() -> Fut + Unpin, | ||
Fut: Future<Output = Result<R>>, | ||
{ | ||
Ok(FutureRetry::new( | ||
factory, | ||
TonicErrorHandler::new( | ||
self.retry_config.clone().into(), | ||
self.retry_config.max_retries, | ||
), | ||
TonicErrorHandler::new(self.retry_config.clone(), ct), | ||
) | ||
.await | ||
.map_err(|(e, _attempt)| e)? | ||
|
@@ -54,27 +67,44 @@ impl<SG: ServerGatewayApis + Send + Sync + 'static> RetryGateway<SG> { | |
} | ||
|
||
#[derive(Debug)] | ||
pub struct TonicErrorHandler { | ||
struct TonicErrorHandler { | ||
backoff: ExponentialBackoff, | ||
max_attempts: usize, | ||
call_type: CallType, | ||
} | ||
|
||
impl TonicErrorHandler { | ||
pub fn new(backoff: ExponentialBackoff, max_attempts: usize) -> Self { | ||
TonicErrorHandler { | ||
backoff, | ||
max_attempts, | ||
fn new(mut cfg: RetryConfig, call_type: CallType) -> Self { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe instead of introducing this… There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same reason about needing to log a warning. |
||
if call_type == CallType::LongPoll { | ||
// Long polls can retry forever | ||
cfg.max_elapsed_time = None; | ||
} | ||
Self { | ||
max_attempts: cfg.max_retries, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This bit of logic is unchanged. Attempt starts 0-indexed, so it works out. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The name is misleading though, I'd change it (doesn't have to be in this PR). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The long poll handler shouldn't have max attempts set. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need some way to decide when to start logging warnings if we've been retrying forever. Normal max retries provides a decent signal. |
||
backoff: cfg.into(), | ||
call_type, | ||
} | ||
} | ||
} | ||
#[derive(Debug, Eq, PartialEq, Hash)] | ||
enum CallType { | ||
Normal, | ||
LongPoll, | ||
} | ||
|
||
impl ErrorHandler<tonic::Status> for TonicErrorHandler { | ||
type OutError = tonic::Status; | ||
|
||
fn handle(&mut self, current_attempt: usize, e: tonic::Status) -> RetryPolicy<tonic::Status> { | ||
// Long poll calls get unlimited retries | ||
if current_attempt >= self.max_attempts { | ||
return RetryPolicy::ForwardError(e); | ||
if self.call_type == CallType::Normal { | ||
return RetryPolicy::ForwardError(e); | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pedantic/subjective: no else needed. |
||
// But once they exceed the normal max attempts, start logging warnings | ||
warn!(error=?e, "Polling encountered repeated error") | ||
} | ||
} | ||
|
||
if RETRYABLE_ERROR_CODES.contains(&e.code()) { | ||
match self.backoff.next_backoff() { | ||
None => RetryPolicy::ForwardError(e), // None is returned when we've run out of time. | ||
|
@@ -114,15 +144,15 @@ impl<SG: ServerGatewayApis + Send + Sync + 'static> ServerGatewayApis for RetryG | |
task_queue: String, | ||
) -> Result<PollWorkflowTaskQueueResponse> { | ||
let factory = move || self.gateway.poll_workflow_task(task_queue.clone()); | ||
self.call_with_retry(factory).await | ||
self.long_poll_call_with_retry(factory).await | ||
} | ||
|
||
async fn poll_activity_task( | ||
&self, | ||
task_queue: String, | ||
) -> Result<PollActivityTaskQueueResponse> { | ||
let factory = move || self.gateway.poll_activity_task(task_queue.clone()); | ||
self.call_with_retry(factory).await | ||
self.long_poll_call_with_retry(factory).await | ||
} | ||
|
||
async fn reset_sticky_task_queue( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aren't there ever bad poll responses that should be sent to lang and considered fatal?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What will you do with that information? Nothing interesting to be done. I'll send you an eviction if it matters.