Skip to content

Commit

Permalink
Fix resilient shard
Browse files Browse the repository at this point in the history
  • Loading branch information
yiminc committed Nov 14, 2022
1 parent 5e466c3 commit fdef158
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions service/history/shard/context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -1869,7 +1869,7 @@ func (s *ContextImpl) acquireShard() {
//
// We stop retrying on any of:
// 1. We succeed in acquiring the rangeid lock.
// 2. We get any error other than transient errors.
// 2. We get ShardOwnershipLostError or lifecycleCtx ended.
// 3. The state changes to Stopping or Stopped.
//
// If the shard controller sees that service resolver has assigned ownership to someone
Expand Down Expand Up @@ -1933,7 +1933,18 @@ func (s *ContextImpl) acquireShard() {
return nil
}

err := backoff.ThrottleRetry(op, policy, common.IsPersistenceTransientError)
// keep retrying except ShardOwnershipLostError or lifecycle context ended
acquireShardRetryable := func(err error) bool {
if s.lifecycleCtx.Err() != nil {
return false
}
switch err.(type) {
case *persistence.ShardOwnershipLostError:
return false
}
return true
}
err := backoff.ThrottleRetry(op, policy, acquireShardRetryable)
if err != nil {
// We got an unretryable error (perhaps context cancelled or ShardOwnershipLostError).
s.contextTaggedLogger.Error("Couldn't acquire shard", tag.Error(err))
Expand Down

0 comments on commit fdef158

Please sign in to comment.