From ec75820510d2a350093591913d5b79619399f633 Mon Sep 17 00:00:00 2001 From: Haifeng He Date: Mon, 24 Jul 2023 13:42:15 -0700 Subject: [PATCH] Minor fixes in force replication verification (#4675) **What changed?** - Minor fixes in force replication verification - Sleep first in verification step - Remove retryable timeout error **Why?** **How did you test it?** **Potential risks** **Is hotfix candidate?** --- service/worker/migration/activities.go | 29 ++++++++++---------------- 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/service/worker/migration/activities.go b/service/worker/migration/activities.go index b640ea7f6c8..34f1e3c16af 100644 --- a/service/worker/migration/activities.go +++ b/service/worker/migration/activities.go @@ -644,7 +644,6 @@ func (a *activities) verifyReplicationTasks( } const ( - defaultNoProgressRetryableTimeout = 5 * time.Minute defaultNoProgressNotRetryableTimeout = 15 * time.Minute ) @@ -681,6 +680,10 @@ func (a *activities) VerifyReplicationTasks(ctx context.Context, request *verify // - more than NonRetryableTimeout, it means potentially we encountered #4. The activity returns // non-retryable error and force-replication workflow will restarted. for { + + // Since replication has a lag, sleep first. + time.Sleep(request.VerifyInterval) + verified, progress, err := a.verifyReplicationTasks(ctx, request, &details, remoteClient) if err != nil { return err @@ -697,23 +700,13 @@ func (a *activities) VerifyReplicationTasks(ctx context.Context, request *verify } diff := time.Now().Sub(details.CheckPoint) - if diff > defaultNoProgressRetryableTimeout { - if diff > defaultNoProgressNotRetryableTimeout { - // Potentially encountered a missing execution, return non-retryable error - return temporal.NewNonRetryableApplicationError( - fmt.Sprintf("verifyReplicationTasks was not able to make progress for more than %v minutes (not retryable). 
Not found WorkflowExecution: %v, Checkpoint: %v", - diff.Minutes(), - details.LastNotFoundWorkflowExecution, details.CheckPoint), - "", nil) - } - - // return error to trigger activity retry - return verifyReplicationTasksTimeoutErr{ - timeout: diff, - details: details, - } + if diff > defaultNoProgressNotRetryableTimeout { + // Potentially encountered a missing execution, return non-retryable error + return temporal.NewNonRetryableApplicationError( + fmt.Sprintf("verifyReplicationTasks was not able to make progress for more than %v minutes (not retryable). Not found WorkflowExecution: %v, Checkpoint: %v", + diff.Minutes(), + details.LastNotFoundWorkflowExecution, details.CheckPoint), + "", nil) } - - time.Sleep(request.VerifyInterval) } }