Skip to content

Commit

Permalink
Improve execution scavenger (#3674)
Browse files Browse the repository at this point in the history
* Improve execution scavenger
  • Loading branch information
yux0 committed Nov 30, 2022
1 parent f6a340d commit 0101924
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 19 deletions.
1 change: 1 addition & 0 deletions common/metrics/metric_defs.go
Expand Up @@ -1651,6 +1651,7 @@ var (
ArchiverWorkflowStoppingCount = NewCounterDef("archiver_workflow_stopping")
ScavengerValidationRequestsCount = NewCounterDef("scavenger_validation_requests")
ScavengerValidationFailuresCount = NewCounterDef("scavenger_validation_failures")
ScavengerValidationSkipsCount = NewCounterDef("scavenger_validation_skips")
AddSearchAttributesFailuresCount = NewCounterDef("add_search_attributes_failures")
DeleteNamespaceSuccessCount = NewCounterDef("delete_namespace_success")
RenameNamespaceSuccessCount = NewCounterDef("rename_namespace_success")
Expand Down
25 changes: 14 additions & 11 deletions service/worker/scanner/executions/mutable_state_validator.go
Expand Up @@ -91,6 +91,20 @@ func (v *mutableStateValidator) Validate(

var results []MutableStateValidationResult

// First, check whether the data has expired based on its retention time.
retentionResult, err := v.validateRetention(
mutableState.GetExecutionInfo(),
mutableState.GetExecutionState().GetState(),
)
if err != nil {
return results, err
}
if retentionResult != nil {
// Skip all validation if the data is expired.
results = append(results, *retentionResult)
return results, nil
}

results = append(results, v.validateActivity(
mutableState.ActivityInfos,
lastItem.GetEventId())...,
Expand All @@ -116,17 +130,6 @@ func (v *mutableStateValidator) Validate(
lastItem.GetEventId())...,
)

retentionResult, err := v.validateRetention(
mutableState.GetExecutionInfo(),
mutableState.GetExecutionState().GetState(),
)
if err != nil {
return results, err
}
if retentionResult != nil {
results = append(results, *retentionResult)
}

return results, nil
}

Expand Down
34 changes: 26 additions & 8 deletions service/worker/scanner/executions/task.go
Expand Up @@ -94,7 +94,7 @@ func newTask(
historyClient: historyClient,
adminClient: adminClient,

metricsHandler: metricsHandler,
metricsHandler: metricsHandler.WithTags(metrics.OperationTag(metrics.ExecutionsScavengerScope)),
logger: logger,
scavenger: scavenger,

Expand All @@ -112,12 +112,15 @@ func (t *task) Run() executor.TaskStatus {
))

iter := collection.NewPagingIteratorWithToken(t.getPaginationFn(), t.paginationToken)
var retryTask bool
for iter.HasNext() {
_ = t.rateLimiter.Wait(t.ctx)
record, err := iter.Next()
if err != nil {
t.metricsHandler.Counter(metrics.ScavengerValidationSkipsCount.GetMetricName()).Record(1)
// Continue the validation process and retry after all workflow records have been iterated.
t.logger.Error("unable to paginate concrete execution", tag.ShardID(t.shardID), tag.Error(err))
return executor.TaskStatusDefer
retryTask = true
}

mutableState := &MutableState{WorkflowMutableState: record}
Expand All @@ -130,10 +133,21 @@ func (t *task) Run() executor.TaskStatus {
)
err = t.handleFailures(mutableState, results)
if err != nil {
t.logger.Error("unable to process failure result", tag.ShardID(t.shardID), tag.Error(err))
return executor.TaskStatusDefer
// Continue the validation process and retry after all workflow records have been iterated.
executionInfo := mutableState.GetExecutionInfo()
t.metricsHandler.Counter(metrics.ScavengerValidationSkipsCount.GetMetricName()).Record(1)
t.logger.Error("unable to process failure result",
tag.ShardID(t.shardID),
tag.Error(err),
tag.WorkflowNamespaceID(executionInfo.GetNamespaceId()),
tag.WorkflowID(executionInfo.GetWorkflowId()),
tag.WorkflowRunID(mutableState.GetExecutionState().GetRunId()))
retryTask = true
}
}
if retryTask {
return executor.TaskStatusDefer
}
return executor.TaskStatusDone
}

Expand Down Expand Up @@ -167,6 +181,11 @@ func (t *task) validate(
results = append(results, validationResults...)
}

// Fail fast if the mutable state is corrupted; there is no need to validate history.
if len(results) > 0 {
return results
}

if validationResults, err := NewHistoryEventIDValidator(
t.shardID,
t.executionManager,
Expand Down Expand Up @@ -254,15 +273,14 @@ func printValidationResult(
metricsHandler metrics.MetricsHandler,
logger log.Logger,
) {
handler := metricsHandler.WithTags(metrics.OperationTag(metrics.ExecutionsScavengerScope), metrics.FailureTag(""))
handler.Counter(metrics.ScavengerValidationRequestsCount.GetMetricName()).Record(1)
metricsHandler.Counter(metrics.ScavengerValidationRequestsCount.GetMetricName()).Record(1)
if len(results) == 0 {
return
}

handler.Counter(metrics.ScavengerValidationFailuresCount.GetMetricName()).Record(1)
metricsHandler.Counter(metrics.ScavengerValidationFailuresCount.GetMetricName()).Record(1)
for _, result := range results {
handler.Counter(metrics.ScavengerValidationFailuresCount.GetMetricName()).Record(1, metrics.FailureTag(result.failureType))
metricsHandler.Counter(metrics.ScavengerValidationFailuresCount.GetMetricName()).Record(1, metrics.FailureTag(result.failureType))
logger.Info(
"validation failed for execution.",
tag.WorkflowNamespaceID(mutableState.GetExecutionInfo().GetNamespaceId()),
Expand Down

0 comments on commit 0101924

Please sign in to comment.