Skip to content

Commit

Permalink
Improve build id scavenger (#4568)
Browse files Browse the repository at this point in the history
  • Loading branch information
dnr authored and mindaugasrukas committed Jul 1, 2023
1 parent ed7f94c commit 873df34
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 22 deletions.
2 changes: 2 additions & 0 deletions common/dynamicconfig/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ const (
// 2. There are delays in the visibility task processor (which is asynchronous).
// 3. There's propagation delay of the versioning data between matching nodes.
RemovableBuildIdDurationSinceDefault = "worker.removableBuildIdDurationSinceDefault"
// BuildIdScavengerVisibilityRPS is the rate limit for visibility calls from the build id scavenger
BuildIdScavenengerVisibilityRPS = "worker.buildIdScavengerVisibilityRPS"

// keys for frontend

Expand Down
14 changes: 7 additions & 7 deletions service/worker/scanner/build_ids/scavenger.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ package build_ids

import (
"context"
"math"
"time"

enumspb "go.temporal.io/api/enums/v1"
Expand Down Expand Up @@ -67,7 +66,6 @@ var (

type (
BuildIdScavangerInput struct {
VisibilityRPS float64
NamespaceListPageSize int
TaskQueueListPageSize int
}
Expand All @@ -86,6 +84,7 @@ type (
// 2. workflows with that identifier that have yet to be indexed in visibility
// The scavenger should allow enough time to pass before cleaning these build ids.
removableBuildIdDurationSinceDefault dynamicconfig.DurationPropertyFn
buildIdScavengerVisibilityRPS dynamicconfig.FloatPropertyFn
}

heartbeatDetails struct {
Expand All @@ -105,6 +104,7 @@ func NewActivities(
matchingClient matchingservice.MatchingServiceClient,
currentClusterName string,
removableBuildIdDurationSinceDefault dynamicconfig.DurationPropertyFn,
buildIdScavengerVisibilityRPS dynamicconfig.FloatPropertyFn,
) *Activities {
return &Activities{
logger: logger,
Expand All @@ -115,6 +115,7 @@ func NewActivities(
matchingClient: matchingClient,
currentClusterName: currentClusterName,
removableBuildIdDurationSinceDefault: removableBuildIdDurationSinceDefault,
buildIdScavengerVisibilityRPS: buildIdScavengerVisibilityRPS,
}
}

Expand All @@ -136,9 +137,6 @@ func (a *Activities) setDefaults(input *BuildIdScavangerInput) {
if input.TaskQueueListPageSize == 0 {
input.TaskQueueListPageSize = 100
}
if input.VisibilityRPS == 0 {
input.VisibilityRPS = 1
}
}

func (a *Activities) recordHeartbeat(ctx context.Context, heartbeat heartbeatDetails) {
Expand All @@ -155,7 +153,7 @@ func (a *Activities) ScavengeBuildIds(ctx context.Context, input BuildIdScavange
return temporal.NewNonRetryableApplicationError("failed to load previous heartbeat details", "TypeError", err)
}
}
rateLimiter := quotas.NewRateLimiter(input.VisibilityRPS, int(math.Ceil(input.VisibilityRPS)))
rateLimiter := quotas.NewDefaultOutgoingRateLimiter(quotas.RateFn(a.buildIdScavengerVisibilityRPS))
for {
nsResponse, err := a.metadataManager.ListNamespaces(ctx, &persistence.ListNamespacesRequest{
PageSize: input.NamespaceListPageSize,
Expand Down Expand Up @@ -208,14 +206,16 @@ func (a *Activities) processNamespaceEntry(
return err
}
for heartbeat.TaskQueueIdx < len(tqResponse.Entries) {
if ctx.Err() != nil {
return ctx.Err()
}
entry := tqResponse.Entries[heartbeat.TaskQueueIdx]
if err := a.processUserDataEntry(ctx, rateLimiter, *heartbeat, ns, entry); err != nil {
// Intentionally don't fail the activity on single entry.
a.logger.Error("Failed to update task queue user data",
tag.WorkflowNamespace(ns.Name().String()),
tag.WorkflowTaskQueueName(entry.TaskQueue),
tag.Error(err))
continue
}
heartbeat.TaskQueueIdx++
a.recordHeartbeat(ctx, *heartbeat)
Expand Down
29 changes: 14 additions & 15 deletions service/worker/scanner/build_ids/scavenger_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
"go.temporal.io/server/api/matchingservicemock/v1"
persistencespb "go.temporal.io/server/api/persistence/v1"
hlc "go.temporal.io/server/common/clock/hybrid_logical_clock"
"go.temporal.io/server/common/dynamicconfig"
"go.temporal.io/server/common/log"
"go.temporal.io/server/common/namespace"
"go.temporal.io/server/common/persistence"
Expand Down Expand Up @@ -84,11 +85,10 @@ func Test_findBuildIdsToRemove_FindsAllBuildIdsToRemove(t *testing.T) {
rateLimiter := quotas.NewMockRateLimiter(ctrl)

a := &Activities{
logger: log.NewCLILogger(),
visibilityManager: visiblityManager,
removableBuildIdDurationSinceDefault: func() time.Duration {
return time.Hour
},
logger: log.NewCLILogger(),
visibilityManager: visiblityManager,
removableBuildIdDurationSinceDefault: dynamicconfig.GetDurationPropertyFn(time.Hour),
buildIdScavengerVisibilityRPS: dynamicconfig.GetFloatPropertyFn(1.0),
}

visiblityManager.EXPECT().CountWorkflowExecutions(gomock.Any(), gomock.Any()).Times(4).DoAndReturn(
Expand Down Expand Up @@ -233,16 +233,15 @@ func Test_ScavengeBuildIds_Heartbeats(t *testing.T) {
matchingClient := matchingservicemock.NewMockMatchingServiceClient(ctrl)

a := &Activities{
logger: log.NewCLILogger(),
visibilityManager: visiblityManager,
metadataManager: metadataManager,
taskManager: taskManager,
namespaceRegistry: namespaceRegistry,
matchingClient: matchingClient,
removableBuildIdDurationSinceDefault: func() time.Duration {
return time.Hour
},
currentClusterName: "test-cluster",
logger: log.NewCLILogger(),
visibilityManager: visiblityManager,
metadataManager: metadataManager,
taskManager: taskManager,
namespaceRegistry: namespaceRegistry,
matchingClient: matchingClient,
removableBuildIdDurationSinceDefault: dynamicconfig.GetDurationPropertyFn(time.Hour),
buildIdScavengerVisibilityRPS: dynamicconfig.GetFloatPropertyFn(1.0),
currentClusterName: "test-cluster",
}

rateLimiter.EXPECT().Wait(gomock.Any()).AnyTimes()
Expand Down
3 changes: 3 additions & 0 deletions service/worker/scanner/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ type (
// RemovableBuildIdDurationSinceDefault is the minimum duration since a build id was last default in its
// containing set for it to be considered for removal.
RemovableBuildIdDurationSinceDefault dynamicconfig.DurationPropertyFn
// BuildIdScavengerVisibilityRPS is the rate limit for visibility calls from the build id scavenger
BuildIdScavengerVisibilityRPS dynamicconfig.FloatPropertyFn
}

// scannerContext is the context object that gets
Expand Down Expand Up @@ -209,6 +211,7 @@ func (s *Scanner) Start() error {
s.context.matchingClient,
s.context.currentClusterName,
s.context.cfg.RemovableBuildIdDurationSinceDefault,
s.context.cfg.BuildIdScavengerVisibilityRPS,
)

work := s.context.sdkClientFactory.NewWorker(s.context.sdkClientFactory.GetSystemClient(), build_ids.BuildIdScavengerTaskQueueName, workerOpts)
Expand Down
4 changes: 4 additions & 0 deletions service/worker/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,10 @@ func NewConfig(
dynamicconfig.RemovableBuildIdDurationSinceDefault,
time.Hour,
),
BuildIdScavengerVisibilityRPS: dc.GetFloat64Property(
dynamicconfig.BuildIdScavenengerVisibilityRPS,
1.0,
),
},
EnableBatcher: dc.GetBoolProperty(dynamicconfig.EnableBatcher, true),
BatcherRPS: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.BatcherRPS, batcher.DefaultRPS),
Expand Down

0 comments on commit 873df34

Please sign in to comment.