Skip to content

Commit 9b43ad0

Browse files
authored
fix(host-cleanup): ensure no parallel processes of auto host cleanup (#6736)
Signed-off-by: Alexandr Zaytsev <alexandr.zaytsev@flant.com>
1 parent 967c6c3 commit 9b43ad0

File tree

7 files changed

+76
-51
lines changed

7 files changed

+76
-51
lines changed

Diff for: cmd/werf/common/common.go

+2-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/werf/logboek/pkg/types"
2020
"github.com/werf/nelm/pkg/action"
2121
"github.com/werf/nelm/pkg/log"
22+
"github.com/werf/werf/v2/pkg/background"
2223
"github.com/werf/werf/v2/pkg/build"
2324
"github.com/werf/werf/v2/pkg/build/stage"
2425
"github.com/werf/werf/v2/pkg/config"
@@ -1659,7 +1660,7 @@ func GetContextWithLogger() context.Context {
16591660
func WithContext(allowBackgroundMode bool, f func(ctx context.Context) error) error {
16601661
var ctx context.Context
16611662

1662-
if allowBackgroundMode && IsBackgroundModeEnabled() {
1663+
if allowBackgroundMode && background.IsBackgroundModeEnabled() {
16631664
out, err := os.OpenFile(GetBackgroundOutputFile(), os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o644)
16641665
if err != nil {
16651666
return fmt.Errorf("unable to open background output file %q: %w", GetBackgroundOutputFile(), err)
@@ -1708,10 +1709,6 @@ func GetAndRemoveLastBackgroundError() (error, error) {
17081709
return nil, nil
17091710
}
17101711

1711-
func IsBackgroundModeEnabled() bool {
1712-
return os.Getenv("_WERF_BACKGROUND_MODE_ENABLED") == "1"
1713-
}
1714-
17151712
func GetBackgroundOutputFile() string {
17161713
return filepath.Join(werf.GetServiceDir(), "background_output.log")
17171714
}

Diff for: cmd/werf/common/host_cleanup.go

+8-10
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,14 @@ func RunAutoHostCleanup(ctx context.Context, cmdData *CmdData, containerBackend
2626
return fmt.Errorf("incompatible params --allowed-local-cache-volume-usage=%d and --allowed-local-cache-volume-usage-margin=%d: margin percentage should be less than allowed volume usage level percentage", *cmdData.AllowedLocalCacheVolumeUsage, *cmdData.AllowedLocalCacheVolumeUsageMargin)
2727
}
2828

29-
return host_cleaning.RunAutoHostCleanup(ctx, containerBackend, host_cleaning.AutoHostCleanupOptions{
30-
HostCleanupOptions: host_cleaning.HostCleanupOptions{
31-
DryRun: false,
32-
Force: false,
33-
AllowedBackendStorageVolumeUsagePercentage: cmdData.AllowedBackendStorageVolumeUsage,
34-
AllowedBackendStorageVolumeUsageMarginPercentage: cmdData.AllowedBackendStorageVolumeUsageMargin,
35-
AllowedLocalCacheVolumeUsagePercentage: cmdData.AllowedLocalCacheVolumeUsage,
36-
AllowedLocalCacheVolumeUsageMarginPercentage: cmdData.AllowedLocalCacheVolumeUsageMargin,
37-
BackendStoragePath: cmdData.BackendStoragePath,
38-
},
29+
return host_cleaning.RunAutoHostCleanup(ctx, containerBackend, host_cleaning.HostCleanupOptions{
30+
DryRun: false,
31+
Force: false,
32+
AllowedBackendStorageVolumeUsagePercentage: cmdData.AllowedBackendStorageVolumeUsage,
33+
AllowedBackendStorageVolumeUsageMarginPercentage: cmdData.AllowedBackendStorageVolumeUsageMargin,
34+
AllowedLocalCacheVolumeUsagePercentage: cmdData.AllowedLocalCacheVolumeUsage,
35+
AllowedLocalCacheVolumeUsageMarginPercentage: cmdData.AllowedLocalCacheVolumeUsageMargin,
36+
BackendStoragePath: cmdData.BackendStoragePath,
3937
})
4038
}
4139

Diff for: cmd/werf/main.go

+8
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,18 @@ import (
1414
"github.com/werf/nelm/pkg/resrcchangcalc"
1515
"github.com/werf/werf/v2/cmd/werf/common"
1616
"github.com/werf/werf/v2/cmd/werf/root"
17+
"github.com/werf/werf/v2/pkg/background"
1718
"github.com/werf/werf/v2/pkg/process_exterminator"
1819
)
1920

2021
func main() {
22+
// IMPORTANT. In background mode we MUST take host-lock to prevent parallel "werf host cleanup" processes.
23+
// The processes write data to the same log files that causes "data race".
24+
// We don't need to release the lock manually, because it does automatically when the background process will end.
25+
if background.IsBackgroundModeEnabled() && !background.TryLock() {
26+
return
27+
}
28+
2129
ctx := common.GetContextWithLogger()
2230

2331
root.PrintStackTraces()

Diff for: pkg/background/background.go

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package background
2+
3+
import (
4+
"os"
5+
6+
chart "github.com/werf/common-go/pkg/lock"
7+
"github.com/werf/lockgate"
8+
)
9+
10+
func IsBackgroundModeEnabled() bool {
11+
return os.Getenv("_WERF_BACKGROUND_MODE_ENABLED") == "1"
12+
}
13+
14+
// TryLock tries background host-lock or panics if error.
15+
func TryLock() bool {
16+
locker, err := chart.HostLocker()
17+
if err != nil {
18+
panic(err)
19+
}
20+
lockName := "werf.background-process"
21+
lockOptions := lockgate.AcquireOptions{NonBlocking: true}
22+
ok, _, err := locker.Acquire(lockName, lockOptions)
23+
if err != nil {
24+
panic(err)
25+
}
26+
return ok
27+
}

Diff for: pkg/host_cleaning/host_cleanup.go

+7-17
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,6 @@ type HostCleanupOptions struct {
3030
Force bool
3131
}
3232

33-
type AutoHostCleanupOptions struct {
34-
HostCleanupOptions
35-
36-
ForceShouldRun bool
37-
}
38-
3933
func getOptionValueOrDefault(optionValue *uint, defaultValue float64) float64 {
4034
var res float64
4135
if optionValue != nil {
@@ -46,16 +40,12 @@ func getOptionValueOrDefault(optionValue *uint, defaultValue float64) float64 {
4640
return res
4741
}
4842

49-
func RunAutoHostCleanup(ctx context.Context, backend container_backend.ContainerBackend, options AutoHostCleanupOptions) error {
50-
if !options.ForceShouldRun {
51-
shouldRun, err := ShouldRunAutoHostCleanup(ctx, backend, options.HostCleanupOptions)
52-
if err != nil {
53-
logboek.Context(ctx).Warn().LogF("WARNING: unable to check if auto host cleanup should be run: %s\n", err)
54-
return nil
55-
}
56-
if !shouldRun {
57-
return nil
58-
}
43+
func RunAutoHostCleanup(ctx context.Context, backend container_backend.ContainerBackend, options HostCleanupOptions) error {
44+
if shouldRun, err := shouldRunAutoHostCleanup(ctx, backend, options); err != nil {
45+
logboek.Context(ctx).Warn().LogF("WARNING: unable to check if auto host cleanup should be run: %s\n", err)
46+
return nil
47+
} else if !shouldRun {
48+
return nil
5949
}
6050

6151
logboek.Context(ctx).Debug().LogF("RunAutoHostCleanup forking ...\n")
@@ -135,7 +125,7 @@ func RunHostCleanup(ctx context.Context, backend container_backend.ContainerBack
135125
})
136126
}
137127

138-
func ShouldRunAutoHostCleanup(ctx context.Context, backend container_backend.ContainerBackend, options HostCleanupOptions) (bool, error) {
128+
func shouldRunAutoHostCleanup(ctx context.Context, backend container_backend.ContainerBackend, options HostCleanupOptions) (bool, error) {
139129
shouldRun, err := tmp_manager.ShouldRunAutoGC()
140130
if err != nil {
141131
return false, fmt.Errorf("failed to check tmp manager GC: %w", err)

Diff for: pkg/host_cleaning/host_lock.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ import (
99
"github.com/werf/logboek"
1010
)
1111

12-
// withHostLockOrNothing executes callback function if "soft" (NonBlocking=true) lock is acquired. Otherwise, does nothing.
12+
// withHostLockOrNothing tries a lock. If the lock is acquired, executes callback and releases the lock after.
13+
// If the lock is NOT acquired, does nothing.
1314
func withHostLockOrNothing(ctx context.Context, lockName string, callback func() error) error {
1415
lockOptions := lockgate.AcquireOptions{NonBlocking: true}
1516

@@ -23,5 +24,6 @@ func withHostLockOrNothing(ctx context.Context, lockName string, callback func()
2324
return nil
2425
}
2526

27+
// Should we handle panic here and release the lock anyway?
2628
return errors.Join(callback(), chart.ReleaseHostLock(lock)) // join non-nil errors or return nil
2729
}

Diff for: pkg/host_cleaning/host_lock_test.go

+21-18
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,37 @@ import (
77
. "github.com/onsi/gomega"
88
)
99

10-
var _ = Describe("withHostLockOrNothing", func() {
10+
var _ = Describe("host lock", func() {
1111
var ctx context.Context
12-
var spy *spyHostLockCallback
1312
BeforeEach(func() {
1413
ctx = context.Background()
15-
spy = &spyHostLockCallback{}
1614
})
17-
It("should call callback function if lock is acquired", func() {
18-
lockName := "test"
19-
err := withHostLockOrNothing(ctx, lockName, spy.Method)
20-
Expect(err).To(Succeed())
21-
Expect(spy.callsCount).To(Equal(1))
22-
})
23-
It("should not call callback function if lock isn't acquired", func() {
24-
lockName := "test"
25-
err := withHostLockOrNothing(ctx, lockName, func() error {
26-
return withHostLockOrNothing(ctx, lockName, spy.Method) // lock is already acquired in parent function
15+
Describe("withHostLockOrNothing", func() {
16+
It("should call callback function if lock is acquired", func() {
17+
spy := &spyHostLock{}
18+
lockName := "test"
19+
err := withHostLockOrNothing(ctx, lockName, spy.Handle1)
20+
Expect(err).To(Succeed())
21+
Expect(spy.callsCount).To(Equal(1))
22+
})
23+
It("should not call callback function if lock isn't acquired", func() {
24+
spy := &spyHostLock{}
25+
lockName := "test"
26+
err := withHostLockOrNothing(ctx, lockName, func() error {
27+
return withHostLockOrNothing(ctx, lockName, spy.Handle1) // lock is already acquired in parent function
28+
})
29+
Expect(err).To(Succeed())
30+
Expect(spy.callsCount).To(Equal(0))
2731
})
28-
Expect(err).To(Succeed())
29-
Expect(spy.callsCount).To(Equal(0))
3032
})
3133
})
3234

33-
type spyHostLockCallback struct {
35+
type spyHostLock struct {
3436
callsCount int
37+
err error
3538
}
3639

37-
func (s *spyHostLockCallback) Method() error {
40+
func (s *spyHostLock) Handle1() error {
3841
s.callsCount++
39-
return nil
42+
return s.err
4043
}

0 commit comments

Comments
 (0)