Skip to content

Commit

Permalink
Align tenant pruning according to wall clock (#7299)
Browse files Browse the repository at this point in the history
* Align tenant pruning according to wall clock.

Pruning a tenant currently acquires a lock on the tenant's TSDB,
which blocks reads from incoming queries. We have noticed spikes in
query latency when tenants get decommissioned since each receiver will
prune the tenant at a different time.

To reduce the window where queries get degraded, this commit makes sure that
pruning happens at predictable intervals by aligning it to the wall clock, similar
to how head compaction is aligned.

The commit also changes the tenant deletion condition to look at the duration
from the min time of the tenant, rather than from the last append time.

Signed-off-by: Filip Petkovski <filip.petkovsky@gmail.com>

* Improve tests

Signed-off-by: Filip Petkovski <filip.petkovsky@gmail.com>

---------

Signed-off-by: Filip Petkovski <filip.petkovsky@gmail.com>
  • Loading branch information
fpetkovski committed May 13, 2024
1 parent 2d738f0 commit da2bbb6
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
8 changes: 7 additions & 1 deletion cmd/thanos/receive.go
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,13 @@ func runReceive(
{
ctx, cancel := context.WithCancel(context.Background())
g.Add(func() error {
return runutil.Repeat(2*time.Hour, ctx.Done(), func() error {
pruneInterval := 2 * time.Duration(tsdbOpts.MaxBlockDuration) * time.Millisecond
return runutil.Repeat(time.Minute, ctx.Done(), func() error {
currentTime := time.Now()
currentTotalMinutes := currentTime.Hour()*60 + currentTime.Minute()
if currentTotalMinutes%int(pruneInterval.Minutes()) != 0 {
return nil
}
if err := dbs.Prune(ctx); err != nil {
level.Error(logger).Log("err", err)
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/receive/multitsdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/prometheus/prometheus/tsdb"

"github.com/thanos-io/objstore"

"github.com/thanos-io/thanos/pkg/api/status"
"github.com/thanos-io/thanos/pkg/block/metadata"
"github.com/thanos-io/thanos/pkg/component"
Expand Down Expand Up @@ -334,6 +335,7 @@ func (t *MultiTSDB) Prune(ctx context.Context) error {
if t.tsdbOpts.RetentionDuration == 0 {
return nil
}
level.Info(t.logger).Log("msg", "Running pruning job")

var (
wg sync.WaitGroup
Expand All @@ -342,7 +344,6 @@ func (t *MultiTSDB) Prune(ctx context.Context) error {
prunedTenants []string
pmtx sync.Mutex
)

t.mtx.RLock()
for tenantID, tenantInstance := range t.tenants {
wg.Add(1)
Expand Down
8 changes: 4 additions & 4 deletions pkg/receive/multitsdb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,10 @@ func TestMultiTSDBPrune(t *testing.T) {
)
defer func() { testutil.Ok(t, m.Close()) }()

for i := 0; i < 100; i++ {
testutil.Ok(t, appendSample(m, "deleted-tenant", time.UnixMilli(int64(10+i))))
testutil.Ok(t, appendSample(m, "compacted-tenant", time.Now().Add(-4*time.Hour)))
testutil.Ok(t, appendSample(m, "active-tenant", time.Now().Add(time.Duration(i)*time.Second)))
for step := time.Duration(0); step <= 2*time.Hour; step += time.Minute {
testutil.Ok(t, appendSample(m, "deleted-tenant", time.Now().Add(-9*time.Hour+step)))
testutil.Ok(t, appendSample(m, "compacted-tenant", time.Now().Add(-4*time.Hour+step)))
testutil.Ok(t, appendSample(m, "active-tenant", time.Now().Add(step)))
}
testutil.Equals(t, 3, len(m.TSDBLocalClients()))

Expand Down

0 comments on commit da2bbb6

Please sign in to comment.