Compactor: Adding minimum retention flag validation for downsampling retention (#5059)

* Renaming `DownsampleRange0`/`DownsampleRange1` -> `ResLevel1DownsampleRange`/`ResLevel2DownsampleRange` to make the mapping between the downsample ranges and their respective resolutions explicit

Signed-off-by: mzardab <mzardab@redhat.com>

* Adding compactor downsample retention flag validation...

...enforcing the following conditions when downsampling is enabled:
* Raw resolution retention must be at least 40 hours (the minimum block range after which 5m downsampling occurs)
* 5m resolution retention must be at least 10 days (the minimum block range after which 1h downsampling occurs)

This is to avoid unexpected deletion of downsampled/raw data: the compactor only downsamples raw blocks once there is enough data to form two whole blocks, so it does not downsample immediately. An illustrative set of passing flag values is shown below.

Signed-off-by: mzardab <mzardab@redhat.com>
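As an illustration, a retention configuration that passes these checks (the flag names are the compactor's documented retention flags; the values are hypothetical examples, not defaults):

```
--retention.resolution-raw=30d   # >= 40h, raw data survives until 5m downsampling occurs
--retention.resolution-5m=30d    # >= 10d, 5m data survives until 1h downsampling occurs
--retention.resolution-1h=30d    # not constrained by this validation
```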

* Added more clarifying docs on how retention impacts downsampling

Signed-off-by: mzardab <mzardab@redhat.com>

* Resolving conflicts

Signed-off-by: mzardab <mzardab@redhat.com>

* Updated CHANGELOG.md

Signed-off-by: mzardab <mzardab@redhat.com>

* Addressing PR comments

Signed-off-by: mzardab <mzardab@redhat.com>
moadz committed Jan 21, 2022
1 parent 5c11f24 commit ec73ed7
Showing 8 changed files with 39 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#4946](https://github.com/thanos-io/thanos/pull/4946) Store: Support tls_config configuration for the s3 minio client.
- [#4974](https://github.com/thanos-io/thanos/pull/4974) Store: Support tls_config configuration for connecting with Azure storage.
- [#4999](https://github.com/thanos-io/thanos/pull/4999) COS: Support `endpoint` configuration for vpc internal endpoint.
+- [#5059](https://github.com/thanos-io/thanos/pull/5059) Compactor: Adding minimum retention flag validation for downsampling retention.

### Fixed

8 changes: 8 additions & 0 deletions cmd/thanos/compact.go
@@ -388,9 +388,17 @@ func runCompact(
}

if retentionByResolution[compact.ResolutionLevelRaw].Seconds() != 0 {
+// If downsampling is enabled, error out if raw retention is shorter than the 40h block range after which 5m downsampling occurs.
+if !conf.disableDownsampling && retentionByResolution[compact.ResolutionLevelRaw].Milliseconds() < downsample.ResLevel1DownsampleRange {
+return errors.New("raw resolution retention must be higher than the minimum block size after which 5m resolution downsampling will occur (40 hours)")
+}
level.Info(logger).Log("msg", "retention policy of raw samples is enabled", "duration", retentionByResolution[compact.ResolutionLevelRaw])
}
if retentionByResolution[compact.ResolutionLevel5m].Seconds() != 0 {
+// If 5m retention is shorter than the 10d block range after which 1h downsampling occurs, no 1h downsampled data would ever be persisted.
+if !conf.disableDownsampling && retentionByResolution[compact.ResolutionLevel5m].Milliseconds() < downsample.ResLevel2DownsampleRange {
+return errors.New("5m resolution retention must be higher than the minimum block size after which 1h resolution downsampling will occur (10 days)")
+}
level.Info(logger).Log("msg", "retention policy of 5 min aggregated samples is enabled", "duration", retentionByResolution[compact.ResolutionLevel5m])
}
if retentionByResolution[compact.ResolutionLevel1h].Seconds() != 0 {
if retentionByResolution[compact.ResolutionLevel1h].Seconds() != 0 {
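For illustration, here is a minimal, self-contained Go sketch (not the Thanos source; `validateRetention` and its values are assumptions) of the behaviour these checks give. Note the range constants are millisecond values, so durations are compared via `Milliseconds()`:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// Mirrors the pkg/compact/downsample constants (millisecond values).
const (
	resLevel1DownsampleRange = 40 * 60 * 60 * 1000      // 40 hours.
	resLevel2DownsampleRange = 10 * 24 * 60 * 60 * 1000 // 10 days.
)

// validateRetention sketches the new checks: a zero retention means
// "keep forever" and always passes.
func validateRetention(raw, fiveMin time.Duration) error {
	if raw != 0 && raw.Milliseconds() < resLevel1DownsampleRange {
		return errors.New("raw retention < 40h: raw blocks would be deleted before 5m downsampling")
	}
	if fiveMin != 0 && fiveMin.Milliseconds() < resLevel2DownsampleRange {
		return errors.New("5m retention < 10d: 5m blocks would be deleted before 1h downsampling")
	}
	return nil
}

func main() {
	fmt.Println(validateRetention(24*time.Hour, 0))                 // error: raw < 40h
	fmt.Println(validateRetention(30*24*time.Hour, 5*24*time.Hour)) // error: 5m < 10d
	fmt.Println(validateRetention(0, 0))                            // <nil>: keep everything forever
}
```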
4 changes: 2 additions & 2 deletions cmd/thanos/downsample.go
@@ -283,7 +283,7 @@ metaSendLoop:
// Only downsample blocks once we are sure to get roughly 2 chunks out of it.
// NOTE(fabxc): this must match with at which block size the compactor creates downsampled
// blocks. Otherwise we may never downsample some data.
-if m.MaxTime-m.MinTime < downsample.DownsampleRange0 {
+if m.MaxTime-m.MinTime < downsample.ResLevel1DownsampleRange {
continue
}

@@ -301,7 +301,7 @@
// Only downsample blocks once we are sure to get roughly 2 chunks out of it.
// NOTE(fabxc): this must match with at which block size the compactor creates downsampled
// blocks. Otherwise we may never downsample some data.
-if m.MaxTime-m.MinTime < downsample.DownsampleRange1 {
+if m.MaxTime-m.MinTime < downsample.ResLevel2DownsampleRange {
continue
}
}
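A standalone sketch (hypothetical values; `minT`/`maxT` stand in for the block meta's `MinTime`/`MaxTime`) of the span gate used above. Block timestamps and the range constants are both in milliseconds:

```go
package main

import "fmt"

const resLevel1DownsampleRange = 40 * 60 * 60 * 1000 // ms, as in downsample.go

func main() {
	// Hypothetical raw block spanning 36 hours.
	minT, maxT := int64(0), int64(36*60*60*1000)
	if maxT-minT < resLevel1DownsampleRange {
		fmt.Println("skip: block spans < 40h, too early to downsample to 5m")
	}
}
```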
8 changes: 4 additions & 4 deletions cmd/thanos/main_test.go
@@ -121,7 +121,7 @@ func TestRegression4960_Deadlock(t *testing.T) {
ctx,
dir,
[]labels.Labels{{{Name: "a", Value: "1"}}},
-1, 0, downsample.DownsampleRange0+1, // Pass the minimum DownsampleRange0 check.
+1, 0, downsample.ResLevel1DownsampleRange+1, // Pass the minimum ResLevel1DownsampleRange check.
labels.Labels{{Name: "e1", Value: "1"}},
downsample.ResLevel0, metadata.NoneFunc)
testutil.Ok(t, err)
@@ -132,7 +132,7 @@ func TestRegression4960_Deadlock(t *testing.T) {
ctx,
dir,
[]labels.Labels{{{Name: "a", Value: "2"}}},
-1, 0, downsample.DownsampleRange0+1, // Pass the minimum DownsampleRange0 check.
+1, 0, downsample.ResLevel1DownsampleRange+1, // Pass the minimum ResLevel1DownsampleRange check.
labels.Labels{{Name: "e1", Value: "2"}},
downsample.ResLevel0, metadata.NoneFunc)
testutil.Ok(t, err)
@@ -143,7 +143,7 @@ func TestRegression4960_Deadlock(t *testing.T) {
ctx,
dir,
[]labels.Labels{{{Name: "a", Value: "2"}}},
-1, 0, downsample.DownsampleRange0+1, // Pass the minimum DownsampleRange0 check.
+1, 0, downsample.ResLevel1DownsampleRange+1, // Pass the minimum ResLevel1DownsampleRange check.
labels.Labels{{Name: "e1", Value: "2"}},
downsample.ResLevel0, metadata.NoneFunc)
testutil.Ok(t, err)
@@ -183,7 +183,7 @@ func TestCleanupDownsampleCacheFolder(t *testing.T) {
ctx,
dir,
[]labels.Labels{{{Name: "a", Value: "1"}}},
-1, 0, downsample.DownsampleRange0+1, // Pass the minimum DownsampleRange0 check.
+1, 0, downsample.ResLevel1DownsampleRange+1, // Pass the minimum ResLevel1DownsampleRange check.
labels.Labels{{Name: "e1", Value: "1"}},
downsample.ResLevel0, metadata.NoneFunc)
testutil.Ok(t, err)
13 changes: 9 additions & 4 deletions docs/components/compact.md
@@ -2,10 +2,9 @@

The `thanos compact` command applies the compaction procedure of the Prometheus 2.0 storage engine to block data stored in object storage. It is generally not semantically concurrency safe and must be deployed as a singleton against a bucket.

-Compactor is also responsible for downsampling of data:
-
-* Creating 5m downsampling for blocks larger than **40 hours** (2d, 2w)
-* Creating 1h downsampling for blocks larger than **10 days** (2w)
+Compactor is also responsible for downsampling of data. There is a time delay before downsampling at a given resolution is possible. This is necessary because downsampled chunks will have fewer samples in them, and as chunks are a fixed size, data spanning more time is required to fill them.
+* Creating 5m downsampling for blocks older than **40 hours** (2d)
+* Creating 1h downsampling for blocks older than **10 days** (2w)

Example:

@@ -161,6 +160,12 @@ Resolution is a distance between data points on your graphs. E.g.
* `5 minutes` - data point is every 5 minutes
* `1 hour` - data point is every 1h

+Compactor downsampling is done in two passes:
+1) All raw resolution metrics that are older than **40 hours** are downsampled at a 5m resolution
+2) All 5m resolution metrics older than **10 days** are downsampled at a 1h resolution
+
+> **NOTE:** If retention at each resolution is lower than the minimum age for the successive downsampling pass, data will be deleted before downsampling can be completed. As a rule of thumb, retention for each downsampling level should be the same, and should be greater than the longest downsampling delay (10 days, for the 5m-to-1h pass). See the illustrative flag values below.
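For example (hypothetical values), the following configuration would lose data without the new validation: 5m blocks would be deleted before they ever span the 10 days required for the 1h pass, so no 1h data would be produced. With this change the compactor instead rejects such flags at startup:

```
--retention.resolution-raw=30d
--retention.resolution-5m=5d    # < 10d: 5m blocks deleted before 1h downsampling can occur
--retention.resolution-1h=30d
```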
Keep in mind that the initial goal of downsampling is not saving disk or object storage space. In fact, downsampling doesn't save you **any** space; instead, it adds two more blocks for each raw block, each only slightly smaller than or of similar size to the raw block. This is because, to stay mathematically correct, the internal downsampling implementation stores several aggregations (count, sum, min, max, counter). This means that downsampling can increase the size of your storage a bit (~3x) if you choose to store all resolutions (recommended, and the default).

The goal of downsampling is to provide an opportunity to get fast results for range queries over long time intervals, like months or years. In other words, if you set `--retention.resolution-raw` lower than `--retention.resolution-5m` and `--retention.resolution-1h`, you might run into a problem of not being able to "zoom in" to your historical data.
8 changes: 4 additions & 4 deletions pkg/compact/compact.go
@@ -123,9 +123,9 @@ func UntilNextDownsampling(m *metadata.Meta) (time.Duration, error) {
case downsample.ResLevel2:
return time.Duration(0), errors.New("no downsampling")
case downsample.ResLevel1:
-return time.Duration(downsample.DownsampleRange1*time.Millisecond) - timeRange, nil
+return time.Duration(downsample.ResLevel2DownsampleRange*time.Millisecond) - timeRange, nil
case downsample.ResLevel0:
-return time.Duration(downsample.DownsampleRange0*time.Millisecond) - timeRange, nil
+return time.Duration(downsample.ResLevel1DownsampleRange*time.Millisecond) - timeRange, nil
default:
panic(errors.Errorf("invalid resolution %v", m.Thanos.Downsample.Resolution))
}
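As a worked example of the arithmetic above (hypothetical block span), a raw `ResLevel0` block currently spanning 30 hours is 10 hours away from 5m downsampling:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	const resLevel1DownsampleRange = 40 * 60 * 60 * 1000 // ms, as in downsample.go
	timeRange := 30 * time.Hour                          // hypothetical block span
	until := time.Duration(resLevel1DownsampleRange)*time.Millisecond - timeRange
	fmt.Println(until) // 10h0m0s
}
```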
@@ -637,7 +637,7 @@ func (ds *DownsampleProgressCalculator) ProgressCalculate(ctx context.Context, g
continue
}

-if m.MaxTime-m.MinTime < downsample.DownsampleRange0 {
+if m.MaxTime-m.MinTime < downsample.ResLevel1DownsampleRange {
continue
}
groupBlocks[group.key]++
@@ -653,7 +653,7 @@
continue
}

-if m.MaxTime-m.MinTime < downsample.DownsampleRange1 {
+if m.MaxTime-m.MinTime < downsample.ResLevel2DownsampleRange {
continue
}
groupBlocks[group.key]++
18 changes: 9 additions & 9 deletions pkg/compact/compact_test.go
@@ -509,10 +509,10 @@ func TestDownsampleProgressCalculate(t *testing.T) {
// This test case has blocks from multiple groups and resolution levels. Only the blocks in the second group should be downsampled since the others either have time differences not in the range for their resolution, or a resolution which should not be downsampled.
testName: "multi_group_test",
input: []*metadata.Meta{
-createBlockMeta(6, 1, downsample.DownsampleRange0, map[string]string{"a": "1"}, downsample.ResLevel0, []uint64{7, 8}),
-createBlockMeta(7, 0, downsample.DownsampleRange1, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{8, 9}),
-createBlockMeta(9, 0, downsample.DownsampleRange1, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{8, 11}),
-createBlockMeta(8, 0, downsample.DownsampleRange1, map[string]string{"a": "1", "b": "2"}, downsample.ResLevel2, []uint64{9, 10}),
+createBlockMeta(6, 1, downsample.ResLevel1DownsampleRange, map[string]string{"a": "1"}, downsample.ResLevel0, []uint64{7, 8}),
+createBlockMeta(7, 0, downsample.ResLevel2DownsampleRange, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{8, 9}),
+createBlockMeta(9, 0, downsample.ResLevel2DownsampleRange, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{8, 11}),
+createBlockMeta(8, 0, downsample.ResLevel2DownsampleRange, map[string]string{"a": "1", "b": "2"}, downsample.ResLevel2, []uint64{9, 10}),
},
expected: map[string]float64{
keys[0]: 0.0,
@@ -524,7 +524,7 @@
// This block should be downsampled.
testName: "res_level0_test",
input: []*metadata.Meta{
-createBlockMeta(9, 0, downsample.DownsampleRange0, map[string]string{"a": "1"}, downsample.ResLevel0, []uint64{10, 11}),
+createBlockMeta(9, 0, downsample.ResLevel1DownsampleRange, map[string]string{"a": "1"}, downsample.ResLevel0, []uint64{10, 11}),
},
expected: map[string]float64{
keys[0]: 1.0,
@@ -534,7 +534,7 @@
// This block should be downsampled.
testName: "res_level1_test",
input: []*metadata.Meta{
-createBlockMeta(9, 0, downsample.DownsampleRange1, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{10, 11}),
+createBlockMeta(9, 0, downsample.ResLevel2DownsampleRange, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{10, 11}),
},
expected: map[string]float64{
keys[1]: 1.0,
@@ -545,7 +545,7 @@
// Blocks with this resolution should not be downsampled.
testName: "res_level2_test",
input: []*metadata.Meta{
-createBlockMeta(10, 0, downsample.DownsampleRange1, map[string]string{"a": "1", "b": "2"}, downsample.ResLevel2, []uint64{11, 12}),
+createBlockMeta(10, 0, downsample.ResLevel2DownsampleRange, map[string]string{"a": "1", "b": "2"}, downsample.ResLevel2, []uint64{11, 12}),
},
expected: map[string]float64{
keys[2]: 0.0,
@@ -555,7 +555,7 @@
// This block should not be downsampled, as its time range is one less than ResLevel1DownsampleRange.
testName: "res_level0_test_incorrect",
input: []*metadata.Meta{
-createBlockMeta(9, 1, downsample.DownsampleRange0, map[string]string{"a": "1"}, downsample.ResLevel0, []uint64{10, 11}),
+createBlockMeta(9, 1, downsample.ResLevel1DownsampleRange, map[string]string{"a": "1"}, downsample.ResLevel0, []uint64{10, 11}),
},
expected: map[string]float64{
keys[0]: 0.0,
@@ -566,7 +566,7 @@
// This block should not be downsampled, as its time range is one less than ResLevel2DownsampleRange.
testName: "res_level1_test_incorrect",
input: []*metadata.Meta{
-createBlockMeta(9, 1, downsample.DownsampleRange1, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{10, 11}),
+createBlockMeta(9, 1, downsample.ResLevel2DownsampleRange, map[string]string{"b": "2"}, downsample.ResLevel1, []uint64{10, 11}),
},
expected: map[string]float64{
keys[1]: 0.0,
4 changes: 2 additions & 2 deletions pkg/compact/downsample/downsample.go
@@ -35,8 +35,8 @@ const (

// Downsampling ranges i.e. minimum block size after which we start to downsample blocks (in milliseconds).
const (
-DownsampleRange0 = 40 * 60 * 60 * 1000 // 40 hours.
-DownsampleRange1 = 10 * 24 * 60 * 60 * 1000 // 10 days.
+ResLevel1DownsampleRange = 40 * 60 * 60 * 1000 // 40 hours.
+ResLevel2DownsampleRange = 10 * 24 * 60 * 60 * 1000 // 10 days.
)

// Downsample downsamples the given block. It writes a new block into dir and returns its ID.
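A quick standalone check (a sketch using the same literal values) confirming these ranges are millisecond quantities:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	fmt.Println(time.Duration(40*60*60*1000) * time.Millisecond)    // 40h0m0s  (ResLevel1DownsampleRange)
	fmt.Println(time.Duration(10*24*60*60*1000) * time.Millisecond) // 240h0m0s (ResLevel2DownsampleRange)
}
```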
