From 8c76dab7fbae6557e5044243967abf6d2c6b764a Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 08:09:51 +0100 Subject: [PATCH 01/43] added unit tests for long labels and no external labels Signed-off-by: utukj --- pkg/query/endpointset.go | 3 + pkg/query/endpointset_test.go | 102 ++++++++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 1660f85853..079bd9d25d 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -43,6 +43,8 @@ type GRPCEndpointSpec struct { isStrictStatic bool } +var externalLabelLimit = 1000 + // NewGRPCEndpointSpec creates gRPC endpoint spec. // It uses InfoAPI to get Metadata. func NewGRPCEndpointSpec(addr string, isStrictStatic bool) *GRPCEndpointSpec { @@ -268,6 +270,7 @@ func NewEndpointSet( dialOpts []grpc.DialOption, unhealthyEndpointTimeout time.Duration, endpointInfoTimeout time.Duration, + endpointMetricLabels ...string, ) *EndpointSet { endpointsMetric := newEndpointSetNodeCollector() if reg != nil { diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 1274648b93..a07880520c 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -9,6 +9,7 @@ import ( "fmt" "math" "net" + "strings" "sync" "testing" "time" @@ -23,6 +24,7 @@ import ( "google.golang.org/grpc/credentials/insecure" "github.com/pkg/errors" + promtestutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/info/infopb" "github.com/thanos-io/thanos/pkg/store/labelpb" @@ -336,6 +338,44 @@ func TestEndpointSetUpdate(t *testing.T) { strict: true, expectedEndpoints: 1, }, + { + name: "long external labels", + endpoints: []testEndpointMeta{ + { + InfoResponse: sidecarInfo, + extlsetFn: func(addr string) []labelpb.ZLabelSet { + sLabel := []string{} + for i := 0; i < 1000; i++ { + sLabel = append(sLabel, "lbl") + sLabel = append(sLabel, "val") + } + return labelpb.ZLabelSetsFromPromLabels( + labels.FromStrings(sLabel...), + ) + }, + }, + }, + expectedEndpoints: 1, + }, + { + name: "no external labels", + endpoints: []testEndpointMeta{ + { + InfoResponse: sidecarInfo, + extlsetFn: func(addr string) []labelpb.ZLabelSet { + sLabel := []string{} + for i := 0; i < 1000; i++ { + sLabel = append(sLabel, "lbl") + sLabel = append(sLabel, "val") + } + return labelpb.ZLabelSetsFromPromLabels( + labels.FromStrings(sLabel...), + ) + }, + }, + }, + expectedEndpoints: 1, + }, } for _, tc := range testCases { @@ -345,12 +385,66 @@ func TestEndpointSetUpdate(t *testing.T) { defer endpoints.Close() discoveredEndpointAddr := endpoints.EndpointAddresses() - endpointSet := makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now) + var endpointSet *EndpointSet + // specify only "store_type" to exclude "external_labels" + if tc.name == "no external labels" { + endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, "store_type") + } else { + endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now) + } defer endpointSet.Close() endpointSet.Update(context.Background()) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetEndpointStatus())) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetStoreClients())) + // slow or unavailable endpoint should collect nothing + if tc.name == "slow endpoint" || tc.name == "unavailable endpoint" { + testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, 
strings.NewReader(""))) + return + } + + if tc.name == "no external labels" { + expectedMetrics := fmt.Sprintf( + ` + # HELP thanos_store_nodes_grpc_connections Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier. + # TYPE thanos_store_nodes_grpc_connections gauge + thanos_store_nodes_grpc_connections{store_type="sidecar"} %d + `, + tc.expectedEndpoints, + ) + testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(expectedMetrics))) + return + } + + var externalLabels string + if tc.name == "long external labels" { + externalLabels = strings.Repeat(`lbl="val", `, 1000) + externalLabels = externalLabels[:len(externalLabels)-2] + } else { + externalLabels = fmt.Sprintf(`a="b", addr=%q`, discoveredEndpointAddr[0]) + } + // labels too long must be trimmed + if len(externalLabels) > externalLabelLimit { + externalLabels = externalLabels[:externalLabelLimit] + } + // add backslash escape for every quote character + var lbl strings.Builder + for _, ch := range externalLabels { + if string(ch) == `"` { + lbl.WriteString(`\`) + } + lbl.WriteRune(ch) + } + expectedMetrics := fmt.Sprintf( + ` + # HELP thanos_store_nodes_grpc_connections Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier. + # TYPE thanos_store_nodes_grpc_connections gauge + thanos_store_nodes_grpc_connections{external_labels="{%s}",store_type="sidecar"} %d + `, + lbl.String(), + tc.expectedEndpoints, + ) + testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(expectedMetrics))) }) } } @@ -576,7 +670,7 @@ func TestEndpointSetUpdate_AtomicEndpointAdditions(t *testing.T) { wg.Wait() } -func TestEndpointSet_Update(t *testing.T) { +func TestEndpointSetUpdate_AvailabilityScenarios(t *testing.T) { endpoints, err := startTestEndpoints([]testEndpointMeta{ { InfoResponse: sidecarInfo, @@ -1493,7 +1587,7 @@ func TestUpdateEndpointStateForgetsPreviousErrors(t *testing.T) { testutil.Equals(t, `null`, string(b)) } -func makeEndpointSet(discoveredEndpointAddr []string, strict bool, now nowFunc) *EndpointSet { +func makeEndpointSet(discoveredEndpointAddr []string, strict bool, now nowFunc, metricLabels ...string) *EndpointSet { endpointSet := NewEndpointSet(now, nil, nil, func() (specs []*GRPCEndpointSpec) { for _, addr := range discoveredEndpointAddr { @@ -1501,7 +1595,7 @@ func makeEndpointSet(discoveredEndpointAddr []string, strict bool, now nowFunc) } return specs }, - testGRPCOpts, time.Minute, time.Second) + testGRPCOpts, time.Minute, time.Second, metricLabels...) 
return endpointSet } From c5f153654424dee95501e40c6b388a43333fee00 Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 08:19:10 +0100 Subject: [PATCH 02/43] trimmed too long external labels Signed-off-by: utukj --- pkg/query/endpointset.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 079bd9d25d..a7e4cd35b7 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -185,15 +185,19 @@ type endpointSetNodeCollector struct { storePerExtLset map[string]int connectionsDesc *prometheus.Desc + requiredLabels []string } -func newEndpointSetNodeCollector() *endpointSetNodeCollector { +func newEndpointSetNodeCollector(requiredLabels ...string) *endpointSetNodeCollector { + if len(requiredLabels) == 0 { + requiredLabels = []string{"external_labels", "store_type"} + } return &endpointSetNodeCollector{ storeNodes: map[component.Component]map[string]int{}, connectionsDesc: prometheus.NewDesc( "thanos_store_nodes_grpc_connections", "Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier.", - []string{"external_labels", "store_type"}, nil, + requiredLabels, nil, ), } } @@ -205,6 +209,9 @@ func (c *endpointSetNodeCollector) Update(nodes map[component.Component]map[stri for k, v := range nodes { storeNodes[k] = make(map[string]int, len(v)) for kk, vv := range v { + if len(kk) > externalLabelLimit { + kk = kk[:externalLabelLimit+1] + "}" + } storePerExtLset[kk] += vv storeNodes[k][kk] = vv } From 745e729be76c2db29f85d3c631dfc5340afbb881 Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 08:26:37 +0100 Subject: [PATCH 03/43] added optional label selection Signed-off-by: utukj --- pkg/query/endpointset.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index a7e4cd35b7..899644d757 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -199,6 +199,7 @@ func newEndpointSetNodeCollector(requiredLabels ...string) *endpointSetNodeColle "Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier.", requiredLabels, nil, ), + requiredLabels: requiredLabels, } } @@ -237,7 +238,17 @@ func (c *endpointSetNodeCollector) Collect(ch chan<- prometheus.Metric) { if storeType != nil { storeTypeStr = storeType.String() } - ch <- prometheus.MustNewConstMetric(c.connectionsDesc, prometheus.GaugeValue, float64(occurrences), externalLabels, storeTypeStr) + // select only required labels + lbls := []string{} + for _, lbl := range c.requiredLabels { + switch lbl { + case "external_labels": + lbls = append(lbls, externalLabels) + case "store_type": + lbls = append(lbls, storeTypeStr) + } + } + ch <- prometheus.MustNewConstMetric(c.connectionsDesc, prometheus.GaugeValue, float64(occurrences), lbls...) } } } @@ -279,7 +290,7 @@ func NewEndpointSet( endpointInfoTimeout time.Duration, endpointMetricLabels ...string, ) *EndpointSet { - endpointsMetric := newEndpointSetNodeCollector() + endpointsMetric := newEndpointSetNodeCollector(endpointMetricLabels...) 
if reg != nil { reg.MustRegister(endpointsMetric) } From 72ce9759bbc5e6d7f8935edd0c705a9aec17706a Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 08:53:30 +0100 Subject: [PATCH 04/43] added cmd flag for choosing metric labels Signed-off-by: utukj --- cmd/thanos/query.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index e55bf4b520..776dbcb89a 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -108,6 +108,8 @@ func registerQuery(app *extkingpin.App) { maxConcurrentSelects := cmd.Flag("query.max-concurrent-select", "Maximum number of select requests made concurrently per a query."). Default("4").Int() + queryMetricLabels := cmd.Flag("query.metric-label", "Optional selection of metrics to be collected").Strings() + queryReplicaLabels := cmd.Flag("query.replica-label", "Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules."). Strings() @@ -275,6 +277,7 @@ func registerQuery(app *extkingpin.App) { *dynamicLookbackDelta, time.Duration(*defaultEvaluationInterval), time.Duration(*storeResponseTimeout), + *queryMetricLabels, *queryReplicaLabels, selectorLset, getFlagsMap(cmd.Flags()), @@ -347,6 +350,7 @@ func runQuery( dynamicLookbackDelta bool, defaultEvaluationInterval time.Duration, storeResponseTimeout time.Duration, + queryMetricLabels []string, queryReplicaLabels []string, selectorLset labels.Labels, flagsMap map[string]string, @@ -486,6 +490,7 @@ func runQuery( dialOpts, unhealthyStoreTimeout, endpointInfoTimeout, + queryMetricLabels..., ) proxy = store.NewProxyStore(logger, reg, endpoints.GetStoreClients, component.Query, selectorLset, storeResponseTimeout, store.RetrievalStrategy(grpcProxyStrategy)) rulesProxy = rules.NewProxy(logger, endpoints.GetRulesClients) From 2c08e6dec97e854ec94cacb11bc6652d85b6e00c Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 09:31:24 +0100 Subject: [PATCH 05/43] updated docs Signed-off-by: utukj --- docs/components/query.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/components/query.md b/docs/components/query.md index 8b64a4830b..a2e8c91f56 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -369,6 +369,8 @@ Flags: when the range parameters are not specified. The zero value means range covers the time since the beginning. + --query.metric-label=QUERY.METRIC-LABEL ... + Optional selection of metrics to be collected --query.partial-response Enable partial response for queries if no partial_response param is specified. --no-query.partial-response for disabling. From 9fe8ea73c3623a1d392315f19e597e90020f8c2c Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Thu, 13 Oct 2022 12:09:51 +0100 Subject: [PATCH 06/43] Update pkg/query/endpointset.go full sentence fix Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 899644d757..fd5b0f1c9f 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -238,7 +238,7 @@ func (c *endpointSetNodeCollector) Collect(ch chan<- prometheus.Metric) { if storeType != nil { storeTypeStr = storeType.String() } - // select only required labels + // Select only required labels. 
lbls := []string{} for _, lbl := range c.requiredLabels { switch lbl { From 2107140a809a3f35058d93adddb04d1f7b1bc262 Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Thu, 13 Oct 2022 19:29:29 +0100 Subject: [PATCH 07/43] Update pkg/query/endpointset.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index fd5b0f1c9f..edcc366254 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -43,7 +43,7 @@ type GRPCEndpointSpec struct { isStrictStatic bool } -var externalLabelLimit = 1000 +const externalLabelLimit = 1000 // NewGRPCEndpointSpec creates gRPC endpoint spec. // It uses InfoAPI to get Metadata. From 606cba54eac0002966d760269b1158185044db81 Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 19:35:13 +0100 Subject: [PATCH 08/43] minor fixes from code review Signed-off-by: utukj --- pkg/query/endpointset.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 899644d757..692b74e8d1 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -185,21 +185,21 @@ type endpointSetNodeCollector struct { storePerExtLset map[string]int connectionsDesc *prometheus.Desc - requiredLabels []string + labels []string } -func newEndpointSetNodeCollector(requiredLabels ...string) *endpointSetNodeCollector { - if len(requiredLabels) == 0 { - requiredLabels = []string{"external_labels", "store_type"} +func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { + if len(labels) == 0 { + labels = []string{"external_labels", "store_type"} } return &endpointSetNodeCollector{ storeNodes: map[component.Component]map[string]int{}, connectionsDesc: prometheus.NewDesc( "thanos_store_nodes_grpc_connections", "Number of gRPC connection to Store APIs. 
Opened connection means healthy store APIs available for Querier.", - requiredLabels, nil, + labels, nil, ), - requiredLabels: requiredLabels, + labels: labels, } } @@ -207,14 +207,14 @@ func (c *endpointSetNodeCollector) Update(nodes map[component.Component]map[stri storeNodes := make(map[component.Component]map[string]int, len(nodes)) storePerExtLset := map[string]int{} - for k, v := range nodes { - storeNodes[k] = make(map[string]int, len(v)) - for kk, vv := range v { - if len(kk) > externalLabelLimit { - kk = kk[:externalLabelLimit+1] + "}" + for storeType, occurrencesPerExtLset := range nodes { + storeNodes[storeType] = make(map[string]int, len(occurrencesPerExtLset)) + for external_labels, occurrences := range occurrencesPerExtLset { + if len(external_labels) > externalLabelLimit { + external_labels = external_labels[:externalLabelLimit+1] + "}" } - storePerExtLset[kk] += vv - storeNodes[k][kk] = vv + storePerExtLset[external_labels] += occurrences + storeNodes[storeType][external_labels] = occurrences } } @@ -240,7 +240,7 @@ func (c *endpointSetNodeCollector) Collect(ch chan<- prometheus.Metric) { } // select only required labels lbls := []string{} - for _, lbl := range c.requiredLabels { + for _, lbl := range c.labels { switch lbl { case "external_labels": lbls = append(lbls, externalLabels) From a07e974a9e9b3dcee1094c91bd62c5383d74d8dd Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 13 Oct 2022 19:40:05 +0100 Subject: [PATCH 09/43] fixed code comments Signed-off-by: utukj --- pkg/query/endpointset_test.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index a07880520c..3fd332843e 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -386,7 +386,7 @@ func TestEndpointSetUpdate(t *testing.T) { discoveredEndpointAddr := endpoints.EndpointAddresses() var endpointSet *EndpointSet - // specify only "store_type" to exclude "external_labels" + // Specify only "store_type" to exclude "external_labels". if tc.name == "no external labels" { endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, "store_type") } else { @@ -397,7 +397,8 @@ func TestEndpointSetUpdate(t *testing.T) { endpointSet.Update(context.Background()) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetEndpointStatus())) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetStoreClients())) - // slow or unavailable endpoint should collect nothing + + // Slow or unavailable endpoint should collect nothing. if tc.name == "slow endpoint" || tc.name == "unavailable endpoint" { testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(""))) return @@ -423,11 +424,11 @@ func TestEndpointSetUpdate(t *testing.T) { } else { externalLabels = fmt.Sprintf(`a="b", addr=%q`, discoveredEndpointAddr[0]) } - // labels too long must be trimmed + // Labels too long must be trimmed. if len(externalLabels) > externalLabelLimit { externalLabels = externalLabels[:externalLabelLimit] } - // add backslash escape for every quote character + // Add backslash escape for every quote character. 
var lbl strings.Builder for _, ch := range externalLabels { if string(ch) == `"` { From 03b5289f5985c0db879284c0f9884093b8d7712f Mon Sep 17 00:00:00 2001 From: utukj Date: Fri, 14 Oct 2022 08:40:46 +0100 Subject: [PATCH 10/43] used enum for labels Signed-off-by: utukj --- cmd/thanos/query.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 776dbcb89a..54724f59a6 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -70,6 +70,13 @@ const ( promqlEngineThanos promqlEngineType = "thanos" ) +type queryConnMetricLabel string + +const ( + externalLabels queryConnMetricLabel = "external_labels" + storeType queryConnMetricLabel = "store_type" +) + // registerQuery registers a query command. func registerQuery(app *extkingpin.App) { comp := component.Query @@ -108,7 +115,9 @@ func registerQuery(app *extkingpin.App) { maxConcurrentSelects := cmd.Flag("query.max-concurrent-select", "Maximum number of select requests made concurrently per a query."). Default("4").Int() - queryMetricLabels := cmd.Flag("query.metric-label", "Optional selection of metrics to be collected").Strings() + queryConnMetricLabels := cmd.Flag("query.conn-metric.label", "Optional selection of query connection metric labels to be collected from endpoint set"). + Default(string(externalLabels), string(storeType)). + Enums(string(externalLabels), string(storeType)) queryReplicaLabels := cmd.Flag("query.replica-label", "Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules."). Strings() @@ -277,7 +286,7 @@ func registerQuery(app *extkingpin.App) { *dynamicLookbackDelta, time.Duration(*defaultEvaluationInterval), time.Duration(*storeResponseTimeout), - *queryMetricLabels, + *queryConnMetricLabels, *queryReplicaLabels, selectorLset, getFlagsMap(cmd.Flags()), @@ -350,7 +359,7 @@ func runQuery( dynamicLookbackDelta bool, defaultEvaluationInterval time.Duration, storeResponseTimeout time.Duration, - queryMetricLabels []string, + queryConnMetricLabels []string, queryReplicaLabels []string, selectorLset labels.Labels, flagsMap map[string]string, @@ -490,7 +499,7 @@ func runQuery( dialOpts, unhealthyStoreTimeout, endpointInfoTimeout, - queryMetricLabels..., + queryConnMetricLabels..., ) proxy = store.NewProxyStore(logger, reg, endpoints.GetStoreClients, component.Query, selectorLset, storeResponseTimeout, store.RetrievalStrategy(grpcProxyStrategy)) rulesProxy = rules.NewProxy(logger, endpoints.GetRulesClients) From be3ca5342bae7d2f911c699d5e6beb1fdac0ef31 Mon Sep 17 00:00:00 2001 From: utukj Date: Fri, 14 Oct 2022 10:00:57 +0100 Subject: [PATCH 11/43] updated query docs Signed-off-by: utukj --- docs/components/query.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/components/query.md b/docs/components/query.md index a2e8c91f56..1a028ee3ed 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -335,6 +335,9 @@ Flags: --query.auto-downsampling Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified. + --query.conn-metric.label=external_labels... ... + Optional selection of query connection metric + labels to be collected from endpoint set --query.default-evaluation-interval=1m Set default evaluation interval for sub queries. 
@@ -369,8 +372,6 @@ Flags: when the range parameters are not specified. The zero value means range covers the time since the beginning. - --query.metric-label=QUERY.METRIC-LABEL ... - Optional selection of metrics to be collected --query.partial-response Enable partial response for queries if no partial_response param is specified. --no-query.partial-response for disabling. From 32a0a9818409dfda6cdd0bb34287f0793c9e1ca4 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 16:23:21 +0100 Subject: [PATCH 12/43] cleaned up tests Signed-off-by: utukj --- pkg/query/endpointset.go | 17 +++-- pkg/query/endpointset_test.go | 130 ++++++++++++++-------------------- 2 files changed, 63 insertions(+), 84 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 649c824647..e886fb9725 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -203,18 +203,23 @@ func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { } } +func truncateExtLabels(s string, threshold int) string { + if len(s) > threshold { + s = s[:threshold+1] + "}" + } + return s +} + func (c *endpointSetNodeCollector) Update(nodes map[component.Component]map[string]int) { storeNodes := make(map[component.Component]map[string]int, len(nodes)) storePerExtLset := map[string]int{} for storeType, occurrencesPerExtLset := range nodes { storeNodes[storeType] = make(map[string]int, len(occurrencesPerExtLset)) - for external_labels, occurrences := range occurrencesPerExtLset { - if len(external_labels) > externalLabelLimit { - external_labels = external_labels[:externalLabelLimit+1] + "}" - } - storePerExtLset[external_labels] += occurrences - storeNodes[storeType][external_labels] = occurrences + for externalLabels, occurrences := range occurrencesPerExtLset { + externalLabels = truncateExtLabels(externalLabels, externalLabelLimit) + storePerExtLset[externalLabels] += occurrences + storeNodes[storeType][externalLabels] = occurrences } } diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 3fd332843e..b79196248d 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -271,13 +271,35 @@ func (e *testEndpoints) CloseOne(addr string) { delete(e.srvs, addr) } +func truncateAndEscapeQuotes(s string) string { + // Truncate string. + if len(s) > externalLabelLimit { + s = s[:externalLabelLimit] + } + // Add backslash escape for every quote character. + var lbl strings.Builder + for _, ch := range s { + if string(ch) == `"` { + lbl.WriteString(`\`) + } + lbl.WriteRune(ch) + } + return lbl.String() +} + func TestEndpointSetUpdate(t *testing.T) { + const metricsMeta = ` + # HELP thanos_store_nodes_grpc_connections Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier. 
+ # TYPE thanos_store_nodes_grpc_connections gauge + ` testCases := []struct { - name string - endpoints []testEndpointMeta - strict bool + name string + endpoints []testEndpointMeta + strict bool + connLabels []string - expectedEndpoints int + expectedEndpoints int + expectedConnMetrics string }{ { name: "available endpoint", @@ -291,7 +313,13 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, }, + connLabels: []string{"store_type"}, + expectedEndpoints: 1, + expectedConnMetrics: metricsMeta + + ` + thanos_store_nodes_grpc_connections{store_type="sidecar"} 1 + `, }, { name: "unavailable endpoint", @@ -306,7 +334,9 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, }, - expectedEndpoints: 0, + + expectedEndpoints: 0, + expectedConnMetrics: "", }, { name: "slow endpoint", @@ -321,7 +351,9 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, }, - expectedEndpoints: 0, + + expectedEndpoints: 0, + expectedConnMetrics: "", }, { name: "strict endpoint", @@ -336,13 +368,19 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, strict: true, + connLabels: []string{"store_type"}, expectedEndpoints: 1, + expectedConnMetrics: metricsMeta + + ` + thanos_store_nodes_grpc_connections{store_type="sidecar"} 1 + `, }, { name: "long external labels", endpoints: []testEndpointMeta{ { InfoResponse: sidecarInfo, + // simulate very long external labels extlsetFn: func(addr string) []labelpb.ZLabelSet { sLabel := []string{} for i := 0; i < 1000; i++ { @@ -356,25 +394,12 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, expectedEndpoints: 1, - }, - { - name: "no external labels", - endpoints: []testEndpointMeta{ - { - InfoResponse: sidecarInfo, - extlsetFn: func(addr string) []labelpb.ZLabelSet { - sLabel := []string{} - for i := 0; i < 1000; i++ { - sLabel = append(sLabel, "lbl") - sLabel = append(sLabel, "val") - } - return labelpb.ZLabelSetsFromPromLabels( - labels.FromStrings(sLabel...), - ) - }, - }, - }, - expectedEndpoints: 1, + expectedConnMetrics: metricsMeta + fmt.Sprintf( + ` + thanos_store_nodes_grpc_connections{external_labels="{%s}", store_type="sidecar"} 1 + `, + truncateAndEscapeQuotes(strings.Repeat(`lbl="val", `, 1000)), + ), }, } @@ -387,65 +412,14 @@ func TestEndpointSetUpdate(t *testing.T) { discoveredEndpointAddr := endpoints.EndpointAddresses() var endpointSet *EndpointSet // Specify only "store_type" to exclude "external_labels". - if tc.name == "no external labels" { - endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, "store_type") - } else { - endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now) - } + endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, tc.connLabels...) defer endpointSet.Close() endpointSet.Update(context.Background()) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetEndpointStatus())) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetStoreClients())) - // Slow or unavailable endpoint should collect nothing. - if tc.name == "slow endpoint" || tc.name == "unavailable endpoint" { - testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(""))) - return - } - - if tc.name == "no external labels" { - expectedMetrics := fmt.Sprintf( - ` - # HELP thanos_store_nodes_grpc_connections Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier. 
- # TYPE thanos_store_nodes_grpc_connections gauge - thanos_store_nodes_grpc_connections{store_type="sidecar"} %d - `, - tc.expectedEndpoints, - ) - testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(expectedMetrics))) - return - } - - var externalLabels string - if tc.name == "long external labels" { - externalLabels = strings.Repeat(`lbl="val", `, 1000) - externalLabels = externalLabels[:len(externalLabels)-2] - } else { - externalLabels = fmt.Sprintf(`a="b", addr=%q`, discoveredEndpointAddr[0]) - } - // Labels too long must be trimmed. - if len(externalLabels) > externalLabelLimit { - externalLabels = externalLabels[:externalLabelLimit] - } - // Add backslash escape for every quote character. - var lbl strings.Builder - for _, ch := range externalLabels { - if string(ch) == `"` { - lbl.WriteString(`\`) - } - lbl.WriteRune(ch) - } - expectedMetrics := fmt.Sprintf( - ` - # HELP thanos_store_nodes_grpc_connections Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier. - # TYPE thanos_store_nodes_grpc_connections gauge - thanos_store_nodes_grpc_connections{external_labels="{%s}",store_type="sidecar"} %d - `, - lbl.String(), - tc.expectedEndpoints, - ) - testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(expectedMetrics))) + testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(tc.expectedConnMetrics))) }) } } From 9474c00fa6a1a7b0148287ee4296944e50f093b6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 15 Oct 2022 22:30:58 +0000 Subject: [PATCH 13/43] Updates busybox SHA (#5793) Signed-off-by: GitHub Signed-off-by: GitHub Co-authored-by: yeya24 Signed-off-by: utukj --- .busybox-versions | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.busybox-versions b/.busybox-versions index afcacb3c77..dfaea69d50 100644 --- a/.busybox-versions +++ b/.busybox-versions @@ -1,6 +1,6 @@ # Auto generated by busybox-updater.sh. 
DO NOT EDIT -amd64=d8d3654786836cad8c09543704807c7a6d75de53b9e9cd21a1bbd8cb1a607004 -arm64=a3435ee186dbf88238388c112761488ecd2c264dbff8957ab73f804be62a9080 -arm=b063a2176f23a13007de5c447ab3552f8e355162ac54fc2a545b00b612d4c81e -ppc64le=203c3f97bc34c4d5df50bd61beaa397f2a4c7cbd470c84fe7ec3db12409435d3 -s390x=1a6eb305bd08bd1d38cb85a097ad776a78dd72b7c1a35094bb080788a39b174c +amd64=c9f983fc55b0b74723a69c31688cca7d5a2e5b2af7c954780f29a331817982f3 +arm64=1349554b18d6c349a390929c2a4855fadb003b2243aabf2cc71b931068c69279 +arm=be08b36d0e8f90b6fb317d29582c632ce365a00648a81c4022c4ff79df928ad9 +ppc64le=d44f541b0df83608110e695b9a1e71604ab94924954a1b18f6d76c4b5871cadd +s390x=007b2b388c575d00c7234d29227bbb8216786d7ba3f86d82696dc6fe86ac1ec0 From 24e1cc0faf219049174020955f8e3c8251106d87 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 17 Oct 2022 17:52:00 +0200 Subject: [PATCH 14/43] Receive: Reload tenant limit configuration on file change (#5673) * Create a PathOrContent reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add docs to staticPathContent.Rewrite Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run goimports Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Properly cancel the context in the test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Watch parent directory of file This helps handling deletes and other situations. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless ctx.Done() Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add a debounce timer to config reload It helps managing situations where a create event is followed by a write or when a big file write is sent by the fsnotify backend as many write events. 
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix event.Op bitmask check

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Update lastReload

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Fix debouncer for path content reloader

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Improve documentation of the PathContentReloader

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Drain reload timer before resetting

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Run tests in parallel

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Simplify debouncing logic

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add more tests to file reloader

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Simplify condition for triggering reload

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Use absolute path to config file

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Get rid of parallel test

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Put back 2s wait between fs operations

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove useless sleep

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Stop reloadTimer when context cancelled

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove unused function

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Add missing copyright to test file

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Auto-reload tenant limit config on file changes

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Wrap error when reloading config

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Move limiter config reloader and update logs

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Get rid of useless types and allocations

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Remove errorChan from config reload starter

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Retrigger CI

Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com>

* Use UnRegisterer in the Limiter

To ensure that limit reloads will be able to re-register their metrics. 
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Better guard against nil registerer in the limiter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove wrong nil guard Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: utukj --- CHANGELOG.md | 1 + cmd/thanos/receive.go | 46 +++-- docs/components/receive.md | 2 +- go.mod | 8 +- go.sum | 4 +- pkg/extkingpin/path_content_reloader.go | 128 ++++++++++++ pkg/extkingpin/path_content_reloader_test.go | 105 ++++++++++ pkg/receive/handler.go | 22 +- pkg/receive/handler_test.go | 38 ++-- pkg/receive/limiter.go | 189 ++++++++++++++++-- pkg/receive/limiter_config.go | 4 +- pkg/receive/limiter_config_test.go | 6 +- pkg/receive/limiter_test.go | 100 +++++++++ pkg/receive/request_limiter.go | 31 +-- pkg/receive/request_limiter_test.go | 20 +- pkg/receive/testdata/limits.yaml | 22 ++ .../limits_config/invalid_limits.yaml | 17 ++ 17 files changed, 646 insertions(+), 97 deletions(-) create mode 100644 pkg/extkingpin/path_content_reloader.go create mode 100644 pkg/extkingpin/path_content_reloader_test.go create mode 100644 pkg/receive/limiter_test.go create mode 100644 pkg/receive/testdata/limits.yaml create mode 100644 pkg/receive/testdata/limits_config/invalid_limits.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed82d6525..6e1d2143c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. +- [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. 
### Changed diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index 5c47b91dd5..d86b560983 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -192,19 +192,6 @@ func runReceive( return errors.Wrap(err, "parse relabel configuration") } - var limitsConfig *receive.RootLimitsConfig - if conf.limitsConfig != nil { - limitsContentYaml, err := conf.limitsConfig.Content() - if err != nil { - return errors.Wrap(err, "get content of limit configuration") - } - limitsConfig, err = receive.ParseRootLimitConfig(limitsContentYaml) - if err != nil { - return errors.Wrap(err, "parse limit configuration") - } - } - limiter := receive.NewLimiter(limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) - dbs := receive.NewMultiTSDB( conf.dataDir, logger, @@ -217,6 +204,23 @@ func runReceive( hashFunc, ) writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs) + + var limitsConfig *receive.RootLimitsConfig + if conf.limitsConfig != nil { + limitsContentYaml, err := conf.limitsConfig.Content() + if err != nil { + return errors.Wrap(err, "get content of limit configuration") + } + limitsConfig, err = receive.ParseRootLimitConfig(limitsContentYaml) + if err != nil { + return errors.Wrap(err, "parse limit configuration") + } + } + limiter, err := receive.NewLimiter(conf.limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) + if err != nil { + return errors.Wrap(err, "creating limiter") + } + webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{ Writer: writer, ListenAddress: conf.rwAddress, @@ -399,6 +403,22 @@ func runReceive( }) } + { + if limiter.CanReload() { + ctx, cancel := context.WithCancel(context.Background()) + g.Add(func() error { + level.Debug(logger).Log("msg", "limits config initialized with file watcher.") + if err := limiter.StartConfigReloader(ctx); err != nil { + return err + } + <-ctx.Done() + return nil + }, func(err error) { + cancel() + }) + } + } + level.Info(logger).Log("msg", "starting receiver") return nil } diff --git a/docs/components/receive.md b/docs/components/receive.md index 6fa13938e9..ef4e39e35e 100644 --- a/docs/components/receive.md +++ b/docs/components/receive.md @@ -86,7 +86,7 @@ Thanos Receive has some limits and gates that can be configured to control resou To configure the gates and limits you can use one of the two options: -- `--receive.limits-config-file=`: where `` is the path to the YAML file. +- `--receive.limits-config-file=`: where `` is the path to the YAML file. Any modification to the indicated file will trigger a configuration reload. If the updated configuration is invalid an error will be logged and it won't replace the previous valid configuration. - `--receive.limits-config=`: where `` is the content of YAML file. By default all the limits and gates are **disabled**. 
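The reload behavior documented above is driven by a new `PathContentReloader` helper, introduced below in `pkg/extkingpin/path_content_reloader.go`. A minimal usage sketch follows; the helper, `NewStaticPathContent`, and the 1-second debounce come from this patch, while the limits file path and the callback body are illustrative assumptions:

```go
package main

import (
	"context"
	"os"
	"time"

	"github.com/go-kit/log"
	"github.com/thanos-io/thanos/pkg/extkingpin"
)

func main() {
	logger := log.NewLogfmtLogger(os.Stderr)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Hypothetical limits file; NewStaticPathContent caches its current content.
	cfg, err := extkingpin.NewStaticPathContent("/etc/thanos/limits.yaml")
	if err != nil {
		panic(err)
	}

	// The callback fires after the debounce window has elapsed following a
	// write or create event on the watched file.
	if err := extkingpin.PathContentReloader(ctx, cfg, logger, func() {
		logger.Log("msg", "limits config changed on disk; reloading")
	}, 1*time.Second); err != nil {
		panic(err)
	}

	<-ctx.Done() // The watcher goroutine exits when the context is canceled.
}
```

Note that the watcher is registered on the file's parent directory, so deletes and renames of the file itself are still observed.
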
diff --git a/go.mod b/go.mod index 13743c8020..bee3e97fe7 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/dustin/go-humanize v1.0.0 github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a - github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d + github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.2.0 github.com/felixge/fgprof v0.9.2 @@ -108,6 +108,7 @@ require ( require ( github.com/efficientgo/core v1.0.0-rc.0 + github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd github.com/minio/sha256-simd v1.0.0 ) @@ -127,10 +128,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.10.0 ) -require ( - github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd - go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 -) +require go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.32.3 // indirect diff --git a/go.sum b/go.sum index 5ee9bab6be..97fc0d0411 100644 --- a/go.sum +++ b/go.sum @@ -252,8 +252,8 @@ github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a h1:cnJajqeh/Hjv github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd h1:svR6KxSP1xiPw10RN4Pd7g6BAVkEcNN628PAqZH31mM= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:OmVcnJopJL8d3X3sSXTiypGoUSgFq1aDGmlrdi9dn/M= -github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d h1:WZV/mrUyKS9w9r+Jdw+zq/tdGAb5LwB+H37EkMLhEMA= -github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= +github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd h1:VaYzzXeUbC5fVheskcKVNOyJMEYD+HgrJNzIAg/mRIM= +github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= github.com/elastic/go-sysinfo v1.1.1/go.mod h1:i1ZYdU10oLNfRzq4vq62BEwD2fH8KaWh6eh0ikPT9F0= github.com/elastic/go-sysinfo v1.8.1 h1:4Yhj+HdV6WjbCRgGdZpPJ8lZQlXZLKDAeIkmQ/VRvi4= github.com/elastic/go-sysinfo v1.8.1/go.mod h1:JfllUnzoQV/JRYymbH3dO1yggI3mV2oTKSXsDHM+uIM= diff --git a/pkg/extkingpin/path_content_reloader.go b/pkg/extkingpin/path_content_reloader.go new file mode 100644 index 0000000000..68c2cd252c --- /dev/null +++ b/pkg/extkingpin/path_content_reloader.go @@ -0,0 +1,128 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package extkingpin + +import ( + "context" + "fmt" + "os" + "path" + "path/filepath" + "time" + + "github.com/fsnotify/fsnotify" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/pkg/errors" +) + +type fileContent interface { + Content() ([]byte, error) + Path() string +} + +// PathContentReloader starts a file watcher that monitors the file indicated by fileContent.Path() and runs +// reloadFunc whenever a change is detected. +// A debounce timer can be configured via opts to handle situations where many "write" events are received together or +// a "create" event is followed up by a "write" event, for example. Files will be effectively reloaded at the latest +// after 2 times the debounce timer. 
By default, the debounce timer is 1 second.
+// To ensure renames and deletes are properly handled, the file watcher is put at the file's parent folder. See
+// https://github.com/fsnotify/fsnotify/issues/214 for more details.
+func PathContentReloader(ctx context.Context, fileContent fileContent, logger log.Logger, reloadFunc func(), debounceTime time.Duration) error {
+ filePath, err := filepath.Abs(fileContent.Path())
+ if err != nil {
+ return errors.Wrap(err, "getting absolute file path")
+ }
+
+ watcher, err := fsnotify.NewWatcher()
+ if filePath == "" {
+ level.Debug(logger).Log("msg", "no path detected for config reload")
+ }
+ if err != nil {
+ return errors.Wrap(err, "creating file watcher")
+ }
+ go func() {
+ var reloadTimer *time.Timer
+ if debounceTime != 0 {
+ reloadTimer = time.AfterFunc(debounceTime, func() {
+ reloadFunc()
+ level.Debug(logger).Log("msg", "configuration reloaded after debouncing")
+ })
+ }
+ defer watcher.Close()
+ for {
+ select {
+ case <-ctx.Done():
+ if reloadTimer != nil {
+ reloadTimer.Stop()
+ }
+ return
+ case event := <-watcher.Events:
+ // fsnotify sometimes sends a bunch of events without name or operation.
+ // It's unclear what they are and why they are sent - filter them out.
+ if event.Name == "" {
+ break
+ }
+ // We are watching the file's parent folder (more details on why this is done can be found below), but are
+ // only interested in changes to the target file. Discard every other file as quickly as possible.
+ if event.Name != filePath {
+ break
+ }
+ // We only react to files being written or created.
+ // On chmod or remove we have nothing to do.
+ // On rename we have the old file name (not useful). A create event for the new file will come later.
+ if event.Op&fsnotify.Write == 0 && event.Op&fsnotify.Create == 0 {
+ break
+ }
+ level.Debug(logger).Log("msg", fmt.Sprintf("change detected for %s", filePath), "eventName", event.Name, "eventOp", event.Op)
+ if reloadTimer != nil {
+ reloadTimer.Reset(debounceTime)
+ }
+ case err := <-watcher.Errors:
+ level.Error(logger).Log("msg", "watcher error", "error", err)
+ }
+ }
+ }()
+ // We watch the file's parent folder and not the file itself to better handle DELETE and RENAME events. Check
+ // https://github.com/fsnotify/fsnotify/issues/214 for more details.
+ if err := watcher.Add(path.Dir(filePath)); err != nil {
+ return errors.Wrapf(err, "adding path %s to file watcher", filePath)
+ }
+ return nil
+}
+
+type staticPathContent struct {
+ content []byte
+ path string
+}
+
+var _ fileContent = (*staticPathContent)(nil)
+
+// Content returns the cached content.
+func (t *staticPathContent) Content() ([]byte, error) {
+ return t.content, nil
+}
+
+// Path returns the path to the file that contains the content.
+func (t *staticPathContent) Path() string {
+ return t.path
+}
+
+// NewStaticPathContent creates a new content that can be used to serve a static configuration. It reads the
+// configuration from `fromPath` once and caches it; use Rewrite to update both the cache and the backing file.
+func NewStaticPathContent(fromPath string) (*staticPathContent, error) {
+ content, err := os.ReadFile(fromPath)
+ if err != nil {
+ return nil, errors.Wrapf(err, "could not load test content: %s", fromPath)
+ }
+ return &staticPathContent{content, fromPath}, nil
+}
+
+// Rewrite rewrites the file backing this staticPathContent and swaps the local content cache. The file writing
+// is needed to trigger the file system monitor. 
+func (t *staticPathContent) Rewrite(newContent []byte) error { + t.content = newContent + // Write the file to ensure possible file watcher reloaders get triggered. + return os.WriteFile(t.path, newContent, 0666) +} diff --git a/pkg/extkingpin/path_content_reloader_test.go b/pkg/extkingpin/path_content_reloader_test.go new file mode 100644 index 0000000000..fb20f83d5c --- /dev/null +++ b/pkg/extkingpin/path_content_reloader_test.go @@ -0,0 +1,105 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package extkingpin + +import ( + "context" + "os" + "path" + "sync" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/thanos-io/thanos/pkg/testutil" +) + +func TestPathContentReloader(t *testing.T) { + type args struct { + runSteps func(t *testing.T, testFile string, pathContent *staticPathContent) + } + tests := []struct { + name string + args args + wantReloads int + }{ + { + name: "Many operations, only rewrite triggers one reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + testutil.Ok(t, os.Remove(testFile)) + testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) + }, + }, + wantReloads: 1, + }, + { + name: "Many operations, only rename triggers one reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + testutil.Ok(t, os.Rename(testFile, testFile+".tmp")) + testutil.Ok(t, os.Rename(testFile+".tmp", testFile)) + }, + }, + wantReloads: 1, + }, + { + name: "Many operations, two rewrites trigger two reloads", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + testutil.Ok(t, os.Remove(testFile)) + testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) + time.Sleep(2 * time.Second) + testutil.Ok(t, pathContent.Rewrite([]byte("test modified again"))) + }, + }, + wantReloads: 1, + }, + { + name: "Chmod doesn't trigger reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + }, + }, + wantReloads: 0, + }, + { + name: "Remove doesn't trigger reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Remove(testFile)) + }, + }, + wantReloads: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testFile := path.Join(t.TempDir(), "test") + testutil.Ok(t, os.WriteFile(testFile, []byte("test"), 0666)) + pathContent, err := NewStaticPathContent(testFile) + testutil.Ok(t, err) + + wg := &sync.WaitGroup{} + wg.Add(tt.wantReloads) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + reloadCount := 0 + err = PathContentReloader(ctx, pathContent, log.NewLogfmtLogger(os.Stdout), func() { + reloadCount++ + wg.Done() + }, 100*time.Millisecond) + testutil.Ok(t, err) + + tt.args.runSteps(t, testFile, pathContent) + wg.Wait() + testutil.Equals(t, tt.wantReloads, reloadCount) + }) + } +} diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 156bb74566..12afb752b8 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -17,10 +17,6 @@ import ( "sync" "time" - "github.com/thanos-io/thanos/pkg/api" - statusapi "github.com/thanos-io/thanos/pkg/api/status" - "github.com/thanos-io/thanos/pkg/logging" - "github.com/go-kit/log" 
"github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -35,6 +31,9 @@ import ( "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" + "github.com/thanos-io/thanos/pkg/api" + statusapi "github.com/thanos-io/thanos/pkg/api/status" + "github.com/thanos-io/thanos/pkg/logging" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -99,7 +98,7 @@ type Options struct { ForwardTimeout time.Duration RelabelConfigs []*relabel.Config TSDBStats TSDBStats - Limiter *limiter + Limiter *Limiter } // Handler serves a Prometheus remote write receiving HTTP endpoint. @@ -124,7 +123,7 @@ type Handler struct { writeSamplesTotal *prometheus.HistogramVec writeTimeseriesTotal *prometheus.HistogramVec - limiter *limiter + Limiter *Limiter } func NewHandler(logger log.Logger, o *Options) *Handler { @@ -150,7 +149,7 @@ func NewHandler(logger log.Logger, o *Options) *Handler { Max: 30 * time.Second, Jitter: true, }, - limiter: o.Limiter, + Limiter: o.Limiter, forwardRequests: promauto.With(registerer).NewCounterVec( prometheus.CounterOpts{ Name: "thanos_receive_forward_requests_total", @@ -407,17 +406,18 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { tLogger := log.With(h.logger, "tenant", tenant) + writeGate := h.Limiter.WriteGate() tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) { - err = h.limiter.writeGate.Start(r.Context()) + err = writeGate.Start(r.Context()) }) + defer writeGate.Done() if err != nil { level.Error(tLogger).Log("err", err, "msg", "internal server error") http.Error(w, err.Error(), http.StatusInternalServerError) return } - defer h.limiter.writeGate.Done() - under, err := h.limiter.HeadSeriesLimiter.isUnderLimit(tenant) + under, err := h.Limiter.HeadSeriesLimiter.isUnderLimit(tenant) if err != nil { level.Error(tLogger).Log("msg", "error while limiting", "err", err.Error()) } @@ -428,7 +428,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { return } - requestLimiter := h.limiter.requestLimiter + requestLimiter := h.Limiter.RequestLimiter() // io.ReadAll dynamically adjust the byte slice for read data, starting from 512B. // Since this is receive hot path, grow upfront saving allocations and CPU time. 
compressed := bytes.Buffer{} diff --git a/pkg/receive/handler_test.go b/pkg/receive/handler_test.go index 44076de141..4a2a536038 100644 --- a/pkg/receive/handler_test.go +++ b/pkg/receive/handler_test.go @@ -13,6 +13,7 @@ import ( "net/http" "net/http/httptest" "os" + "path" "path/filepath" "runtime" "runtime/pprof" @@ -21,6 +22,8 @@ import ( "testing" "time" + "gopkg.in/yaml.v3" + "github.com/alecthomas/units" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -40,6 +43,7 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/errutil" + "github.com/thanos-io/thanos/pkg/extkingpin" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/store/labelpb" "github.com/thanos-io/thanos/pkg/store/storepb" @@ -362,6 +366,7 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin }, } + limiter, _ := NewLimiter(NewNopConfig(), nil, RouterIngestor, log.NewNopLogger()) for i := range appendables { h := NewHandler(nil, &Options{ TenantHeader: DefaultTenantHeader, @@ -369,7 +374,7 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin ReplicationFactor: replicationFactor, ForwardTimeout: 5 * time.Second, Writer: NewWriter(log.NewNopLogger(), newFakeTenantAppendable(appendables[i])), - Limiter: NewLimiter(nil, nil, RouterIngestor, nil), + Limiter: limiter, }) handlers = append(handlers, h) h.peers = peers @@ -775,23 +780,28 @@ func TestReceiveWriteRequestLimits(t *testing.T) { } handlers, _ := newTestHandlerHashring(appendables, 3) handler := handlers[0] + tenant := "test" - handler.limiter = NewLimiter( - &RootLimitsConfig{ - WriteLimits: WriteLimitsConfig{ - TenantsLimits: TenantsWriteLimitsConfig{ - tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig(). - SetSizeBytesLimit(int64(1 * units.Megabyte)). - SetSeriesLimit(20). - SetSamplesLimit(200), - }, + tenantConfig, err := yaml.Marshal(&RootLimitsConfig{ + WriteLimits: WriteLimitsConfig{ + TenantsLimits: TenantsWriteLimitsConfig{ + tenant: &WriteLimitConfig{ + RequestLimits: NewEmptyRequestLimitsConfig(). + SetSizeBytesLimit(int64(1 * units.Megabyte)). + SetSeriesLimit(20). + SetSamplesLimit(200), }, }, }, - nil, - RouterIngestor, - log.NewNopLogger(), + }) + if err != nil { + t.Fatal("handler: failed to generate limit configuration") + } + tmpLimitsPath := path.Join(t.TempDir(), "limits.yaml") + testutil.Ok(t, os.WriteFile(tmpLimitsPath, tenantConfig, 0666)) + limitConfig, _ := extkingpin.NewStaticPathContent(tmpLimitsPath) + handler.Limiter, _ = NewLimiter( + limitConfig, nil, RouterIngestor, log.NewNopLogger(), ) wreq := &prompb.WriteRequest{ diff --git a/pkg/receive/limiter.go b/pkg/receive/limiter.go index bc3c4d8358..ff5bbe3199 100644 --- a/pkg/receive/limiter.go +++ b/pkg/receive/limiter.go @@ -5,59 +5,204 @@ package receive import ( "context" + "fmt" + "sync" + "time" "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/thanos-io/thanos/pkg/extkingpin" + + "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/thanos-io/thanos/pkg/extprom" "github.com/thanos-io/thanos/pkg/gate" ) -type limiter struct { - requestLimiter requestLimiter - writeGate gate.Gate - HeadSeriesLimiter headSeriesLimiter +// Limiter is responsible for managing the configuration and initialization of +// different types that apply limits to the Receive instance. 
+type Limiter struct {
+ sync.RWMutex
+ requestLimiter requestLimiter
+ HeadSeriesLimiter headSeriesLimiter
+ writeGate gate.Gate
+ registerer prometheus.Registerer
+ configPathOrContent fileContent
+ logger log.Logger
+ configReloadCounter prometheus.Counter
+ configReloadFailedCounter prometheus.Counter
+ receiverMode ReceiverMode
+}
+
+// headSeriesLimiter encompasses active/head series limiting logic.
+type headSeriesLimiter interface {
+ QueryMetaMonitoring(context.Context) error
+ isUnderLimit(tenant string) (bool, error)
}

-// requestLimiter encompasses logic for limiting remote write requests.
type requestLimiter interface {
 AllowSizeBytes(tenant string, contentLengthBytes int64) bool
 AllowSeries(tenant string, amount int64) bool
 AllowSamples(tenant string, amount int64) bool
}

-// headSeriesLimiter encompasses active/head series limiting logic.
-type headSeriesLimiter interface {
- QueryMetaMonitoring(context.Context) error
- isUnderLimit(tenant string) (bool, error)
+// fileContent is an interface to avoid a direct dependency on kingpin or extkingpin.
+type fileContent interface {
+ Content() ([]byte, error)
+ Path() string
}

-func NewLimiter(root *RootLimitsConfig, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) *limiter {
- limiter := &limiter{
+// NewLimiter creates a new *Limiter given a configuration file and a
+// Prometheus registerer.
+func NewLimiter(configFile fileContent, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) (*Limiter, error) {
+ limiter := &Limiter{
 writeGate: gate.NewNoop(),
 requestLimiter: &noopRequestLimiter{},
 HeadSeriesLimiter: NewNopSeriesLimit(),
+ logger: logger,
+ receiverMode: r,
+ }
+
+ if reg != nil {
+ limiter.registerer = NewUnRegisterer(reg)
+ limiter.configReloadCounter = promauto.With(limiter.registerer).NewCounter(
+ prometheus.CounterOpts{
+ Namespace: "thanos",
+ Subsystem: "receive",
+ Name: "limits_config_reload_total",
+ Help: "How many times the limit configuration was reloaded",
+ },
+ )
+ limiter.configReloadFailedCounter = promauto.With(limiter.registerer).NewCounter(
+ prometheus.CounterOpts{
+ Namespace: "thanos",
+ Subsystem: "receive",
+ Name: "limits_config_reload_err_total",
+ Help: "How many times the limit configuration failed to reload.",
+ },
+ )
+ }
+
+ if configFile == nil {
+ return limiter, nil
+ }
+
+ limiter.configPathOrContent = configFile
+ if err := limiter.loadConfig(); err != nil {
+ return nil, errors.Wrap(err, "load tenant limits config")
+ }
+
+ return limiter, nil
+}
+
+// StartConfigReloader starts the automatic configuration reloader based on
+// the file indicated by pathOrContent. It starts a goroutine that keeps
+// reloading the configuration until the given context is canceled. 
+
+// StartConfigReloader starts the automatic configuration reloader based off of
+// the file indicated by configPathOrContent. It spawns a goroutine that
+// watches the file and re-applies the limits whenever its content changes.
+func (l *Limiter) StartConfigReloader(ctx context.Context) error {
+	if !l.CanReload() {
+		return nil
 	}
-	if root == nil {
-		return limiter
+
+	return extkingpin.PathContentReloader(ctx, l.configPathOrContent, l.logger, func() {
+		level.Info(l.logger).Log("msg", "reloading limit config")
+		if err := l.loadConfig(); err != nil {
+			if failedReload := l.configReloadFailedCounter; failedReload != nil {
+				failedReload.Inc()
+			}
+			errMsg := fmt.Sprintf("error reloading tenant limits config from %s", l.configPathOrContent.Path())
+			level.Error(l.logger).Log("msg", errMsg, "err", err)
+			return
+		}
+		if reloadCounter := l.configReloadCounter; reloadCounter != nil {
+			reloadCounter.Inc()
+		}
+	}, 1*time.Second)
+}
+
+func (l *Limiter) CanReload() bool {
+	if l.configPathOrContent == nil {
+		return false
 	}
+	if l.configPathOrContent.Path() == "" {
+		return false
+	}
+	return true
+}
 
-	maxWriteConcurrency := root.WriteLimits.GlobalLimits.MaxConcurrency
+func (l *Limiter) loadConfig() error {
+	config, err := ParseLimitConfigContent(l.configPathOrContent)
+	if err != nil {
+		return err
+	}
+	l.Lock()
+	defer l.Unlock()
+	maxWriteConcurrency := config.WriteLimits.GlobalLimits.MaxConcurrency
 	if maxWriteConcurrency > 0 {
-		limiter.writeGate = gate.New(
+		l.writeGate = gate.New(
 			extprom.WrapRegistererWithPrefix(
 				"thanos_receive_write_request_concurrent_",
-				reg,
+				l.registerer,
 			),
 			int(maxWriteConcurrency),
 		)
 	}
-	limiter.requestLimiter = newConfigRequestLimiter(reg, &root.WriteLimits)
-
-	// Impose active series limit only if Receiver is in Router or RouterIngestor mode, and config is provided.
-	seriesLimitSupported := (r == RouterOnly || r == RouterIngestor) && (len(root.WriteLimits.TenantsLimits) != 0 || root.WriteLimits.DefaultLimits.HeadSeriesLimit != 0)
+	l.requestLimiter = newConfigRequestLimiter(
+		l.registerer,
+		&config.WriteLimits,
+	)
+	seriesLimitSupported := (l.receiverMode == RouterOnly || l.receiverMode == RouterIngestor) && (len(config.WriteLimits.TenantsLimits) != 0 || config.WriteLimits.DefaultLimits.HeadSeriesLimit != 0)
 	if seriesLimitSupported {
-		limiter.HeadSeriesLimiter = NewHeadSeriesLimit(root.WriteLimits, reg, logger)
+		l.HeadSeriesLimiter = NewHeadSeriesLimit(config.WriteLimits, l.registerer, l.logger)
 	}
+	return nil
+}
+
+// RequestLimiter is a safe getter for the request limiter.
+func (l *Limiter) RequestLimiter() requestLimiter {
+	l.RLock()
+	defer l.RUnlock()
+	return l.requestLimiter
+}
+
+// WriteGate is a safe getter for the write gate.
+func (l *Limiter) WriteGate() gate.Gate {
+	l.RLock()
+	defer l.RUnlock()
+	return l.writeGate
+}
+
+// ParseLimitConfigContent parses the limit configuration from the path or
+// content.
+func ParseLimitConfigContent(limitsConfig fileContent) (*RootLimitsConfig, error) {
+	if limitsConfig == nil {
+		return &RootLimitsConfig{}, nil
+	}
+	limitsContentYaml, err := limitsConfig.Content()
+	if err != nil {
+		return nil, errors.Wrap(err, "get content of limit configuration")
+	}
+	parsedConfig, err := ParseRootLimitConfig(limitsContentYaml)
+	if err != nil {
+		return nil, errors.Wrap(err, "parse limit configuration")
+	}
+	return parsedConfig, nil
+}
+
+type nopConfigContent struct{}
+
+var _ fileContent = (*nopConfigContent)(nil)
+
+// Content returns no content and no error.
+func (n nopConfigContent) Content() ([]byte, error) {
+	return nil, nil
+}
+
+// Path returns an empty path.
+func (n nopConfigContent) Path() string {
+	return ""
+}
 
-	return limiter
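
Taken together, construction and reloading now look roughly like this from the caller's side. The sketch below is illustrative only: `NewStaticPathContent`, `NewLimiter`, `StartConfigReloader`, and `RequestLimiter` come from this diff, while the limits path, registry, and context handling are assumptions.

```go
package main // illustrative wiring, not part of the patch

import (
	"context"
	"os"

	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/thanos-io/thanos/pkg/extkingpin"
	"github.com/thanos-io/thanos/pkg/receive"
)

func main() {
	logger := log.NewLogfmtLogger(os.Stderr)

	// Hypothetical limits file path; in Thanos this would come from the
	// receiver's limits configuration flag.
	limitsFile, err := extkingpin.NewStaticPathContent("/etc/thanos/limits.yaml")
	if err != nil {
		panic(err)
	}

	// NewLimiter parses and applies the configuration once up front and
	// fails hard on an invalid initial config.
	limiter, err := receive.NewLimiter(limitsFile, prometheus.NewRegistry(), receive.RouterIngestor, logger)
	if err != nil {
		panic(err)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// StartConfigReloader is a no-op when CanReload() is false (nil config or
	// empty path); otherwise it re-applies the limits on every file change.
	if err := limiter.StartConfigReloader(ctx); err != nil {
		panic(err)
	}

	// Request-time checks go through the mutex-guarded getters, so a reload
	// can swap the underlying limiters without racing in-flight requests.
	_ = limiter.RequestLimiter().AllowSeries("tenant-a", 100)
}
```

+// NewNopConfig creates a no-op config content (no configuration).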
+func NewNopConfig() nopConfigContent { + return nopConfigContent{} } diff --git a/pkg/receive/limiter_config.go b/pkg/receive/limiter_config.go index 67aa5ef93a..c3bd330b6e 100644 --- a/pkg/receive/limiter_config.go +++ b/pkg/receive/limiter_config.go @@ -78,6 +78,7 @@ type DefaultLimitsConfig struct { HeadSeriesLimit uint64 `yaml:"head_series_limit"` } +// TenantsWriteLimitsConfig is a map of tenant IDs to their *WriteLimitConfig. type TenantsWriteLimitsConfig map[string]*WriteLimitConfig // A tenant might not always have limits configured, so things here must @@ -110,8 +111,7 @@ type requestLimitsConfig struct { SamplesLimit *int64 `yaml:"samples_limit"` } -// Utils for initializing. -func newEmptyRequestLimitsConfig() *requestLimitsConfig { +func NewEmptyRequestLimitsConfig() *requestLimitsConfig { return &requestLimitsConfig{} } diff --git a/pkg/receive/limiter_config_test.go b/pkg/receive/limiter_config_test.go index b080680162..3e32ea41e8 100644 --- a/pkg/receive/limiter_config_test.go +++ b/pkg/receive/limiter_config_test.go @@ -35,7 +35,7 @@ func TestParseLimiterConfig(t *testing.T) { }, }, DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig(). + RequestLimits: *NewEmptyRequestLimitsConfig(). SetSizeBytesLimit(1024). SetSeriesLimit(1000). SetSamplesLimit(10), @@ -44,7 +44,7 @@ func TestParseLimiterConfig(t *testing.T) { TenantsLimits: TenantsWriteLimitsConfig{ "acme": NewEmptyWriteLimitConfig(). SetRequestLimits( - newEmptyRequestLimitsConfig(). + NewEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). SetSamplesLimit(0), @@ -52,7 +52,7 @@ func TestParseLimiterConfig(t *testing.T) { SetHeadSeriesLimit(2000), "ajax": NewEmptyWriteLimitConfig(). SetRequestLimits( - newEmptyRequestLimitsConfig(). + NewEmptyRequestLimitsConfig(). SetSeriesLimit(50000). SetSamplesLimit(500), ), diff --git a/pkg/receive/limiter_test.go b/pkg/receive/limiter_test.go new file mode 100644 index 0000000000..be7e8790c1 --- /dev/null +++ b/pkg/receive/limiter_test.go @@ -0,0 +1,100 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. 
+ +package receive + +import ( + "context" + "os" + "path" + "testing" + "time" + + "github.com/thanos-io/thanos/pkg/extkingpin" + + "github.com/efficientgo/tools/core/pkg/testutil" + "github.com/go-kit/log" +) + +func TestLimiter_StartConfigReloader(t *testing.T) { + origLimitsFile, err := os.ReadFile(path.Join("testdata", "limits_config", "good_limits.yaml")) + testutil.Ok(t, err) + copyLimitsFile := path.Join(t.TempDir(), "limits.yaml") + testutil.Ok(t, os.WriteFile(copyLimitsFile, origLimitsFile, 0666)) + + goodLimits, err := extkingpin.NewStaticPathContent(copyLimitsFile) + if err != nil { + t.Fatalf("error trying to save static limit config: %s", err) + } + invalidLimitsPath := path.Join("./testdata", "limits_config", "invalid_limits.yaml") + invalidLimits, err := os.ReadFile(invalidLimitsPath) + if err != nil { + t.Fatalf("could not load test content at %s: %s", invalidLimitsPath, err) + } + + limiter, err := NewLimiter(goodLimits, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout)) + testutil.Ok(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + err = limiter.StartConfigReloader(ctx) + testutil.Ok(t, err) + + time.Sleep(1 * time.Second) + testutil.Ok(t, goodLimits.Rewrite(invalidLimits)) +} + +type emptyPathFile struct{} + +func (e emptyPathFile) Content() ([]byte, error) { + return []byte{}, nil +} + +func (e emptyPathFile) Path() string { + return "" +} + +func TestLimiter_CanReload(t *testing.T) { + validLimitsPath, err := extkingpin.NewStaticPathContent( + path.Join("testdata", "limits_config", "good_limits.yaml"), + ) + testutil.Ok(t, err) + emptyLimitsPath := emptyPathFile{} + + type args struct { + configFilePath fileContent + } + tests := []struct { + name string + args args + wantReload bool + }{ + { + name: "Nil config file path cannot be reloaded", + args: args{configFilePath: nil}, + wantReload: false, + }, + { + name: "Empty config file path cannot be reloaded", + args: args{configFilePath: emptyLimitsPath}, + wantReload: false, + }, + { + name: "Valid config file path can be reloaded", + args: args{configFilePath: validLimitsPath}, + wantReload: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + configFile := tt.args.configFilePath + limiter, err := NewLimiter(configFile, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout)) + testutil.Ok(t, err) + if tt.wantReload { + testutil.Assert(t, limiter.CanReload()) + } else { + testutil.Assert(t, !limiter.CanReload()) + } + }) + } +} diff --git a/pkg/receive/request_limiter.go b/pkg/receive/request_limiter.go index de7554de2f..7da0c64a6d 100644 --- a/pkg/receive/request_limiter.go +++ b/pkg/receive/request_limiter.go @@ -14,7 +14,7 @@ const ( sizeBytesLimitName = "body_size" ) -var unlimitedRequestLimitsConfig = newEmptyRequestLimitsConfig(). +var unlimitedRequestLimitsConfig = NewEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). 
SetSamplesLimit(0) @@ -49,7 +49,12 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits tenantLimits: tenantRequestLimits, cachedDefaultLimits: defaultRequestLimits, } - limiter.limitsHit = promauto.With(reg).NewSummaryVec( + limiter.registerMetrics(reg) + return &limiter +} + +func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { + l.limitsHit = promauto.With(reg).NewSummaryVec( prometheus.SummaryOpts{ Namespace: "thanos", Subsystem: "receive", @@ -58,7 +63,7 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits Objectives: map[float64]float64{0.50: 0.1, 0.95: 0.1, 0.99: 0.001}, }, []string{"tenant", "limit"}, ) - limiter.configuredLimits = promauto.With(reg).NewGaugeVec( + l.configuredLimits = promauto.With(reg).NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thanos", Subsystem: "receive", @@ -66,16 +71,14 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits Help: "The configured write limits.", }, []string{"tenant", "limit"}, ) - for tenant, limits := range tenantRequestLimits { - limiter.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) - limiter.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) - limiter.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) + for tenant, limits := range l.tenantLimits { + l.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) + l.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) + l.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) } - limiter.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*defaultRequestLimits.SizeBytesLimit)) - limiter.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*defaultRequestLimits.SeriesLimit)) - limiter.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*defaultRequestLimits.SamplesLimit)) - - return &limiter + l.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*l.cachedDefaultLimits.SizeBytesLimit)) + l.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*l.cachedDefaultLimits.SeriesLimit)) + l.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*l.cachedDefaultLimits.SamplesLimit)) } func (l *configRequestLimiter) AllowSizeBytes(tenant string, contentLengthBytes int64) bool { @@ -100,7 +103,7 @@ func (l *configRequestLimiter) AllowSeries(tenant string, amount int64) bool { } allowed := *limit >= amount - if !allowed { + if !allowed && l.limitsHit != nil { l.limitsHit. WithLabelValues(tenant, seriesLimitName). Observe(float64(amount - *limit)) @@ -114,7 +117,7 @@ func (l *configRequestLimiter) AllowSamples(tenant string, amount int64) bool { return true } allowed := *limit >= amount - if !allowed { + if !allowed && l.limitsHit != nil { l.limitsHit. WithLabelValues(tenant, samplesLimitName). Observe(float64(amount - *limit)) diff --git a/pkg/receive/request_limiter_test.go b/pkg/receive/request_limiter_test.go index e654cd1cdf..dfbea066d9 100644 --- a/pkg/receive/request_limiter_test.go +++ b/pkg/receive/request_limiter_test.go @@ -15,12 +15,12 @@ func TestRequestLimiter_limitsFor(t *testing.T) { limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig(). 
+ RequestLimits: *NewEmptyRequestLimitsConfig(). SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenantWithLimits: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig(). + RequestLimits: NewEmptyRequestLimitsConfig(). SetSeriesLimit(30), }, }, @@ -33,7 +33,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the default limits when tenant's limits aren't present", tenant: tenantWithoutLimits, - wantLimits: newEmptyRequestLimitsConfig(). + wantLimits: NewEmptyRequestLimitsConfig(). SetSeriesLimit(10). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -41,7 +41,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the tenant's limits when it is present", tenant: tenantWithLimits, - wantLimits: newEmptyRequestLimitsConfig(). + wantLimits: NewEmptyRequestLimitsConfig(). SetSeriesLimit(30). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -102,11 +102,11 @@ func TestRequestLimiter_AllowRequestBodySizeBytes(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), + RequestLimits: NewEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), }, }, } @@ -159,11 +159,11 @@ func TestRequestLimiter_AllowSeries(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), + RequestLimits: NewEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), }, }, } @@ -217,11 +217,11 @@ func TestRequestLimiter_AllowSamples(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), + RequestLimits: NewEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), }, }, } diff --git a/pkg/receive/testdata/limits.yaml b/pkg/receive/testdata/limits.yaml new file mode 100644 index 0000000000..2345756179 --- /dev/null +++ b/pkg/receive/testdata/limits.yaml @@ -0,0 +1,22 @@ +write: + global: + max_concurrency: 30 + meta_monitoring_url: "http://localhost:9090" + meta_monitoring_limit_query: "sum(prometheus_tsdb_head_series) by (tenant)" + default: + request: + size_bytes_limit: 1024 + series_limit: 1000 + samples_limit: 10 + head_series_limit: 1000 + tenants: + acme: + request: + size_bytes_limit: 0 + series_limit: 0 + samples_limit: 0 + head_series_limit: 2000 + ajax: + request: + series_limit: 50000 + samples_limit: 500 diff --git a/pkg/receive/testdata/limits_config/invalid_limits.yaml b/pkg/receive/testdata/limits_config/invalid_limits.yaml new file mode 100644 index 0000000000..74db0453f8 --- /dev/null +++ b/pkg/receive/testdata/limits_config/invalid_limits.yaml @@ -0,0 +1,17 @@ +write: + global: + max_concurrency: 30 + request: + size_bytes_limit: 1024 + series_limit: 1000 + samples_limit: 
10 + tenants: + acme: + request: + size_bytes_limit: 0 + series_limit: 0 + samples_limit: 0 + ajax: + request: + series_limit: 50000 + samples_limit: 500 From eec4fd0053b841ee5fe5284d0fe25e6cbe14a738 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 18 Oct 2022 09:10:32 +0200 Subject: [PATCH 15/43] Query: add query metrics to calls going through the Store API (#5741) * Implement granular query performance metrics for Thanos Query These are grabbed from the data returned by multiple Store APIs after execution of a query. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix some linter warnings Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix long function definition (newQuerier) Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove TODO comment Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Reformat query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless return Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back old query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update query docs again Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix e2e env name Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright notice. 
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Bump wait time to twice scrape interval Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Attempt to fix randomly failing test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Checking more metrics to ensure the store is ready Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Clean up test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Do not record store api metrics when didn't touch series or samples Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Also skip store api metrics on zero chunks touched Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update changelog Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix broken changelog after merge Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove extra empty line Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor names and (un)exported types and fields Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Start listing metrics exported by Thanos Query Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Rename pkg/store/metrics -> pkg/store/telemetry Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of the pkg/store/telemetry package Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Co-authored-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Signed-off-by: utukj --- CHANGELOG.md | 1 + cmd/thanos/query.go | 19 ++++- docs/components/query.md | 19 +++++ pkg/api/query/grpc.go | 2 + pkg/api/query/v1.go | 99 +++++++++++++++++++--- pkg/api/query/v1_test.go | 6 +- pkg/query/querier.go | 79 ++++++++++++++--- pkg/query/querier_test.go | 37 ++++++-- pkg/query/query_bench_test.go | 13 +-- pkg/query/query_test.go | 11 ++- pkg/store/telemetry.go | 88 +++++++++++++++++++ test/e2e/query_test.go | 155 +++++++++++++++++++++++++++++++--- 12 files changed, 478 insertions(+), 51 deletions(-) create mode 100644 pkg/store/telemetry.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e1d2143c3..6e2e854ce2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. +- [#5741](https://github.com/thanos-io/thanos/pull/5741) Query: add metrics on how much data is being selected by downstream Store APIs. 
- [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. ### Changed diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 54724f59a6..5e5a7fc7cd 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,6 +25,8 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" + "google.golang.org/grpc" + v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-community/promql-engine/engine" apiv1 "github.com/thanos-io/thanos/pkg/api/query" @@ -54,7 +56,6 @@ import ( "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/tls" "github.com/thanos-io/thanos/pkg/ui" - "google.golang.org/grpc" ) const ( @@ -205,6 +206,10 @@ func registerQuery(app *extkingpin.App) { alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field.").String() grpcProxyStrategy := cmd.Flag("grpc.proxy-strategy", "Strategy to use when proxying Series requests to leaf nodes. Hidden and only used for testing, will be removed after lazy becomes the default.").Default(string(store.EagerRetrieval)).Hidden().Enum(string(store.EagerRetrieval), string(store.LazyRetrieval)) + queryTelemetryDurationQuantiles := cmd.Flag("query.telemetry.request-duration-seconds-quantiles", "The quantiles for exporting metrics about the request duration quantiles.").Default("0.1", "0.25", "0.75", "1.25", "1.75", "2.5", "3", "5", "10").Float64List() + queryTelemetrySamplesQuantiles := cmd.Flag("query.telemetry.request-samples-quantiles", "The quantiles for exporting metrics about the samples count quantiles.").Default("100", "1000", "10000", "100000", "1000000").Int64List() + queryTelemetrySeriesQuantiles := cmd.Flag("query.telemetry.request-series-seconds-quantiles", "The quantiles for exporting metrics about the series count quantiles.").Default("10", "100", "1000", "10000", "100000").Int64List() + cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { selectorLset, err := parseFlagLabels(*selectorLabels) if err != nil { @@ -317,6 +322,9 @@ func registerQuery(app *extkingpin.App) { *alertQueryURL, *grpcProxyStrategy, component.Query, + *queryTelemetryDurationQuantiles, + *queryTelemetrySamplesQuantiles, + *queryTelemetrySeriesQuantiles, promqlEngineType(*promqlEngine), ) }) @@ -390,6 +398,9 @@ func runQuery( alertQueryURL string, grpcProxyStrategy string, comp component.Component, + queryTelemetryDurationQuantiles []float64, + queryTelemetrySamplesQuantiles []int64, + queryTelemetrySeriesQuantiles []int64, promqlEngine promqlEngineType, ) error { if alertQueryURL == "" { @@ -694,6 +705,12 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), + store.NewSeriesStatsAggregator( + reg, + queryTelemetryDurationQuantiles, + queryTelemetrySamplesQuantiles, + queryTelemetrySeriesQuantiles, + ), reg, ) diff --git a/docs/components/query.md b/docs/components/query.md index 1a028ee3ed..c3690ca05a 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -381,6 +381,15 @@ Flags: be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. + --query.telemetry.request-duration-seconds-quantiles=0.1... ... + The quantiles for exporting metrics about the + request duration quantiles. 
+ --query.telemetry.request-samples-quantiles=100... ... + The quantiles for exporting metrics about the + samples count quantiles. + --query.telemetry.request-series-seconds-quantiles=10... ... + The quantiles for exporting metrics about the + series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' @@ -463,3 +472,13 @@ Flags: of Prometheus. ``` + +## Exported metrics + +Thanos Query also exports metrics about its own performance. You can find a list with these metrics below. + +**Disclaimer**: this list is incomplete. The remaining metrics will be added over time. + +| Name | Type | Labels | Description | +|-----------------------------------------|-----------|-----------------------|-------------------------------------------------------------------------------------------------------------------| +| thanos_store_api_query_duration_seconds | Histogram | samples_le, series_le | Duration of the Thanos Store API select phase for a query according to the amount of samples and series selected. | diff --git a/pkg/api/query/grpc.go b/pkg/api/query/grpc.go index 144166f57b..8848cd2ffe 100644 --- a/pkg/api/query/grpc.go +++ b/pkg/api/query/grpc.go @@ -94,6 +94,7 @@ func (g *GRPCAPI) Query(request *querypb.QueryRequest, server querypb.Query_Quer request.EnableQueryPushdown, false, request.ShardInfo, + query.NoopSeriesStatsReporter, ) qry, err := g.queryEngine.NewInstantQuery(queryable, &promql.QueryOpts{LookbackDelta: lookbackDelta}, request.Query, ts) if err != nil { @@ -168,6 +169,7 @@ func (g *GRPCAPI) QueryRange(request *querypb.QueryRangeRequest, srv querypb.Que request.EnableQueryPushdown, false, request.ShardInfo, + query.NoopSeriesStatsReporter, ) startTime := time.Unix(request.StartTimeSeconds, 0) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index cbe1327a36..918bcbf5fd 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -41,10 +41,8 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" - v1 "github.com/prometheus/prometheus/web/api/v1" - "github.com/prometheus/prometheus/util/stats" - + v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" "github.com/thanos-io/thanos/pkg/exemplars/exemplarspb" @@ -57,6 +55,7 @@ import ( "github.com/thanos-io/thanos/pkg/rules" "github.com/thanos-io/thanos/pkg/rules/rulespb" "github.com/thanos-io/thanos/pkg/runutil" + "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/targets/targetspb" @@ -107,6 +106,13 @@ type QueryAPI struct { defaultMetadataTimeRange time.Duration queryRangeHist prometheus.Histogram + + seriesStatsAggregator seriesQueryPerformanceMetricsAggregator +} + +type seriesQueryPerformanceMetricsAggregator interface { + Aggregate(seriesStats storepb.SeriesStatsCounter) + Observe(duration float64) } // NewQueryAPI returns an initialized QueryAPI type. 
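
The small `seriesQueryPerformanceMetricsAggregator` interface above is what keeps `QueryAPI` decoupled from the concrete `store.seriesStatsAggregator`: anything implementing `Aggregate` and `Observe` can be injected. A minimal sketch of an alternative implementation follows; `logAggregator` is hypothetical and not part of this patch.

```go
package main // illustrative only; a real implementation would live next to QueryAPI

import (
	"os"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"

	"github.com/thanos-io/thanos/pkg/store/storepb"
)

// logAggregator satisfies seriesQueryPerformanceMetricsAggregator by logging
// per-query totals instead of exporting a histogram.
type logAggregator struct {
	logger log.Logger
	totals storepb.SeriesStatsCounter
}

// Aggregate merges the stats reported by each fanned-out Store API call.
func (a *logAggregator) Aggregate(stats storepb.SeriesStatsCounter) {
	a.totals.Series += stats.Series
	a.totals.Samples += stats.Samples
	a.totals.Chunks += stats.Chunks
}

// Observe flushes the accumulated totals together with the select duration.
func (a *logAggregator) Observe(seconds float64) {
	level.Info(a.logger).Log(
		"msg", "store API select phase finished",
		"duration_seconds", seconds,
		"series", a.totals.Series,
		"samples", a.totals.Samples,
		"chunks", a.totals.Chunks,
	)
	a.totals = storepb.SeriesStatsCounter{}
}

func main() {
	agg := &logAggregator{logger: log.NewLogfmtLogger(os.Stdout)}
	agg.Aggregate(storepb.SeriesStatsCounter{Series: 3, Samples: 120, Chunks: 4})
	agg.Observe(0.042)
}
```

Passing nil keeps the no-op default, as the `NewQueryAPI` hunk below shows.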
@@ -134,8 +140,12 @@ func NewQueryAPI( defaultMetadataTimeRange time.Duration, disableCORS bool, gate gate.Gate, + statsAggregator seriesQueryPerformanceMetricsAggregator, reg *prometheus.Registry, ) *QueryAPI { + if statsAggregator == nil { + statsAggregator = &store.NoopSeriesStatsAggregator{} + } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), logger: logger, @@ -160,6 +170,7 @@ func NewQueryAPI( defaultInstantQueryMaxSourceResolution: defaultInstantQueryMaxSourceResolution, defaultMetadataTimeRange: defaultMetadataTimeRange, disableCORS: disableCORS, + seriesStatsAggregator: statsAggregator, queryRangeHist: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "thanos_query_range_requested_timespan_duration_seconds", @@ -396,7 +407,24 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro span, ctx := tracing.StartSpan(ctx, "promql_instant_query") defer span.Finish() - qry, err := qapi.queryEngine.NewInstantQuery(qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), ts) + var seriesStats []storepb.SeriesStatsCounter + qry, err := qapi.queryEngine.NewInstantQuery( + qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + maxSourceResolution, + enablePartialResponse, + qapi.enableQueryPushdown, + false, + shardInfo, + query.NewAggregateStatsReporter(&seriesStats), + ), + &promql.QueryOpts{LookbackDelta: lookbackDelta}, + r.FormValue("query"), + ts, + ) + if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}, func() {} } @@ -409,6 +437,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } defer qapi.gate.Done() + beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -421,6 +450,10 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } + for i := range seriesStats { + qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) + } + qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. 
var qs stats.QueryStats @@ -525,8 +558,19 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap span, ctx := tracing.StartSpan(ctx, "promql_range_query") defer span.Finish() + var seriesStats []storepb.SeriesStatsCounter qry, err := qapi.queryEngine.NewRangeQuery( - qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), + qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + maxSourceResolution, + enablePartialResponse, + qapi.enableQueryPushdown, + false, + shardInfo, + query.NewAggregateStatsReporter(&seriesStats), + ), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), start, @@ -545,6 +589,7 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } defer qapi.gate.Done() + beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -555,6 +600,10 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } + for i := range seriesStats { + qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) + } + qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -600,8 +649,17 @@ func (qapi *QueryAPI) labelValues(r *http.Request) (interface{}, []error, *api.A matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + true, + nil, + storeDebugMatchers, + 0, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -687,8 +745,18 @@ func (qapi *QueryAPI) series(r *http.Request) (interface{}, []error, *api.ApiErr return nil, nil, apiErr, func() {} } - q, err := qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, math.MaxInt64, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + math.MaxInt64, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -737,8 +805,17 @@ func (qapi *QueryAPI) labelNames(r *http.Request) (interface{}, []error, *api.Ap matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). 
- Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + true, + nil, + storeDebugMatchers, + 0, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 000410ddbd..07c562af9c 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -44,9 +44,8 @@ import ( "github.com/prometheus/prometheus/tsdb/tsdbutil" promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" - "github.com/thanos-io/thanos/pkg/compact" - baseAPI "github.com/thanos-io/thanos/pkg/api" + "github.com/thanos-io/thanos/pkg/compact" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/gate" "github.com/thanos-io/thanos/pkg/query" @@ -198,6 +197,7 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -737,6 +737,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -750,6 +751,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 361834c07d..b094cbd45c 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -7,6 +7,7 @@ import ( "context" "sort" "strings" + "sync" "time" "github.com/go-kit/log" @@ -28,21 +29,60 @@ import ( "github.com/thanos-io/thanos/pkg/tracing" ) +type seriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) + +var NoopSeriesStatsReporter seriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} + +func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) seriesStatsReporter { + var mutex sync.Mutex + return func(s storepb.SeriesStatsCounter) { + mutex.Lock() + defer mutex.Unlock() + *stats = append(*stats, s) + } +} + // QueryableCreator returns implementation of promql.Queryable that fetches data from the proxy store API endpoints. // If deduplication is enabled, all data retrieved from it will be deduplicated along all replicaLabels by default. // When the replicaLabels argument is not empty it overwrites the global replicaLabels flag. This allows specifying // replicaLabels at query time. // maxResolutionMillis controls downsampling resolution that is allowed (specified in milliseconds). // partialResponse controls `partialResponseDisabled` option of StoreAPI and partial response behavior of proxy. 
-type QueryableCreator func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable +type QueryableCreator func( + deduplicate bool, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + shardInfo *storepb.ShardInfo, + seriesStatsReporter seriesStatsReporter, +) storage.Queryable // NewQueryableCreator creates QueryableCreator. -func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy storepb.StoreServer, maxConcurrentSelects int, selectTimeout time.Duration) QueryableCreator { +func NewQueryableCreator( + logger log.Logger, + reg prometheus.Registerer, + proxy storepb.StoreServer, + maxConcurrentSelects int, + selectTimeout time.Duration, +) QueryableCreator { duration := promauto.With( extprom.WrapRegistererWithPrefix("concurrent_selects_", reg), ).NewHistogram(gate.DurationHistogramOpts) - return func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable { + return func( + deduplicate bool, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + shardInfo *storepb.ShardInfo, + seriesStatsReporter seriesStatsReporter, + ) storage.Queryable { return &queryable{ logger: logger, replicaLabels: replicaLabels, @@ -59,6 +99,7 @@ func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy sto selectTimeout: selectTimeout, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, + seriesStatsReporter: seriesStatsReporter, } } } @@ -77,11 +118,12 @@ type queryable struct { selectTimeout time.Duration enableQueryPushdown bool shardInfo *storepb.ShardInfo + seriesStatsReporter seriesStatsReporter } // Querier returns a new storage querier against the underlying proxy store API. 
func (q *queryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) { - return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo), nil + return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo, q.seriesStatsReporter), nil } type querier struct { @@ -100,6 +142,7 @@ type querier struct { selectGate gate.Gate selectTimeout time.Duration shardInfo *storepb.ShardInfo + seriesStatsReporter seriesStatsReporter } // newQuerier creates implementation of storage.Querier that fetches data from the proxy @@ -107,16 +150,20 @@ type querier struct { func newQuerier( ctx context.Context, logger log.Logger, - mint, maxt int64, + mint, + maxt int64, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, proxy storepb.StoreServer, deduplicate bool, maxResolutionMillis int64, - partialResponse, enableQueryPushdown bool, skipChunks bool, + partialResponse, + enableQueryPushdown, + skipChunks bool, selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, + seriesStatsReporter seriesStatsReporter, ) *querier { if logger == nil { logger = log.NewNopLogger() @@ -145,6 +192,7 @@ func newQuerier( skipChunks: skipChunks, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, + seriesStatsReporter: seriesStatsReporter, } } @@ -157,8 +205,9 @@ type seriesServer struct { storepb.Store_SeriesServer ctx context.Context - seriesSet []storepb.Series - warnings []string + seriesSet []storepb.Series + seriesSetStats storepb.SeriesStatsCounter + warnings []string } func (s *seriesServer) Send(r *storepb.SeriesResponse) error { @@ -169,6 +218,7 @@ func (s *seriesServer) Send(r *storepb.SeriesResponse) error { if r.GetSeries() != nil { s.seriesSet = append(s.seriesSet, *r.GetSeries()) + s.seriesSetStats.Count(r.GetSeries()) return nil } @@ -257,11 +307,12 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match span, ctx := tracing.StartSpan(ctx, "querier_select_select_fn") defer span.Finish() - set, err := q.selectFn(ctx, hints, ms...) + set, stats, err := q.selectFn(ctx, hints, ms...) if err != nil { promise <- storage.ErrSeriesSet(err) return } + q.seriesStatsReporter(stats) promise <- set }() @@ -279,10 +330,10 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match }} } -func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, error) { +func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, storepb.SeriesStatsCounter, error) { sms, err := storepb.PromMatchersToMatchers(ms...) if err != nil { - return nil, errors.Wrap(err, "convert matchers") + return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "convert matchers") } aggrs := aggrsFromFunc(hints.Func) @@ -310,7 +361,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . 
Step: hints.Step, Range: hints.Range, }, resp); err != nil { - return nil, errors.Wrap(err, "proxy Series()") + return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "proxy Series()") } var warns storage.Warnings @@ -342,7 +393,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . set: newStoreSeriesSet(resp.seriesSet), aggrs: aggrs, warns: warns, - }, nil + }, resp.seriesSetStats, nil } // TODO(fabxc): this could potentially pushed further down into the store API to make true streaming possible. @@ -357,7 +408,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . // The merged series set assembles all potentially-overlapping time ranges of the same series into a single one. // TODO(bwplotka): We could potentially dedup on chunk level, use chunk iterator for that when available. - return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), nil + return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), resp.seriesSetStats, nil } // sortDedupLabels re-sorts the set so that the same series with different replica diff --git a/pkg/query/querier_test.go b/pkg/query/querier_test.go index a43c75e7a5..2e31fa65a0 100644 --- a/pkg/query/querier_test.go +++ b/pkg/query/querier_test.go @@ -44,7 +44,17 @@ func TestQueryableCreator_MaxResolution(t *testing.T) { queryableCreator := NewQueryableCreator(nil, nil, testProxy, 2, 5*time.Second) oneHourMillis := int64(1*time.Hour) / int64(time.Millisecond) - queryable := queryableCreator(false, nil, nil, oneHourMillis, false, false, false, nil) + queryable := queryableCreator( + false, + nil, + nil, + oneHourMillis, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) q, err := queryable.Querier(context.Background(), 0, 42) testutil.Ok(t, err) @@ -71,7 +81,22 @@ func TestQuerier_DownsampledData(t *testing.T) { } timeout := 10 * time.Second - q := NewQueryableCreator(nil, nil, testProxy, 2, timeout)(false, nil, nil, 9999999, false, false, false, nil) + q := NewQueryableCreator( + nil, + nil, + testProxy, + 2, + timeout, + )(false, + nil, + nil, + 9999999, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) engine := promql.NewEngine( promql.EngineOpts{ MaxSamples: math.MaxInt32, @@ -365,7 +390,7 @@ func TestQuerier_Select_AfterPromQL(t *testing.T) { g := gate.New(2) mq := &mockedQueryable{ Creator: func(mint, maxt int64) storage.Querier { - return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) + return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) }, } t.Cleanup(func() { @@ -609,7 +634,7 @@ func TestQuerier_Select(t *testing.T) { {dedup: true, expected: []series{tcase.expectedAfterDedup}}, } { g := gate.New(2) - q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, func(i storepb.SeriesStatsCounter) {}) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) t.Run(fmt.Sprintf("dedup=%v", sc.dedup), func(t *testing.T) { @@ -838,7 +863,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 100 * time.Second g := gate.New(2) - q := 
newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) @@ -908,7 +933,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 5 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) diff --git a/pkg/query/query_bench_test.go b/pkg/query/query_bench_test.go index 301c880877..84efb46820 100644 --- a/pkg/query/query_bench_test.go +++ b/pkg/query/query_bench_test.go @@ -80,12 +80,13 @@ func benchQuerySelect(t testutil.TB, totalSamples, totalSeries int, dedup bool) logger := log.NewNopLogger() q := &querier{ - ctx: context.Background(), - logger: logger, - proxy: &mockedStoreServer{responses: resps}, - replicaLabels: map[string]struct{}{"a_replica": {}}, - deduplicate: dedup, - selectGate: gate.NewNoop(), + ctx: context.Background(), + logger: logger, + proxy: &mockedStoreServer{responses: resps}, + replicaLabels: map[string]struct{}{"a_replica": {}}, + deduplicate: dedup, + selectGate: gate.NewNoop(), + seriesStatsReporter: NoopSeriesStatsReporter, } testSelect(t, q, expectedSeries) } diff --git a/pkg/query/query_test.go b/pkg/query/query_test.go index 99e29be66f..060571fc70 100644 --- a/pkg/query/query_test.go +++ b/pkg/query/query_test.go @@ -54,7 +54,16 @@ func TestQuerier_Proxy(t *testing.T) { name: fmt.Sprintf("store number %v", i), }) } - return q(true, nil, nil, 0, false, false, false, nil) + return q(true, + nil, + nil, + 0, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) } for _, fn := range files { diff --git a/pkg/store/telemetry.go b/pkg/store/telemetry.go new file mode 100644 index 0000000000..a854daaf0c --- /dev/null +++ b/pkg/store/telemetry.go @@ -0,0 +1,88 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package store + +import ( + "strconv" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/thanos-io/thanos/pkg/store/storepb" +) + +// seriesStatsAggregator aggregates results from fanned-out queries into a histogram given their +// response's shape. +type seriesStatsAggregator struct { + queryDuration *prometheus.HistogramVec + + seriesLeBuckets []int64 + samplesLeBuckets []int64 + seriesStats storepb.SeriesStatsCounter +} + +// NewSeriesStatsAggregator is a constructor for seriesStatsAggregator. 
+func NewSeriesStatsAggregator( + reg prometheus.Registerer, + durationQuantiles []float64, + sampleQuantiles []int64, + seriesQuantiles []int64, +) *seriesStatsAggregator { + return &seriesStatsAggregator{ + queryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ + Name: "thanos_store_api_query_duration_seconds", + Help: "Duration of the Thanos Store API select phase for a query.", + Buckets: durationQuantiles, + }, []string{"series_le", "samples_le"}), + seriesLeBuckets: seriesQuantiles, + samplesLeBuckets: sampleQuantiles, + seriesStats: storepb.SeriesStatsCounter{}, + } +} + +// Aggregate is an aggregator for merging `storepb.SeriesStatsCounter` for each incoming fanned out query. +func (s *seriesStatsAggregator) Aggregate(stats storepb.SeriesStatsCounter) { + s.seriesStats.Series += stats.Series + s.seriesStats.Samples += stats.Samples + s.seriesStats.Chunks += stats.Chunks +} + +// Observe commits the aggregated SeriesStatsCounter as an observation. +func (s *seriesStatsAggregator) Observe(duration float64) { + if s.seriesStats.Series == 0 || s.seriesStats.Samples == 0 || s.seriesStats.Chunks == 0 { + return + } + // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 + seriesLeBucket := s.findBucket(float64(s.seriesStats.Series), s.seriesLeBuckets) + samplesLeBucket := s.findBucket(float64(s.seriesStats.Samples), s.samplesLeBuckets) + s.queryDuration.With(prometheus.Labels{ + "series_le": strconv.Itoa(int(seriesLeBucket)), + "samples_le": strconv.Itoa(int(samplesLeBucket)), + }).Observe(duration) + s.reset() +} + +func (s *seriesStatsAggregator) reset() { + s.seriesStats = storepb.SeriesStatsCounter{} +} + +func (s *seriesStatsAggregator) findBucket(value float64, quantiles []int64) int64 { + if len(quantiles) == 0 { + return 0 + } + var foundBucket int64 + for _, bucket := range quantiles { + foundBucket = bucket + if value < float64(bucket) { + break + } + } + return foundBucket +} + +// NoopSeriesStatsAggregator is a query performance series aggregator that does nothing. +type NoopSeriesStatsAggregator struct{} + +func (s *NoopSeriesStatsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {} + +func (s *NoopSeriesStatsAggregator) Observe(_ float64) {} diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 7fc56bda97..04b425061a 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -23,6 +23,7 @@ import ( "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" "github.com/efficientgo/e2e" + e2edb "github.com/efficientgo/e2e/db" e2emon "github.com/efficientgo/e2e/monitoring" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -578,6 +579,130 @@ func newSample(s fakeMetricSample) model.Sample { } } +func TestQueryStoreMetrics(t *testing.T) { + t.Parallel() + + // Build up. + e, err := e2e.New(e2e.WithName("storemetrics01")) + testutil.Ok(t, err) + t.Cleanup(e2ethanos.CleanScenario(t, e)) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + t.Cleanup(cancel) + + bucket := "store-gw-test" + minio := e2ethanos.NewMinio(e, "thanos-minio", bucket) + testutil.Ok(t, e2e.StartAndWaitReady(minio)) + + l := log.NewLogfmtLogger(os.Stdout) + bkt, err := s3.NewBucketWithConfig(l, e2ethanos.NewS3Config(bucket, minio.Endpoint("https"), minio.Dir()), "test") + testutil.Ok(t, err) + + // Preparing 2 different blocks for the tests. 
+	{
+		blockSizes := []struct {
+			samples int
+			series  int
+			name    string
+		}{
+			{samples: 10, series: 1, name: "one_series"},
+			{samples: 10, series: 1001, name: "thousand_one_series"},
+		}
+		now := time.Now()
+		externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0")
+		dir := filepath.Join(e.SharedDir(), "tmp")
+		testutil.Ok(t, os.MkdirAll(dir, os.ModePerm))
+		for _, blockSize := range blockSizes {
+			series := make([]labels.Labels, blockSize.series)
+			for i := 0; i < blockSize.series; i++ {
+				seriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i))
+				series[i] = seriesLabels
+			}
+			blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx,
+				dir,
+				series,
+				blockSize.samples,
+				timestamp.FromTime(now),
+				timestamp.FromTime(now.Add(2*time.Hour)),
+				30*time.Minute,
+				externalLabels,
+				0,
+				metadata.NoneFunc,
+			)
+			testutil.Ok(t, err)
+			testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String()))
+		}
+	}
+
+	storeGW := e2ethanos.NewStoreGW(
+		e,
+		"s1",
+		client.BucketConfig{
+			Type:   client.S3,
+			Config: e2ethanos.NewS3Config(bucket, minio.InternalEndpoint("https"), minio.InternalDir()),
+		},
+		"",
+		nil,
+	)
+	querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init()
+	testutil.Ok(t, e2e.StartAndWaitReady(storeGW, querier))
+	testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced"))
+
+	// Querying the series in the previously created blocks to ensure we produce Store API query metrics.
+	{
+		instantQuery(t, ctx, querier.Endpoint("http"), func() string {
+			return "max_over_time(one_series{instance='foo_0'}[2h])"
+		}, time.Now, promclient.QueryOptions{
+			Deduplicate: true,
+		}, 1)
+
+		instantQuery(t, ctx, querier.Endpoint("http"), func() string {
+			return "max_over_time(thousand_one_series[2h])"
+		}, time.Now, promclient.QueryOptions{
+			Deduplicate: true,
+		}, 1001)
+	}
+
+	mon, err := e2emon.Start(e)
+	testutil.Ok(t, err)
+
+	queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string {
+		return "thanos_store_api_query_duration_seconds_count{samples_le='100000',series_le='10000'}"
+	}, time.Now, promclient.QueryOptions{
+		Deduplicate: true,
+	}, model.Vector{
+		&model.Sample{
+			Metric: model.Metric{
+				"__name__":   "thanos_store_api_query_duration_seconds_count",
+				"instance":   "storemetrics01-querier-1:8080",
+				"job":        "querier-1",
+				"samples_le": "100000",
+				"series_le":  "10000",
+			},
+			Value: model.SampleValue(1),
+		},
+	})
+
+	queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string {
+		return "thanos_store_api_query_duration_seconds_count{samples_le='100',series_le='10'}"
+	}, time.Now, promclient.QueryOptions{
+		Deduplicate: true,
+	}, model.Vector{
+		&model.Sample{
+			Metric: model.Metric{
+				"__name__":   "thanos_store_api_query_duration_seconds_count",
+				"instance":   "storemetrics01-querier-1:8080",
+				"job":        "querier-1",
+				"samples_le": "100",
+				"series_le":  "10",
+			},
+			Value: model.SampleValue(1),
+		},
+	})
+}
+
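The bucket labels asserted above follow from `findBucket` in pkg/store/telemetry.go, which returns the first configured bucket the observed value falls under. The first query touches one series and, assuming `CreateBlockWithBlockDelay` writes 10 samples per series (which the assertions corroborate), 10 samples; the second touches 1,001 series and roughly 10,010 samples. A self-contained check of that bucket assignment, with the default quantile lists from the new query flags:

```go
package main

import "fmt"

// findBucket mirrors the helper in pkg/store/telemetry.go: it returns the
// first bucket the value is below, or the last bucket if it exceeds them all.
func findBucket(value float64, quantiles []int64) int64 {
	if len(quantiles) == 0 {
		return 0
	}
	var foundBucket int64
	for _, bucket := range quantiles {
		foundBucket = bucket
		if value < float64(bucket) {
			break
		}
	}
	return foundBucket
}

func main() {
	seriesBuckets := []int64{10, 100, 1000, 10000, 100000}       // default series quantiles
	samplesBuckets := []int64{100, 1000, 10000, 100000, 1000000} // default samples quantiles

	fmt.Println(findBucket(1, seriesBuckets), findBucket(10, samplesBuckets))       // 10 100
	fmt.Println(findBucket(1001, seriesBuckets), findBucket(10010, samplesBuckets)) // 10000 100000
}
```

// Regression test for https://github.com/thanos-io/thanos/issues/5033.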
// Tests whether queries work with mixed sources, and with functions // that we are pushing down: min, max, min_over_time, max_over_time, @@ -882,18 +1007,10 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin "msg", fmt.Sprintf("Waiting for %d results for query %s", expectedSeriesLen, q()), ) testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { - res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) + res, err := simpleInstantQuery(t, ctx, addr, q, ts, opts, expectedSeriesLen) if err != nil { return err } - - if len(warnings) > 0 { - return errors.Errorf("unexpected warnings %s", warnings) - } - - if len(res) != expectedSeriesLen { - return errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) - } result = res return nil })) @@ -901,6 +1018,24 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin return result } +func simpleInstantQuery(t testing.TB, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expectedSeriesLen int) (model.Vector, error) { + res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) + if err != nil { + return nil, err + } + + if len(warnings) > 0 { + return nil, errors.Errorf("unexpected warnings %s", warnings) + } + + if len(res) != expectedSeriesLen { + return nil, errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) + } + + sortResults(res) + return res, nil +} + func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) { t.Helper() @@ -912,7 +1047,7 @@ func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() "caller", "queryWaitAndAssert", "msg", fmt.Sprintf("Waiting for %d results for query %s", len(expected), q()), ) - testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.RetryWithLog(logger, 10*time.Second, ctx.Done(), func() error { res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err From ea646a6010e4fc11997880d696ec7b2718b7ecf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Tue, 18 Oct 2022 12:48:11 +0300 Subject: [PATCH 16/43] docs: mark me as shepherd for next release (#5797) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's release the RC on Friday. 
Signed-off-by: Giedrius Statkevičius Signed-off-by: Giedrius Statkevičius Signed-off-by: utukj --- docs/release-process.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-process.md b/docs/release-process.md index 84415b0f11..1d99961df5 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -24,7 +24,7 @@ Release shepherd responsibilities: | Release | Time of first RC | Shepherd (GitHub handle) | |---------|----------------------|-------------------------------| | v0.30.0 | (planned) 2022.11.21 | No one ATM | -| v0.29.0 | (planned) 2022.09.29 | No one ATM | +| v0.29.0 | 2022.10.21 | `@GiedriusS` | | v0.28.0 | 2022.08.22 | `@yeya24` | | v0.27.0 | 2022.06.21 | `@wiardvanrij` and `@matej-g` | | v0.26.0 | 2022.04.29 | `@wiardvanrij` | From f01954d97d2c78657560476612f549f8b4cc882b Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 16:56:34 +0100 Subject: [PATCH 17/43] Revert "docs: mark me as shepherd for next release (#5797)" This reverts commit ea646a6010e4fc11997880d696ec7b2718b7ecf2. Signed-off-by: utukj --- docs/release-process.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-process.md b/docs/release-process.md index 1d99961df5..84415b0f11 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -24,7 +24,7 @@ Release shepherd responsibilities: | Release | Time of first RC | Shepherd (GitHub handle) | |---------|----------------------|-------------------------------| | v0.30.0 | (planned) 2022.11.21 | No one ATM | -| v0.29.0 | 2022.10.21 | `@GiedriusS` | +| v0.29.0 | (planned) 2022.09.29 | No one ATM | | v0.28.0 | 2022.08.22 | `@yeya24` | | v0.27.0 | 2022.06.21 | `@wiardvanrij` and `@matej-g` | | v0.26.0 | 2022.04.29 | `@wiardvanrij` | From 3d6dd071899861f570d1fb9bed985637b6e99721 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 16:57:11 +0100 Subject: [PATCH 18/43] Revert "Query: add query metrics to calls going through the Store API (#5741)" This reverts commit eec4fd0053b841ee5fe5284d0fe25e6cbe14a738. Signed-off-by: utukj --- CHANGELOG.md | 1 - cmd/thanos/query.go | 19 +---- docs/components/query.md | 19 ----- pkg/api/query/grpc.go | 2 - pkg/api/query/v1.go | 99 +++------------------- pkg/api/query/v1_test.go | 6 +- pkg/query/querier.go | 79 +++-------------- pkg/query/querier_test.go | 37 ++------ pkg/query/query_bench_test.go | 13 ++- pkg/query/query_test.go | 11 +-- pkg/store/telemetry.go | 88 ------------------- test/e2e/query_test.go | 155 +++------------------------------- 12 files changed, 51 insertions(+), 478 deletions(-) delete mode 100644 pkg/store/telemetry.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e2e854ce2..6e1d2143c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,6 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. -- [#5741](https://github.com/thanos-io/thanos/pull/5741) Query: add metrics on how much data is being selected by downstream Store APIs. - [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. 
### Changed diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 5e5a7fc7cd..54724f59a6 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,8 +25,6 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" - "google.golang.org/grpc" - v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-community/promql-engine/engine" apiv1 "github.com/thanos-io/thanos/pkg/api/query" @@ -56,6 +54,7 @@ import ( "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/tls" "github.com/thanos-io/thanos/pkg/ui" + "google.golang.org/grpc" ) const ( @@ -206,10 +205,6 @@ func registerQuery(app *extkingpin.App) { alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field.").String() grpcProxyStrategy := cmd.Flag("grpc.proxy-strategy", "Strategy to use when proxying Series requests to leaf nodes. Hidden and only used for testing, will be removed after lazy becomes the default.").Default(string(store.EagerRetrieval)).Hidden().Enum(string(store.EagerRetrieval), string(store.LazyRetrieval)) - queryTelemetryDurationQuantiles := cmd.Flag("query.telemetry.request-duration-seconds-quantiles", "The quantiles for exporting metrics about the request duration quantiles.").Default("0.1", "0.25", "0.75", "1.25", "1.75", "2.5", "3", "5", "10").Float64List() - queryTelemetrySamplesQuantiles := cmd.Flag("query.telemetry.request-samples-quantiles", "The quantiles for exporting metrics about the samples count quantiles.").Default("100", "1000", "10000", "100000", "1000000").Int64List() - queryTelemetrySeriesQuantiles := cmd.Flag("query.telemetry.request-series-seconds-quantiles", "The quantiles for exporting metrics about the series count quantiles.").Default("10", "100", "1000", "10000", "100000").Int64List() - cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { selectorLset, err := parseFlagLabels(*selectorLabels) if err != nil { @@ -322,9 +317,6 @@ func registerQuery(app *extkingpin.App) { *alertQueryURL, *grpcProxyStrategy, component.Query, - *queryTelemetryDurationQuantiles, - *queryTelemetrySamplesQuantiles, - *queryTelemetrySeriesQuantiles, promqlEngineType(*promqlEngine), ) }) @@ -398,9 +390,6 @@ func runQuery( alertQueryURL string, grpcProxyStrategy string, comp component.Component, - queryTelemetryDurationQuantiles []float64, - queryTelemetrySamplesQuantiles []int64, - queryTelemetrySeriesQuantiles []int64, promqlEngine promqlEngineType, ) error { if alertQueryURL == "" { @@ -705,12 +694,6 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), - store.NewSeriesStatsAggregator( - reg, - queryTelemetryDurationQuantiles, - queryTelemetrySamplesQuantiles, - queryTelemetrySeriesQuantiles, - ), reg, ) diff --git a/docs/components/query.md b/docs/components/query.md index c3690ca05a..1a028ee3ed 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -381,15 +381,6 @@ Flags: be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. - --query.telemetry.request-duration-seconds-quantiles=0.1... ... - The quantiles for exporting metrics about the - request duration quantiles. - --query.telemetry.request-samples-quantiles=100... ... 
- The quantiles for exporting metrics about the - samples count quantiles. - --query.telemetry.request-series-seconds-quantiles=10... ... - The quantiles for exporting metrics about the - series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' @@ -472,13 +463,3 @@ Flags: of Prometheus. ``` - -## Exported metrics - -Thanos Query also exports metrics about its own performance. You can find a list with these metrics below. - -**Disclaimer**: this list is incomplete. The remaining metrics will be added over time. - -| Name | Type | Labels | Description | -|-----------------------------------------|-----------|-----------------------|-------------------------------------------------------------------------------------------------------------------| -| thanos_store_api_query_duration_seconds | Histogram | samples_le, series_le | Duration of the Thanos Store API select phase for a query according to the amount of samples and series selected. | diff --git a/pkg/api/query/grpc.go b/pkg/api/query/grpc.go index 8848cd2ffe..144166f57b 100644 --- a/pkg/api/query/grpc.go +++ b/pkg/api/query/grpc.go @@ -94,7 +94,6 @@ func (g *GRPCAPI) Query(request *querypb.QueryRequest, server querypb.Query_Quer request.EnableQueryPushdown, false, request.ShardInfo, - query.NoopSeriesStatsReporter, ) qry, err := g.queryEngine.NewInstantQuery(queryable, &promql.QueryOpts{LookbackDelta: lookbackDelta}, request.Query, ts) if err != nil { @@ -169,7 +168,6 @@ func (g *GRPCAPI) QueryRange(request *querypb.QueryRangeRequest, srv querypb.Que request.EnableQueryPushdown, false, request.ShardInfo, - query.NoopSeriesStatsReporter, ) startTime := time.Unix(request.StartTimeSeconds, 0) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index 918bcbf5fd..cbe1327a36 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -41,8 +41,10 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" - "github.com/prometheus/prometheus/util/stats" v1 "github.com/prometheus/prometheus/web/api/v1" + + "github.com/prometheus/prometheus/util/stats" + "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" "github.com/thanos-io/thanos/pkg/exemplars/exemplarspb" @@ -55,7 +57,6 @@ import ( "github.com/thanos-io/thanos/pkg/rules" "github.com/thanos-io/thanos/pkg/rules/rulespb" "github.com/thanos-io/thanos/pkg/runutil" - "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/targets/targetspb" @@ -106,13 +107,6 @@ type QueryAPI struct { defaultMetadataTimeRange time.Duration queryRangeHist prometheus.Histogram - - seriesStatsAggregator seriesQueryPerformanceMetricsAggregator -} - -type seriesQueryPerformanceMetricsAggregator interface { - Aggregate(seriesStats storepb.SeriesStatsCounter) - Observe(duration float64) } // NewQueryAPI returns an initialized QueryAPI type. 
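// For reference, the per-query telemetry being reverted in this file followed
// this pattern (a condensed sketch; every name below appears in the removed
// lines of this patch):
//
//	var seriesStats []storepb.SeriesStatsCounter
//	queryable := qapi.queryableCreate(
//		..., query.NewAggregateStatsReporter(&seriesStats))
//	beforeRange := time.Now()
//	res := qry.Exec(ctx)
//	for i := range seriesStats {
//		qapi.seriesStatsAggregator.Aggregate(seriesStats[i])
//	}
//	qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds())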
@@ -140,12 +134,8 @@ func NewQueryAPI( defaultMetadataTimeRange time.Duration, disableCORS bool, gate gate.Gate, - statsAggregator seriesQueryPerformanceMetricsAggregator, reg *prometheus.Registry, ) *QueryAPI { - if statsAggregator == nil { - statsAggregator = &store.NoopSeriesStatsAggregator{} - } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), logger: logger, @@ -170,7 +160,6 @@ func NewQueryAPI( defaultInstantQueryMaxSourceResolution: defaultInstantQueryMaxSourceResolution, defaultMetadataTimeRange: defaultMetadataTimeRange, disableCORS: disableCORS, - seriesStatsAggregator: statsAggregator, queryRangeHist: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "thanos_query_range_requested_timespan_duration_seconds", @@ -407,24 +396,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro span, ctx := tracing.StartSpan(ctx, "promql_instant_query") defer span.Finish() - var seriesStats []storepb.SeriesStatsCounter - qry, err := qapi.queryEngine.NewInstantQuery( - qapi.queryableCreate( - enableDedup, - replicaLabels, - storeDebugMatchers, - maxSourceResolution, - enablePartialResponse, - qapi.enableQueryPushdown, - false, - shardInfo, - query.NewAggregateStatsReporter(&seriesStats), - ), - &promql.QueryOpts{LookbackDelta: lookbackDelta}, - r.FormValue("query"), - ts, - ) - + qry, err := qapi.queryEngine.NewInstantQuery(qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), ts) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}, func() {} } @@ -437,7 +409,6 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } defer qapi.gate.Done() - beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -450,10 +421,6 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } - for i := range seriesStats { - qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) - } - qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. 
var qs stats.QueryStats @@ -558,19 +525,8 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap span, ctx := tracing.StartSpan(ctx, "promql_range_query") defer span.Finish() - var seriesStats []storepb.SeriesStatsCounter qry, err := qapi.queryEngine.NewRangeQuery( - qapi.queryableCreate( - enableDedup, - replicaLabels, - storeDebugMatchers, - maxSourceResolution, - enablePartialResponse, - qapi.enableQueryPushdown, - false, - shardInfo, - query.NewAggregateStatsReporter(&seriesStats), - ), + qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), start, @@ -589,7 +545,6 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } defer qapi.gate.Done() - beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -600,10 +555,6 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } - for i := range seriesStats { - qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) - } - qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -649,17 +600,8 @@ func (qapi *QueryAPI) labelValues(r *http.Request) (interface{}, []error, *api.A matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate( - true, - nil, - storeDebugMatchers, - 0, - enablePartialResponse, - qapi.enableQueryPushdown, - true, - nil, - query.NoopSeriesStatsReporter, - ).Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). + Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -745,18 +687,8 @@ func (qapi *QueryAPI) series(r *http.Request) (interface{}, []error, *api.ApiErr return nil, nil, apiErr, func() {} } - q, err := qapi.queryableCreate( - enableDedup, - replicaLabels, - storeDebugMatchers, - math.MaxInt64, - enablePartialResponse, - qapi.enableQueryPushdown, - true, - nil, - query.NoopSeriesStatsReporter, - ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) - + q, err := qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, math.MaxInt64, enablePartialResponse, qapi.enableQueryPushdown, true, nil). + Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -805,17 +737,8 @@ func (qapi *QueryAPI) labelNames(r *http.Request) (interface{}, []error, *api.Ap matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate( - true, - nil, - storeDebugMatchers, - 0, - enablePartialResponse, - qapi.enableQueryPushdown, - true, - nil, - query.NoopSeriesStatsReporter, - ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). 
+ Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 07c562af9c..000410ddbd 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -44,8 +44,9 @@ import ( "github.com/prometheus/prometheus/tsdb/tsdbutil" promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" - baseAPI "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/compact" + + baseAPI "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/gate" "github.com/thanos-io/thanos/pkg/query" @@ -197,7 +198,6 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -737,7 +737,6 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -751,7 +750,6 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/query/querier.go b/pkg/query/querier.go index b094cbd45c..361834c07d 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -7,7 +7,6 @@ import ( "context" "sort" "strings" - "sync" "time" "github.com/go-kit/log" @@ -29,60 +28,21 @@ import ( "github.com/thanos-io/thanos/pkg/tracing" ) -type seriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) - -var NoopSeriesStatsReporter seriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} - -func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) seriesStatsReporter { - var mutex sync.Mutex - return func(s storepb.SeriesStatsCounter) { - mutex.Lock() - defer mutex.Unlock() - *stats = append(*stats, s) - } -} - // QueryableCreator returns implementation of promql.Queryable that fetches data from the proxy store API endpoints. // If deduplication is enabled, all data retrieved from it will be deduplicated along all replicaLabels by default. // When the replicaLabels argument is not empty it overwrites the global replicaLabels flag. This allows specifying // replicaLabels at query time. // maxResolutionMillis controls downsampling resolution that is allowed (specified in milliseconds). // partialResponse controls `partialResponseDisabled` option of StoreAPI and partial response behavior of proxy. -type QueryableCreator func( - deduplicate bool, - replicaLabels []string, - storeDebugMatchers [][]*labels.Matcher, - maxResolutionMillis int64, - partialResponse, - enableQueryPushdown, - skipChunks bool, - shardInfo *storepb.ShardInfo, - seriesStatsReporter seriesStatsReporter, -) storage.Queryable +type QueryableCreator func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable // NewQueryableCreator creates QueryableCreator. 
-func NewQueryableCreator( - logger log.Logger, - reg prometheus.Registerer, - proxy storepb.StoreServer, - maxConcurrentSelects int, - selectTimeout time.Duration, -) QueryableCreator { +func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy storepb.StoreServer, maxConcurrentSelects int, selectTimeout time.Duration) QueryableCreator { duration := promauto.With( extprom.WrapRegistererWithPrefix("concurrent_selects_", reg), ).NewHistogram(gate.DurationHistogramOpts) - return func( - deduplicate bool, - replicaLabels []string, - storeDebugMatchers [][]*labels.Matcher, - maxResolutionMillis int64, - partialResponse, - enableQueryPushdown, - skipChunks bool, - shardInfo *storepb.ShardInfo, - seriesStatsReporter seriesStatsReporter, - ) storage.Queryable { + return func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable { return &queryable{ logger: logger, replicaLabels: replicaLabels, @@ -99,7 +59,6 @@ func NewQueryableCreator( selectTimeout: selectTimeout, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, - seriesStatsReporter: seriesStatsReporter, } } } @@ -118,12 +77,11 @@ type queryable struct { selectTimeout time.Duration enableQueryPushdown bool shardInfo *storepb.ShardInfo - seriesStatsReporter seriesStatsReporter } // Querier returns a new storage querier against the underlying proxy store API. func (q *queryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) { - return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo, q.seriesStatsReporter), nil + return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo), nil } type querier struct { @@ -142,7 +100,6 @@ type querier struct { selectGate gate.Gate selectTimeout time.Duration shardInfo *storepb.ShardInfo - seriesStatsReporter seriesStatsReporter } // newQuerier creates implementation of storage.Querier that fetches data from the proxy @@ -150,20 +107,16 @@ type querier struct { func newQuerier( ctx context.Context, logger log.Logger, - mint, - maxt int64, + mint, maxt int64, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, proxy storepb.StoreServer, deduplicate bool, maxResolutionMillis int64, - partialResponse, - enableQueryPushdown, - skipChunks bool, + partialResponse, enableQueryPushdown bool, skipChunks bool, selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, - seriesStatsReporter seriesStatsReporter, ) *querier { if logger == nil { logger = log.NewNopLogger() @@ -192,7 +145,6 @@ func newQuerier( skipChunks: skipChunks, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, - seriesStatsReporter: seriesStatsReporter, } } @@ -205,9 +157,8 @@ type seriesServer struct { storepb.Store_SeriesServer ctx context.Context - seriesSet []storepb.Series - seriesSetStats storepb.SeriesStatsCounter - warnings []string + seriesSet []storepb.Series + warnings []string } func (s *seriesServer) Send(r *storepb.SeriesResponse) error { @@ -218,7 +169,6 @@ func (s *seriesServer) Send(r *storepb.SeriesResponse) error { if 
r.GetSeries() != nil { s.seriesSet = append(s.seriesSet, *r.GetSeries()) - s.seriesSetStats.Count(r.GetSeries()) return nil } @@ -307,12 +257,11 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match span, ctx := tracing.StartSpan(ctx, "querier_select_select_fn") defer span.Finish() - set, stats, err := q.selectFn(ctx, hints, ms...) + set, err := q.selectFn(ctx, hints, ms...) if err != nil { promise <- storage.ErrSeriesSet(err) return } - q.seriesStatsReporter(stats) promise <- set }() @@ -330,10 +279,10 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match }} } -func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, storepb.SeriesStatsCounter, error) { +func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, error) { sms, err := storepb.PromMatchersToMatchers(ms...) if err != nil { - return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "convert matchers") + return nil, errors.Wrap(err, "convert matchers") } aggrs := aggrsFromFunc(hints.Func) @@ -361,7 +310,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . Step: hints.Step, Range: hints.Range, }, resp); err != nil { - return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "proxy Series()") + return nil, errors.Wrap(err, "proxy Series()") } var warns storage.Warnings @@ -393,7 +342,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . set: newStoreSeriesSet(resp.seriesSet), aggrs: aggrs, warns: warns, - }, resp.seriesSetStats, nil + }, nil } // TODO(fabxc): this could potentially pushed further down into the store API to make true streaming possible. @@ -408,7 +357,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . // The merged series set assembles all potentially-overlapping time ranges of the same series into a single one. // TODO(bwplotka): We could potentially dedup on chunk level, use chunk iterator for that when available. 
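	// dedup.NewSeriesSet walks the sorted set, groups series that differ only
	// in the configured replica labels, and merges their samples into one
	// series; hints.Func and the pushdown flag let it pick a merge strategy
	// that stays correct for functions leaf Store APIs may have evaluated.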
- return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), resp.seriesSetStats, nil + return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), nil } // sortDedupLabels re-sorts the set so that the same series with different replica diff --git a/pkg/query/querier_test.go b/pkg/query/querier_test.go index 2e31fa65a0..a43c75e7a5 100644 --- a/pkg/query/querier_test.go +++ b/pkg/query/querier_test.go @@ -44,17 +44,7 @@ func TestQueryableCreator_MaxResolution(t *testing.T) { queryableCreator := NewQueryableCreator(nil, nil, testProxy, 2, 5*time.Second) oneHourMillis := int64(1*time.Hour) / int64(time.Millisecond) - queryable := queryableCreator( - false, - nil, - nil, - oneHourMillis, - false, - false, - false, - nil, - NoopSeriesStatsReporter, - ) + queryable := queryableCreator(false, nil, nil, oneHourMillis, false, false, false, nil) q, err := queryable.Querier(context.Background(), 0, 42) testutil.Ok(t, err) @@ -81,22 +71,7 @@ func TestQuerier_DownsampledData(t *testing.T) { } timeout := 10 * time.Second - q := NewQueryableCreator( - nil, - nil, - testProxy, - 2, - timeout, - )(false, - nil, - nil, - 9999999, - false, - false, - false, - nil, - NoopSeriesStatsReporter, - ) + q := NewQueryableCreator(nil, nil, testProxy, 2, timeout)(false, nil, nil, 9999999, false, false, false, nil) engine := promql.NewEngine( promql.EngineOpts{ MaxSamples: math.MaxInt32, @@ -390,7 +365,7 @@ func TestQuerier_Select_AfterPromQL(t *testing.T) { g := gate.New(2) mq := &mockedQueryable{ Creator: func(mint, maxt int64) storage.Querier { - return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) + return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) }, } t.Cleanup(func() { @@ -634,7 +609,7 @@ func TestQuerier_Select(t *testing.T) { {dedup: true, expected: []series{tcase.expectedAfterDedup}}, } { g := gate.New(2) - q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, func(i storepb.SeriesStatsCounter) {}) + q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) t.Run(fmt.Sprintf("dedup=%v", sc.dedup), func(t *testing.T) { @@ -863,7 +838,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 100 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) @@ -933,7 +908,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 5 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, 
realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) diff --git a/pkg/query/query_bench_test.go b/pkg/query/query_bench_test.go index 84efb46820..301c880877 100644 --- a/pkg/query/query_bench_test.go +++ b/pkg/query/query_bench_test.go @@ -80,13 +80,12 @@ func benchQuerySelect(t testutil.TB, totalSamples, totalSeries int, dedup bool) logger := log.NewNopLogger() q := &querier{ - ctx: context.Background(), - logger: logger, - proxy: &mockedStoreServer{responses: resps}, - replicaLabels: map[string]struct{}{"a_replica": {}}, - deduplicate: dedup, - selectGate: gate.NewNoop(), - seriesStatsReporter: NoopSeriesStatsReporter, + ctx: context.Background(), + logger: logger, + proxy: &mockedStoreServer{responses: resps}, + replicaLabels: map[string]struct{}{"a_replica": {}}, + deduplicate: dedup, + selectGate: gate.NewNoop(), } testSelect(t, q, expectedSeries) } diff --git a/pkg/query/query_test.go b/pkg/query/query_test.go index 060571fc70..99e29be66f 100644 --- a/pkg/query/query_test.go +++ b/pkg/query/query_test.go @@ -54,16 +54,7 @@ func TestQuerier_Proxy(t *testing.T) { name: fmt.Sprintf("store number %v", i), }) } - return q(true, - nil, - nil, - 0, - false, - false, - false, - nil, - NoopSeriesStatsReporter, - ) + return q(true, nil, nil, 0, false, false, false, nil) } for _, fn := range files { diff --git a/pkg/store/telemetry.go b/pkg/store/telemetry.go deleted file mode 100644 index a854daaf0c..0000000000 --- a/pkg/store/telemetry.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) The Thanos Authors. -// Licensed under the Apache License 2.0. - -package store - -import ( - "strconv" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/thanos-io/thanos/pkg/store/storepb" -) - -// seriesStatsAggregator aggregates results from fanned-out queries into a histogram given their -// response's shape. -type seriesStatsAggregator struct { - queryDuration *prometheus.HistogramVec - - seriesLeBuckets []int64 - samplesLeBuckets []int64 - seriesStats storepb.SeriesStatsCounter -} - -// NewSeriesStatsAggregator is a constructor for seriesStatsAggregator. -func NewSeriesStatsAggregator( - reg prometheus.Registerer, - durationQuantiles []float64, - sampleQuantiles []int64, - seriesQuantiles []int64, -) *seriesStatsAggregator { - return &seriesStatsAggregator{ - queryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "thanos_store_api_query_duration_seconds", - Help: "Duration of the Thanos Store API select phase for a query.", - Buckets: durationQuantiles, - }, []string{"series_le", "samples_le"}), - seriesLeBuckets: seriesQuantiles, - samplesLeBuckets: sampleQuantiles, - seriesStats: storepb.SeriesStatsCounter{}, - } -} - -// Aggregate is an aggregator for merging `storepb.SeriesStatsCounter` for each incoming fanned out query. -func (s *seriesStatsAggregator) Aggregate(stats storepb.SeriesStatsCounter) { - s.seriesStats.Series += stats.Series - s.seriesStats.Samples += stats.Samples - s.seriesStats.Chunks += stats.Chunks -} - -// Observe commits the aggregated SeriesStatsCounter as an observation. 
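-// The aggregated series and sample counts are mapped onto the nearest
-// configured quantile bucket (see findBucket below), the duration is
-// observed in that histogram cell, and the counters are then reset so
-// each query is committed exactly once.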
-func (s *seriesStatsAggregator) Observe(duration float64) { - if s.seriesStats.Series == 0 || s.seriesStats.Samples == 0 || s.seriesStats.Chunks == 0 { - return - } - // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 - seriesLeBucket := s.findBucket(float64(s.seriesStats.Series), s.seriesLeBuckets) - samplesLeBucket := s.findBucket(float64(s.seriesStats.Samples), s.samplesLeBuckets) - s.queryDuration.With(prometheus.Labels{ - "series_le": strconv.Itoa(int(seriesLeBucket)), - "samples_le": strconv.Itoa(int(samplesLeBucket)), - }).Observe(duration) - s.reset() -} - -func (s *seriesStatsAggregator) reset() { - s.seriesStats = storepb.SeriesStatsCounter{} -} - -func (s *seriesStatsAggregator) findBucket(value float64, quantiles []int64) int64 { - if len(quantiles) == 0 { - return 0 - } - var foundBucket int64 - for _, bucket := range quantiles { - foundBucket = bucket - if value < float64(bucket) { - break - } - } - return foundBucket -} - -// NoopSeriesStatsAggregator is a query performance series aggregator that does nothing. -type NoopSeriesStatsAggregator struct{} - -func (s *NoopSeriesStatsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {} - -func (s *NoopSeriesStatsAggregator) Observe(_ float64) {} diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 04b425061a..7fc56bda97 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -23,7 +23,6 @@ import ( "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" "github.com/efficientgo/e2e" - e2edb "github.com/efficientgo/e2e/db" e2emon "github.com/efficientgo/e2e/monitoring" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -579,130 +578,6 @@ func newSample(s fakeMetricSample) model.Sample { } } -func TestQueryStoreMetrics(t *testing.T) { - t.Parallel() - - // Build up. - e, err := e2e.New(e2e.WithName("storemetrics01")) - testutil.Ok(t, err) - t.Cleanup(e2ethanos.CleanScenario(t, e)) - - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - t.Cleanup(cancel) - - bucket := "store-gw-test" - minio := e2ethanos.NewMinio(e, "thanos-minio", bucket) - testutil.Ok(t, e2e.StartAndWaitReady(minio)) - - l := log.NewLogfmtLogger(os.Stdout) - bkt, err := s3.NewBucketWithConfig(l, e2ethanos.NewS3Config(bucket, minio.Endpoint("https"), minio.Dir()), "test") - testutil.Ok(t, err) - - // Preparing 2 different blocks for the tests. 
- { - blockSizes := []struct { - samples int - series int - name string - }{ - {samples: 10, series: 1, name: "one_series"}, - {samples: 10, series: 1001, name: "thousand_one_series"}, - } - now := time.Now() - externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0") - dir := filepath.Join(e.SharedDir(), "tmp") - testutil.Ok(t, os.MkdirAll(filepath.Join(e.SharedDir(), dir), os.ModePerm)) - for _, blockSize := range blockSizes { - series := make([]labels.Labels, blockSize.series) - for i := 0; i < blockSize.series; i++ { - bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i)) - series[i] = bigSeriesLabels - } - blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx, - dir, - series, - blockSize.samples, - timestamp.FromTime(now), - timestamp.FromTime(now.Add(2*time.Hour)), - 30*time.Minute, - externalLabels, - 0, - metadata.NoneFunc, - ) - testutil.Ok(t, err) - testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String())) - } - } - - storeGW := e2ethanos.NewStoreGW( - e, - "s1", - client.BucketConfig{ - Type: client.S3, - Config: e2ethanos.NewS3Config(bucket, minio.InternalEndpoint("https"), minio.InternalDir()), - }, - "", - nil, - ) - querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init() - testutil.Ok(t, e2e.StartAndWaitReady(storeGW, querier)) - testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) - - // Querying the series in the previously created blocks to ensure we produce Store API query metrics. - { - instantQuery(t, ctx, querier.Endpoint("http"), func() string { - return "max_over_time(one_series{instance='foo_0'}[2h])" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, 1) - testutil.Ok(t, err) - - instantQuery(t, ctx, querier.Endpoint("http"), func() string { - return "max_over_time(thousand_one_series[2h])" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, 1001) - testutil.Ok(t, err) - } - - mon, err := e2emon.Start(e) - testutil.Ok(t, err) - - queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { - return "thanos_store_api_query_duration_seconds_count{samples_le='100000',series_le='10000'}" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, model.Vector{ - &model.Sample{ - Metric: model.Metric{ - "__name__": "thanos_store_api_query_duration_seconds_count", - "instance": "storemetrics01-querier-1:8080", - "job": "querier-1", - "samples_le": "100000", - "series_le": "10000", - }, - Value: model.SampleValue(1), - }, - }) - - queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { - return "thanos_store_api_query_duration_seconds_count{samples_le='100',series_le='10'}" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, model.Vector{ - &model.Sample{ - Metric: model.Metric{ - "__name__": "thanos_store_api_query_duration_seconds_count", - "instance": "storemetrics01-querier-1:8080", - "job": "querier-1", - "samples_le": "100", - "series_le": "10", - }, - Value: model.SampleValue(1), - }, - }) -} - // Regression test for https://github.com/thanos-io/thanos/issues/5033. 
// Tests whether queries work with mixed sources, and with functions // that we are pushing down: min, max, min_over_time, max_over_time, @@ -1007,10 +882,18 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin "msg", fmt.Sprintf("Waiting for %d results for query %s", expectedSeriesLen, q()), ) testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { - res, err := simpleInstantQuery(t, ctx, addr, q, ts, opts, expectedSeriesLen) + res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err } + + if len(warnings) > 0 { + return errors.Errorf("unexpected warnings %s", warnings) + } + + if len(res) != expectedSeriesLen { + return errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) + } result = res return nil })) @@ -1018,24 +901,6 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin return result } -func simpleInstantQuery(t testing.TB, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expectedSeriesLen int) (model.Vector, error) { - res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) - if err != nil { - return nil, err - } - - if len(warnings) > 0 { - return nil, errors.Errorf("unexpected warnings %s", warnings) - } - - if len(res) != expectedSeriesLen { - return nil, errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) - } - - sortResults(res) - return res, nil -} - func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) { t.Helper() @@ -1047,7 +912,7 @@ func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() "caller", "queryWaitAndAssert", "msg", fmt.Sprintf("Waiting for %d results for query %s", len(expected), q()), ) - testutil.Ok(t, runutil.RetryWithLog(logger, 10*time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err From 976fc7eed703e0cae69266802eafcb925e4ae2a3 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 16:57:19 +0100 Subject: [PATCH 19/43] Revert "Receive: Reload tenant limit configuration on file change (#5673)" This reverts commit 24e1cc0faf219049174020955f8e3c8251106d87. 
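For context: the reverted change made the limiter watch the file given via --receive.limits-config-file and re-apply tenant limits on every change; after this revert the limits file is read once at startup only. The wiring being removed looked roughly like this (a condensed sketch assembled from the cmd/thanos/receive.go hunks below):

	limiter, err := receive.NewLimiter(conf.limitsConfig, reg, receiveMode, logger)
	if err != nil {
		return errors.Wrap(err, "creating limiter")
	}
	if limiter.CanReload() {
		ctx, cancel := context.WithCancel(context.Background())
		g.Add(func() error {
			// Blocks while a file watcher re-applies limits on config changes.
			return limiter.StartConfigReloader(ctx)
		}, func(error) {
			cancel()
		})
	}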
Signed-off-by: utukj --- CHANGELOG.md | 1 - cmd/thanos/receive.go | 46 ++--- docs/components/receive.md | 2 +- go.mod | 8 +- go.sum | 4 +- pkg/extkingpin/path_content_reloader.go | 128 ------------ pkg/extkingpin/path_content_reloader_test.go | 105 ---------- pkg/receive/handler.go | 22 +- pkg/receive/handler_test.go | 38 ++-- pkg/receive/limiter.go | 189 ++---------------- pkg/receive/limiter_config.go | 4 +- pkg/receive/limiter_config_test.go | 6 +- pkg/receive/limiter_test.go | 100 --------- pkg/receive/request_limiter.go | 31 ++- pkg/receive/request_limiter_test.go | 20 +- pkg/receive/testdata/limits.yaml | 22 -- .../limits_config/invalid_limits.yaml | 17 -- 17 files changed, 97 insertions(+), 646 deletions(-) delete mode 100644 pkg/extkingpin/path_content_reloader.go delete mode 100644 pkg/extkingpin/path_content_reloader_test.go delete mode 100644 pkg/receive/limiter_test.go delete mode 100644 pkg/receive/testdata/limits.yaml delete mode 100644 pkg/receive/testdata/limits_config/invalid_limits.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e1d2143c3..9ed82d6525 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,6 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. -- [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. ### Changed diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index d86b560983..5c47b91dd5 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -192,19 +192,6 @@ func runReceive( return errors.Wrap(err, "parse relabel configuration") } - dbs := receive.NewMultiTSDB( - conf.dataDir, - logger, - reg, - tsdbOpts, - lset, - conf.tenantLabelName, - bkt, - conf.allowOutOfOrderUpload, - hashFunc, - ) - writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs) - var limitsConfig *receive.RootLimitsConfig if conf.limitsConfig != nil { limitsContentYaml, err := conf.limitsConfig.Content() @@ -216,11 +203,20 @@ func runReceive( return errors.Wrap(err, "parse limit configuration") } } - limiter, err := receive.NewLimiter(conf.limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) - if err != nil { - return errors.Wrap(err, "creating limiter") - } + limiter := receive.NewLimiter(limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) + dbs := receive.NewMultiTSDB( + conf.dataDir, + logger, + reg, + tsdbOpts, + lset, + conf.tenantLabelName, + bkt, + conf.allowOutOfOrderUpload, + hashFunc, + ) + writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs) webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{ Writer: writer, ListenAddress: conf.rwAddress, @@ -403,22 +399,6 @@ func runReceive( }) } - { - if limiter.CanReload() { - ctx, cancel := context.WithCancel(context.Background()) - g.Add(func() error { - level.Debug(logger).Log("msg", "limits config initialized with file watcher.") - if err := limiter.StartConfigReloader(ctx); err != nil { - return err - } - <-ctx.Done() - return nil - }, func(err error) { - cancel() - }) - } - } - level.Info(logger).Log("msg", "starting receiver") return nil } diff --git 
a/docs/components/receive.md b/docs/components/receive.md index ef4e39e35e..6fa13938e9 100644 --- a/docs/components/receive.md +++ b/docs/components/receive.md @@ -86,7 +86,7 @@ Thanos Receive has some limits and gates that can be configured to control resou To configure the gates and limits you can use one of the two options: -- `--receive.limits-config-file=`: where `` is the path to the YAML file. Any modification to the indicated file will trigger a configuration reload. If the updated configuration is invalid an error will be logged and it won't replace the previous valid configuration. +- `--receive.limits-config-file=`: where `` is the path to the YAML file. - `--receive.limits-config=`: where `` is the content of YAML file. By default all the limits and gates are **disabled**. diff --git a/go.mod b/go.mod index bee3e97fe7..13743c8020 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/dustin/go-humanize v1.0.0 github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a - github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd + github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.2.0 github.com/felixge/fgprof v0.9.2 @@ -108,7 +108,6 @@ require ( require ( github.com/efficientgo/core v1.0.0-rc.0 - github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd github.com/minio/sha256-simd v1.0.0 ) @@ -128,7 +127,10 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.10.0 ) -require go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 +require ( + github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd + go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 +) require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.32.3 // indirect diff --git a/go.sum b/go.sum index 97fc0d0411..5ee9bab6be 100644 --- a/go.sum +++ b/go.sum @@ -252,8 +252,8 @@ github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a h1:cnJajqeh/Hjv github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd h1:svR6KxSP1xiPw10RN4Pd7g6BAVkEcNN628PAqZH31mM= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:OmVcnJopJL8d3X3sSXTiypGoUSgFq1aDGmlrdi9dn/M= -github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd h1:VaYzzXeUbC5fVheskcKVNOyJMEYD+HgrJNzIAg/mRIM= -github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= +github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d h1:WZV/mrUyKS9w9r+Jdw+zq/tdGAb5LwB+H37EkMLhEMA= +github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= github.com/elastic/go-sysinfo v1.1.1/go.mod h1:i1ZYdU10oLNfRzq4vq62BEwD2fH8KaWh6eh0ikPT9F0= github.com/elastic/go-sysinfo v1.8.1 h1:4Yhj+HdV6WjbCRgGdZpPJ8lZQlXZLKDAeIkmQ/VRvi4= github.com/elastic/go-sysinfo v1.8.1/go.mod h1:JfllUnzoQV/JRYymbH3dO1yggI3mV2oTKSXsDHM+uIM= diff --git a/pkg/extkingpin/path_content_reloader.go b/pkg/extkingpin/path_content_reloader.go deleted file mode 100644 index 68c2cd252c..0000000000 --- a/pkg/extkingpin/path_content_reloader.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) The Thanos Authors. 
-// Licensed under the Apache License 2.0. - -package extkingpin - -import ( - "context" - "fmt" - "os" - "path" - "path/filepath" - "time" - - "github.com/fsnotify/fsnotify" - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/pkg/errors" -) - -type fileContent interface { - Content() ([]byte, error) - Path() string -} - -// PathContentReloader starts a file watcher that monitors the file indicated by fileContent.Path() and runs -// reloadFunc whenever a change is detected. -// A debounce timer can be configured via opts to handle situations where many "write" events are received together or -// a "create" event is followed up by a "write" event, for example. Files will be effectively reloaded at the latest -// after 2 times the debounce timer. By default the debouncer timer is 1 second. -// To ensure renames and deletes are properly handled, the file watcher is put at the file's parent folder. See -// https://github.com/fsnotify/fsnotify/issues/214 for more details. -func PathContentReloader(ctx context.Context, fileContent fileContent, logger log.Logger, reloadFunc func(), debounceTime time.Duration) error { - filePath, err := filepath.Abs(fileContent.Path()) - if err != nil { - return errors.Wrap(err, "getting absolute file path") - } - - watcher, err := fsnotify.NewWatcher() - if filePath == "" { - level.Debug(logger).Log("msg", "no path detected for config reload") - } - if err != nil { - return errors.Wrap(err, "creating file watcher") - } - go func() { - var reloadTimer *time.Timer - if debounceTime != 0 { - reloadTimer = time.AfterFunc(debounceTime, func() { - reloadFunc() - level.Debug(logger).Log("msg", "configuration reloaded after debouncing") - }) - } - defer watcher.Close() - for { - select { - case <-ctx.Done(): - if reloadTimer != nil { - reloadTimer.Stop() - } - return - case event := <-watcher.Events: - // fsnotify sometimes sends a bunch of events without name or operation. - // It's unclear what they are and why they are sent - filter them out. - if event.Name == "" { - break - } - // We are watching the file's parent folder (more details on this is done can be found below), but are - // only interested in changed to the target file. Discard every other file as quickly as possible. - if event.Name != filePath { - break - } - // We only react to files being written or created. - // On chmod or remove we have nothing to do. - // On rename we have the old file name (not useful). A create event for the new file will come later. - if event.Op&fsnotify.Write == 0 && event.Op&fsnotify.Create == 0 { - break - } - level.Debug(logger).Log("msg", fmt.Sprintf("change detected for %s", filePath), "eventName", event.Name, "eventOp", event.Op) - if reloadTimer != nil { - reloadTimer.Reset(debounceTime) - } - case err := <-watcher.Errors: - level.Error(logger).Log("msg", "watcher error", "error", err) - } - } - }() - // We watch the file's parent folder and not the file itself to better handle DELETE and RENAME events. Check - // https://github.com/fsnotify/fsnotify/issues/214 for more details. - if err := watcher.Add(path.Dir(filePath)); err != nil { - return errors.Wrapf(err, "adding path %s to file watcher", filePath) - } - return nil -} - -type staticPathContent struct { - content []byte - path string -} - -var _ fileContent = (*staticPathContent)(nil) - -// Content returns the cached content. -func (t *staticPathContent) Content() ([]byte, error) { - return t.content, nil -} - -// Path returns the path to the file that contains the content. 
-func (t *staticPathContent) Path() string { - return t.path -} - -// NewStaticPathContent creates a new content that can be used to serve a static configuration. It copies the -// configuration from `fromPath` into `destPath` to avoid confusion with file watchers. -func NewStaticPathContent(fromPath string) (*staticPathContent, error) { - content, err := os.ReadFile(fromPath) - if err != nil { - return nil, errors.Wrapf(err, "could not load test content: %s", fromPath) - } - return &staticPathContent{content, fromPath}, nil -} - -// Rewrite rewrites the file backing this staticPathContent and swaps the local content cache. The file writing -// is needed to trigger the file system monitor. -func (t *staticPathContent) Rewrite(newContent []byte) error { - t.content = newContent - // Write the file to ensure possible file watcher reloaders get triggered. - return os.WriteFile(t.path, newContent, 0666) -} diff --git a/pkg/extkingpin/path_content_reloader_test.go b/pkg/extkingpin/path_content_reloader_test.go deleted file mode 100644 index fb20f83d5c..0000000000 --- a/pkg/extkingpin/path_content_reloader_test.go +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) The Thanos Authors. -// Licensed under the Apache License 2.0. - -package extkingpin - -import ( - "context" - "os" - "path" - "sync" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/thanos-io/thanos/pkg/testutil" -) - -func TestPathContentReloader(t *testing.T) { - type args struct { - runSteps func(t *testing.T, testFile string, pathContent *staticPathContent) - } - tests := []struct { - name string - args args - wantReloads int - }{ - { - name: "Many operations, only rewrite triggers one reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - testutil.Ok(t, os.Remove(testFile)) - testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) - }, - }, - wantReloads: 1, - }, - { - name: "Many operations, only rename triggers one reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - testutil.Ok(t, os.Rename(testFile, testFile+".tmp")) - testutil.Ok(t, os.Rename(testFile+".tmp", testFile)) - }, - }, - wantReloads: 1, - }, - { - name: "Many operations, two rewrites trigger two reloads", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - testutil.Ok(t, os.Remove(testFile)) - testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) - time.Sleep(2 * time.Second) - testutil.Ok(t, pathContent.Rewrite([]byte("test modified again"))) - }, - }, - wantReloads: 1, - }, - { - name: "Chmod doesn't trigger reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - }, - }, - wantReloads: 0, - }, - { - name: "Remove doesn't trigger reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Remove(testFile)) - }, - }, - wantReloads: 0, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - testFile := path.Join(t.TempDir(), "test") - testutil.Ok(t, os.WriteFile(testFile, []byte("test"), 0666)) - pathContent, err := NewStaticPathContent(testFile) - testutil.Ok(t, err) - - wg := &sync.WaitGroup{} - wg.Add(tt.wantReloads) - - ctx, cancel := context.WithCancel(context.Background()) - defer 
cancel() - reloadCount := 0 - err = PathContentReloader(ctx, pathContent, log.NewLogfmtLogger(os.Stdout), func() { - reloadCount++ - wg.Done() - }, 100*time.Millisecond) - testutil.Ok(t, err) - - tt.args.runSteps(t, testFile, pathContent) - wg.Wait() - testutil.Equals(t, tt.wantReloads, reloadCount) - }) - } -} diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 12afb752b8..156bb74566 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -17,6 +17,10 @@ import ( "sync" "time" + "github.com/thanos-io/thanos/pkg/api" + statusapi "github.com/thanos-io/thanos/pkg/api/status" + "github.com/thanos-io/thanos/pkg/logging" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -31,9 +35,6 @@ import ( "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" - "github.com/thanos-io/thanos/pkg/api" - statusapi "github.com/thanos-io/thanos/pkg/api/status" - "github.com/thanos-io/thanos/pkg/logging" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -98,7 +99,7 @@ type Options struct { ForwardTimeout time.Duration RelabelConfigs []*relabel.Config TSDBStats TSDBStats - Limiter *Limiter + Limiter *limiter } // Handler serves a Prometheus remote write receiving HTTP endpoint. @@ -123,7 +124,7 @@ type Handler struct { writeSamplesTotal *prometheus.HistogramVec writeTimeseriesTotal *prometheus.HistogramVec - Limiter *Limiter + limiter *limiter } func NewHandler(logger log.Logger, o *Options) *Handler { @@ -149,7 +150,7 @@ func NewHandler(logger log.Logger, o *Options) *Handler { Max: 30 * time.Second, Jitter: true, }, - Limiter: o.Limiter, + limiter: o.Limiter, forwardRequests: promauto.With(registerer).NewCounterVec( prometheus.CounterOpts{ Name: "thanos_receive_forward_requests_total", @@ -406,18 +407,17 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { tLogger := log.With(h.logger, "tenant", tenant) - writeGate := h.Limiter.WriteGate() tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) { - err = writeGate.Start(r.Context()) + err = h.limiter.writeGate.Start(r.Context()) }) - defer writeGate.Done() if err != nil { level.Error(tLogger).Log("err", err, "msg", "internal server error") http.Error(w, err.Error(), http.StatusInternalServerError) return } + defer h.limiter.writeGate.Done() - under, err := h.Limiter.HeadSeriesLimiter.isUnderLimit(tenant) + under, err := h.limiter.HeadSeriesLimiter.isUnderLimit(tenant) if err != nil { level.Error(tLogger).Log("msg", "error while limiting", "err", err.Error()) } @@ -428,7 +428,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { return } - requestLimiter := h.Limiter.RequestLimiter() + requestLimiter := h.limiter.requestLimiter // io.ReadAll dynamically adjust the byte slice for read data, starting from 512B. // Since this is receive hot path, grow upfront saving allocations and CPU time. 
compressed := bytes.Buffer{} diff --git a/pkg/receive/handler_test.go b/pkg/receive/handler_test.go index 4a2a536038..44076de141 100644 --- a/pkg/receive/handler_test.go +++ b/pkg/receive/handler_test.go @@ -13,7 +13,6 @@ import ( "net/http" "net/http/httptest" "os" - "path" "path/filepath" "runtime" "runtime/pprof" @@ -22,8 +21,6 @@ import ( "testing" "time" - "gopkg.in/yaml.v3" - "github.com/alecthomas/units" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -43,7 +40,6 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/errutil" - "github.com/thanos-io/thanos/pkg/extkingpin" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/store/labelpb" "github.com/thanos-io/thanos/pkg/store/storepb" @@ -366,7 +362,6 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin }, } - limiter, _ := NewLimiter(NewNopConfig(), nil, RouterIngestor, log.NewNopLogger()) for i := range appendables { h := NewHandler(nil, &Options{ TenantHeader: DefaultTenantHeader, @@ -374,7 +369,7 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin ReplicationFactor: replicationFactor, ForwardTimeout: 5 * time.Second, Writer: NewWriter(log.NewNopLogger(), newFakeTenantAppendable(appendables[i])), - Limiter: limiter, + Limiter: NewLimiter(nil, nil, RouterIngestor, nil), }) handlers = append(handlers, h) h.peers = peers @@ -780,28 +775,23 @@ func TestReceiveWriteRequestLimits(t *testing.T) { } handlers, _ := newTestHandlerHashring(appendables, 3) handler := handlers[0] - tenant := "test" - tenantConfig, err := yaml.Marshal(&RootLimitsConfig{ - WriteLimits: WriteLimitsConfig{ - TenantsLimits: TenantsWriteLimitsConfig{ - tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig(). - SetSizeBytesLimit(int64(1 * units.Megabyte)). - SetSeriesLimit(20). - SetSamplesLimit(200), + handler.limiter = NewLimiter( + &RootLimitsConfig{ + WriteLimits: WriteLimitsConfig{ + TenantsLimits: TenantsWriteLimitsConfig{ + tenant: &WriteLimitConfig{ + RequestLimits: newEmptyRequestLimitsConfig(). + SetSizeBytesLimit(int64(1 * units.Megabyte)). + SetSeriesLimit(20). + SetSamplesLimit(200), + }, }, }, }, - }) - if err != nil { - t.Fatal("handler: failed to generate limit configuration") - } - tmpLimitsPath := path.Join(t.TempDir(), "limits.yaml") - testutil.Ok(t, os.WriteFile(tmpLimitsPath, tenantConfig, 0666)) - limitConfig, _ := extkingpin.NewStaticPathContent(tmpLimitsPath) - handler.Limiter, _ = NewLimiter( - limitConfig, nil, RouterIngestor, log.NewNopLogger(), + nil, + RouterIngestor, + log.NewNopLogger(), ) wreq := &prompb.WriteRequest{ diff --git a/pkg/receive/limiter.go b/pkg/receive/limiter.go index ff5bbe3199..bc3c4d8358 100644 --- a/pkg/receive/limiter.go +++ b/pkg/receive/limiter.go @@ -5,204 +5,59 @@ package receive import ( "context" - "fmt" - "sync" - "time" "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/thanos-io/thanos/pkg/extkingpin" - - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/thanos-io/thanos/pkg/extprom" "github.com/thanos-io/thanos/pkg/gate" ) -// Limiter is responsible for managing the configuration and initialization of -// different types that apply limits to the Receive instance. 
-type Limiter struct { - sync.RWMutex - requestLimiter requestLimiter - HeadSeriesLimiter headSeriesLimiter - writeGate gate.Gate - registerer prometheus.Registerer - configPathOrContent fileContent - logger log.Logger - configReloadCounter prometheus.Counter - configReloadFailedCounter prometheus.Counter - receiverMode ReceiverMode -} - -// headSeriesLimiter encompasses active/head series limiting logic. -type headSeriesLimiter interface { - QueryMetaMonitoring(context.Context) error - isUnderLimit(tenant string) (bool, error) +type limiter struct { + requestLimiter requestLimiter + writeGate gate.Gate + HeadSeriesLimiter headSeriesLimiter } +// requestLimiter encompasses logic for limiting remote write requests. type requestLimiter interface { AllowSizeBytes(tenant string, contentLengthBytes int64) bool AllowSeries(tenant string, amount int64) bool AllowSamples(tenant string, amount int64) bool } -// fileContent is an interface to avoid a direct dependency on kingpin or extkingpin. -type fileContent interface { - Content() ([]byte, error) - Path() string +// headSeriesLimiter encompasses active/head series limiting logic. +type headSeriesLimiter interface { + QueryMetaMonitoring(context.Context) error + isUnderLimit(tenant string) (bool, error) } -// NewLimiter creates a new *Limiter given a configuration and prometheus -// registerer. -func NewLimiter(configFile fileContent, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) (*Limiter, error) { - limiter := &Limiter{ +func NewLimiter(root *RootLimitsConfig, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) *limiter { + limiter := &limiter{ writeGate: gate.NewNoop(), requestLimiter: &noopRequestLimiter{}, HeadSeriesLimiter: NewNopSeriesLimit(), - logger: logger, - receiverMode: r, - } - - if reg != nil { - limiter.registerer = NewUnRegisterer(reg) - limiter.configReloadCounter = promauto.With(limiter.registerer).NewCounter( - prometheus.CounterOpts{ - Namespace: "thanos", - Subsystem: "receive", - Name: "limits_config_reload_total", - Help: "How many times the limit configuration was reloaded", - }, - ) - limiter.configReloadFailedCounter = promauto.With(limiter.registerer).NewCounter( - prometheus.CounterOpts{ - Namespace: "thanos", - Subsystem: "receive", - Name: "limits_config_reload_err_total", - Help: "How many times the limit configuration failed to reload.", - }, - ) - } - - if configFile == nil { - return limiter, nil - } - - limiter.configPathOrContent = configFile - if err := limiter.loadConfig(); err != nil { - return nil, errors.Wrap(err, "load tenant limits config") - } - - return limiter, nil -} - -// StartConfigReloader starts the automatic configuration reloader based off of -// the file indicated by pathOrContent. It starts a Go routine in the given -// *run.Group. 
-func (l *Limiter) StartConfigReloader(ctx context.Context) error { - if !l.CanReload() { - return nil } - - return extkingpin.PathContentReloader(ctx, l.configPathOrContent, l.logger, func() { - level.Info(l.logger).Log("msg", "reloading limit config") - if err := l.loadConfig(); err != nil { - if failedReload := l.configReloadCounter; failedReload != nil { - failedReload.Inc() - } - errMsg := fmt.Sprintf("error reloading tenant limits config from %s", l.configPathOrContent.Path()) - level.Error(l.logger).Log("msg", errMsg, "err", err) - } - if reloadCounter := l.configReloadCounter; reloadCounter != nil { - reloadCounter.Inc() - } - }, 1*time.Second) -} - -func (l *Limiter) CanReload() bool { - if l.configPathOrContent == nil { - return false + if root == nil { + return limiter } - if l.configPathOrContent.Path() == "" { - return false - } - return true -} -func (l *Limiter) loadConfig() error { - config, err := ParseLimitConfigContent(l.configPathOrContent) - if err != nil { - return err - } - l.Lock() - defer l.Unlock() - maxWriteConcurrency := config.WriteLimits.GlobalLimits.MaxConcurrency + maxWriteConcurrency := root.WriteLimits.GlobalLimits.MaxConcurrency if maxWriteConcurrency > 0 { - l.writeGate = gate.New( + limiter.writeGate = gate.New( extprom.WrapRegistererWithPrefix( "thanos_receive_write_request_concurrent_", - l.registerer, + reg, ), int(maxWriteConcurrency), ) } - l.requestLimiter = newConfigRequestLimiter( - l.registerer, - &config.WriteLimits, - ) - seriesLimitSupported := (l.receiverMode == RouterOnly || l.receiverMode == RouterIngestor) && (len(config.WriteLimits.TenantsLimits) != 0 || config.WriteLimits.DefaultLimits.HeadSeriesLimit != 0) - if seriesLimitSupported { - l.HeadSeriesLimiter = NewHeadSeriesLimit(config.WriteLimits, l.registerer, l.logger) - } - return nil -} + limiter.requestLimiter = newConfigRequestLimiter(reg, &root.WriteLimits) -// RequestLimiter is a safe getter for the request limiter. -func (l *Limiter) RequestLimiter() requestLimiter { - l.RLock() - defer l.RUnlock() - return l.requestLimiter -} - -// WriteGate is a safe getter for the write gate. -func (l *Limiter) WriteGate() gate.Gate { - l.RLock() - defer l.RUnlock() - return l.writeGate -} - -// ParseLimitConfigContent parses the limit configuration from the path or -// content. -func ParseLimitConfigContent(limitsConfig fileContent) (*RootLimitsConfig, error) { - if limitsConfig == nil { - return &RootLimitsConfig{}, nil - } - limitsContentYaml, err := limitsConfig.Content() - if err != nil { - return nil, errors.Wrap(err, "get content of limit configuration") - } - parsedConfig, err := ParseRootLimitConfig(limitsContentYaml) - if err != nil { - return nil, errors.Wrap(err, "parse limit configuration") + // Impose active series limit only if Receiver is in Router or RouterIngestor mode, and config is provided. + seriesLimitSupported := (r == RouterOnly || r == RouterIngestor) && (len(root.WriteLimits.TenantsLimits) != 0 || root.WriteLimits.DefaultLimits.HeadSeriesLimit != 0) + if seriesLimitSupported { + limiter.HeadSeriesLimiter = NewHeadSeriesLimit(root.WriteLimits, reg, logger) } - return parsedConfig, nil -} - -type nopConfigContent struct{} - -var _ fileContent = (*nopConfigContent)(nil) - -// Content returns no content and no error. -func (n nopConfigContent) Content() ([]byte, error) { - return nil, nil -} - -// Path returns an empty path. -func (n nopConfigContent) Path() string { - return "" -} -// NewNopConfig creates a no-op config content (no configuration). 
-func NewNopConfig() nopConfigContent { - return nopConfigContent{} + return limiter } diff --git a/pkg/receive/limiter_config.go b/pkg/receive/limiter_config.go index c3bd330b6e..67aa5ef93a 100644 --- a/pkg/receive/limiter_config.go +++ b/pkg/receive/limiter_config.go @@ -78,7 +78,6 @@ type DefaultLimitsConfig struct { HeadSeriesLimit uint64 `yaml:"head_series_limit"` } -// TenantsWriteLimitsConfig is a map of tenant IDs to their *WriteLimitConfig. type TenantsWriteLimitsConfig map[string]*WriteLimitConfig // A tenant might not always have limits configured, so things here must @@ -111,7 +110,8 @@ type requestLimitsConfig struct { SamplesLimit *int64 `yaml:"samples_limit"` } -func NewEmptyRequestLimitsConfig() *requestLimitsConfig { +// Utils for initializing. +func newEmptyRequestLimitsConfig() *requestLimitsConfig { return &requestLimitsConfig{} } diff --git a/pkg/receive/limiter_config_test.go b/pkg/receive/limiter_config_test.go index 3e32ea41e8..b080680162 100644 --- a/pkg/receive/limiter_config_test.go +++ b/pkg/receive/limiter_config_test.go @@ -35,7 +35,7 @@ func TestParseLimiterConfig(t *testing.T) { }, }, DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig(). + RequestLimits: *newEmptyRequestLimitsConfig(). SetSizeBytesLimit(1024). SetSeriesLimit(1000). SetSamplesLimit(10), @@ -44,7 +44,7 @@ func TestParseLimiterConfig(t *testing.T) { TenantsLimits: TenantsWriteLimitsConfig{ "acme": NewEmptyWriteLimitConfig(). SetRequestLimits( - NewEmptyRequestLimitsConfig(). + newEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). SetSamplesLimit(0), @@ -52,7 +52,7 @@ func TestParseLimiterConfig(t *testing.T) { SetHeadSeriesLimit(2000), "ajax": NewEmptyWriteLimitConfig(). SetRequestLimits( - NewEmptyRequestLimitsConfig(). + newEmptyRequestLimitsConfig(). SetSeriesLimit(50000). SetSamplesLimit(500), ), diff --git a/pkg/receive/limiter_test.go b/pkg/receive/limiter_test.go deleted file mode 100644 index be7e8790c1..0000000000 --- a/pkg/receive/limiter_test.go +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) The Thanos Authors. -// Licensed under the Apache License 2.0. 
- -package receive - -import ( - "context" - "os" - "path" - "testing" - "time" - - "github.com/thanos-io/thanos/pkg/extkingpin" - - "github.com/efficientgo/tools/core/pkg/testutil" - "github.com/go-kit/log" -) - -func TestLimiter_StartConfigReloader(t *testing.T) { - origLimitsFile, err := os.ReadFile(path.Join("testdata", "limits_config", "good_limits.yaml")) - testutil.Ok(t, err) - copyLimitsFile := path.Join(t.TempDir(), "limits.yaml") - testutil.Ok(t, os.WriteFile(copyLimitsFile, origLimitsFile, 0666)) - - goodLimits, err := extkingpin.NewStaticPathContent(copyLimitsFile) - if err != nil { - t.Fatalf("error trying to save static limit config: %s", err) - } - invalidLimitsPath := path.Join("./testdata", "limits_config", "invalid_limits.yaml") - invalidLimits, err := os.ReadFile(invalidLimitsPath) - if err != nil { - t.Fatalf("could not load test content at %s: %s", invalidLimitsPath, err) - } - - limiter, err := NewLimiter(goodLimits, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout)) - testutil.Ok(t, err) - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - err = limiter.StartConfigReloader(ctx) - testutil.Ok(t, err) - - time.Sleep(1 * time.Second) - testutil.Ok(t, goodLimits.Rewrite(invalidLimits)) -} - -type emptyPathFile struct{} - -func (e emptyPathFile) Content() ([]byte, error) { - return []byte{}, nil -} - -func (e emptyPathFile) Path() string { - return "" -} - -func TestLimiter_CanReload(t *testing.T) { - validLimitsPath, err := extkingpin.NewStaticPathContent( - path.Join("testdata", "limits_config", "good_limits.yaml"), - ) - testutil.Ok(t, err) - emptyLimitsPath := emptyPathFile{} - - type args struct { - configFilePath fileContent - } - tests := []struct { - name string - args args - wantReload bool - }{ - { - name: "Nil config file path cannot be reloaded", - args: args{configFilePath: nil}, - wantReload: false, - }, - { - name: "Empty config file path cannot be reloaded", - args: args{configFilePath: emptyLimitsPath}, - wantReload: false, - }, - { - name: "Valid config file path can be reloaded", - args: args{configFilePath: validLimitsPath}, - wantReload: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - configFile := tt.args.configFilePath - limiter, err := NewLimiter(configFile, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout)) - testutil.Ok(t, err) - if tt.wantReload { - testutil.Assert(t, limiter.CanReload()) - } else { - testutil.Assert(t, !limiter.CanReload()) - } - }) - } -} diff --git a/pkg/receive/request_limiter.go b/pkg/receive/request_limiter.go index 7da0c64a6d..de7554de2f 100644 --- a/pkg/receive/request_limiter.go +++ b/pkg/receive/request_limiter.go @@ -14,7 +14,7 @@ const ( sizeBytesLimitName = "body_size" ) -var unlimitedRequestLimitsConfig = NewEmptyRequestLimitsConfig(). +var unlimitedRequestLimitsConfig = newEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). 
SetSamplesLimit(0) @@ -49,12 +49,7 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits tenantLimits: tenantRequestLimits, cachedDefaultLimits: defaultRequestLimits, } - limiter.registerMetrics(reg) - return &limiter -} - -func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { - l.limitsHit = promauto.With(reg).NewSummaryVec( + limiter.limitsHit = promauto.With(reg).NewSummaryVec( prometheus.SummaryOpts{ Namespace: "thanos", Subsystem: "receive", @@ -63,7 +58,7 @@ func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { Objectives: map[float64]float64{0.50: 0.1, 0.95: 0.1, 0.99: 0.001}, }, []string{"tenant", "limit"}, ) - l.configuredLimits = promauto.With(reg).NewGaugeVec( + limiter.configuredLimits = promauto.With(reg).NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thanos", Subsystem: "receive", @@ -71,14 +66,16 @@ func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { Help: "The configured write limits.", }, []string{"tenant", "limit"}, ) - for tenant, limits := range l.tenantLimits { - l.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) - l.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) - l.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) + for tenant, limits := range tenantRequestLimits { + limiter.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) + limiter.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) + limiter.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) } - l.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*l.cachedDefaultLimits.SizeBytesLimit)) - l.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*l.cachedDefaultLimits.SeriesLimit)) - l.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*l.cachedDefaultLimits.SamplesLimit)) + limiter.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*defaultRequestLimits.SizeBytesLimit)) + limiter.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*defaultRequestLimits.SeriesLimit)) + limiter.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*defaultRequestLimits.SamplesLimit)) + + return &limiter } func (l *configRequestLimiter) AllowSizeBytes(tenant string, contentLengthBytes int64) bool { @@ -103,7 +100,7 @@ func (l *configRequestLimiter) AllowSeries(tenant string, amount int64) bool { } allowed := *limit >= amount - if !allowed && l.limitsHit != nil { + if !allowed { l.limitsHit. WithLabelValues(tenant, seriesLimitName). Observe(float64(amount - *limit)) @@ -117,7 +114,7 @@ func (l *configRequestLimiter) AllowSamples(tenant string, amount int64) bool { return true } allowed := *limit >= amount - if !allowed && l.limitsHit != nil { + if !allowed { l.limitsHit. WithLabelValues(tenant, samplesLimitName). Observe(float64(amount - *limit)) diff --git a/pkg/receive/request_limiter_test.go b/pkg/receive/request_limiter_test.go index dfbea066d9..e654cd1cdf 100644 --- a/pkg/receive/request_limiter_test.go +++ b/pkg/receive/request_limiter_test.go @@ -15,12 +15,12 @@ func TestRequestLimiter_limitsFor(t *testing.T) { limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig(). + RequestLimits: *newEmptyRequestLimitsConfig(). 
SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenantWithLimits: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig(). + RequestLimits: newEmptyRequestLimitsConfig(). SetSeriesLimit(30), }, }, @@ -33,7 +33,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the default limits when tenant's limits aren't present", tenant: tenantWithoutLimits, - wantLimits: NewEmptyRequestLimitsConfig(). + wantLimits: newEmptyRequestLimitsConfig(). SetSeriesLimit(10). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -41,7 +41,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the tenant's limits when it is present", tenant: tenantWithLimits, - wantLimits: NewEmptyRequestLimitsConfig(). + wantLimits: newEmptyRequestLimitsConfig(). SetSeriesLimit(30). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -102,11 +102,11 @@ func TestRequestLimiter_AllowRequestBodySizeBytes(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), + RequestLimits: newEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), }, }, } @@ -159,11 +159,11 @@ func TestRequestLimiter_AllowSeries(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), + RequestLimits: newEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), }, }, } @@ -217,11 +217,11 @@ func TestRequestLimiter_AllowSamples(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), + RequestLimits: newEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), }, }, } diff --git a/pkg/receive/testdata/limits.yaml b/pkg/receive/testdata/limits.yaml deleted file mode 100644 index 2345756179..0000000000 --- a/pkg/receive/testdata/limits.yaml +++ /dev/null @@ -1,22 +0,0 @@ -write: - global: - max_concurrency: 30 - meta_monitoring_url: "http://localhost:9090" - meta_monitoring_limit_query: "sum(prometheus_tsdb_head_series) by (tenant)" - default: - request: - size_bytes_limit: 1024 - series_limit: 1000 - samples_limit: 10 - head_series_limit: 1000 - tenants: - acme: - request: - size_bytes_limit: 0 - series_limit: 0 - samples_limit: 0 - head_series_limit: 2000 - ajax: - request: - series_limit: 50000 - samples_limit: 500 diff --git a/pkg/receive/testdata/limits_config/invalid_limits.yaml b/pkg/receive/testdata/limits_config/invalid_limits.yaml deleted file mode 100644 index 74db0453f8..0000000000 --- a/pkg/receive/testdata/limits_config/invalid_limits.yaml +++ /dev/null @@ -1,17 +0,0 @@ -write: - global: - max_concurrency: 30 - request: - size_bytes_limit: 1024 - series_limit: 1000 - samples_limit: 10 - tenants: - acme: - request: - 
size_bytes_limit: 0 - series_limit: 0 - samples_limit: 0 - ajax: - request: - series_limit: 50000 - samples_limit: 500 From a07ad4a2be83419e067ff3c7c6c41f031d8c29a3 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 16:57:27 +0100 Subject: [PATCH 20/43] Revert "Updates busybox SHA (#5793)" This reverts commit 9474c00fa6a1a7b0148287ee4296944e50f093b6. Signed-off-by: utukj --- .busybox-versions | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.busybox-versions b/.busybox-versions index dfaea69d50..afcacb3c77 100644 --- a/.busybox-versions +++ b/.busybox-versions @@ -1,6 +1,6 @@ # Auto generated by busybox-updater.sh. DO NOT EDIT -amd64=c9f983fc55b0b74723a69c31688cca7d5a2e5b2af7c954780f29a331817982f3 -arm64=1349554b18d6c349a390929c2a4855fadb003b2243aabf2cc71b931068c69279 -arm=be08b36d0e8f90b6fb317d29582c632ce365a00648a81c4022c4ff79df928ad9 -ppc64le=d44f541b0df83608110e695b9a1e71604ab94924954a1b18f6d76c4b5871cadd -s390x=007b2b388c575d00c7234d29227bbb8216786d7ba3f86d82696dc6fe86ac1ec0 +amd64=d8d3654786836cad8c09543704807c7a6d75de53b9e9cd21a1bbd8cb1a607004 +arm64=a3435ee186dbf88238388c112761488ecd2c264dbff8957ab73f804be62a9080 +arm=b063a2176f23a13007de5c447ab3552f8e355162ac54fc2a545b00b612d4c81e +ppc64le=203c3f97bc34c4d5df50bd61beaa397f2a4c7cbd470c84fe7ec3db12409435d3 +s390x=1a6eb305bd08bd1d38cb85a097ad776a78dd72b7c1a35094bb080788a39b174c From ad11a03ebeb6ae10ed7ad7a6336365f33a07538b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 15 Oct 2022 22:30:58 +0000 Subject: [PATCH 21/43] Updates busybox SHA (#5793) Signed-off-by: GitHub Signed-off-by: GitHub Co-authored-by: yeya24 Signed-off-by: utukj --- .busybox-versions | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.busybox-versions b/.busybox-versions index afcacb3c77..dfaea69d50 100644 --- a/.busybox-versions +++ b/.busybox-versions @@ -1,6 +1,6 @@ # Auto generated by busybox-updater.sh. DO NOT EDIT -amd64=d8d3654786836cad8c09543704807c7a6d75de53b9e9cd21a1bbd8cb1a607004 -arm64=a3435ee186dbf88238388c112761488ecd2c264dbff8957ab73f804be62a9080 -arm=b063a2176f23a13007de5c447ab3552f8e355162ac54fc2a545b00b612d4c81e -ppc64le=203c3f97bc34c4d5df50bd61beaa397f2a4c7cbd470c84fe7ec3db12409435d3 -s390x=1a6eb305bd08bd1d38cb85a097ad776a78dd72b7c1a35094bb080788a39b174c +amd64=c9f983fc55b0b74723a69c31688cca7d5a2e5b2af7c954780f29a331817982f3 +arm64=1349554b18d6c349a390929c2a4855fadb003b2243aabf2cc71b931068c69279 +arm=be08b36d0e8f90b6fb317d29582c632ce365a00648a81c4022c4ff79df928ad9 +ppc64le=d44f541b0df83608110e695b9a1e71604ab94924954a1b18f6d76c4b5871cadd +s390x=007b2b388c575d00c7234d29227bbb8216786d7ba3f86d82696dc6fe86ac1ec0 From 32ca3279bb995861b7f3b7ba5b9cb4cbeeddf68c Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 17 Oct 2022 17:52:00 +0200 Subject: [PATCH 22/43] Receive: Reload tenant limit configuration on file change (#5673) * Create a PathOrContent reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add docs to staticPathContent.Rewrite Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run goimports Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Properly cancel the context in the test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Watch parent directory of file This helps handling deletes and other situations. 
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless ctx.Done() Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add a debounce timer to config reload It helps manage situations where a create event is followed by a write or when a big file write is sent by the fsnotify backend as many write events. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix event.Op bitmask check Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update lastReload Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix debouncer for path content reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Improve documentation of the PathContentReloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Drain reload timer before resetting Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run tests in parallel Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Simplify debouncing logic Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add more tests to file reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Simplify condition for triggering reload Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Use absolute path to config file Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of parallel test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back 2s wait between fs operations Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless sleep Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Stop reloadTimer when context cancelled Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove unused function Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright to test file Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Auto-reload tenant limit config on file changes Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Wrap error when reloading config Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Move limiter config reloader and update logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of useless types and allocations Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove errorChan from config reload starter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Use UnRegisterer in the Limiter To ensure that limit reloads will be able to re-register their metrics.
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Better guard against nil registerer in the limiter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove wrong nil guard Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: utukj --- CHANGELOG.md | 1 + cmd/thanos/receive.go | 46 +++-- docs/components/receive.md | 2 +- go.mod | 8 +- go.sum | 4 +- pkg/extkingpin/path_content_reloader.go | 128 ++++++++++++ pkg/extkingpin/path_content_reloader_test.go | 105 ++++++++++ pkg/receive/handler.go | 22 +- pkg/receive/handler_test.go | 38 ++-- pkg/receive/limiter.go | 189 ++++++++++++++++-- pkg/receive/limiter_config.go | 4 +- pkg/receive/limiter_config_test.go | 6 +- pkg/receive/limiter_test.go | 100 +++++++++ pkg/receive/request_limiter.go | 31 +-- pkg/receive/request_limiter_test.go | 20 +- pkg/receive/testdata/limits.yaml | 22 ++ .../limits_config/invalid_limits.yaml | 17 ++ 17 files changed, 646 insertions(+), 97 deletions(-) create mode 100644 pkg/extkingpin/path_content_reloader.go create mode 100644 pkg/extkingpin/path_content_reloader_test.go create mode 100644 pkg/receive/limiter_test.go create mode 100644 pkg/receive/testdata/limits.yaml create mode 100644 pkg/receive/testdata/limits_config/invalid_limits.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed82d6525..6e1d2143c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. +- [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. 
### Changed diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index 5c47b91dd5..d86b560983 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -192,19 +192,6 @@ func runReceive( return errors.Wrap(err, "parse relabel configuration") } - var limitsConfig *receive.RootLimitsConfig - if conf.limitsConfig != nil { - limitsContentYaml, err := conf.limitsConfig.Content() - if err != nil { - return errors.Wrap(err, "get content of limit configuration") - } - limitsConfig, err = receive.ParseRootLimitConfig(limitsContentYaml) - if err != nil { - return errors.Wrap(err, "parse limit configuration") - } - } - limiter := receive.NewLimiter(limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) - dbs := receive.NewMultiTSDB( conf.dataDir, logger, @@ -217,6 +204,23 @@ func runReceive( hashFunc, ) writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs) + + var limitsConfig *receive.RootLimitsConfig + if conf.limitsConfig != nil { + limitsContentYaml, err := conf.limitsConfig.Content() + if err != nil { + return errors.Wrap(err, "get content of limit configuration") + } + limitsConfig, err = receive.ParseRootLimitConfig(limitsContentYaml) + if err != nil { + return errors.Wrap(err, "parse limit configuration") + } + } + limiter, err := receive.NewLimiter(conf.limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) + if err != nil { + return errors.Wrap(err, "creating limiter") + } + webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{ Writer: writer, ListenAddress: conf.rwAddress, @@ -399,6 +403,22 @@ func runReceive( }) } + { + if limiter.CanReload() { + ctx, cancel := context.WithCancel(context.Background()) + g.Add(func() error { + level.Debug(logger).Log("msg", "limits config initialized with file watcher.") + if err := limiter.StartConfigReloader(ctx); err != nil { + return err + } + <-ctx.Done() + return nil + }, func(err error) { + cancel() + }) + } + } + level.Info(logger).Log("msg", "starting receiver") return nil } diff --git a/docs/components/receive.md b/docs/components/receive.md index 6fa13938e9..ef4e39e35e 100644 --- a/docs/components/receive.md +++ b/docs/components/receive.md @@ -86,7 +86,7 @@ Thanos Receive has some limits and gates that can be configured to control resou To configure the gates and limits you can use one of the two options: -- `--receive.limits-config-file=`: where `` is the path to the YAML file. +- `--receive.limits-config-file=`: where `` is the path to the YAML file. Any modification to the indicated file will trigger a configuration reload. If the updated configuration is invalid an error will be logged and it won't replace the previous valid configuration. - `--receive.limits-config=`: where `` is the content of YAML file. By default all the limits and gates are **disabled**. 
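To make the reload pipeline this patch introduces concrete, here is a minimal, self-contained sketch of driving the new `extkingpin` helpers directly. It is an illustration, not part of the patch: every identifier comes from the diffs in this series, while the `limits.yaml` path and the one-minute timeout are hypothetical.

```go
// A sketch of driving the reloader added in pkg/extkingpin/path_content_reloader.go below.
package main

import (
	"context"
	"os"
	"time"

	"github.com/go-kit/log"
	"github.com/thanos-io/thanos/pkg/extkingpin"
)

func main() {
	logger := log.NewLogfmtLogger(os.Stdout)

	// Cache the current content of the watched file ("limits.yaml" is a
	// hypothetical path used only for this sketch).
	limits, err := extkingpin.NewStaticPathContent("limits.yaml")
	if err != nil {
		_ = logger.Log("err", err)
		os.Exit(1)
	}

	// Watch for one minute, then stop; the watcher goroutine exits with ctx.
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	// The reloader watches the file's parent directory and debounces bursts of
	// fsnotify events, so the callback runs at most once per debounce window.
	if err := extkingpin.PathContentReloader(ctx, limits, logger, func() {
		_ = logger.Log("msg", "limits file changed, reloading")
	}, 1*time.Second); err != nil {
		_ = logger.Log("err", err)
		os.Exit(1)
	}

	<-ctx.Done()
}
```

In the receiver itself this loop is owned by `Limiter.StartConfigReloader`, started inside the `run.Group` shown in the `cmd/thanos/receive.go` hunk above, with `Limiter.loadConfig` as the debounced callback.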
diff --git a/go.mod b/go.mod index 13743c8020..bee3e97fe7 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/dustin/go-humanize v1.0.0 github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a - github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d + github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.2.0 github.com/felixge/fgprof v0.9.2 @@ -108,6 +108,7 @@ require ( require ( github.com/efficientgo/core v1.0.0-rc.0 + github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd github.com/minio/sha256-simd v1.0.0 ) @@ -127,10 +128,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.10.0 ) -require ( - github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd - go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 -) +require go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.32.3 // indirect diff --git a/go.sum b/go.sum index 5ee9bab6be..97fc0d0411 100644 --- a/go.sum +++ b/go.sum @@ -252,8 +252,8 @@ github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a h1:cnJajqeh/Hjv github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd h1:svR6KxSP1xiPw10RN4Pd7g6BAVkEcNN628PAqZH31mM= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:OmVcnJopJL8d3X3sSXTiypGoUSgFq1aDGmlrdi9dn/M= -github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d h1:WZV/mrUyKS9w9r+Jdw+zq/tdGAb5LwB+H37EkMLhEMA= -github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= +github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd h1:VaYzzXeUbC5fVheskcKVNOyJMEYD+HgrJNzIAg/mRIM= +github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= github.com/elastic/go-sysinfo v1.1.1/go.mod h1:i1ZYdU10oLNfRzq4vq62BEwD2fH8KaWh6eh0ikPT9F0= github.com/elastic/go-sysinfo v1.8.1 h1:4Yhj+HdV6WjbCRgGdZpPJ8lZQlXZLKDAeIkmQ/VRvi4= github.com/elastic/go-sysinfo v1.8.1/go.mod h1:JfllUnzoQV/JRYymbH3dO1yggI3mV2oTKSXsDHM+uIM= diff --git a/pkg/extkingpin/path_content_reloader.go b/pkg/extkingpin/path_content_reloader.go new file mode 100644 index 0000000000..68c2cd252c --- /dev/null +++ b/pkg/extkingpin/path_content_reloader.go @@ -0,0 +1,128 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package extkingpin + +import ( + "context" + "fmt" + "os" + "path" + "path/filepath" + "time" + + "github.com/fsnotify/fsnotify" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/pkg/errors" +) + +type fileContent interface { + Content() ([]byte, error) + Path() string +} + +// PathContentReloader starts a file watcher that monitors the file indicated by fileContent.Path() and runs +// reloadFunc whenever a change is detected. +// A debounce timer can be configured via opts to handle situations where many "write" events are received together or +// a "create" event is followed up by a "write" event, for example. Files will be effectively reloaded at the latest +// after 2 times the debounce timer. 
By default the debounce timer is 1 second. +// To ensure renames and deletes are properly handled, the file watcher is put at the file's parent folder. See +// https://github.com/fsnotify/fsnotify/issues/214 for more details. +func PathContentReloader(ctx context.Context, fileContent fileContent, logger log.Logger, reloadFunc func(), debounceTime time.Duration) error { + filePath, err := filepath.Abs(fileContent.Path()) + if err != nil { + return errors.Wrap(err, "getting absolute file path") + } + + watcher, err := fsnotify.NewWatcher() + if filePath == "" { + level.Debug(logger).Log("msg", "no path detected for config reload") + } + if err != nil { + return errors.Wrap(err, "creating file watcher") + } + go func() { + var reloadTimer *time.Timer + if debounceTime != 0 { + reloadTimer = time.AfterFunc(debounceTime, func() { + reloadFunc() + level.Debug(logger).Log("msg", "configuration reloaded after debouncing") + }) + } + defer watcher.Close() + for { + select { + case <-ctx.Done(): + if reloadTimer != nil { + reloadTimer.Stop() + } + return + case event := <-watcher.Events: + // fsnotify sometimes sends a bunch of events without name or operation. + // It's unclear what they are and why they are sent - filter them out. + if event.Name == "" { + break + } + // We are watching the file's parent folder (more details on how this is done can be found below), but are + // only interested in changes to the target file. Discard every other file as quickly as possible. + if event.Name != filePath { + break + } + // We only react to files being written or created. + // On chmod or remove we have nothing to do. + // On rename we have the old file name (not useful). A create event for the new file will come later. + if event.Op&fsnotify.Write == 0 && event.Op&fsnotify.Create == 0 { + break + } + level.Debug(logger).Log("msg", fmt.Sprintf("change detected for %s", filePath), "eventName", event.Name, "eventOp", event.Op) + if reloadTimer != nil { + reloadTimer.Reset(debounceTime) + } + case err := <-watcher.Errors: + level.Error(logger).Log("msg", "watcher error", "error", err) + } + } + }() + // We watch the file's parent folder and not the file itself to better handle DELETE and RENAME events. Check + // https://github.com/fsnotify/fsnotify/issues/214 for more details. + if err := watcher.Add(path.Dir(filePath)); err != nil { + return errors.Wrapf(err, "adding path %s to file watcher", filePath) + } + return nil +} + +type staticPathContent struct { + content []byte + path string +} + +var _ fileContent = (*staticPathContent)(nil) + +// Content returns the cached content. +func (t *staticPathContent) Content() ([]byte, error) { + return t.content, nil +} + +// Path returns the path to the file that contains the content. +func (t *staticPathContent) Path() string { + return t.path +} + +// NewStaticPathContent creates a new content that can be used to serve a static configuration. It reads the +// configuration from `fromPath` once and caches it alongside the path. +func NewStaticPathContent(fromPath string) (*staticPathContent, error) { + content, err := os.ReadFile(fromPath) + if err != nil { + return nil, errors.Wrapf(err, "could not load test content: %s", fromPath) + } + return &staticPathContent{content, fromPath}, nil +} + +// Rewrite rewrites the file backing this staticPathContent and swaps the local content cache. The file writing +// is needed to trigger the file system monitor.
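+// The cache swap is a plain assignment and is not synchronized; within this patch series Rewrite is only exercised from tests.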
+func (t *staticPathContent) Rewrite(newContent []byte) error { + t.content = newContent + // Write the file to ensure possible file watcher reloaders get triggered. + return os.WriteFile(t.path, newContent, 0666) +} diff --git a/pkg/extkingpin/path_content_reloader_test.go b/pkg/extkingpin/path_content_reloader_test.go new file mode 100644 index 0000000000..fb20f83d5c --- /dev/null +++ b/pkg/extkingpin/path_content_reloader_test.go @@ -0,0 +1,105 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package extkingpin + +import ( + "context" + "os" + "path" + "sync" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/thanos-io/thanos/pkg/testutil" +) + +func TestPathContentReloader(t *testing.T) { + type args struct { + runSteps func(t *testing.T, testFile string, pathContent *staticPathContent) + } + tests := []struct { + name string + args args + wantReloads int + }{ + { + name: "Many operations, only rewrite triggers one reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + testutil.Ok(t, os.Remove(testFile)) + testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) + }, + }, + wantReloads: 1, + }, + { + name: "Many operations, only rename triggers one reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + testutil.Ok(t, os.Rename(testFile, testFile+".tmp")) + testutil.Ok(t, os.Rename(testFile+".tmp", testFile)) + }, + }, + wantReloads: 1, + }, + { + name: "Many operations, two rewrites trigger two reloads", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + testutil.Ok(t, os.Remove(testFile)) + testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) + time.Sleep(2 * time.Second) + testutil.Ok(t, pathContent.Rewrite([]byte("test modified again"))) + }, + }, + wantReloads: 1, + }, + { + name: "Chmod doesn't trigger reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Chmod(testFile, 0777)) + }, + }, + wantReloads: 0, + }, + { + name: "Remove doesn't trigger reload", + args: args{ + runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { + testutil.Ok(t, os.Remove(testFile)) + }, + }, + wantReloads: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testFile := path.Join(t.TempDir(), "test") + testutil.Ok(t, os.WriteFile(testFile, []byte("test"), 0666)) + pathContent, err := NewStaticPathContent(testFile) + testutil.Ok(t, err) + + wg := &sync.WaitGroup{} + wg.Add(tt.wantReloads) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + reloadCount := 0 + err = PathContentReloader(ctx, pathContent, log.NewLogfmtLogger(os.Stdout), func() { + reloadCount++ + wg.Done() + }, 100*time.Millisecond) + testutil.Ok(t, err) + + tt.args.runSteps(t, testFile, pathContent) + wg.Wait() + testutil.Equals(t, tt.wantReloads, reloadCount) + }) + } +} diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 156bb74566..12afb752b8 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -17,10 +17,6 @@ import ( "sync" "time" - "github.com/thanos-io/thanos/pkg/api" - statusapi "github.com/thanos-io/thanos/pkg/api/status" - "github.com/thanos-io/thanos/pkg/logging" - "github.com/go-kit/log" 
"github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -35,6 +31,9 @@ import ( "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" + "github.com/thanos-io/thanos/pkg/api" + statusapi "github.com/thanos-io/thanos/pkg/api/status" + "github.com/thanos-io/thanos/pkg/logging" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -99,7 +98,7 @@ type Options struct { ForwardTimeout time.Duration RelabelConfigs []*relabel.Config TSDBStats TSDBStats - Limiter *limiter + Limiter *Limiter } // Handler serves a Prometheus remote write receiving HTTP endpoint. @@ -124,7 +123,7 @@ type Handler struct { writeSamplesTotal *prometheus.HistogramVec writeTimeseriesTotal *prometheus.HistogramVec - limiter *limiter + Limiter *Limiter } func NewHandler(logger log.Logger, o *Options) *Handler { @@ -150,7 +149,7 @@ func NewHandler(logger log.Logger, o *Options) *Handler { Max: 30 * time.Second, Jitter: true, }, - limiter: o.Limiter, + Limiter: o.Limiter, forwardRequests: promauto.With(registerer).NewCounterVec( prometheus.CounterOpts{ Name: "thanos_receive_forward_requests_total", @@ -407,17 +406,18 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { tLogger := log.With(h.logger, "tenant", tenant) + writeGate := h.Limiter.WriteGate() tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) { - err = h.limiter.writeGate.Start(r.Context()) + err = writeGate.Start(r.Context()) }) + defer writeGate.Done() if err != nil { level.Error(tLogger).Log("err", err, "msg", "internal server error") http.Error(w, err.Error(), http.StatusInternalServerError) return } - defer h.limiter.writeGate.Done() - under, err := h.limiter.HeadSeriesLimiter.isUnderLimit(tenant) + under, err := h.Limiter.HeadSeriesLimiter.isUnderLimit(tenant) if err != nil { level.Error(tLogger).Log("msg", "error while limiting", "err", err.Error()) } @@ -428,7 +428,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { return } - requestLimiter := h.limiter.requestLimiter + requestLimiter := h.Limiter.RequestLimiter() // io.ReadAll dynamically adjust the byte slice for read data, starting from 512B. // Since this is receive hot path, grow upfront saving allocations and CPU time. 
compressed := bytes.Buffer{} diff --git a/pkg/receive/handler_test.go b/pkg/receive/handler_test.go index 44076de141..4a2a536038 100644 --- a/pkg/receive/handler_test.go +++ b/pkg/receive/handler_test.go @@ -13,6 +13,7 @@ import ( "net/http" "net/http/httptest" "os" + "path" "path/filepath" "runtime" "runtime/pprof" @@ -21,6 +22,8 @@ import ( "testing" "time" + "gopkg.in/yaml.v3" + "github.com/alecthomas/units" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -40,6 +43,7 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/errutil" + "github.com/thanos-io/thanos/pkg/extkingpin" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/store/labelpb" "github.com/thanos-io/thanos/pkg/store/storepb" @@ -362,6 +366,7 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin }, } + limiter, _ := NewLimiter(NewNopConfig(), nil, RouterIngestor, log.NewNopLogger()) for i := range appendables { h := NewHandler(nil, &Options{ TenantHeader: DefaultTenantHeader, @@ -369,7 +374,7 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin ReplicationFactor: replicationFactor, ForwardTimeout: 5 * time.Second, Writer: NewWriter(log.NewNopLogger(), newFakeTenantAppendable(appendables[i])), - Limiter: NewLimiter(nil, nil, RouterIngestor, nil), + Limiter: limiter, }) handlers = append(handlers, h) h.peers = peers @@ -775,23 +780,28 @@ func TestReceiveWriteRequestLimits(t *testing.T) { } handlers, _ := newTestHandlerHashring(appendables, 3) handler := handlers[0] + tenant := "test" - handler.limiter = NewLimiter( - &RootLimitsConfig{ - WriteLimits: WriteLimitsConfig{ - TenantsLimits: TenantsWriteLimitsConfig{ - tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig(). - SetSizeBytesLimit(int64(1 * units.Megabyte)). - SetSeriesLimit(20). - SetSamplesLimit(200), - }, + tenantConfig, err := yaml.Marshal(&RootLimitsConfig{ + WriteLimits: WriteLimitsConfig{ + TenantsLimits: TenantsWriteLimitsConfig{ + tenant: &WriteLimitConfig{ + RequestLimits: NewEmptyRequestLimitsConfig(). + SetSizeBytesLimit(int64(1 * units.Megabyte)). + SetSeriesLimit(20). + SetSamplesLimit(200), }, }, }, - nil, - RouterIngestor, - log.NewNopLogger(), + }) + if err != nil { + t.Fatal("handler: failed to generate limit configuration") + } + tmpLimitsPath := path.Join(t.TempDir(), "limits.yaml") + testutil.Ok(t, os.WriteFile(tmpLimitsPath, tenantConfig, 0666)) + limitConfig, _ := extkingpin.NewStaticPathContent(tmpLimitsPath) + handler.Limiter, _ = NewLimiter( + limitConfig, nil, RouterIngestor, log.NewNopLogger(), ) wreq := &prompb.WriteRequest{ diff --git a/pkg/receive/limiter.go b/pkg/receive/limiter.go index bc3c4d8358..ff5bbe3199 100644 --- a/pkg/receive/limiter.go +++ b/pkg/receive/limiter.go @@ -5,59 +5,204 @@ package receive import ( "context" + "fmt" + "sync" + "time" "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/thanos-io/thanos/pkg/extkingpin" + + "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/thanos-io/thanos/pkg/extprom" "github.com/thanos-io/thanos/pkg/gate" ) -type limiter struct { - requestLimiter requestLimiter - writeGate gate.Gate - HeadSeriesLimiter headSeriesLimiter +// Limiter is responsible for managing the configuration and initialization of +// different types that apply limits to the Receive instance. 
+type Limiter struct { + sync.RWMutex + requestLimiter requestLimiter + HeadSeriesLimiter headSeriesLimiter + writeGate gate.Gate + registerer prometheus.Registerer + configPathOrContent fileContent + logger log.Logger + configReloadCounter prometheus.Counter + configReloadFailedCounter prometheus.Counter + receiverMode ReceiverMode +} + +// headSeriesLimiter encompasses active/head series limiting logic. +type headSeriesLimiter interface { + QueryMetaMonitoring(context.Context) error + isUnderLimit(tenant string) (bool, error) } -// requestLimiter encompasses logic for limiting remote write requests. type requestLimiter interface { AllowSizeBytes(tenant string, contentLengthBytes int64) bool AllowSeries(tenant string, amount int64) bool AllowSamples(tenant string, amount int64) bool } -// headSeriesLimiter encompasses active/head series limiting logic. -type headSeriesLimiter interface { - QueryMetaMonitoring(context.Context) error - isUnderLimit(tenant string) (bool, error) +// fileContent is an interface to avoid a direct dependency on kingpin or extkingpin. +type fileContent interface { + Content() ([]byte, error) + Path() string } -func NewLimiter(root *RootLimitsConfig, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) *limiter { - limiter := &limiter{ +// NewLimiter creates a new *Limiter given a configuration and prometheus +// registerer. +func NewLimiter(configFile fileContent, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) (*Limiter, error) { + limiter := &Limiter{ writeGate: gate.NewNoop(), requestLimiter: &noopRequestLimiter{}, HeadSeriesLimiter: NewNopSeriesLimit(), + logger: logger, + receiverMode: r, + } + + if reg != nil { + limiter.registerer = NewUnRegisterer(reg) + limiter.configReloadCounter = promauto.With(limiter.registerer).NewCounter( + prometheus.CounterOpts{ + Namespace: "thanos", + Subsystem: "receive", + Name: "limits_config_reload_total", + Help: "How many times the limit configuration was reloaded", + }, + ) + limiter.configReloadFailedCounter = promauto.With(limiter.registerer).NewCounter( + prometheus.CounterOpts{ + Namespace: "thanos", + Subsystem: "receive", + Name: "limits_config_reload_err_total", + Help: "How many times the limit configuration failed to reload.", + }, + ) + } + + if configFile == nil { + return limiter, nil + } + + limiter.configPathOrContent = configFile + if err := limiter.loadConfig(); err != nil { + return nil, errors.Wrap(err, "load tenant limits config") + } + + return limiter, nil +} + +// StartConfigReloader starts the automatic configuration reloader based off of +// the file indicated by pathOrContent. It starts a Go routine in the given +// *run.Group. 
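+// (There is no *run.Group parameter here: the goroutine's lifetime is bound to the ctx argument below, and callers such as the cmd/thanos/receive.go hunk above run the reloader inside their own run.Group.)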
+func (l *Limiter) StartConfigReloader(ctx context.Context) error { + if !l.CanReload() { + return nil } - if root == nil { - return limiter + + return extkingpin.PathContentReloader(ctx, l.configPathOrContent, l.logger, func() { + level.Info(l.logger).Log("msg", "reloading limit config") + if err := l.loadConfig(); err != nil { + if failedReload := l.configReloadFailedCounter; failedReload != nil { + failedReload.Inc() + } + errMsg := fmt.Sprintf("error reloading tenant limits config from %s", l.configPathOrContent.Path()) + level.Error(l.logger).Log("msg", errMsg, "err", err) + } + if reloadCounter := l.configReloadCounter; reloadCounter != nil { + reloadCounter.Inc() + } + }, 1*time.Second) +} + +func (l *Limiter) CanReload() bool { + if l.configPathOrContent == nil { + return false } + if l.configPathOrContent.Path() == "" { + return false + } + return true +} -func (l *Limiter) loadConfig() error { + config, err := ParseLimitConfigContent(l.configPathOrContent) + if err != nil { + return err + } + l.Lock() + defer l.Unlock() + maxWriteConcurrency := config.WriteLimits.GlobalLimits.MaxConcurrency if maxWriteConcurrency > 0 { - limiter.writeGate = gate.New( + l.writeGate = gate.New( extprom.WrapRegistererWithPrefix( "thanos_receive_write_request_concurrent_", - reg, + l.registerer, ), int(maxWriteConcurrency), ) } - limiter.requestLimiter = newConfigRequestLimiter(reg, &root.WriteLimits) - - // Impose active series limit only if Receiver is in Router or RouterIngestor mode, and config is provided. - seriesLimitSupported := (r == RouterOnly || r == RouterIngestor) && (len(root.WriteLimits.TenantsLimits) != 0 || root.WriteLimits.DefaultLimits.HeadSeriesLimit != 0) + l.requestLimiter = newConfigRequestLimiter( + l.registerer, + &config.WriteLimits, + ) + seriesLimitSupported := (l.receiverMode == RouterOnly || l.receiverMode == RouterIngestor) && (len(config.WriteLimits.TenantsLimits) != 0 || config.WriteLimits.DefaultLimits.HeadSeriesLimit != 0) if seriesLimitSupported { - limiter.HeadSeriesLimiter = NewHeadSeriesLimit(root.WriteLimits, reg, logger) + l.HeadSeriesLimiter = NewHeadSeriesLimit(config.WriteLimits, l.registerer, l.logger) } + return nil +} + +// RequestLimiter is a safe getter for the request limiter. +func (l *Limiter) RequestLimiter() requestLimiter { + l.RLock() + defer l.RUnlock() + return l.requestLimiter +} + +// WriteGate is a safe getter for the write gate. +func (l *Limiter) WriteGate() gate.Gate { + l.RLock() + defer l.RUnlock() + return l.writeGate +} + +// ParseLimitConfigContent parses the limit configuration from the path or +// content. +func ParseLimitConfigContent(limitsConfig fileContent) (*RootLimitsConfig, error) { + if limitsConfig == nil { + return &RootLimitsConfig{}, nil + } + limitsContentYaml, err := limitsConfig.Content() + if err != nil { + return nil, errors.Wrap(err, "get content of limit configuration") + } + parsedConfig, err := ParseRootLimitConfig(limitsContentYaml) + if err != nil { + return nil, errors.Wrap(err, "parse limit configuration") + } + return parsedConfig, nil +} + +type nopConfigContent struct{} + +var _ fileContent = (*nopConfigContent)(nil) + +// Content returns no content and no error. +func (n nopConfigContent) Content() ([]byte, error) { + return nil, nil +} + +// Path returns an empty path. +func (n nopConfigContent) Path() string { + return "" +} - return limiter +// NewNopConfig creates a no-op config content (no configuration).
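+// It is used by tests, e.g. newTestHandlerHashring in handler_test.go above, to build a Limiter without a backing file.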
+func NewNopConfig() nopConfigContent { + return nopConfigContent{} } diff --git a/pkg/receive/limiter_config.go b/pkg/receive/limiter_config.go index 67aa5ef93a..c3bd330b6e 100644 --- a/pkg/receive/limiter_config.go +++ b/pkg/receive/limiter_config.go @@ -78,6 +78,7 @@ type DefaultLimitsConfig struct { HeadSeriesLimit uint64 `yaml:"head_series_limit"` } +// TenantsWriteLimitsConfig is a map of tenant IDs to their *WriteLimitConfig. type TenantsWriteLimitsConfig map[string]*WriteLimitConfig // A tenant might not always have limits configured, so things here must @@ -110,8 +111,7 @@ type requestLimitsConfig struct { SamplesLimit *int64 `yaml:"samples_limit"` } -// Utils for initializing. -func newEmptyRequestLimitsConfig() *requestLimitsConfig { +func NewEmptyRequestLimitsConfig() *requestLimitsConfig { return &requestLimitsConfig{} } diff --git a/pkg/receive/limiter_config_test.go b/pkg/receive/limiter_config_test.go index b080680162..3e32ea41e8 100644 --- a/pkg/receive/limiter_config_test.go +++ b/pkg/receive/limiter_config_test.go @@ -35,7 +35,7 @@ func TestParseLimiterConfig(t *testing.T) { }, }, DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig(). + RequestLimits: *NewEmptyRequestLimitsConfig(). SetSizeBytesLimit(1024). SetSeriesLimit(1000). SetSamplesLimit(10), @@ -44,7 +44,7 @@ func TestParseLimiterConfig(t *testing.T) { TenantsLimits: TenantsWriteLimitsConfig{ "acme": NewEmptyWriteLimitConfig(). SetRequestLimits( - newEmptyRequestLimitsConfig(). + NewEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). SetSamplesLimit(0), @@ -52,7 +52,7 @@ func TestParseLimiterConfig(t *testing.T) { SetHeadSeriesLimit(2000), "ajax": NewEmptyWriteLimitConfig(). SetRequestLimits( - newEmptyRequestLimitsConfig(). + NewEmptyRequestLimitsConfig(). SetSeriesLimit(50000). SetSamplesLimit(500), ), diff --git a/pkg/receive/limiter_test.go b/pkg/receive/limiter_test.go new file mode 100644 index 0000000000..be7e8790c1 --- /dev/null +++ b/pkg/receive/limiter_test.go @@ -0,0 +1,100 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. 
diff --git a/pkg/receive/limiter_test.go b/pkg/receive/limiter_test.go
new file mode 100644
index 0000000000..be7e8790c1
--- /dev/null
+++ b/pkg/receive/limiter_test.go
@@ -0,0 +1,100 @@
+// Copyright (c) The Thanos Authors.
+// Licensed under the Apache License 2.0.
+
+package receive
+
+import (
+	"context"
+	"os"
+	"path"
+	"testing"
+	"time"
+
+	"github.com/thanos-io/thanos/pkg/extkingpin"
+
+	"github.com/efficientgo/tools/core/pkg/testutil"
+	"github.com/go-kit/log"
+)
+
+func TestLimiter_StartConfigReloader(t *testing.T) {
+	origLimitsFile, err := os.ReadFile(path.Join("testdata", "limits_config", "good_limits.yaml"))
+	testutil.Ok(t, err)
+	copyLimitsFile := path.Join(t.TempDir(), "limits.yaml")
+	testutil.Ok(t, os.WriteFile(copyLimitsFile, origLimitsFile, 0666))
+
+	goodLimits, err := extkingpin.NewStaticPathContent(copyLimitsFile)
+	if err != nil {
+		t.Fatalf("error trying to save static limit config: %s", err)
+	}
+	invalidLimitsPath := path.Join("./testdata", "limits_config", "invalid_limits.yaml")
+	invalidLimits, err := os.ReadFile(invalidLimitsPath)
+	if err != nil {
+		t.Fatalf("could not load test content at %s: %s", invalidLimitsPath, err)
+	}
+
+	limiter, err := NewLimiter(goodLimits, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout))
+	testutil.Ok(t, err)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	err = limiter.StartConfigReloader(ctx)
+	testutil.Ok(t, err)
+
+	time.Sleep(1 * time.Second) // Give the reloader time to start watching before rewriting the file.
+	testutil.Ok(t, goodLimits.Rewrite(invalidLimits))
+}
+
+type emptyPathFile struct{}
+
+func (e emptyPathFile) Content() ([]byte, error) {
+	return []byte{}, nil
+}
+
+func (e emptyPathFile) Path() string {
+	return ""
+}
+
+func TestLimiter_CanReload(t *testing.T) {
+	validLimitsPath, err := extkingpin.NewStaticPathContent(
+		path.Join("testdata", "limits_config", "good_limits.yaml"),
+	)
+	testutil.Ok(t, err)
+	emptyLimitsPath := emptyPathFile{}
+
+	type args struct {
+		configFilePath fileContent
+	}
+	tests := []struct {
+		name       string
+		args       args
+		wantReload bool
+	}{
+		{
+			name:       "Nil config file path cannot be reloaded",
+			args:       args{configFilePath: nil},
+			wantReload: false,
+		},
+		{
+			name:       "Empty config file path cannot be reloaded",
+			args:       args{configFilePath: emptyLimitsPath},
+			wantReload: false,
+		},
+		{
+			name:       "Valid config file path can be reloaded",
+			args:       args{configFilePath: validLimitsPath},
+			wantReload: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			configFile := tt.args.configFilePath
+			limiter, err := NewLimiter(configFile, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout))
+			testutil.Ok(t, err)
+			if tt.wantReload {
+				testutil.Assert(t, limiter.CanReload())
+			} else {
+				testutil.Assert(t, !limiter.CanReload())
+			}
+		})
+	}
+}
diff --git a/pkg/receive/request_limiter.go b/pkg/receive/request_limiter.go
index de7554de2f..7da0c64a6d 100644
--- a/pkg/receive/request_limiter.go
+++ b/pkg/receive/request_limiter.go
@@ -14,7 +14,7 @@ const (
 	sizeBytesLimitName = "body_size"
 )
 
-var unlimitedRequestLimitsConfig = newEmptyRequestLimitsConfig().
+var unlimitedRequestLimitsConfig = NewEmptyRequestLimitsConfig().
 	SetSizeBytesLimit(0).
 	SetSeriesLimit(0).
SetSamplesLimit(0) @@ -49,7 +49,12 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits tenantLimits: tenantRequestLimits, cachedDefaultLimits: defaultRequestLimits, } - limiter.limitsHit = promauto.With(reg).NewSummaryVec( + limiter.registerMetrics(reg) + return &limiter +} + +func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { + l.limitsHit = promauto.With(reg).NewSummaryVec( prometheus.SummaryOpts{ Namespace: "thanos", Subsystem: "receive", @@ -58,7 +63,7 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits Objectives: map[float64]float64{0.50: 0.1, 0.95: 0.1, 0.99: 0.001}, }, []string{"tenant", "limit"}, ) - limiter.configuredLimits = promauto.With(reg).NewGaugeVec( + l.configuredLimits = promauto.With(reg).NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thanos", Subsystem: "receive", @@ -66,16 +71,14 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits Help: "The configured write limits.", }, []string{"tenant", "limit"}, ) - for tenant, limits := range tenantRequestLimits { - limiter.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) - limiter.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) - limiter.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) + for tenant, limits := range l.tenantLimits { + l.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) + l.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) + l.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) } - limiter.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*defaultRequestLimits.SizeBytesLimit)) - limiter.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*defaultRequestLimits.SeriesLimit)) - limiter.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*defaultRequestLimits.SamplesLimit)) - - return &limiter + l.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*l.cachedDefaultLimits.SizeBytesLimit)) + l.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*l.cachedDefaultLimits.SeriesLimit)) + l.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*l.cachedDefaultLimits.SamplesLimit)) } func (l *configRequestLimiter) AllowSizeBytes(tenant string, contentLengthBytes int64) bool { @@ -100,7 +103,7 @@ func (l *configRequestLimiter) AllowSeries(tenant string, amount int64) bool { } allowed := *limit >= amount - if !allowed { + if !allowed && l.limitsHit != nil { l.limitsHit. WithLabelValues(tenant, seriesLimitName). Observe(float64(amount - *limit)) @@ -114,7 +117,7 @@ func (l *configRequestLimiter) AllowSamples(tenant string, amount int64) bool { return true } allowed := *limit >= amount - if !allowed { + if !allowed && l.limitsHit != nil { l.limitsHit. WithLabelValues(tenant, samplesLimitName). Observe(float64(amount - *limit)) diff --git a/pkg/receive/request_limiter_test.go b/pkg/receive/request_limiter_test.go index e654cd1cdf..dfbea066d9 100644 --- a/pkg/receive/request_limiter_test.go +++ b/pkg/receive/request_limiter_test.go @@ -15,12 +15,12 @@ func TestRequestLimiter_limitsFor(t *testing.T) { limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig(). 
+ RequestLimits: *NewEmptyRequestLimitsConfig(). SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenantWithLimits: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig(). + RequestLimits: NewEmptyRequestLimitsConfig(). SetSeriesLimit(30), }, }, @@ -33,7 +33,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the default limits when tenant's limits aren't present", tenant: tenantWithoutLimits, - wantLimits: newEmptyRequestLimitsConfig(). + wantLimits: NewEmptyRequestLimitsConfig(). SetSeriesLimit(10). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -41,7 +41,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the tenant's limits when it is present", tenant: tenantWithLimits, - wantLimits: newEmptyRequestLimitsConfig(). + wantLimits: NewEmptyRequestLimitsConfig(). SetSeriesLimit(30). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -102,11 +102,11 @@ func TestRequestLimiter_AllowRequestBodySizeBytes(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), + RequestLimits: NewEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), }, }, } @@ -159,11 +159,11 @@ func TestRequestLimiter_AllowSeries(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), + RequestLimits: NewEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), }, }, } @@ -217,11 +217,11 @@ func TestRequestLimiter_AllowSamples(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: newEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), + RequestLimits: NewEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), }, }, } diff --git a/pkg/receive/testdata/limits.yaml b/pkg/receive/testdata/limits.yaml new file mode 100644 index 0000000000..2345756179 --- /dev/null +++ b/pkg/receive/testdata/limits.yaml @@ -0,0 +1,22 @@ +write: + global: + max_concurrency: 30 + meta_monitoring_url: "http://localhost:9090" + meta_monitoring_limit_query: "sum(prometheus_tsdb_head_series) by (tenant)" + default: + request: + size_bytes_limit: 1024 + series_limit: 1000 + samples_limit: 10 + head_series_limit: 1000 + tenants: + acme: + request: + size_bytes_limit: 0 + series_limit: 0 + samples_limit: 0 + head_series_limit: 2000 + ajax: + request: + series_limit: 50000 + samples_limit: 500 diff --git a/pkg/receive/testdata/limits_config/invalid_limits.yaml b/pkg/receive/testdata/limits_config/invalid_limits.yaml new file mode 100644 index 0000000000..74db0453f8 --- /dev/null +++ b/pkg/receive/testdata/limits_config/invalid_limits.yaml @@ -0,0 +1,17 @@ +write: + global: + max_concurrency: 30 + request: + size_bytes_limit: 1024 + series_limit: 1000 + samples_limit: 
10 + tenants: + acme: + request: + size_bytes_limit: 0 + series_limit: 0 + samples_limit: 0 + ajax: + request: + series_limit: 50000 + samples_limit: 500 From 7a77769258a138394be68a018be5f0c871afee7a Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 18 Oct 2022 09:10:32 +0200 Subject: [PATCH 23/43] Query: add query metrics to calls going through the Store API (#5741) * Implement granular query performance metrics for Thanos Query These are grabbed from the data returned by multiple Store APIs after execution of a query. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix some linter warnings Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix long function definition (newQuerier) Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove TODO comment Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Reformat query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless return Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back old query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update query docs again Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix e2e env name Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright notice. 
Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Bump wait time to twice scrape interval Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Attempt to fix randomly failing test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Checking more metrics to ensure the store is ready Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Clean up test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Do not record store api metrics when didn't touch series or samples Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Also skip store api metrics on zero chunks touched Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update changelog Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix broken changelog after merge Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove extra empty line Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor names and (un)exported types and fields Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Start listing metrics exported by Thanos Query Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Rename pkg/store/metrics -> pkg/store/telemetry Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of the pkg/store/telemetry package Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Co-authored-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Signed-off-by: utukj --- CHANGELOG.md | 1 + cmd/thanos/query.go | 19 ++++- docs/components/query.md | 19 +++++ pkg/api/query/grpc.go | 2 + pkg/api/query/v1.go | 99 +++++++++++++++++++--- pkg/api/query/v1_test.go | 6 +- pkg/query/querier.go | 79 ++++++++++++++--- pkg/query/querier_test.go | 37 ++++++-- pkg/query/query_bench_test.go | 13 +-- pkg/query/query_test.go | 11 ++- pkg/store/telemetry.go | 88 +++++++++++++++++++ test/e2e/query_test.go | 155 +++++++++++++++++++++++++++++++--- 12 files changed, 478 insertions(+), 51 deletions(-) create mode 100644 pkg/store/telemetry.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e1d2143c3..6e2e854ce2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. +- [#5741](https://github.com/thanos-io/thanos/pull/5741) Query: add metrics on how much data is being selected by downstream Store APIs. 
- [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. ### Changed diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 54724f59a6..5e5a7fc7cd 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,6 +25,8 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" + "google.golang.org/grpc" + v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-community/promql-engine/engine" apiv1 "github.com/thanos-io/thanos/pkg/api/query" @@ -54,7 +56,6 @@ import ( "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/tls" "github.com/thanos-io/thanos/pkg/ui" - "google.golang.org/grpc" ) const ( @@ -205,6 +206,10 @@ func registerQuery(app *extkingpin.App) { alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field.").String() grpcProxyStrategy := cmd.Flag("grpc.proxy-strategy", "Strategy to use when proxying Series requests to leaf nodes. Hidden and only used for testing, will be removed after lazy becomes the default.").Default(string(store.EagerRetrieval)).Hidden().Enum(string(store.EagerRetrieval), string(store.LazyRetrieval)) + queryTelemetryDurationQuantiles := cmd.Flag("query.telemetry.request-duration-seconds-quantiles", "The quantiles for exporting metrics about the request duration quantiles.").Default("0.1", "0.25", "0.75", "1.25", "1.75", "2.5", "3", "5", "10").Float64List() + queryTelemetrySamplesQuantiles := cmd.Flag("query.telemetry.request-samples-quantiles", "The quantiles for exporting metrics about the samples count quantiles.").Default("100", "1000", "10000", "100000", "1000000").Int64List() + queryTelemetrySeriesQuantiles := cmd.Flag("query.telemetry.request-series-seconds-quantiles", "The quantiles for exporting metrics about the series count quantiles.").Default("10", "100", "1000", "10000", "100000").Int64List() + cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { selectorLset, err := parseFlagLabels(*selectorLabels) if err != nil { @@ -317,6 +322,9 @@ func registerQuery(app *extkingpin.App) { *alertQueryURL, *grpcProxyStrategy, component.Query, + *queryTelemetryDurationQuantiles, + *queryTelemetrySamplesQuantiles, + *queryTelemetrySeriesQuantiles, promqlEngineType(*promqlEngine), ) }) @@ -390,6 +398,9 @@ func runQuery( alertQueryURL string, grpcProxyStrategy string, comp component.Component, + queryTelemetryDurationQuantiles []float64, + queryTelemetrySamplesQuantiles []int64, + queryTelemetrySeriesQuantiles []int64, promqlEngine promqlEngineType, ) error { if alertQueryURL == "" { @@ -694,6 +705,12 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), + store.NewSeriesStatsAggregator( + reg, + queryTelemetryDurationQuantiles, + queryTelemetrySamplesQuantiles, + queryTelemetrySeriesQuantiles, + ), reg, ) diff --git a/docs/components/query.md b/docs/components/query.md index 1a028ee3ed..c3690ca05a 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -381,6 +381,15 @@ Flags: be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. + --query.telemetry.request-duration-seconds-quantiles=0.1... ... + The quantiles for exporting metrics about the + request duration quantiles. 
+ --query.telemetry.request-samples-quantiles=100... ... + The quantiles for exporting metrics about the + samples count quantiles. + --query.telemetry.request-series-seconds-quantiles=10... ... + The quantiles for exporting metrics about the + series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' @@ -463,3 +472,13 @@ Flags: of Prometheus. ``` + +## Exported metrics + +Thanos Query also exports metrics about its own performance. You can find a list with these metrics below. + +**Disclaimer**: this list is incomplete. The remaining metrics will be added over time. + +| Name | Type | Labels | Description | +|-----------------------------------------|-----------|-----------------------|-------------------------------------------------------------------------------------------------------------------| +| thanos_store_api_query_duration_seconds | Histogram | samples_le, series_le | Duration of the Thanos Store API select phase for a query according to the amount of samples and series selected. | diff --git a/pkg/api/query/grpc.go b/pkg/api/query/grpc.go index 144166f57b..8848cd2ffe 100644 --- a/pkg/api/query/grpc.go +++ b/pkg/api/query/grpc.go @@ -94,6 +94,7 @@ func (g *GRPCAPI) Query(request *querypb.QueryRequest, server querypb.Query_Quer request.EnableQueryPushdown, false, request.ShardInfo, + query.NoopSeriesStatsReporter, ) qry, err := g.queryEngine.NewInstantQuery(queryable, &promql.QueryOpts{LookbackDelta: lookbackDelta}, request.Query, ts) if err != nil { @@ -168,6 +169,7 @@ func (g *GRPCAPI) QueryRange(request *querypb.QueryRangeRequest, srv querypb.Que request.EnableQueryPushdown, false, request.ShardInfo, + query.NoopSeriesStatsReporter, ) startTime := time.Unix(request.StartTimeSeconds, 0) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index cbe1327a36..918bcbf5fd 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -41,10 +41,8 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" - v1 "github.com/prometheus/prometheus/web/api/v1" - "github.com/prometheus/prometheus/util/stats" - + v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" "github.com/thanos-io/thanos/pkg/exemplars/exemplarspb" @@ -57,6 +55,7 @@ import ( "github.com/thanos-io/thanos/pkg/rules" "github.com/thanos-io/thanos/pkg/rules/rulespb" "github.com/thanos-io/thanos/pkg/runutil" + "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/targets/targetspb" @@ -107,6 +106,13 @@ type QueryAPI struct { defaultMetadataTimeRange time.Duration queryRangeHist prometheus.Histogram + + seriesStatsAggregator seriesQueryPerformanceMetricsAggregator +} + +type seriesQueryPerformanceMetricsAggregator interface { + Aggregate(seriesStats storepb.SeriesStatsCounter) + Observe(duration float64) } // NewQueryAPI returns an initialized QueryAPI type. 
@@ -134,8 +140,12 @@ func NewQueryAPI( defaultMetadataTimeRange time.Duration, disableCORS bool, gate gate.Gate, + statsAggregator seriesQueryPerformanceMetricsAggregator, reg *prometheus.Registry, ) *QueryAPI { + if statsAggregator == nil { + statsAggregator = &store.NoopSeriesStatsAggregator{} + } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), logger: logger, @@ -160,6 +170,7 @@ func NewQueryAPI( defaultInstantQueryMaxSourceResolution: defaultInstantQueryMaxSourceResolution, defaultMetadataTimeRange: defaultMetadataTimeRange, disableCORS: disableCORS, + seriesStatsAggregator: statsAggregator, queryRangeHist: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "thanos_query_range_requested_timespan_duration_seconds", @@ -396,7 +407,24 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro span, ctx := tracing.StartSpan(ctx, "promql_instant_query") defer span.Finish() - qry, err := qapi.queryEngine.NewInstantQuery(qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), ts) + var seriesStats []storepb.SeriesStatsCounter + qry, err := qapi.queryEngine.NewInstantQuery( + qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + maxSourceResolution, + enablePartialResponse, + qapi.enableQueryPushdown, + false, + shardInfo, + query.NewAggregateStatsReporter(&seriesStats), + ), + &promql.QueryOpts{LookbackDelta: lookbackDelta}, + r.FormValue("query"), + ts, + ) + if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}, func() {} } @@ -409,6 +437,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } defer qapi.gate.Done() + beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -421,6 +450,10 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } + for i := range seriesStats { + qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) + } + qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. 
var qs stats.QueryStats @@ -525,8 +558,19 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap span, ctx := tracing.StartSpan(ctx, "promql_range_query") defer span.Finish() + var seriesStats []storepb.SeriesStatsCounter qry, err := qapi.queryEngine.NewRangeQuery( - qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), + qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + maxSourceResolution, + enablePartialResponse, + qapi.enableQueryPushdown, + false, + shardInfo, + query.NewAggregateStatsReporter(&seriesStats), + ), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), start, @@ -545,6 +589,7 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } defer qapi.gate.Done() + beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -555,6 +600,10 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } + for i := range seriesStats { + qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) + } + qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -600,8 +649,17 @@ func (qapi *QueryAPI) labelValues(r *http.Request) (interface{}, []error, *api.A matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + true, + nil, + storeDebugMatchers, + 0, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -687,8 +745,18 @@ func (qapi *QueryAPI) series(r *http.Request) (interface{}, []error, *api.ApiErr return nil, nil, apiErr, func() {} } - q, err := qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, math.MaxInt64, enablePartialResponse, qapi.enableQueryPushdown, true, nil). - Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + enableDedup, + replicaLabels, + storeDebugMatchers, + math.MaxInt64, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -737,8 +805,17 @@ func (qapi *QueryAPI) labelNames(r *http.Request) (interface{}, []error, *api.Ap matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). 
- Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate( + true, + nil, + storeDebugMatchers, + 0, + enablePartialResponse, + qapi.enableQueryPushdown, + true, + nil, + query.NoopSeriesStatsReporter, + ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 000410ddbd..07c562af9c 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -44,9 +44,8 @@ import ( "github.com/prometheus/prometheus/tsdb/tsdbutil" promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" - "github.com/thanos-io/thanos/pkg/compact" - baseAPI "github.com/thanos-io/thanos/pkg/api" + "github.com/thanos-io/thanos/pkg/compact" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/gate" "github.com/thanos-io/thanos/pkg/query" @@ -198,6 +197,7 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -737,6 +737,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -750,6 +751,7 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), + seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/query/querier.go b/pkg/query/querier.go index 361834c07d..b094cbd45c 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -7,6 +7,7 @@ import ( "context" "sort" "strings" + "sync" "time" "github.com/go-kit/log" @@ -28,21 +29,60 @@ import ( "github.com/thanos-io/thanos/pkg/tracing" ) +type seriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) + +var NoopSeriesStatsReporter seriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} + +func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) seriesStatsReporter { + var mutex sync.Mutex + return func(s storepb.SeriesStatsCounter) { + mutex.Lock() + defer mutex.Unlock() + *stats = append(*stats, s) + } +} + // QueryableCreator returns implementation of promql.Queryable that fetches data from the proxy store API endpoints. // If deduplication is enabled, all data retrieved from it will be deduplicated along all replicaLabels by default. // When the replicaLabels argument is not empty it overwrites the global replicaLabels flag. This allows specifying // replicaLabels at query time. // maxResolutionMillis controls downsampling resolution that is allowed (specified in milliseconds). // partialResponse controls `partialResponseDisabled` option of StoreAPI and partial response behavior of proxy. 
-type QueryableCreator func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable +type QueryableCreator func( + deduplicate bool, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + shardInfo *storepb.ShardInfo, + seriesStatsReporter seriesStatsReporter, +) storage.Queryable // NewQueryableCreator creates QueryableCreator. -func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy storepb.StoreServer, maxConcurrentSelects int, selectTimeout time.Duration) QueryableCreator { +func NewQueryableCreator( + logger log.Logger, + reg prometheus.Registerer, + proxy storepb.StoreServer, + maxConcurrentSelects int, + selectTimeout time.Duration, +) QueryableCreator { duration := promauto.With( extprom.WrapRegistererWithPrefix("concurrent_selects_", reg), ).NewHistogram(gate.DurationHistogramOpts) - return func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable { + return func( + deduplicate bool, + replicaLabels []string, + storeDebugMatchers [][]*labels.Matcher, + maxResolutionMillis int64, + partialResponse, + enableQueryPushdown, + skipChunks bool, + shardInfo *storepb.ShardInfo, + seriesStatsReporter seriesStatsReporter, + ) storage.Queryable { return &queryable{ logger: logger, replicaLabels: replicaLabels, @@ -59,6 +99,7 @@ func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy sto selectTimeout: selectTimeout, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, + seriesStatsReporter: seriesStatsReporter, } } } @@ -77,11 +118,12 @@ type queryable struct { selectTimeout time.Duration enableQueryPushdown bool shardInfo *storepb.ShardInfo + seriesStatsReporter seriesStatsReporter } // Querier returns a new storage querier against the underlying proxy store API. 
func (q *queryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) { - return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo), nil + return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo, q.seriesStatsReporter), nil } type querier struct { @@ -100,6 +142,7 @@ type querier struct { selectGate gate.Gate selectTimeout time.Duration shardInfo *storepb.ShardInfo + seriesStatsReporter seriesStatsReporter } // newQuerier creates implementation of storage.Querier that fetches data from the proxy @@ -107,16 +150,20 @@ type querier struct { func newQuerier( ctx context.Context, logger log.Logger, - mint, maxt int64, + mint, + maxt int64, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, proxy storepb.StoreServer, deduplicate bool, maxResolutionMillis int64, - partialResponse, enableQueryPushdown bool, skipChunks bool, + partialResponse, + enableQueryPushdown, + skipChunks bool, selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, + seriesStatsReporter seriesStatsReporter, ) *querier { if logger == nil { logger = log.NewNopLogger() @@ -145,6 +192,7 @@ func newQuerier( skipChunks: skipChunks, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, + seriesStatsReporter: seriesStatsReporter, } } @@ -157,8 +205,9 @@ type seriesServer struct { storepb.Store_SeriesServer ctx context.Context - seriesSet []storepb.Series - warnings []string + seriesSet []storepb.Series + seriesSetStats storepb.SeriesStatsCounter + warnings []string } func (s *seriesServer) Send(r *storepb.SeriesResponse) error { @@ -169,6 +218,7 @@ func (s *seriesServer) Send(r *storepb.SeriesResponse) error { if r.GetSeries() != nil { s.seriesSet = append(s.seriesSet, *r.GetSeries()) + s.seriesSetStats.Count(r.GetSeries()) return nil } @@ -257,11 +307,12 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match span, ctx := tracing.StartSpan(ctx, "querier_select_select_fn") defer span.Finish() - set, err := q.selectFn(ctx, hints, ms...) + set, stats, err := q.selectFn(ctx, hints, ms...) if err != nil { promise <- storage.ErrSeriesSet(err) return } + q.seriesStatsReporter(stats) promise <- set }() @@ -279,10 +330,10 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match }} } -func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, error) { +func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, storepb.SeriesStatsCounter, error) { sms, err := storepb.PromMatchersToMatchers(ms...) if err != nil { - return nil, errors.Wrap(err, "convert matchers") + return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "convert matchers") } aggrs := aggrsFromFunc(hints.Func) @@ -310,7 +361,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . 
Step: hints.Step, Range: hints.Range, }, resp); err != nil { - return nil, errors.Wrap(err, "proxy Series()") + return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "proxy Series()") } var warns storage.Warnings @@ -342,7 +393,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . set: newStoreSeriesSet(resp.seriesSet), aggrs: aggrs, warns: warns, - }, nil + }, resp.seriesSetStats, nil } // TODO(fabxc): this could potentially pushed further down into the store API to make true streaming possible. @@ -357,7 +408,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . // The merged series set assembles all potentially-overlapping time ranges of the same series into a single one. // TODO(bwplotka): We could potentially dedup on chunk level, use chunk iterator for that when available. - return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), nil + return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), resp.seriesSetStats, nil } // sortDedupLabels re-sorts the set so that the same series with different replica diff --git a/pkg/query/querier_test.go b/pkg/query/querier_test.go index a43c75e7a5..2e31fa65a0 100644 --- a/pkg/query/querier_test.go +++ b/pkg/query/querier_test.go @@ -44,7 +44,17 @@ func TestQueryableCreator_MaxResolution(t *testing.T) { queryableCreator := NewQueryableCreator(nil, nil, testProxy, 2, 5*time.Second) oneHourMillis := int64(1*time.Hour) / int64(time.Millisecond) - queryable := queryableCreator(false, nil, nil, oneHourMillis, false, false, false, nil) + queryable := queryableCreator( + false, + nil, + nil, + oneHourMillis, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) q, err := queryable.Querier(context.Background(), 0, 42) testutil.Ok(t, err) @@ -71,7 +81,22 @@ func TestQuerier_DownsampledData(t *testing.T) { } timeout := 10 * time.Second - q := NewQueryableCreator(nil, nil, testProxy, 2, timeout)(false, nil, nil, 9999999, false, false, false, nil) + q := NewQueryableCreator( + nil, + nil, + testProxy, + 2, + timeout, + )(false, + nil, + nil, + 9999999, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) engine := promql.NewEngine( promql.EngineOpts{ MaxSamples: math.MaxInt32, @@ -365,7 +390,7 @@ func TestQuerier_Select_AfterPromQL(t *testing.T) { g := gate.New(2) mq := &mockedQueryable{ Creator: func(mint, maxt int64) storage.Querier { - return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) + return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) }, } t.Cleanup(func() { @@ -609,7 +634,7 @@ func TestQuerier_Select(t *testing.T) { {dedup: true, expected: []series{tcase.expectedAfterDedup}}, } { g := gate.New(2) - q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, func(i storepb.SeriesStatsCounter) {}) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) t.Run(fmt.Sprintf("dedup=%v", sc.dedup), func(t *testing.T) { @@ -838,7 +863,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 100 * time.Second g := gate.New(2) - q := 
newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) @@ -908,7 +933,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 5 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) diff --git a/pkg/query/query_bench_test.go b/pkg/query/query_bench_test.go index 301c880877..84efb46820 100644 --- a/pkg/query/query_bench_test.go +++ b/pkg/query/query_bench_test.go @@ -80,12 +80,13 @@ func benchQuerySelect(t testutil.TB, totalSamples, totalSeries int, dedup bool) logger := log.NewNopLogger() q := &querier{ - ctx: context.Background(), - logger: logger, - proxy: &mockedStoreServer{responses: resps}, - replicaLabels: map[string]struct{}{"a_replica": {}}, - deduplicate: dedup, - selectGate: gate.NewNoop(), + ctx: context.Background(), + logger: logger, + proxy: &mockedStoreServer{responses: resps}, + replicaLabels: map[string]struct{}{"a_replica": {}}, + deduplicate: dedup, + selectGate: gate.NewNoop(), + seriesStatsReporter: NoopSeriesStatsReporter, } testSelect(t, q, expectedSeries) } diff --git a/pkg/query/query_test.go b/pkg/query/query_test.go index 99e29be66f..060571fc70 100644 --- a/pkg/query/query_test.go +++ b/pkg/query/query_test.go @@ -54,7 +54,16 @@ func TestQuerier_Proxy(t *testing.T) { name: fmt.Sprintf("store number %v", i), }) } - return q(true, nil, nil, 0, false, false, false, nil) + return q(true, + nil, + nil, + 0, + false, + false, + false, + nil, + NoopSeriesStatsReporter, + ) } for _, fn := range files { diff --git a/pkg/store/telemetry.go b/pkg/store/telemetry.go new file mode 100644 index 0000000000..a854daaf0c --- /dev/null +++ b/pkg/store/telemetry.go @@ -0,0 +1,88 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package store + +import ( + "strconv" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/thanos-io/thanos/pkg/store/storepb" +) + +// seriesStatsAggregator aggregates results from fanned-out queries into a histogram given their +// response's shape. +type seriesStatsAggregator struct { + queryDuration *prometheus.HistogramVec + + seriesLeBuckets []int64 + samplesLeBuckets []int64 + seriesStats storepb.SeriesStatsCounter +} + +// NewSeriesStatsAggregator is a constructor for seriesStatsAggregator. 
+func NewSeriesStatsAggregator(
+	reg prometheus.Registerer,
+	durationQuantiles []float64,
+	sampleQuantiles []int64,
+	seriesQuantiles []int64,
+) *seriesStatsAggregator {
+	return &seriesStatsAggregator{
+		queryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
+			Name:    "thanos_store_api_query_duration_seconds",
+			Help:    "Duration of the Thanos Store API select phase for a query.",
+			Buckets: durationQuantiles,
+		}, []string{"series_le", "samples_le"}),
+		seriesLeBuckets:  seriesQuantiles,
+		samplesLeBuckets: sampleQuantiles,
+		seriesStats:      storepb.SeriesStatsCounter{},
+	}
+}
+
+// Aggregate is an aggregator for merging `storepb.SeriesStatsCounter` for each incoming fanned out query.
+func (s *seriesStatsAggregator) Aggregate(stats storepb.SeriesStatsCounter) {
+	s.seriesStats.Series += stats.Series
+	s.seriesStats.Samples += stats.Samples
+	s.seriesStats.Chunks += stats.Chunks
+}
+
+// Observe commits the aggregated SeriesStatsCounter as an observation.
+func (s *seriesStatsAggregator) Observe(duration float64) {
+	if s.seriesStats.Series == 0 || s.seriesStats.Samples == 0 || s.seriesStats.Chunks == 0 {
+		return
+	}
+	// Match the aggregated series and sample totals to the "le" buckets they fall under.
+	seriesLeBucket := s.findBucket(float64(s.seriesStats.Series), s.seriesLeBuckets)
+	samplesLeBucket := s.findBucket(float64(s.seriesStats.Samples), s.samplesLeBuckets)
+	s.queryDuration.With(prometheus.Labels{
+		"series_le":  strconv.Itoa(int(seriesLeBucket)),
+		"samples_le": strconv.Itoa(int(samplesLeBucket)),
+	}).Observe(duration)
+	s.reset()
+}
+
+func (s *seriesStatsAggregator) reset() {
+	s.seriesStats = storepb.SeriesStatsCounter{}
+}
+
+func (s *seriesStatsAggregator) findBucket(value float64, quantiles []int64) int64 {
+	if len(quantiles) == 0 {
+		return 0
+	}
+	var foundBucket int64
+	for _, bucket := range quantiles {
+		foundBucket = bucket
+		if value < float64(bucket) {
+			break
+		}
+	}
+	return foundBucket
+}
+
+// NoopSeriesStatsAggregator is a query performance series aggregator that does nothing.
+type NoopSeriesStatsAggregator struct{}
+
+func (s *NoopSeriesStatsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {}
+
+func (s *NoopSeriesStatsAggregator) Observe(_ float64) {}
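Before the end-to-end test, a small usage sketch (not part of the patch) of how the aggregator buckets an observation; it assumes the default bucket values wired in by the new --query.telemetry.* flags and only the signatures defined above:

```go
// Illustrative only: how Observe buckets a query's aggregated stats.
package main

import (
	"github.com/prometheus/client_golang/prometheus"

	"github.com/thanos-io/thanos/pkg/store"
	"github.com/thanos-io/thanos/pkg/store/storepb"
)

func main() {
	agg := store.NewSeriesStatsAggregator(
		prometheus.NewRegistry(),
		[]float64{0.1, 0.25, 0.75, 1.25, 1.75, 2.5, 3, 5, 10}, // duration buckets
		[]int64{100, 1000, 10000, 100000, 1000000},            // samples_le buckets
		[]int64{10, 100, 1000, 10000, 100000},                 // series_le buckets
	)

	// A query that touched 1001 series and 10010 samples across all stores.
	agg.Aggregate(storepb.SeriesStatsCounter{Series: 1001, Samples: 10010, Chunks: 1001})

	// findBucket picks the first bucket the value is strictly below (or the
	// largest bucket on overflow), so this observation is recorded as
	// thanos_store_api_query_duration_seconds{series_le="10000",samples_le="100000"},
	// the exact series the e2e test below waits for.
	agg.Observe(0.42)
}
```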
diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go
index 7fc56bda97..04b425061a 100644
--- a/test/e2e/query_test.go
+++ b/test/e2e/query_test.go
@@ -23,6 +23,7 @@ import (
 	"github.com/chromedp/cdproto/network"
 	"github.com/chromedp/chromedp"
 	"github.com/efficientgo/e2e"
+	e2edb "github.com/efficientgo/e2e/db"
 	e2emon "github.com/efficientgo/e2e/monitoring"
 	"github.com/go-kit/log"
 	"github.com/gogo/protobuf/proto"
@@ -578,6 +579,130 @@ func newSample(s fakeMetricSample) model.Sample {
 	}
 }
 
+func TestQueryStoreMetrics(t *testing.T) {
+	t.Parallel()
+
+	// Set up the test environment.
+	e, err := e2e.New(e2e.WithName("storemetrics01"))
+	testutil.Ok(t, err)
+	t.Cleanup(e2ethanos.CleanScenario(t, e))
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	t.Cleanup(cancel)
+
+	bucket := "store-gw-test"
+	minio := e2ethanos.NewMinio(e, "thanos-minio", bucket)
+	testutil.Ok(t, e2e.StartAndWaitReady(minio))
+
+	l := log.NewLogfmtLogger(os.Stdout)
+	bkt, err := s3.NewBucketWithConfig(l, e2ethanos.NewS3Config(bucket, minio.Endpoint("https"), minio.Dir()), "test")
+	testutil.Ok(t, err)
+
+	// Prepare two blocks of different sizes for the test.
+	{
+		blockSizes := []struct {
+			samples int
+			series  int
+			name    string
+		}{
+			{samples: 10, series: 1, name: "one_series"},
+			{samples: 10, series: 1001, name: "thousand_one_series"},
+		}
+		now := time.Now()
+		externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0")
+		dir := filepath.Join(e.SharedDir(), "tmp")
+		testutil.Ok(t, os.MkdirAll(dir, os.ModePerm))
+		for _, blockSize := range blockSizes {
+			series := make([]labels.Labels, blockSize.series)
+			for i := 0; i < blockSize.series; i++ {
+				bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i))
+				series[i] = bigSeriesLabels
+			}
+			blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx,
+				dir,
+				series,
+				blockSize.samples,
+				timestamp.FromTime(now),
+				timestamp.FromTime(now.Add(2*time.Hour)),
+				30*time.Minute,
+				externalLabels,
+				0,
+				metadata.NoneFunc,
+			)
+			testutil.Ok(t, err)
+			testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String()))
+		}
+	}
+
+	storeGW := e2ethanos.NewStoreGW(
+		e,
+		"s1",
+		client.BucketConfig{
+			Type:   client.S3,
+			Config: e2ethanos.NewS3Config(bucket, minio.InternalEndpoint("https"), minio.InternalDir()),
+		},
+		"",
+		nil,
+	)
+	querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init()
+	testutil.Ok(t, e2e.StartAndWaitReady(storeGW, querier))
+	testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced"))
+
+	// Querying the series in the previously created blocks to ensure we produce Store API query metrics.
+	{
+		instantQuery(t, ctx, querier.Endpoint("http"), func() string {
+			return "max_over_time(one_series{instance='foo_0'}[2h])"
+		}, time.Now, promclient.QueryOptions{
+			Deduplicate: true,
+		}, 1)
+		testutil.Ok(t, err)
+
+		instantQuery(t, ctx, querier.Endpoint("http"), func() string {
+			return "max_over_time(thousand_one_series[2h])"
+		}, time.Now, promclient.QueryOptions{
+			Deduplicate: true,
+		}, 1001)
+		testutil.Ok(t, err)
+	}
+
+	mon, err := e2emon.Start(e)
+	testutil.Ok(t, err)
+
+	queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string {
+		return "thanos_store_api_query_duration_seconds_count{samples_le='100000',series_le='10000'}"
+	}, time.Now, promclient.QueryOptions{
+		Deduplicate: true,
+	}, model.Vector{
+		&model.Sample{
+			Metric: model.Metric{
+				"__name__":   "thanos_store_api_query_duration_seconds_count",
+				"instance":   "storemetrics01-querier-1:8080",
+				"job":        "querier-1",
+				"samples_le": "100000",
+				"series_le":  "10000",
+			},
+			Value: model.SampleValue(1),
+		},
+	})
+
+	queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string {
+		return "thanos_store_api_query_duration_seconds_count{samples_le='100',series_le='10'}"
+	}, time.Now, promclient.QueryOptions{
+		Deduplicate: true,
+	}, model.Vector{
+		&model.Sample{
+			Metric: model.Metric{
+				"__name__":   "thanos_store_api_query_duration_seconds_count",
+				"instance":   "storemetrics01-querier-1:8080",
+				"job":        "querier-1",
+				"samples_le": "100",
+				"series_le":  "10",
+			},
+			Value: model.SampleValue(1),
+		},
+	})
+}
+
 // Regression test for https://github.com/thanos-io/thanos/issues/5033.
// Tests whether queries work with mixed sources, and with functions // that we are pushing down: min, max, min_over_time, max_over_time, @@ -882,18 +1007,10 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin "msg", fmt.Sprintf("Waiting for %d results for query %s", expectedSeriesLen, q()), ) testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { - res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) + res, err := simpleInstantQuery(t, ctx, addr, q, ts, opts, expectedSeriesLen) if err != nil { return err } - - if len(warnings) > 0 { - return errors.Errorf("unexpected warnings %s", warnings) - } - - if len(res) != expectedSeriesLen { - return errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) - } result = res return nil })) @@ -901,6 +1018,24 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin return result } +func simpleInstantQuery(t testing.TB, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expectedSeriesLen int) (model.Vector, error) { + res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) + if err != nil { + return nil, err + } + + if len(warnings) > 0 { + return nil, errors.Errorf("unexpected warnings %s", warnings) + } + + if len(res) != expectedSeriesLen { + return nil, errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) + } + + sortResults(res) + return res, nil +} + func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) { t.Helper() @@ -912,7 +1047,7 @@ func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() "caller", "queryWaitAndAssert", "msg", fmt.Sprintf("Waiting for %d results for query %s", len(expected), q()), ) - testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.RetryWithLog(logger, 10*time.Second, ctx.Done(), func() error { res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err From c509c0e9f22deb3f75b707f88285548876eb7ffb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Giedrius=20Statkevi=C4=8Dius?= Date: Tue, 18 Oct 2022 12:48:11 +0300 Subject: [PATCH 24/43] docs: mark me as shepherd for next release (#5797) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's release the RC on Friday. 
Signed-off-by: Giedrius Statkevičius Signed-off-by: Giedrius Statkevičius Signed-off-by: utukj --- docs/release-process.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-process.md b/docs/release-process.md index 84415b0f11..1d99961df5 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -24,7 +24,7 @@ Release shepherd responsibilities: | Release | Time of first RC | Shepherd (GitHub handle) | |---------|----------------------|-------------------------------| | v0.30.0 | (planned) 2022.11.21 | No one ATM | -| v0.29.0 | (planned) 2022.09.29 | No one ATM | +| v0.29.0 | 2022.10.21 | `@GiedriusS` | | v0.28.0 | 2022.08.22 | `@yeya24` | | v0.27.0 | 2022.06.21 | `@wiardvanrij` and `@matej-g` | | v0.26.0 | 2022.04.29 | `@wiardvanrij` | From f109590616f9a2810e151934bd1fcdc60128d547 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 17:12:50 +0100 Subject: [PATCH 25/43] Revert "docs: mark me as shepherd for next release (#5797)" This reverts commit c509c0e9f22deb3f75b707f88285548876eb7ffb. Signed-off-by: utukj --- docs/release-process.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-process.md b/docs/release-process.md index 1d99961df5..84415b0f11 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -24,7 +24,7 @@ Release shepherd responsibilities: | Release | Time of first RC | Shepherd (GitHub handle) | |---------|----------------------|-------------------------------| | v0.30.0 | (planned) 2022.11.21 | No one ATM | -| v0.29.0 | 2022.10.21 | `@GiedriusS` | +| v0.29.0 | (planned) 2022.09.29 | No one ATM | | v0.28.0 | 2022.08.22 | `@yeya24` | | v0.27.0 | 2022.06.21 | `@wiardvanrij` and `@matej-g` | | v0.26.0 | 2022.04.29 | `@wiardvanrij` | From 5c1dd949cdeec55f57b1232d4f2d52fe7484e245 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 17:54:32 +0100 Subject: [PATCH 26/43] Revert "Updates busybox SHA (#5793)" This reverts commit ad11a03ebeb6ae10ed7ad7a6336365f33a07538b. Signed-off-by: utukj --- .busybox-versions | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.busybox-versions b/.busybox-versions index dfaea69d50..afcacb3c77 100644 --- a/.busybox-versions +++ b/.busybox-versions @@ -1,6 +1,6 @@ # Auto generated by busybox-updater.sh. DO NOT EDIT -amd64=c9f983fc55b0b74723a69c31688cca7d5a2e5b2af7c954780f29a331817982f3 -arm64=1349554b18d6c349a390929c2a4855fadb003b2243aabf2cc71b931068c69279 -arm=be08b36d0e8f90b6fb317d29582c632ce365a00648a81c4022c4ff79df928ad9 -ppc64le=d44f541b0df83608110e695b9a1e71604ab94924954a1b18f6d76c4b5871cadd -s390x=007b2b388c575d00c7234d29227bbb8216786d7ba3f86d82696dc6fe86ac1ec0 +amd64=d8d3654786836cad8c09543704807c7a6d75de53b9e9cd21a1bbd8cb1a607004 +arm64=a3435ee186dbf88238388c112761488ecd2c264dbff8957ab73f804be62a9080 +arm=b063a2176f23a13007de5c447ab3552f8e355162ac54fc2a545b00b612d4c81e +ppc64le=203c3f97bc34c4d5df50bd61beaa397f2a4c7cbd470c84fe7ec3db12409435d3 +s390x=1a6eb305bd08bd1d38cb85a097ad776a78dd72b7c1a35094bb080788a39b174c From 60f308c78f38b91a8e7440334af98e998576ac14 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 17:55:37 +0100 Subject: [PATCH 27/43] Revert "Query: add query metrics to calls going through the Store API (#5741)" This reverts commit 7a77769258a138394be68a018be5f0c871afee7a. 
Signed-off-by: utukj --- CHANGELOG.md | 1 - cmd/thanos/query.go | 19 +---- docs/components/query.md | 19 ----- pkg/api/query/grpc.go | 2 - pkg/api/query/v1.go | 99 +++------------------- pkg/api/query/v1_test.go | 6 +- pkg/query/querier.go | 79 +++-------------- pkg/query/querier_test.go | 37 ++------ pkg/query/query_bench_test.go | 13 ++- pkg/query/query_test.go | 11 +-- pkg/store/telemetry.go | 88 ------------------- test/e2e/query_test.go | 155 +++------------------------------- 12 files changed, 51 insertions(+), 478 deletions(-) delete mode 100644 pkg/store/telemetry.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e2e854ce2..6e1d2143c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,6 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. -- [#5741](https://github.com/thanos-io/thanos/pull/5741) Query: add metrics on how much data is being selected by downstream Store APIs. - [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. ### Changed diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 5e5a7fc7cd..54724f59a6 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -25,8 +25,6 @@ import ( "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" - "google.golang.org/grpc" - v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-community/promql-engine/engine" apiv1 "github.com/thanos-io/thanos/pkg/api/query" @@ -56,6 +54,7 @@ import ( "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/tls" "github.com/thanos-io/thanos/pkg/ui" + "google.golang.org/grpc" ) const ( @@ -206,10 +205,6 @@ func registerQuery(app *extkingpin.App) { alertQueryURL := cmd.Flag("alert.query-url", "The external Thanos Query URL that would be set in all alerts 'Source' field.").String() grpcProxyStrategy := cmd.Flag("grpc.proxy-strategy", "Strategy to use when proxying Series requests to leaf nodes. 
Hidden and only used for testing, will be removed after lazy becomes the default.").Default(string(store.EagerRetrieval)).Hidden().Enum(string(store.EagerRetrieval), string(store.LazyRetrieval)) - queryTelemetryDurationQuantiles := cmd.Flag("query.telemetry.request-duration-seconds-quantiles", "The quantiles for exporting metrics about the request duration quantiles.").Default("0.1", "0.25", "0.75", "1.25", "1.75", "2.5", "3", "5", "10").Float64List() - queryTelemetrySamplesQuantiles := cmd.Flag("query.telemetry.request-samples-quantiles", "The quantiles for exporting metrics about the samples count quantiles.").Default("100", "1000", "10000", "100000", "1000000").Int64List() - queryTelemetrySeriesQuantiles := cmd.Flag("query.telemetry.request-series-seconds-quantiles", "The quantiles for exporting metrics about the series count quantiles.").Default("10", "100", "1000", "10000", "100000").Int64List() - cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { selectorLset, err := parseFlagLabels(*selectorLabels) if err != nil { @@ -322,9 +317,6 @@ func registerQuery(app *extkingpin.App) { *alertQueryURL, *grpcProxyStrategy, component.Query, - *queryTelemetryDurationQuantiles, - *queryTelemetrySamplesQuantiles, - *queryTelemetrySeriesQuantiles, promqlEngineType(*promqlEngine), ) }) @@ -398,9 +390,6 @@ func runQuery( alertQueryURL string, grpcProxyStrategy string, comp component.Component, - queryTelemetryDurationQuantiles []float64, - queryTelemetrySamplesQuantiles []int64, - queryTelemetrySeriesQuantiles []int64, promqlEngine promqlEngineType, ) error { if alertQueryURL == "" { @@ -705,12 +694,6 @@ func runQuery( extprom.WrapRegistererWithPrefix("thanos_query_concurrent_", reg), maxConcurrentQueries, ), - store.NewSeriesStatsAggregator( - reg, - queryTelemetryDurationQuantiles, - queryTelemetrySamplesQuantiles, - queryTelemetrySeriesQuantiles, - ), reg, ) diff --git a/docs/components/query.md b/docs/components/query.md index c3690ca05a..1a028ee3ed 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -381,15 +381,6 @@ Flags: be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. - --query.telemetry.request-duration-seconds-quantiles=0.1... ... - The quantiles for exporting metrics about the - request duration quantiles. - --query.telemetry.request-samples-quantiles=100... ... - The quantiles for exporting metrics about the - samples count quantiles. - --query.telemetry.request-series-seconds-quantiles=10... ... - The quantiles for exporting metrics about the - series count quantiles. --query.timeout=2m Maximum time to process query by query node. --request.logging-config= Alternative to 'request.logging-config-file' @@ -472,13 +463,3 @@ Flags: of Prometheus. ``` - -## Exported metrics - -Thanos Query also exports metrics about its own performance. You can find a list with these metrics below. - -**Disclaimer**: this list is incomplete. The remaining metrics will be added over time. - -| Name | Type | Labels | Description | -|-----------------------------------------|-----------|-----------------------|-------------------------------------------------------------------------------------------------------------------| -| thanos_store_api_query_duration_seconds | Histogram | samples_le, series_le | Duration of the Thanos Store API select phase for a query according to the amount of samples and series selected. 
| diff --git a/pkg/api/query/grpc.go b/pkg/api/query/grpc.go index 8848cd2ffe..144166f57b 100644 --- a/pkg/api/query/grpc.go +++ b/pkg/api/query/grpc.go @@ -94,7 +94,6 @@ func (g *GRPCAPI) Query(request *querypb.QueryRequest, server querypb.Query_Quer request.EnableQueryPushdown, false, request.ShardInfo, - query.NoopSeriesStatsReporter, ) qry, err := g.queryEngine.NewInstantQuery(queryable, &promql.QueryOpts{LookbackDelta: lookbackDelta}, request.Query, ts) if err != nil { @@ -169,7 +168,6 @@ func (g *GRPCAPI) QueryRange(request *querypb.QueryRangeRequest, srv querypb.Que request.EnableQueryPushdown, false, request.ShardInfo, - query.NoopSeriesStatsReporter, ) startTime := time.Unix(request.StartTimeSeconds, 0) diff --git a/pkg/api/query/v1.go b/pkg/api/query/v1.go index 918bcbf5fd..cbe1327a36 100644 --- a/pkg/api/query/v1.go +++ b/pkg/api/query/v1.go @@ -41,8 +41,10 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" - "github.com/prometheus/prometheus/util/stats" v1 "github.com/prometheus/prometheus/web/api/v1" + + "github.com/prometheus/prometheus/util/stats" + "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/exemplars" "github.com/thanos-io/thanos/pkg/exemplars/exemplarspb" @@ -55,7 +57,6 @@ import ( "github.com/thanos-io/thanos/pkg/rules" "github.com/thanos-io/thanos/pkg/rules/rulespb" "github.com/thanos-io/thanos/pkg/runutil" - "github.com/thanos-io/thanos/pkg/store" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/targets" "github.com/thanos-io/thanos/pkg/targets/targetspb" @@ -106,13 +107,6 @@ type QueryAPI struct { defaultMetadataTimeRange time.Duration queryRangeHist prometheus.Histogram - - seriesStatsAggregator seriesQueryPerformanceMetricsAggregator -} - -type seriesQueryPerformanceMetricsAggregator interface { - Aggregate(seriesStats storepb.SeriesStatsCounter) - Observe(duration float64) } // NewQueryAPI returns an initialized QueryAPI type. 
@@ -140,12 +134,8 @@ func NewQueryAPI( defaultMetadataTimeRange time.Duration, disableCORS bool, gate gate.Gate, - statsAggregator seriesQueryPerformanceMetricsAggregator, reg *prometheus.Registry, ) *QueryAPI { - if statsAggregator == nil { - statsAggregator = &store.NoopSeriesStatsAggregator{} - } return &QueryAPI{ baseAPI: api.NewBaseAPI(logger, disableCORS, flagsMap), logger: logger, @@ -170,7 +160,6 @@ func NewQueryAPI( defaultInstantQueryMaxSourceResolution: defaultInstantQueryMaxSourceResolution, defaultMetadataTimeRange: defaultMetadataTimeRange, disableCORS: disableCORS, - seriesStatsAggregator: statsAggregator, queryRangeHist: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "thanos_query_range_requested_timespan_duration_seconds", @@ -407,24 +396,7 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro span, ctx := tracing.StartSpan(ctx, "promql_instant_query") defer span.Finish() - var seriesStats []storepb.SeriesStatsCounter - qry, err := qapi.queryEngine.NewInstantQuery( - qapi.queryableCreate( - enableDedup, - replicaLabels, - storeDebugMatchers, - maxSourceResolution, - enablePartialResponse, - qapi.enableQueryPushdown, - false, - shardInfo, - query.NewAggregateStatsReporter(&seriesStats), - ), - &promql.QueryOpts{LookbackDelta: lookbackDelta}, - r.FormValue("query"), - ts, - ) - + qry, err := qapi.queryEngine.NewInstantQuery(qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), ts) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: err}, func() {} } @@ -437,7 +409,6 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } defer qapi.gate.Done() - beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -450,10 +421,6 @@ func (qapi *QueryAPI) query(r *http.Request) (interface{}, []error, *api.ApiErro } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } - for i := range seriesStats { - qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) - } - qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. 
var qs stats.QueryStats @@ -558,19 +525,8 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap span, ctx := tracing.StartSpan(ctx, "promql_range_query") defer span.Finish() - var seriesStats []storepb.SeriesStatsCounter qry, err := qapi.queryEngine.NewRangeQuery( - qapi.queryableCreate( - enableDedup, - replicaLabels, - storeDebugMatchers, - maxSourceResolution, - enablePartialResponse, - qapi.enableQueryPushdown, - false, - shardInfo, - query.NewAggregateStatsReporter(&seriesStats), - ), + qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, maxSourceResolution, enablePartialResponse, qapi.enableQueryPushdown, false, shardInfo), &promql.QueryOpts{LookbackDelta: lookbackDelta}, r.FormValue("query"), start, @@ -589,7 +545,6 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } defer qapi.gate.Done() - beforeRange := time.Now() res := qry.Exec(ctx) if res.Err != nil { switch res.Err.(type) { @@ -600,10 +555,6 @@ func (qapi *QueryAPI) queryRange(r *http.Request) (interface{}, []error, *api.Ap } return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: res.Err}, qry.Close } - for i := range seriesStats { - qapi.seriesStatsAggregator.Aggregate(seriesStats[i]) - } - qapi.seriesStatsAggregator.Observe(time.Since(beforeRange).Seconds()) // Optional stats field in response if parameter "stats" is not empty. var qs stats.QueryStats @@ -649,17 +600,8 @@ func (qapi *QueryAPI) labelValues(r *http.Request) (interface{}, []error, *api.A matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate( - true, - nil, - storeDebugMatchers, - 0, - enablePartialResponse, - qapi.enableQueryPushdown, - true, - nil, - query.NoopSeriesStatsReporter, - ).Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). + Querier(ctx, timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -745,18 +687,8 @@ func (qapi *QueryAPI) series(r *http.Request) (interface{}, []error, *api.ApiErr return nil, nil, apiErr, func() {} } - q, err := qapi.queryableCreate( - enableDedup, - replicaLabels, - storeDebugMatchers, - math.MaxInt64, - enablePartialResponse, - qapi.enableQueryPushdown, - true, - nil, - query.NoopSeriesStatsReporter, - ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) - + q, err := qapi.queryableCreate(enableDedup, replicaLabels, storeDebugMatchers, math.MaxInt64, enablePartialResponse, qapi.enableQueryPushdown, true, nil). + Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } @@ -805,17 +737,8 @@ func (qapi *QueryAPI) labelNames(r *http.Request) (interface{}, []error, *api.Ap matcherSets = append(matcherSets, matchers) } - q, err := qapi.queryableCreate( - true, - nil, - storeDebugMatchers, - 0, - enablePartialResponse, - qapi.enableQueryPushdown, - true, - nil, - query.NoopSeriesStatsReporter, - ).Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) + q, err := qapi.queryableCreate(true, nil, storeDebugMatchers, 0, enablePartialResponse, qapi.enableQueryPushdown, true, nil). 
+ Querier(r.Context(), timestamp.FromTime(start), timestamp.FromTime(end)) if err != nil { return nil, nil, &api.ApiError{Typ: api.ErrorExec, Err: err}, func() {} } diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 07c562af9c..000410ddbd 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -44,8 +44,9 @@ import ( "github.com/prometheus/prometheus/tsdb/tsdbutil" promgate "github.com/prometheus/prometheus/util/gate" "github.com/prometheus/prometheus/util/stats" - baseAPI "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/compact" + + baseAPI "github.com/thanos-io/thanos/pkg/api" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/gate" "github.com/thanos-io/thanos/pkg/query" @@ -197,7 +198,6 @@ func TestQueryEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } start := time.Unix(0, 0) @@ -737,7 +737,6 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } apiWithLabelLookback := &QueryAPI{ baseAPI: &baseAPI.BaseAPI{ @@ -751,7 +750,6 @@ func TestMetadataEndpoints(t *testing.T) { queryRangeHist: promauto.With(prometheus.NewRegistry()).NewHistogram(prometheus.HistogramOpts{ Name: "query_range_hist", }), - seriesStatsAggregator: &store.NoopSeriesStatsAggregator{}, } var tests = []endpointTestCase{ diff --git a/pkg/query/querier.go b/pkg/query/querier.go index b094cbd45c..361834c07d 100644 --- a/pkg/query/querier.go +++ b/pkg/query/querier.go @@ -7,7 +7,6 @@ import ( "context" "sort" "strings" - "sync" "time" "github.com/go-kit/log" @@ -29,60 +28,21 @@ import ( "github.com/thanos-io/thanos/pkg/tracing" ) -type seriesStatsReporter func(seriesStats storepb.SeriesStatsCounter) - -var NoopSeriesStatsReporter seriesStatsReporter = func(_ storepb.SeriesStatsCounter) {} - -func NewAggregateStatsReporter(stats *[]storepb.SeriesStatsCounter) seriesStatsReporter { - var mutex sync.Mutex - return func(s storepb.SeriesStatsCounter) { - mutex.Lock() - defer mutex.Unlock() - *stats = append(*stats, s) - } -} - // QueryableCreator returns implementation of promql.Queryable that fetches data from the proxy store API endpoints. // If deduplication is enabled, all data retrieved from it will be deduplicated along all replicaLabels by default. // When the replicaLabels argument is not empty it overwrites the global replicaLabels flag. This allows specifying // replicaLabels at query time. // maxResolutionMillis controls downsampling resolution that is allowed (specified in milliseconds). // partialResponse controls `partialResponseDisabled` option of StoreAPI and partial response behavior of proxy. -type QueryableCreator func( - deduplicate bool, - replicaLabels []string, - storeDebugMatchers [][]*labels.Matcher, - maxResolutionMillis int64, - partialResponse, - enableQueryPushdown, - skipChunks bool, - shardInfo *storepb.ShardInfo, - seriesStatsReporter seriesStatsReporter, -) storage.Queryable +type QueryableCreator func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable // NewQueryableCreator creates QueryableCreator. 
-func NewQueryableCreator( - logger log.Logger, - reg prometheus.Registerer, - proxy storepb.StoreServer, - maxConcurrentSelects int, - selectTimeout time.Duration, -) QueryableCreator { +func NewQueryableCreator(logger log.Logger, reg prometheus.Registerer, proxy storepb.StoreServer, maxConcurrentSelects int, selectTimeout time.Duration) QueryableCreator { duration := promauto.With( extprom.WrapRegistererWithPrefix("concurrent_selects_", reg), ).NewHistogram(gate.DurationHistogramOpts) - return func( - deduplicate bool, - replicaLabels []string, - storeDebugMatchers [][]*labels.Matcher, - maxResolutionMillis int64, - partialResponse, - enableQueryPushdown, - skipChunks bool, - shardInfo *storepb.ShardInfo, - seriesStatsReporter seriesStatsReporter, - ) storage.Queryable { + return func(deduplicate bool, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, maxResolutionMillis int64, partialResponse, enableQueryPushdown, skipChunks bool, shardInfo *storepb.ShardInfo) storage.Queryable { return &queryable{ logger: logger, replicaLabels: replicaLabels, @@ -99,7 +59,6 @@ func NewQueryableCreator( selectTimeout: selectTimeout, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, - seriesStatsReporter: seriesStatsReporter, } } } @@ -118,12 +77,11 @@ type queryable struct { selectTimeout time.Duration enableQueryPushdown bool shardInfo *storepb.ShardInfo - seriesStatsReporter seriesStatsReporter } // Querier returns a new storage querier against the underlying proxy store API. func (q *queryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) { - return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo, q.seriesStatsReporter), nil + return newQuerier(ctx, q.logger, mint, maxt, q.replicaLabels, q.storeDebugMatchers, q.proxy, q.deduplicate, q.maxResolutionMillis, q.partialResponse, q.enableQueryPushdown, q.skipChunks, q.gateProviderFn(), q.selectTimeout, q.shardInfo), nil } type querier struct { @@ -142,7 +100,6 @@ type querier struct { selectGate gate.Gate selectTimeout time.Duration shardInfo *storepb.ShardInfo - seriesStatsReporter seriesStatsReporter } // newQuerier creates implementation of storage.Querier that fetches data from the proxy @@ -150,20 +107,16 @@ type querier struct { func newQuerier( ctx context.Context, logger log.Logger, - mint, - maxt int64, + mint, maxt int64, replicaLabels []string, storeDebugMatchers [][]*labels.Matcher, proxy storepb.StoreServer, deduplicate bool, maxResolutionMillis int64, - partialResponse, - enableQueryPushdown, - skipChunks bool, + partialResponse, enableQueryPushdown bool, skipChunks bool, selectGate gate.Gate, selectTimeout time.Duration, shardInfo *storepb.ShardInfo, - seriesStatsReporter seriesStatsReporter, ) *querier { if logger == nil { logger = log.NewNopLogger() @@ -192,7 +145,6 @@ func newQuerier( skipChunks: skipChunks, enableQueryPushdown: enableQueryPushdown, shardInfo: shardInfo, - seriesStatsReporter: seriesStatsReporter, } } @@ -205,9 +157,8 @@ type seriesServer struct { storepb.Store_SeriesServer ctx context.Context - seriesSet []storepb.Series - seriesSetStats storepb.SeriesStatsCounter - warnings []string + seriesSet []storepb.Series + warnings []string } func (s *seriesServer) Send(r *storepb.SeriesResponse) error { @@ -218,7 +169,6 @@ func (s *seriesServer) Send(r *storepb.SeriesResponse) error { if 
r.GetSeries() != nil { s.seriesSet = append(s.seriesSet, *r.GetSeries()) - s.seriesSetStats.Count(r.GetSeries()) return nil } @@ -307,12 +257,11 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match span, ctx := tracing.StartSpan(ctx, "querier_select_select_fn") defer span.Finish() - set, stats, err := q.selectFn(ctx, hints, ms...) + set, err := q.selectFn(ctx, hints, ms...) if err != nil { promise <- storage.ErrSeriesSet(err) return } - q.seriesStatsReporter(stats) promise <- set }() @@ -330,10 +279,10 @@ func (q *querier) Select(_ bool, hints *storage.SelectHints, ms ...*labels.Match }} } -func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, storepb.SeriesStatsCounter, error) { +func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms ...*labels.Matcher) (storage.SeriesSet, error) { sms, err := storepb.PromMatchersToMatchers(ms...) if err != nil { - return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "convert matchers") + return nil, errors.Wrap(err, "convert matchers") } aggrs := aggrsFromFunc(hints.Func) @@ -361,7 +310,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . Step: hints.Step, Range: hints.Range, }, resp); err != nil { - return nil, storepb.SeriesStatsCounter{}, errors.Wrap(err, "proxy Series()") + return nil, errors.Wrap(err, "proxy Series()") } var warns storage.Warnings @@ -393,7 +342,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . set: newStoreSeriesSet(resp.seriesSet), aggrs: aggrs, warns: warns, - }, resp.seriesSetStats, nil + }, nil } // TODO(fabxc): this could potentially pushed further down into the store API to make true streaming possible. @@ -408,7 +357,7 @@ func (q *querier) selectFn(ctx context.Context, hints *storage.SelectHints, ms . // The merged series set assembles all potentially-overlapping time ranges of the same series into a single one. // TODO(bwplotka): We could potentially dedup on chunk level, use chunk iterator for that when available. 
- return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), resp.seriesSetStats, nil + return dedup.NewSeriesSet(set, q.replicaLabels, hints.Func, q.enableQueryPushdown), nil } // sortDedupLabels re-sorts the set so that the same series with different replica diff --git a/pkg/query/querier_test.go b/pkg/query/querier_test.go index 2e31fa65a0..a43c75e7a5 100644 --- a/pkg/query/querier_test.go +++ b/pkg/query/querier_test.go @@ -44,17 +44,7 @@ func TestQueryableCreator_MaxResolution(t *testing.T) { queryableCreator := NewQueryableCreator(nil, nil, testProxy, 2, 5*time.Second) oneHourMillis := int64(1*time.Hour) / int64(time.Millisecond) - queryable := queryableCreator( - false, - nil, - nil, - oneHourMillis, - false, - false, - false, - nil, - NoopSeriesStatsReporter, - ) + queryable := queryableCreator(false, nil, nil, oneHourMillis, false, false, false, nil) q, err := queryable.Querier(context.Background(), 0, 42) testutil.Ok(t, err) @@ -81,22 +71,7 @@ func TestQuerier_DownsampledData(t *testing.T) { } timeout := 10 * time.Second - q := NewQueryableCreator( - nil, - nil, - testProxy, - 2, - timeout, - )(false, - nil, - nil, - 9999999, - false, - false, - false, - nil, - NoopSeriesStatsReporter, - ) + q := NewQueryableCreator(nil, nil, testProxy, 2, timeout)(false, nil, nil, 9999999, false, false, false, nil) engine := promql.NewEngine( promql.EngineOpts{ MaxSamples: math.MaxInt32, @@ -390,7 +365,7 @@ func TestQuerier_Select_AfterPromQL(t *testing.T) { g := gate.New(2) mq := &mockedQueryable{ Creator: func(mint, maxt int64) storage.Querier { - return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) + return newQuerier(context.Background(), nil, mint, maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) }, } t.Cleanup(func() { @@ -634,7 +609,7 @@ func TestQuerier_Select(t *testing.T) { {dedup: true, expected: []series{tcase.expectedAfterDedup}}, } { g := gate.New(2) - q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil, func(i storepb.SeriesStatsCounter) {}) + q := newQuerier(context.Background(), nil, tcase.mint, tcase.maxt, tcase.replicaLabels, nil, tcase.storeAPI, sc.dedup, 0, true, false, false, g, timeout, nil) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) t.Run(fmt.Sprintf("dedup=%v", sc.dedup), func(t *testing.T) { @@ -863,7 +838,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 100 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, false, 0, true, false, false, g, timeout, nil) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) @@ -933,7 +908,7 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { timeout := 5 * time.Second g := gate.New(2) - q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil, NoopSeriesStatsReporter) + q := newQuerier(context.Background(), logger, realSeriesWithStaleMarkerMint, 
realSeriesWithStaleMarkerMaxt, []string{"replica"}, nil, s, true, 0, true, false, false, g, timeout, nil) t.Cleanup(func() { testutil.Ok(t, q.Close()) }) diff --git a/pkg/query/query_bench_test.go b/pkg/query/query_bench_test.go index 84efb46820..301c880877 100644 --- a/pkg/query/query_bench_test.go +++ b/pkg/query/query_bench_test.go @@ -80,13 +80,12 @@ func benchQuerySelect(t testutil.TB, totalSamples, totalSeries int, dedup bool) logger := log.NewNopLogger() q := &querier{ - ctx: context.Background(), - logger: logger, - proxy: &mockedStoreServer{responses: resps}, - replicaLabels: map[string]struct{}{"a_replica": {}}, - deduplicate: dedup, - selectGate: gate.NewNoop(), - seriesStatsReporter: NoopSeriesStatsReporter, + ctx: context.Background(), + logger: logger, + proxy: &mockedStoreServer{responses: resps}, + replicaLabels: map[string]struct{}{"a_replica": {}}, + deduplicate: dedup, + selectGate: gate.NewNoop(), } testSelect(t, q, expectedSeries) } diff --git a/pkg/query/query_test.go b/pkg/query/query_test.go index 060571fc70..99e29be66f 100644 --- a/pkg/query/query_test.go +++ b/pkg/query/query_test.go @@ -54,16 +54,7 @@ func TestQuerier_Proxy(t *testing.T) { name: fmt.Sprintf("store number %v", i), }) } - return q(true, - nil, - nil, - 0, - false, - false, - false, - nil, - NoopSeriesStatsReporter, - ) + return q(true, nil, nil, 0, false, false, false, nil) } for _, fn := range files { diff --git a/pkg/store/telemetry.go b/pkg/store/telemetry.go deleted file mode 100644 index a854daaf0c..0000000000 --- a/pkg/store/telemetry.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) The Thanos Authors. -// Licensed under the Apache License 2.0. - -package store - -import ( - "strconv" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/thanos-io/thanos/pkg/store/storepb" -) - -// seriesStatsAggregator aggregates results from fanned-out queries into a histogram given their -// response's shape. -type seriesStatsAggregator struct { - queryDuration *prometheus.HistogramVec - - seriesLeBuckets []int64 - samplesLeBuckets []int64 - seriesStats storepb.SeriesStatsCounter -} - -// NewSeriesStatsAggregator is a constructor for seriesStatsAggregator. -func NewSeriesStatsAggregator( - reg prometheus.Registerer, - durationQuantiles []float64, - sampleQuantiles []int64, - seriesQuantiles []int64, -) *seriesStatsAggregator { - return &seriesStatsAggregator{ - queryDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "thanos_store_api_query_duration_seconds", - Help: "Duration of the Thanos Store API select phase for a query.", - Buckets: durationQuantiles, - }, []string{"series_le", "samples_le"}), - seriesLeBuckets: seriesQuantiles, - samplesLeBuckets: sampleQuantiles, - seriesStats: storepb.SeriesStatsCounter{}, - } -} - -// Aggregate is an aggregator for merging `storepb.SeriesStatsCounter` for each incoming fanned out query. -func (s *seriesStatsAggregator) Aggregate(stats storepb.SeriesStatsCounter) { - s.seriesStats.Series += stats.Series - s.seriesStats.Samples += stats.Samples - s.seriesStats.Chunks += stats.Chunks -} - -// Observe commits the aggregated SeriesStatsCounter as an observation. 
-func (s *seriesStatsAggregator) Observe(duration float64) { - if s.seriesStats.Series == 0 || s.seriesStats.Samples == 0 || s.seriesStats.Chunks == 0 { - return - } - // Bucket matching for series/labels matchSeriesBucket/matchSamplesBucket => float64, float64 - seriesLeBucket := s.findBucket(float64(s.seriesStats.Series), s.seriesLeBuckets) - samplesLeBucket := s.findBucket(float64(s.seriesStats.Samples), s.samplesLeBuckets) - s.queryDuration.With(prometheus.Labels{ - "series_le": strconv.Itoa(int(seriesLeBucket)), - "samples_le": strconv.Itoa(int(samplesLeBucket)), - }).Observe(duration) - s.reset() -} - -func (s *seriesStatsAggregator) reset() { - s.seriesStats = storepb.SeriesStatsCounter{} -} - -func (s *seriesStatsAggregator) findBucket(value float64, quantiles []int64) int64 { - if len(quantiles) == 0 { - return 0 - } - var foundBucket int64 - for _, bucket := range quantiles { - foundBucket = bucket - if value < float64(bucket) { - break - } - } - return foundBucket -} - -// NoopSeriesStatsAggregator is a query performance series aggregator that does nothing. -type NoopSeriesStatsAggregator struct{} - -func (s *NoopSeriesStatsAggregator) Aggregate(_ storepb.SeriesStatsCounter) {} - -func (s *NoopSeriesStatsAggregator) Observe(_ float64) {} diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 04b425061a..7fc56bda97 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -23,7 +23,6 @@ import ( "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" "github.com/efficientgo/e2e" - e2edb "github.com/efficientgo/e2e/db" e2emon "github.com/efficientgo/e2e/monitoring" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -579,130 +578,6 @@ func newSample(s fakeMetricSample) model.Sample { } } -func TestQueryStoreMetrics(t *testing.T) { - t.Parallel() - - // Build up. - e, err := e2e.New(e2e.WithName("storemetrics01")) - testutil.Ok(t, err) - t.Cleanup(e2ethanos.CleanScenario(t, e)) - - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - t.Cleanup(cancel) - - bucket := "store-gw-test" - minio := e2ethanos.NewMinio(e, "thanos-minio", bucket) - testutil.Ok(t, e2e.StartAndWaitReady(minio)) - - l := log.NewLogfmtLogger(os.Stdout) - bkt, err := s3.NewBucketWithConfig(l, e2ethanos.NewS3Config(bucket, minio.Endpoint("https"), minio.Dir()), "test") - testutil.Ok(t, err) - - // Preparing 2 different blocks for the tests. 
- { - blockSizes := []struct { - samples int - series int - name string - }{ - {samples: 10, series: 1, name: "one_series"}, - {samples: 10, series: 1001, name: "thousand_one_series"}, - } - now := time.Now() - externalLabels := labels.FromStrings("prometheus", "p1", "replica", "0") - dir := filepath.Join(e.SharedDir(), "tmp") - testutil.Ok(t, os.MkdirAll(filepath.Join(e.SharedDir(), dir), os.ModePerm)) - for _, blockSize := range blockSizes { - series := make([]labels.Labels, blockSize.series) - for i := 0; i < blockSize.series; i++ { - bigSeriesLabels := labels.FromStrings("__name__", blockSize.name, "instance", fmt.Sprintf("foo_%d", i)) - series[i] = bigSeriesLabels - } - blockID, err := e2eutil.CreateBlockWithBlockDelay(ctx, - dir, - series, - blockSize.samples, - timestamp.FromTime(now), - timestamp.FromTime(now.Add(2*time.Hour)), - 30*time.Minute, - externalLabels, - 0, - metadata.NoneFunc, - ) - testutil.Ok(t, err) - testutil.Ok(t, objstore.UploadDir(ctx, l, bkt, path.Join(dir, blockID.String()), blockID.String())) - } - } - - storeGW := e2ethanos.NewStoreGW( - e, - "s1", - client.BucketConfig{ - Type: client.S3, - Config: e2ethanos.NewS3Config(bucket, minio.InternalEndpoint("https"), minio.InternalDir()), - }, - "", - nil, - ) - querier := e2ethanos.NewQuerierBuilder(e, "1", storeGW.InternalEndpoint("grpc")).Init() - testutil.Ok(t, e2e.StartAndWaitReady(storeGW, querier)) - testutil.Ok(t, storeGW.WaitSumMetrics(e2emon.Equals(2), "thanos_blocks_meta_synced")) - - // Querying the series in the previously created blocks to ensure we produce Store API query metrics. - { - instantQuery(t, ctx, querier.Endpoint("http"), func() string { - return "max_over_time(one_series{instance='foo_0'}[2h])" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, 1) - testutil.Ok(t, err) - - instantQuery(t, ctx, querier.Endpoint("http"), func() string { - return "max_over_time(thousand_one_series[2h])" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, 1001) - testutil.Ok(t, err) - } - - mon, err := e2emon.Start(e) - testutil.Ok(t, err) - - queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { - return "thanos_store_api_query_duration_seconds_count{samples_le='100000',series_le='10000'}" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, model.Vector{ - &model.Sample{ - Metric: model.Metric{ - "__name__": "thanos_store_api_query_duration_seconds_count", - "instance": "storemetrics01-querier-1:8080", - "job": "querier-1", - "samples_le": "100000", - "series_le": "10000", - }, - Value: model.SampleValue(1), - }, - }) - - queryWaitAndAssert(t, ctx, mon.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { - return "thanos_store_api_query_duration_seconds_count{samples_le='100',series_le='10'}" - }, time.Now, promclient.QueryOptions{ - Deduplicate: true, - }, model.Vector{ - &model.Sample{ - Metric: model.Metric{ - "__name__": "thanos_store_api_query_duration_seconds_count", - "instance": "storemetrics01-querier-1:8080", - "job": "querier-1", - "samples_le": "100", - "series_le": "10", - }, - Value: model.SampleValue(1), - }, - }) -} - // Regression test for https://github.com/thanos-io/thanos/issues/5033. 
// Tests whether queries work with mixed sources, and with functions // that we are pushing down: min, max, min_over_time, max_over_time, @@ -1007,10 +882,18 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin "msg", fmt.Sprintf("Waiting for %d results for query %s", expectedSeriesLen, q()), ) testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { - res, err := simpleInstantQuery(t, ctx, addr, q, ts, opts, expectedSeriesLen) + res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err } + + if len(warnings) > 0 { + return errors.Errorf("unexpected warnings %s", warnings) + } + + if len(res) != expectedSeriesLen { + return errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) + } result = res return nil })) @@ -1018,24 +901,6 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin return result } -func simpleInstantQuery(t testing.TB, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expectedSeriesLen int) (model.Vector, error) { - res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) - if err != nil { - return nil, err - } - - if len(warnings) > 0 { - return nil, errors.Errorf("unexpected warnings %s", warnings) - } - - if len(res) != expectedSeriesLen { - return nil, errors.Errorf("unexpected result size, expected %d; result %d: %v", expectedSeriesLen, len(res), res) - } - - sortResults(res) - return res, nil -} - func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) { t.Helper() @@ -1047,7 +912,7 @@ func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() "caller", "queryWaitAndAssert", "msg", fmt.Sprintf("Waiting for %d results for query %s", len(expected), q()), ) - testutil.Ok(t, runutil.RetryWithLog(logger, 10*time.Second, ctx.Done(), func() error { + testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error { res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts) if err != nil { return err From 1dea93b3c59c5a7bf57efdd7524c57795cdc2f78 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 17:55:48 +0100 Subject: [PATCH 28/43] Revert "Receive: Reload tenant limit configuration on file change (#5673)" This reverts commit 32ca3279bb995861b7f3b7ba5b9cb4cbeeddf68c. 
Signed-off-by: utukj --- CHANGELOG.md | 1 - cmd/thanos/receive.go | 46 ++--- docs/components/receive.md | 2 +- go.mod | 8 +- go.sum | 4 +- pkg/extkingpin/path_content_reloader.go | 128 ------------ pkg/extkingpin/path_content_reloader_test.go | 105 ---------- pkg/receive/handler.go | 22 +- pkg/receive/handler_test.go | 38 ++-- pkg/receive/limiter.go | 189 ++---------------- pkg/receive/limiter_config.go | 4 +- pkg/receive/limiter_config_test.go | 6 +- pkg/receive/limiter_test.go | 100 --------- pkg/receive/request_limiter.go | 31 ++- pkg/receive/request_limiter_test.go | 20 +- pkg/receive/testdata/limits.yaml | 22 -- .../limits_config/invalid_limits.yaml | 17 -- 17 files changed, 97 insertions(+), 646 deletions(-) delete mode 100644 pkg/extkingpin/path_content_reloader.go delete mode 100644 pkg/extkingpin/path_content_reloader_test.go delete mode 100644 pkg/receive/limiter_test.go delete mode 100644 pkg/receive/testdata/limits.yaml delete mode 100644 pkg/receive/testdata/limits_config/invalid_limits.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e1d2143c3..9ed82d6525 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,6 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5734](https://github.com/thanos-io/thanos/pull/5734) Store: Support disable block viewer UI. - [#5411](https://github.com/thanos-io/thanos/pull/5411) Tracing: Add OpenTelemetry Protocol exporter. - [#5779](https://github.com/thanos-io/thanos/pull/5779) Objstore: Support specifying S3 storage class. -- [#5673](https://github.com/thanos-io/thanos/pull/5673) Receive: Reload tenant limit configuration on file change. ### Changed diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index d86b560983..5c47b91dd5 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -192,19 +192,6 @@ func runReceive( return errors.Wrap(err, "parse relabel configuration") } - dbs := receive.NewMultiTSDB( - conf.dataDir, - logger, - reg, - tsdbOpts, - lset, - conf.tenantLabelName, - bkt, - conf.allowOutOfOrderUpload, - hashFunc, - ) - writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs) - var limitsConfig *receive.RootLimitsConfig if conf.limitsConfig != nil { limitsContentYaml, err := conf.limitsConfig.Content() @@ -216,11 +203,20 @@ func runReceive( return errors.Wrap(err, "parse limit configuration") } } - limiter, err := receive.NewLimiter(conf.limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) - if err != nil { - return errors.Wrap(err, "creating limiter") - } + limiter := receive.NewLimiter(limitsConfig, reg, receiveMode, log.With(logger, "component", "receive-limiter")) + dbs := receive.NewMultiTSDB( + conf.dataDir, + logger, + reg, + tsdbOpts, + lset, + conf.tenantLabelName, + bkt, + conf.allowOutOfOrderUpload, + hashFunc, + ) + writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs) webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{ Writer: writer, ListenAddress: conf.rwAddress, @@ -403,22 +399,6 @@ func runReceive( }) } - { - if limiter.CanReload() { - ctx, cancel := context.WithCancel(context.Background()) - g.Add(func() error { - level.Debug(logger).Log("msg", "limits config initialized with file watcher.") - if err := limiter.StartConfigReloader(ctx); err != nil { - return err - } - <-ctx.Done() - return nil - }, func(err error) { - cancel() - }) - } - } - level.Info(logger).Log("msg", "starting receiver") return nil } diff --git 
a/docs/components/receive.md b/docs/components/receive.md index ef4e39e35e..6fa13938e9 100644 --- a/docs/components/receive.md +++ b/docs/components/receive.md @@ -86,7 +86,7 @@ Thanos Receive has some limits and gates that can be configured to control resou To configure the gates and limits you can use one of the two options: -- `--receive.limits-config-file=`: where `` is the path to the YAML file. Any modification to the indicated file will trigger a configuration reload. If the updated configuration is invalid an error will be logged and it won't replace the previous valid configuration. +- `--receive.limits-config-file=`: where `` is the path to the YAML file. - `--receive.limits-config=`: where `` is the content of YAML file. By default all the limits and gates are **disabled**. diff --git a/go.mod b/go.mod index bee3e97fe7..13743c8020 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/dustin/go-humanize v1.0.0 github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a - github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd + github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.2.0 github.com/felixge/fgprof v0.9.2 @@ -108,7 +108,6 @@ require ( require ( github.com/efficientgo/core v1.0.0-rc.0 - github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd github.com/minio/sha256-simd v1.0.0 ) @@ -128,7 +127,10 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.10.0 ) -require go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 +require ( + github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd + go.opentelemetry.io/contrib/propagators/autoprop v0.34.0 +) require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.32.3 // indirect diff --git a/go.sum b/go.sum index 97fc0d0411..5ee9bab6be 100644 --- a/go.sum +++ b/go.sum @@ -252,8 +252,8 @@ github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a h1:cnJajqeh/Hjv github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd h1:svR6KxSP1xiPw10RN4Pd7g6BAVkEcNN628PAqZH31mM= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:OmVcnJopJL8d3X3sSXTiypGoUSgFq1aDGmlrdi9dn/M= -github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd h1:VaYzzXeUbC5fVheskcKVNOyJMEYD+HgrJNzIAg/mRIM= -github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= +github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d h1:WZV/mrUyKS9w9r+Jdw+zq/tdGAb5LwB+H37EkMLhEMA= +github.com/efficientgo/tools/extkingpin v0.0.0-20220801101838-3312908f6a9d/go.mod h1:ZV0utlglOczUWv3ih2AbqPSoLoFzdplUYxwV62eZi6Q= github.com/elastic/go-sysinfo v1.1.1/go.mod h1:i1ZYdU10oLNfRzq4vq62BEwD2fH8KaWh6eh0ikPT9F0= github.com/elastic/go-sysinfo v1.8.1 h1:4Yhj+HdV6WjbCRgGdZpPJ8lZQlXZLKDAeIkmQ/VRvi4= github.com/elastic/go-sysinfo v1.8.1/go.mod h1:JfllUnzoQV/JRYymbH3dO1yggI3mV2oTKSXsDHM+uIM= diff --git a/pkg/extkingpin/path_content_reloader.go b/pkg/extkingpin/path_content_reloader.go deleted file mode 100644 index 68c2cd252c..0000000000 --- a/pkg/extkingpin/path_content_reloader.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) The Thanos Authors. 
-// Licensed under the Apache License 2.0. - -package extkingpin - -import ( - "context" - "fmt" - "os" - "path" - "path/filepath" - "time" - - "github.com/fsnotify/fsnotify" - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/pkg/errors" -) - -type fileContent interface { - Content() ([]byte, error) - Path() string -} - -// PathContentReloader starts a file watcher that monitors the file indicated by fileContent.Path() and runs -// reloadFunc whenever a change is detected. -// A debounce timer can be configured via opts to handle situations where many "write" events are received together or -// a "create" event is followed up by a "write" event, for example. Files will be effectively reloaded at the latest -// after 2 times the debounce timer. By default the debouncer timer is 1 second. -// To ensure renames and deletes are properly handled, the file watcher is put at the file's parent folder. See -// https://github.com/fsnotify/fsnotify/issues/214 for more details. -func PathContentReloader(ctx context.Context, fileContent fileContent, logger log.Logger, reloadFunc func(), debounceTime time.Duration) error { - filePath, err := filepath.Abs(fileContent.Path()) - if err != nil { - return errors.Wrap(err, "getting absolute file path") - } - - watcher, err := fsnotify.NewWatcher() - if filePath == "" { - level.Debug(logger).Log("msg", "no path detected for config reload") - } - if err != nil { - return errors.Wrap(err, "creating file watcher") - } - go func() { - var reloadTimer *time.Timer - if debounceTime != 0 { - reloadTimer = time.AfterFunc(debounceTime, func() { - reloadFunc() - level.Debug(logger).Log("msg", "configuration reloaded after debouncing") - }) - } - defer watcher.Close() - for { - select { - case <-ctx.Done(): - if reloadTimer != nil { - reloadTimer.Stop() - } - return - case event := <-watcher.Events: - // fsnotify sometimes sends a bunch of events without name or operation. - // It's unclear what they are and why they are sent - filter them out. - if event.Name == "" { - break - } - // We are watching the file's parent folder (more details on this is done can be found below), but are - // only interested in changed to the target file. Discard every other file as quickly as possible. - if event.Name != filePath { - break - } - // We only react to files being written or created. - // On chmod or remove we have nothing to do. - // On rename we have the old file name (not useful). A create event for the new file will come later. - if event.Op&fsnotify.Write == 0 && event.Op&fsnotify.Create == 0 { - break - } - level.Debug(logger).Log("msg", fmt.Sprintf("change detected for %s", filePath), "eventName", event.Name, "eventOp", event.Op) - if reloadTimer != nil { - reloadTimer.Reset(debounceTime) - } - case err := <-watcher.Errors: - level.Error(logger).Log("msg", "watcher error", "error", err) - } - } - }() - // We watch the file's parent folder and not the file itself to better handle DELETE and RENAME events. Check - // https://github.com/fsnotify/fsnotify/issues/214 for more details. - if err := watcher.Add(path.Dir(filePath)); err != nil { - return errors.Wrapf(err, "adding path %s to file watcher", filePath) - } - return nil -} - -type staticPathContent struct { - content []byte - path string -} - -var _ fileContent = (*staticPathContent)(nil) - -// Content returns the cached content. -func (t *staticPathContent) Content() ([]byte, error) { - return t.content, nil -} - -// Path returns the path to the file that contains the content. 
-func (t *staticPathContent) Path() string { - return t.path -} - -// NewStaticPathContent creates a new content that can be used to serve a static configuration. It copies the -// configuration from `fromPath` into `destPath` to avoid confusion with file watchers. -func NewStaticPathContent(fromPath string) (*staticPathContent, error) { - content, err := os.ReadFile(fromPath) - if err != nil { - return nil, errors.Wrapf(err, "could not load test content: %s", fromPath) - } - return &staticPathContent{content, fromPath}, nil -} - -// Rewrite rewrites the file backing this staticPathContent and swaps the local content cache. The file writing -// is needed to trigger the file system monitor. -func (t *staticPathContent) Rewrite(newContent []byte) error { - t.content = newContent - // Write the file to ensure possible file watcher reloaders get triggered. - return os.WriteFile(t.path, newContent, 0666) -} diff --git a/pkg/extkingpin/path_content_reloader_test.go b/pkg/extkingpin/path_content_reloader_test.go deleted file mode 100644 index fb20f83d5c..0000000000 --- a/pkg/extkingpin/path_content_reloader_test.go +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) The Thanos Authors. -// Licensed under the Apache License 2.0. - -package extkingpin - -import ( - "context" - "os" - "path" - "sync" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/thanos-io/thanos/pkg/testutil" -) - -func TestPathContentReloader(t *testing.T) { - type args struct { - runSteps func(t *testing.T, testFile string, pathContent *staticPathContent) - } - tests := []struct { - name string - args args - wantReloads int - }{ - { - name: "Many operations, only rewrite triggers one reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - testutil.Ok(t, os.Remove(testFile)) - testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) - }, - }, - wantReloads: 1, - }, - { - name: "Many operations, only rename triggers one reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - testutil.Ok(t, os.Rename(testFile, testFile+".tmp")) - testutil.Ok(t, os.Rename(testFile+".tmp", testFile)) - }, - }, - wantReloads: 1, - }, - { - name: "Many operations, two rewrites trigger two reloads", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - testutil.Ok(t, os.Remove(testFile)) - testutil.Ok(t, pathContent.Rewrite([]byte("test modified"))) - time.Sleep(2 * time.Second) - testutil.Ok(t, pathContent.Rewrite([]byte("test modified again"))) - }, - }, - wantReloads: 1, - }, - { - name: "Chmod doesn't trigger reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Chmod(testFile, 0777)) - }, - }, - wantReloads: 0, - }, - { - name: "Remove doesn't trigger reload", - args: args{ - runSteps: func(t *testing.T, testFile string, pathContent *staticPathContent) { - testutil.Ok(t, os.Remove(testFile)) - }, - }, - wantReloads: 0, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - testFile := path.Join(t.TempDir(), "test") - testutil.Ok(t, os.WriteFile(testFile, []byte("test"), 0666)) - pathContent, err := NewStaticPathContent(testFile) - testutil.Ok(t, err) - - wg := &sync.WaitGroup{} - wg.Add(tt.wantReloads) - - ctx, cancel := context.WithCancel(context.Background()) - defer 
cancel() - reloadCount := 0 - err = PathContentReloader(ctx, pathContent, log.NewLogfmtLogger(os.Stdout), func() { - reloadCount++ - wg.Done() - }, 100*time.Millisecond) - testutil.Ok(t, err) - - tt.args.runSteps(t, testFile, pathContent) - wg.Wait() - testutil.Equals(t, tt.wantReloads, reloadCount) - }) - } -} diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 12afb752b8..156bb74566 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -17,6 +17,10 @@ import ( "sync" "time" + "github.com/thanos-io/thanos/pkg/api" + statusapi "github.com/thanos-io/thanos/pkg/api/status" + "github.com/thanos-io/thanos/pkg/logging" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -31,9 +35,6 @@ import ( "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" - "github.com/thanos-io/thanos/pkg/api" - statusapi "github.com/thanos-io/thanos/pkg/api/status" - "github.com/thanos-io/thanos/pkg/logging" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -98,7 +99,7 @@ type Options struct { ForwardTimeout time.Duration RelabelConfigs []*relabel.Config TSDBStats TSDBStats - Limiter *Limiter + Limiter *limiter } // Handler serves a Prometheus remote write receiving HTTP endpoint. @@ -123,7 +124,7 @@ type Handler struct { writeSamplesTotal *prometheus.HistogramVec writeTimeseriesTotal *prometheus.HistogramVec - Limiter *Limiter + limiter *limiter } func NewHandler(logger log.Logger, o *Options) *Handler { @@ -149,7 +150,7 @@ func NewHandler(logger log.Logger, o *Options) *Handler { Max: 30 * time.Second, Jitter: true, }, - Limiter: o.Limiter, + limiter: o.Limiter, forwardRequests: promauto.With(registerer).NewCounterVec( prometheus.CounterOpts{ Name: "thanos_receive_forward_requests_total", @@ -406,18 +407,17 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { tLogger := log.With(h.logger, "tenant", tenant) - writeGate := h.Limiter.WriteGate() tracing.DoInSpan(r.Context(), "receive_write_gate_ismyturn", func(ctx context.Context) { - err = writeGate.Start(r.Context()) + err = h.limiter.writeGate.Start(r.Context()) }) - defer writeGate.Done() if err != nil { level.Error(tLogger).Log("err", err, "msg", "internal server error") http.Error(w, err.Error(), http.StatusInternalServerError) return } + defer h.limiter.writeGate.Done() - under, err := h.Limiter.HeadSeriesLimiter.isUnderLimit(tenant) + under, err := h.limiter.HeadSeriesLimiter.isUnderLimit(tenant) if err != nil { level.Error(tLogger).Log("msg", "error while limiting", "err", err.Error()) } @@ -428,7 +428,7 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) { return } - requestLimiter := h.Limiter.RequestLimiter() + requestLimiter := h.limiter.requestLimiter // io.ReadAll dynamically adjust the byte slice for read data, starting from 512B. // Since this is receive hot path, grow upfront saving allocations and CPU time. 
compressed := bytes.Buffer{} diff --git a/pkg/receive/handler_test.go b/pkg/receive/handler_test.go index 4a2a536038..44076de141 100644 --- a/pkg/receive/handler_test.go +++ b/pkg/receive/handler_test.go @@ -13,7 +13,6 @@ import ( "net/http" "net/http/httptest" "os" - "path" "path/filepath" "runtime" "runtime/pprof" @@ -22,8 +21,6 @@ import ( "testing" "time" - "gopkg.in/yaml.v3" - "github.com/alecthomas/units" "github.com/go-kit/log" "github.com/gogo/protobuf/proto" @@ -43,7 +40,6 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/errutil" - "github.com/thanos-io/thanos/pkg/extkingpin" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/store/labelpb" "github.com/thanos-io/thanos/pkg/store/storepb" @@ -366,7 +362,6 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin }, } - limiter, _ := NewLimiter(NewNopConfig(), nil, RouterIngestor, log.NewNopLogger()) for i := range appendables { h := NewHandler(nil, &Options{ TenantHeader: DefaultTenantHeader, @@ -374,7 +369,7 @@ func newTestHandlerHashring(appendables []*fakeAppendable, replicationFactor uin ReplicationFactor: replicationFactor, ForwardTimeout: 5 * time.Second, Writer: NewWriter(log.NewNopLogger(), newFakeTenantAppendable(appendables[i])), - Limiter: limiter, + Limiter: NewLimiter(nil, nil, RouterIngestor, nil), }) handlers = append(handlers, h) h.peers = peers @@ -780,28 +775,23 @@ func TestReceiveWriteRequestLimits(t *testing.T) { } handlers, _ := newTestHandlerHashring(appendables, 3) handler := handlers[0] - tenant := "test" - tenantConfig, err := yaml.Marshal(&RootLimitsConfig{ - WriteLimits: WriteLimitsConfig{ - TenantsLimits: TenantsWriteLimitsConfig{ - tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig(). - SetSizeBytesLimit(int64(1 * units.Megabyte)). - SetSeriesLimit(20). - SetSamplesLimit(200), + handler.limiter = NewLimiter( + &RootLimitsConfig{ + WriteLimits: WriteLimitsConfig{ + TenantsLimits: TenantsWriteLimitsConfig{ + tenant: &WriteLimitConfig{ + RequestLimits: newEmptyRequestLimitsConfig(). + SetSizeBytesLimit(int64(1 * units.Megabyte)). + SetSeriesLimit(20). + SetSamplesLimit(200), + }, }, }, }, - }) - if err != nil { - t.Fatal("handler: failed to generate limit configuration") - } - tmpLimitsPath := path.Join(t.TempDir(), "limits.yaml") - testutil.Ok(t, os.WriteFile(tmpLimitsPath, tenantConfig, 0666)) - limitConfig, _ := extkingpin.NewStaticPathContent(tmpLimitsPath) - handler.Limiter, _ = NewLimiter( - limitConfig, nil, RouterIngestor, log.NewNopLogger(), + nil, + RouterIngestor, + log.NewNopLogger(), ) wreq := &prompb.WriteRequest{ diff --git a/pkg/receive/limiter.go b/pkg/receive/limiter.go index ff5bbe3199..bc3c4d8358 100644 --- a/pkg/receive/limiter.go +++ b/pkg/receive/limiter.go @@ -5,204 +5,59 @@ package receive import ( "context" - "fmt" - "sync" - "time" "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/thanos-io/thanos/pkg/extkingpin" - - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/thanos-io/thanos/pkg/extprom" "github.com/thanos-io/thanos/pkg/gate" ) -// Limiter is responsible for managing the configuration and initialization of -// different types that apply limits to the Receive instance. 
-type Limiter struct { - sync.RWMutex - requestLimiter requestLimiter - HeadSeriesLimiter headSeriesLimiter - writeGate gate.Gate - registerer prometheus.Registerer - configPathOrContent fileContent - logger log.Logger - configReloadCounter prometheus.Counter - configReloadFailedCounter prometheus.Counter - receiverMode ReceiverMode -} - -// headSeriesLimiter encompasses active/head series limiting logic. -type headSeriesLimiter interface { - QueryMetaMonitoring(context.Context) error - isUnderLimit(tenant string) (bool, error) +type limiter struct { + requestLimiter requestLimiter + writeGate gate.Gate + HeadSeriesLimiter headSeriesLimiter } +// requestLimiter encompasses logic for limiting remote write requests. type requestLimiter interface { AllowSizeBytes(tenant string, contentLengthBytes int64) bool AllowSeries(tenant string, amount int64) bool AllowSamples(tenant string, amount int64) bool } -// fileContent is an interface to avoid a direct dependency on kingpin or extkingpin. -type fileContent interface { - Content() ([]byte, error) - Path() string +// headSeriesLimiter encompasses active/head series limiting logic. +type headSeriesLimiter interface { + QueryMetaMonitoring(context.Context) error + isUnderLimit(tenant string) (bool, error) } -// NewLimiter creates a new *Limiter given a configuration and prometheus -// registerer. -func NewLimiter(configFile fileContent, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) (*Limiter, error) { - limiter := &Limiter{ +func NewLimiter(root *RootLimitsConfig, reg prometheus.Registerer, r ReceiverMode, logger log.Logger) *limiter { + limiter := &limiter{ writeGate: gate.NewNoop(), requestLimiter: &noopRequestLimiter{}, HeadSeriesLimiter: NewNopSeriesLimit(), - logger: logger, - receiverMode: r, - } - - if reg != nil { - limiter.registerer = NewUnRegisterer(reg) - limiter.configReloadCounter = promauto.With(limiter.registerer).NewCounter( - prometheus.CounterOpts{ - Namespace: "thanos", - Subsystem: "receive", - Name: "limits_config_reload_total", - Help: "How many times the limit configuration was reloaded", - }, - ) - limiter.configReloadFailedCounter = promauto.With(limiter.registerer).NewCounter( - prometheus.CounterOpts{ - Namespace: "thanos", - Subsystem: "receive", - Name: "limits_config_reload_err_total", - Help: "How many times the limit configuration failed to reload.", - }, - ) - } - - if configFile == nil { - return limiter, nil - } - - limiter.configPathOrContent = configFile - if err := limiter.loadConfig(); err != nil { - return nil, errors.Wrap(err, "load tenant limits config") - } - - return limiter, nil -} - -// StartConfigReloader starts the automatic configuration reloader based off of -// the file indicated by pathOrContent. It starts a Go routine in the given -// *run.Group. 
-func (l *Limiter) StartConfigReloader(ctx context.Context) error { - if !l.CanReload() { - return nil } - - return extkingpin.PathContentReloader(ctx, l.configPathOrContent, l.logger, func() { - level.Info(l.logger).Log("msg", "reloading limit config") - if err := l.loadConfig(); err != nil { - if failedReload := l.configReloadCounter; failedReload != nil { - failedReload.Inc() - } - errMsg := fmt.Sprintf("error reloading tenant limits config from %s", l.configPathOrContent.Path()) - level.Error(l.logger).Log("msg", errMsg, "err", err) - } - if reloadCounter := l.configReloadCounter; reloadCounter != nil { - reloadCounter.Inc() - } - }, 1*time.Second) -} - -func (l *Limiter) CanReload() bool { - if l.configPathOrContent == nil { - return false + if root == nil { + return limiter } - if l.configPathOrContent.Path() == "" { - return false - } - return true -} -func (l *Limiter) loadConfig() error { - config, err := ParseLimitConfigContent(l.configPathOrContent) - if err != nil { - return err - } - l.Lock() - defer l.Unlock() - maxWriteConcurrency := config.WriteLimits.GlobalLimits.MaxConcurrency + maxWriteConcurrency := root.WriteLimits.GlobalLimits.MaxConcurrency if maxWriteConcurrency > 0 { - l.writeGate = gate.New( + limiter.writeGate = gate.New( extprom.WrapRegistererWithPrefix( "thanos_receive_write_request_concurrent_", - l.registerer, + reg, ), int(maxWriteConcurrency), ) } - l.requestLimiter = newConfigRequestLimiter( - l.registerer, - &config.WriteLimits, - ) - seriesLimitSupported := (l.receiverMode == RouterOnly || l.receiverMode == RouterIngestor) && (len(config.WriteLimits.TenantsLimits) != 0 || config.WriteLimits.DefaultLimits.HeadSeriesLimit != 0) - if seriesLimitSupported { - l.HeadSeriesLimiter = NewHeadSeriesLimit(config.WriteLimits, l.registerer, l.logger) - } - return nil -} + limiter.requestLimiter = newConfigRequestLimiter(reg, &root.WriteLimits) -// RequestLimiter is a safe getter for the request limiter. -func (l *Limiter) RequestLimiter() requestLimiter { - l.RLock() - defer l.RUnlock() - return l.requestLimiter -} - -// WriteGate is a safe getter for the write gate. -func (l *Limiter) WriteGate() gate.Gate { - l.RLock() - defer l.RUnlock() - return l.writeGate -} - -// ParseLimitConfigContent parses the limit configuration from the path or -// content. -func ParseLimitConfigContent(limitsConfig fileContent) (*RootLimitsConfig, error) { - if limitsConfig == nil { - return &RootLimitsConfig{}, nil - } - limitsContentYaml, err := limitsConfig.Content() - if err != nil { - return nil, errors.Wrap(err, "get content of limit configuration") - } - parsedConfig, err := ParseRootLimitConfig(limitsContentYaml) - if err != nil { - return nil, errors.Wrap(err, "parse limit configuration") + // Impose active series limit only if Receiver is in Router or RouterIngestor mode, and config is provided. + seriesLimitSupported := (r == RouterOnly || r == RouterIngestor) && (len(root.WriteLimits.TenantsLimits) != 0 || root.WriteLimits.DefaultLimits.HeadSeriesLimit != 0) + if seriesLimitSupported { + limiter.HeadSeriesLimiter = NewHeadSeriesLimit(root.WriteLimits, reg, logger) } - return parsedConfig, nil -} - -type nopConfigContent struct{} - -var _ fileContent = (*nopConfigContent)(nil) - -// Content returns no content and no error. -func (n nopConfigContent) Content() ([]byte, error) { - return nil, nil -} - -// Path returns an empty path. -func (n nopConfigContent) Path() string { - return "" -} -// NewNopConfig creates a no-op config content (no configuration). 
-func NewNopConfig() nopConfigContent { - return nopConfigContent{} + return limiter } diff --git a/pkg/receive/limiter_config.go b/pkg/receive/limiter_config.go index c3bd330b6e..67aa5ef93a 100644 --- a/pkg/receive/limiter_config.go +++ b/pkg/receive/limiter_config.go @@ -78,7 +78,6 @@ type DefaultLimitsConfig struct { HeadSeriesLimit uint64 `yaml:"head_series_limit"` } -// TenantsWriteLimitsConfig is a map of tenant IDs to their *WriteLimitConfig. type TenantsWriteLimitsConfig map[string]*WriteLimitConfig // A tenant might not always have limits configured, so things here must @@ -111,7 +110,8 @@ type requestLimitsConfig struct { SamplesLimit *int64 `yaml:"samples_limit"` } -func NewEmptyRequestLimitsConfig() *requestLimitsConfig { +// Utils for initializing. +func newEmptyRequestLimitsConfig() *requestLimitsConfig { return &requestLimitsConfig{} } diff --git a/pkg/receive/limiter_config_test.go b/pkg/receive/limiter_config_test.go index 3e32ea41e8..b080680162 100644 --- a/pkg/receive/limiter_config_test.go +++ b/pkg/receive/limiter_config_test.go @@ -35,7 +35,7 @@ func TestParseLimiterConfig(t *testing.T) { }, }, DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig(). + RequestLimits: *newEmptyRequestLimitsConfig(). SetSizeBytesLimit(1024). SetSeriesLimit(1000). SetSamplesLimit(10), @@ -44,7 +44,7 @@ func TestParseLimiterConfig(t *testing.T) { TenantsLimits: TenantsWriteLimitsConfig{ "acme": NewEmptyWriteLimitConfig(). SetRequestLimits( - NewEmptyRequestLimitsConfig(). + newEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). SetSamplesLimit(0), @@ -52,7 +52,7 @@ func TestParseLimiterConfig(t *testing.T) { SetHeadSeriesLimit(2000), "ajax": NewEmptyWriteLimitConfig(). SetRequestLimits( - NewEmptyRequestLimitsConfig(). + newEmptyRequestLimitsConfig(). SetSeriesLimit(50000). SetSamplesLimit(500), ), diff --git a/pkg/receive/limiter_test.go b/pkg/receive/limiter_test.go deleted file mode 100644 index be7e8790c1..0000000000 --- a/pkg/receive/limiter_test.go +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) The Thanos Authors. -// Licensed under the Apache License 2.0. 
- -package receive - -import ( - "context" - "os" - "path" - "testing" - "time" - - "github.com/thanos-io/thanos/pkg/extkingpin" - - "github.com/efficientgo/tools/core/pkg/testutil" - "github.com/go-kit/log" -) - -func TestLimiter_StartConfigReloader(t *testing.T) { - origLimitsFile, err := os.ReadFile(path.Join("testdata", "limits_config", "good_limits.yaml")) - testutil.Ok(t, err) - copyLimitsFile := path.Join(t.TempDir(), "limits.yaml") - testutil.Ok(t, os.WriteFile(copyLimitsFile, origLimitsFile, 0666)) - - goodLimits, err := extkingpin.NewStaticPathContent(copyLimitsFile) - if err != nil { - t.Fatalf("error trying to save static limit config: %s", err) - } - invalidLimitsPath := path.Join("./testdata", "limits_config", "invalid_limits.yaml") - invalidLimits, err := os.ReadFile(invalidLimitsPath) - if err != nil { - t.Fatalf("could not load test content at %s: %s", invalidLimitsPath, err) - } - - limiter, err := NewLimiter(goodLimits, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout)) - testutil.Ok(t, err) - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - err = limiter.StartConfigReloader(ctx) - testutil.Ok(t, err) - - time.Sleep(1 * time.Second) - testutil.Ok(t, goodLimits.Rewrite(invalidLimits)) -} - -type emptyPathFile struct{} - -func (e emptyPathFile) Content() ([]byte, error) { - return []byte{}, nil -} - -func (e emptyPathFile) Path() string { - return "" -} - -func TestLimiter_CanReload(t *testing.T) { - validLimitsPath, err := extkingpin.NewStaticPathContent( - path.Join("testdata", "limits_config", "good_limits.yaml"), - ) - testutil.Ok(t, err) - emptyLimitsPath := emptyPathFile{} - - type args struct { - configFilePath fileContent - } - tests := []struct { - name string - args args - wantReload bool - }{ - { - name: "Nil config file path cannot be reloaded", - args: args{configFilePath: nil}, - wantReload: false, - }, - { - name: "Empty config file path cannot be reloaded", - args: args{configFilePath: emptyLimitsPath}, - wantReload: false, - }, - { - name: "Valid config file path can be reloaded", - args: args{configFilePath: validLimitsPath}, - wantReload: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - configFile := tt.args.configFilePath - limiter, err := NewLimiter(configFile, nil, RouterIngestor, log.NewLogfmtLogger(os.Stdout)) - testutil.Ok(t, err) - if tt.wantReload { - testutil.Assert(t, limiter.CanReload()) - } else { - testutil.Assert(t, !limiter.CanReload()) - } - }) - } -} diff --git a/pkg/receive/request_limiter.go b/pkg/receive/request_limiter.go index 7da0c64a6d..de7554de2f 100644 --- a/pkg/receive/request_limiter.go +++ b/pkg/receive/request_limiter.go @@ -14,7 +14,7 @@ const ( sizeBytesLimitName = "body_size" ) -var unlimitedRequestLimitsConfig = NewEmptyRequestLimitsConfig(). +var unlimitedRequestLimitsConfig = newEmptyRequestLimitsConfig(). SetSizeBytesLimit(0). SetSeriesLimit(0). 
SetSamplesLimit(0) @@ -49,12 +49,7 @@ func newConfigRequestLimiter(reg prometheus.Registerer, writeLimits *WriteLimits tenantLimits: tenantRequestLimits, cachedDefaultLimits: defaultRequestLimits, } - limiter.registerMetrics(reg) - return &limiter -} - -func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { - l.limitsHit = promauto.With(reg).NewSummaryVec( + limiter.limitsHit = promauto.With(reg).NewSummaryVec( prometheus.SummaryOpts{ Namespace: "thanos", Subsystem: "receive", @@ -63,7 +58,7 @@ func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { Objectives: map[float64]float64{0.50: 0.1, 0.95: 0.1, 0.99: 0.001}, }, []string{"tenant", "limit"}, ) - l.configuredLimits = promauto.With(reg).NewGaugeVec( + limiter.configuredLimits = promauto.With(reg).NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thanos", Subsystem: "receive", @@ -71,14 +66,16 @@ func (l *configRequestLimiter) registerMetrics(reg prometheus.Registerer) { Help: "The configured write limits.", }, []string{"tenant", "limit"}, ) - for tenant, limits := range l.tenantLimits { - l.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) - l.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) - l.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) + for tenant, limits := range tenantRequestLimits { + limiter.configuredLimits.WithLabelValues(tenant, sizeBytesLimitName).Set(float64(*limits.SizeBytesLimit)) + limiter.configuredLimits.WithLabelValues(tenant, seriesLimitName).Set(float64(*limits.SeriesLimit)) + limiter.configuredLimits.WithLabelValues(tenant, samplesLimitName).Set(float64(*limits.SamplesLimit)) } - l.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*l.cachedDefaultLimits.SizeBytesLimit)) - l.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*l.cachedDefaultLimits.SeriesLimit)) - l.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*l.cachedDefaultLimits.SamplesLimit)) + limiter.configuredLimits.WithLabelValues("", sizeBytesLimitName).Set(float64(*defaultRequestLimits.SizeBytesLimit)) + limiter.configuredLimits.WithLabelValues("", seriesLimitName).Set(float64(*defaultRequestLimits.SeriesLimit)) + limiter.configuredLimits.WithLabelValues("", samplesLimitName).Set(float64(*defaultRequestLimits.SamplesLimit)) + + return &limiter } func (l *configRequestLimiter) AllowSizeBytes(tenant string, contentLengthBytes int64) bool { @@ -103,7 +100,7 @@ func (l *configRequestLimiter) AllowSeries(tenant string, amount int64) bool { } allowed := *limit >= amount - if !allowed && l.limitsHit != nil { + if !allowed { l.limitsHit. WithLabelValues(tenant, seriesLimitName). Observe(float64(amount - *limit)) @@ -117,7 +114,7 @@ func (l *configRequestLimiter) AllowSamples(tenant string, amount int64) bool { return true } allowed := *limit >= amount - if !allowed && l.limitsHit != nil { + if !allowed { l.limitsHit. WithLabelValues(tenant, samplesLimitName). Observe(float64(amount - *limit)) diff --git a/pkg/receive/request_limiter_test.go b/pkg/receive/request_limiter_test.go index dfbea066d9..e654cd1cdf 100644 --- a/pkg/receive/request_limiter_test.go +++ b/pkg/receive/request_limiter_test.go @@ -15,12 +15,12 @@ func TestRequestLimiter_limitsFor(t *testing.T) { limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig(). + RequestLimits: *newEmptyRequestLimitsConfig(). 
SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenantWithLimits: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig(). + RequestLimits: newEmptyRequestLimitsConfig(). SetSeriesLimit(30), }, }, @@ -33,7 +33,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the default limits when tenant's limits aren't present", tenant: tenantWithoutLimits, - wantLimits: NewEmptyRequestLimitsConfig(). + wantLimits: newEmptyRequestLimitsConfig(). SetSeriesLimit(10). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -41,7 +41,7 @@ func TestRequestLimiter_limitsFor(t *testing.T) { { name: "Gets the tenant's limits when it is present", tenant: tenantWithLimits, - wantLimits: NewEmptyRequestLimitsConfig(). + wantLimits: newEmptyRequestLimitsConfig(). SetSeriesLimit(30). SetSamplesLimit(0). SetSizeBytesLimit(0), @@ -102,11 +102,11 @@ func TestRequestLimiter_AllowRequestBodySizeBytes(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), + RequestLimits: newEmptyRequestLimitsConfig().SetSizeBytesLimit(tt.sizeByteLimit), }, }, } @@ -159,11 +159,11 @@ func TestRequestLimiter_AllowSeries(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), + RequestLimits: newEmptyRequestLimitsConfig().SetSeriesLimit(tt.seriesLimit), }, }, } @@ -217,11 +217,11 @@ func TestRequestLimiter_AllowSamples(t *testing.T) { tenant := "tenant" limits := WriteLimitsConfig{ DefaultLimits: DefaultLimitsConfig{ - RequestLimits: *NewEmptyRequestLimitsConfig().SetSeriesLimit(10), + RequestLimits: *newEmptyRequestLimitsConfig().SetSeriesLimit(10), }, TenantsLimits: TenantsWriteLimitsConfig{ tenant: &WriteLimitConfig{ - RequestLimits: NewEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), + RequestLimits: newEmptyRequestLimitsConfig().SetSamplesLimit(tt.samplesLimit), }, }, } diff --git a/pkg/receive/testdata/limits.yaml b/pkg/receive/testdata/limits.yaml deleted file mode 100644 index 2345756179..0000000000 --- a/pkg/receive/testdata/limits.yaml +++ /dev/null @@ -1,22 +0,0 @@ -write: - global: - max_concurrency: 30 - meta_monitoring_url: "http://localhost:9090" - meta_monitoring_limit_query: "sum(prometheus_tsdb_head_series) by (tenant)" - default: - request: - size_bytes_limit: 1024 - series_limit: 1000 - samples_limit: 10 - head_series_limit: 1000 - tenants: - acme: - request: - size_bytes_limit: 0 - series_limit: 0 - samples_limit: 0 - head_series_limit: 2000 - ajax: - request: - series_limit: 50000 - samples_limit: 500 diff --git a/pkg/receive/testdata/limits_config/invalid_limits.yaml b/pkg/receive/testdata/limits_config/invalid_limits.yaml deleted file mode 100644 index 74db0453f8..0000000000 --- a/pkg/receive/testdata/limits_config/invalid_limits.yaml +++ /dev/null @@ -1,17 +0,0 @@ -write: - global: - max_concurrency: 30 - request: - size_bytes_limit: 1024 - series_limit: 1000 - samples_limit: 10 - tenants: - acme: - request: - 
size_bytes_limit: 0 - series_limit: 0 - samples_limit: 0 - ajax: - request: - series_limit: 50000 - samples_limit: 500 From 0684e392320a321ce5c2737eeeab9ddf0e1ae263 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 18:51:19 +0100 Subject: [PATCH 29/43] fixed lint issue Signed-off-by: utukj --- pkg/query/endpointset_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index b79196248d..cfdd62fb10 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -410,9 +410,8 @@ func TestEndpointSetUpdate(t *testing.T) { defer endpoints.Close() discoveredEndpointAddr := endpoints.EndpointAddresses() - var endpointSet *EndpointSet // Specify only "store_type" to exclude "external_labels". - endpointSet = makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, tc.connLabels...) + endpointSet := makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, tc.connLabels...) defer endpointSet.Close() endpointSet.Update(context.Background()) From 46c43d73b4a1e90a5ee89445ced3835aa76bedfe Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 18 Oct 2022 19:20:40 +0100 Subject: [PATCH 30/43] added unit test for truncate and clean up Signed-off-by: utukj --- pkg/query/endpointset_test.go | 65 +++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index cfdd62fb10..af421af3fd 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -271,20 +271,39 @@ func (e *testEndpoints) CloseOne(addr string) { delete(e.srvs, addr) } -func truncateAndEscapeQuotes(s string) string { - // Truncate string. - if len(s) > externalLabelLimit { - s = s[:externalLabelLimit] +func TestTruncateExtLabels(t *testing.T) { + testCases := []struct { + name string + labelToTruncate string + lengthLimit int + expectedOutput string + }{ + { + name: "shorter label length", + labelToTruncate: "{xxx}", + lengthLimit: 5, + expectedOutput: "{xxx}", + }, + { + name: "longer label length", + labelToTruncate: "{xxxxxxxxxxxxxxxxxxx}", + lengthLimit: 5, + expectedOutput: "{xxxxx}", + }, + { + name: "exact label length", + labelToTruncate: "{xxxxx}", + lengthLimit: 5, + expectedOutput: "{xxxxx}", + }, } - // Add backslash escape for every quote character. 
- var lbl strings.Builder - for _, ch := range s { - if string(ch) == `"` { - lbl.WriteString(`\`) - } - lbl.WriteRune(ch) + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := truncateExtLabels(tc.labelToTruncate, tc.lengthLimit) + testutil.Equals(t, tc.expectedOutput, got) + }) } - return lbl.String() } func TestEndpointSetUpdate(t *testing.T) { @@ -394,12 +413,22 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, expectedEndpoints: 1, - expectedConnMetrics: metricsMeta + fmt.Sprintf( - ` - thanos_store_nodes_grpc_connections{external_labels="{%s}", store_type="sidecar"} 1 - `, - truncateAndEscapeQuotes(strings.Repeat(`lbl="val", `, 1000)), - ), + expectedConnMetrics: `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", + lbl=\"val\", lbl=\"val\", lbl=\"val\",`, }, } From b3f0d8cd2d0c15584089f6f5e18ad0014073f50d Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 20 Oct 2022 12:21:43 +0100 Subject: [PATCH 31/43] fixed truncate label func and added more tests Signed-off-by: utukj --- pkg/query/endpointset.go | 8 ++++++-- pkg/query/endpointset_test.go | 29 ++++++++++++++++++----------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index e886fb9725..7977df6e65 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -9,6 +9,7 @@ import ( "fmt" "math" "sort" + "strings" "sync" "time" @@ -204,10 +205,13 @@ func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { } func truncateExtLabels(s string, threshold int) string { + // remove enclosing braces + s = strings.Trim(s, "{}") + // truncate if len(s) > threshold { - s = s[:threshold+1] + "}" + s = s[:threshold] } - return s + return fmt.Sprintf("{%s}", s) } func (c *endpointSetNodeCollector) Update(nodes map[component.Component]map[string]int) { diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index af421af3fd..ba0a4801b9 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -272,35 +272,42 @@ func (e *testEndpoints) CloseOne(addr string) { } func TestTruncateExtLabels(t *testing.T) { + const testLength = 5 testCases := []struct { name string labelToTruncate string - lengthLimit int expectedOutput string }{ { name: "shorter label length", - labelToTruncate: "{xxx}", - lengthLimit: 5, - expectedOutput: "{xxx}", + labelToTruncate: "{abc}", + expectedOutput: 
"{abc}", }, { name: "longer label length", - labelToTruncate: "{xxxxxxxxxxxxxxxxxxx}", - lengthLimit: 5, - expectedOutput: "{xxxxx}", + labelToTruncate: "{abcdefghij}", + expectedOutput: "{abcde}", }, { name: "exact label length", - labelToTruncate: "{xxxxx}", - lengthLimit: 5, - expectedOutput: "{xxxxx}", + labelToTruncate: "{abcde}", + expectedOutput: "{abcde}", + }, + { + name: "one less length", + labelToTruncate: "{abcd}", + expectedOutput: "{abcd}", + }, + { + name: "one more length", + labelToTruncate: "{abcdef}", + expectedOutput: "{abcde}", }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - got := truncateExtLabels(tc.labelToTruncate, tc.lengthLimit) + got := truncateExtLabels(tc.labelToTruncate, testLength) testutil.Equals(t, tc.expectedOutput, got) }) } From 4ce76268f938d04fcc7aef1c37f3ecbd3ae61616 Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 20 Oct 2022 12:29:03 +0100 Subject: [PATCH 32/43] removed name from truncate test Signed-off-by: utukj --- pkg/query/endpointset_test.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index ba0a4801b9..9a09bac563 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -274,39 +274,33 @@ func (e *testEndpoints) CloseOne(addr string) { func TestTruncateExtLabels(t *testing.T) { const testLength = 5 testCases := []struct { - name string labelToTruncate string expectedOutput string }{ { - name: "shorter label length", labelToTruncate: "{abc}", expectedOutput: "{abc}", }, { - name: "longer label length", labelToTruncate: "{abcdefghij}", expectedOutput: "{abcde}", }, { - name: "exact label length", labelToTruncate: "{abcde}", expectedOutput: "{abcde}", }, { - name: "one less length", labelToTruncate: "{abcd}", expectedOutput: "{abcd}", }, { - name: "one more length", labelToTruncate: "{abcdef}", expectedOutput: "{abcde}", }, } for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { + t.Run("", func(t *testing.T) { got := truncateExtLabels(tc.labelToTruncate, testLength) testutil.Equals(t, tc.expectedOutput, got) }) From c8731e353cd70afca30f93fd8c4efc65ed5282a3 Mon Sep 17 00:00:00 2001 From: utukj Date: Thu, 20 Oct 2022 16:28:16 +0100 Subject: [PATCH 33/43] reorganized test cases and removed redundant comments Signed-off-by: utukj --- pkg/query/endpointset.go | 2 -- pkg/query/endpointset_test.go | 14 +++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 7977df6e65..983b305cd3 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -205,9 +205,7 @@ func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { } func truncateExtLabels(s string, threshold int) string { - // remove enclosing braces s = strings.Trim(s, "{}") - // truncate if len(s) > threshold { s = s[:threshold] } diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 9a09bac563..c19688c099 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -278,23 +278,23 @@ func TestTruncateExtLabels(t *testing.T) { expectedOutput string }{ { - labelToTruncate: "{abc}", - expectedOutput: "{abc}", + labelToTruncate: "{a}", + expectedOutput: "{a}", }, { - labelToTruncate: "{abcdefghij}", - expectedOutput: "{abcde}", + labelToTruncate: "{abcd}", + expectedOutput: "{abcd}", }, { labelToTruncate: "{abcde}", expectedOutput: "{abcde}", }, { - labelToTruncate: "{abcd}", - expectedOutput: 
"{abcd}", + labelToTruncate: "{abcdef}", + expectedOutput: "{abcde}", }, { - labelToTruncate: "{abcdef}", + labelToTruncate: "{abcdefghijk}", expectedOutput: "{abcde}", }, } From e38bfb8d950da8af593e7ab8221c6bef7758d86e Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Thu, 20 Oct 2022 18:15:04 +0100 Subject: [PATCH 34/43] Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset_test.go | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index c19688c099..9353d72d7d 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -273,36 +273,37 @@ func (e *testEndpoints) CloseOne(addr string) { func TestTruncateExtLabels(t *testing.T) { const testLength = 5 - testCases := []struct { + + for _, tc := range []struct { labelToTruncate string expectedOutput string }{ { - labelToTruncate: "{a}", - expectedOutput: "{a}", + labelToTruncate: "{abc}", + expectedOutput: "{abc}", }, { labelToTruncate: "{abcd}", - expectedOutput: "{abcd}", + expectedOutput: "{abc}", }, { labelToTruncate: "{abcde}", - expectedOutput: "{abcde}", + expectedOutput: "{abc}", }, + { labelToTruncate: "{abcdef}", - expectedOutput: "{abcde}", + expectedOutput: "{abc}", }, { - labelToTruncate: "{abcdefghijk}", - expectedOutput: "{abcde}", + labelToTruncate: "{abcdefghij}", + expectedOutput: "{abc}", }, - } - - for _, tc := range testCases { - t.Run("", func(t *testing.T) { + } { + t.Run(tc.labelToTruncate, func(t *testing.T) { got := truncateExtLabels(tc.labelToTruncate, testLength) testutil.Equals(t, tc.expectedOutput, got) + testutil.Assert(t, len(got) <= testLength) }) } } From 4d7175ead802e800ad979965fb37f7d73ca222aa Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Thu, 20 Oct 2022 18:15:17 +0100 Subject: [PATCH 35/43] Update pkg/query/endpointset.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 983b305cd3..257b49e768 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -204,14 +204,13 @@ func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { } } +// truncateExtLabels truncates the stringify external labels with the format of {labels..}. 
func truncateExtLabels(s string, threshold int) string { - s = strings.Trim(s, "{}") if len(s) > threshold { - s = s[:threshold] + return fmt.Sprintf("%s}", s[:threshold-1]) } - return fmt.Sprintf("{%s}", s) + return s } - func (c *endpointSetNodeCollector) Update(nodes map[component.Component]map[string]int) { storeNodes := make(map[component.Component]map[string]int, len(nodes)) storePerExtLset := map[string]int{} From 56ef7cb47997f68ddd72d282e29e8d9ea68d90a5 Mon Sep 17 00:00:00 2001 From: utukj Date: Fri, 21 Oct 2022 10:57:38 +0100 Subject: [PATCH 36/43] fixed failing checks Signed-off-by: utukj --- pkg/query/endpointset.go | 1 - pkg/query/endpointset_test.go | 37 ++++++++++++++++++++--------------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 257b49e768..aaa50324c6 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -9,7 +9,6 @@ import ( "fmt" "math" "sort" - "strings" "sync" "time" diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 9353d72d7d..05d8b9f8d5 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -415,22 +415,27 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, expectedEndpoints: 1, - expectedConnMetrics: `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", - lbl=\"val\", lbl=\"val\", lbl=\"val\",`, + expectedConnMetrics: metricsMeta + fmt.Sprintf( + ` + thanos_store_nodes_grpc_connections{external_labels="{%s}", store_type="sidecar"} 1 + `, + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", 
lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + `lbl=\"val`, + ), }, } From 96b1545048e01003dcbf90029d70137a93d0d196 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Fri, 21 Oct 2022 13:46:34 +0200 Subject: [PATCH 37/43] e2e: Adding test for querier with two stores loadbalancing across them. Signed-off-by: bwplotka --- go.mod | 2 +- go.sum | 4 +- test/e2e/query_lb_test.go | 104 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 test/e2e/query_lb_test.go diff --git a/go.mod b/go.mod index bee3e97fe7..d0b1088dd8 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/chromedp/chromedp v0.8.2 github.com/davecgh/go-spew v1.1.1 github.com/dustin/go-humanize v1.0.0 - github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a + github.com/efficientgo/e2e v0.13.2-0.20221003194337-cbc7a9c8405f github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.2.0 diff --git a/go.sum b/go.sum index 97fc0d0411..7ab0a6582a 100644 --- a/go.sum +++ b/go.sum @@ -248,8 +248,8 @@ github.com/edsrzf/mmap-go v1.1.0 h1:6EUwBLQ/Mcr1EYLE4Tn1VdW1A4ckqCQWZBw8Hr0kjpQ= github.com/edsrzf/mmap-go v1.1.0/go.mod h1:19H/e8pUPLicwkyNgOykDXkJ9F0MHE+Z52B8EIth78Q= github.com/efficientgo/core v1.0.0-rc.0 h1:jJoA0N+C4/knWYVZ6GrdHOtDyrg8Y/TR4vFpTaqTsqs= github.com/efficientgo/core v1.0.0-rc.0/go.mod h1:kQa0V74HNYMfuJH6jiPiwNdpWXl4xd/K4tzlrcvYDQI= -github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a h1:cnJajqeh/HjvJLhI3wPvWG9OQ4gU79+4pELRD5Pkih8= -github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= +github.com/efficientgo/e2e v0.13.2-0.20221003194337-cbc7a9c8405f h1:kS5jX3et5GqgKDTjZZxYuBmJdCQTkuX2Ss57blDaL/Y= +github.com/efficientgo/e2e v0.13.2-0.20221003194337-cbc7a9c8405f/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd h1:svR6KxSP1xiPw10RN4Pd7g6BAVkEcNN628PAqZH31mM= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:OmVcnJopJL8d3X3sSXTiypGoUSgFq1aDGmlrdi9dn/M= github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd h1:VaYzzXeUbC5fVheskcKVNOyJMEYD+HgrJNzIAg/mRIM= diff --git a/test/e2e/query_lb_test.go b/test/e2e/query_lb_test.go new file mode 100644 index 0000000000..677fd26f8f --- /dev/null +++ b/test/e2e/query_lb_test.go @@ -0,0 +1,104 @@ +package e2e + +import ( + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/efficientgo/e2e" + e2edb "github.com/efficientgo/e2e/db" + e2einteractive "github.com/efficientgo/e2e/interactive" + e2emon "github.com/efficientgo/e2e/monitoring" + "github.com/thanos-io/objstore/client" + "github.com/thanos-io/objstore/providers/filesystem" + "github.com/thanos-io/thanos/pkg/testutil" + tracingclient "github.com/thanos-io/thanos/pkg/tracing/client" + "github.com/thanos-io/thanos/pkg/tracing/jaeger" + "gopkg.in/yaml.v2" +) + +func marshal(t testing.TB, i interface{}) []byte { + t.Helper() + + b, err := yaml.Marshal(i) + testutil.Ok(t, err) + + return b +} + +// TestQuery_WithStores_Loadbalancing is 
testing. +// * Create & Start Querier +// * Prepare "object storage" (test hack: It can just filesystem). +// - Create one TSDB block. +// +// * Create & Start 2x Stores +// * Connect Querier with Stores (tricky - there is no way of marking store as LB...) +// * Assertion: Monitor the traffic distribution. +func TestQuery_WithStores_Loadbalancing(t *testing.T) { + pwd, err := os.Getwd() + testutil.Ok(t, err) + + // Create a local dir that will be shared with containers with TSDB blocks we need. + // TODO(bwplotka): Create a block here (e.g using thanosbench). + bktDir := filepath.Join(pwd, "tsdb/bucket") + e, err := e2e.New( + e2e.WithVolumes( + fmt.Sprintf("%v:%v:z", filepath.Join(pwd, "tsdb"), filepath.Join(pwd, "tsdb"))), + ) + testutil.Ok(t, err) + t.Cleanup(e.Close) + + // Start monitoring. + mon, err := e2emon.Start(e) + testutil.Ok(t, err) + testutil.Ok(t, mon.OpenUserInterfaceInBrowser()) + + // Start tracing. + j := e.Runnable("tracing").WithPorts(map[string]int{"http-front": 16686, "jaeger.thrift": 14268}).Init(e2e.StartOptions{Image: "jaegertracing/all-in-one:1.25"}) + testutil.Ok(t, e2e.StartAndWaitReady(j)) + //testutil.Ok(t, e2einteractive.OpenInBrowser("http://"+j.Endpoint("http-front"))) + + jaegerConfig, err := yaml.Marshal(tracingclient.TracingConfig{ + Type: tracingclient.Jaeger, + Config: jaeger.Config{ + ServiceName: "thanos", + SamplerType: "const", + SamplerParam: 1, + Endpoint: "http://" + j.InternalEndpoint("jaeger.thrift") + "/api/traces", + }, + }) + testutil.Ok(t, err) + + const thanosImage = "thanos:latest" // Run 'make thanos' in thanos root to recreate it. + store1 := e2edb.NewThanosStore(e, "store1", marshal(t, client.BucketConfig{ + Type: client.FILESYSTEM, + Config: filesystem.Config{ + Directory: bktDir, + }, + }), e2edb.WithImage(thanosImage), e2edb.WithFlagOverride(map[string]string{ + "--tracing.config": string(jaegerConfig), + })) + store2 := e2edb.NewThanosStore(e, "store2", marshal(t, client.BucketConfig{ + Type: client.FILESYSTEM, + Config: filesystem.Config{ + Directory: bktDir, + }, + }), e2edb.WithImage(thanosImage), e2edb.WithFlagOverride(map[string]string{ + "--tracing.config": string(jaegerConfig), + })) + querier := e2edb.NewThanosQuerier(e, "query", []string{ + // TODO(bwplotka): Play with loadbalancing to ensure half of requests goes to store1 and half to store2. + store1.InternalEndpoint("grpc"), + store2.InternalEndpoint("grpc"), + }, e2edb.WithImage(thanosImage), e2edb.WithFlagOverride(map[string]string{ + "--tracing.config": string(jaegerConfig), + })) + testutil.Ok(t, e2e.StartAndWaitReady(store1, store2, querier)) + + testutil.Ok(t, e2einteractive.OpenInBrowser("http://"+querier.Endpoint("http"))) + + // Once done, wait for user input so user can explore + // the results in Prometheus UI and logs. 
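+	// RunUntilEndpointHit blocks until the HTTP endpoint it prints is hit,
+	// keeping the whole environment running until then.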
+ testutil.Ok(t, e2einteractive.RunUntilEndpointHit()) +} From edaf16edf7c937da0312aaba2d6da9ca04d7a130 Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Tue, 25 Oct 2022 15:41:56 +0100 Subject: [PATCH 38/43] Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset_test.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 05d8b9f8d5..bf542fd149 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -415,12 +415,7 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, expectedEndpoints: 1, - expectedConnMetrics: metricsMeta + fmt.Sprintf( - ` - thanos_store_nodes_grpc_connections{external_labels="{%s}", store_type="sidecar"} 1 - `, - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ + expectedConnMetrics: metricsMeta +` thanos_store_nodes_grpc_connections{external_labels="{lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", ` `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ From 14ee77493d688dddb1b7a4cea5e59c2c8ac8e14c Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 25 Oct 2022 15:57:48 +0100 Subject: [PATCH 39/43] dumped long expected output in unittest Signed-off-by: utukj --- pkg/query/endpointset_test.go | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index bf542fd149..f1f68286ed 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -415,22 +415,9 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, expectedEndpoints: 1, - expectedConnMetrics: metricsMeta +` thanos_store_nodes_grpc_connections{external_labels="{lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", ` `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", `+ - `lbl=\"val`, - ), + expectedConnMetrics: 
metricsMeta + ` + thanos_store_nodes_grpc_connections{external_labels="{lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val}",store_type="sidecar"} 1 + `, }, } From f963ac1748b9d4012c1da9be565349df245017f1 Mon Sep 17 00:00:00 2001 From: utukj Date: Tue, 25 Oct 2022 16:39:36 +0100 Subject: [PATCH 40/43] Revert "e2e: Adding test for querier with two stores loadbalancing across them." This reverts commit 96b1545048e01003dcbf90029d70137a93d0d196. Signed-off-by: utukj --- go.mod | 2 +- go.sum | 4 +- test/e2e/query_lb_test.go | 104 -------------------------------------- 3 files changed, 3 insertions(+), 107 deletions(-) delete mode 100644 test/e2e/query_lb_test.go diff --git a/go.mod b/go.mod index d0b1088dd8..bee3e97fe7 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/chromedp/chromedp v0.8.2 github.com/davecgh/go-spew v1.1.1 github.com/dustin/go-humanize v1.0.0 - github.com/efficientgo/e2e v0.13.2-0.20221003194337-cbc7a9c8405f + github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a github.com/efficientgo/tools/extkingpin v0.0.0-20220817170617-6c25e3b627dd github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.2.0 diff --git a/go.sum b/go.sum index 7ab0a6582a..97fc0d0411 100644 --- a/go.sum +++ b/go.sum @@ -248,8 +248,8 @@ github.com/edsrzf/mmap-go v1.1.0 h1:6EUwBLQ/Mcr1EYLE4Tn1VdW1A4ckqCQWZBw8Hr0kjpQ= github.com/edsrzf/mmap-go v1.1.0/go.mod h1:19H/e8pUPLicwkyNgOykDXkJ9F0MHE+Z52B8EIth78Q= github.com/efficientgo/core v1.0.0-rc.0 h1:jJoA0N+C4/knWYVZ6GrdHOtDyrg8Y/TR4vFpTaqTsqs= github.com/efficientgo/core v1.0.0-rc.0/go.mod h1:kQa0V74HNYMfuJH6jiPiwNdpWXl4xd/K4tzlrcvYDQI= -github.com/efficientgo/e2e v0.13.2-0.20221003194337-cbc7a9c8405f h1:kS5jX3et5GqgKDTjZZxYuBmJdCQTkuX2Ss57blDaL/Y= -github.com/efficientgo/e2e v0.13.2-0.20221003194337-cbc7a9c8405f/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= +github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a h1:cnJajqeh/HjvJLhI3wPvWG9OQ4gU79+4pELRD5Pkih8= +github.com/efficientgo/e2e v0.13.1-0.20220923082810-8fa9daa8af8a/go.mod h1:Hi+sz0REtlhVZ8zcdeTC3j6LUEEpJpPtNjOaOKuNcgI= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd h1:svR6KxSP1xiPw10RN4Pd7g6BAVkEcNN628PAqZH31mM= github.com/efficientgo/tools/core v0.0.0-20220817170617-6c25e3b627dd/go.mod h1:OmVcnJopJL8d3X3sSXTiypGoUSgFq1aDGmlrdi9dn/M= github.com/efficientgo/tools/extkingpin 
v0.0.0-20220817170617-6c25e3b627dd h1:VaYzzXeUbC5fVheskcKVNOyJMEYD+HgrJNzIAg/mRIM= diff --git a/test/e2e/query_lb_test.go b/test/e2e/query_lb_test.go deleted file mode 100644 index 677fd26f8f..0000000000 --- a/test/e2e/query_lb_test.go +++ /dev/null @@ -1,104 +0,0 @@ -package e2e - -import ( - "fmt" - "os" - "path/filepath" - "testing" - - "github.com/efficientgo/e2e" - e2edb "github.com/efficientgo/e2e/db" - e2einteractive "github.com/efficientgo/e2e/interactive" - e2emon "github.com/efficientgo/e2e/monitoring" - "github.com/thanos-io/objstore/client" - "github.com/thanos-io/objstore/providers/filesystem" - "github.com/thanos-io/thanos/pkg/testutil" - tracingclient "github.com/thanos-io/thanos/pkg/tracing/client" - "github.com/thanos-io/thanos/pkg/tracing/jaeger" - "gopkg.in/yaml.v2" -) - -func marshal(t testing.TB, i interface{}) []byte { - t.Helper() - - b, err := yaml.Marshal(i) - testutil.Ok(t, err) - - return b -} - -// TestQuery_WithStores_Loadbalancing is testing. -// * Create & Start Querier -// * Prepare "object storage" (test hack: It can just filesystem). -// - Create one TSDB block. -// -// * Create & Start 2x Stores -// * Connect Querier with Stores (tricky - there is no way of marking store as LB...) -// * Assertion: Monitor the traffic distribution. -func TestQuery_WithStores_Loadbalancing(t *testing.T) { - pwd, err := os.Getwd() - testutil.Ok(t, err) - - // Create a local dir that will be shared with containers with TSDB blocks we need. - // TODO(bwplotka): Create a block here (e.g using thanosbench). - bktDir := filepath.Join(pwd, "tsdb/bucket") - e, err := e2e.New( - e2e.WithVolumes( - fmt.Sprintf("%v:%v:z", filepath.Join(pwd, "tsdb"), filepath.Join(pwd, "tsdb"))), - ) - testutil.Ok(t, err) - t.Cleanup(e.Close) - - // Start monitoring. - mon, err := e2emon.Start(e) - testutil.Ok(t, err) - testutil.Ok(t, mon.OpenUserInterfaceInBrowser()) - - // Start tracing. - j := e.Runnable("tracing").WithPorts(map[string]int{"http-front": 16686, "jaeger.thrift": 14268}).Init(e2e.StartOptions{Image: "jaegertracing/all-in-one:1.25"}) - testutil.Ok(t, e2e.StartAndWaitReady(j)) - //testutil.Ok(t, e2einteractive.OpenInBrowser("http://"+j.Endpoint("http-front"))) - - jaegerConfig, err := yaml.Marshal(tracingclient.TracingConfig{ - Type: tracingclient.Jaeger, - Config: jaeger.Config{ - ServiceName: "thanos", - SamplerType: "const", - SamplerParam: 1, - Endpoint: "http://" + j.InternalEndpoint("jaeger.thrift") + "/api/traces", - }, - }) - testutil.Ok(t, err) - - const thanosImage = "thanos:latest" // Run 'make thanos' in thanos root to recreate it. - store1 := e2edb.NewThanosStore(e, "store1", marshal(t, client.BucketConfig{ - Type: client.FILESYSTEM, - Config: filesystem.Config{ - Directory: bktDir, - }, - }), e2edb.WithImage(thanosImage), e2edb.WithFlagOverride(map[string]string{ - "--tracing.config": string(jaegerConfig), - })) - store2 := e2edb.NewThanosStore(e, "store2", marshal(t, client.BucketConfig{ - Type: client.FILESYSTEM, - Config: filesystem.Config{ - Directory: bktDir, - }, - }), e2edb.WithImage(thanosImage), e2edb.WithFlagOverride(map[string]string{ - "--tracing.config": string(jaegerConfig), - })) - querier := e2edb.NewThanosQuerier(e, "query", []string{ - // TODO(bwplotka): Play with loadbalancing to ensure half of requests goes to store1 and half to store2. 
- store1.InternalEndpoint("grpc"), - store2.InternalEndpoint("grpc"), - }, e2edb.WithImage(thanosImage), e2edb.WithFlagOverride(map[string]string{ - "--tracing.config": string(jaegerConfig), - })) - testutil.Ok(t, e2e.StartAndWaitReady(store1, store2, querier)) - - testutil.Ok(t, e2einteractive.OpenInBrowser("http://"+querier.Endpoint("http"))) - - // Once done, wait for user input so user can explore - // the results in Prometheus UI and logs. - testutil.Ok(t, e2einteractive.RunUntilEndpointHit()) -} From a34f356532c5166cd7000a244008fb4ebc0b90f5 Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Wed, 26 Oct 2022 07:05:12 +0100 Subject: [PATCH 41/43] Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index f1f68286ed..bcf9420528 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -401,7 +401,7 @@ func TestEndpointSetUpdate(t *testing.T) { endpoints: []testEndpointMeta{ { InfoResponse: sidecarInfo, - // simulate very long external labels + // Simulate very long external labels. extlsetFn: func(addr string) []labelpb.ZLabelSet { sLabel := []string{} for i := 0; i < 1000; i++ { From c4e0f89b07c066a3e9953886b342589178e14867 Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Wed, 26 Oct 2022 07:06:10 +0100 Subject: [PATCH 42/43] Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> --- pkg/query/endpointset_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index bcf9420528..7e84890ea3 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -290,7 +290,6 @@ func TestTruncateExtLabels(t *testing.T) { labelToTruncate: "{abcde}", expectedOutput: "{abc}", }, - { labelToTruncate: "{abcdef}", expectedOutput: "{abc}", From c26cf746e2004bf727d413863532c41ebadae7a2 Mon Sep 17 00:00:00 2001 From: utukj Date: Wed, 26 Oct 2022 07:20:07 +0100 Subject: [PATCH 43/43] moved label definition to endpointset Signed-off-by: utukj --- cmd/thanos/query.go | 11 ++--------- pkg/query/endpointset.go | 13 ++++++++++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 5e5a7fc7cd..4862440af1 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -71,13 +71,6 @@ const ( promqlEngineThanos promqlEngineType = "thanos" ) -type queryConnMetricLabel string - -const ( - externalLabels queryConnMetricLabel = "external_labels" - storeType queryConnMetricLabel = "store_type" -) - // registerQuery registers a query command. func registerQuery(app *extkingpin.App) { comp := component.Query @@ -117,8 +110,8 @@ func registerQuery(app *extkingpin.App) { Default("4").Int() queryConnMetricLabels := cmd.Flag("query.conn-metric.label", "Optional selection of query connection metric labels to be collected from endpoint set"). - Default(string(externalLabels), string(storeType)). - Enums(string(externalLabels), string(storeType)) + Default(string(query.ExternalLabels), string(query.StoreType)). 
+ Enums(string(query.ExternalLabels), string(query.StoreType)) queryReplicaLabels := cmd.Flag("query.replica-label", "Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules."). Strings() diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index aaa50324c6..d3ce73b9e7 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -38,6 +38,13 @@ const ( noMetadataEndpointMessage = "cannot obtain metadata: neither info nor store client found" ) +type queryConnMetricLabel string + +const ( + ExternalLabels queryConnMetricLabel = "external_labels" + StoreType queryConnMetricLabel = "store_type" +) + type GRPCEndpointSpec struct { addr string isStrictStatic bool @@ -190,7 +197,7 @@ type endpointSetNodeCollector struct { func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { if len(labels) == 0 { - labels = []string{"external_labels", "store_type"} + labels = []string{string(ExternalLabels), string(StoreType)} } return &endpointSetNodeCollector{ storeNodes: map[component.Component]map[string]int{}, @@ -247,9 +254,9 @@ func (c *endpointSetNodeCollector) Collect(ch chan<- prometheus.Metric) { lbls := []string{} for _, lbl := range c.labels { switch lbl { - case "external_labels": + case string(ExternalLabels): lbls = append(lbls, externalLabels) - case "store_type": + case string(StoreType): lbls = append(lbls, storeTypeStr) } }
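
For reference, here is a minimal standalone sketch of the behavior the series converges on. It is not part of any patch: package main, the metricLabels helper, and the printed values are illustrative stand-ins distilled from the diffs above, not the real collector wiring.

package main

import "fmt"

type queryConnMetricLabel string

const (
	ExternalLabels queryConnMetricLabel = "external_labels"
	StoreType      queryConnMetricLabel = "store_type"
)

// truncateExtLabels as in the final diff: when the stringified label set is
// longer than threshold, cut it to threshold-1 characters and re-append the
// closing brace, so the result never exceeds threshold bytes.
func truncateExtLabels(s string, threshold int) string {
	if len(s) > threshold {
		return fmt.Sprintf("%s}", s[:threshold-1])
	}
	return s
}

// metricLabels mirrors the fallback in newEndpointSetNodeCollector: when no
// labels are selected explicitly, both are collected.
func metricLabels(labels ...string) []string {
	if len(labels) == 0 {
		labels = []string{string(ExternalLabels), string(StoreType)}
	}
	return labels
}

func main() {
	fmt.Println(truncateExtLabels("{abcdefghij}", 5)) // "{abc}", as asserted in TestTruncateExtLabels
	fmt.Println(truncateExtLabels("{abc}", 5))        // "{abc}", short inputs pass through unchanged
	fmt.Println(metricLabels())                       // [external_labels store_type]
	fmt.Println(metricLabels(string(StoreType)))      // [store_type]
}

Selecting a subset via --query.conn-metric.label drops the corresponding label from thanos_store_nodes_grpc_connections, which, together with the truncation, keeps the metric's label values bounded.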