diff --git a/.mdox.validate.yaml b/.mdox.validate.yaml index 94355e319e2..d4f9e409816 100644 --- a/.mdox.validate.yaml +++ b/.mdox.validate.yaml @@ -42,3 +42,5 @@ validators: type: 'ignore' - regex: 'twitter\.com' type: 'ignore' + - regex: 'outshift.cisco.com/blog/multi-cluster-monitoring' + type: 'ignore' diff --git a/CHANGELOG.md b/CHANGELOG.md index d6f83d8b3ed..49490c17680 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Changed +- [#7429](https://github.com/thanos-io/thanos/pull/7429): Reloader: introduce `suppressEnvironmentVariablesExpansionErrors` to allow suppressing errors when expanding environment variables in the configuration file. When set, this will ensure that the reloader won't consider the operation to fail when an unset environment variable is encountered. Note that all unset environment variables are left as is, whereas all set environment variables are expanded as usual. - [#7334](https://github.com/thanos-io/thanos/pull/7334) Compactor: do not vertically compact downsampled blocks. Such cases are now marked with `no-compact-mark.json`. Fixes panic `panic: unexpected seriesToChunkEncoder lack of iterations`. - [#7393](https://github.com/thanos-io/thanos/pull/7393) *: *breaking :warning:* Using native histograms for grpc middleware metrics. Metrics `grpc_client_handling_seconds` and `grpc_server_handling_seconds` will now be native histograms, if you have enabled native histogram scraping you will need to update your PromQL expressions to use the new metric names. diff --git a/pkg/reloader/reloader.go b/pkg/reloader/reloader.go index 0821b90fedc..3ffca9511cd 100644 --- a/pkg/reloader/reloader.go +++ b/pkg/reloader/reloader.go @@ -75,7 +75,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/minio/sha256-simd" - ps "github.com/mitchellh/go-ps" + "github.com/mitchellh/go-ps" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -87,14 +87,15 @@ import ( // It optionally substitutes environment variables in the configuration. // Referenced environment variables must be of the form `$(var)` (not `$var` or `${var}`). type Reloader struct { - logger log.Logger - cfgFile string - cfgOutputFile string - cfgDirs []CfgDirOption - watchInterval time.Duration - retryInterval time.Duration - watchedDirs []string - watcher *watcher + logger log.Logger + cfgFile string + cfgOutputFile string + cfgDirs []CfgDirOption + discardEnvVarExpansionErrors bool + retryInterval time.Duration + watchInterval time.Duration + watchedDirs []string + watcher *watcher tr TriggerReloader @@ -104,13 +105,14 @@ type Reloader struct { lastCfgDirFiles []map[string]struct{} forceReload bool - reloads prometheus.Counter - reloadErrors prometheus.Counter - lastReloadSuccess prometheus.Gauge - lastReloadSuccessTimestamp prometheus.Gauge - configApplyErrors prometheus.Counter - configApply prometheus.Counter - reloaderInfo *prometheus.GaugeVec + reloads prometheus.Counter + reloadErrors prometheus.Counter + lastReloadSuccess prometheus.Gauge + lastReloadSuccessTimestamp prometheus.Gauge + configApplyErrors prometheus.Counter + configEnvVarExpansionErrors prometheus.Gauge + configApply prometheus.Counter + reloaderInfo *prometheus.GaugeVec } // TriggerReloader reloads the configuration of the process. @@ -172,6 +174,9 @@ type Options struct { // RetryInterval controls how often the reloader retries a reloading of the // configuration in case the reload operation returned an error. RetryInterval time.Duration + // DiscardEnvVarExpansionErrors suppresses errors when expanding environment variables in the config file, and + // leaves the unset variables as is. All found environment variables are still expanded. + DiscardEnvVarExpansionErrors bool } var firstGzipBytes = []byte{0x1f, 0x8b, 0x08} @@ -183,15 +188,16 @@ func New(logger log.Logger, reg prometheus.Registerer, o *Options) *Reloader { logger = log.NewNopLogger() } r := &Reloader{ - logger: logger, - cfgFile: o.CfgFile, - cfgOutputFile: o.CfgOutputFile, - cfgDirs: o.CfgDirs, - lastCfgDirFiles: make([]map[string]struct{}, len(o.CfgDirs)), - watcher: newWatcher(logger, reg, o.DelayInterval), - watchedDirs: o.WatchedDirs, - watchInterval: o.WatchInterval, - retryInterval: o.RetryInterval, + logger: logger, + cfgFile: o.CfgFile, + cfgOutputFile: o.CfgOutputFile, + cfgDirs: o.CfgDirs, + lastCfgDirFiles: make([]map[string]struct{}, len(o.CfgDirs)), + watcher: newWatcher(logger, reg, o.DelayInterval), + watchedDirs: o.WatchedDirs, + watchInterval: o.WatchInterval, + retryInterval: o.RetryInterval, + discardEnvVarExpansionErrors: o.DiscardEnvVarExpansionErrors, reloads: promauto.With(reg).NewCounter( prometheus.CounterOpts{ @@ -229,6 +235,12 @@ func New(logger log.Logger, reg prometheus.Registerer, o *Options) *Reloader { Help: "Total number of config apply operations that failed.", }, ), + configEnvVarExpansionErrors: promauto.With(reg).NewGauge( + prometheus.GaugeOpts{ + Name: "reloader_config_env_var_expansion_errors", + Help: "Total number of config environment variable expansions that failed.", + }, + ), reloaderInfo: promauto.With(reg).NewGaugeVec( prometheus.GaugeOpts{ Name: "reloader_info", @@ -348,8 +360,8 @@ func (r *Reloader) Watch(ctx context.Context) error { } } -func normalize(logger log.Logger, inputFile, outputFile string) error { - b, err := os.ReadFile(inputFile) +func (r *Reloader) normalize(input, output string) error { + b, err := os.ReadFile(input) if err != nil { return errors.Wrap(err, "read file") } @@ -360,7 +372,7 @@ func normalize(logger log.Logger, inputFile, outputFile string) error { if err != nil { return errors.Wrap(err, "create gzip reader") } - defer runutil.CloseWithLogOnErr(logger, zr, "gzip reader close") + defer runutil.CloseWithLogOnErr(r.logger, zr, "gzip reader close") b, err = io.ReadAll(zr) if err != nil { @@ -368,19 +380,19 @@ func normalize(logger log.Logger, inputFile, outputFile string) error { } } - b, err = expandEnv(b) + b, err = r.expandEnv(b) if err != nil { return errors.Wrap(err, "expand environment variables") } - tmpFile := outputFile + ".tmp" + tmpFile := output + ".tmp" defer func() { _ = os.Remove(tmpFile) }() if err := os.WriteFile(tmpFile, b, 0644); err != nil { return errors.Wrap(err, "write file") } - if err := os.Rename(tmpFile, outputFile); err != nil { + if err := os.Rename(tmpFile, output); err != nil { return errors.Wrap(err, "rename file") } return nil @@ -402,7 +414,7 @@ func (r *Reloader) apply(ctx context.Context) error { } cfgHash = h.Sum(nil) if r.cfgOutputFile != "" { - if err := normalize(r.logger, r.cfgFile, r.cfgOutputFile); err != nil { + if err := r.normalize(r.cfgFile, r.cfgOutputFile); err != nil { return err } } @@ -446,7 +458,7 @@ func (r *Reloader) apply(ctx context.Context) error { outFile := filepath.Join(outDir, targetFile.Name()) cfgDirFiles[outFile] = struct{}{} - if err := normalize(r.logger, path, outFile); err != nil { + if err := r.normalize(path, outFile); err != nil { return errors.Wrapf(err, "move file: %s", path) } } @@ -692,21 +704,29 @@ func RuntimeInfoURLFromBase(u *url.URL) *url.URL { var envRe = regexp.MustCompile(`\$\(([a-zA-Z_0-9]+)\)`) -func expandEnv(b []byte) (r []byte, err error) { - r = envRe.ReplaceAllFunc(b, func(n []byte) []byte { +func (r *Reloader) expandEnv(b []byte) (replaced []byte, err error) { + replaced = envRe.ReplaceAllFunc(b, func(n []byte) []byte { if err != nil { return nil } + m := n n = n[2 : len(n)-1] v, ok := os.LookupEnv(string(n)) if !ok { - err = errors.Errorf("found reference to unset environment variable %q", n) + r.configEnvVarExpansionErrors.Inc() + errStr := errors.Errorf("found reference to unset environment variable %q", n) + if r.discardEnvVarExpansionErrors { + level.Warn(r.logger).Log("msg", "expand environment variable", "err", errStr) + return m + } + level.Error(r.logger).Log("msg", "expand environment variable", "err", errStr) + err = errStr return nil } return []byte(v) }) - return r, err + return replaced, err } type watcher struct { diff --git a/pkg/reloader/reloader_test.go b/pkg/reloader/reloader_test.go index c2448b6ea12..e18611977f2 100644 --- a/pkg/reloader/reloader_test.go +++ b/pkg/reloader/reloader_test.go @@ -97,10 +97,44 @@ config: testutil.NotOk(t, err) testutil.Assert(t, strings.HasSuffix(err.Error(), `found reference to unset environment variable "TEST_RELOADER_THANOS_ENV"`), "expect error since there envvars are not set.") + // Don't fail with unset variables. + ctx2, cancel2 := context.WithTimeout(ctx, 10*time.Second) + configEnvVarExpansionErrorsCountPre := promtest.ToFloat64(reloader.configEnvVarExpansionErrors) + + // Enable suppressing environment variables expansion errors. + reloader.discardEnvVarExpansionErrors = true + + // Set an environment variable while leaving the other unset, so as to ensure we don't break the flow when an unset + // variable is found. + testutil.Ok(t, os.Setenv("TEST_RELOADER_THANOS_ENV2", "3")) + err = reloader.Watch(ctx2) + cancel2() + + // Restore state. + reloader.discardEnvVarExpansionErrors = false + testutil.Ok(t, os.Unsetenv("TEST_RELOADER_THANOS_ENV2")) + + // The environment variable expansion errors should be suppressed, but recorded. + configEnvVarExpansionErrorsCountPost := promtest.ToFloat64(reloader.configEnvVarExpansionErrors) + testutil.Equals(t, 1.0, configEnvVarExpansionErrorsCountPost-configEnvVarExpansionErrorsCountPre) + + // All environment variables expansion errors should be suppressed. + testutil.Ok(t, err) + + // Config should consist on unset as well as set variables. + f, err := os.ReadFile(output) + testutil.Ok(t, err) + testutil.Equals(t, ` +config: + a: 1 + b: $(TEST_RELOADER_THANOS_ENV) + c: 3 +`, string(f)) + testutil.Ok(t, os.Setenv("TEST_RELOADER_THANOS_ENV", "2")) testutil.Ok(t, os.Setenv("TEST_RELOADER_THANOS_ENV2", "3")) - rctx, cancel2 := context.WithCancel(ctx) + rctx, cancel3 := context.WithCancel(ctx) g := sync.WaitGroup{} g.Add(1) go func() { @@ -159,7 +193,7 @@ config: } } } - cancel2() + cancel3() g.Wait() testutil.Ok(t, os.Unsetenv("TEST_RELOADER_THANOS_ENV"))