forked from cockroachdb/cockroach
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
roachtest: add admission/follower-overload
This is is a less ad-hoc version of the experiment in cockroachdb#81289, where I messed with the EBS configuration. This can't be done programmatically, and so here we use an IO nemesis on n3 instead. Part of cockroachdb#79215. Closes cockroachdb#81834. Release note: None
- Loading branch information
Showing
4 changed files
with
354 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
286 changes: 286 additions & 0 deletions
286
pkg/cmd/roachtest/tests/admission_control_follower_overload.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,286 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
// | ||
|
||
package tests | ||
|
||
import ( | ||
"context" | ||
"strings" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" | ||
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" | ||
"github.com/cockroachdb/cockroach/pkg/roachprod/install" | ||
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func registerAdmissionControlFollowerOverload(r registry.Registry) { | ||
spec := func(subtest string, cfg admissionControlFollowerOverloadOpts) registry.TestSpec { | ||
return registry.TestSpec{ | ||
Name: "admission/follower-overload/" + subtest, | ||
Owner: registry.OwnerKV, | ||
Timeout: 3 * time.Hour, | ||
// Don't re-use the cluster, since we don't have any conventions about | ||
// `wipe` removing any custom systemd units. | ||
// | ||
// NB: use 16vcpu machines to avoid getting anywhere close to EBS bandwidth limits | ||
// on AWS, see: | ||
// https://github.com/cockroachdb/cockroach/issues/82109#issuecomment-1154049976 | ||
Cluster: r.MakeClusterSpec(4, spec.CPU(4), spec.ReuseNone()), | ||
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { | ||
runAdmissionControlFollowerOverload(ctx, t, c, cfg) | ||
}, | ||
} | ||
} | ||
|
||
// The control group - just the vanilla cluster workloads, no nemesis. Running | ||
// this and looking at performance blips can give us an idea of what "normal" | ||
// looks like. This is most directly contrasted with presplit-with-leases but | ||
// since the workload on N3 barely needs any resources, it should also compare | ||
// well with presplit-no-leases. | ||
r.Add(spec("presplit-control", admissionControlFollowerOverloadOpts{ | ||
kv0N12: true, | ||
kvN12ExtraArgs: "--splits 100", | ||
kv50N3: true, | ||
})) | ||
// n3 has no leases (but has disk overload), so n1 and n2 field all of the | ||
// active work but replicate to n3. The workload should be steady with good | ||
// p99s since there is no backpressure from n3 (at the time of writing) and | ||
// we're not sending it any foreground traffic. The quota pools shouldn't | ||
// deplete since writes are spread out evenly across 100 ranges. | ||
r.Add(spec("presplit-no-leases", admissionControlFollowerOverloadOpts{ | ||
ioNemesis: true, | ||
kv0N12: true, | ||
kvN12ExtraArgs: "--splits 100", | ||
})) | ||
// Everything as before, but now the writes aren't spread out but all hit the | ||
// same range. This could lead to the quota pool on that range running | ||
// significantly emptier, possibly to the point of stalling foreground writes. | ||
r.Add(spec("hotspot-no-leases", admissionControlFollowerOverloadOpts{ | ||
ioNemesis: true, | ||
kv0N12: true, | ||
kvN12ExtraArgs: "--sequential", | ||
kv50N3: true, | ||
})) | ||
// This is identical to presplit-no-leases, but this time we are also running a | ||
// (small) workload against n3. Looking at the performance of this workload gives | ||
// us an idea of the impact of follower writes overload on a foreground workload. | ||
r.Add(spec("presplit-with-leases", admissionControlFollowerOverloadOpts{ | ||
ioNemesis: true, | ||
kv0N12: true, | ||
kvN12ExtraArgs: "--splits=100", | ||
kv50N3: true, | ||
})) | ||
|
||
} | ||
|
||
type admissionControlFollowerOverloadOpts struct { | ||
ioNemesis bool | ||
kv0N12 bool | ||
kvN12ExtraArgs string | ||
kv50N3 bool | ||
} | ||
|
||
func runAdmissionControlFollowerOverload( | ||
ctx context.Context, t test.Test, c cluster.Cluster, cfg admissionControlFollowerOverloadOpts, | ||
) { | ||
require.False(t, c.IsLocal()) | ||
|
||
resetSystemdUnits := func() { | ||
for _, cmd := range []string{"stop", "reset-failed"} { | ||
_ = c.RunE(ctx, c.Node(4), "sudo", "systemctl", cmd, "kv-n12") | ||
_ = c.RunE(ctx, c.Node(4), "sudo", "systemctl", cmd, "kv-n3") | ||
} | ||
} | ||
|
||
// Make cluster re-use possible to iterate on this test without making a new | ||
// cluster every time. | ||
const dev = true | ||
if dev { | ||
resetSystemdUnits() | ||
} | ||
|
||
// Set up prometheus. | ||
{ | ||
clusNodes := c.Range(1, c.Spec().NodeCount-1) | ||
workloadNode := c.Node(c.Spec().NodeCount) | ||
cfg := (&prometheus.Config{}). | ||
WithPrometheusNode(workloadNode.InstallNodes()[0]). | ||
WithGrafanaDashboard("https://gist.githubusercontent.com/tbg/f238d578269143187e71a1046562225f/raw"). | ||
WithCluster(clusNodes.InstallNodes()). | ||
WithNodeExporter(clusNodes.InstallNodes()). | ||
WithWorkload("kv-n12", workloadNode.InstallNodes()[0], 2112). // kv-n12 | ||
WithWorkload("kv-n3", workloadNode.InstallNodes()[0], 2113) // kv-n3 (if present) | ||
|
||
require.NoError(t, c.StartGrafana(ctx, t.L(), cfg)) | ||
defer c.StopGrafana(ctx, t.L(), t.ArtifactsDir()) | ||
} | ||
|
||
phaseDuration := 3 * time.Minute // TODO time.Hour | ||
|
||
nodes := c.Range(1, 3) | ||
c.Put(ctx, t.Cockroach(), "cockroach") | ||
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), nodes) | ||
db := c.Conn(ctx, t.L(), 1) | ||
require.NoError(t, WaitFor3XReplication(ctx, t, db)) | ||
|
||
{ | ||
_, err := c.Conn(ctx, t.L(), 1).ExecContext(ctx, `SET CLUSTER SETTING admission.kv.pause_replication_io_threshold = 0.8`) | ||
require.NoError(t, err) | ||
} | ||
|
||
if cfg.kv0N12 { | ||
args := strings.Fields("./cockroach workload init kv {pgurl:1}") | ||
args = append(args, strings.Fields(cfg.kvN12ExtraArgs)...) | ||
c.Run(ctx, c.Node(1), args...) | ||
} | ||
if cfg.kv50N3 { | ||
args := strings.Fields("./cockroach workload init kv --db kvn3 {pgurl:1}") | ||
c.Run(ctx, c.Node(1), args...) | ||
} | ||
|
||
// Node 3 should not have any leases (excepting kvn3, if present). | ||
runner := sqlutils.MakeSQLRunner(db) | ||
for _, row := range runner.QueryStr( | ||
t, `SELECT target FROM [ SHOW ZONE CONFIGURATIONS ]`, | ||
) { | ||
q := `ALTER ` + row[0] + ` CONFIGURE ZONE USING lease_preferences = '[[-node3]]'` | ||
t.L().Printf("%s", q) | ||
_, err := db.Exec(q) | ||
require.NoError(t, err) | ||
} | ||
if cfg.kv50N3 { | ||
q := `ALTER DATABASE kvn3 CONFIGURE ZONE USING lease_preferences = '[[+node3]]', constraints = COPY FROM PARENT` | ||
t.L().Printf("%s", q) | ||
runner.Exec(t, q) | ||
} | ||
|
||
{ | ||
var attempts int | ||
for ctx.Err() == nil { | ||
attempts++ | ||
m1 := runner.QueryStr(t, `SELECT range_id FROM crdb_internal.ranges WHERE lease_holder=3 AND database_name != 'kvn3'`) | ||
m2 := runner.QueryStr(t, `SELECT range_id FROM crdb_internal.ranges WHERE lease_holder!=3 AND database_name = 'kvn3'`) | ||
if len(m1)+len(m2) == 0 { | ||
t.L().Printf("done waiting for lease movement") | ||
break | ||
} | ||
if len(m1) > 0 { | ||
t.L().Printf("waiting for %d range leases to move off n3: %v", len(m1), m1) | ||
} | ||
if len(m2) > 0 { | ||
t.L().Printf("waiting for %d range leases to move to n3: %v", len(m2), m2) | ||
} | ||
|
||
time.Sleep(10 * time.Second) | ||
require.Less(t, attempts, 100) | ||
} | ||
} | ||
|
||
if cfg.kv0N12 { | ||
// Deploy workload against the default kv database (which has no leases on | ||
// n3) and let it run for a phase duration. This does not block and keeps | ||
// running even after the test tears down. Initially, the below workload was | ||
// configured for 400 requests per second with 10k blocks, amounting to | ||
// 4mb/s of goodput. Experimentally this was observed to cause (after ~8h) a | ||
// per-store read throughput of ~60mb/s and write throughput of ~140mb/s for | ||
// a total of close to 200mb/s (per store). This was too much for default | ||
// EBS disks (see below) and there was unpredictable performance when | ||
// reprovisioning such volumes with higher throughput, so we run at 2mb/s | ||
// which should translate to ~100mb/s of max sustained combined throughput. | ||
// | ||
// NB: on GCE pd-ssd, we get 30 IOPS/GB of (combined) throughput and | ||
// 0.45MB/(GB*s) for each GB provisioned, so for the 500GB volumes in this | ||
// test 15k IOPS and 225MB/s. | ||
// | ||
// See: https://cloud.google.com/compute/docs/disks/performance#footnote-1 | ||
// | ||
// On AWS, the default EBS volumes have 3000 IOPS and 125MB/s combined | ||
// throughput. Additionally, instances have a throughput limit for talking | ||
// to EBS, see: | ||
// | ||
// https://github.com/cockroachdb/cockroach/issues/82109#issuecomment-1154049976 | ||
deployWorkload := ` | ||
mkdir -p logs && \ | ||
sudo systemd-run --property=Type=exec \ | ||
--property=StandardOutput=file:/home/ubuntu/logs/kv-n12.stdout.log \ | ||
--property=StandardError=file:/home/ubuntu/logs/kv-n12.stderr.log \ | ||
--remain-after-exit --unit kv-n12 -- ./cockroach workload run kv --read-percent 0 \ | ||
--max-rate 400 --concurrency 400 --min-block-bytes 5000 --max-block-bytes 5000 --tolerate-errors {pgurl:1-2}` | ||
c.Run(ctx, c.Node(4), deployWorkload) | ||
} | ||
if cfg.kv50N3 { | ||
// On n3, we run a "trickle" workload that does not add much work to the | ||
// system but which we can use to establish to monitor the impact of the | ||
// overload on the follower to its foreground traffic. All leases for this | ||
// workload are held by n3. | ||
const deployWorkload = ` | ||
sudo systemd-run --property=Type=exec \ | ||
--property=StandardOutput=file:/home/ubuntu/logs/kv-n3.stdout.log \ | ||
--property=StandardError=file:/home/ubuntu/logs/kv-n3.stderr.log \ | ||
--remain-after-exit --unit kv-n3 -- ./cockroach workload run kv --db kvn3 \ | ||
--read-percent 50 --max-rate 100 --concurrency 1000 --min-block-bytes 100 --max-block-bytes 100 \ | ||
--prometheus-port 2113 --tolerate-errors {pgurl:3}` | ||
c.Run(ctx, c.Node(4), deployWorkload) | ||
} | ||
t.L().Printf("deployed workload") | ||
|
||
wait(c.NewMonitor(ctx, nodes), phaseDuration) | ||
|
||
if cfg.ioNemesis { | ||
// Limit write throughput on s3 to 20mb/s. This is not enough to keep up | ||
// with the workload, at least not in the long run, due to write amp. | ||
// | ||
// NB: I happen to have tested this on RAID0 and it doesn't quite behave | ||
// as expected: the limit will be set on the `md0` device: | ||
// | ||
// nvme1n1 259:0 0 500G 0 disk | ||
// └─md0 9:0 0 872.3G 0 raid0 /mnt/data1 | ||
// nvme2n1 259:1 0 372.5G 0 disk | ||
// └─md0 9:0 0 872.3G 0 raid0 /mnt/data1 | ||
// | ||
// and so the actual write throttle is about 2x what was set. | ||
c.Run(ctx, c.Node(3), "sudo", "systemctl", "set-property", "cockroach", "'IOWriteBandwidthMax={store-dir} 20971520'") | ||
t.L().Printf("installed write throughput limit on n3") | ||
} | ||
|
||
wait(c.NewMonitor(ctx, nodes), phaseDuration) | ||
|
||
// TODO collect, assert on, and export metrics, using: | ||
// https://github.com/cockroachdb/cockroach/pull/80724. | ||
// Things to check: | ||
// - LSM health of follower (and, to be sure, on other replicas) | ||
// -Latency of a benign read-only workload on the follower | ||
// - Comparison of baseline perf of kv0 workload before disk nemesis (i.e. | ||
// run first without nemesis, then with nemesis, maybe again without, make | ||
// sure they're all sort of comparable, or report all three, or something | ||
// like that. At first probably just export the overall coefficient of | ||
// variation or something like that and leave detailed interpretation to | ||
// human eyes on roachperf. | ||
t.Fatal("failing on purpose") | ||
} | ||
|
||
func wait(m cluster.Monitor, duration time.Duration) { | ||
m.Go(func(ctx context.Context) error { | ||
select { | ||
case <-ctx.Done(): | ||
return ctx.Err() | ||
case <-time.After(duration): | ||
return nil | ||
} | ||
}) | ||
m.Wait() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
// | ||
|
||
package tests | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/util/timeutil" | ||
promapi "github.com/prometheus/client_golang/api" | ||
promv1 "github.com/prometheus/client_golang/api/prometheus/v1" | ||
"github.com/prometheus/common/model" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestFoo(t *testing.T) { | ||
client, err := promapi.NewClient(promapi.Config{ | ||
Address: "http://tobias-overload-new-0004.roachprod.crdb.io:9090", | ||
}) | ||
now := timeutil.Now() | ||
require.NoError(t, err) | ||
c := promv1.NewAPI(client) | ||
// NB: this is, at each datapoint, the larger of the two values. | ||
const q = ` | ||
storage_l0_sublevels{store="3"}/20 > storage_l0_num_files{store="3"}/1000 or storage_l0_num_files{store="3"}/1000 | ||
` | ||
v, warns, err := c.QueryRange(context.Background(), q, promv1.Range{ | ||
Start: now.Add(-10 * time.Minute), | ||
End: now, | ||
Step: time.Minute, | ||
}) | ||
require.NoError(t, err) | ||
require.Len(t, warns, 0) | ||
|
||
m := v.(model.Matrix) | ||
|
||
var n int | ||
for _, ss := range m { | ||
for _, v := range ss.Values { | ||
n++ | ||
// We're pausing once we hit .8, but there's some delay so we may overshoot a bit. | ||
// In practice, we're hoping to stay away from 1.0 most of the time and definitely | ||
// we shouldn't be hitting 2.0 (unless pausing is broken, in which case we're | ||
// sure to hit it). | ||
if v.Value > 2.0 { | ||
t.Errorf("%s at %s: overload score %.2f", ss.Metric, v.Timestamp, v.Value) | ||
} | ||
} | ||
} | ||
|
||
require.NotZero(t, n) | ||
|
||
if t.Failed() { | ||
t.Log(m) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters