roachtest: add admission/follower-overload
This is a less ad-hoc version of the experiment in cockroachdb#81289, where I
messed with the EBS configuration. That can't be done programmatically, so
here we use an IO nemesis on n3 instead.

Part of cockroachdb#79215.
Closes cockroachdb#81834.

Release note: None
tbg committed Aug 18, 2022
1 parent 5c2c62e commit a3f80e6
Showing 4 changed files with 354 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
@@ -8,6 +8,7 @@ go_library(
"acceptance.go",
"activerecord.go",
"activerecord_blocklist.go",
"admission_control_follower_overload.go",
"allocator.go",
"alterpk.go",
"asyncpg.go",
286 changes: 286 additions & 0 deletions pkg/cmd/roachtest/tests/admission_control_follower_overload.go
@@ -0,0 +1,286 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
//

package tests

import (
"context"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/stretchr/testify/require"
)

func registerAdmissionControlFollowerOverload(r registry.Registry) {
spec := func(subtest string, cfg admissionControlFollowerOverloadOpts) registry.TestSpec {
return registry.TestSpec{
Name: "admission/follower-overload/" + subtest,
Owner: registry.OwnerKV,
Timeout: 3 * time.Hour,
// Don't re-use the cluster, since we don't have any conventions about
// `wipe` removing any custom systemd units.
//
// NB: use 16vcpu machines to avoid getting anywhere close to EBS bandwidth limits
// on AWS, see:
// https://github.com/cockroachdb/cockroach/issues/82109#issuecomment-1154049976
Cluster: r.MakeClusterSpec(4, spec.CPU(4), spec.ReuseNone()),
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runAdmissionControlFollowerOverload(ctx, t, c, cfg)
},
}
}

// The control group: just the vanilla cluster workloads, no nemesis.
// Running this and looking at performance blips can give us an idea of
// what "normal" looks like. This is most directly contrasted with
// presplit-with-leases, but since the workload on n3 barely needs any
// resources, it should also compare well with presplit-no-leases.
r.Add(spec("presplit-control", admissionControlFollowerOverloadOpts{
kv0N12: true,
kvN12ExtraArgs: "--splits 100",
kv50N3: true,
}))
// n3 has no leases (but has disk overload), so n1 and n2 field all of the
// active work but replicate to n3. The workload should be steady with good
// p99s since there is no backpressure from n3 (at the time of writing) and
// we're not sending it any foreground traffic. The quota pools shouldn't
// deplete since writes are spread out evenly across 100 ranges.
r.Add(spec("presplit-no-leases", admissionControlFollowerOverloadOpts{
ioNemesis: true,
kv0N12: true,
kvN12ExtraArgs: "--splits 100",
}))
// Everything as before, but now the writes aren't spread out but all hit the
// same range. This could lead to the quota pool on that range running
// significantly emptier, possibly to the point of stalling foreground writes.
r.Add(spec("hotspot-no-leases", admissionControlFollowerOverloadOpts{
ioNemesis: true,
kv0N12: true,
kvN12ExtraArgs: "--sequential",
kv50N3: true,
}))
// This is identical to presplit-no-leases, but this time we are also running a
// (small) workload against n3. Looking at the performance of this workload gives
// us an idea of the impact of follower writes overload on a foreground workload.
r.Add(spec("presplit-with-leases", admissionControlFollowerOverloadOpts{
ioNemesis: true,
kv0N12: true,
kvN12ExtraArgs: "--splits=100",
kv50N3: true,
}))

}

type admissionControlFollowerOverloadOpts struct {
ioNemesis      bool   // if set, throttle write throughput to n3's store
kv0N12         bool   // if set, run a write-only kv workload against n1 and n2
kvN12ExtraArgs string // extra arguments for the kv workload's `workload init`
kv50N3         bool   // if set, run a small kv50 workload whose leases are on n3
}

func runAdmissionControlFollowerOverload(
ctx context.Context, t test.Test, c cluster.Cluster, cfg admissionControlFollowerOverloadOpts,
) {
require.False(t, c.IsLocal())

resetSystemdUnits := func() {
for _, cmd := range []string{"stop", "reset-failed"} {
_ = c.RunE(ctx, c.Node(4), "sudo", "systemctl", cmd, "kv-n12")
_ = c.RunE(ctx, c.Node(4), "sudo", "systemctl", cmd, "kv-n3")
}
}

// Make cluster re-use possible, so that this test can be iterated on
// without creating a new cluster every time.
const dev = true
if dev {
resetSystemdUnits()
}

// Set up prometheus.
{
clusNodes := c.Range(1, c.Spec().NodeCount-1)
workloadNode := c.Node(c.Spec().NodeCount)
cfg := (&prometheus.Config{}).
WithPrometheusNode(workloadNode.InstallNodes()[0]).
WithGrafanaDashboard("https://gist.githubusercontent.com/tbg/f238d578269143187e71a1046562225f/raw").
WithCluster(clusNodes.InstallNodes()).
WithNodeExporter(clusNodes.InstallNodes()).
WithWorkload("kv-n12", workloadNode.InstallNodes()[0], 2112). // kv-n12
WithWorkload("kv-n3", workloadNode.InstallNodes()[0], 2113) // kv-n3 (if present)

require.NoError(t, c.StartGrafana(ctx, t.L(), cfg))
defer c.StopGrafana(ctx, t.L(), t.ArtifactsDir())
}

phaseDuration := 3 * time.Minute // TODO: increase to time.Hour

nodes := c.Range(1, 3)
c.Put(ctx, t.Cockroach(), "cockroach")
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), nodes)
db := c.Conn(ctx, t.L(), 1)
require.NoError(t, WaitFor3XReplication(ctx, t, db))

{
_, err := db.ExecContext(ctx, `SET CLUSTER SETTING admission.kv.pause_replication_io_threshold = 0.8`)
require.NoError(t, err)
}

if cfg.kv0N12 {
args := strings.Fields("./cockroach workload init kv {pgurl:1}")
args = append(args, strings.Fields(cfg.kvN12ExtraArgs)...)
c.Run(ctx, c.Node(1), args...)
}
if cfg.kv50N3 {
args := strings.Fields("./cockroach workload init kv --db kvn3 {pgurl:1}")
c.Run(ctx, c.Node(1), args...)
}

// Node 3 should not have any leases, except for the kvn3 database (if present).
runner := sqlutils.MakeSQLRunner(db)
for _, row := range runner.QueryStr(
t, `SELECT target FROM [ SHOW ZONE CONFIGURATIONS ]`,
) {
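// Example of a resulting statement (for the RANGE default target):
//   ALTER RANGE default CONFIGURE ZONE USING lease_preferences = '[[-node3]]'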
q := `ALTER ` + row[0] + ` CONFIGURE ZONE USING lease_preferences = '[[-node3]]'`
t.L().Printf("%s", q)
_, err := db.Exec(q)
require.NoError(t, err)
}
if cfg.kv50N3 {
q := `ALTER DATABASE kvn3 CONFIGURE ZONE USING lease_preferences = '[[+node3]]', constraints = COPY FROM PARENT`
t.L().Printf("%s", q)
runner.Exec(t, q)
}

{
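// Poll crdb_internal.ranges until no ranges outside of kvn3 have their
// lease on n3 and (if configured) all kvn3 ranges do, i.e. until the
// zone configs above have taken effect.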
var attempts int
for ctx.Err() == nil {
attempts++
m1 := runner.QueryStr(t, `SELECT range_id FROM crdb_internal.ranges WHERE lease_holder=3 AND database_name != 'kvn3'`)
m2 := runner.QueryStr(t, `SELECT range_id FROM crdb_internal.ranges WHERE lease_holder!=3 AND database_name = 'kvn3'`)
if len(m1)+len(m2) == 0 {
t.L().Printf("done waiting for lease movement")
break
}
if len(m1) > 0 {
t.L().Printf("waiting for %d range leases to move off n3: %v", len(m1), m1)
}
if len(m2) > 0 {
t.L().Printf("waiting for %d range leases to move to n3: %v", len(m2), m2)
}

time.Sleep(10 * time.Second)
require.Less(t, attempts, 100)
}
}

if cfg.kv0N12 {
// Deploy a workload against the default kv database (which has no leases
// on n3) and let it run for a phase duration. This does not block and
// keeps running even after the test tears down. Initially, the workload
// below was configured for 400 requests per second with 10k blocks,
// amounting to 4 MB/s of goodput. Experimentally this was observed to
// cause (after ~8h) a per-store read throughput of ~60 MB/s and a write
// throughput of ~140 MB/s, i.e. a total of close to 200 MB/s (per store).
// This was too much for default EBS disks (see below), and there was
// unpredictable performance when reprovisioning such volumes with higher
// throughput, so we now run at 2 MB/s, which should translate to
// ~100 MB/s of max sustained combined throughput.
//
// NB: on GCE pd-ssd, we get 30 (combined) IOPS and 0.45 MB/s of
// (combined) throughput per provisioned GB, so the 500GB volumes in this
// test get 15k IOPS and 225 MB/s.
//
// See: https://cloud.google.com/compute/docs/disks/performance#footnote-1
//
// On AWS, the default EBS volumes have 3000 IOPS and 125 MB/s of combined
// throughput. Additionally, instances have a throughput limit for talking
// to EBS, see:
//
// https://github.com/cockroachdb/cockroach/issues/82109#issuecomment-1154049976
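// Back-of-the-envelope: --max-rate 400 with 5000-byte blocks works out to
// 400 * 5000 B/s = 2 MB/s of goodput; the much larger disk throughput
// observed above is due to write amplification and compaction work.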
deployWorkload := `
mkdir -p logs && \
sudo systemd-run --property=Type=exec \
--property=StandardOutput=file:/home/ubuntu/logs/kv-n12.stdout.log \
--property=StandardError=file:/home/ubuntu/logs/kv-n12.stderr.log \
--remain-after-exit --unit kv-n12 -- ./cockroach workload run kv --read-percent 0 \
--max-rate 400 --concurrency 400 --min-block-bytes 5000 --max-block-bytes 5000 --tolerate-errors {pgurl:1-2}`
c.Run(ctx, c.Node(4), deployWorkload)
}
if cfg.kv50N3 {
// On n3, we run a "trickle" workload that does not add much work to the
// system, but which we can use to monitor the impact of the overload on
// the follower's foreground traffic. All leases for this workload are
// held by n3.
const deployWorkload = `
sudo systemd-run --property=Type=exec \
--property=StandardOutput=file:/home/ubuntu/logs/kv-n3.stdout.log \
--property=StandardError=file:/home/ubuntu/logs/kv-n3.stderr.log \
--remain-after-exit --unit kv-n3 -- ./cockroach workload run kv --db kvn3 \
--read-percent 50 --max-rate 100 --concurrency 1000 --min-block-bytes 100 --max-block-bytes 100 \
--prometheus-port 2113 --tolerate-errors {pgurl:3}`
c.Run(ctx, c.Node(4), deployWorkload)
}
t.L().Printf("deployed workload")

wait(c.NewMonitor(ctx, nodes), phaseDuration)

if cfg.ioNemesis {
// Limit write throughput to n3's store to 20 MB/s. This is not enough to
// keep up with the workload, at least not in the long run, due to write
// amplification.
//
// NB: I happen to have tested this on RAID0 and it doesn't quite behave
// as expected: the limit will be set on the `md0` device:
//
// nvme1n1 259:0 0 500G 0 disk
// └─md0 9:0 0 872.3G 0 raid0 /mnt/data1
// nvme2n1 259:1 0 372.5G 0 disk
// └─md0 9:0 0 872.3G 0 raid0 /mnt/data1
//
// and so the actual write throttle is about 2x what was set.
c.Run(ctx, c.Node(3), "sudo", "systemctl", "set-property", "cockroach", "'IOWriteBandwidthMax={store-dir} 20971520'")
t.L().Printf("installed write throughput limit on n3")
}

wait(c.NewMonitor(ctx, nodes), phaseDuration)

// TODO collect, assert on, and export metrics, using:
// https://github.com/cockroachdb/cockroach/pull/80724.
// Things to check:
// - LSM health of the follower (and, to be sure, of the other replicas).
// - Latency of a benign read-only workload on the follower.
// - Comparison of baseline performance of the kv0 workload before and
//   after the disk nemesis kicks in (i.e. run first without the nemesis,
//   then with it, maybe again without, and make sure the runs are all
//   roughly comparable, or report all three). At first, probably just
//   export the overall coefficient of variation or something like that
//   and leave detailed interpretation to human eyes on roachperf.
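// For a prototype of such an assertion, see TestFoo in foo_test.go (added
// in this commit), which flags overload scores above 2.0.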
t.Fatal("failing on purpose")
}

func wait(m cluster.Monitor, duration time.Duration) {
m.Go(func(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(duration):
return nil
}
})
m.Wait()
}
66 changes: 66 additions & 0 deletions pkg/cmd/roachtest/tests/foo_test.go
@@ -0,0 +1,66 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
//

package tests

import (
"context"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/util/timeutil"
promapi "github.com/prometheus/client_golang/api"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
)

func TestFoo(t *testing.T) {
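// NB: this talks to a hard-coded, live roachprod cluster; presumably it
// is meant for manual experimentation only.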
client, err := promapi.NewClient(promapi.Config{
Address: "http://tobias-overload-new-0004.roachprod.crdb.io:9090",
})
require.NoError(t, err)
now := timeutil.Now()
c := promv1.NewAPI(client)
// NB: this is, at each datapoint, the larger of the two values.
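// (In PromQL, `A > B or B` selects A wherever A exceeds B, and falls back
// to B elsewhere, i.e. an element-wise max of the two normalized scores.)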
const q = `
storage_l0_sublevels{store="3"}/20 > storage_l0_num_files{store="3"}/1000 or storage_l0_num_files{store="3"}/1000
`
v, warns, err := c.QueryRange(context.Background(), q, promv1.Range{
Start: now.Add(-10 * time.Minute),
End: now,
Step: time.Minute,
})
require.NoError(t, err)
require.Len(t, warns, 0)

m := v.(model.Matrix)

var n int
for _, ss := range m {
for _, v := range ss.Values {
n++
// We're pausing once we hit .8, but there's some delay so we may overshoot a bit.
// In practice, we're hoping to stay away from 1.0 most of the time and definitely
// we shouldn't be hitting 2.0 (unless pausing is broken, in which case we're
// sure to hit it).
if v.Value > 2.0 {
t.Errorf("%s at %s: overload score %.2f", ss.Metric, v.Timestamp, v.Value)
}
}
}

require.NotZero(t, n)

if t.Failed() {
t.Log(m)
}
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/registry.go
@@ -16,6 +16,7 @@ import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
func RegisterTests(r registry.Registry) {
registerAcceptance(r)
registerActiveRecord(r)
registerAdmissionControlFollowerOverload(r)
registerAllocator(r)
registerAlterPK(r)
registerAWSDMS(r)
