From e16f9caed0fa61305b8cd294b1fe57995da11e55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Kripalani?= Date: Wed, 18 Mar 2020 18:52:49 +0000 Subject: [PATCH] automatically expose pprof+prometheus endpoint. (#677) --- .dockerignore | 3 +- manifests/placebo.toml | 13 +++++--- pkg/build/golang/Dockerfile.template | 1 + pkg/docker/container.go | 11 ++++--- pkg/runner/local_docker.go | 9 ++++-- pkg/sidecar/docker_reactor.go | 36 +++++++++++++++++++++-- pkg/tgwriter/tgwriter.go | 4 +++ plans/placebo/main.go | 13 ++++---- sdk/runtime/metrics.go | 20 ------------- sdk/runtime/runner.go | 44 +++++++++++++++++++++++++++- 10 files changed, 114 insertions(+), 40 deletions(-) diff --git a/.dockerignore b/.dockerignore index a934775c6..421b3f631 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,4 +5,5 @@ **/.gitignore **/.gitkeep **/.gitmodules -**/Dockerfile \ No newline at end of file +**/Dockerfile +**/infra \ No newline at end of file diff --git a/manifests/placebo.toml b/manifests/placebo.toml index 6bbe2b1d9..6490f0cbf 100644 --- a/manifests/placebo.toml +++ b/manifests/placebo.toml @@ -52,7 +52,12 @@ instances = { min = 1, max = 250, default = 1 } [testcases.params] some_param = { type = "int", desc = "some param", unit = "peers" } -# # seq 3 -# [[testcases]] -# name = "panic" -# instances = { min = 1, max = 250, default = 1 } +# seq 3 +[[testcases]] +name = "panic" +instances = { min = 1, max = 250, default = 1 } + +# seq 4 +[[testcases]] +name = "stall" +instances = { min = 1, max = 250, default = 1 } diff --git a/pkg/build/golang/Dockerfile.template b/pkg/build/golang/Dockerfile.template index 0e3013d32..e44fc51c9 100644 --- a/pkg/build/golang/Dockerfile.template +++ b/pkg/build/golang/Dockerfile.template @@ -64,4 +64,5 @@ COPY --from=0 /tmp/delete_me /tmp/go-ipfs/ipfs* /usr/local/bin/ RUN rm -f /usr/local/bin/delete_me ENV PATH="/usr/local/bin:${PATH}" +EXPOSE 6060 ENTRYPOINT [ "/testplan", "--vv"] diff --git a/pkg/docker/container.go b/pkg/docker/container.go index d80e05a8f..dc5652770 100644 --- a/pkg/docker/container.go +++ b/pkg/docker/container.go @@ -3,6 +3,7 @@ package docker import ( "context" "errors" + "fmt" "os" "github.com/docker/docker/api/types" @@ -127,13 +128,15 @@ func EnsureContainer(ctx context.Context, log *zap.SugaredLogger, cli *client.Cl log.Infow("starting new container", "id", res.ID) err = cli.ContainerStart(ctx, res.ID, types.ContainerStartOptions{}) - if err == nil { - log.Infow("started container", "id", res.ID) + if err != nil { + return nil, false, err } + log.Infow("started container", "id", res.ID) + c, err := cli.ContainerInspect(ctx, res.ID) - if err == nil { - log.Infow("started container", "id", res.ID) + if err != nil { + return nil, false, fmt.Errorf("failed to inspect container: %w", err) } return &c, true, err diff --git a/pkg/runner/local_docker.go b/pkg/runner/local_docker.go index e6b8a1e1a..541caee08 100644 --- a/pkg/runner/local_docker.go +++ b/pkg/runner/local_docker.go @@ -588,6 +588,11 @@ func ensureControlNetwork(ctx context.Context, cli *client.Client, log *zap.Suga ctx, log, cli, "testground-control", + // making internal=false enables us to expose ports to the host (e.g. + // pprof and prometheus). by itself, it would allow the container to + // access the Internet, and therefore would break isolation, but since + // we have sidecar overriding the default Docker ip routes, and + // suppressing such traffic, we're safe. false, network.IPAMConfig{ Subnet: controlSubnet, @@ -635,14 +640,14 @@ func newDataNetwork(ctx context.Context, cli *client.Client, log *zap.SugaredLog } // ensure container is started -func ensureInfraContainer(ctx context.Context, cli *client.Client, log *zap.SugaredLogger, containerName string, imageName string, NetworkID string, pull bool) (id string, err error) { +func ensureInfraContainer(ctx context.Context, cli *client.Client, log *zap.SugaredLogger, containerName string, imageName string, networkID string, pull bool) (id string, err error) { container, _, err := docker.EnsureContainer(ctx, log, cli, &docker.EnsureContainerOpts{ ContainerName: containerName, ContainerConfig: &container.Config{ Image: imageName, }, HostConfig: &container.HostConfig{ - NetworkMode: container.NetworkMode(NetworkID), + NetworkMode: container.NetworkMode(networkID), PublishAllPorts: true, }, PullImageIfMissing: pull, diff --git a/pkg/sidecar/docker_reactor.go b/pkg/sidecar/docker_reactor.go index 3fb097af6..66bf11cdd 100644 --- a/pkg/sidecar/docker_reactor.go +++ b/pkg/sidecar/docker_reactor.go @@ -18,6 +18,16 @@ import ( "github.com/ipfs/testground/sdk/runtime" ) +// PublicAddr points to an IP address in the public range. It helps us discover +// the IP address of the gateway (i.e. the Docker host) on the control network +// (the learned route will be via the control network because, at this point, +// the only network that's attached to the container is the control network). +// +// Sidecar doesn't whitelist traffic to public addresses, but it special-cases +// traffic between the container and the host, so that pprof, metrics and other +// ports can be exposed to the Docker host. +var PublicAddr = net.ParseIP("1.1.1.1") + type DockerReactor struct { routes []net.IP manager *docker.Manager @@ -163,8 +173,6 @@ func (d *DockerReactor) handleContainer(ctx context.Context, container *docker.C // TODO: Some of this code could be factored out into helpers. - // Get the routes to redis. We need to keep these. - var controlRoutes []netlink.Route for _, route := range d.routes { nlroutes, err := netlinkHandle.RouteGet(route) @@ -174,6 +182,30 @@ func (d *DockerReactor) handleContainer(ctx context.Context, container *docker.C controlRoutes = append(controlRoutes, nlroutes...) } + // Get the route to a public address. We will NOT be whitelisting traffic to + // public IPs, but this helps us discover the IP address of the Docker host + // on the control network. See the godoc on the PublicAddr var for more + // info. + pub, err := netlinkHandle.RouteGet(PublicAddr) + if err != nil { + return nil, fmt.Errorf("failed to resolve route for %s: %w", PublicAddr, err) + } + + switch { + case len(pub) == 0: + logging.S().Warnw("failed to discover gateway/host address; no routes to public IPs", "container_id", container.ID) + case pub[0].Gw == nil: + logging.S().Warnw("failed to discover gateway/host address; gateway is nil", "route", pub[0], "container_id", container.ID) + default: + hostRoutes, err := netlinkHandle.RouteGet(pub[0].Gw) + if err != nil { + logging.S().Warnw("failed to add route for gateway/host address", "error", err, "route", pub[0], "container_id", container.ID) + break + } + logging.S().Infow("successfully resolved route to host", "container_id", container.ID) + controlRoutes = append(controlRoutes, hostRoutes...) + } + for id, link := range links { if name, ok := reverseIndex[id]; ok { // manage this network diff --git a/pkg/tgwriter/tgwriter.go b/pkg/tgwriter/tgwriter.go index 2375fe6bd..1afdae013 100644 --- a/pkg/tgwriter/tgwriter.go +++ b/pkg/tgwriter/tgwriter.go @@ -44,6 +44,10 @@ type Error struct { } func (tgw *TgWriter) Write(p []byte) (n int, err error) { + if p == nil { + return 0, nil + } + pld := Msg{ Type: "progress", Payload: p, diff --git a/plans/placebo/main.go b/plans/placebo/main.go index 501fa115e..03e60f404 100644 --- a/plans/placebo/main.go +++ b/plans/placebo/main.go @@ -12,6 +12,7 @@ import ( func main() { runtime.Invoke(run) } + func run(runenv *runtime.RunEnv) error { if runenv.TestCaseSeq < 0 { panic("test case sequence number not set") @@ -21,25 +22,25 @@ func run(runenv *runtime.RunEnv) error { case 0: return nil case 2: - // expose prometheus endpoint - listener := runenv.MustExportPrometheus() - defer listener.Close() - // create context for cancelation ctx, cancel := context.WithCancel(context.Background()) defer cancel() // snapshot metrics every second and save them into "metrics" directory - err := runenv.HTTPPeriodicSnapshots(ctx, "http://"+listener.Addr().String(), time.Second, "metrics") + err := runenv.HTTPPeriodicSnapshots(ctx, "http://"+runtime.HTTPListenAddr+"/metrics", time.Second, "metrics") if err != nil { return err } - time.Sleep(time.Second * 10) + time.Sleep(time.Second * 5) return nil case 3: // panic panic(errors.New("this is an intentional panic")) + case 4: + // stall + time.Sleep(24 * time.Hour) + return nil default: return fmt.Errorf("aborting") } diff --git a/sdk/runtime/metrics.go b/sdk/runtime/metrics.go index f93a00cb5..d773384e2 100644 --- a/sdk/runtime/metrics.go +++ b/sdk/runtime/metrics.go @@ -3,7 +3,6 @@ package runtime import ( "context" "io" - "net" "net/http" "os" "path" @@ -11,7 +10,6 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" ) func NewCounter(runenv *RunEnv, name string, help string) prometheus.Counter { @@ -50,24 +48,6 @@ func NewSummary(runenv *RunEnv, name string, help string, opts prometheus.Summar return s } -// MustExportPrometheus starts an HTTP server with the Prometheus handler. -// It starts on a random open port and returns the listener. It is the caller -// responsability to close the listener. -func (re *RunEnv) MustExportPrometheus() net.Listener { - listener, err := net.Listen("tcp", "127.0.0.1:0") - if err != nil { - panic(err) - } - - go func() { - // avoid triggering golangci-lint for not checking - // the error. - _ = http.Serve(listener, promhttp.Handler()) - }() - - return listener -} - // HTTPPeriodicSnapshots periodically fetches the snapshots from the given address // and outputs them to the out directory. Every file will be in the format timestamp.out. func (re *RunEnv) HTTPPeriodicSnapshots(ctx context.Context, addr string, dur time.Duration, outDir string) error { diff --git a/sdk/runtime/runner.go b/sdk/runtime/runner.go index 655eb8a74..066c32033 100644 --- a/sdk/runtime/runner.go +++ b/sdk/runtime/runner.go @@ -3,12 +3,31 @@ package runtime import ( "fmt" "io" + "net" "os" "runtime/debug" "strings" "time" + + "net/http" + _ "net/http/pprof" + + "github.com/prometheus/client_golang/prometheus/promhttp" ) +const ( + // These ports are the HTTP ports we'll attempt to bind to. If this instance + // is running in a Docker container, binding to 6060 is safe. If it's a + // local:exec run, these ports belong to the host, so starting more than one + // instance will lead to a collision. Therefore we fallback to 0. + HTTPPort = 6060 + HTTPPortFallback = 0 +) + +// HTTPListenAddr will be set to the listener address _before_ the test case is +// invoked. If we were unable to start the listener, this value will be "". +var HTTPListenAddr string + // Invoke runs the passed test-case and reports the result. func Invoke(tc func(*RunEnv) error) { var ( @@ -17,6 +36,8 @@ func Invoke(tc func(*RunEnv) error) { durationGauge = NewGauge(runenv, "plan_duration", "Run time (seconds)") ) + setupHTTPListener(runenv) + // The prometheus pushgateway has a customized scrape interval, which is used to hint to the // prometheus operator at which interval the it should be scraped. This is currently set to 5s. // To provide an updated metric in every scrape, jobs will push to the pushgateway at the same @@ -80,7 +101,6 @@ func Invoke(tc func(*RunEnv) error) { if err = errfile.Sync(); err != nil { runenv.RecordCrash(fmt.Errorf("stderr file tee sync failed failed: %w", err)) - return } }() @@ -107,3 +127,25 @@ func Invoke(tc func(*RunEnv) error) { _ = rd.Close() <-doneCh } + +func setupHTTPListener(runenv *RunEnv) { + addr := fmt.Sprintf("0.0.0.0:%d", HTTPPort) + l, err := net.Listen("tcp", addr) + if err != nil { + addr = fmt.Sprintf("0.0.0.0:%d", HTTPPortFallback) + if l, err = net.Listen("tcp", addr); err != nil { + runenv.RecordMessage("error registering default http handler at: %s: %s", addr, err) + return + } + } + + // DefaultServeMux already includes the pprof handler, add the + // Prometheus handler. + http.DefaultServeMux.Handle("/metrics", promhttp.Handler()) + + HTTPListenAddr = l.Addr().String() + + runenv.RecordMessage("registering default http handler at: http://%s/ (pprof: http://%s/debug/pprof/)", HTTPListenAddr, HTTPListenAddr) + + go http.Serve(l, nil) +}