diff --git a/tests/antithesis/AGENTS.md b/tests/antithesis/AGENTS.md index 7838cea65ab07..ef55f61280a7a 100644 --- a/tests/antithesis/AGENTS.md +++ b/tests/antithesis/AGENTS.md @@ -38,8 +38,9 @@ The fault profile is the single source of truth: change a shot's faults by editi ```sh cd tests/antithesis/scenarios ./launch.sh vector_to_vector_e2e_disk # 30-minute run with the pinned profile -DURATION=60 ./launch.sh vector_to_vector_e2e_disk # override duration (minutes) -DRY_RUN=1 ./launch.sh vector_to_vector_e2e_disk # print the exact command, submit nothing +./launch.sh vector_e2e # the no-disk, single-node counterpart +DURATION=60 ./launch.sh vector_e2e # override duration (minutes) +DRY_RUN=1 ./launch.sh vector_e2e # print the exact command, submit nothing ``` The launcher reads tenant and registry from the environment (snouty's variables): @@ -48,15 +49,15 @@ The launcher reads tenant and registry from the environment (snouty's variables) - `ANTITHESIS_API_KEY` (or `ANTITHESIS_USERNAME` + `ANTITHESIS_PASSWORD`) - `ANTITHESIS_REPOSITORY` -`DESCRIPTION`, `TEST_NAME`, `FAULT_NODES`, and `WEBHOOK` are overridable; the +`DESCRIPTION`, `TEST_NAME`, `FAULT_NODES`, and `WEBHOOK` are overridable. The running git commit is stamped into the description automatically so a shot records the code it tested. Extra snouty flags pass straight through, e.g. -`./launch.sh vector_to_vector_e2e_disk --recipients you@example.com`. +`./launch.sh vector_e2e --recipients you@example.com`. The pinned profile submits to the `persistent_storage` webhook and faults the -scenario's SUT nodes (`head` and `tail` for the disk scenario) with node -termination, hang, and throttle, plus `cpu_mod` and `clock_jitter`. The `oracle` -is left out of termination and hang **only** — its obligation ledger lives in -memory, so killing or freezing it would erase the run's source of truth. It is -deliberately still subject to network faults so the egress delivery path is -exercised. 
+scenario's SUT nodes (`head` and `tail` for the disk scenario, `vector` for
+`vector_e2e`) with node termination, hang, and throttle, plus `cpu_mod` and
+`clock_jitter`. The `oracle` is never faulted with termination or hang — its
+obligation ledger lives in memory, so killing or freezing it would erase the run's
+source of truth. It is deliberately still subject to network faults so the egress
+delivery path is exercised.
diff --git a/tests/antithesis/scenarios/vector_e2e/Dockerfile b/tests/antithesis/scenarios/vector_e2e/Dockerfile
new file mode 100644
index 0000000000000..ffbd85a5c4aea
--- /dev/null
+++ b/tests/antithesis/scenarios/vector_e2e/Dockerfile
@@ -0,0 +1,111 @@
+# syntax=docker/dockerfile:1
+
+ARG SANCOV_RUSTFLAGS='["-Cpasses=sancov-module","-Cllvm-args=-sanitizer-coverage-level=3","-Cllvm-args=-sanitizer-coverage-trace-pc-guard","-Clink-args=-Wl,--build-id"]'
+
+############################
+# Vector SUT — build stage #
+############################
+FROM rust:1.92-bookworm AS vector-build
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    protobuf-compiler cmake perl build-essential pkg-config libssl-dev clang binutils \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /src
+COPY Cargo.toml Cargo.lock rust-toolchain.toml build.rs ./
+COPY lib ./lib
+COPY src ./src
+COPY proto ./proto
+COPY benches ./benches
+COPY tests ./tests
+COPY vdev ./vdev
+COPY scripts ./scripts
+ARG SANCOV_RUSTFLAGS
+# The cargo --config overrides below: debug=true keeps DWARF for /symbols; lto=false keeps sancov
+# instrumentation predictable and stops the optimizer from dropping the force-linked runtime.
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/src/target \
+    cargo build --release --no-default-features \
+      --features "sources-http_server,sinks-http,sources-internal_metrics,sinks-prometheus,antithesis-scenario-memory" \
+      --bin vector \
+      --config 'build.target = "x86_64-unknown-linux-gnu"' \
+      --config 'profile.release.debug = true' \
+      --config 'profile.release.lto = false' \
+      --config "target.x86_64-unknown-linux-gnu.rustflags = ${SANCOV_RUSTFLAGS}" \
+    && cp target/x86_64-unknown-linux-gnu/release/vector /usr/local/bin/vector \
+    && echo "validating instrumentation symbols..." \
+    && nm /usr/local/bin/vector | grep -q __sanitizer_cov_trace_pc_guard \
+    && nm /usr/local/bin/vector | grep -q antithesis_load_libvoidstar \
+    && echo "instrumentation OK"
+
+####################################
+# Harness (workload) — build stage #
+####################################
+# The workload binaries live in the shared `harness` crate, a member of Vector's
+# workspace, so the build needs the workspace root and member manifests. `-p`
+# compiles only the harness bins.
+FROM rust:1.92-bookworm AS workload-build
+RUN apt-get update && apt-get install -y --no-install-recommends binutils \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /src
+COPY Cargo.toml Cargo.lock rust-toolchain.toml build.rs ./
+COPY lib ./lib
+COPY src ./src
+COPY proto ./proto
+COPY benches ./benches
+COPY tests ./tests
+COPY vdev ./vdev
+COPY scripts ./scripts
+ARG SANCOV_RUSTFLAGS
+# debug=true keeps DWARF for /symbols.
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/src/target \
+    cargo build --release -p harness \
+      --config 'build.target = "x86_64-unknown-linux-gnu"' \
+      --config 'profile.release.debug = true' \
+      --config "target.x86_64-unknown-linux-gnu.rustflags = ${SANCOV_RUSTFLAGS}" \
+    && D=target/x86_64-unknown-linux-gnu/release \
+    && cp "$D/oracle" "$D/parallel_driver_produce" "$D/eventually_conservation" /usr/local/bin/ \
+    && echo "validating instrumentation symbols..." \
+    && nm /usr/local/bin/oracle | grep -q __sanitizer_cov_trace_pc_guard \
+    && nm /usr/local/bin/oracle | grep -q antithesis_load_libvoidstar \
+    && echo "instrumentation OK"
+
+######################################
+# Runtime: Vector SUT (the one node) #
+######################################
+FROM debian:bookworm-slim AS vector
+RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=vector-build /usr/local/bin/vector /usr/bin/vector
+# Bake the node config plus its benign alternate, which the reload fault swaps in
+# to force a sink rebuild.
+COPY tests/antithesis/scenarios/vector_e2e/vector.yaml /etc/vector/vector.yaml
+COPY tests/antithesis/scenarios/vector_e2e/vector.b.yaml /etc/vector/vector.b.yaml
+# The reload fault is an anytime_ test command that runs IN the node container.
+# The node stays running because its entrypoint is Vector, not a test command.
+COPY --chmod=755 tests/antithesis/scenarios/vector_e2e/anytime_reload.sh /opt/antithesis/test/v1/ve2e/anytime_reload
+RUN mkdir -p /symbols && ln -s /usr/bin/vector /symbols/vector
+ENV NO_COLOR=1
+EXPOSE 8080 9598
+ENTRYPOINT ["/usr/bin/vector"]
+
+#####################################
+# Runtime: workload (oracle + cmds) #
+#####################################
+FROM debian:bookworm-slim AS workload
+RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+# The oracle is the entrypoint. The other two binaries are the test commands:
+# drop them straight into the test template, named by their Antithesis prefix.
+COPY --from=workload-build /usr/local/bin/oracle /usr/bin/oracle
+COPY --from=workload-build /usr/local/bin/parallel_driver_produce /opt/antithesis/test/v1/ve2e/parallel_driver_produce
+COPY --from=workload-build /usr/local/bin/eventually_conservation /opt/antithesis/test/v1/ve2e/eventually_conservation
+# Symbolize all three instrumented binaries: the oracle entrypoint and both
+# test-command bins. A crash in any of them must resolve against /symbols.
+RUN mkdir -p /symbols \
+    && ln -s /usr/bin/oracle /symbols/oracle \
+    && ln -s /opt/antithesis/test/v1/ve2e/parallel_driver_produce /symbols/parallel_driver_produce \
+    && ln -s /opt/antithesis/test/v1/ve2e/eventually_conservation /symbols/eventually_conservation
+ENV NO_COLOR=1
+# The oracle waits for the SUT, emits setup_complete via the SDK, then serves so
+# Antithesis runs the test commands in this container.
+ENTRYPOINT ["/usr/bin/oracle"]
diff --git a/tests/antithesis/scenarios/vector_e2e/README.md b/tests/antithesis/scenarios/vector_e2e/README.md
new file mode 100644
index 0000000000000..4448e918b7e94
--- /dev/null
+++ b/tests/antithesis/scenarios/vector_e2e/README.md
@@ -0,0 +1,57 @@
+# vector_e2e
+
+The no-disk counterpart of `vector_to_vector_e2e_disk`.
Same two properties, one
+Vector process, memory buffer instead of `disk_v2`.
+
+**Conservation**: every event the oracle acked must eventually come back, across
+arbitrary Antithesis faults. Duplicates are allowed because the contract is
+at-least-once. A missing acked id is the bug.
+
+**Liveness**: once faults stop, the node must still accept fresh writes.
+
+## Why a single process
+
+Vector's end-to-end acknowledgements are in-process: a source holds the client's
+ack until every sink that received the event has finished. So a single node is the
+honest place to test what an e2e ack promises. The producer POSTs to the node's
+`http_server` source; the 200 comes back only once the `http` sink has delivered
+the event to the oracle and the oracle returned 2xx. That means an acked id has
+**already** reached the oracle — which is why conservation can hold even though the
+node has no disk buffer and a crash drops whatever is still in memory: those
+in-flight events were never acked, so they were never an obligation.
+
+## How it works
+
+One Vector node and one oracle container.
+
+- **vector** takes an `http_server` source (`:8080`) and delivers over `http` to
+  the oracle through an in-memory buffer with `when_full: block` and e2e acks. It
+  also exposes Prometheus metrics (`:9598`) for the health gate, and runs the
+  reload fault: an `anytime_` command swaps `vector.yaml`/`vector.b.yaml` and sends
+  `SIGHUP`, forcing the sink to rebuild mid-run.
+- **oracle** (`:8686`) is one container that injects unique event ids at the node
+  and runs the HTTP endpoint the node's sink delivers back to.
+
+The oracle keeps its id sets in memory and Antithesis never terminates it, so the
+faults under test cannot corrupt the judge. The workload binaries (`oracle`,
+`parallel_driver_produce`, `eventually_conservation`) are the shared,
+buffer-agnostic bins from `tests/antithesis/harness`, pointed at this topology by the
+environment in `docker-compose.yaml`.
+
+## Run
+
+Validate the config locally:
+
+```bash
+cd tests/antithesis
+docker compose -f scenarios/vector_e2e/docker-compose.yaml build
+snouty validate scenarios/vector_e2e
+```
+
+Submit a run through the shared launcher, which pins the fault profile (see
+`tests/antithesis/AGENTS.md`):
+
+```bash
+cd tests/antithesis/scenarios
+./launch.sh vector_e2e
+```
diff --git a/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh b/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh
new file mode 100644
index 0000000000000..71416aaf9324b
--- /dev/null
+++ b/tests/antithesis/scenarios/vector_e2e/anytime_reload.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+[ -n "${VECTOR_CONFIG_ALT:-}" ] || exit 0
+cfg="${VECTOR_CONFIG:?}"
+alt="${VECTOR_CONFIG_ALT:?}"
+
+# Vector only ever reads $cfg, so reload alternates $cfg between two immutable
+# sources rather than swapping two live files. The alternate $alt is never
+# written, and the baseline (the original $cfg) is snapshotted once, so the only
+# mutable file is $cfg and the only writes to it are a single rename of a fully
+# written temp. The node-termination fault can therefore interrupt this script at
+# any point and leave $cfg as one complete config or the other, never half-written
+# and never collapsed so both sources hold the same content. Alternation always
+# resumes on the next invocation.
+base="$cfg.orig"
+if [ ! -f "$base" ]; then
+  cp "$cfg" "$base.tmp"
+  mv "$base.tmp" "$base"
+fi
+
+# Pick whichever source is not currently live. cksum reads from stdin so its
+# output is the checksum alone, with no filename to differ on.
+if [ "$(cksum <"$cfg")" = "$(cksum <"$alt")" ]; then
+  next="$base"
+else
+  next="$alt"
+fi
+cp "$next" "$cfg.tmp"
+mv "$cfg.tmp" "$cfg"
+
+# PID 1 is Vector, or (compose sets init: true) an init shim that presumably forwards the signal — TODO confirm. SIGHUP triggers reload-from-disk.
+kill -HUP 1
+sleep 5
diff --git a/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml b/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml
new file mode 100644
index 0000000000000..abbc3a136b36d
--- /dev/null
+++ b/tests/antithesis/scenarios/vector_e2e/docker-compose.yaml
@@ -0,0 +1,51 @@
+name: vector-e2e
+
+x-vector-build: &vector-build
+  context: ../../../..
+  dockerfile: tests/antithesis/scenarios/vector_e2e/Dockerfile
+  target: vector
+
+x-node-health: &node-health
+  test: ["CMD", "curl", "-fsS", "http://localhost:9598/metrics"]
+  interval: 5s
+  timeout: 3s
+  retries: 30
+  start_period: 10s
+
+services:
+  vector:
+    container_name: vector
+    hostname: vector
+    platform: linux/amd64
+    init: true
+    build: *vector-build
+    image: ve2e-vector:${ANTITHESIS_IMAGE_TAG:-dev}
+    entrypoint: ["/usr/bin/vector", "--config", "/etc/vector/vector.yaml"]
+    # vector runs the reload fault: VECTOR_CONFIG and VECTOR_CONFIG_ALT tell anytime_reload which
+    # configs to swap before it sends SIGHUP, forcing the sink to rebuild. No disk buffer, so no volume.
+    environment:
+      NO_COLOR: "1"
+      VECTOR_CONFIG: "/etc/vector/vector.yaml"
+      VECTOR_CONFIG_ALT: "/etc/vector/vector.b.yaml"
+    healthcheck: *node-health
+
+  oracle:
+    container_name: oracle
+    hostname: oracle
+    platform: linux/amd64
+    init: true
+    build:
+      context: ../../../..
+      dockerfile: tests/antithesis/scenarios/vector_e2e/Dockerfile
+      target: workload
+    image: ve2e-oracle:${ANTITHESIS_IMAGE_TAG:-dev}
+    environment:
+      NO_COLOR: "1"
+      SCENARIO_NAME: "vector_e2e"
+      VECTOR_SOURCE_URL: "http://vector:8080/"
+      VECTOR_METRICS_URL: "http://vector:9598/metrics"
+      VECTOR_METRICS_URLS: "http://vector:9598/metrics"
+      # Test commands run in this container, so they reach the oracle locally.
+      ORACLE_URL: "http://127.0.0.1:8686"
+    depends_on:
+      vector: { condition: service_healthy }
diff --git a/tests/antithesis/scenarios/vector_e2e/vector.b.yaml b/tests/antithesis/scenarios/vector_e2e/vector.b.yaml
new file mode 100644
index 0000000000000..d426c7413699a
--- /dev/null
+++ b/tests/antithesis/scenarios/vector_e2e/vector.b.yaml
@@ -0,0 +1,36 @@
+sources:
+  in:
+    type: http_server
+    address: 0.0.0.0:8080
+    decoding:
+      codec: json
+    acknowledgements:
+      enabled: true
+
+  metrics:
+    type: internal_metrics
+    scrape_interval_secs: 1
+
+sinks:
+  out:
+    type: http
+    inputs: [in]
+    uri: http://oracle:8686/ingest
+    method: post
+    encoding:
+      codec: json
+    # Benign alternate the reload fault swaps in. It differs from vector.yaml only
+    # by an explicit request.timeout_secs, enough to make the reload rebuild the sink.
+    request:
+      timeout_secs: 45
+    buffer:
+      type: memory
+      max_events: 500
+      when_full: block
+    acknowledgements:
+      enabled: true
+
+  prom:
+    type: prometheus_exporter
+    inputs: [metrics]
+    address: 0.0.0.0:9598
diff --git a/tests/antithesis/scenarios/vector_e2e/vector.yaml b/tests/antithesis/scenarios/vector_e2e/vector.yaml
new file mode 100644
index 0000000000000..8163fcbd08d55
--- /dev/null
+++ b/tests/antithesis/scenarios/vector_e2e/vector.yaml
@@ -0,0 +1,38 @@
+sources:
+  in:
+    type: http_server
+    address: 0.0.0.0:8080
+    decoding:
+      codec: json
+    acknowledgements:
+      enabled: true
+
+  metrics:
+    type: internal_metrics
+    scrape_interval_secs: 1
+
+sinks:
+  out:
+    type: http
+    inputs: [in]
+    uri: http://oracle:8686/ingest
+    method: post
+    encoding:
+      codec: json
+    # Memory buffer, no disk: this is the no-disk counterpart of the disk scenario.
+    # when_full: block keeps the same backpressure so the source applies it to the
+    # client instead of dropping. With end-to-end acks the 200 to the producer
+    # fires only once this sink has delivered to the oracle, so an acked event has
+    # already arrived — that is why conservation can hold even though a crash loses
+    # whatever is still in this in-memory buffer.
+    buffer:
+      type: memory
+      max_events: 500
+      when_full: block
+    acknowledgements:
+      enabled: true
+
+  prom:
+    type: prometheus_exporter
+    inputs: [metrics]
+    address: 0.0.0.0:9598