From 4f0d3679957eeb2c5fd89ba1b7829b6557425366 Mon Sep 17 00:00:00 2001
From: Preetam Dwivedi <preetam@uber.com>
Date: Tue, 2 Jun 2026 10:15:45 -0700
Subject: [PATCH 1/2] feat(buildrunner): add vendor-agnostic BuildRunner
 interface + RFC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

### Why?

The build stage needs a vendor-agnostic abstraction for talking to a
Build Runner — one that fits the existing extension family (storage,
queue, conflict) and that does not have to change when the first real
backend lands. Design rationale lives in `doc/rfc/build-runner.md`.

### What?

Introduces the `BuildRunner` contract — three verbs, all keyed by an `entity.BuildID`: `Trigger` submits a build (ordered `base` + `head` change sets plus a free-form metadata map) and returns its ID; `Status` fetches the current `BuildStatus` and runner-defined metadata; `Cancel` requests cancellation. See `extension/buildrunner/build_runner.go` for the signatures.

Highlights:

- `Trigger` takes ordered `base` (assumed-good prefix from dependency
  batches) and `head` (changes being verified) — a runner can cache or
  short-circuit a prefix it has validated before, and attribute terminal
  failures to base vs head via `BuildMetadata`.
- Build identifiers are the typed `entity.BuildID` (a lightweight
  `{ID string}` value, also the queue payload envelope) rather than a
  bare `string`, so the runner contract and the on-queue messages share
  one identifier type and the compiler distinguishes a build ID from
  other string IDs.
- `metadata` is a free-form `map[string]string` for caller-supplied
  attributes (requester, ticket ID, trace ID) the runner MAY persist or
  echo back via `Status`.
- `Trigger` and `Cancel` return promptly; `Status` MAY be lengthy.
- The `BuildStatus` enum is narrowed to `Accepted` / `Running` /
  `Succeeded` / `Failed` / `Cancelled` so every backend can map its
  native lifecycle without leaking runner-specific stages.
- `BuildMetadata` is added as a free-form map for round-tripping
  runner-defined attributes (build URL, duration, SHA, etc.).

Naming: the interface is `BuildRunner` (not `BuildManager` — the
"Manager" suffix doesn't describe what it does, and "Build" alone would
collide cognitively with `entity.Build`). The package is
`extension/buildrunner` to follow the repo convention used by
`mergechecker`, `changeprovider`, `pusher`, `counter`, and `storage`.

Implementation (noop backend, queue `PublishAfter` primitive, controller
wiring, and the poll-driven `buildsignal` loop) lands in the follow-up
PRs stacked on top of this one.

## Test Plan

- ✅ `make test` — entity/build tests pass; mock-only consumers of the
  interface compile and pass.
- ✅ `make fmt lint check-tidy check-gazelle check-mocks` — clean.
- ✅ `make build` — all targets compile.
---
 CLAUDE.md                                     |   1 +
 Makefile                                      |   2 +-
 doc/rfc/build-runner.md                       | 152 ++++++++++++++++++
 doc/rfc/index.md                              |   1 +
 entity/build.go                               |  55 +++++--
 entity/build_test.go                          |  27 ++--
 extension/buildrunner/BUILD.bazel             |   9 ++
 extension/buildrunner/README.md               |  12 ++
 extension/buildrunner/build_runner.go         |  82 ++++++++++
 extension/buildrunner/mock/BUILD.bazel        |  12 ++
 .../buildrunner/mock/build_runner_mock.go     |  87 ++++++++++
 orchestrator/controller/build/build.go        |   2 +-
 .../buildsignal/buildsignal_test.go           |   2 +-
 13 files changed, 409 insertions(+), 35 deletions(-)
 create mode 100644 doc/rfc/build-runner.md
 create mode 100644 extension/buildrunner/BUILD.bazel
 create mode 100644 extension/buildrunner/README.md
 create mode 100644 extension/buildrunner/build_runner.go
 create mode 100644 extension/buildrunner/mock/BUILD.bazel
 create mode 100644 extension/buildrunner/mock/build_runner_mock.go

diff --git a/CLAUDE.md b/CLAUDE.md
index 692d815b..f78731ca 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -154,6 +154,7 @@ Generated proto files are committed. When modifying `.proto` files:
 - **Files**: `{method}.go`, `{entity}.go`, `{file}_test.go`, `BUILD.bazel`
 - **Proto files**: `{service}.proto`
 - **README files**: Do not duplicate interface or type definitions as code blocks in READMEs. Describe behavior in prose and let readers navigate to the source. Only include code samples when explicitly instructed.
+- **Markdown prose width**: Do not hard-wrap prose in Markdown docs (RFCs under `doc/`, READMEs). Write one line per paragraph and one line per list item, and let the editor soft-wrap — hard wrapping at a fixed column renders as a narrow fixed-width column regardless of window size. Code blocks, tables, and ASCII diagrams keep their own line breaks.
 
 ### Makefile
 
diff --git a/Makefile b/Makefile
index 573fb595..279054c9 100644
--- a/Makefile
+++ b/Makefile
@@ -285,7 +285,7 @@ local-stovepipe-gateway-start: build-stovepipe-gateway-linux ## Start Stovepipe
 
 mocks: ## Generate mock files using mockgen
 	@echo "Generating mocks..."
-	@$(BAZEL) run @rules_go//go -- generate ./extension/storage/... ./extension/changestore/... ./extension/counter/... ./extension/queue/... ./extension/queueconfig/... ./extension/mergechecker/... ./extension/pusher/... ./extension/scorer/... ./extension/conflict/... ./core/consumer/...
+	@$(BAZEL) run @rules_go//go -- generate ./extension/storage/... ./extension/buildrunner/... ./extension/changestore/... ./extension/counter/... ./extension/queue/... ./extension/queueconfig/... ./extension/mergechecker/... ./extension/pusher/... ./extension/scorer/... ./extension/conflict/... ./core/consumer/...
 	@echo "Mocks generated successfully!"
 
 proto: ## Generate protobuf files from .proto definitions
diff --git a/doc/rfc/build-runner.md b/doc/rfc/build-runner.md
new file mode 100644
index 00000000..8cc3c815
--- /dev/null
+++ b/doc/rfc/build-runner.md
@@ -0,0 +1,152 @@
+# Build Runner
+
+Design notes for the `BuildRunner` extension that lets the orchestrator drive external Build Runners from the build stage of the workflow pipeline.
+
+This document captures **design decisions and rationale only**.
+
+## Problem
+
+The build stage needs a vendor-agnostic abstraction for talking to a Build Runner — one that fits the existing extension family (storage, queue, conflict) and that does not have to change when the first real backend lands.
+
+## Flow
+
+`build` triggers the runner and hands the `buildID` to the `buildsignal` poll loop. The loop calls `Status` on its own partition per build until the build is terminal: terminal results wake the batch state machine via `speculate`; non-terminal results re-enqueue the same `buildID` after a delay (`PublishAfter`). A webhook-capable backend can publish a status message into the same queue — the consumer cannot tell a push from a poll.
+
+```
+   ┌────────────────────────────────────────────────────┐
+   │ build                                              │
+   │   Trigger(base, head) → buildID                    │
+   │   persist Build{Accepted}, enqueue buildID         │
+   └──────────────────────────┬─────────────────────────┘
+                              │ buildID
+                              ▼
+   ┌────────────────────────────────────────────────────┐
+   │ buildsignal   (poll loop, one partition per build) │
+   │   load Build, call Status(buildID)                 │
+   └──────────────────────────┬─────────────────────────┘
+                              │
+           ┌──────────────────┴─────────────────┐
+           ▼                                    ▼
+   ┌───────────────┐          ┌──────────────────────────────────┐
+   │ terminal      │          │ non-terminal                     │
+   │   → speculate │          │   → PublishAfter(buildID, delay) │
+   │   re-evaluate │          │   re-enqueues to buildsignal     │
+   └───────────────┘          └──────────────────────────────────┘
+```
+
+## Interface
+
+`BuildRunner` exposes three verbs, all keyed by a build identifier (`entity.BuildID`):
+
+- **`Trigger`** — submit a build for a queue, given the ordered `base` and `head` change sets plus a free-form metadata map; returns the new build's ID. Runner-side work is asynchronous.
+- **`Status`** — fetch the current `BuildStatus` and runner-defined metadata for a build; MAY round-trip to the runner.
+- **`Cancel`** — request cancellation; returns once the request reaches the runner, not once the build stops.
+
+See `extension/buildrunner/build_runner.go` for the exact Go signatures. The sections below record why the contract is shaped this way.
+
+### Trigger: base + head
+
+`Trigger` takes two ordered lists of changes and a free-form metadata map:
+
+- **`base`** — changes from the dependency batches (assumed-good prefix). Ordered.
+- **`head`** — changes from the batch being verified. Ordered.
+- **`metadata`** — caller-supplied attributes (requester, ticket ID, trace ID, etc.) the provider MAY persist or echo back via `Status`. Schema is caller/provider-defined; the interface treats it as opaque. `nil` is equivalent to an empty map.
+
+The provider applies `base` then `head` in order on top of the queue's target branch and validates the resulting tree. Validation is **implicit and holistic**: it is not a per-change action, it is what the provider does after applying everything.
+
+Why split base and head:
+
+- The orchestrator's internal model already distinguishes them — a speculation path has a head batch and a list of base batches assumed to pass.
+- Lets a provider cache or short-circuit the base when it has validated the same prefix before — a hot path for stacked-batch speculation.
+- Lets the provider attribute terminal failure to base vs head in `BuildMetadata` without the orchestrator having to round-trip the split itself.
+
+Rejected: a single flat `changes []entity.Change`. Provider would have to deduce base via prefix matching and could not distinguish "base broke" from "head broke" without out-of-band hints. Loses the orchestrator's clearest piece of structural information at the boundary for no gain.
+
+Rejected: list-of-lists, one slice per batch. Pushes batch structure across the boundary, which the provider does not care about. The provider thinks in terms of "stuff to apply before" and "stuff to validate" — base / head matches that mental model. Batches are an orchestrator concept.
+
+### Async vs sync contract
+
+| Verb | Must return promptly? | Notes |
+|---|---|---|
+| `Trigger` | yes | provider-side work is asynchronous |
+| `Cancel` | yes | returns once the request reaches the provider, **not** once the build stops |
+| `Status` | no | a provider round trip is typical |
+
+This keeps the orchestrator's queue loops snappy while admitting that reading state is remote.
+
+### Status delivery: polling, as queue traffic
+
+The interface is pull-only — the only way to learn a build's current state is to call `Status`. But polling is not a tight loop. It runs as queue traffic, with each `buildID` as its own partition.
+
+A status-loop consumer receives a `buildID` message, calls `Status`, and either:
+
+- **terminal** → forwards downstream, or
+- **non-terminal** → re-publishes to its own queue with a delay.
+
+This makes polling behave like everything else in the orchestrator:
+
+- **Independent partitions** — slow polls on one build don't block others.
+- **Restart-safe** — pending polls live in the queue, not in memory.
+- **Retry-native** — a `Status` call that errors out is `Nack`'d and redelivered with the queue's normal backoff, separate from polling cadence.
+- **Tunable cadence** — re-publish delay can vary by status (longer for `Accepted`, shorter for `Running`).
+
+### Polling primitive: `PublishAfter`, not `Nack`
+
+Postponing the next poll needs a "publish-with-delay" verb. Two candidates exist in or near the queue extension:
+
+- **`Publisher.PublishAfter(topic, msg, delayMs)`** — a new primitive. A fresh message, made visible only after `delayMs`. The SQL-backed queue already has the column needed (`invisible_until`); `PublishAfter` is `Publish` with a non-zero delay.
+- **`Delivery.Nack(requeueAfterMs)`** — the existing primitive. Re-uses the same message, sets it invisible until `now + delay`, increments `retry_count`.
+
+Both deliver the same surface behaviour: one message per build at a time, redelivered after the chosen delay. The difference is what `retry_count` means.
+
+`Nack` is "this delivery failed, try again," and `retry_count` feeds `MaxAttempts` and DLQ. Using it for "build not yet done" overloads that counter — every poll bumps a number that is supposed to flag problems.
+
+`PublishAfter` is "postpone this work." Each poll cycle is a fresh message with `retry_count = 0`. `Nack` stays available for true `Status` failures with its normal bounded-retry-then-DLQ behaviour. The two signals stay separate.
+
+**Why not `Nack` with `MaxAttempts = ∞`** (one message per build, just keep cycling)? The mechanism works. Three things break:
+
+- **No DLQ escape valve.** A malformed `buildID`, or a build the provider has lost, fails `Status` every call. With unbounded retries the message spins forever; the operator gets no signal that something is permanently wrong. DLQ exists for exactly this case; opting out for the buildsignal subscription means opting out of every poison-message signal it offers.
+- **Conflated metric.** `retry_count` is the obvious dashboard signal for "this consumer is having trouble." With infinite-retry polling, a `retry_count` of 500 might mean "build has been running 30 minutes" *or* "Status has errored 500 times" — operationally indistinguishable.
+- **Visibility-timeout coupling.** If the consumer crashes mid-poll before its `Nack`, the queue's visibility timeout redelivers the message and bumps `retry_count`. One number ends up counting legitimate polls, real errors, *and* consumer crashes — three signals fused.
+
+`PublishAfter` costs one new queue primitive. It buys back the queue's diagnostic semantics.
+
+Trade-off acknowledged: `PublishAfter` writes more — Ack deletes the old message, PublishAfter inserts a new one — vs `Nack` updating one row in place. At minute cadence the difference is noise; at second cadence it is real but small.
+
+### Push, when a backend supports it
+
+A webhook-capable backend publishes a status message into the same queue, keyed by the same `buildID`. The consumer cannot tell whether a message came from a poll or a push — both shapes are identical, both partition the same way.
+
+This keeps the `BuildRunner` contract pull-only: push is implemented at the queue boundary, not on the interface. A backend without webhooks needs zero extra code; a backend with webhooks needs only a webhook receiver that publishes into the existing queue.
+
+Rejected: a push method on `BuildRunner` (e.g. `Subscribe`). Forces every implementation to either expose a real push primitive or fake one, and gives the orchestrator two parallel update paths. Pushing into the queue collapses both paths.
+
+Rejected: long-polling on `Status`. Not every backend supports efficient server-side blocking; making it part of the contract forces backends to fake it. Same latency, more interface complexity.
+
+### Lifecycle
+
+Implementations are long-lived singletons bound to provider config at construction. Every method is concurrent-safe; connection pools and caches live inside the manager; anything that must survive a restart belongs in persistent storage, not the manager.
+
+### Transient failures
+
+The manager's problem, not the caller's. Reconnect and retry-with-backoff internally; during the recovery window return plain errors rather than block indefinitely.
+
+### Error classification
+
+Methods return plain errors. The controller classifies errors as user vs infra and retryable vs not — the manager does not. It MAY mark errors as retryable when it knows a failure is transient. Domain sentinels (e.g. "build not found") land with the first implementation that needs them, not before.
+
+## `BuildStatus`
+
+A deliberately narrow set — `Unknown`, `Accepted`, `Running`, `Succeeded`, `Failed`, `Cancelled`. Every backend must collapse its native lifecycle into one of these.
+
+`Accepted` covers any pre-execution state — submitted, queued for a worker, waiting on capacity. The orchestrator cannot act differently on "submitted" vs "queued", so collapsing them removes a distinction nobody consumes. `Running` is a separate first-class non-terminal state because "in a queue" and "burning compute right now" are operationally different: cancellation cost differs, and future speculation decisions can use the signal.
+
+`Succeeded` is the common terminal-success name across Build Runners. `Failed` covers terminal failure of any cause — including runner-initiated terminations like timeouts, OOM, or worker crashes, which are real failures the orchestrator must see and react to. `Cancelled` is the terminal state for a build that was cancelled.
+
+The set excludes `Blocked` — a wait on an upstream gate is the orchestrator's concept, owned by speculation. Folding it into build status would conflate two systems and invite a controller branch that should not exist.
+
+`IsTerminal` is a predicate on the type: `Succeeded`, `Failed`, `Cancelled` are terminal. Living on the enum prevents callers from reimplementing the list and drifting.
+
+## `BuildMetadata`
+
+`BuildMetadata` is a free-form `map[string]string` from `Status`. Every backend exposes different metadata (URL, duration, SHA, runner ID, base-vs-head failure attribution); the orchestrator's job is to round-trip it to users, not interpret it.
diff --git a/doc/rfc/index.md b/doc/rfc/index.md
index a2160392..1cbbbb55 100644
--- a/doc/rfc/index.md
+++ b/doc/rfc/index.md
@@ -6,3 +6,4 @@ Design documents and technical proposals for SubmitQueue.
 
 - [SQL-Based Distributed Queue](sql-queue-rfc.md) - MySQL-based distributed message queue with partition leasing and at-least-once delivery
 - [Orchestrator Workflow](workflow.md) - Queue-driven controller pipeline from gateway entry through batching, scoring, build, merge, and conclude
+- [Build Runner](build-runner.md) - Vendor-agnostic BuildRunner interface, provider-neutral BuildStatus lifecycle, and how the orchestrator wires it into the build stage
diff --git a/entity/build.go b/entity/build.go
index f5956d16..86039f43 100644
--- a/entity/build.go
+++ b/entity/build.go
@@ -16,41 +16,41 @@ package entity
 
 import "encoding/json"
 
-// BuildStatus defines the possible states of a build.
+// BuildStatus defines the possible states of a build. The set is
+// intentionally narrow: every supported build provider must be able to map
+// its native lifecycle into one of these values without leaking
+// provider-specific stages.
 type BuildStatus string
 
 const (
-	// BuildStatusUnknown is the unreachable state. It is set by default when the structure is initialized. It should never be seen in the system.
+	// BuildStatusUnknown is the unreachable zero value, set by default when
+	// the structure is initialized. It should never be seen in the system.
 	BuildStatusUnknown BuildStatus = ""
 
-	// BuildStatusQueued indicates the build has been scheduled but not yet started.
-	BuildStatusQueued BuildStatus = "queued"
+	// BuildStatusAccepted indicates the build has been accepted for
+	// execution.
+	BuildStatusAccepted BuildStatus = "accepted"
 
 	// BuildStatusRunning indicates the build is currently executing.
 	BuildStatusRunning BuildStatus = "running"
 
-	// BuildStatusPassed indicates the build completed successfully.
+	// BuildStatusSucceeded indicates the build completed successfully.
 	// This is a terminal state.
-	BuildStatusPassed BuildStatus = "passed"
+	BuildStatusSucceeded BuildStatus = "succeeded"
 
-	// BuildStatusFailed indicates the build completed with failures.
+	// BuildStatusFailed indicates the build did not complete successfully.
 	// This is a terminal state.
 	BuildStatusFailed BuildStatus = "failed"
 
-	// BuildStatusCancelled indicates the build was cancelled before completion.
+	// BuildStatusCancelled indicates the build was cancelled.
 	// This is a terminal state.
 	BuildStatusCancelled BuildStatus = "cancelled"
-
-	// BuildStatusBlocked indicates the build is waiting for manual approval or unblocking.
-	// Some CI systems (like BuildKite) support manual approval steps.
-	BuildStatusBlocked BuildStatus = "blocked"
 )
 
-// IsTerminal returns true if the build state represents a final state (passed, failed, or cancelled).
-// Terminal states indicate the build has finished and will not change state again.
-// Note: BuildStatusBlocked is NOT terminal as blocked builds can be unblocked and continue execution.
+// IsTerminal returns true if the status represents a final state
+// (Succeeded, Failed, or Cancelled).
 func (s BuildStatus) IsTerminal() bool {
-	return s == BuildStatusPassed || s == BuildStatusFailed || s == BuildStatusCancelled
+	return s == BuildStatusSucceeded || s == BuildStatusFailed || s == BuildStatusCancelled
 }
 
 // SpeculationPathInfo represents the base and head commits of a speculation path used in a build.
@@ -88,3 +88,26 @@ func BuildFromBytes(data []byte) (Build, error) {
 	err := json.Unmarshal(data, &build)
 	return build, err
 }
+
+// BuildID is a lightweight entity for publishing and consuming just the build identifier via the queue.
+type BuildID struct {
+	// ID is the globally unique identifier for the build.
+	ID string `json:"id"`
+}
+
+// ToBytes serializes the BuildID to JSON bytes for queue message payload.
+func (b BuildID) ToBytes() ([]byte, error) {
+	return json.Marshal(b)
+}
+
+// BuildIDFromBytes deserializes a BuildID from JSON bytes.
+func BuildIDFromBytes(data []byte) (BuildID, error) {
+	var bid BuildID
+	err := json.Unmarshal(data, &bid)
+	return bid, err
+}
+
+// BuildMetadata carries provider-defined free-form metadata about a build
+// (e.g. build URL, duration, commit SHA). Keys and values are
+// implementation-defined; callers should not assume any particular schema.
+type BuildMetadata map[string]string
diff --git a/entity/build_test.go b/entity/build_test.go
index e31938ac..97de0193 100644
--- a/entity/build_test.go
+++ b/entity/build_test.go
@@ -28,8 +28,8 @@ func TestBuildStatus_IsTerminal(t *testing.T) {
 		expected bool
 	}{
 		{
-			name:     "passed is terminal",
-			status:   BuildStatusPassed,
+			name:     "succeeded is terminal",
+			status:   BuildStatusSucceeded,
 			expected: true,
 		},
 		{
@@ -43,8 +43,8 @@ func TestBuildStatus_IsTerminal(t *testing.T) {
 			expected: true,
 		},
 		{
-			name:     "queued is not terminal",
-			status:   BuildStatusQueued,
+			name:     "accepted is not terminal",
+			status:   BuildStatusAccepted,
 			expected: false,
 		},
 		{
@@ -52,11 +52,6 @@ func TestBuildStatus_IsTerminal(t *testing.T) {
 			status:   BuildStatusRunning,
 			expected: false,
 		},
-		{
-			name:     "blocked is not terminal",
-			status:   BuildStatusBlocked,
-			expected: false,
-		},
 		{
 			name:     "unknown is not terminal",
 			status:   BuildStatusUnknown,
@@ -79,7 +74,7 @@ func TestBuild_ToBytes(t *testing.T) {
 			Base: []string{"batch-0", "batch-prev"},
 		},
 		Score:  0.85,
-		Status: BuildStatusQueued,
+		Status: BuildStatusAccepted,
 	}
 
 	data, err := build.ToBytes()
@@ -90,7 +85,7 @@ func TestBuild_ToBytes(t *testing.T) {
 	jsonStr := string(data)
 	assert.Contains(t, jsonStr, "build-1")
 	assert.Contains(t, jsonStr, "batch-1")
-	assert.Contains(t, jsonStr, "queued")
+	assert.Contains(t, jsonStr, "accepted")
 }
 
 func TestBuildFromBytes(t *testing.T) {
@@ -101,7 +96,7 @@ func TestBuildFromBytes(t *testing.T) {
 			Base: []string{"batch-5", "batch-6"},
 		},
 		Score:  0.92,
-		Status: BuildStatusRunning,
+		Status: BuildStatusAccepted,
 	}
 
 	// Serialize
@@ -146,7 +141,7 @@ func TestBuild_SerializationRoundTrip(t *testing.T) {
 		build Build
 	}{
 		{
-			name: "queued build with speculation path",
+			name: "accepted build with speculation path",
 			build: Build{
 				ID:      "build-100",
 				BatchID: "batch-50",
@@ -154,16 +149,16 @@ func TestBuild_SerializationRoundTrip(t *testing.T) {
 					Base: []string{"batch-48", "batch-49"},
 				},
 				Score:  0.75,
-				Status: BuildStatusQueued,
+				Status: BuildStatusAccepted,
 			},
 		},
 		{
-			name: "passed build with no speculation base",
+			name: "succeeded build with no speculation base",
 			build: Build{
 				ID:      "build-200",
 				BatchID: "batch-60",
 				Score:   1.0,
-				Status:  BuildStatusPassed,
+				Status:  BuildStatusSucceeded,
 			},
 		},
 		{
diff --git a/extension/buildrunner/BUILD.bazel b/extension/buildrunner/BUILD.bazel
new file mode 100644
index 00000000..190da6b8
--- /dev/null
+++ b/extension/buildrunner/BUILD.bazel
@@ -0,0 +1,9 @@
+load("@rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "buildrunner",
+    srcs = ["build_runner.go"],
+    importpath = "github.com/uber/submitqueue/extension/buildrunner",
+    visibility = ["//visibility:public"],
+    deps = ["//entity"],
+)
diff --git a/extension/buildrunner/README.md b/extension/buildrunner/README.md
new file mode 100644
index 00000000..6e56ac01
--- /dev/null
+++ b/extension/buildrunner/README.md
@@ -0,0 +1,12 @@
+# Build Runner
+
+Pluggable abstraction for triggering builds against an external Build Runner, querying their status, and cancelling them.
+
+See [`doc/rfc/build-runner.md`](../../doc/rfc/build-runner.md) for the contract and design rationale. See `build_runner.go` for the interface itself.
+
+## Adding a new backend
+
+1. Create `extension/buildrunner/{backend}/` with a `BuildRunner` implementation bound to its runner configuration at construction.
+2. Map the `base` and `head` change slices onto the backend's build primitives (apply `base`, apply `head`, validate the result).
+3. Map the runner's lifecycle states down to the `BuildStatus` values: `Accepted` (accepted for execution), `Running` (executing), and the terminal `Succeeded` / `Failed` / `Cancelled`.
+4. Implement internal reconnect / retry so transient failures surface as plain errors without blocking the caller.
diff --git a/extension/buildrunner/build_runner.go b/extension/buildrunner/build_runner.go
new file mode 100644
index 00000000..eb262426
--- /dev/null
+++ b/extension/buildrunner/build_runner.go
@@ -0,0 +1,82 @@
+// Copyright (c) 2025 Uber Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buildrunner
+
+//go:generate mockgen -source=build_runner.go -destination=mock/build_runner_mock.go -package=mock
+
+import (
+	"context"
+
+	"github.com/uber/submitqueue/entity"
+)
+
+// BuildRunner triggers builds against an external Build Runner, queries
+// their status, and cancels them.
+//
+// Implementations are long-lived singletons and must:
+//   - make every method safe for concurrent use by multiple goroutines;
+//   - recover from transient connectivity failures internally, returning
+//     plain errors during the recovery window rather than blocking the
+//     caller indefinitely;
+//   - keep only transient local state (caches, pools) — anything that must
+//     survive a restart belongs in Storage;
+//   - return plain errors and leave classification (user vs infra,
+//     retryable or not) to the calling controller, per core/errs.
+type BuildRunner interface {
+	// Trigger submits a build that applies base then head, in order, on top
+	// of the queue's target branch and validates the resulting tree.
+	// Validation is implicit and holistic — it is what the runner does
+	// after applying everything, not a per-change action.
+	//
+	// base contains changes from the dependency batches (an assumed-good
+	// prefix). head contains changes from the batch being verified.
+	// Splitting them lets a runner cache or short-circuit the base when
+	// it has validated the same prefix before, and lets it attribute
+	// terminal failure to base vs head in BuildMetadata.
+	//
+	// metadata carries free-form caller-supplied attributes (e.g. requester,
+	// ticket ID, trace ID) that the runner MAY persist or echo back via
+	// Status. Implementations MUST NOT depend on any specific key; nil is
+	// equivalent to an empty map.
+	//
+	// Trigger MUST return promptly; runner-side work happens
+	// asynchronously. Callers learn the build's progress via Status, not
+	// via Trigger.
+	//
+	// queueName selects the runner-specific job configuration.
+	// Returns an error if the request is invalid.
+	Trigger(
+		ctx context.Context,
+		queueName string,
+		base []entity.Change,
+		head []entity.Change,
+		metadata entity.BuildMetadata,
+	) (buildID entity.BuildID, err error)
+
+	// Status returns the current status and runner-defined metadata
+	// (build URL, duration, etc.) for a build. Unlike Trigger, Status MAY be
+	// synchronous and lengthy — a runner round trip is typical.
+	//
+	// Returns an error if the build does not exist.
+	Status(
+		ctx context.Context,
+		buildID entity.BuildID,
+	) (entity.BuildStatus, entity.BuildMetadata, error)
+
+	// Cancel requests cancellation and returns once the request has reached
+	// the runner; it does not wait for the build to actually stop. A no-op
+	// on already-terminal builds. Returns an error if the build does not exist.
+	Cancel(ctx context.Context, buildID entity.BuildID) error
+}
diff --git a/extension/buildrunner/mock/BUILD.bazel b/extension/buildrunner/mock/BUILD.bazel
new file mode 100644
index 00000000..e24e6c3b
--- /dev/null
+++ b/extension/buildrunner/mock/BUILD.bazel
@@ -0,0 +1,12 @@
+load("@rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "mock",
+    srcs = ["build_runner_mock.go"],
+    importpath = "github.com/uber/submitqueue/extension/buildrunner/mock",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//entity",
+        "@org_uber_go_mock//gomock",
+    ],
+)
diff --git a/extension/buildrunner/mock/build_runner_mock.go b/extension/buildrunner/mock/build_runner_mock.go
new file mode 100644
index 00000000..9fae6097
--- /dev/null
+++ b/extension/buildrunner/mock/build_runner_mock.go
@@ -0,0 +1,87 @@
+// Code generated by MockGen. DO NOT EDIT.
+// Source: build_runner.go
+//
+// Generated by this command:
+//
+//	mockgen -source=build_runner.go -destination=mock/build_runner_mock.go -package=mock
+//
+
+// Package mock is a generated GoMock package.
+package mock
+
+import (
+	context "context"
+	reflect "reflect"
+
+	entity "github.com/uber/submitqueue/entity"
+	gomock "go.uber.org/mock/gomock"
+)
+
+// MockBuildRunner is a mock of BuildRunner interface.
+type MockBuildRunner struct {
+	ctrl     *gomock.Controller
+	recorder *MockBuildRunnerMockRecorder
+	isgomock struct{}
+}
+
+// MockBuildRunnerMockRecorder is the mock recorder for MockBuildRunner.
+type MockBuildRunnerMockRecorder struct {
+	mock *MockBuildRunner
+}
+
+// NewMockBuildRunner creates a new mock instance.
+func NewMockBuildRunner(ctrl *gomock.Controller) *MockBuildRunner {
+	mock := &MockBuildRunner{ctrl: ctrl}
+	mock.recorder = &MockBuildRunnerMockRecorder{mock}
+	return mock
+}
+
+// EXPECT returns an object that allows the caller to indicate expected use.
+func (m *MockBuildRunner) EXPECT() *MockBuildRunnerMockRecorder {
+	return m.recorder
+}
+
+// Cancel mocks base method.
+func (m *MockBuildRunner) Cancel(ctx context.Context, buildID entity.BuildID) error {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "Cancel", ctx, buildID)
+	ret0, _ := ret[0].(error)
+	return ret0
+}
+
+// Cancel indicates an expected call of Cancel.
+func (mr *MockBuildRunnerMockRecorder) Cancel(ctx, buildID any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cancel", reflect.TypeOf((*MockBuildRunner)(nil).Cancel), ctx, buildID)
+}
+
+// Status mocks base method.
+func (m *MockBuildRunner) Status(ctx context.Context, buildID entity.BuildID) (entity.BuildStatus, entity.BuildMetadata, error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "Status", ctx, buildID)
+	ret0, _ := ret[0].(entity.BuildStatus)
+	ret1, _ := ret[1].(entity.BuildMetadata)
+	ret2, _ := ret[2].(error)
+	return ret0, ret1, ret2
+}
+
+// Status indicates an expected call of Status.
+func (mr *MockBuildRunnerMockRecorder) Status(ctx, buildID any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Status", reflect.TypeOf((*MockBuildRunner)(nil).Status), ctx, buildID)
+}
+
+// Trigger mocks base method.
+func (m *MockBuildRunner) Trigger(ctx context.Context, queueName string, base, head []entity.Change, metadata entity.BuildMetadata) (entity.BuildID, error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "Trigger", ctx, queueName, base, head, metadata)
+	ret0, _ := ret[0].(entity.BuildID)
+	ret1, _ := ret[1].(error)
+	return ret0, ret1
+}
+
+// Trigger indicates an expected call of Trigger.
+func (mr *MockBuildRunnerMockRecorder) Trigger(ctx, queueName, base, head, metadata any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Trigger", reflect.TypeOf((*MockBuildRunner)(nil).Trigger), ctx, queueName, base, head, metadata)
+}
diff --git a/orchestrator/controller/build/build.go b/orchestrator/controller/build/build.go
index 48d4cf16..0a450872 100644
--- a/orchestrator/controller/build/build.go
+++ b/orchestrator/controller/build/build.go
@@ -102,7 +102,7 @@ func (c *Controller) Process(ctx context.Context, delivery consumer.Delivery) (r
 	build := entity.Build{
 		ID:      batch.ID,
 		BatchID: batch.ID,
-		Status:  entity.BuildStatusQueued,
+		Status:  entity.BuildStatusAccepted,
 	}
 
 	// Publish build to build signal topic
diff --git a/orchestrator/controller/buildsignal/buildsignal_test.go b/orchestrator/controller/buildsignal/buildsignal_test.go
index 44db77a9..fed0cb40 100644
--- a/orchestrator/controller/buildsignal/buildsignal_test.go
+++ b/orchestrator/controller/buildsignal/buildsignal_test.go
@@ -88,7 +88,7 @@ func TestController_Process_Success(t *testing.T) {
 	build := entity.Build{
 		ID:      "build-123",
 		BatchID: "test-queue/batch/1",
-		Status:  entity.BuildStatusQueued,
+		Status:  entity.BuildStatusAccepted,
 	}
 
 	payload, err := build.ToBytes()

From 5bc2c0d8e5f27b7c153a8303f657cbf5b8b670a1 Mon Sep 17 00:00:00 2001
From: Preetam Dwivedi <preetam@uber.com>
Date: Tue, 2 Jun 2026 11:32:17 -0700
Subject: [PATCH 2/2] feat(buildrunner/noop): add pass-through BuildRunner stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

### Why?

Wiring tests for the orchestrator's build stage need a `BuildRunner`
implementation, but real backends (BuildKite, Jenkins, etc.) won't land
until later. A noop that immediately reports success is also useful as
the local default in `make local-start` and as a best-case baseline
where every build passes.

### What?

Adds `extension/buildrunner/noop` — a `BuildRunner` that does no real
work. Every `Trigger` returns a unique `noop-<n>` build ID with no
error; every `Status` returns `BuildStatusSucceeded` with no metadata;
`Cancel` is a no-op. The atomic counter keeps IDs unique under
concurrent use.

## Test Plan

- ✅ `make test` — 36 unit tests pass, including the new
  `extension/buildrunner/noop:noop_test` (Trigger uniqueness, Status,
  Cancel, interface satisfaction).
- ✅ `make fmt lint check-tidy check-gazelle check-mocks` — clean.
- ✅ `make build` — all targets compile.
---
 extension/buildrunner/noop/BUILD.bazel  | 24 ++++++++++
 extension/buildrunner/noop/noop.go      | 56 +++++++++++++++++++++++
 extension/buildrunner/noop/noop_test.go | 61 +++++++++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 extension/buildrunner/noop/BUILD.bazel
 create mode 100644 extension/buildrunner/noop/noop.go
 create mode 100644 extension/buildrunner/noop/noop_test.go

diff --git a/extension/buildrunner/noop/BUILD.bazel b/extension/buildrunner/noop/BUILD.bazel
new file mode 100644
index 00000000..f53beb24
--- /dev/null
+++ b/extension/buildrunner/noop/BUILD.bazel
@@ -0,0 +1,24 @@
+load("@rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "noop",
+    srcs = ["noop.go"],
+    importpath = "github.com/uber/submitqueue/extension/buildrunner/noop",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//entity",
+        "//extension/buildrunner",
+    ],
+)
+
+go_test(
+    name = "noop_test",
+    srcs = ["noop_test.go"],
+    embed = [":noop"],
+    deps = [
+        "//entity",
+        "//extension/buildrunner",
+        "@com_github_stretchr_testify//assert",
+        "@com_github_stretchr_testify//require",
+    ],
+)
diff --git a/extension/buildrunner/noop/noop.go b/extension/buildrunner/noop/noop.go
new file mode 100644
index 00000000..61637434
--- /dev/null
+++ b/extension/buildrunner/noop/noop.go
@@ -0,0 +1,56 @@
+// Copyright (c) 2025 Uber Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package noop provides a buildrunner.BuildRunner that performs no real
+// work: every triggered build immediately succeeds. It is intended as a
+// stub for wiring tests and as a best-case baseline where every build
+// passes.
+package noop
+
+import (
+	"context"
+	"fmt"
+	"sync/atomic"
+
+	"github.com/uber/submitqueue/entity"
+	"github.com/uber/submitqueue/extension/buildrunner"
+)
+
+// runner is a buildrunner.BuildRunner that does no real work and reports
+// every build as immediately succeeded. The atomic counter hands out
+// unique build IDs and makes the type safe for concurrent use.
+type runner struct {
+	counter atomic.Uint64
+}
+
+// New returns a buildrunner.BuildRunner that performs no real work.
+func New() buildrunner.BuildRunner {
+	return &runner{}
+}
+
+// Trigger returns a unique build ID without contacting any runner.
+// Inputs are ignored.
+func (r *runner) Trigger(_ context.Context, _ string, _ []entity.Change, _ []entity.Change, _ entity.BuildMetadata) (entity.BuildID, error) {
+	return entity.BuildID{ID: fmt.Sprintf("noop-%d", r.counter.Add(1))}, nil
+}
+
+// Status always reports BuildStatusSucceeded with no metadata.
+func (r *runner) Status(_ context.Context, _ entity.BuildID) (entity.BuildStatus, entity.BuildMetadata, error) {
+	return entity.BuildStatusSucceeded, nil, nil
+}
+
+// Cancel is a no-op.
+func (r *runner) Cancel(_ context.Context, _ entity.BuildID) error {
+	return nil
+}
diff --git a/extension/buildrunner/noop/noop_test.go b/extension/buildrunner/noop/noop_test.go
new file mode 100644
index 00000000..436725cd
--- /dev/null
+++ b/extension/buildrunner/noop/noop_test.go
@@ -0,0 +1,61 @@
+// Copyright (c) 2025 Uber Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package noop
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/uber/submitqueue/entity"
+	"github.com/uber/submitqueue/extension/buildrunner"
+)
+
+func TestNew_ImplementsInterface(t *testing.T) {
+	var _ buildrunner.BuildRunner = New()
+}
+
+func TestRunner_Trigger(t *testing.T) {
+	r := New()
+	ctx := context.Background()
+
+	id1, err := r.Trigger(ctx, "queueA",
+		[]entity.Change{{URIs: []string{"github://owner/repo/pull/1"}}},
+		[]entity.Change{{URIs: []string{"github://owner/repo/pull/2"}}},
+		entity.BuildMetadata{"requester": "alice"},
+	)
+	require.NoError(t, err)
+	assert.NotEmpty(t, id1.ID)
+
+	// IDs are unique across calls, even with empty inputs.
+	id2, err := r.Trigger(ctx, "queueA", nil, nil, nil)
+	require.NoError(t, err)
+	assert.NotEqual(t, id1, id2)
+}
+
+func TestRunner_Status(t *testing.T) {
+	r := New()
+
+	status, meta, err := r.Status(context.Background(), entity.BuildID{ID: "any-id"})
+	require.NoError(t, err)
+	assert.Equal(t, entity.BuildStatusSucceeded, status)
+	assert.Empty(t, meta)
+}
+
+func TestRunner_Cancel(t *testing.T) {
+	r := New()
+	assert.NoError(t, r.Cancel(context.Background(), entity.BuildID{ID: "any-id"}))
+}