From ac9d62817c3c3daaa95b8180532fa5db1b49c21a Mon Sep 17 00:00:00 2001 From: "bonk-ai[bot]" <269762587+bonk-ai[bot]@users.noreply.github.com> Date: Sat, 9 May 2026 20:11:42 +0000 Subject: [PATCH] feat: detect-secrets as 20th scanner (Track B) --- .erpaval/INDEX.md | 1 + .../squash-merge-masks-pre-existing-debt.md | 64 +++++ package.json | 2 +- packages/cli/src/commands/scan.test.ts | 3 +- .../src/sagemaker-embedder.parity.test.ts | 11 +- packages/scanners/src/catalog.test.ts | 24 +- packages/scanners/src/catalog.ts | 21 ++ .../detect-secrets-to-sarif.test.ts | 222 ++++++++++++++++++ .../src/converters/detect-secrets-to-sarif.ts | 200 ++++++++++++++++ packages/scanners/src/index.ts | 14 +- .../scanners/src/wrappers/detect-secrets.ts | 76 ++++++ .../scanners/src/wrappers/wrappers.test.ts | 78 ++++++ packages/scip-ingest/src/derive.test.ts | 10 +- packages/search/src/bm25.test.ts | 1 - packages/search/src/hybrid.test.ts | 1 - packages/wiki/src/index.test.ts | 1 - pnpm-lock.yaml | 18 +- 17 files changed, 712 insertions(+), 35 deletions(-) create mode 100644 .erpaval/solutions/best-practices/squash-merge-masks-pre-existing-debt.md create mode 100644 packages/scanners/src/converters/detect-secrets-to-sarif.test.ts create mode 100644 packages/scanners/src/converters/detect-secrets-to-sarif.ts create mode 100644 packages/scanners/src/wrappers/detect-secrets.ts diff --git a/.erpaval/INDEX.md b/.erpaval/INDEX.md index bbf5bb2..0b8ad4e 100644 --- a/.erpaval/INDEX.md +++ b/.erpaval/INDEX.md @@ -30,6 +30,7 @@ development sessions. Solutions are reusable; specs are per-feature. - [Segregate graph-only and tabular-only stores at the interface boundary](solutions/architecture-patterns/igraphstore-itemporalstore-segregation.md) — when one type extends multiple sub-interfaces and a concrete implementor can't honestly satisfy all, segregate at the interface, not the class. `IGraphStore` + `ITemporalStore` + `openStore()` composition factory. 
- [Replace raw-SQL escape hatches with typed finders on the storage interface](solutions/architecture-patterns/typed-finders-replace-raw-sql-in-consumers.md) — 108 raw-SQL sites collapse into 15 named finders. Adapters internalize dialect; consumers stay backend-agnostic. Liskov-clean parity harness via public-method rebuilder. - [Parallel Act subagents on a shared git tree — interleaving + cherry-pick discipline](solutions/best-practices/parallel-act-subagents-with-shared-git-tree.md) — verify branch state, spawn on non-overlapping packages, watch for stale dist + phantom test counts, watch the test-fixup tail. +- [Squash-merge masks pre-existing repo-wide debt](solutions/best-practices/squash-merge-masks-pre-existing-debt.md) — first action on a fresh branch from main is `mise run check` BEFORE starting work; lint rules / transitive deps / cross-package test assertions drift across squash boundaries even when per-commit gating was green inside the prior PR. ## Specs diff --git a/.erpaval/solutions/best-practices/squash-merge-masks-pre-existing-debt.md b/.erpaval/solutions/best-practices/squash-merge-masks-pre-existing-debt.md new file mode 100644 index 0000000..fd3e54d --- /dev/null +++ b/.erpaval/solutions/best-practices/squash-merge-masks-pre-existing-debt.md @@ -0,0 +1,64 @@ +--- +name: Squash-merge can mask pre-existing repo-wide debt that per-commit gating did not surface +description: A multi-commit feature track whose per-commit `mise run check` was green can still leave the post-squash main failing because lint-rule, transitive-dep, or test-sequence interactions only manifest at the merge boundary +type: feedback +--- + +A long-running feature branch lands as one squash commit on main. Per-commit +`mise run check` was clean across all 26 of the branch's commits AND on the +final pre-merge HEAD. The next branch cut from main hits `mise run check` and +gets a non-zero exit on rules the previous branch never tripped. 
+ +This was observed on 2026-05-09: Track A merged via squash from +`feat/v1-finalize-track-a` (commit 81f9855). Track B cut a fresh branch from +that main, ran `mise run check`, and immediately failed on 6 biome v2 lint +errors (`noNonNullAssertion` in `derive.test.ts`, `noConsole` + +`noTemplateCurlyInString` in `sagemaker-embedder.parity.test.ts`) plus 3 +"unused suppression" warnings on stale `biome-ignore lint/correctness/useYield` +comments. None of these errors were in Track A's diff; all of them existed on +main before Track A landed. + +**Why it happens:** + +1. **Lint rule activation is not deterministic across rebuilds.** Track A + bumped a transitive dep that pulled in newer biome rules (or relaxed a + `useYield` rule that retroactively flagged old suppressions as unused). + Per-commit gating inside Track A had the *old* rule set during early + commits and the *new* rule set during late commits — but each individual + commit's check ran against its own rule set, so each was self-consistent. + The post-squash main has the LATEST rule set against the WHOLE tree, + exposing lint debt that no individual commit owned. +2. **Test-sequence interactions across packages.** A new polyglot scanner + (detect-secrets) triggered cli `selectScanners` test failures because + `selectScanners` consumed `ALL_SPECS` whose order changed. Catalog tests + in `packages/scanners/` updated their assertions; cli tests did not, and + the cross-package coupling was invisible inside Track B's package-level + diff. +3. **Squash commit messages drop the bisect granularity** that would have + localised the rule-set change to a specific commit. + +**Why:** v1.0 finalize ships as four sequential PRs (A → C → B → D per +`pr-split-analysis.md`). Each branch cuts from the prior squash. If each +branch only validates its own diff, debt accumulates across the merge +boundary and the team loses the per-commit U1/U6 invariant guarantee at the +PR-graph level even though it holds inside each PR. 
+ +**How to apply:** + +- **First action on a fresh branch from main**: run `mise run check` BEFORE + starting work, not at the end. If it fails, fix it in commit 1 of the new + branch with a clear "main-debt sweep" message; mention which prior PR's + squash exposed it. +- When deleting a `biome-ignore` comment that biome v2 reports as "unused + suppression", verify the underlying rule actually no longer fires (run the + empty-pattern code through biome locally) — don't just delete the + suppression and hope. +- When adding a new polyglot P1 catalog entry that flows through + `ALL_SPECS`, search every test file (not just `*/catalog.test.ts`) that + asserts a specific scanner-id list — `cli/src/commands/scan.test.ts`'s + `selectScanners` is the recurrent miss. +- For the next finalize PR (Track C, Track D), expect the same pattern: + cut from the prior squash, immediately run `mise run check`, sweep first. +- The compound version of this rule belongs upstream of ERPAVal: a `mise` + task `mise run check:branch-start` could codify the sweep so it isn't + optional. 
diff --git a/package.json b/package.json index 2e055e2..3b67983 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "picomatch@<2.3.2": "2.3.2", "tmp@<0.2.4": "0.2.4", "dompurify@<3.4.0": "3.4.0", - "hono@<4.12.16": "4.12.16", + "hono@<4.12.18": "4.12.18", "ip-address@<10.1.1": "10.1.1", "fast-uri@<3.1.2": "3.1.2", "fast-xml-builder@<1.1.7": "1.1.7" diff --git a/packages/cli/src/commands/scan.test.ts b/packages/cli/src/commands/scan.test.ts index 083937e..86051a0 100644 --- a/packages/cli/src/commands/scan.test.ts +++ b/packages/cli/src/commands/scan.test.ts @@ -13,7 +13,7 @@ test("selectScanners: empty profile yields only polyglot P1 scanners", () => { const ids = selectScanners({}, {}) .map((s) => s.id) .sort(); - assert.deepEqual(ids, ["betterleaks", "grype", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, ["betterleaks", "detect-secrets", "grype", "osv-scanner", "semgrep"]); }); test("selectScanners: iacTypes=['terraform'] enables tflint + trivy + checkov", () => { @@ -24,6 +24,7 @@ test("selectScanners: iacTypes=['terraform'] enables tflint + trivy + checkov", assert.deepEqual(ids, [ "betterleaks", "checkov", + "detect-secrets", "grype", "osv-scanner", "semgrep", diff --git a/packages/embedder/src/sagemaker-embedder.parity.test.ts b/packages/embedder/src/sagemaker-embedder.parity.test.ts index 3279883..89644e5 100644 --- a/packages/embedder/src/sagemaker-embedder.parity.test.ts +++ b/packages/embedder/src/sagemaker-embedder.parity.test.ts @@ -34,6 +34,7 @@ const COSINE_FLOOR = 0.99; /** Compact set of code-shaped fixtures — realistic embedder inputs. 
*/ const FIXTURES: readonly string[] = [ "function add(a: number, b: number): number { return a + b; }", + // biome-ignore lint/suspicious/noTemplateCurlyInString: fixture string literally embeds a TS template-literal sample for the embedder "class Foo { constructor(public name: string) {} greet() { return `hi ${this.name}`; } }", "const result = await fetch(url).then(r => r.json());", "SELECT id, name FROM users WHERE active = true ORDER BY created_at DESC LIMIT 10;", @@ -98,7 +99,7 @@ describe("SageMaker vs local ONNX — cosine parity", { skip: skipReason ?? unde const failures: string[] = []; let minCos = 1; - let sumCos = 0; + let _sumCos = 0; for (let i = 0; i < FIXTURES.length; i++) { const lv = localVecs[i]; @@ -109,19 +110,13 @@ describe("SageMaker vs local ONNX — cosine parity", { skip: skipReason ?? unde } const c = cosine(lv, rv); minCos = Math.min(minCos, c); - sumCos += c; + _sumCos += c; if (c < COSINE_FLOOR) { failures.push( `row ${i}: cosine=${c.toFixed(4)} < ${COSINE_FLOOR}; text="${FIXTURES[i]?.slice(0, 60)}..."`, ); } } - - // eslint-disable-next-line no-console - console.log( - `[parity] ${FIXTURES.length} fixtures · min=${minCos.toFixed(4)} · ` + - `mean=${(sumCos / FIXTURES.length).toFixed(4)}`, - ); ok(failures.length === 0, `parity violations:\n ${failures.join("\n ")}`); } finally { await remote.close(); diff --git a/packages/scanners/src/catalog.test.ts b/packages/scanners/src/catalog.test.ts index 3d87642..82b59f5 100644 --- a/packages/scanners/src/catalog.test.ts +++ b/packages/scanners/src/catalog.test.ts @@ -16,6 +16,7 @@ test("P1_SPECS contains the Priority-1 scanners in stable order", () => { "betterleaks", "osv-scanner", "bandit", + "detect-secrets", "biome", "pip-audit", "npm-audit", @@ -40,8 +41,8 @@ test("P2_SPECS contains the Priority-2 scanners in stable order", () => { ]); }); -test("ALL_SPECS has 19 entries ( expansion)", () => { - assert.equal(ALL_SPECS.length, 19); +test("ALL_SPECS has 20 entries (constraint-10 met)", () => 
{ + assert.equal(ALL_SPECS.length, 20); }); test("ty is flagged beta and clamav is optIn", () => { @@ -92,10 +93,11 @@ test("every P2 spec is marked priority 2", () => { test("filterSpecsByLanguages keeps polyglot scanners and language-matching ones", () => { const pythonOnly = filterSpecsByLanguages(P1_SPECS, ["python"]); const ids = pythonOnly.map((s) => s.id).sort(); - // semgrep/betterleaks/osv-scanner/grype polyglot; bandit/pip-audit/ruff/vulture match python. + // semgrep/betterleaks/osv-scanner/detect-secrets/grype polyglot; bandit/pip-audit/ruff/vulture match python. assert.deepEqual(ids, [ "bandit", "betterleaks", + "detect-secrets", "grype", "osv-scanner", "pip-audit", @@ -108,20 +110,28 @@ test("filterSpecsByLanguages keeps polyglot scanners and language-matching ones" test("filterSpecsByLanguages returns only polyglot scanners for empty input", () => { const empty = filterSpecsByLanguages(P1_SPECS, []); const ids = empty.map((s) => s.id).sort(); - assert.deepEqual(ids, ["betterleaks", "grype", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, ["betterleaks", "detect-secrets", "grype", "osv-scanner", "semgrep"]); }); test("filterSpecsByLanguages includes biome + npm-audit for TypeScript projects", () => { const ts = filterSpecsByLanguages(P1_SPECS, ["typescript"]); const ids = ts.map((s) => s.id).sort(); - assert.deepEqual(ids, ["betterleaks", "biome", "grype", "npm-audit", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, [ + "betterleaks", + "biome", + "detect-secrets", + "grype", + "npm-audit", + "osv-scanner", + "semgrep", + ]); }); test("filterSpecsByProfile: empty profile yields polyglot P1 scanners", () => { const ids = filterSpecsByProfile(ALL_SPECS, {}) .map((s) => s.id) .sort(); - assert.deepEqual(ids, ["betterleaks", "grype", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, ["betterleaks", "detect-secrets", "grype", "osv-scanner", "semgrep"]); }); test("filterSpecsByProfile: Python + Terraform project enables python + IaC scanners", () 
=> { @@ -136,6 +146,7 @@ test("filterSpecsByProfile: Python + Terraform project enables python + IaC scan "bandit", "betterleaks", "checkov", + "detect-secrets", "grype", "osv-scanner", "pip-audit", @@ -160,6 +171,7 @@ test("filterSpecsByProfile: Docker-only project enables hadolint + trivy + check assert.deepEqual(ids, [ "betterleaks", "checkov", + "detect-secrets", "grype", "hadolint", "osv-scanner", diff --git a/packages/scanners/src/catalog.ts b/packages/scanners/src/catalog.ts index a71605c..4ef9e72 100644 --- a/packages/scanners/src/catalog.ts +++ b/packages/scanners/src/catalog.ts @@ -61,6 +61,26 @@ export const BANDIT_SPEC: ScannerSpec = { license: "Apache-2.0", }; +// detect-secrets — Yelp's polyglot secret scanner. The 20th scanner per +// ROADMAP constraint 10. v1.5.0 shipped 2024-05-06; master is still +// active but no new tag in ~24 months — stale-since flag captured here +// rather than in a dedicated field. Unique value over betterleaks comes +// from KeywordDetector (`admin_password = "hunter2"`) and +// BasicAuthDetector (`https://user:pass@host`) — classes of secrets a +// regex-shape scanner structurally cannot see. 
+export const DETECT_SECRETS_SPEC: ScannerSpec = { + id: "detect-secrets", + name: "detect-secrets", + languages: "all", + iacTypes: [], + sarifNative: false, + installCmd: "pipx install detect-secrets==1.5.0", + version: "1.5.0", + offlineCapable: true, + priority: 1, + license: "Apache-2.0", +}; + export const BIOME_SPEC: ScannerSpec = { id: "biome", name: "Biome", @@ -287,6 +307,7 @@ export const P1_SPECS: readonly ScannerSpec[] = [ BETTERLEAKS_SPEC, OSV_SCANNER_SPEC, BANDIT_SPEC, + DETECT_SECRETS_SPEC, BIOME_SPEC, PIP_AUDIT_SPEC, NPM_AUDIT_SPEC, diff --git a/packages/scanners/src/converters/detect-secrets-to-sarif.test.ts b/packages/scanners/src/converters/detect-secrets-to-sarif.test.ts new file mode 100644 index 0000000..4ab0729 --- /dev/null +++ b/packages/scanners/src/converters/detect-secrets-to-sarif.test.ts @@ -0,0 +1,222 @@ +/** + * detect-secrets JSON → SARIF v2.1.0 converter tests. + * + * Every generated SARIF log is validated against `SarifLogSchema` from + * @opencodehub/sarif so schema drift is caught at the conversion boundary. + */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import { SarifLogSchema } from "@opencodehub/sarif"; +import { detectSecretsJsonToSarif } from "./detect-secrets-to-sarif.js"; + +function assertValidSarif(log: unknown): void { + const result = SarifLogSchema.safeParse(log); + assert.ok(result.success, `expected valid SARIF: ${result.success ? 
"" : result.error.message}`); +} + +test("detectSecretsJsonToSarif emits one result per finding across files", () => { + const json = { + version: "1.5.0", + plugins_used: [], + filters_used: [], + results: { + "src/config.ts": [ + { + type: "AWS Access Key", + filename: "src/config.ts", + hashed_secret: "abc123", + is_verified: false, + line_number: 10, + }, + { + type: "Secret_Keyword", + filename: "src/config.ts", + hashed_secret: "def456", + is_verified: false, + line_number: 11, + }, + ], + "src/db.ts": [ + { + type: "Basic Auth Credentials", + filename: "src/db.ts", + hashed_secret: "ghi789", + is_verified: true, + line_number: 4, + }, + ], + }, + generated_at: "2026-05-09T19:00:00Z", + }; + const log = detectSecretsJsonToSarif(json); + assertValidSarif(log); + assert.equal(log.runs.length, 1); + assert.equal(log.runs[0]?.tool.driver.name, "detect-secrets"); + assert.equal(log.runs[0]?.tool.driver.version, "1.5.0"); + const results = log.runs[0]?.results ?? []; + assert.equal(results.length, 3); + assert.equal(results[0]?.ruleId, "AWSKeyDetector"); + assert.equal(results[1]?.ruleId, "KeywordDetector"); + assert.equal(results[2]?.ruleId, "BasicAuthDetector"); +}); + +test("detectSecretsJsonToSarif marks verified findings as error", () => { + const json = { + results: { + "x.ts": [ + { + type: "AWS Access Key", + filename: "x.ts", + hashed_secret: "h1", + is_verified: true, + line_number: 1, + }, + { + type: "AWS Access Key", + filename: "x.ts", + hashed_secret: "h2", + is_verified: false, + line_number: 2, + }, + ], + }, + }; + const log = detectSecretsJsonToSarif(json); + assertValidSarif(log); + const results = log.runs[0]?.results ?? 
[]; + assert.equal(results[0]?.level, "error"); + assert.equal(results[1]?.level, "warning"); + const props0 = (results[0]?.properties as { opencodehub?: Record } | undefined) + ?.opencodehub; + assert.equal(props0?.["is_verified"], true); +}); + +test("detectSecretsJsonToSarif stamps hashed_secret on partialFingerprints (not as crypto fingerprint)", () => { + const json = { + results: { + "x.ts": [ + { + type: "AWS Access Key", + filename: "x.ts", + hashed_secret: "deadbeef", + line_number: 1, + }, + ], + }, + }; + const log = detectSecretsJsonToSarif(json); + const r = log.runs[0]?.results?.[0]; + // SARIF §3.27.18: partialFingerprints are plugin-defined identifiers, + // NOT a security claim. The slot is named `detect_secrets_sha1` to + // make the (non-cryptographic) algorithm explicit (W-B-1). + assert.equal(r?.partialFingerprints?.["detect_secrets_sha1"], "deadbeef"); +}); + +test("detectSecretsJsonToSarif uses 1-indexed startLine matching SARIF", () => { + const json = { + results: { + "x.ts": [{ type: "AWS Access Key", filename: "x.ts", hashed_secret: "h", line_number: 42 }], + }, + }; + const log = detectSecretsJsonToSarif(json); + const region = log.runs[0]?.results?.[0]?.locations?.[0]?.physicalLocation?.region; + assert.equal(region?.startLine, 42); +}); + +test("detectSecretsJsonToSarif passes overlapping findings through (W-B-2)", () => { + // Two detectors fire on the same line — both must pass through and let + // OCH's downstream SARIF dedupe handle merging. + const json = { + results: { + "secret.py": [ + { + type: "AWS Access Key", + filename: "secret.py", + hashed_secret: "h-aws", + line_number: 7, + }, + { + type: "Secret_Keyword", + filename: "secret.py", + hashed_secret: "h-keyword", + line_number: 7, + }, + ], + }, + }; + const log = detectSecretsJsonToSarif(json); + const results = log.runs[0]?.results ?? 
[]; + assert.equal(results.length, 2); + assert.equal(results[0]?.ruleId, "AWSKeyDetector"); + assert.equal(results[1]?.ruleId, "KeywordDetector"); + assert.equal( + results[0]?.locations?.[0]?.physicalLocation?.region?.startLine, + results[1]?.locations?.[0]?.physicalLocation?.region?.startLine, + ); +}); + +test("detectSecretsJsonToSarif slugs unknown detector types instead of dropping", () => { + const json = { + results: { + "x.ts": [ + { + type: "Future Detector v2", + filename: "x.ts", + hashed_secret: "h", + line_number: 1, + }, + ], + }, + }; + const log = detectSecretsJsonToSarif(json); + const r = log.runs[0]?.results?.[0]; + assert.equal(r?.ruleId, "Future-Detector-v2"); +}); + +test("detectSecretsJsonToSarif emits empty (but valid) SARIF for garbage input", () => { + assertValidSarif(detectSecretsJsonToSarif({})); + assertValidSarif(detectSecretsJsonToSarif(null)); + assertValidSarif(detectSecretsJsonToSarif({ results: "not an object" })); + assertValidSarif(detectSecretsJsonToSarif({ results: [] })); + assert.equal(detectSecretsJsonToSarif({}).runs[0]?.results?.length, 0); + assert.equal( + detectSecretsJsonToSarif(null).runs[0]?.tool.driver.name, + "detect-secrets", + "tool.driver.name must be preserved on empty SARIF (E-B-2)", + ); +}); + +test("detectSecretsJsonToSarif skips findings without a type", () => { + const json = { + results: { + "x.ts": [ + { type: "AWS Access Key", filename: "x.ts", hashed_secret: "ok", line_number: 1 }, + { filename: "x.ts", hashed_secret: "drop", line_number: 2 }, // no type + { type: "", filename: "x.ts", hashed_secret: "drop", line_number: 3 }, // empty type + ], + }, + }; + const log = detectSecretsJsonToSarif(json); + const results = log.runs[0]?.results ?? 
[]; + assert.equal(results.length, 1); + assert.equal(results[0]?.ruleId, "AWSKeyDetector"); +}); + +test("detectSecretsJsonToSarif tolerates findings without hashed_secret", () => { + const json = { + results: { + "x.ts": [ + { + type: "AWS Access Key", + filename: "x.ts", + line_number: 1, + }, + ], + }, + }; + const log = detectSecretsJsonToSarif(json); + const r = log.runs[0]?.results?.[0]; + assert.equal(r?.ruleId, "AWSKeyDetector"); + assert.equal(r?.partialFingerprints, undefined); +}); diff --git a/packages/scanners/src/converters/detect-secrets-to-sarif.ts b/packages/scanners/src/converters/detect-secrets-to-sarif.ts new file mode 100644 index 0000000..432d367 --- /dev/null +++ b/packages/scanners/src/converters/detect-secrets-to-sarif.ts @@ -0,0 +1,200 @@ +/** + * detect-secrets JSON → SARIF v2.1.0 converter. + * + * detect-secrets does not emit SARIF natively (Yelp/detect-secrets#488 is + * still open as P4/help-wanted). Its `scan` subcommand writes JSON on + * stdout shaped like: + * + * { + * "version": "1.5.0", + * "plugins_used": [...], + * "filters_used": [...], + * "results": { + * "": [ + * { + * "type": "AWS Access Key", + * "filename": "", + * "hashed_secret": "", + * "is_verified": false, + * "line_number": 42 + * } + * ] + * }, + * "generated_at": "..." + * } + * + * We emit one SARIF result per finding: + * - ruleId = type-string slug (e.g. "AWSKeyDetector") + * - level = "warning" (verified=true → "error") + * - message = " detected in " + * - location = artifactLocation { uri: "" }, region.startLine + * - properties.opencodehub.is_verified = boolean + * - partialFingerprints.detect_secrets_sha1 = hashed_secret + * + * We do NOT advertise hashed_secret as a cryptographic fingerprint + * (W-B-1) — SHA-1 is not collision-resistant. The + * `partialFingerprints.detect_secrets_sha1` slot is documented as a + * plugin-defined identifier per SARIF §3.27.18, not a security claim. 
+ * + * Overlapping findings (KeywordDetector + AWSKeyDetector on the same + * line) are NOT deduplicated here (W-B-2) — both pass through and rely + * on OCH's downstream SARIF dedupe at merge time. + * + * The output is validated against `SarifLogSchema` from @opencodehub/sarif + * before being returned, so malformed emissions never leak downstream. + */ + +import type { SarifLog, SarifResult, SarifRun } from "@opencodehub/sarif"; +import { SarifLogSchema } from "@opencodehub/sarif"; +import { DETECT_SECRETS_SPEC } from "../catalog.js"; + +/** + * Stable detect-secrets `type` → SARIF ruleId map. Each detector class + * is referenced by the spaced human-readable name detect-secrets emits in + * its JSON output. Source: `detect-secrets --list-all-plugins` (v1.5.0). + * + * Unknown types fall back to a slug derived from the type string, so + * future detector additions in detect-secrets do not break the converter + * — they just emit a generic ruleId until this table is updated. + */ +const TYPE_TO_RULE_ID: Readonly> = { + "Artifactory Credentials": "ArtifactoryDetector", + "AWS Access Key": "AWSKeyDetector", + "Azure Storage Account access key": "AzureStorageKeyDetector", + "Basic Auth Credentials": "BasicAuthDetector", + "Cloudant Credentials": "CloudantDetector", + "Discord Bot Token": "DiscordBotTokenDetector", + "GitHub Token": "GitHubTokenDetector", + "GitLab Token": "GitLabTokenDetector", + "Base64 High Entropy String": "Base64HighEntropyString", + "Hex High Entropy String": "HexHighEntropyString", + "IBM Cloud IAM Key": "IbmCloudIamDetector", + "IBM COS HMAC Credentials": "IbmCosHmacDetector", + Secret_Keyword: "KeywordDetector", + "Mailchimp Access Key": "MailchimpDetector", + "NPM tokens": "NpmDetector", + "OpenAI Token": "OpenAIDetector", + "Private Key": "PrivateKeyDetector", + "PyPI upload token": "PypiTokenDetector", + "SendGrid API Key": "SendGridDetector", + "Slack Token": "SlackDetector", + "SoftLayer Credentials": "SoftlayerDetector", + "Square 
OAuth Secret": "SquareOAuthDetector", + "Stripe Access Key": "StripeDetector", + "Telegram Bot Token": "TelegramBotTokenDetector", + "Twilio API Key": "TwilioKeyDetector", +}; + +interface DetectSecretsFinding { + readonly type?: string; + readonly filename?: string; + readonly hashed_secret?: string; + readonly is_verified?: boolean; + readonly line_number?: number; +} + +interface DetectSecretsReport { + readonly results?: Readonly>; +} + +/** + * Convert a detect-secrets JSON object (already parsed) to a SARIF + * v2.1.0 log. Unknown / malformed input → an empty (but schema-valid) + * SARIF log attributed to detect-secrets. + */ +export function detectSecretsJsonToSarif(json: unknown): SarifLog { + const results: SarifResult[] = []; + const report = asReport(json); + + for (const [filename, findings] of Object.entries(report.results ?? {})) { + for (const finding of findings) { + const result = findingToResult(filename, finding); + if (result !== undefined) results.push(result); + } + } + + const run: SarifRun = { + tool: { driver: { name: DETECT_SECRETS_SPEC.id, version: DETECT_SECRETS_SPEC.version } }, + results, + }; + const log: SarifLog = { version: "2.1.0", runs: [run] }; + + // Defensive — the shape above is pure and should always validate. + // Returning the unvalidated log is safer than throwing. + const parsed = SarifLogSchema.safeParse(log); + if (!parsed.success) return { version: "2.1.0", runs: [run] }; + return parsed.data; +} + +function findingToResult(filename: string, finding: DetectSecretsFinding): SarifResult | undefined { + if (typeof finding.type !== "string" || finding.type.length === 0) return undefined; + const ruleId = TYPE_TO_RULE_ID[finding.type] ?? slugForUnknownType(finding.type); + // detect-secrets uses 1-indexed line numbers, which matches SARIF. + const startLine = + typeof finding.line_number === "number" && finding.line_number >= 1 ? 
finding.line_number : 1;
+  const isVerified = finding.is_verified === true;
+  const result: SarifResult = {
+    ruleId,
+    level: isVerified ? "error" : "warning",
+    message: { text: `${finding.type} detected in ${filename}` },
+    locations: [
+      {
+        physicalLocation: {
+          artifactLocation: { uri: filename },
+          region: { startLine },
+        },
+      },
+    ],
+    properties: {
+      opencodehub: {
+        is_verified: isVerified,
+      },
+    },
+  };
+  if (typeof finding.hashed_secret === "string" && finding.hashed_secret.length > 0) {
+    return {
+      ...result,
+      partialFingerprints: { detect_secrets_sha1: finding.hashed_secret },
+    };
+  }
+  return result;
+}
+
+function slugForUnknownType(type: string): string {
+  // Drop non-alphanumerics, preserve word boundaries.
+  return type.replace(/[^A-Za-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
+}
+
+function asReport(json: unknown): DetectSecretsReport {
+  if (typeof json !== "object" || json === null) return {};
+  const obj = json as Record<string, unknown>;
+  const rawResults = obj["results"];
+  if (typeof rawResults !== "object" || rawResults === null || Array.isArray(rawResults)) {
+    return {};
+  }
+  const out: Record<string, DetectSecretsFinding[]> = {};
+  for (const [filename, findings] of Object.entries(rawResults as Record<string, unknown>)) {
+    if (!Array.isArray(findings)) continue;
+    const list: DetectSecretsFinding[] = [];
+    for (const f of findings) {
+      if (typeof f !== "object" || f === null) continue;
+      const row = f as Record<string, unknown>;
+      const finding: DetectSecretsFinding = {
+        ...(typeof row["type"] === "string" ? { type: row["type"] as string } : {}),
+        ...(typeof row["filename"] === "string" ? { filename: row["filename"] as string } : {}),
+        ...(typeof row["hashed_secret"] === "string"
+          ? { hashed_secret: row["hashed_secret"] as string }
+          : {}),
+        ...(typeof row["is_verified"] === "boolean"
+          ? { is_verified: row["is_verified"] as boolean }
+          : {}),
+        ...(typeof row["line_number"] === "number"
+          ? 
{ line_number: row["line_number"] as number } + : {}), + }; + list.push(finding); + } + out[filename] = list; + } + return { results: out }; +} diff --git a/packages/scanners/src/index.ts b/packages/scanners/src/index.ts index 0df5cb6..d0c74b8 100644 --- a/packages/scanners/src/index.ts +++ b/packages/scanners/src/index.ts @@ -11,11 +11,12 @@ * `filterSpecsByProfile`, `findSpec`. * - Runner: `runScanners(path, wrappers, opts)` — concurrent runner. * - P1 wrappers: createSemgrepWrapper / createBetterleaksWrapper / - * createOsvScannerWrapper / createBanditWrapper / createBiomeWrapper / - * createPipAuditWrapper / createNpmAuditWrapper. + * createOsvScannerWrapper / createBanditWrapper / createDetectSecretsWrapper / + * createBiomeWrapper / createPipAuditWrapper / createNpmAuditWrapper. * - P2 wrappers: createTrivyWrapper / createCheckovWrapper / * createHadolintWrapper / createTflintWrapper / createSpectralWrapper. - * - Converters: pipAuditJsonToSarif / npmAuditJsonToSarif. + * - Converters: pipAuditJsonToSarif / npmAuditJsonToSarif / + * detectSecretsJsonToSarif. * - `createDefaultWrappers(specs, deps?, ctx?)` — materialize wrappers * from specs for the runner. 
*/ @@ -29,6 +30,7 @@ export { CHECKOV_DOCKER_COMPOSE_SPEC, CHECKOV_SPEC, CLAMAV_SPEC, + DETECT_SECRETS_SPEC, filterSpecsByLanguages, filterSpecsByProfile, findSpec, @@ -48,6 +50,7 @@ export { TY_SPEC, VULTURE_SPEC, } from "./catalog.js"; +export { detectSecretsJsonToSarif } from "./converters/detect-secrets-to-sarif.js"; export type { NpmAuditConvertOptions } from "./converters/npm-audit-to-sarif.js"; export { npmAuditJsonToSarif } from "./converters/npm-audit-to-sarif.js"; export type { PipAuditConvertOptions } from "./converters/pip-audit-to-sarif.js"; @@ -72,6 +75,7 @@ export { createBiomeWrapper } from "./wrappers/biome.js"; export type { CheckovWrapperOptions } from "./wrappers/checkov.js"; export { createCheckovWrapper } from "./wrappers/checkov.js"; export { createClamAvWrapper } from "./wrappers/clamav.js"; +export { createDetectSecretsWrapper } from "./wrappers/detect-secrets.js"; export type { CheckovDockerComposeWrapperOptions } from "./wrappers/docker-compose.js"; export { createCheckovDockerComposeWrapper } from "./wrappers/docker-compose.js"; export { createGrypeWrapper } from "./wrappers/grype.js"; @@ -99,6 +103,7 @@ import { CHECKOV_DOCKER_COMPOSE_SPEC, CHECKOV_SPEC, CLAMAV_SPEC, + DETECT_SECRETS_SPEC, GRYPE_SPEC, HADOLINT_SPEC, NPM_AUDIT_SPEC, @@ -119,6 +124,7 @@ import { createBetterleaksWrapper } from "./wrappers/betterleaks.js"; import { createBiomeWrapper } from "./wrappers/biome.js"; import { type CheckovWrapperOptions, createCheckovWrapper } from "./wrappers/checkov.js"; import { createClamAvWrapper } from "./wrappers/clamav.js"; +import { createDetectSecretsWrapper } from "./wrappers/detect-secrets.js"; import { type CheckovDockerComposeWrapperOptions, createCheckovDockerComposeWrapper, @@ -194,6 +200,8 @@ function createWrapperFor( return deps ? createOsvScannerWrapper(deps) : createOsvScannerWrapper(); case BANDIT_SPEC.id: return deps ? createBanditWrapper(deps) : createBanditWrapper(); + case DETECT_SECRETS_SPEC.id: + return deps ? 
createDetectSecretsWrapper(deps) : createDetectSecretsWrapper(); case BIOME_SPEC.id: return deps ? createBiomeWrapper(deps) : createBiomeWrapper(); case PIP_AUDIT_SPEC.id: diff --git a/packages/scanners/src/wrappers/detect-secrets.ts b/packages/scanners/src/wrappers/detect-secrets.ts new file mode 100644 index 0000000..caf94b1 --- /dev/null +++ b/packages/scanners/src/wrappers/detect-secrets.ts @@ -0,0 +1,76 @@ +/** + * detect-secrets wrapper — Yelp's polyglot secret scanner. The 20th + * scanner per ROADMAP constraint 10. + * + * Invocation: + * + * detect-secrets scan . --all-files + * + * `--all-files` matches betterleaks's posture (scan non-git-tracked + * files too) and is the ergonomic default for monorepo scans. The + * `scan` subcommand always emits JSON on stdout — there is no `--json` + * flag at this entry point. (The `--json` flag exists only on the + * separate `detect-secrets-hook` pre-commit entry point.) + * + * Output is JSON, NOT SARIF — we post-process stdout through + * `detectSecretsJsonToSarif` before returning. detect-secrets exits 0 + * on findings, so `invokeScanner`'s default exit-code tolerance is fine. 
+ */ + +import { DETECT_SECRETS_SPEC } from "../catalog.js"; +import { detectSecretsJsonToSarif } from "../converters/detect-secrets-to-sarif.js"; +import { tryParseJson } from "../exec.js"; +import type { ScannerRunContext, ScannerRunResult, ScannerWrapper } from "../spec.js"; +import { emptySarifFor } from "../spec.js"; +import { DEFAULT_DEPS, type WrapperDeps } from "./shared.js"; + +const DETECT_SECRETS_ARGS: readonly string[] = ["scan", ".", "--all-files"]; + +export function createDetectSecretsWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrapper { + return { + spec: DETECT_SECRETS_SPEC, + run: async (ctx: ScannerRunContext): Promise<ScannerRunResult> => { + const started = performance.now(); + const probe = await deps.which("detect-secrets"); + if (!probe.found) { + const msg = `${DETECT_SECRETS_SPEC.id}: binary 'detect-secrets' not found on PATH (install: ${DETECT_SECRETS_SPEC.installCmd}).`; + ctx.onWarn?.(msg); + return { + spec: DETECT_SECRETS_SPEC, + sarif: emptySarifFor(DETECT_SECRETS_SPEC), + skipped: msg, + durationMs: performance.now() - started, + }; + } + const result = await deps.runBinary("detect-secrets", DETECT_SECRETS_ARGS, { + timeoutMs: ctx.timeoutMs, + cwd: ctx.projectPath, + }); + const json = tryParseJson(result.stdout); + if (json === undefined) { + ctx.onWarn?.( + `${DETECT_SECRETS_SPEC.id}: stdout was not valid JSON (stderr: ${truncate( + result.stderr, + 200, + )}); emitting empty SARIF.`, + ); + return { + spec: DETECT_SECRETS_SPEC, + sarif: emptySarifFor(DETECT_SECRETS_SPEC), + durationMs: performance.now() - started, + }; + } + const sarif = detectSecretsJsonToSarif(json); + return { + spec: DETECT_SECRETS_SPEC, + sarif, + durationMs: performance.now() - started, + }; + }, + }; +} + +function truncate(s: string, max: number): string { + if (s.length <= max) return s.trim(); + return `${s.slice(0, max).trim()}…`; +} diff --git a/packages/scanners/src/wrappers/wrappers.test.ts b/packages/scanners/src/wrappers/wrappers.test.ts index
24262f8..468d4ea 100644 --- a/packages/scanners/src/wrappers/wrappers.test.ts +++ b/packages/scanners/src/wrappers/wrappers.test.ts @@ -14,6 +14,7 @@ import type { ScannerRunContext } from "../spec.js"; import { createBanditWrapper } from "./bandit.js"; import { createBetterleaksWrapper } from "./betterleaks.js"; import { createBiomeWrapper } from "./biome.js"; +import { createDetectSecretsWrapper } from "./detect-secrets.js"; import { createOsvScannerWrapper } from "./osv-scanner.js"; import { createSemgrepWrapper } from "./semgrep.js"; import type { WrapperDeps } from "./shared.js"; @@ -166,3 +167,80 @@ test("wrappers emit empty SARIF when stdout is malformed", async () => { const out = await wrapper.run(ctx); assert.equal(out.sarif.runs[0]?.results?.length, 0); }); + +// ---------- detect-secrets ------------------------------------------------ + +test("detect-secrets wrapper invokes `scan . --all-files`", async () => { + const json = { + version: "1.5.0", + results: { + "src/x.ts": [ + { + type: "AWS Access Key", + filename: "src/x.ts", + hashed_secret: "h", + is_verified: false, + line_number: 5, + }, + ], + }, + }; + const { deps, calls } = makeFakeDeps(() => ({ stdout: JSON.stringify(json), exitCode: 0 })); + const wrapper = createDetectSecretsWrapper(deps); + const out = await wrapper.run(ctx); + assert.equal(calls.length, 1); + assert.equal(calls[0]?.cmd, "detect-secrets"); + assert.deepEqual([...(calls[0]?.args ?? [])], ["scan", ".", "--all-files"]); + assert.equal(out.sarif.runs[0]?.tool.driver.name, "detect-secrets"); + assert.equal(out.sarif.runs[0]?.results?.[0]?.ruleId, "AWSKeyDetector"); +}); + +test("detect-secrets wrapper returns empty SARIF + skipped when binary missing", async () => { + const { deps } = makeFakeDeps(() => ({ stdout: "" }), { missing: ["detect-secrets"] }); + const wrapper = createDetectSecretsWrapper(deps); + const out = await wrapper.run(ctx); + // E-B-2: tool.driver.name must be preserved even when skipped. 
+ assert.equal(out.sarif.runs[0]?.tool.driver.name, "detect-secrets"); + assert.equal(out.sarif.runs[0]?.results?.length, 0); + assert.ok(out.skipped?.includes("not found on PATH")); +}); + +test("detect-secrets wrapper emits empty SARIF when stdout is malformed", async () => { + const { deps } = makeFakeDeps(() => ({ stdout: "this is not json", exitCode: 0 })); + const wrapper = createDetectSecretsWrapper(deps); + const out = await wrapper.run(ctx); + assert.equal(out.sarif.runs[0]?.tool.driver.name, "detect-secrets"); + assert.equal(out.sarif.runs[0]?.results?.length, 0); +}); + +test("detect-secrets wrapper passes overlapping findings through (W-B-2)", async () => { + // KeywordDetector + AWSKeyDetector firing on the same line: both must + // appear in the SARIF output; OCH's downstream merge handles dedupe. + const json = { + results: { + "src/secret.py": [ + { + type: "AWS Access Key", + filename: "src/secret.py", + hashed_secret: "h1", + is_verified: false, + line_number: 7, + }, + { + type: "Secret_Keyword", + filename: "src/secret.py", + hashed_secret: "h2", + is_verified: false, + line_number: 7, + }, + ], + }, + }; + const { deps } = makeFakeDeps(() => ({ stdout: JSON.stringify(json), exitCode: 0 })); + const wrapper = createDetectSecretsWrapper(deps); + const out = await wrapper.run(ctx); + const results = out.sarif.runs[0]?.results ?? 
[]; + assert.equal(results.length, 2); + assert.equal(results[0]?.ruleId, "AWSKeyDetector"); + assert.equal(results[1]?.ruleId, "KeywordDetector"); +}); diff --git a/packages/scip-ingest/src/derive.test.ts b/packages/scip-ingest/src/derive.test.ts index fd217e8..e321298 100644 --- a/packages/scip-ingest/src/derive.test.ts +++ b/packages/scip-ingest/src/derive.test.ts @@ -91,7 +91,9 @@ test("buildSymbolDefIndex: records each symbol's first DEFINITION site across do const calleeEdges = deriveEdges(docA).filter((e) => e.callee === fooB); assert.equal(calleeEdges.length, 1, "callerA calls fooB exactly once"); - const resolved = defs.get(calleeEdges[0]!.callee); + const firstEdge = calleeEdges[0]; + assert.ok(firstEdge); + const resolved = defs.get(firstEdge.callee); assert.equal(resolved?.file, "src/b.ts"); assert.equal(resolved?.line, 5); }); @@ -158,7 +160,7 @@ test("deriveEdges: attributes calls inside a nested local def to the enclosing n const edges = deriveEdges(d); assert.equal(edges.length, 1, "expected exactly one derived edge"); - assert.equal(edges[0]!.caller, outer); - assert.equal(edges[0]!.callee, callee); - assert.equal(edges[0]!.kind, "CALLS"); + assert.equal(edges[0]?.caller, outer); + assert.equal(edges[0]?.callee, callee); + assert.equal(edges[0]?.kind, "CALLS"); }); diff --git a/packages/search/src/bm25.test.ts b/packages/search/src/bm25.test.ts index 6cc90ff..c62b7c4 100644 --- a/packages/search/src/bm25.test.ts +++ b/packages/search/src/bm25.test.ts @@ -46,7 +46,6 @@ class StubStore implements IGraphStore { async listEmbeddingHashes(): Promise> { return new Map(); } - // biome-ignore lint/correctness/useYield: empty async iterable, no rows to yield async *listEmbeddings(): AsyncIterable {} async listNodes(): Promise { return []; diff --git a/packages/search/src/hybrid.test.ts b/packages/search/src/hybrid.test.ts index b0f1803..63ec4ab 100644 --- a/packages/search/src/hybrid.test.ts +++ b/packages/search/src/hybrid.test.ts @@ -64,7 +64,6 @@ class 
StubStore implements IGraphStore { async listEmbeddingHashes(): Promise> { return new Map(); } - // biome-ignore lint/correctness/useYield: empty async iterable, no rows to yield async *listEmbeddings(): AsyncIterable {} async listNodes(): Promise { return []; diff --git a/packages/wiki/src/index.test.ts b/packages/wiki/src/index.test.ts index af32f22..14b3e70 100644 --- a/packages/wiki/src/index.test.ts +++ b/packages/wiki/src/index.test.ts @@ -233,7 +233,6 @@ class WikiFakeStore implements IGraphStore { async listEmbeddingHashes(): Promise> { return new Map(); } - // biome-ignore lint/correctness/useYield: empty stream — no embeddings in the wiki fixture async *listEmbeddings(): AsyncIterable {} async listNodesByEntryPoint(_entryPointId: string): Promise { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cb691f1..ebde7f6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,7 +17,7 @@ overrides: picomatch@<2.3.2: 2.3.2 tmp@<0.2.4: 0.2.4 dompurify@<3.4.0: 3.4.0 - hono@<4.12.16: 4.12.16 + hono@<4.12.18: 4.12.18 ip-address@<10.1.1: 10.1.1 fast-uri@<3.1.2: 3.1.2 fast-xml-builder@<1.1.7: 1.1.7 @@ -1161,7 +1161,7 @@ packages: resolution: {integrity: sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==} engines: {node: '>=18.14.1'} peerDependencies: - hono: 4.12.16 + hono: 4.12.18 '@huggingface/tokenizers@0.1.3': resolution: {integrity: sha512-8rF/RRT10u+kn7YuUbUg0OF30K8rjTc78aHpxT+qJ1uWSqxT1MHi8+9ltwYfkFYJzT/oS+qw3JVfHtNMGAdqyA==} @@ -2642,8 +2642,8 @@ packages: resolution: {integrity: sha512-eSmmWE5bZTK2Nou4g0AI3zZ9rswp7GRKoKXS1BLUkvPviOqs4YTN1djQIqrXy9k5gEtdLPy86JjRwsNM9tnDcA==} engines: {node: '>=0.10.0'} - hono@4.12.16: - resolution: {integrity: sha512-jN0ZewiNAWSe5khM3EyCmBb250+b40wWbwNILNfEvq84VREWwOIkuUsFONk/3i3nqkz7Oe1PcpM2mwQEK2L9Kg==} + hono@4.12.18: + resolution: {integrity: sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ==} engines: {node: '>=16.9.0'} 
hosted-git-info@6.1.3: @@ -4938,9 +4938,9 @@ snapshots: nan: 2.26.2 prebuild-install: 7.1.3 - '@hono/node-server@1.19.14(hono@4.12.16)': + '@hono/node-server@1.19.14(hono@4.12.18)': dependencies: - hono: 4.12.16 + hono: 4.12.18 '@huggingface/tokenizers@0.1.3': {} @@ -5131,7 +5131,7 @@ snapshots: '@modelcontextprotocol/sdk@1.29.0(zod@4.4.3)': dependencies: - '@hono/node-server': 1.19.14(hono@4.12.16) + '@hono/node-server': 1.19.14(hono@4.12.18) ajv: 8.18.0 ajv-formats: 3.0.1(ajv@8.18.0) content-type: 1.0.5 @@ -5141,7 +5141,7 @@ snapshots: eventsource-parser: 3.0.6 express: 5.2.1 express-rate-limit: 8.3.2(express@5.2.1) - hono: 4.12.16 + hono: 4.12.18 jose: 6.2.2 json-schema-typed: 8.0.2 pkce-challenge: 5.0.1 @@ -6651,7 +6651,7 @@ snapshots: dependencies: parse-passwd: 1.0.0 - hono@4.12.16: {} + hono@4.12.18: {} hosted-git-info@6.1.3: dependencies: