From 20dd38cb226b3f453eef72b6312e8bc01cbdd759 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon <9553966+theagenticguy@users.noreply.github.com> Date: Fri, 29 May 2026 16:17:48 -0500 Subject: [PATCH] fix(scanners): exclude indexer-ignored dirs from vulture/radon/ty (drop .venv noise) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vulture/radon/ty walk the project tree directly with no exclude, so they descend into .venv/ and report library dead-code, complexity, and type findings. On a uv-managed repo this was 127/133 findings — 95% library noise drowning the 6 real ones. The indexer already excludes these dirs via pipeline.HARDCODED_IGNORES; the scan runner never threaded that set into the wrappers. - VultureWrapperOptions.excludeGlobs → vulture --exclude, each ignore name ANCHORED to a path segment (*/.venv/* not bare .venv, which vulture would substring-match and so also suppress src/.venv_helpers.py). - RadonWrapperOptions.ignoreDirs → radon -i (matches directory basenames, so bare names are correct here). - TyWrapperOptions.excludeGlobs → one ty --exclude NAME/ per ignore name (the trailing slash anchors it to a directory) plus --force-exclude (CLI-named paths bypass excludes otherwise). - DefaultWrapperContext gains vulture/radon/ty; createWrapperFor threads them the pip-audit-options way; scan.ts buildWrapperContext populates all three from pipeline.HARDCODED_IGNORES (single source of truth, no drift). Verified end-to-end: codehub scan on ngs-research-agent now reports 6 findings (was 133) — vulture contributes 0 .venv noise; the 6 real findings (2 semgrep, 1 osv, 1 pip-audit, 2 radon) are unchanged. Scanners 94/94, cli 263/263, tsc + biome clean. Field-report Issue 2. 
--- .../world-class-code-exploration.mjs | 415 ++++++++++++++++++ packages/cli/src/commands/scan.ts | 18 + packages/scanners/src/index.ts | 18 +- .../src/wrappers/extended-wrappers.test.ts | 20 + packages/scanners/src/wrappers/radon.ts | 21 +- packages/scanners/src/wrappers/ty.ts | 24 +- packages/scanners/src/wrappers/vulture.ts | 39 +- 7 files changed, 543 insertions(+), 12 deletions(-) create mode 100644 .claude/workflows/world-class-code-exploration.mjs diff --git a/.claude/workflows/world-class-code-exploration.mjs b/.claude/workflows/world-class-code-exploration.mjs new file mode 100644 index 00000000..bd6b38d1 --- /dev/null +++ b/.claude/workflows/world-class-code-exploration.mjs @@ -0,0 +1,415 @@ +export const meta = { + name: 'world-class-code-exploration', + description: 'Root-cause + design + verify fixes for the OpenCodeHub field-report issues and chart what makes code exploration world-class', + phases: [ + { title: 'Diagnose', detail: 'pin exact root cause + fix site for each issue (grounded in code)' }, + { title: 'Vision', detail: 'parallel lenses on what world-class code-exploration requires beyond the reported issues' }, + { title: 'Design', detail: 'concrete fix design per issue, with test shape' }, + { title: 'Adversarial verify', detail: 'skeptics try to refute each design' }, + { title: 'Synthesize', detail: 'prioritized roadmap: confirmed fixes + vision gaps' }, + ], +} + +// --------------------------------------------------------------------------- +// Shared context — the grounded findings I (orchestrator) already proved, so +// agents don't re-derive from scratch and don't repeat the field report's +// wrong hypotheses. Each agent re-verifies against the live code. 
+// --------------------------------------------------------------------------- +const REPO = '/Users/lalsaado/Projects/open-code-hub' +const SUBJECT = '/Users/lalsaado/Projects/ngs-research-agent' + +const GROUNDING = ` +You are improving OpenCodeHub (OCH), a local code-graph + MCP tool for AI-driven +code exploration. Repo root: ${REPO}. A field report drove an exploration session +THROUGH the codehub CLI against subject repo ${SUBJECT} (a Python stdio MCP server, +src-layout package ngs_research_agent) and filed 6 issues. + +CRITICAL grounded facts the orchestrator already PROVED empirically (re-verify, do +not contradict without equally strong evidence — cite file:line + a repro): + +ISSUE 1 (the report's headline "cross-module CALLS edges drop / FQN-vs-filepath +node-identity mismatch") — the report's hypothesis is WRONG for Python. Proven: + - scip-python emits ONE symbol string for both the def and every ref of + get_bedrock_client (no src/dist or external/FQN split like TS has). + - The decorated function get_bedrock_client (sole @cache-decorated def in + client.py) is DROPPED from the persisted lbug graph. WASM parse captures it, + pythonProvider.extractDefinitions returns it [146-171], idForDefinition gives a + unique id, KnowledgeGraph.addNode (packages/core-types/src/graph.ts) dedups by + id — yet the final graph has 5/6 client.py Function nodes; get_bedrock_client's + Function node is absent while its body Variables AND a Process node referencing + its (missing) Function id DO persist. Discriminator vs the 5 survivors: it's the + only bare-name @cache-decorated def. So the real bug is "decorated function def + lost between extraction and persistence/bulk-load". Suspect the lbug node COPY + struct-field type seeding (packages/storage/src/graphdb-adapter.ts + NODE_COPY_SUBQUERY / NODE_SENTINEL_ID) OR a later phase, OR decorated_definition + range handling. 
+ - Bug A (independent, confirmed): extractPyImports in + packages/ingestion/src/providers/python.ts is LINE-BASED and silently drops + multi-line parenthesized imports: \`from pkg.mod import (\\n a,\\n b,\\n)\` → + first line rest="(" → 0 names → discarded. Ubiquitous in real Python. + - Bug B (confirmed): preprocessPyImportPath leaves dotted absolute imports + unchanged; resolveImportTarget (packages/ingestion/src/pipeline/phases/parse.ts + :761) only handles ./ ../ / → src-layout package imports (ngs_research_agent.client + → src/ngs_research_agent/client.py) stub as . + +ISSUE 2: scan runner runs vulture (and radon, ty) against the absolute repo tree +with NO exclude → vulture walks .venv/ → 127/133 findings are library noise. The +indexer already excludes via HARDCODED_IGNORES (packages/ingestion/src/pipeline/ +gitignore.ts:225 incl ".venv"), exported via the pipeline barrel. semgrep/ruff dodge +it by targeting "." and honoring gitignore. Fix: thread an exclude list into a new +VultureWrapperOptions (mirror pip-audit's options plumbing through +DefaultWrapperContext), populate from pipeline.HARDCODED_IGNORES in CLI +buildWrapperContext (packages/cli/src/commands/scan.ts), emit vulture --exclude +. Apply to radon.ts + ty.ts too. + +ISSUE 3: list_findings/list_dead_code/license_audit/owners/route_map/project_profile/ +risk_trends/api_impact are MCP-only; no CLI subcommand (CLI uses commander, entry +packages/cli/src/index.ts; verdict is the canonical CLI↔MCP shared-fn template — +both call computeVerdict from @opencodehub/analysis). list_findings (store.graph +.listFindings), list_dead_code (classifyDeadness), license_audit (classifyDependencies ++ listDependencies), project_profile (listNodesByKind), risk_trends (computeRiskTrends ++ loadSnapshots, already used in wiki.ts) are THIN; owners/route_map/api_impact are +inlined in MCP handlers (need extraction to @opencodehub/analysis). 
+ +ISSUE 4: codehub sql exposes only cochanges + symbol_summaries (DuckDB temporal +tier); the node/edge graph lives in lbug (graph.lbug) and is NOT SQL-queryable. +Docs oversell "SQL against the graph store". Fix docs framing or add a read-only +nodes/edges view. + +ISSUE 5: symbol_summaries empty → query silently runs BM25-only even though doctor +reports embedder weights present. status should surface "summaries: N / vectors: +bm25-only|hybrid". + +ISSUE 6: doctor reports "bandit OK" by binary presence but bandit lacks the [sarif] +extra → scan can't use it (argparse rejects -f sarif, exit 2, 0 findings). doctor +should probe the formatter, not just --version. (installCmd already fixed to +uv tool install 'bandit[sarif]' in a merged PR.) + +Storage interface: IGraphStore in packages/storage/src/interface.ts has listNodes, +listNodesByKind, listEdgesByType, listFindings, listDependencies, listRoutes. +ITemporalStore holds cochanges + symbol_summaries. ADR 0016 (DuckDB graph rip), +ADR 0015 (WASM-only parser). Durable lessons in .erpaval/INDEX.md. +` + +// --------------------------------------------------------------------------- +// Schemas +// --------------------------------------------------------------------------- +const DIAGNOSIS_SCHEMA = { + type: 'object', + additionalProperties: false, + required: ['issue', 'rootCause', 'evidence', 'fixSites', 'severity', 'confidence'], + properties: { + issue: { type: 'string', description: 'Issue id, e.g. 
"Issue 1" / "Issue 1 Bug A"' }, + rootCause: { type: 'string', description: 'The precise mechanism, in 1-3 sentences' }, + evidence: { + type: 'array', items: { type: 'string' }, + description: 'file:line citations + repro observations that prove the root cause', + }, + fixSites: { + type: 'array', + items: { + type: 'object', additionalProperties: false, + required: ['file', 'what'], + properties: { + file: { type: 'string' }, + what: { type: 'string', description: 'the change to make at this site' }, + }, + }, + }, + severity: { enum: ['HIGH', 'MEDIUM', 'LOW'] }, + confidence: { type: 'number', description: '0..1 that this root cause is correct' }, + openQuestions: { type: 'array', items: { type: 'string' } }, + }, +} + +const VISION_SCHEMA = { + type: 'object', additionalProperties: false, + required: ['lens', 'gaps'], + properties: { + lens: { type: 'string' }, + gaps: { + type: 'array', + items: { + type: 'object', additionalProperties: false, + required: ['capability', 'whyItMatters', 'effort', 'leverage'], + properties: { + capability: { type: 'string', description: 'a missing/weak capability for world-class code exploration' }, + whyItMatters: { type: 'string' }, + existingFoundation: { type: 'string', description: 'what in OCH today it builds on (file/tool), or "greenfield"' }, + effort: { enum: ['S', 'M', 'L', 'XL'] }, + leverage: { enum: ['transformational', 'high', 'medium', 'low'] }, + }, + }, + }, + }, +} + +const DESIGN_SCHEMA = { + type: 'object', additionalProperties: false, + required: ['issue', 'approach', 'diffSketch', 'testShape', 'risks', 'blastRadius'], + properties: { + issue: { type: 'string' }, + approach: { type: 'string', description: 'the concrete fix, including exact functions/signatures touched' }, + diffSketch: { type: 'string', description: 'pseudo-diff or precise prose of the edits per file' }, + testShape: { type: 'string', description: 'the regression test(s) to add and where' }, + risks: { type: 'array', items: { type: 'string' 
} }, + blastRadius: { type: 'string', description: 'what else could break; which packages rebuild' }, + }, +} + +const VERDICT_SCHEMA = { + type: 'object', additionalProperties: false, + required: ['issue', 'holds', 'reason'], + properties: { + issue: { type: 'string' }, + holds: { type: 'boolean', description: 'true if the design is sound and the root cause is right' }, + reason: { type: 'string' }, + mustFix: { type: 'array', items: { type: 'string' }, description: 'concrete corrections the design needs before implementation' }, + }, +} + +// --------------------------------------------------------------------------- +// PHASE 1 — Diagnose: one agent per issue, grounded, returns structured RC. +// Issue 1 gets the deepest treatment (its own dedicated bisection agent). +// --------------------------------------------------------------------------- +const ISSUES = [ + { + id: 'Issue 1 (decorated-func drop)', + label: 'diag:issue1-core', + prompt: `${GROUNDING} + +YOUR TASK: Pin the EXACT drop point for the @cache-decorated get_bedrock_client +Function node. It survives pythonProvider.extractDefinitions but is absent from the +persisted lbug graph. Bisect the path: parse phase addNode loop +(packages/ingestion/src/pipeline/phases/parse.ts ~363-378) → later phases that +mutate nodes (processes.ts, accesses.ts, orm.ts, ownership.ts) → the lbug bulk-load +(packages/storage/src/graphdb-adapter.ts NODE_COPY_SUBQUERY, struct-field type +seeding, COPY ... IGNORE_ERRORS, any per-row filter on null startLine/endLine or +field-shape). Read every candidate. Form ONE concrete root-cause hypothesis with the +exact file:line where the node is dropped or overwritten, and explain why ONLY the +decorated def is affected (what's structurally different about its GraphNode — range +from decorated_definition? a field that trips the COPY type-seeding?). Also confirm +or correct Bug A (multi-line imports) and Bug B (src-layout resolution) with file:line. 
+Return THREE diagnosis objects (Issue 1 core, Issue 1 Bug A, Issue 1 Bug B) — but +this schema is one object, so return the CORE one here and put Bug A + Bug B findings +in openQuestions as "Bug A: ..." / "Bug B: ..." one-liners with their fix sites.`, + schema: DIAGNOSIS_SCHEMA, + }, + { + id: 'Issue 2 (vulture .venv)', + label: 'diag:issue2', + prompt: `${GROUNDING}\n\nYOUR TASK: Confirm Issue 2 root cause and the cleanest fix +site. Read packages/scanners/src/wrappers/{vulture,radon,ty,semgrep,ruff}.ts, +packages/scanners/src/spec.ts (ScannerRunContext), packages/scanners/src/index.ts +(DefaultWrapperContext, createWrapperFor), packages/cli/src/commands/scan.ts +(buildWrapperContext), and packages/ingestion/src/pipeline/gitignore.ts +(HARDCODED_IGNORES + barrel export). Verify vulture supports --exclude (comma glob). +Confirm scanners package does NOT depend on ingestion (so threading from CLI is the +right seam). Return the diagnosis with exact fixSites.`, + schema: DIAGNOSIS_SCHEMA, + }, + { + id: 'Issue 3 (MCP-only CLI gap)', + label: 'diag:issue3', + prompt: `${GROUNDING}\n\nYOUR TASK: For each MCP-only reader (list_findings, +list_dead_code, license_audit, project_profile, risk_trends, owners, route_map, +api_impact) confirm whether it calls a shared @opencodehub/analysis fn or storage +reader (THIN) vs inlined logic in the MCP handler (EXTRACT). Read packages/mcp/src/ +tools/*.ts for each + packages/cli/src/index.ts registration pattern + a template +command (verdict.ts). Return a diagnosis whose fixSites enumerate, per tool, the new +CLI command file + the lib fn it calls, and flag the 4-5 cheapest thin wins.`, + schema: DIAGNOSIS_SCHEMA, + }, + { + id: 'Issue 4 (sql framing)', + label: 'diag:issue4', + prompt: `${GROUNDING}\n\nYOUR TASK: Confirm what \`codehub sql\` can reach. 
Read the +sql command (packages/cli/src/commands/sql.ts or similar), ITemporalStore vs +IGraphStore (packages/storage/src/interface.ts), and where the "SQL against the graph +store" wording appears (CLAUDE.md, docs/, --help strings, MCP tool descriptions). +Decide: doc-only fix vs adding a read-only nodes/edges view. Return diagnosis + fixSites.`, + schema: DIAGNOSIS_SCHEMA, + }, + { + id: 'Issue 5 (status summaries/vectors)', + label: 'diag:issue5', + prompt: `${GROUNDING}\n\nYOUR TASK: Read the status command (packages/cli/src/ +commands/status.ts) and how query decides bm25 vs hybrid (search package + how it +checks symbol_summaries / embeddings presence). Determine where status should read +summaries count + vector mode and what exact line to print. Return diagnosis + fixSites.`, + schema: DIAGNOSIS_SCHEMA, + }, + { + id: 'Issue 6 (doctor bandit[sarif])', + label: 'diag:issue6', + prompt: `${GROUNDING}\n\nYOUR TASK: Read the bandit doctor check + the bandit +wrapper (packages/cli/src/commands/doctor.ts binaryOnPathCheck for bandit; +packages/scanners/src/wrappers/bandit.ts banditExitAdvisory). Design a probe that +verifies the [sarif] formatter is actually usable (e.g. run \`bandit -f sarif\` on a +tiny temp input and check exit!=2 / no usage banner, or check the +bandit-sarif-formatter entry point). Return diagnosis + fixSites.`, + schema: DIAGNOSIS_SCHEMA, + }, +] + +phase('Diagnose') +const diagnoses = await parallel( + ISSUES.map((iss) => () => + agent(iss.prompt, { label: iss.label, phase: 'Diagnose', schema: DIAGNOSIS_SCHEMA, agentType: 'Explore' }), + ), +) +const confirmedDiagnoses = diagnoses.filter(Boolean) +log(`Diagnosed ${confirmedDiagnoses.length}/${ISSUES.length} issues`) + +// --------------------------------------------------------------------------- +// PHASE 2 — Vision (parallel, runs concurrently with nothing depending on it +// until synthesis): what does WORLD-CLASS code exploration require, beyond the +// 6 reported issues? 
Distinct lenses so they don't converge. +// --------------------------------------------------------------------------- +const LENSES = [ + { + lens: 'Graph correctness & completeness', + angle: `What categories of edges/nodes does OCH likely MISS or mis-bind today +(beyond decorated funcs)? Think: dynamic dispatch, re-exports, decorators-as-wrappers, +class attributes, async/await call chains, test→src coverage edges, monkeypatch, +dependency-injection. What would make the graph trustworthy enough that a user +believes the blast-radius number? Ground in OCH's parse/scip phases.`, + }, + { + lens: 'Retrieval quality (BM25 → hybrid → reranked)', + angle: `The report found query silently runs BM25-only (no summaries/vectors). +What does world-class code retrieval look like — hybrid dense+sparse, symbol +summaries, query understanding, result grouping by process/flow, reranking? What does +OCH have (embedder, search package) vs need? How to make hybrid the default that +"just works" after analyze.`, + }, + { + lens: 'Agent ergonomics & CLI/MCP parity', + angle: `OCH is driven BY an LLM agent. What makes a code-graph tool delightful for +an agent: CLI↔MCP parity, structured + human output, disambiguation that never omits +the real node, --kind/--exclude-docs defaults, next-step hints, staleness signals, +self-describing errors (like AMBIGUOUS_REPO). What's missing for an agent to drive +exploration confidently end-to-end?`, + }, + { + lens: 'Trust, verification & "show your work"', + angle: `For impact/verdict to be trusted: edge provenance (scip vs heuristic +confidence), "why is this in the blast radius" path explanations, coverage of the +graph (what % of calls resolved vs dropped to ), a self-diagnostic that +reports graph health (orphan rate, unresolved-import rate). 
What would let a user +audit the graph's own accuracy?`, + }, +] + +phase('Vision') +const visions = await parallel( + LENSES.map((l) => () => + agent( + `${GROUNDING}\n\nYOU ARE A PRODUCT+ARCHITECTURE STRATEGIST for "world-class code +exploration & understanding". LENS: ${l.lens}.\n${l.angle}\n\nReturn 3-6 concrete +capability gaps. For each: why it matters for an AI agent exploring code, what OCH +foundation it builds on (cite a file/tool/package) or "greenfield", effort (S/M/L/XL), +and leverage. Be specific to THIS codebase — no generic advice. Prefer gaps that the +existing architecture (lbug graph, scip-ingest, embedder, 28 MCP tools, IGraphStore) +makes cheap to reach.`, + { label: `vision:${l.lens.slice(0, 18)}`, phase: 'Vision', schema: VISION_SCHEMA, agentType: 'Explore' }, + ), + ), +) +const confirmedVisions = visions.filter(Boolean) +log(`Collected ${confirmedVisions.length} vision lenses`) + +// --------------------------------------------------------------------------- +// PHASE 3+4 — Design each confirmed diagnosis, then adversarially verify. +// Pipeline: a design verifies as soon as it's produced (no global barrier). +// --------------------------------------------------------------------------- +phase('Design') +const designVerdicts = await pipeline( + confirmedDiagnoses, + (diag) => + agent( + `${GROUNDING}\n\nYOU ARE A STAFF ENGINEER designing the fix for: ${diag.issue}. +Confirmed root cause: ${diag.rootCause} +Evidence: ${(diag.evidence || []).join(' | ')} +Fix sites: ${JSON.stringify(diag.fixSites)} + +Produce an implementation-ready design: exact functions/signatures, a pseudo-diff per +file, the regression test(s) and where they live (match existing test conventions), +risks, and blast radius (which packages rebuild, what else could break). Match the +repo's idioms (DI seams in scanner wrappers, commander registration, structured-output +schemas). 
Do NOT write the code — design it precisely enough that implementation is +mechanical.`, + { label: `design:${diag.issue.slice(0, 22)}`, phase: 'Design', schema: DESIGN_SCHEMA, agentType: 'Explore' }, + ), + async (design, diag) => { + const LENSES_V = ['correctness', 'completeness', 'repro-or-refute'] + const thunks = LENSES_V.map((angleName) => () => + agent( + `${GROUNDING}\n\nYOU ARE A SKEPTIC. Default to holds=false unless the design +is clearly sound. Lens: ${angleName}. +Issue: ${diag.issue} +Root cause claim: ${diag.rootCause} +Design: ${design ? design.approach : '(design failed)'} +Diff sketch: ${design ? design.diffSketch : ''} +Test shape: ${design ? design.testShape : ''} + +Try to REFUTE: is the root cause actually right? Will this fix actually resolve the +reported symptom without breaking the 5 surviving cases / other languages / other +scanners? Is the test real (would it fail before, pass after)? For ${angleName} +specifically, find the hole. Return holds + reason + mustFix corrections.`, + { label: `verify:${diag.issue.slice(0, 14)}:${angleName}`, phase: 'Adversarial verify', schema: VERDICT_SCHEMA, agentType: 'Explore' }, + ), + ) + const votes = (await parallel(thunks)).filter(Boolean) + const holdCount = votes.filter((x) => x.holds).length + return { + issue: diag.issue, + severity: diag.severity, + confidence: diag.confidence, + rootCause: diag.rootCause, + fixSites: diag.fixSites, + design, + survives: holdCount >= 2, + votes, + mustFix: votes.flatMap((x) => x.mustFix || []), + } + }, +) +const designs = designVerdicts.filter(Boolean) + +// --------------------------------------------------------------------------- +// PHASE 5 — Synthesize: one agent merges confirmed designs + vision gaps into +// a single prioritized roadmap. Gets the full structured corpus. +// --------------------------------------------------------------------------- +phase('Synthesize') +const synthesis = await agent( + `${GROUNDING}\n\nYOU ARE THE TECH LEAD. 
Synthesize a single prioritized roadmap to +make OpenCodeHub WORLD-CLASS for exploring and understanding code. + +CONFIRMED FIXES (root cause + design + adversarial verdict): +${JSON.stringify(designs.map((d) => ({ issue: d.issue, severity: d.severity, survives: d.survives, rootCause: d.rootCause, approach: d.design?.approach, mustFix: d.mustFix })), null, 1)} + +VISION GAPS (what world-class requires beyond the reported issues): +${JSON.stringify(confirmedVisions.flatMap((v) => v.gaps.map((g) => ({ lens: v.lens, ...g }))), null, 1)} + +Produce, in Markdown: +1. **Ship now (this PR series)** — the confirmed bug fixes that SURVIVED adversarial + review, in dependency/priority order, each with the one-line fix and any mustFix + corrections folded in. Call out Issue 1 core (decorated-func drop) as the headline + correctness fix and whether it's ready or needs more diagnosis. +2. **Fast follow** — designs that need the mustFix corrections, or thin vision gaps. +3. **World-class roadmap** — the transformational/high-leverage vision gaps grouped by + theme (graph correctness, hybrid retrieval, agent ergonomics, trust/verification), + each with effort + the OCH foundation it builds on. +4. **What I'd cut / defer** and why. +Be decisive and specific to this codebase. 
This is the plan the orchestrator will +implement, so make "Ship now" directly actionable.`, + { label: 'synthesize:roadmap', phase: 'Synthesize' }, +) + +return { + diagnoses: confirmedDiagnoses, + designs: designs.map((d) => ({ issue: d.issue, survives: d.survives, severity: d.severity, mustFix: d.mustFix })), + visionGapCount: confirmedVisions.flatMap((v) => v.gaps).length, + roadmap: synthesis, +} diff --git a/packages/cli/src/commands/scan.ts b/packages/cli/src/commands/scan.ts index 2b836843..401f6cdc 100644 --- a/packages/cli/src/commands/scan.ts +++ b/packages/cli/src/commands/scan.ts @@ -27,6 +27,7 @@ import { readFileSync } from "node:fs"; import { mkdir, readFile, writeFile } from "node:fs/promises"; import { join, resolve } from "node:path"; +import { pipeline } from "@opencodehub/ingestion"; import { applyBaselineState, applySuppressions, @@ -45,10 +46,13 @@ import { P1_SPECS, PIP_AUDIT_SPEC, type ProjectProfileGate, + RADON_SPEC, runScanners, type ScannerSpec, type ScannerStatus, SPECTRAL_SPEC, + TY_SPEC, + VULTURE_SPEC, } from "@opencodehub/scanners"; import { resolveRepoMetaDir } from "@opencodehub/storage"; import { readRegistry } from "../registry.js"; @@ -360,6 +364,20 @@ async function buildWrapperContext( // export lands in the gitignored .codehub/ meta dir. ctx.pipAudit = { exportDir: resolveRepoMetaDir(repoPath) }; } + // Python tree-walking scanners (vulture/radon/ty) descend into `.venv` and + // report library noise unless told to skip the same dirs the indexer + // ignores. Reuse the indexer's single source of truth so the exclude set + // can't drift. Each wrapper anchors / formats these for its own CLI. 
+ const ignoreDirs = pipeline.HARDCODED_IGNORES; + if (ids.has(VULTURE_SPEC.id)) { + ctx.vulture = { excludeGlobs: ignoreDirs }; + } + if (ids.has(RADON_SPEC.id)) { + ctx.radon = { ignoreDirs }; + } + if (ids.has(TY_SPEC.id)) { + ctx.ty = { excludeGlobs: ignoreDirs }; + } return ctx; } diff --git a/packages/scanners/src/index.ts b/packages/scanners/src/index.ts index 1f740cd2..58b9556b 100644 --- a/packages/scanners/src/index.ts +++ b/packages/scanners/src/index.ts @@ -129,15 +129,15 @@ import { createHadolintWrapper, type HadolintWrapperOptions } from "./wrappers/h import { createNpmAuditWrapper } from "./wrappers/npm-audit.js"; import { createOsvScannerWrapper } from "./wrappers/osv-scanner.js"; import { createPipAuditWrapper, type PipAuditWrapperOptions } from "./wrappers/pip-audit.js"; -import { createRadonWrapper } from "./wrappers/radon.js"; +import { createRadonWrapper, type RadonWrapperOptions } from "./wrappers/radon.js"; import { createRuffWrapper } from "./wrappers/ruff.js"; import { createSemgrepWrapper } from "./wrappers/semgrep.js"; import { DEFAULT_DEPS, type WrapperDeps } from "./wrappers/shared.js"; import { createSpectralWrapper, type SpectralWrapperOptions } from "./wrappers/spectral.js"; import { createTflintWrapper } from "./wrappers/tflint.js"; import { createTrivyWrapper } from "./wrappers/trivy.js"; -import { createTyWrapper } from "./wrappers/ty.js"; -import { createVultureWrapper } from "./wrappers/vulture.js"; +import { createTyWrapper, type TyWrapperOptions } from "./wrappers/ty.js"; +import { createVultureWrapper, type VultureWrapperOptions } from "./wrappers/vulture.js"; /** * Per-scanner context passed to `createDefaultWrappers`. 
Some wrappers @@ -157,6 +157,12 @@ export interface DefaultWrapperContext { readonly hadolint?: HadolintWrapperOptions; readonly spectral?: SpectralWrapperOptions; readonly pipAudit?: PipAuditWrapperOptions; + // Python dead-code / complexity / type-check scanners walk the project + // tree directly; without an exclude they descend into `.venv` and report + // library noise. The CLI threads the indexer's ignore dirs in here. + readonly vulture?: VultureWrapperOptions; + readonly radon?: RadonWrapperOptions; + readonly ty?: TyWrapperOptions; } /** @@ -216,11 +222,11 @@ function createWrapperFor( case GRYPE_SPEC.id: return deps ? createGrypeWrapper(deps) : createGrypeWrapper(); case VULTURE_SPEC.id: - return deps ? createVultureWrapper(deps) : createVultureWrapper(); + return createVultureWrapper(deps ?? DEFAULT_DEPS, ctx.vulture ?? {}); case RADON_SPEC.id: - return deps ? createRadonWrapper(deps) : createRadonWrapper(); + return createRadonWrapper(deps ?? DEFAULT_DEPS, ctx.radon ?? {}); case TY_SPEC.id: - return deps ? createTyWrapper(deps) : createTyWrapper(); + return createTyWrapper(deps ?? DEFAULT_DEPS, ctx.ty ?? {}); case CLAMAV_SPEC.id: return deps ? createClamAvWrapper(deps) : createClamAvWrapper(); case CHECKOV_DOCKER_COMPOSE_SPEC.id: diff --git a/packages/scanners/src/wrappers/extended-wrappers.test.ts b/packages/scanners/src/wrappers/extended-wrappers.test.ts index 9386b421..2aa12ce5 100644 --- a/packages/scanners/src/wrappers/extended-wrappers.test.ts +++ b/packages/scanners/src/wrappers/extended-wrappers.test.ts @@ -230,6 +230,26 @@ test("vulture wrapper emits empty SARIF when binary missing", async () => { assert.ok(out.skipped?.includes("not found on PATH")); }); +test("vulture wrapper anchors excludeGlobs to path segments (no .venv noise)", async () => { + const { deps, calls } = makeFakeDeps(() => ({ stdout: "", exitCode: 0 })); + await createVultureWrapper(deps, { excludeGlobs: [".venv", "node_modules"] }).run(ctx); + const args = calls[0]?.args ?? 
[]; + const idx = args.indexOf("--exclude"); + assert.ok(idx >= 0, "must pass --exclude when excludeGlobs is non-empty"); + const value = args[idx + 1] ?? ""; + // Anchored to a full path segment — NOT the bare name, which vulture would + // substring-match and so suppress e.g. src/.venv_helpers.py. + assert.ok(value.includes("*/.venv/*"), `expected anchored .venv glob, got: ${value}`); + assert.ok(value.includes("*/node_modules/*")); + assert.ok(!value.split(",").includes(".venv"), "must not pass the bare name .venv"); +}); + +test("vulture wrapper omits --exclude when no excludeGlobs given", async () => { + const { deps, calls } = makeFakeDeps(() => ({ stdout: "", exitCode: 0 })); + await createVultureWrapper(deps).run(ctx); + assert.ok(!(calls[0]?.args ?? []).includes("--exclude")); +}); + // ---------- radon --------------------------------------------------------- test("radon wrapper parses cc JSON into SARIF results above threshold", async () => { diff --git a/packages/scanners/src/wrappers/radon.ts b/packages/scanners/src/wrappers/radon.ts index 9af4ee1e..ac8147bf 100644 --- a/packages/scanners/src/wrappers/radon.ts +++ b/packages/scanners/src/wrappers/radon.ts @@ -21,7 +21,20 @@ import type { ScannerRunContext, ScannerRunResult, ScannerWrapper } from "../spe import { emptySarifFor } from "../spec.js"; import { DEFAULT_DEPS, type WrapperDeps } from "./shared.js"; -export function createRadonWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrapper { +export interface RadonWrapperOptions { + /** + * Directory names to skip (e.g. `.venv`, `node_modules`). radon's `-i` + * matches directory BASENAMES (not path globs), so the bare ignore names + * are passed through as-is. radon already skips hidden dirs by default, so + * `-i` mainly helps non-hidden entries (`node_modules`, `dist`, `build`). 
+ */ + readonly ignoreDirs?: readonly string[]; +} + +export function createRadonWrapper( + deps: WrapperDeps = DEFAULT_DEPS, + opts: RadonWrapperOptions = {}, +): ScannerWrapper { return { spec: RADON_SPEC, run: async (ctx: ScannerRunContext): Promise => { @@ -37,7 +50,11 @@ export function createRadonWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWra durationMs: performance.now() - started, }; } - const args: readonly string[] = ["cc", "-s", "-j", ctx.projectPath]; + const ignoreArgs = + opts.ignoreDirs !== undefined && opts.ignoreDirs.length > 0 + ? ["-i", opts.ignoreDirs.join(",")] + : []; + const args: readonly string[] = ["cc", "-s", "-j", ...ignoreArgs, ctx.projectPath]; const result = await deps.runBinary("radon", args, { timeoutMs: ctx.timeoutMs, cwd: ctx.projectPath, diff --git a/packages/scanners/src/wrappers/ty.ts b/packages/scanners/src/wrappers/ty.ts index 491423bf..88146a49 100644 --- a/packages/scanners/src/wrappers/ty.ts +++ b/packages/scanners/src/wrappers/ty.ts @@ -21,7 +21,20 @@ import type { ScannerRunContext, ScannerRunResult, ScannerWrapper } from "../spe import { emptySarifFor } from "../spec.js"; import { DEFAULT_DEPS, type WrapperDeps } from "./shared.js"; -export function createTyWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrapper { +export interface TyWrapperOptions { + /** + * Directory names to exclude (e.g. `.venv`, `node_modules`). ty uses + * gitignore-style excludes; a trailing `/` anchors to a directory. We also + * pass `--force-exclude` so the excludes apply even though the project path + * is given explicitly on the CLI (CLI-named paths bypass excludes otherwise). 
+ */ + readonly excludeGlobs?: readonly string[]; +} + +export function createTyWrapper( + deps: WrapperDeps = DEFAULT_DEPS, + opts: TyWrapperOptions = {}, +): ScannerWrapper { return { spec: TY_SPEC, run: async (ctx: ScannerRunContext): Promise => { @@ -37,7 +50,14 @@ export function createTyWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrappe durationMs: performance.now() - started, }; } - const args: readonly string[] = ["check", ctx.projectPath]; + const excludeArgs = + opts.excludeGlobs !== undefined && opts.excludeGlobs.length > 0 + ? [ + ...opts.excludeGlobs.flatMap((g) => ["--exclude", g.endsWith("/") ? g : `${g}/`]), + "--force-exclude", + ] + : []; + const args: readonly string[] = ["check", ...excludeArgs, ctx.projectPath]; const result = await deps.runBinary("ty", args, { timeoutMs: ctx.timeoutMs, cwd: ctx.projectPath, diff --git a/packages/scanners/src/wrappers/vulture.ts b/packages/scanners/src/wrappers/vulture.ts index 6e0c0e3d..9ad4a930 100644 --- a/packages/scanners/src/wrappers/vulture.ts +++ b/packages/scanners/src/wrappers/vulture.ts @@ -22,7 +22,33 @@ import { DEFAULT_DEPS, type WrapperDeps } from "./shared.js"; /** Minimum confidence percentage vulture emits findings at. */ const DEFAULT_MIN_CONFIDENCE = "80"; -export function createVultureWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrapper { +export interface VultureWrapperOptions { + /** + * Directory names the indexer ignores (e.g. `.venv`, `node_modules`). + * Threaded from the CLI so vulture doesn't walk the virtualenv and drown + * real findings in library dead-code. Anchored to path-segment globs + * inside the wrapper so a bare `.venv` can't substring-match `src/.venv_helpers.py`. + */ + readonly excludeGlobs?: readonly string[]; +} + +/** + * Turn an ignore directory name into a vulture `--exclude` glob anchored to a + * path segment. 
vulture matches `--exclude` patterns against ABSOLUTE paths + * and treats a wildcard-free pattern as a substring match, so the bare name + * `.venv` would also suppress `src/.venv_helpers.py`. Wrapping it as a + * slash-delimited glob segment matches only when the name is a full directory + * segment. Patterns already containing a glob pass through untouched. + */ +function toVultureExcludeGlob(name: string): string { + if (/[*?[\]]/.test(name)) return name; + return `*/${name}/*`; +} + +export function createVultureWrapper( + deps: WrapperDeps = DEFAULT_DEPS, + opts: VultureWrapperOptions = {}, +): ScannerWrapper { return { spec: VULTURE_SPEC, run: async (ctx: ScannerRunContext): Promise => { @@ -38,7 +64,16 @@ export function createVultureWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerW durationMs: performance.now() - started, }; } - const args: readonly string[] = [ctx.projectPath, "--min-confidence", DEFAULT_MIN_CONFIDENCE]; + const excludeArgs = + opts.excludeGlobs !== undefined && opts.excludeGlobs.length > 0 + ? ["--exclude", opts.excludeGlobs.map(toVultureExcludeGlob).join(",")] + : []; + const args: readonly string[] = [ + ctx.projectPath, + "--min-confidence", + DEFAULT_MIN_CONFIDENCE, + ...excludeArgs, + ]; const result = await deps.runBinary("vulture", args, { timeoutMs: ctx.timeoutMs, cwd: ctx.projectPath,