terraphim · AlexMikhalev · May 10, 2026 · May 10, 2026
diff --git a/automation/judge/kg/judge-verdicts.md b/automation/judge/kg/judge-verdicts.md
@@ -1,5 +1,5 @@
 # verdict
 
-Judge verdict classification for task output quality.
+Judge verdict classification for task output quality (v2 5-level ladder).
 
-synonyms:: accept, improve, reject, escalate, accepted, rejected, pass, fail, needs improvement, quality gate, quality check
+synonyms:: BLOCK, FIX_FIRST, SAFE_TO_COMMIT, SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK, INSUFFICIENT_EVIDENCE, accept, improve, reject, escalate, accepted, rejected, pass, fail, needs improvement, quality gate, quality check, blocked, fix first, safe to commit, safe to deploy, insufficient evidence
diff --git a/automation/judge/pre-push-judge.sh b/automation/judge/pre-push-judge.sh
@@ -66,7 +66,7 @@ case $EXIT_CODE in
         echo "[pre-push-judge] PASSED -- push allowed"
         ;;
     1)
-        echo "[pre-push-judge] FAILED -- push blocked"
+        echo "[pre-push-judge] BLOCKED or FIX FIRST -- push blocked"
         echo "Fix the issues and try again, or override with: git push --no-verify"
         ;;
     2)

diff --git a/automation/judge/run-judge.sh b/automation/judge/run-judge.sh
@@ -133,17 +133,19 @@ Score on three dimensions (1-5 each):
 - pragmatic: Does it enable intended actions? Actionable, useful, addresses the task goal.
 - syntactic: Is it internally consistent and well-structured? Format compliance, completeness, no broken references.
 
-Verdict rules:
-- "accept" if all scores >= 3 AND average >= 3.5
-- "improve" if any score < 3 but all >= 2
-- "reject" if any score < 2
+Verdict rules (5-level ladder):
+- "SAFE_TO_COMMIT" if all scores >= 3 AND average >= 3.5 AND no runtime verification needed
+- "SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK" if all scores >= 3 but deployment needs restart, smoke test, or migration check
+- "FIX_FIRST" if any score < 3 but all >= 2
+- "BLOCK" if any score < 2 or serious correctness/data/security/deploy risk
+- "INSUFFICIENT_EVIDENCE" if you lack task context, diff, tests, or runtime evidence to make a reliable call
 
 Respond with ONLY this JSON (no other text):
 {
   "task_id": "${TASK_ID}",
   "model": "${model}",
   "mode": "quick",
-  "verdict": "<accept|improve|reject>",
+  "verdict": "<BLOCK|FIX_FIRST|SAFE_TO_COMMIT|SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK|INSUFFICIENT_EVIDENCE>",
   "scores": {
     "semantic": "<1-5>",
     "pragmatic": "<1-5>",
@@ -177,10 +179,12 @@ Score each dimension 1-5:
 2. pragmatic: Does it enable intended actions?
 3. syntactic: Is it internally consistent and well-structured?
 
-Verdict rules:
-- "accept" if all scores >= 3 AND average >= 3.5
-- "improve" if any score < 3 but all >= 2
-- "reject" if any score < 2
+Verdict rules (5-level ladder):
+- "SAFE_TO_COMMIT" if all scores >= 3 AND average >= 3.5 AND no runtime verification needed
+- "SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK" if all scores >= 3 but deployment needs restart, smoke test, or migration check
+- "FIX_FIRST" if any score < 3 but all >= 2
+- "BLOCK" if any score < 2 or serious correctness/data/security/deploy risk
+- "INSUFFICIENT_EVIDENCE" if you lack task context, diff, tests, or runtime evidence to make a reliable call
 
 For each improvement, specify: what to fix, where it is, and why it matters.
 
@@ -189,7 +193,7 @@ Respond with ONLY this JSON (no other text):
   "task_id": "${TASK_ID}",
   "model": "${model}",
   "mode": "${mode}",
-  "verdict": "<accept|improve|reject>",
+  "verdict": "<BLOCK|FIX_FIRST|SAFE_TO_COMMIT|SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK|INSUFFICIENT_EVIDENCE>",
   "scores": {
     "semantic": "<1-5>",
     "pragmatic": "<1-5>",
@@ -320,7 +324,7 @@ try:
     if 'verdict' not in v:
         print('Missing: verdict', file=sys.stderr)
         sys.exit(1)
-    if v['verdict'] not in ('accept', 'improve', 'reject', 'escalate'):
+    if v['verdict'] not in ('BLOCK', 'FIX_FIRST', 'SAFE_TO_COMMIT', 'SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK', 'INSUFFICIENT_EVIDENCE'):
         print(f'Invalid verdict: {v[\"verdict\"]}', file=sys.stderr)
         sys.exit(1)
     # Normalize scores: handle flat format
@@ -490,19 +494,28 @@ QUICK_AVG="$ROUND_RESULT_AVG"
 
 log_verdict "$QUICK_VERDICT_JSON" "$ROUND" "quick" "$PREVIOUS_ROUNDS" "null"
 
-if [[ "$QUICK_VERDICT" == "accept" ]]; then
-    echo "RESULT: ACCEPTED (quick judge, round ${ROUND})"
+if [[ "$QUICK_VERDICT" == "SAFE_TO_COMMIT" ]]; then
+    echo "RESULT: SAFE TO COMMIT (quick judge, round ${ROUND})"
     exit 0
 fi
 
-if [[ "$QUICK_VERDICT" == "reject" ]]; then
-    echo "RESULT: REJECTED (quick judge, round ${ROUND})"
+if [[ "$QUICK_VERDICT" == "BLOCK" ]]; then
+    echo "RESULT: BLOCKED (quick judge, round ${ROUND})"
     "${SCRIPT_DIR}/handle-disagreement.sh" -t "$TASK_ID" -T "$TASK_DESCRIPTION" -f "${FILES[*]}" -r "persistent-reject" || true
     exit 1
 fi
 
+if [[ "$QUICK_VERDICT" == "INSUFFICIENT_EVIDENCE" ]]; then
+    echo "RESULT: INSUFFICIENT EVIDENCE (quick judge, round ${ROUND}) -- proceeding to deep judge"
+fi
+
+if [[ "$QUICK_VERDICT" == "SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK" ]]; then
+    echo "RESULT: SAFE TO COMMIT but deploy needs runtime check (quick judge, round ${ROUND})"
+    exit 0
+fi
+
 # ============================
-# Round 2: Deep judge (quick returned "improve")
+# Round 2: Deep judge (quick returned FIX_FIRST or INSUFFICIENT_EVIDENCE)
 # ============================
 PREVIOUS_ROUNDS=$(python3 -c "
 import json
@@ -526,22 +539,29 @@ log_verdict "$DEEP_VERDICT_JSON" "$ROUND" "deep" "$PREVIOUS_ROUNDS" "null"
 
 # Check for disagreement requiring tiebreaker
 NEEDS_TIEBREAKER=false
-if [[ "$QUICK_VERDICT" == "accept" && "$DEEP_VERDICT" == "reject" ]] || \
-   [[ "$QUICK_VERDICT" == "reject" && "$DEEP_VERDICT" == "accept" ]]; then
+if [[ "$QUICK_VERDICT" == "SAFE_TO_COMMIT" && "$DEEP_VERDICT" == "BLOCK" ]] || \
+   [[ "$QUICK_VERDICT" == "BLOCK" && "$DEEP_VERDICT" == "SAFE_TO_COMMIT" ]]; then
     NEEDS_TIEBREAKER=true
 fi
 
 if [[ "$NEEDS_TIEBREAKER" == "false" ]]; then
-    if [[ "$DEEP_VERDICT" == "accept" ]]; then
-        echo "RESULT: ACCEPTED (deep judge, round ${ROUND})"
+    if [[ "$DEEP_VERDICT" == "SAFE_TO_COMMIT" ]]; then
+        echo "RESULT: SAFE TO COMMIT (deep judge, round ${ROUND})"
         exit 0
-    elif [[ "$DEEP_VERDICT" == "reject" ]]; then
-        echo "RESULT: REJECTED (deep judge, round ${ROUND})"
+    elif [[ "$DEEP_VERDICT" == "SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK" ]]; then
+        echo "RESULT: SAFE TO COMMIT but deploy needs runtime check (deep judge, round ${ROUND})"
+        exit 0
+    elif [[ "$DEEP_VERDICT" == "BLOCK" ]]; then
+        echo "RESULT: BLOCKED (deep judge, round ${ROUND})"
         "${SCRIPT_DIR}/handle-disagreement.sh" -t "$TASK_ID" -T "$TASK_DESCRIPTION" -f "${FILES[*]}" -r "persistent-reject" || true
         exit 1
+    elif [[ "$DEEP_VERDICT" == "INSUFFICIENT_EVIDENCE" ]]; then
+        echo "RESULT: INSUFFICIENT EVIDENCE -- human fallback needed"
+        "${SCRIPT_DIR}/handle-disagreement.sh" -t "$TASK_ID" -T "$TASK_DESCRIPTION" -f "${FILES[*]}" -r "disagreement" || true
+        exit 2
     else
-        # Both returned "improve" -- human fallback
-        echo "RESULT: Human fallback needed (both judges returned 'improve')"
+        # FIX_FIRST -- human fallback
+        echo "RESULT: FIX FIRST -- human fallback needed (both judges returned improvement needed)"
         "${SCRIPT_DIR}/handle-disagreement.sh" -t "$TASK_ID" -T "$TASK_DESCRIPTION" -f "${FILES[*]}" -r "disagreement" || true
         exit 2
     fi
@@ -582,15 +602,21 @@ fi
 
 log_verdict "$ROUND_RESULT_JSON" "$ROUND" "tiebreaker" "$PREVIOUS_ROUNDS" "$CONSENSUS"
 
-if [[ "$TIEBREAKER_VERDICT" == "accept" ]]; then
-    echo "RESULT: ACCEPTED (tiebreaker, round ${ROUND}, consensus: ${CONSENSUS})"
+if [[ "$TIEBREAKER_VERDICT" == "SAFE_TO_COMMIT" ]]; then
+    echo "RESULT: SAFE TO COMMIT (tiebreaker, round ${ROUND}, consensus: ${CONSENSUS})"
     exit 0
-elif [[ "$TIEBREAKER_VERDICT" == "reject" ]]; then
-    echo "RESULT: REJECTED (tiebreaker, round ${ROUND}, consensus: ${CONSENSUS})"
+elif [[ "$TIEBREAKER_VERDICT" == "SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK" ]]; then
+    echo "RESULT: SAFE TO COMMIT but deploy needs runtime check (tiebreaker, round ${ROUND}, consensus: ${CONSENSUS})"
+    exit 0
+elif [[ "$TIEBREAKER_VERDICT" == "BLOCK" ]]; then
+    echo "RESULT: BLOCKED (tiebreaker, round ${ROUND}, consensus: ${CONSENSUS})"
     "${SCRIPT_DIR}/handle-disagreement.sh" -t "$TASK_ID" -T "$TASK_DESCRIPTION" -f "${FILES[*]}" -r "persistent-reject" || true
     exit 1
+elif [[ "$TIEBREAKER_VERDICT" == "INSUFFICIENT_EVIDENCE" ]]; then
+    echo "RESULT: INSUFFICIENT EVIDENCE (tiebreaker, round ${ROUND}, consensus: ${CONSENSUS})"
+    exit 0
 else
-    echo "RESULT: Human fallback needed (tiebreaker returned '${TIEBREAKER_VERDICT}')"
+    echo "RESULT: FIX FIRST -- human fallback needed (tiebreaker returned '${TIEBREAKER_VERDICT}')"
     "${SCRIPT_DIR}/handle-disagreement.sh" -t "$TASK_ID" -T "$TASK_DESCRIPTION" -f "${FILES[*]}" -r "disagreement" || true
     exit 2
 fi
diff --git a/automation/judge/verdict-schema.json b/automation/judge/verdict-schema.json
@@ -1,8 +1,8 @@
 {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
-  "$id": "https://terraphim.ai/schemas/verdict-v1.json",
-  "title": "Judge Verdict",
-  "description": "Schema for judge verdict records produced by the multi-model judge skill",
+  "$id": "https://terraphim.ai/schemas/verdict-v2.json",
+  "title": "Judge Verdict (v2)",
+  "description": "Schema for judge verdict records with 5-level verdict ladder (BLOCK, FIX_FIRST, SAFE_TO_COMMIT, SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK, INSUFFICIENT_EVIDENCE)",
   "type": "object",
   "required": [
     "task_id",
@@ -36,8 +36,8 @@
     },
     "verdict": {
       "type": "string",
-      "description": "Final verdict for the task output",
-      "enum": ["accept", "improve", "reject", "escalate"]
+      "description": "Final verdict for the task output (5-level ladder)",
+      "enum": ["BLOCK", "FIX_FIRST", "SAFE_TO_COMMIT", "SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK", "INSUFFICIENT_EVIDENCE"]
     },
     "scores": {
       "type": "object",

diff --git a/skills/disciplined-design/SKILL.md b/skills/disciplined-design/SKILL.md
@@ -17,6 +17,17 @@ You are a design specialist executing Phase 2 of disciplined development. Your r
 4. **Human Approval**: No implementation without sign-off
 5. **Eliminate Before Adding**: Design is primarily about removal
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the change is trivial or well-understood:
+- Fewer than 10 lines changed, with no architectural impact
+- Adding a single field to a struct or API response
+- Replicating an existing pattern (same structure, different data)
+- Config flag toggle with an obvious test
+- A bug fix where the design is already clear from the research phase
+
+In these cases, go straight to implementation with a minimal plan. Full design phase for trivial changes produces 3,000+ lines of artefacts with no quality gain (see Eberhardt/Scott Logic experiment, Nov 2025).
+
 ## Essentialism: ELIMINATE Phase
 
 This phase embodies McKeown's ELIMINATE principle. Design is about choosing what NOT to do.

diff --git a/skills/disciplined-implementation/SKILL.md b/skills/disciplined-implementation/SKILL.md
@@ -9,6 +9,17 @@ license: Apache-2.0
 
 You are an implementation specialist executing Phase 3 of disciplined development. Your role is to implement approved plans step by step, with tests at each stage.
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the change is trivial or well-understood:
+- Fewer than 10 lines changed, following an existing pattern exactly
+- Typo, rename, import-fix, or formatting-only change
+- Dependency version bump with no API changes
+- Config flag toggle with an obvious test
+- A one-line fix where the test is equally obvious
+
+In these cases, implement directly with a single commit. The step-by-step plan, per-step tests, and review protocol for trivial changes add overhead with no quality gain.
+
 ## Core Principles
 
 1. **Follow the Plan**: Execute the approved design exactly

diff --git a/skills/disciplined-quality-evaluation/SKILL.md b/skills/disciplined-quality-evaluation/SKILL.md
@@ -19,6 +19,16 @@ You evaluate Research Documents (Phase 1) and Implementation Plans (Phase 2) usi
 3. **Actionable feedback**: Every low score includes specific fix
 4. **Essentialism check**: Vital few focus enforced
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the document under review is trivial:
+- Research doc for a change under 10 lines with no architectural impact
+- Design doc that replicates an existing pattern
+- Spec doc with no ambiguous requirements or edge cases
+- Implementation plan with a single obvious step
+
+In these cases, a quick read-through is sufficient. Full KLS scoring for trivial documents adds overhead without improving outcomes.
+
 ## When to Use This Skill
 
 - After Phase 1 (Research) before Phase 2 (Design)

diff --git a/skills/disciplined-research/SKILL.md b/skills/disciplined-research/SKILL.md
@@ -17,6 +17,17 @@ You are a research specialist executing Phase 1 of disciplined development. Your
 4. **Document Everything**: Enable informed decisions
 5. **Focus on Vital Few**: Identify what's essential, eliminate the rest
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the change is trivial or well-understood:
+- Fewer than 10 lines changed, with no behavioural impact
+- Typo, rename, import-fix, or formatting-only change
+- Dependency version bump with no API changes
+- Config flag toggle with an obvious test
+- A bug fix where the root cause is already identified and the fix is obvious
+
+In these cases, go straight to implementation. Research phase bloat for trivial changes wastes time with no quality gain (see Eberhardt/Scott Logic experiment, Nov 2025).
+
 ## Essentialism: EXPLORE Phase
 
 This phase embodies McKeown's EXPLORE principle. Before diving into research, validate that this work is essential.

diff --git a/skills/disciplined-specification/SKILL.md b/skills/disciplined-specification/SKILL.md
@@ -17,6 +17,17 @@ You are a specification interviewer executing Phase 2.5 of disciplined developme
 3. **Think Adversarially**: What could go wrong? What could be exploited?
 4. **Consider Evolution**: How will this need to change?
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the change is trivial or well-understood:
+- Fewer than 10 lines changed, with no edge case implications
+- Typo, rename, import-fix, or formatting-only change
+- Adding a field that mirrors an existing pattern
+- Config flag toggle with no security or migration concerns
+- A bug fix with no ambiguous requirements
+
+In these cases, proceed directly to implementation. Specification interviews for trivial changes surface no new information and add cycle time with no quality gain.
+
 ## Prerequisites
 
 Phase 2.5 requires:

diff --git a/skills/disciplined-validation/SKILL.md b/skills/disciplined-validation/SKILL.md
@@ -18,6 +18,17 @@ You are a validation specialist executing Phase 5 of disciplined development. Yo
 4. **Defects Loop Back**: Failures return to research or design phase
 5. **Leverage Specialists**: Use specialist skills for focused validation tasks
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the change is trivial or well-understood:
+- Fewer than 10 lines changed, with no user-visible impact
+- Typo, rename, import-fix, or formatting-only change
+- Internal refactor with no API or behaviour change
+- Config flag toggle with no security or migration concerns
+- A bug fix already validated by the verification phase tests
+
+In these cases, consider the verification phase tests sufficient. Full UAT, stakeholder interviews, and system testing for trivial changes add cycle time with no quality gain.
+
 ## Integration with Specialist Skills
 
 This skill orchestrates validation by leveraging specialist skills:

diff --git a/skills/disciplined-verification/SKILL.md b/skills/disciplined-verification/SKILL.md
@@ -9,6 +9,17 @@ license: Apache-2.0
 
 You are a verification specialist executing Phase 4 of disciplined development. Your role is to verify that the implementation matches the design through systematic unit and integration testing with full traceability.
 
+## Skip This Skill When...
+
+Do NOT invoke this skill when the change is trivial or well-understood:
+- Fewer than 10 lines changed, with no behavioural impact
+- Typo, rename, import-fix, or formatting-only change
+- Adding a test that mirrors an existing test pattern
+- Config flag toggle with an obvious smoke test
+- A bug fix where the test is a straightforward regression check
+
+In these cases, write the regression test directly. Full traceability matrix, coverage analysis, and UBS scan for trivial changes add cycle time with no quality gain.
+
 ## Core Principles
 
 1. **Trace to Design**: Every test maps to a design element or spec finding

diff --git a/skills/judge/SKILL.md b/skills/judge/SKILL.md
@@ -41,13 +41,26 @@ framework defined in `disciplined-quality-evaluation`:
 | 4 | Good -- clear, useful, few issues |
 | 5 | Excellent -- exemplary, no issues |
 
-### Verdict Thresholds
-
-| Condition | Verdict |
-|-----------|---------|
-| All dimensions >= 3 AND average >= 3.5 | **accept** |
-| Any dimension < 3 OR average < 3.5, but all >= 2 | **improve** |
-| Any dimension < 2 | **reject** |
+### Verdict Thresholds (v2 -- 5-Level Ladder)
+
+| Condition | Verdict | Pre-push exit code |
+|-----------|---------|--------------------|
+| Any dimension < 2, OR security/data/deploy risk | **BLOCK** | 1 (push blocked) |
+| Any dimension < 3 OR average < 3.5, but all >= 2 | **FIX_FIRST** | 1 (push blocked) |
+| All dimensions >= 3 AND average >= 3.5 | **SAFE_TO_COMMIT** | 0 (push allowed) |
+| All dimensions >= 3 but deploy needs runtime verification | **SAFE_TO_DEPLOY_AFTER_RUNTIME_CHECK** | 0 (push allowed, log advisory) |
+| Reviewer lacked evidence to make reliable call | **INSUFFICIENT_EVIDENCE** | 0 (push allowed, log loudly) |
+
+#### Migration from v1 (binary GO/NO-GO)
+
+| v1 Verdict | v2 Mapping |
+|------------|------------|
+| `reject` | `BLOCK` |
+| `improve` | `FIX_FIRST` |
+| `accept` | `SAFE_TO_COMMIT` |
+| `escalate` | `INSUFFICIENT_EVIDENCE` |
+
+Historical calibration data tagged `binary-verdict-v1` remains valid for longitudinal analysis.
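-| Models disagree on accept/reject | **escalate** |
+The v1 row "Models disagree on accept/reject -> **escalate**" is retired: `escalate` is no longer a valid verdict and maps to `INSUFFICIENT_EVIDENCE` (see the migration table above).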
 
 ## Verdict Format