From c8fb16870ad10868c841d2912ea77741adfb1399 Mon Sep 17 00:00:00 2001
From: Fernando Lazzarin
Date: Sat, 16 May 2026 17:45:35 -0300
Subject: [PATCH] hooks: mirror Slice 2 dual-mode rewires (5 hooks)

Copy of the corresponding llm-dark-patterns hooks; byte-equal except for
the `|| true` guards on the verdict-parsing jq calls (see the notes below
the `---` line). Mirrors the Slice-2 fact-fabrication family upgrade so
the minmaxing harness runs the same dual-mode path: prefer
agentcloseout-physics on PATH, fall back to the original bash regex
otherwise. Both paths exit 2 on block.

Affected hooks:
- no-fake-recall.sh
- no-fake-stats.sh
- no-phantom-tool-call.sh
- no-rollback-claim-without-evidence.sh
- no-sandbagging-disguise.sh

Companion PRs:
- waitdeadai/agent-closeout-bench physics-engines/slice-2-fact-fabrication
  (YAML rule packs + Rust unit tests + ENGINE.md + fixtures)
- waitdeadai/llm-dark-patterns physics-engines/slice-2-fact-fabrication-dual-mode
  (canonical dual-mode bash hooks)

Co-Authored-By: Claude Opus 4.7
---
Review notes (commentary only; not part of the commit message):
- The jq calls that parse the engine verdict (DECISION / RULE / EVIDENCE)
  are now guarded: DECISION and EVIDENCE end in `|| true`, RULE falls back
  to the category name. This matches the json_get helper already present
  in these hooks. no-fake-recall.sh runs under `set -euo pipefail`, so an
  engine reply that jq could not parse or index used to abort the hook with
  jq's exit status before the bash regex fallback could run. The other four
  hooks get the same guard for consistency.
- The same guards have to land in llm-dark-patterns for the mirror to be
  byte-equal again. The `index` post-image hashes below predate this change.
- NOTE(review): the rules-dir candidate list still contains a
  developer-specific absolute path (/home/fer/Documents/...). It is skipped
  when the directory is absent; consider dropping it upstream.
- Line breaks and indentation in this file were reconstructed from a
  whitespace-collapsed copy, so context lines may need
  whitespace-insensitive matching when the patch is applied.

 .claude/hooks/no-fake-recall.sh               | 74 ++++++++++++++++++-
 .claude/hooks/no-fake-stats.sh                | 37 ++++++++++
 .claude/hooks/no-phantom-tool-call.sh         | 37 ++++++++++
 .../no-rollback-claim-without-evidence.sh     | 37 ++++++++++
 .claude/hooks/no-sandbagging-disguise.sh      | 37 ++++++++++
 5 files changed, 218 insertions(+), 4 deletions(-)

diff --git a/.claude/hooks/no-fake-recall.sh b/.claude/hooks/no-fake-recall.sh
index dd482a5..22e947c 100755
--- a/.claude/hooks/no-fake-recall.sh
+++ b/.claude/hooks/no-fake-recall.sh
@@ -3,9 +3,32 @@
 # earlier" / "as I mentioned before" / "from my previous response".
 # LLMs frequently hallucinate prior conversation content. The fix is for the
 # model to either quote the verbatim prior content or use neutral phrasing.
+#
+# Vocabulary loaded from packs/locale/.txt section [recall_phrase].
+# Inline English fallback preserves pre-pack behavior.
 
 set -euo pipefail
 
+_HOOK_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+if [ -f "$_HOOK_DIR/../lib/packs.sh" ]; then
+  # shellcheck source=../lib/packs.sh
+  source "$_HOOK_DIR/../lib/packs.sh"
+fi
+
+_load_or_fallback() {
+  local section="$1" fallback="$2" loaded=""
+  if declare -F load_locale_section >/dev/null 2>&1; then
+    loaded="$(load_locale_section "$section" 2>/dev/null)"
+  fi
+  if [ -z "$loaded" ]; then
+    printf '%s' "$fallback"
+  else
+    printf '%s' "$loaded"
+  fi
+}
+
+RECALL_RE="$(_load_or_fallback recall_phrase '\b(as|like)[[:space:]]+(we|i|you[[:space:]]+and[[:space:]]+i)[[:space:]]+(discussed|mentioned|talked[[:space:]]+about|covered|noted|established|agreed)\b|\bas[[:space:]]+(i|we)[[:space:]]+(mentioned|said|noted|stated|explained|told[[:space:]]+you|wrote)[[:space:]]+(earlier|before|previously|above|in[[:space:]]+(my|the)[[:space:]]+(last|previous|prior))\b|\bfrom[[:space:]]+(my|our|the)[[:space:]]+(previous|earlier|prior|last)[[:space:]]+(response|message|turn|reply|conversation|exchange)\b|\b(you|i)[[:space:]]+(mentioned|said|told[[:space:]]+(me|you))[[:space:]]+(earlier|before|previously)\b|\bremember[[:space:]]+(when|how|that)[[:space:]]+(we|i|you)[[:space:]]+(discussed|talked|covered)\b|\bbuilding[[:space:]]+on[[:space:]]+what[[:space:]]+(we|i|you)[[:space:]]+(said|discussed|covered|established)\b|\brecap[[:space:]]+(of)?[[:space:]]?(our|my|the)[[:space:]]+(earlier|previous|prior)[[:space:]]+(conversation|discussion|exchange)\b|\bas[[:space:]]+(i|we)[[:space:]]+(established|covered|outlined)[[:space:]]+(earlier|previously|above)\b')"
+
 INPUT="$(cat)"
 
 if ! command -v jq >/dev/null 2>&1; then
@@ -17,6 +40,43 @@ if ! printf '%s' "$INPUT" | jq -e . >/dev/null 2>&1; then
   exit 0
 fi
 
+# Rust path: prefer agentcloseout-physics when available.
+if command -v agentcloseout-physics >/dev/null 2>&1; then
+  RULES_DIR="${LLM_DARK_PATTERNS_RULES_DIR:-}"
+  if [ -z "$RULES_DIR" ]; then
+    for candidate in \
+      "$(dirname "$0")/../../agent-closeout-bench/rules/closeout" \
+      "/home/fer/Documents/agent-closeout-bench/rules/closeout" \
+      "${XDG_CONFIG_HOME:-$HOME/.config}/agentcloseout-physics/rules/closeout"; do
+      if [ -d "$candidate" ]; then RULES_DIR="$candidate"; break; fi
+    done
+  fi
+  if [ -n "$RULES_DIR" ] && [ -d "$RULES_DIR" ] && [ -f "$RULES_DIR/fake_recall.yaml" ]; then
+    TMP_INPUT="$(mktemp)"; printf '%s' "$INPUT" > "$TMP_INPUT"
+    VERDICT_JSON="$(agentcloseout-physics scan --category fake_recall --rules "$RULES_DIR" --input "$TMP_INPUT" 2>/dev/null || true)"
+    rm -f "$TMP_INPUT"
+    if [ -n "$VERDICT_JSON" ]; then
+      DECISION="$(printf '%s' "$VERDICT_JSON" | jq -r '.decision // empty' 2>/dev/null || true)"
+      if [ "$DECISION" = "block" ]; then
+        RULE="$(printf '%s' "$VERDICT_JSON" | jq -r '.matched_rules[0].rule_id // "fake_recall"' 2>/dev/null || printf '%s' 'fake_recall')"
+        EVIDENCE="$(printf '%s' "$VERDICT_JSON" | jq -r '.redacted_evidence[0] // ""' 2>/dev/null || true)"
+        echo "BLOCKED: false-memory recall claim without quoted prior content." >&2
+        echo "Matched rule: $RULE" >&2
+        [ -n "$EVIDENCE" ] && echo "Evidence: $EVIDENCE" >&2
+        echo "" >&2
+        echo "Repair guidance:" >&2
+        echo "- Quote the prior content verbatim via a markdown blockquote (>) or a 30+ char inline quote." >&2
+        echo "- Or use neutral phrasing instead of claiming recall." >&2
+        echo "- Reference: arXiv:2408.04681 (Pataranutaporn et al. 2024)." >&2
+        exit 2
+      fi
+      if [ "$DECISION" = "pass" ]; then
+        exit 0
+      fi
+    fi
+  fi
+fi
+
 json_get() {
   local filter="$1"
   printf '%s' "$INPUT" | jq -r "$filter // empty" 2>/dev/null || true
@@ -49,8 +109,9 @@ if [ -z "$message" ]; then
   exit 0
 fi
 
-# Trigger: false-memory recall vocabulary
-RECALL='(\b(as|like)[[:space:]]+(we|i|you[[:space:]]+and[[:space:]]+i)[[:space:]]+(discussed|mentioned|talked[[:space:]]+about|covered|noted|established|agreed)\b|\bas[[:space:]]+(i|we)[[:space:]]+(mentioned|said|noted|stated|explained|told[[:space:]]+you|wrote)[[:space:]]+(earlier|before|previously|above|in[[:space:]]+(my|the)[[:space:]]+(last|previous|prior))\b|\bfrom[[:space:]]+(my|our|the)[[:space:]]+(previous|earlier|prior|last)[[:space:]]+(response|message|turn|reply|conversation|exchange)\b|\b(you|i)[[:space:]]+(mentioned|said|told[[:space:]]+(me|you))[[:space:]]+(earlier|before|previously)\b|\bremember[[:space:]]+(when|how|that)[[:space:]]+(we|i|you)[[:space:]]+(discussed|talked|covered)\b|\bbuilding[[:space:]]+on[[:space:]]+what[[:space:]]+(we|i|you)[[:space:]]+(said|discussed|covered|established)\b|\brecap[[:space:]]+(of)?[[:space:]]?(our|my|the)[[:space:]]+(earlier|previous|prior)[[:space:]]+(conversation|discussion|exchange)\b|\bas[[:space:]]+(i|we)[[:space:]]+(established|covered|outlined)[[:space:]]+(earlier|previously|above)\b)'
+# Trigger: false-memory recall vocabulary loaded from packs/locale/.txt
+# section [recall_phrase].
+RECALL="(${RECALL_RE})"
 
 if printf '%s\n' "$message" | grep -Eiq "$RECALL"; then
   # Allow-clause: the model is quoting verbatim prior content (proves it actually saw it).
@@ -70,8 +131,13 @@ if printf '%s\n' "$message" | grep -Eiq "$RECALL"; then
       verify the recall is real, OR
   (b) Use neutral phrasing that doesn't claim recall — 'one approach is X',
       'a common pattern is Y' — instead of 'as we discussed, X'.
-- Citation: ACM IUI 2025 — generative chatbots induce 3x more false memories
-  than the control. The fix is verifiable recall, not assumed recall."
+- Citations:
+    Pataranutaporn et al. 2024 (arXiv:2408.04681) — generative chatbots
+    induce over 3x more immediate false memories than the control condition.
+    Pataranutaporn et al. 2025 (ACM IUI 2025, doi:10.1145/3708359.3712112) —
+    follow-up showing subtle in-conversation injection further amplifies the
+    effect.
+  The fix is verifiable recall, not assumed recall."
 fi
 
 exit 0
diff --git a/.claude/hooks/no-fake-stats.sh b/.claude/hooks/no-fake-stats.sh
index 8df4e44..55186f0 100755
--- a/.claude/hooks/no-fake-stats.sh
+++ b/.claude/hooks/no-fake-stats.sh
@@ -18,6 +18,43 @@ if ! printf '%s' "$INPUT" | jq -e . >/dev/null 2>&1; then
   exit 0
 fi
 
+# Rust path: prefer agentcloseout-physics when available.
+if command -v agentcloseout-physics >/dev/null 2>&1; then
+  RULES_DIR="${LLM_DARK_PATTERNS_RULES_DIR:-}"
+  if [ -z "$RULES_DIR" ]; then
+    for candidate in \
+      "$(dirname "$0")/../../agent-closeout-bench/rules/closeout" \
+      "/home/fer/Documents/agent-closeout-bench/rules/closeout" \
+      "${XDG_CONFIG_HOME:-$HOME/.config}/agentcloseout-physics/rules/closeout"; do
+      if [ -d "$candidate" ]; then RULES_DIR="$candidate"; break; fi
+    done
+  fi
+  if [ -n "$RULES_DIR" ] && [ -d "$RULES_DIR" ] && [ -f "$RULES_DIR/fake_stats.yaml" ]; then
+    TMP_INPUT="$(mktemp)"; printf '%s' "$INPUT" > "$TMP_INPUT"
+    VERDICT_JSON="$(agentcloseout-physics scan --category fake_stats --rules "$RULES_DIR" --input "$TMP_INPUT" 2>/dev/null || true)"
+    rm -f "$TMP_INPUT"
+    if [ -n "$VERDICT_JSON" ]; then
+      DECISION="$(printf '%s' "$VERDICT_JSON" | jq -r '.decision // empty' 2>/dev/null || true)"
+      if [ "$DECISION" = "block" ]; then
+        RULE="$(printf '%s' "$VERDICT_JSON" | jq -r '.matched_rules[0].rule_id // "fake_stats"' 2>/dev/null || printf '%s' 'fake_stats')"
+        EVIDENCE="$(printf '%s' "$VERDICT_JSON" | jq -r '.redacted_evidence[0] // ""' 2>/dev/null || true)"
+        echo "BLOCKED: fabricated-looking statistic without source or strong hedge." >&2
+        echo "Matched rule: $RULE" >&2
+        [ -n "$EVIDENCE" ] && echo "Evidence: $EVIDENCE" >&2
+        echo "" >&2
+        echo "Repair guidance:" >&2
+        echo "- Add a URL, 'according to ', '(YYYY)', ' et al.', doi:, or arXiv: in the same message." >&2
+        echo "- Or mark the figure 'unverified' / 'insufficient_data' / 'unknown'." >&2
+        echo "- Loose hedges like 'approximately' do NOT make a precise decimal honest." >&2
+        exit 2
+      fi
+      if [ "$DECISION" = "pass" ]; then
+        exit 0
+      fi
+    fi
+  fi
+fi
+
 json_get() {
   local filter="$1"
   printf '%s' "$INPUT" | jq -r "$filter // empty" 2>/dev/null || true
diff --git a/.claude/hooks/no-phantom-tool-call.sh b/.claude/hooks/no-phantom-tool-call.sh
index 37ccfe2..a338ac4 100755
--- a/.claude/hooks/no-phantom-tool-call.sh
+++ b/.claude/hooks/no-phantom-tool-call.sh
@@ -18,6 +18,43 @@ INPUT="$(cat)"
 if ! command -v jq >/dev/null 2>&1; then exit 0; fi
 if ! printf '%s' "$INPUT" | jq -e . >/dev/null 2>&1; then exit 0; fi
 
+# Rust path: prefer agentcloseout-physics when available.
+if command -v agentcloseout-physics >/dev/null 2>&1; then
+  RULES_DIR="${LLM_DARK_PATTERNS_RULES_DIR:-}"
+  if [ -z "$RULES_DIR" ]; then
+    for candidate in \
+      "$(dirname "$0")/../../agent-closeout-bench/rules/closeout" \
+      "/home/fer/Documents/agent-closeout-bench/rules/closeout" \
+      "${XDG_CONFIG_HOME:-$HOME/.config}/agentcloseout-physics/rules/closeout"; do
+      if [ -d "$candidate" ]; then RULES_DIR="$candidate"; break; fi
+    done
+  fi
+  if [ -n "$RULES_DIR" ] && [ -d "$RULES_DIR" ] && [ -f "$RULES_DIR/phantom_tool_call.yaml" ]; then
+    TMP_INPUT="$(mktemp)"; printf '%s' "$INPUT" > "$TMP_INPUT"
+    VERDICT_JSON="$(agentcloseout-physics scan --category phantom_tool_call --rules "$RULES_DIR" --input "$TMP_INPUT" 2>/dev/null || true)"
+    rm -f "$TMP_INPUT"
+    if [ -n "$VERDICT_JSON" ]; then
+      DECISION="$(printf '%s' "$VERDICT_JSON" | jq -r '.decision // empty' 2>/dev/null || true)"
+      if [ "$DECISION" = "block" ]; then
+        RULE="$(printf '%s' "$VERDICT_JSON" | jq -r '.matched_rules[0].rule_id // "phantom_tool_call"' 2>/dev/null || printf '%s' 'phantom_tool_call')"
+        EVIDENCE="$(printf '%s' "$VERDICT_JSON" | jq -r '.redacted_evidence[0] // ""' 2>/dev/null || true)"
+        echo "BLOCKED: phantom tool call: claim of tool execution without same-message output evidence." >&2
+        echo "Matched rule: $RULE" >&2
+        [ -n "$EVIDENCE" ] && echo "Evidence: $EVIDENCE" >&2
+        echo "" >&2
+        echo "Repair guidance:" >&2
+        echo "- Show the tool's actual output (paste the result, fence with triple backticks)." >&2
+        echo "- Or drop the 'I ran X' framing if you intend to run it next, not already." >&2
+        echo "- Or close as Status: partial / Verification: not run." >&2
+        exit 2
+      fi
+      if [ "$DECISION" = "pass" ]; then
+        exit 0
+      fi
+    fi
+  fi
+fi
+
 json_get() { printf '%s' "$INPUT" | jq -r "$1 // empty" 2>/dev/null || true; }
 block() {
   echo "BLOCKED: $1" >&2
diff --git a/.claude/hooks/no-rollback-claim-without-evidence.sh b/.claude/hooks/no-rollback-claim-without-evidence.sh
index 699c080..39cc2b6 100755
--- a/.claude/hooks/no-rollback-claim-without-evidence.sh
+++ b/.claude/hooks/no-rollback-claim-without-evidence.sh
@@ -9,6 +9,43 @@ INPUT="$(cat)"
 if ! command -v jq >/dev/null 2>&1; then exit 0; fi
 if ! printf '%s' "$INPUT" | jq -e . >/dev/null 2>&1; then exit 0; fi
 
+# Rust path: prefer agentcloseout-physics when available.
+if command -v agentcloseout-physics >/dev/null 2>&1; then
+  RULES_DIR="${LLM_DARK_PATTERNS_RULES_DIR:-}"
+  if [ -z "$RULES_DIR" ]; then
+    for candidate in \
+      "$(dirname "$0")/../../agent-closeout-bench/rules/closeout" \
+      "/home/fer/Documents/agent-closeout-bench/rules/closeout" \
+      "${XDG_CONFIG_HOME:-$HOME/.config}/agentcloseout-physics/rules/closeout"; do
+      if [ -d "$candidate" ]; then RULES_DIR="$candidate"; break; fi
+    done
+  fi
+  if [ -n "$RULES_DIR" ] && [ -d "$RULES_DIR" ] && [ -f "$RULES_DIR/rollback_claim_without_evidence.yaml" ]; then
+    TMP_INPUT="$(mktemp)"; printf '%s' "$INPUT" > "$TMP_INPUT"
+    VERDICT_JSON="$(agentcloseout-physics scan --category rollback_claim_without_evidence --rules "$RULES_DIR" --input "$TMP_INPUT" 2>/dev/null || true)"
+    rm -f "$TMP_INPUT"
+    if [ -n "$VERDICT_JSON" ]; then
+      DECISION="$(printf '%s' "$VERDICT_JSON" | jq -r '.decision // empty' 2>/dev/null || true)"
+      if [ "$DECISION" = "block" ]; then
+        RULE="$(printf '%s' "$VERDICT_JSON" | jq -r '.matched_rules[0].rule_id // "rollback_claim_without_evidence"' 2>/dev/null || printf '%s' 'rollback_claim_without_evidence')"
+        EVIDENCE="$(printf '%s' "$VERDICT_JSON" | jq -r '.redacted_evidence[0] // ""' 2>/dev/null || true)"
+        echo "BLOCKED: rollback claim without command evidence." >&2
+        echo "Matched rule: $RULE" >&2
+        [ -n "$EVIDENCE" ] && echo "Evidence: $EVIDENCE" >&2
+        echo "" >&2
+        echo "Repair guidance:" >&2
+        echo "- Show the rollback command in the same message (\`git revert HEAD\`, \`kubectl rollout undo\`, etc.)." >&2
+        echo "- Or drop the 'rolled back' framing if no rollback command was run." >&2
+        echo "- Or close as Status: blocked / Next step: operator must run the rollback." >&2
+        exit 2
+      fi
+      if [ "$DECISION" = "pass" ]; then
+        exit 0
+      fi
+    fi
+  fi
+fi
+
 json_get() { printf '%s' "$INPUT" | jq -r "$1 // empty" 2>/dev/null || true; }
 block() {
   echo "BLOCKED: $1" >&2
diff --git a/.claude/hooks/no-sandbagging-disguise.sh b/.claude/hooks/no-sandbagging-disguise.sh
index 644cc84..fc58599 100755
--- a/.claude/hooks/no-sandbagging-disguise.sh
+++ b/.claude/hooks/no-sandbagging-disguise.sh
@@ -14,6 +14,43 @@ INPUT="$(cat)"
 if ! command -v jq >/dev/null 2>&1; then exit 0; fi
 if ! printf '%s' "$INPUT" | jq -e . >/dev/null 2>&1; then exit 0; fi
 
+# Rust path: prefer agentcloseout-physics when available.
+if command -v agentcloseout-physics >/dev/null 2>&1; then
+  RULES_DIR="${LLM_DARK_PATTERNS_RULES_DIR:-}"
+  if [ -z "$RULES_DIR" ]; then
+    for candidate in \
+      "$(dirname "$0")/../../agent-closeout-bench/rules/closeout" \
+      "/home/fer/Documents/agent-closeout-bench/rules/closeout" \
+      "${XDG_CONFIG_HOME:-$HOME/.config}/agentcloseout-physics/rules/closeout"; do
+      if [ -d "$candidate" ]; then RULES_DIR="$candidate"; break; fi
+    done
+  fi
+  if [ -n "$RULES_DIR" ] && [ -d "$RULES_DIR" ] && [ -f "$RULES_DIR/sandbagging_disguise.yaml" ]; then
+    TMP_INPUT="$(mktemp)"; printf '%s' "$INPUT" > "$TMP_INPUT"
+    VERDICT_JSON="$(agentcloseout-physics scan --category sandbagging_disguise --rules "$RULES_DIR" --input "$TMP_INPUT" 2>/dev/null || true)"
+    rm -f "$TMP_INPUT"
+    if [ -n "$VERDICT_JSON" ]; then
+      DECISION="$(printf '%s' "$VERDICT_JSON" | jq -r '.decision // empty' 2>/dev/null || true)"
+      if [ "$DECISION" = "block" ]; then
+        RULE="$(printf '%s' "$VERDICT_JSON" | jq -r '.matched_rules[0].rule_id // "sandbagging_disguise"' 2>/dev/null || printf '%s' 'sandbagging_disguise')"
+        EVIDENCE="$(printf '%s' "$VERDICT_JSON" | jq -r '.redacted_evidence[0] // ""' 2>/dev/null || true)"
+        echo "BLOCKED: sandbagging disguise: 'tried but couldn't' claim without specific blocker, error, or evidence." >&2
+        echo "Matched rule: $RULE" >&2
+        [ -n "$EVIDENCE" ] && echo "Evidence: $EVIDENCE" >&2
+        echo "" >&2
+        echo "Repair guidance:" >&2
+        echo "- Cite the specific error / exit_code / blocker that caused the failure." >&2
+        echo "- Or mark insufficient_data / Status: blocked with the specific missing input." >&2
+        echo "- Or make a clear handoff: what would unblock you?" >&2
+        exit 2
+      fi
+      if [ "$DECISION" = "pass" ]; then
+        exit 0
+      fi
+    fi
+  fi
+fi
+
 json_get() { printf '%s' "$INPUT" | jq -r "$1 // empty" 2>/dev/null || true; }
 block() {
   echo "BLOCKED: $1" >&2