sumerc · sumerc · May 6, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.claude/skills/wer-wolf/SKILL.md b/.claude/skills/wer-wolf/SKILL.md
@@ -0,0 +1,86 @@
+---
+name: wer-wolf
+description: Benchmark Zee's saved audio samples against every available STT provider/model (named for Word Error Rate — the canonical STT eval metric). Use when the user wants to compare transcription quality across Groq Whisper, OpenAI, Mistral Voxtral, ElevenLabs Scribe, etc., on their own saved recordings, evaluate which model handles their domain vocabulary best, or audit how well hints.txt biasing works per provider.
+---
+
+# wer-wolf — STT bake-off for Zee samples
+
+Run every saved sample under `~/Library/Application Support/zee/samples/` through every STT provider Zee supports (that has an API key) and present a side-by-side comparison.
+
+## What this does
+
+1. Lists every sample directory under `~/Library/Application Support/zee/samples/`.
+2. For each sample, reads `info.json` (original provider/model, original transcribed text, timestamp) and stats `audio.<ext>` for KB size + format.
+3. Loops over `(provider, model)` pairs whose API key env var is set, swapping `config.json` for each, calling `./zee -transcribe <audio_file>`. Hints are read automatically from `hints.txt` by all five providers.
+4. Restores the original `config.json` at the end (also on error — script uses a trap).
+5. Renders the result as one block per sample: metadata header followed by an ASCII table of every model's output.
+
+## Pre-flight checks
+
+Before running, verify:
+
+- **Zee binary**. Find the running process first (most accurate, matches what the user is actually using):
+  ```bash
+  lsof -c zee 2>/dev/null | awk '$4=="txt" && $NF ~ /zee$/ {print $NF; exit}'
+  ```
+  Fall back to `/Users/supo/Desktop/p/zee/zee` if no process. If neither exists, ask the user where the binary is.
+- **Samples directory** exists and has at least one `2026-*` subdir. If empty, tell the user to enable `ZEE_SAVE_LAST_AUDIO=1` and capture some recordings first.
+- **API keys**. Print which of `GROQ_API_KEY`, `OPENAI_API_KEY`, `MISTRAL_API_KEY`, `ELEVENLABS_API_KEY` are set; the script auto-skips providers whose key is missing. Deepgram is skipped entirely (its only model `nova-3` is streaming, not compatible with batch `-transcribe`).
+- **Tray app warning**. Tell the user not to interact with the running tray app's menu during the run — it caches `config.json` in memory and will overwrite the file on the next menu interaction. Backup is restored at the end either way, but mid-run interference can corrupt results.
+
+## Running it
+
+```bash
+ZEE_BIN=<path> bash .claude/skills/wer-wolf/scripts/compare.sh
+```
+
+The script:
+- Writes raw results to `/tmp/zee-compare-results.txt`.
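+- Writes machine-readable per-sample/per-model JSON lines to `/tmp/zee-compare-results.jsonl` (each line: `{sample, provider, model, text}` on success, or `{sample, provider, model, error}` when the call failed). Use this for the comparison table — easier than re-parsing the human-readable file.
+- Writes per-sample metadata to `/tmp/zee-compare-samples.jsonl` (each line: the sample's `info.json` fields plus `sample`, `size_kb`, `ext`). Use this for the per-sample block headers.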
+
+## How to render the result
+
+Start with a header showing the active vocabulary so the user can see what biasing the providers received:
+
+```
+**Hints in effect** (`~/Library/Application Support/zee/hints.txt`):
+<comma-joined non-comment, non-empty lines>
+```
+
+Then for each sample directory (sorted by timestamp), produce one block:
+
+```
+### <sample-id>  —  <KB> KB <format>  —  recorded <timestamp>
+**Originally transcribed by:** <provider> / <model>
+**Original text:** "<text>"
+
+| Provider / Model               | Transcription                            |
+|--------------------------------|------------------------------------------|
+| groq / whisper-large-v3-turbo  | ...                                      |
+| ...                            | ...                                      |
+```
+
+Mark the row matching the original `(provider, model)` with `*` after the model name so the user can quickly see the baseline.
+
+If a model errored (network, 4xx, etc.), put the error message in the cell instead of the transcription.
+
+## Models tested per provider
+
+(Source of truth: `transcriber/*.go` in the zee repo. Update this list if Zee adds models.)
+
+| Provider     | Model(s)                                   | Hint field              |
+|--------------|--------------------------------------------|-------------------------|
+| groq         | `whisper-large-v3-turbo`, `whisper-large-v3` | `prompt`              |
+| openai       | `gpt-4o-transcribe`                         | `prompt`               |
+| mistral      | `voxtral-mini-latest`                       | `context_bias[]`       |
+| elevenlabs   | `scribe_v2`                                 | `keyterms[]`           |
+| deepgram     | `nova-3` (streaming-only — skipped)         | streaming keyterms     |
+
+All five wire `hints.txt` automatically — no flag required. Each provider receives the same hints joined as a single comma-separated string; how aggressively each provider biases varies a lot in practice (Whisper-via-Groq honors it strongly; Mistral and Scribe much less so based on observed runs).
+
+## Notes / gotchas
+
+- Each `-transcribe` call is one HTTP round-trip (~1–3s including API latency), and calls run sequentially: 4 samples × 5 models = 20 calls ≈ 20–60s wall time. Costs are tiny but real.
+- `voxtral-mini-latest` is a moving target — if Mistral renames it, update the script.
+- The script intentionally does **not** parallelize providers — keeps output ordered and avoids rate-limit surprises.
+- `-transcribe` exits non-zero on transcription errors; the script captures stdout+stderr and continues so one failure doesn't abort the matrix.
diff --git a/.claude/skills/wer-wolf/scripts/compare.sh b/.claude/skills/wer-wolf/scripts/compare.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+# compare.sh (wer-wolf skill) — run every saved Zee sample through every available
+# STT provider and emit both human-readable and JSONL results.
+#
+# Optional: ZEE_BIN (absolute path to zee binary). When unset, falls back to
+# the running zee process, then /Users/supo/Desktop/p/zee/zee.
+set -u
+
+ZEE_BIN="${ZEE_BIN:-}"
+if [ -z "$ZEE_BIN" ]; then
+  ZEE_BIN=$(lsof -c zee 2>/dev/null | awk '$4=="txt" && $NF ~ /zee$/ {print $NF; exit}')
+fi
+[ -z "$ZEE_BIN" ] && ZEE_BIN="/Users/supo/Desktop/p/zee/zee"
+if [ ! -x "$ZEE_BIN" ]; then
+  echo "ERROR: zee binary not found or not executable: $ZEE_BIN" >&2
+  exit 1
+fi
+
+ZEE_DIR="$HOME/Library/Application Support/zee"
+CFG="$ZEE_DIR/config.json"
+SAMPLES="$ZEE_DIR/samples"
+BACKUP="/tmp/zee-config-backup.$$.json"
+HUMAN=/tmp/zee-compare-results.txt
+JSONL=/tmp/zee-compare-results.jsonl
+
+[ -f "$CFG" ] || { echo "ERROR: $CFG missing" >&2; exit 1; }
+[ -d "$SAMPLES" ] || { echo "ERROR: $SAMPLES missing" >&2; exit 1; }
+
+cp "$CFG" "$BACKUP"
+restore() { cp "$BACKUP" "$CFG" 2>/dev/null; rm -f "$BACKUP"; }
+trap restore EXIT INT TERM
+
+: > "$HUMAN"
+: > "$JSONL"
+
+# Header: active hints (what each provider receives as biasing).
+HINTS_FILE="$ZEE_DIR/hints.txt"
+{
+  echo "########## hints.txt ##########"
+  if [ -f "$HINTS_FILE" ]; then
+    grep -vE '^\s*(#|$)' "$HINTS_FILE" | paste -sd, -
+  else
+    echo "(no hints.txt found — providers receive no biasing)"
+  fi
+  echo
+} | tee -a "$HUMAN"
+
+# Benchmark matrix: one "provider|model|API_KEY_ENV_VAR" entry per row. The
+# main loop splits each entry on '|' and skips rows whose env var is empty.
+# Keep groq turbo first since it's the fastest baseline.
+# Update this list when zee adds providers/models (transcriber/*.go).
+COMBOS=(
+  "groq|whisper-large-v3-turbo|GROQ_API_KEY"
+  "groq|whisper-large-v3|GROQ_API_KEY"
+  "openai|gpt-4o-transcribe|OPENAI_API_KEY"
+  "mistral|voxtral-mini-latest|MISTRAL_API_KEY"
+  "elevenlabs|scribe_v2|ELEVENLABS_API_KEY"
+)
+
+# JSON string escaper for the JSONL output. Python is on every macOS.
+jesc() { python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()), end="")'; }
+
+write_cfg() {
+  local prov="$1" model="$2"
+  cat > "$CFG" <<EOF
+{
+  "language": "en",
+  "device": "C-1U",
+  "provider": "$prov",
+  "model": "$model",
+  "auto_paste": true,
+  "auto_start": false
+}
+EOF
+}
+
+# Discover samples (sorted by directory name = chronological). Match any
+# four-digit year prefix rather than one hard-coded year, so recordings from
+# other years are not silently ignored.
+SAMPLE_DIRS=()
+while IFS= read -r d; do
+  SAMPLE_DIRS+=("$d")
+done < <(find "$SAMPLES" -maxdepth 1 -mindepth 1 -type d -name "[0-9][0-9][0-9][0-9]-*" | sort)
+
+if [ "${#SAMPLE_DIRS[@]}" -eq 0 ]; then
+  echo "No samples found under $SAMPLES — enable ZEE_SAVE_LAST_AUDIO=1 and record some clips first." >&2
+  exit 1
+fi
+
+for combo in "${COMBOS[@]}"; do
+  IFS='|' read -r prov model envk <<< "$combo"
+  if [ -z "${!envk:-}" ]; then
+    echo "########## SKIP $prov/$model ($envk not set) ##########" | tee -a "$HUMAN"
+    continue
+  fi
+  echo "########## $prov / $model ##########" | tee -a "$HUMAN"
+  write_cfg "$prov" "$model"
+  for d in "${SAMPLE_DIRS[@]}"; do
+    sample=$(basename "$d")
+    audio=$(find "$d" -maxdepth 1 -type f -name "audio.*" | head -1)
+    [ -z "$audio" ] && continue
+    echo "----- $sample -----" | tee -a "$HUMAN"
+    text=$(timeout 45 "$ZEE_BIN" -transcribe "$audio" 2>&1)
+    rc=$?
+    echo "$text" | tee -a "$HUMAN"
+    # Emit JSONL (one line per sample/model). Trim trailing newline from text first.
+    text_trimmed=$(printf '%s' "$text")
+    if [ $rc -ne 0 ]; then
+      printf '{"sample":%s,"provider":%s,"model":%s,"error":%s}\n' \
+        "$(printf '%s' "$sample" | jesc)" \
+        "$(printf '%s' "$prov" | jesc)" \
+        "$(printf '%s' "$model" | jesc)" \
+        "$(printf '%s' "$text_trimmed" | jesc)" >> "$JSONL"
+    else
+      printf '{"sample":%s,"provider":%s,"model":%s,"text":%s}\n' \
+        "$(printf '%s' "$sample" | jesc)" \
+        "$(printf '%s' "$prov" | jesc)" \
+        "$(printf '%s' "$model" | jesc)" \
+        "$(printf '%s' "$text_trimmed" | jesc)" >> "$JSONL"
+    fi
+  done
+done
+
+# Also dump per-sample metadata so the rendering step doesn't need to re-stat files.
+META=/tmp/zee-compare-samples.jsonl
+: > "$META"
+for d in "${SAMPLE_DIRS[@]}"; do
+  sample=$(basename "$d")
+  info="$d/info.json"
+  audio=$(find "$d" -maxdepth 1 -type f -name "audio.*" | head -1)
+  [ -z "$audio" ] && continue
+  size_kb=$(awk -v b="$(stat -f%z "$audio")" 'BEGIN{printf "%.1f", b/1024}')
+  ext="${audio##*.}"
+  python3 -c "
+import json,sys
+info=json.load(open('$info'))
+info['sample']='$sample'
+info['size_kb']=$size_kb
+info['ext']='$ext'
+print(json.dumps(info))
+" >> "$META"
+done
+
+# Final summary: a blank line, the completion notice, then where each output
+# file was written.
+printf '\n%s\n' "DONE — config restored."
+printf '  samples meta: %s\n' "$META"
+printf '  raw output:   %s\n' "$HUMAN"
+printf '  per-cell:     %s\n' "$JSONL"
diff --git a/.gitignore b/.gitignore
@@ -21,6 +21,3 @@ transcribe_log.txt
 
 # OS
 .DS_Store
-
-# Claude
-.claude/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -43,6 +43,11 @@ make benchmark WAV=file.wav RUNS=5
 - `-benchmark <wav>` - run benchmark instead of live recording
 - `-runs N` - benchmark iterations (default: 3)
 - `-logpath <path>` - log directory (default: `$ZEE_LOG_PATH` or OS-specific, use `./` for current directory)
+- `-hints <words>` - comma-separated vocabulary hints (overrides `hints.txt`)
+- `-transcribe <file>` - transcribe an audio file (mp3/flac/wav) and exit
+
+**Environment variables:**
+- `ZEE_SAVE_LAST_AUDIO=1` - enables "Save Last Recording" tray button (saves audio + metadata to `samples/` inside the config directory)
 
 ## Architecture
 
@@ -56,7 +61,7 @@ Ctrl+Shift+Space keydown → record audio → encode (mode-based) → API call
 - `main.go` - hotkey handling, audio capture, recording logic, panic recovery
 - `tray/` - system tray icon, menus (devices, providers, languages, auto-paste), dynamic icons
 - `encoder/` - AudioEncoder interface, FLAC, MP3, and Adaptive implementations
-- `transcriber/` - Groq and DeepGram API clients with shared TracedClient for HTTP timing metrics
+- `transcriber/` - STT providers (Groq, OpenAI, Deepgram, Mistral, ElevenLabs) with shared TracedClient for HTTP timing metrics
 - `hotkey/` - global hotkey registration (Ctrl+Shift+Space) with platform-specific backends
 - `clipboard/` - platform-specific clipboard and paste operations (Cmd+V / Ctrl+V)
 - `audio/` - platform-specific audio capture (malgo on macOS, PulseAudio on Linux)
@@ -66,7 +71,7 @@ Ctrl+Shift+Space keydown → record audio → encode (mode-based) → API call
 - `device.go` - microphone picker with arrow-key navigation
 - `vad.go` - voice activity detection using WebRTC VAD with debounced speech confirmation
 - `silence.go` - silence monitoring with warnings, repeat beeps, and auto-close (toggle mode)
-- `settings.go` - persistent settings (language, device, provider/model, auto-paste, auto-start) with JSON config file
+- `config/` - persistent settings (`config.json`) and vocabulary hints (`hints.txt`)
 - `log.go` - diagnostic logging and panic capture to `diagnostics_log.txt`
 
 ## Design Philosophy

diff --git a/settings.go → config/config.go b/settings.go → config/config.go
@@ -1,4 +1,4 @@
-package main
+package config
 
 import (
 	"encoding/json"
@@ -22,19 +22,20 @@ type Settings struct {
 const settingsFile = "config.json"
 
 var (
-	settingsMu sync.Mutex
-	current    Settings
-	cfgDir     string
+	mu       sync.Mutex
+	current  Settings
+	dir      string
+	defaults = Settings{
+		Language:  "en",
+		AutoPaste: true,
+	}
 )
 
-var settingsDefaults = Settings{
-	Language:  "en",
-	AutoPaste: true,
-}
+func SetDir(d string) { dir = d }
 
-func settingsDir() string {
-	if cfgDir != "" {
-		return cfgDir
+func Dir() string {
+	if dir != "" {
+		return dir
 	}
 	home, err := os.UserHomeDir()
 	if err != nil {
@@ -58,12 +59,12 @@ func settingsDir() string {
 }
 
 func settingsPath() string {
-	return filepath.Join(settingsDir(), settingsFile)
+	return filepath.Join(Dir(), settingsFile)
 }
 
-func loadSettings() error {
-	cfgDir = settingsDir()
-	current = settingsDefaults
+func Load() error {
+	dir = Dir()
+	current = defaults
 
 	data, err := os.ReadFile(settingsPath())
 	if err != nil {
@@ -81,30 +82,30 @@ func loadSettings() error {
 
 	current = s
 	if current.Language == "" {
-		current.Language = settingsDefaults.Language
+		current.Language = defaults.Language
 	}
 	return nil
 }
 
-func getSettings() Settings {
-	settingsMu.Lock()
+func Get() Settings {
+	mu.Lock()
 	s := current
-	settingsMu.Unlock()
+	mu.Unlock()
 	return s
 }
 
-func updateSettings(fn func(*Settings)) {
-	settingsMu.Lock()
+func Update(fn func(*Settings)) {
+	mu.Lock()
 	fn(&current)
 	s := current
-	settingsMu.Unlock()
+	mu.Unlock()
 
-	saveSettings(s)
+	save(s)
 }
 
-func saveSettings(s Settings) {
-	dir := cfgDir
-	if err := os.MkdirAll(dir, 0755); err != nil {
+func save(s Settings) {
+	d := dir
+	if err := os.MkdirAll(d, 0755); err != nil {
 		log.Warnf("settings: create dir: %v", err)
 		return
 	}
@@ -116,7 +117,7 @@ func saveSettings(s Settings) {
 	}
 	data = append(data, '\n')
 
-	tmp, err := os.CreateTemp(dir, ".config-*.json")
+	tmp, err := os.CreateTemp(d, ".config-*.json")
 	if err != nil {
 		log.Warnf("settings: create temp: %v", err)
 		return