diff --git a/docs/advanced/benchmarks/_bench_helpers.py b/docs/advanced/benchmarks/_bench_helpers.py new file mode 100644 index 00000000..9c6dbe50 --- /dev/null +++ b/docs/advanced/benchmarks/_bench_helpers.py @@ -0,0 +1,306 @@ +"""Shared helpers for the VE/VEP analytical benchmark suite. + +Three benchmark cases share this module: + +* ``bench_ve_harmonic.py`` — Maxwell shear under :math:`V_{top}(t) = V_0 \\sin(\\omega t)` +* ``bench_ve_square.py`` — Maxwell shear under square-wave :math:`V_{top}` +* ``bench_vep_square.py`` — same square-wave forcing with Min-mode plasticity + +Common setup +------------ +* Mesh: ``StructuredQuadBox`` 16×8 over ``(±1, ±0.5)``. +* Velocity at top/bottom: ``±V_top(t)``, free at left/right. +* Pure shear with strain rate ``γ̇ = 2·V_top/H = V_top``. +* Centre-point stress sample. +* Scaling: ``η = μ = 1``, so Maxwell relaxation time ``t_r = 1`` and the + steady-state VE stress under sustained shear is ``η·γ̇``. + +Logging +------- +Each run writes a self-contained ``.npz`` to ``output/benchmarks/.npz`` +holding the simulation trace, the analytical reference, the parameter +dict, and metadata. Plotting is decoupled — see ``plot_benchmarks.py``. 
+""" + +import os +import time +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) +OUTPUT_DIR = os.path.join(_REPO_ROOT, "output", "benchmarks") +FIG_DIR = os.path.join(_REPO_ROOT, "docs", "advanced", "figures") + + +# --------------------------------------------------------------------------- +# Common parameters +# --------------------------------------------------------------------------- + +DEFAULT_PARAMS = dict( + eta=1.0, # shear viscosity + mu=1.0, # shear modulus + H=1.0, # box height (top–bottom) + W=2.0, # box width (left–right) + elementRes=(16, 8), + velocity_degree=2, + pressure_degree=1, + bdf_order=2, +) + + +def t_relax(params): + return params["eta"] / params["mu"] + + +# --------------------------------------------------------------------------- +# Analytical solutions +# --------------------------------------------------------------------------- + +def maxwell_oscillatory(t, eta, mu, gamma_dot_0, omega): + r"""Closed-form Maxwell shear stress under sinusoidal forcing + :math:`\dot\gamma(t) = \dot\gamma_0 \sin(\omega t)`. + + Solving :math:`\dot\sigma + \sigma/t_r = \mu\dot\gamma` with + :math:`\sigma(0) = 0` gives + + .. math:: + \sigma(t) = \frac{\eta\dot\gamma_0}{1+\mathrm{De}^2} + \left[\sin(\omega t) - \mathrm{De}\cos(\omega t) + \mathrm{De}\,e^{-t/t_r}\right] + + where :math:`\mathrm{De} = \omega t_r` is the Deborah number. After + transient decay (:math:`t \gg t_r`) the steady response has amplitude + :math:`\eta\dot\gamma_0/\sqrt{1+\mathrm{De}^2}` and phase lag + :math:`\varphi = \arctan(\mathrm{De})`. 
+ """ + t_r = eta / mu + De = omega * t_r + pre = eta * gamma_dot_0 / (1.0 + De**2) + return pre * (np.sin(omega * t) - De * np.cos(omega * t) + De * np.exp(-t / t_r)) + + +def maxwell_square_wave(t, eta, mu, gamma_dot_0, half_period): + r"""Closed-form Maxwell shear stress under square-wave forcing. + + Within each half-period the stress relaxes exponentially toward the + steady-state value :math:`\pm\eta\dot\gamma_0` from the value at the + period boundary: + + .. math:: + \sigma(t) = s_n\sigma_{\mathrm{ss}} + (\sigma_{0,n} - s_n\sigma_{\mathrm{ss}})\, + e^{-(t - t_n)/t_r} + + where :math:`s_n = (-1)^n` is the sign in half-period :math:`n` and + :math:`\sigma_{0,n}` is the stress at the start of that half-period. + """ + t_r = eta / mu + sigma_ss = eta * gamma_dot_0 + out = np.zeros_like(np.asarray(t, dtype=float)) + sigma_start = 0.0 + for i, ti in enumerate(np.asarray(t, dtype=float)): + n = int(ti / half_period) + t_local = ti - n * half_period + # Replay periods 0..n-1 to find sigma at start of period n + sigma_n = 0.0 + for j in range(n): + sign = 1.0 if j % 2 == 0 else -1.0 + target = sign * sigma_ss + sigma_n = target + (sigma_n - target) * np.exp(-half_period / t_r) + sign = 1.0 if n % 2 == 0 else -1.0 + target = sign * sigma_ss + out[i] = target + (sigma_n - target) * np.exp(-t_local / t_r) + return out + + +def vep_square_wave(t, eta, mu, gamma_dot_0, tau_y, half_period): + r"""Closed-form VEP shear stress under square-wave forcing with + Min-mode plasticity. + + Within each half-period, the stress evolves under Maxwell: + + .. math:: + \sigma(t) = s_n\sigma_{\mathrm{ss}} + (\sigma_{0,n} - s_n\sigma_{\mathrm{ss}})\, + e^{-(t - t_n)/t_r} + + until :math:`|\sigma| = \tau_y`, after which the plastic flow holds + :math:`\sigma = \pm\tau_y`. The next half-period starts from the + *clipped* value (``±τ_y`` if the previous period yielded; otherwise + the unclipped end value). 
+ + When :math:`\eta\dot\gamma_0 \le \tau_y` the solution coincides with + the unclipped Maxwell square-wave. + """ + t_arr = np.asarray(t, dtype=float) + t_r = eta / mu + sigma_ss = eta * gamma_dot_0 + out = np.zeros_like(t_arr) + + # Pre-compute σ at the start of each half-period (including clipping) + n_half_max = int(np.ceil(t_arr[-1] / half_period)) + 2 + sigma_at_start = [0.0] + for n in range(n_half_max): + sign = 1.0 if n % 2 == 0 else -1.0 + target = sign * sigma_ss + sigma_0 = sigma_at_start[-1] + sigma_end = target + (sigma_0 - target) * np.exp(-half_period / t_r) + # Clip to ±τ_y at the period boundary if the unclipped value would + # have exceeded the yield surface + sigma_end_clipped = np.clip(sigma_end, -tau_y, tau_y) + sigma_at_start.append(float(sigma_end_clipped)) + + # Evaluate at each requested t + for i, ti in enumerate(t_arr): + n = int(ti / half_period) + t_local = ti - n * half_period + sign = 1.0 if n % 2 == 0 else -1.0 + target = sign * sigma_ss + sigma_0 = sigma_at_start[n] + sigma_unclipped = target + (sigma_0 - target) * np.exp(-t_local / t_r) + out[i] = np.clip(sigma_unclipped, -tau_y, tau_y) + return out + + +# --------------------------------------------------------------------------- +# Stokes problem builder +# --------------------------------------------------------------------------- + +def build_stokes(label, params, yield_stress=None, yield_mode="min"): + """Construct a VE_Stokes problem with the standard mesh / BCs. + + Parameters + ---------- + label : str + Used to namespace the mesh variable names so multiple problems can + coexist in one Python session. + params : dict + Material parameters (see DEFAULT_PARAMS). + yield_stress : float or None + If ``None``, pure VE (yield_stress is set to a large finite value). + Otherwise enables VEP with the given yield stress. + yield_mode : str + Passed to ``constitutive_model._yield_mode``. ``"min"`` for + Min-mode plasticity (sharp yield), other options are smooth + approximations. 
+ + Returns + ------- + mesh, stokes, V_top, params + ``V_top`` is the user-facing UWexpression for the top BC velocity. + """ + p = dict(params) + mesh = uw.meshing.StructuredQuadBox( + elementRes=p["elementRes"], + minCoords=(-p["W"] / 2.0, -p["H"] / 2.0), + maxCoords=(p["W"] / 2.0, p["H"] / 2.0), + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, mesh.dim, degree=p["velocity_degree"]) + pp = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=p["pressure_degree"]) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=pp) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=p["bdf_order"], + ) + stokes.constitutive_model.Parameters.shear_viscosity_0 = p["eta"] + stokes.constitutive_model.Parameters.shear_modulus = p["mu"] + stokes.constitutive_model.Parameters.yield_stress = ( + yield_stress if yield_stress is not None else 1.0e6 + ) + stokes.constitutive_model.Parameters.strainrate_inv_II_min = 1.0e-6 + stokes.constitutive_model._yield_mode = yield_mode + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top V") + stokes.add_dirichlet_bc((V_top, 0.0), "Top") + stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") + stokes.tolerance = 1.0e-6 + stokes.petsc_options["snes_force_iteration"] = True + + return mesh, stokes, V_top, p + + +# --------------------------------------------------------------------------- +# Per-step probe +# --------------------------------------------------------------------------- + +def probe_centre(stokes, c=np.array([[0.0, 0.0]])): + return float(uw.function.evaluate(stokes.tau.sym[0, 1], c).flatten()[0]) + + +# --------------------------------------------------------------------------- +# Self-contained npz logger +# --------------------------------------------------------------------------- + +def save_run(name, *, params, params_extra=None, 
**arrays): + """Save a benchmark run to ``output/benchmarks/.npz``. + + Parameters + ---------- + name : str + Output filename stem (no extension). + params : dict + Material/numerical parameters used for the run. Stored as a + single ``params`` field for re-creation/replotting. + params_extra : dict or None + Per-benchmark scalar metadata (omega, half_period, tau_y, …). + **arrays + Per-step arrays: times, sigma, sigma_ana, dt, gamma_dot, etc. + """ + os.makedirs(OUTPUT_DIR, exist_ok=True) + path = f"{OUTPUT_DIR}/{name}.npz" + payload = {f"arr_{k}": np.asarray(v) for k, v in arrays.items()} + payload["__params__"] = np.asarray(repr(dict(params)), dtype=object) + payload["__params_extra__"] = np.asarray(repr(dict(params_extra or {})), dtype=object) + payload["__keys__"] = np.asarray(list(arrays.keys()), dtype=object) + payload["__name__"] = np.asarray(name, dtype=object) + np.savez(path, **payload) + return path + + +def load_run(name): + """Reverse of :func:`save_run`. Returns ``(arrays, params, extra)``.""" + path = f"{OUTPUT_DIR}/{name}.npz" + with np.load(path, allow_pickle=True) as f: + keys = list(f["__keys__"]) + arrays = {k: f[f"arr_{k}"] for k in keys} + params = eval(str(f["__params__"])) + extra = eval(str(f["__params_extra__"])) + return arrays, params, extra + + +# --------------------------------------------------------------------------- +# Error metrics +# --------------------------------------------------------------------------- + +def error_metrics(sigma, sigma_ana): + """Standard error report: max and rms absolute error.""" + diff = sigma - sigma_ana + return dict( + max_abs=float(np.max(np.abs(diff))), + rms=float(np.sqrt(np.mean(diff**2))), + rel_max=float(np.max(np.abs(diff)) / (np.max(np.abs(sigma_ana)) + 1e-30)), + ) + + +def fit_amp_phase(t, sigma, omega): + """Least-squares fit of ``A·sin(ωt − φ)`` to ``sigma``. + + Returns ``(A, phi)``. 
Drops the first ``2*t_r`` to skip the + transient (assumes ``t_r = 1`` and that the array is long enough). + """ + mask = t > 4.0 # skip ~4 t_r of transient + if mask.sum() < 8: + mask = np.ones_like(t, dtype=bool) + ts = t[mask] + ss = sigma[mask] + # σ ≈ a·sin(ωt) + b·cos(ωt) — fit (a, b) by linear least squares + M = np.column_stack([np.sin(omega * ts), np.cos(omega * ts)]) + coeffs, *_ = np.linalg.lstsq(M, ss, rcond=None) + a, b = float(coeffs[0]), float(coeffs[1]) + A = np.sqrt(a**2 + b**2) + # σ = A·sin(ωt − φ) → A·(cos(φ)sin(ωt) − sin(φ)cos(ωt)) = a·sin + b·cos + # so a = A cos(φ), b = −A sin(φ). Hence φ = atan2(−b, a). + phi = float(np.arctan2(-b, a)) + return A, phi diff --git a/docs/advanced/benchmarks/_iso_pureve_vs_vep.py b/docs/advanced/benchmarks/_iso_pureve_vs_vep.py new file mode 100644 index 00000000..0933ace2 --- /dev/null +++ b/docs/advanced/benchmarks/_iso_pureve_vs_vep.py @@ -0,0 +1,101 @@ +"""Pin down whether iso BDF-2 instability is from VEP machinery or VE alone. + +Three iso cases at the same harmonic forcing, T=8, BDF-2, η=μ=1, on +the same mesh as the TI consistency test: + + pureve — VE only (yield_stress = sympy.oo, no plastic branch) + vep_huge_ty — VEP with yield_stress = 1e8 (yielding effectively off) + vep_active_ty — VEP with yield_stress = 0.30 (yielding active) + +If pureve is bounded but vep_huge_ty blows up, the BDF-2 instability +is in the VEP softmin/yield expression, not in the BDF-2 method. 
+""" + +import time +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +V0 = 0.5; OMEGA = np.pi / 2.0; DT = 0.05; T_END = 8.0 +ETA = 1.0; MU = 1.0; RES = 16 + + +def build(label, *, yield_stress): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0), + qdegree=3, + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=uw.VarType.VECTOR) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=uw.VarType.SCALAR) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, + ) + cm.Parameters.shear_viscosity_0 = ETA + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = yield_stress + cm.Parameters.shear_viscosity_min = ETA * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + stokes.constitutive_model = cm + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-6 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + return stokes, V_top + + +def run(label, *, yield_stress): + stokes, V_top = build(label, yield_stress=yield_stress) + phi = float(np.arctan(OMEGA)) + n_steps = int(T_END / DT) + sxy = [] + div = 0; iters_total = 0 + t0 = time.time() + for step in range(n_steps): + t = (step + 1) * DT + v_now = V0 * float(np.cos(OMEGA * t + phi)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = DT + stokes.solve(zero_init_guess=False, timestep=DT, divergence_retries=2) + if 
stokes.snes.getConvergedReason() < 0: + div += 1 + iters_total += stokes.snes.getIterationNumber() + c = np.array([[0.5, 0.5]]) + td = stokes.tau.data + idx = int(np.argmin(np.linalg.norm(stokes.tau.coords - c, axis=1))) + sxy.append(td[idx, 2]) + wall = time.time() - t0 + return dict(label=label, yield_stress=str(yield_stress), wall=wall, + peak=float(np.abs(np.array(sxy)).max()), + div=div, mean_its=iters_total / max(1, n_steps)) + + +def main(): + cases = [ + ("pureve", sympy.oo), + ("vep_huge_ty", 1e8), + ("vep_active_ty", 0.30), + ] + print(f"\n{'label':<14} {'yield_stress':>13} {'wall':>6} {'div':>4} {'its':>5} {'peak|σ_xy|':>11}", + flush=True) + for label, ty in cases: + print(f"--- running {label} ---", flush=True) + r = run(label, yield_stress=ty) + print(f"{r['label']:<14} {r['yield_stress']:>13} {r['wall']:>6.1f} " + f"{r['div']:>4d} {r['mean_its']:>5.2f} {r['peak']:>11.4e}", + flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/_iter_count_min_bdf2.py b/docs/advanced/benchmarks/_iter_count_min_bdf2.py new file mode 100644 index 00000000..599be1c7 --- /dev/null +++ b/docs/advanced/benchmarks/_iter_count_min_bdf2.py @@ -0,0 +1,91 @@ +"""Quick rerun: pure Min/Min BDF-2 with iter counts captured. + +Confirms (or refutes) the hypothesis that the cleaner SNES record of +the Min/Min BDF-2 run is masking very few actual Newton iterations, +which would explain its larger answer error vs the softJac variants. 
+""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + vep_square_wave, error_metrics, OUTPUT_DIR, +) + + +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD +DT_PLATEAU = 0.10 +DT_FINE = 0.01 +WINDOW = 0.1 * HALF_PERIOD + + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def main(): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = 2 + mesh, stokes, V_top, params = build_stokes( + "iter_min_o2", params, yield_stress=TAU_Y, yield_mode="min", + ) + times, dts, sigmas, gammas, reasons, iters = [], [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + iters.append(int(stokes.snes.getIterationNumber())) + wall = time.time() - t0 + + times = np.array(times); sigmas = np.array(sigmas) + reasons = np.array(reasons); iters = np.array(iters) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = vep_square_wave(times, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err = error_metrics(sigmas, sigma_ana) + + from collections import Counter + print(f"\nMin/Min BDF-2 (var-dt VEP square): 
wall={wall:.1f}s steps={len(times)}", + flush=True) + print(f" reasons: {dict(sorted(Counter(reasons.tolist()).items()))}", flush=True) + print(f" iter dist: {dict(sorted(Counter(iters.tolist()).items()))}", flush=True) + print(f" iter mean: {iters.mean():.2f} median: {np.median(iters):.0f} " + f"max: {int(iters.max())}", flush=True) + print(f" fraction with iters==0: {(iters == 0).sum()}/{len(iters)} = {(iters==0).mean():.1%}", + flush=True) + print(f" fraction with iters==1: {(iters == 1).sum()}/{len(iters)} = {(iters==1).mean():.1%}", + flush=True) + print(f" peak|σ|={float(np.abs(sigmas).max()):.4f} " + f"max|err|={err['max_abs']:.3e} rms={err['rms']:.3e}", flush=True) + + import os + np.savez(os.path.join(OUTPUT_DIR, "iter_count_min_bdf2.npz"), + times=times, dts=dts, sigmas=sigmas, sigma_ana=sigma_ana, + reasons=reasons, iters=iters, wall=wall) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/_repro_min_bdf1_nan.py b/docs/advanced/benchmarks/_repro_min_bdf1_nan.py new file mode 100644 index 00000000..e1f57f5d --- /dev/null +++ b/docs/advanced/benchmarks/_repro_min_bdf1_nan.py @@ -0,0 +1,74 @@ +"""Reproduce the Min-BDF-1 NaN-on-plateau divergence with SNES monitoring.""" +import numpy as np +import sympy +from _bench_helpers import DEFAULT_PARAMS, build_stokes, probe_centre +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD +DT_PLATEAU = 0.10 +DT_FINE = 0.01 +WINDOW = 0.2 + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + +params = dict(DEFAULT_PARAMS) +params["bdf_order"] = 1 +mesh, stokes, V_top, params = build_stokes( + "minfail_o1", params, yield_stress=TAU_Y, yield_mode="min", +) + +# enable SNES monitor — prints |F| at every SNES iteration +stokes.petsc_options["snes_monitor"] = None +stokes.petsc_options["snes_converged_reason"] = None + 
+t_cur = 0.0 +step_idx = 0 +target_steps = {348, 404, 405, 406} +# also peek at adjacent steps for context +context_steps = target_steps | {347, 403} + +while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + + # Only enable verbose monitoring at the steps we care about + verbose = (step_idx in context_steps) + if verbose: + print(f"\n===== step {step_idx} t={t_end_step:.3f} dt={dt:.4f} sign={sign:+.0f} =====", flush=True) + + if not verbose: + # silence the monitors temporarily + stokes.petsc_options.delValue("snes_monitor") + stokes.petsc_options.delValue("snes_converged_reason") + else: + stokes.petsc_options["snes_monitor"] = None + stokes.petsc_options["snes_converged_reason"] = None + + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + + if verbose: + sigma = probe_centre(stokes) + reason = int(stokes.snes.getConvergedReason()) + its = int(stokes.snes.getIterationNumber()) + print(f" → sigma_xy={sigma:.6f} SNES reason={reason} iters={its}", flush=True) + t_cur = t_end_step + step_idx += 1 + if step_idx > max(target_steps) + 2: + break + +print("\n--- done ---", flush=True) diff --git a/docs/advanced/benchmarks/_ti_vep_alpha_sweep.py b/docs/advanced/benchmarks/_ti_vep_alpha_sweep.py new file mode 100644 index 00000000..50b9a63f --- /dev/null +++ b/docs/advanced/benchmarks/_ti_vep_alpha_sweep.py @@ -0,0 +1,73 @@ +"""Find the bdf_blend α threshold for TI-VEP + spatial τ_y stability. 
+ +Known so far at T=16, harmonic forcing, BDF-2: + α = 1.0 → peak|σ_xy| ≈ 7-10 (blows up modestly) + α = 0.5 → peak|σ_xy| ≈ 7-30000 (blow-up reduced but still) + α = 0.0 → peak|σ_xy| ≈ 0.30 (BDF-1) (stable) + +What's the smallest α that still blows up? Sweep at θ=15° (the worst +case in earlier tests) and at θ=0° (where 1D-y blow-up is also seen). +Use the same setup as bench_ti_vep_harmonic_zeroIC at τ_y=0.30. +""" + +import time +import numpy as np +import sympy +from bench_ti_vep_harmonic import build_ti_stokes, probe_stress, V0, OMEGA, DT + + +T_END = 16.0 +TAU_Y = 0.30 + + +def run(theta_deg, alpha, label): + stokes, V_top, n_vec = (None, None, None) + mesh, stokes, V_top, n_vec = build_ti_stokes(label, theta_deg, TAU_Y, bdf_order=2) + stokes.constitutive_model._bdf_blend = alpha + + phi = float(np.arctan(OMEGA)) + n_steps = int(T_END / DT) + sxy = []; tres = [] + div = 0; iters_total = 0 + t0 = time.time() + for step in range(n_steps): + t = (step + 1) * DT + v_now = V0 * float(np.cos(OMEGA * t + phi)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = DT + stokes.solve(zero_init_guess=False, timestep=DT, divergence_retries=2) + if stokes.snes.getConvergedReason() < 0: + div += 1 + iters_total += stokes.snes.getIterationNumber() + sxy_v, tres_v = probe_stress(stokes, n_vec) + sxy.append(sxy_v); tres.append(tres_v) + wall = time.time() - t0 + sxy = np.array(sxy); tres = np.array(tres) + return dict(label=label, alpha=alpha, theta=theta_deg, wall=wall, + peak_sxy=float(np.abs(sxy).max()), + peak_tres=float(np.abs(tres).max()), + div=div, mean_its=iters_total / max(1, len(sxy))) + + +def main(): + cases = [] + # θ=0° — easier; α≥0.5 already blows up modestly + for alpha in (0.0, 0.25, 0.5, 0.75, 1.0): + cases.append((0.0, alpha)) + # θ=15° — harder; α≥0.5 still blows up massively + for alpha in (0.0, 0.10, 0.25, 0.50): + cases.append((15.0, alpha)) + + print(f"\n{'label':<22} {'θ°':>4} {'α':>5} {'wall':>6} {'div':>4} {'its':>5} 
" + f"{'peak|τ_res|':>11} {'peak|σ_xy|':>12}", flush=True) + for theta, alpha in cases: + label = f"th{theta:+.0f}_a{alpha:.2f}".replace(".", "p") + print(f"--- running {label} ---", flush=True) + r = run(theta, alpha, label) + print(f"{r['label']:<22} {r['theta']:>4.0f} {r['alpha']:>5.2f} " + f"{r['wall']:>6.1f} {r['div']:>4d} {r['mean_its']:>5.2f} " + f"{r['peak_tres']:>11.4e} {r['peak_sxy']:>12.4e}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/_ti_vep_bdf2_isolation.py b/docs/advanced/benchmarks/_ti_vep_bdf2_isolation.py new file mode 100644 index 00000000..eacffede --- /dev/null +++ b/docs/advanced/benchmarks/_ti_vep_bdf2_isolation.py @@ -0,0 +1,133 @@ +"""Isolate which factor triggers TI-VEP BDF-2 blow-up. + +Reference (working): tests/test_1052::test_ti_vep_yield_lock_variable_dt + - constant V_top, scalar τ_y, min yield, BDF-2 → stable +Failing: bench_ti_vep_harmonic at θ=0° + - harmonic V_top, spatial τ_y field, softmin yield, BDF-2 → blows up + +Variables to flip (3 dimensions, baseline + 3 single-flip variants): + + baseline (failing): harmonic forcing, spatial τ_y, softmin + variant A: const forcing, spatial τ_y, softmin + variant B: harmonic forcing, scalar τ_y, softmin + variant C: harmonic forcing, spatial τ_y, min + +Whichever flip stabilises BDF-2 identifies the trigger. 
+""" + +import os +import time +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +T_END = 16.0 # 4 periods — match the original benchmark length +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y = 0.30 +TAU_Y_BULK = 200.0 +RES = 16 + + +def build(label, *, spatial_yield, yield_mode): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0), + qdegree=3, + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=uw.VarType.VECTOR) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=uw.VarType.SCALAR) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[0.2, 0.5], [0.8, 0.5]]), # horizontal fault, θ=0 + symbol=f"F{label}", + ) + fault.discretize() + if spatial_yield: + weakness = fault.influence_function( + width=0.06, value_near=1.0/TAU_Y, value_far=1.0/TAU_Y_BULK, + profile="gaussian", + ) + ty = 1.0 / weakness + else: + ty = TAU_Y + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, order=2, + ) + stokes.constitutive_model = cm + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = ty + cm.Parameters.director = sympy.Matrix([0.0, 1.0]) # θ=0 throughout + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm._yield_mode = yield_mode + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.tolerance = 1.0e-6 + stokes.petsc_options["snes_force_iteration"] = True + return stokes, V_top + + +def run(label, *, 
spatial_yield, yield_mode, harmonic): + stokes, V_top = build(label, spatial_yield=spatial_yield, yield_mode=yield_mode) + phi = float(np.arctan(OMEGA)) + n_steps = int(T_END / DT) + sxy = [] + div = 0 + iters_total = 0 + t0 = time.time() + for step in range(n_steps): + t = (step + 1) * DT + v_now = (V0 * float(np.cos(OMEGA * t + phi))) if harmonic else V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = DT + stokes.solve(zero_init_guess=False, timestep=DT, divergence_retries=2) + if stokes.snes.getConvergedReason() < 0: + div += 1 + iters_total += stokes.snes.getIterationNumber() + # Probe centre + c = np.array([[0.5, 0.5]]) + td = stokes.tau.data + idx = int(np.argmin(np.linalg.norm(stokes.tau.coords - c, axis=1))) + sxy.append(td[idx, 2]) + wall = time.time() - t0 + sxy = np.array(sxy) + return dict(label=label, spatial_yield=spatial_yield, yield_mode=yield_mode, + harmonic=harmonic, peak_sxy=float(np.abs(sxy).max()), + div=div, mean_its=iters_total / n_steps, wall=wall) + + +def main(): + cases = [ + # baseline (failing at T=16): harmonic + spatial τ_y + softmin + ("baseline_fail", True, "softmin", True), + # B: harmonic + scalar τ_y + softmin (does it blow up at T=16?) 
+ ("varB_scalarTY", False, "softmin", True), + # C: harmonic + spatial τ_y + min (regression-test style) + ("varC_min", True, "min", True), + ] + print(f"\n{'label':<18} {'spatial_τy':>11} {'yield':>8} {'forcing':>9} " + f"{'wall':>6} {'div':>4} {'its':>5} {'peak|σ_xy|':>11}", flush=True) + for label, sy, ym, harmonic in cases: + print(f"--- running {label} ---", flush=True) + r = run(label, spatial_yield=sy, yield_mode=ym, harmonic=harmonic) + print(f"{r['label']:<18} {str(r['spatial_yield']):>11} {r['yield_mode']:>8} " + f"{('harmonic' if r['harmonic'] else 'const'):>9} " + f"{r['wall']:>6.1f} {r['div']:>4d} {r['mean_its']:>5.2f} " + f"{r['peak_sxy']:>11.4e}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/_ti_vs_iso_consistency.py b/docs/advanced/benchmarks/_ti_vs_iso_consistency.py new file mode 100644 index 00000000..0dd2bb2b --- /dev/null +++ b/docs/advanced/benchmarks/_ti_vs_iso_consistency.py @@ -0,0 +1,126 @@ +"""Consistency check: does TI reduce to iso when Δ=0 (no yield, η_0 = η_1)? + +The rank-4 TI tensor with η_0 = η_1_eff and Δ = 0 is mathematically +identical to 2·η·I_ijkl (the isotropic Newtonian tensor), regardless of +the director. At BDF-2, with the SAME ε̇_eff, the resulting stress +should be bit-equal between TI and iso. + +If TI matches iso here, the BDF-2 instability is *purely* in the yield +branch (where η_1_eff < η_0 and Δ ≠ 0 in the fault zone). If TI +diverges from iso even in this trivial case, the bug is more +fundamental — possibly a stray history term, missing factor, or +asymmetric tensor reduction. 
+""" + +import time +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +V0 = 0.5; OMEGA = np.pi / 2.0; DT = 0.05; T_END = 8.0 +ETA = 1.0; MU = 1.0 +RES = 16 + + +def build(label, *, ti_model): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0), + qdegree=3, + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=uw.VarType.VECTOR) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=uw.VarType.SCALAR) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + if ti_model: + cm = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, order=2, + ) + cm.Parameters.shear_viscosity_0 = ETA + cm.Parameters.shear_viscosity_1 = ETA # === η_0 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = 1e8 # effectively infinite + cm.Parameters.director = sympy.Matrix([0.0, 1.0]) + cm.Parameters.shear_viscosity_min = ETA * 1.0e-3 + cm._bdf_blend = 1.0 # pure BDF-2 + else: + cm = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, + ) + cm.Parameters.shear_viscosity_0 = ETA + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = 1e8 + cm.Parameters.shear_viscosity_min = ETA * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + stokes.constitutive_model = cm + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-6 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + return stokes, V_top + + +def run(label, *, ti_model): + stokes, V_top = 
build(label, ti_model=ti_model) + phi = float(np.arctan(OMEGA)) + n_steps = int(T_END / DT) + sxy = [] + div = 0; iters_total = 0 + t0 = time.time() + for step in range(n_steps): + t = (step + 1) * DT + v_now = V0 * float(np.cos(OMEGA * t + phi)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = DT + stokes.solve(zero_init_guess=False, timestep=DT, divergence_retries=2) + if stokes.snes.getConvergedReason() < 0: + div += 1 + iters_total += stokes.snes.getIterationNumber() + c = np.array([[0.5, 0.5]]) + td = stokes.tau.data + idx = int(np.argmin(np.linalg.norm(stokes.tau.coords - c, axis=1))) + sxy.append(td[idx, 2]) + wall = time.time() - t0 + return dict(label=label, ti=ti_model, wall=wall, + sxy=np.array(sxy), + div=div, mean_its=iters_total / max(1, n_steps)) + + +def main(): + print(f"\n{'label':<14} {'ti':>5} {'wall':>6} {'div':>4} {'its':>5} {'peak|σ_xy|':>11}", + flush=True) + iso = run("iso_noTY", ti_model=False) + print(f"{iso['label']:<14} {str(iso['ti']):>5} {iso['wall']:>6.1f} " + f"{iso['div']:>4d} {iso['mean_its']:>5.2f} " + f"{float(np.abs(iso['sxy']).max()):>11.4e}", flush=True) + ti = run("ti_noTY", ti_model=True) + print(f"{ti['label']:<14} {str(ti['ti']):>5} {ti['wall']:>6.1f} " + f"{ti['div']:>4d} {ti['mean_its']:>5.2f} " + f"{float(np.abs(ti['sxy']).max()):>11.4e}", flush=True) + + diff = ti['sxy'] - iso['sxy'] + print(f"\n=== consistency check ===", flush=True) + print(f" max|TI - iso| = {np.abs(diff).max():.6e}", flush=True) + print(f" max|iso| = {np.abs(iso['sxy']).max():.6e}", flush=True) + print(f" rel max diff = {np.abs(diff).max() / np.abs(iso['sxy']).max():.6e}", + flush=True) + print(f" rms TI-iso = {np.sqrt((diff**2).mean()):.6e}", flush=True) + if np.abs(diff).max() / np.abs(iso['sxy']).max() < 1e-3: + print(" → TI ≈ iso (consistent: bug is in yield branch only)", + flush=True) + else: + print(" → TI != iso (deeper inconsistency: BDF-2 TI tensor structure differs)", + flush=True) + + +if __name__ == 
"__main__": + main() diff --git a/docs/advanced/benchmarks/bench_convergence.py b/docs/advanced/benchmarks/bench_convergence.py new file mode 100644 index 00000000..d86dcf13 --- /dev/null +++ b/docs/advanced/benchmarks/bench_convergence.py @@ -0,0 +1,184 @@ +"""Convergence sweep for the three VE/VEP benchmarks. + +For each case (harmonic, square, VEP square) and each BDF order +(1, 2), runs the simulation at a range of timestep sizes and records +max-absolute and RMS error vs the closed-form solution. Writes +``output/benchmarks/convergence_.npz`` containing the full +sweep so the convergence figure can be regenerated without re-running. + +Run +--- +``pixi run -e amr-dev python docs/advanced/benchmarks/bench_convergence.py`` + +The full sweep is ~24 runs and takes a few minutes. +""" + +import os +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, t_relax, build_stokes, probe_centre, + maxwell_oscillatory, maxwell_square_wave, vep_square_wave, + save_run, error_metrics, OUTPUT_DIR, +) + + +# --------------------------------------------------------------------------- +# Per-case runners. Each takes (dt, bdf_order, **overrides) and returns +# (times, sigmas, sigma_ana, params). +# --------------------------------------------------------------------------- + +def run_ve_harmonic(dt, bdf_order, V0=0.5, omega=np.pi/2.0, n_periods=4): + """Endpoint V_top sampling — see bench_ve_harmonic.py for the rationale. + + Midpoint sampling is 1st-order accurate to the value BDF-2 wants + at the step endpoint and would limit BDF-2 to slope-1 convergence. 
+ """ + label = f"ve_h_dt{dt:.4f}_o{bdf_order}" + params = dict(DEFAULT_PARAMS); params["bdf_order"] = bdf_order + _, stokes, V_top, params = build_stokes(label, params) + gd0 = 2.0 * V0 / params["H"] + t_end = n_periods * 2.0 * np.pi / omega + 0.5 + + times, sigmas = [], [] + t_cur = 0.0 + while t_cur < t_end - 1e-9: + ds = min(dt, t_end - t_cur) + t_end_step = t_cur + ds + v_now = V0 * float(np.sin(omega * t_end_step)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = ds + stokes.solve(zero_init_guess=False, timestep=ds, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur) + times = np.array(times); sigmas = np.array(sigmas) + sigma_ana = maxwell_oscillatory(times, params["eta"], params["mu"], gd0, omega) + return times, sigmas, sigma_ana, params + + +def run_ve_square(dt, bdf_order, V0=0.5, half_period=2.0, n_periods=4): + label = f"ve_s_dt{dt:.4f}_o{bdf_order}" + params = dict(DEFAULT_PARAMS); params["bdf_order"] = bdf_order + _, stokes, V_top, params = build_stokes(label, params) + gd0 = 2.0 * V0 / params["H"] + t_end = n_periods * 2.0 * half_period + + times, sigmas = [], [] + t_cur = 0.0 + while t_cur < t_end - 1e-9: + ds = min(dt, t_end - t_cur) + n_half = int((t_cur + 0.5 * ds) / half_period) + sign = 1.0 if n_half % 2 == 0 else -1.0 + V_top.sym = sympy.Float(sign * V0) + stokes.constitutive_model.Parameters.dt_elastic = ds + stokes.solve(zero_init_guess=False, timestep=ds, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur += ds + times.append(t_cur) + times = np.array(times); sigmas = np.array(sigmas) + sigma_ana = maxwell_square_wave(times, params["eta"], params["mu"], gd0, half_period) + return times, sigmas, sigma_ana, params + + +def run_vep_square(dt, bdf_order, V0=0.5, tau_y=0.5, half_period=2.0, n_periods=4): + label = f"vep_s_dt{dt:.4f}_o{bdf_order}" + params = dict(DEFAULT_PARAMS); params["bdf_order"] = bdf_order + _, stokes, V_top, params = 
build_stokes( + label, params, yield_stress=tau_y, yield_mode="min", + ) + gd0 = 2.0 * V0 / params["H"] + t_end = n_periods * 2.0 * half_period + + times, sigmas = [], [] + t_cur = 0.0 + while t_cur < t_end - 1e-9: + ds = min(dt, t_end - t_cur) + n_half = int((t_cur + 0.5 * ds) / half_period) + sign = 1.0 if n_half % 2 == 0 else -1.0 + V_top.sym = sympy.Float(sign * V0) + stokes.constitutive_model.Parameters.dt_elastic = ds + stokes.solve(zero_init_guess=False, timestep=ds, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur += ds + times.append(t_cur) + times = np.array(times); sigmas = np.array(sigmas) + sigma_ana = vep_square_wave(times, params["eta"], params["mu"], + gd0, tau_y, half_period) + return times, sigmas, sigma_ana, params + + +# --------------------------------------------------------------------------- +# Sweep driver +# --------------------------------------------------------------------------- + +def sweep(case_name, runner, dts, orders, **runner_kwargs): + """Run a sweep over (dt, order); return arrays + metrics dict. + + Also stores per-run traces so that re-plotting at any (order, dt) + combination doesn't require re-running. Trace arrays are stored + as ``trace_t_o_dt``, etc. 
+ """ + results = [] + extra_arrays = {} + for order in orders: + for dt in dts: + t0 = time.time() + times, sigmas, sigma_ana, params = runner(dt, order, **runner_kwargs) + err = error_metrics(sigmas, sigma_ana) + wall = time.time() - t0 + print(f" [{case_name}] order={order} dt={dt:.4f} " + f"steps={len(times)} wall={wall:.1f}s " + f"max|err|={err['max_abs']:.4e} rms={err['rms']:.4e}", + flush=True) + results.append(dict( + order=order, dt=dt, n_steps=len(times), + max_abs=err["max_abs"], rms=err["rms"], wall=wall, + )) + # Store traces for replotting — keyed by (order, dt) + tag = f"o{order}_dt{dt:.4f}" + extra_arrays[f"trace_t_{tag}"] = times + extra_arrays[f"trace_sigma_{tag}"] = sigmas + extra_arrays[f"trace_ana_{tag}"] = sigma_ana + return dict( + order=np.array([r["order"] for r in results]), + dt=np.array([r["dt"] for r in results]), + n_steps=np.array([r["n_steps"] for r in results]), + max_abs=np.array([r["max_abs"] for r in results]), + rms=np.array([r["rms"] for r in results]), + wall=np.array([r["wall"] for r in results]), + **extra_arrays, + ) + + +def main(): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + # Reasonable dt range for each case. 
+ DTS_HARMONIC = [0.40, 0.20, 0.10, 0.05, 0.025] # 5 values × 2 orders + DTS_SQUARE = [0.40, 0.20, 0.10, 0.05] # 4 values; 0.025 not needed + DTS_VEP = [0.40, 0.20, 0.10, 0.05] # same as VE square + ORDERS = [1, 2] + + print("=== Convergence: VE harmonic (sin forcing) ===") + res = sweep("ve_h", run_ve_harmonic, DTS_HARMONIC, ORDERS) + save_run("convergence_ve_harmonic", params=DEFAULT_PARAMS, + params_extra=dict(orders=list(ORDERS), dts=list(DTS_HARMONIC)), + **res) + + print("\n=== Convergence: VE square wave ===") + res = sweep("ve_s", run_ve_square, DTS_SQUARE, ORDERS) + save_run("convergence_ve_square", params=DEFAULT_PARAMS, + params_extra=dict(orders=list(ORDERS), dts=list(DTS_SQUARE)), + **res) + + print("\n=== Convergence: VEP square wave (Min mode) ===") + res = sweep("vep_s", run_vep_square, DTS_VEP, ORDERS) + save_run("convergence_vep_square", params=DEFAULT_PARAMS, + params_extra=dict(orders=list(ORDERS), dts=list(DTS_VEP)), + **res) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_ti_vep_harmonic.py b/docs/advanced/benchmarks/bench_ti_vep_harmonic.py new file mode 100644 index 00000000..cd03c732 --- /dev/null +++ b/docs/advanced/benchmarks/bench_ti_vep_harmonic.py @@ -0,0 +1,313 @@ +"""Benchmark: Transverse-isotropic VEP fault under harmonic shear. + +Sister to ``bench_ve_harmonic.py`` (isotropic Maxwell): same peak-start +initial condition and cosine forcing, but with an embedded fault +modelled by ``TransverseIsotropicVEPFlowModel``. The point of the +benchmark is to confirm that BDF-1 / BDF-2 time integration are as +robust on the angled-fault problem as on the isotropic case — no new +SNES instabilities expected. + +Three fault angles run side-by-side: θ ∈ {0°, +15°, -15°}. + +Probes: +* ``sigma_xy`` — global shear stress at the fault centre (fault frame) +* ``tau_resolved`` — shear on the fault plane: t·σ·n with t the fault + tangent and n the fault normal. + +For θ = 0° the resolved shear equals σ_xy. 
For θ ≠ 0° the resolved +shear caps at τ_y while σ_xy keeps growing, since only the fault-plane +component yields. + +Forcing: V_top(t) = V0·cos(ωt + φ) with the same Deborah-number / phase +as the isotropic case so the analytical (sub-yield) reference is +identical: the resolved shear should track A_∞·cos(ωt) once any +plastic transients die out. + +Output: one ``.npz`` per (angle, τ_y) pair, BDF-1 and BDF-2 traces. +""" + +import os +import time +import numpy as np +import sympy + +import underworld3 as uw +from underworld3.function import expression +from _bench_helpers import OUTPUT_DIR + + +# --------------------------------------------------------------------------- +# Run-specific parameters (kept aligned with bench_ve_harmonic.py) +# --------------------------------------------------------------------------- + +V0 = 0.5 +OMEGA = np.pi / 2.0 # period 4·t_r +DT = 0.05 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * np.pi / OMEGA + +ETA_0 = 1.0 # bulk shear viscosity +ETA_1 = 1.0 # fault-plane shear viscosity +MU = 1.0 # elastic shear modulus +TAU_Y_BULK = 200.0 # effectively infinite away from the fault + +# Geometry +RES = 16 # mesh resolution (RES x RES) — kept modest for benchmark turnaround +H = 1.0; W = 1.0 # domain size [0, W] × [0, H] +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 # influence-function half-width + +# Sweep +ANGLES_DEG = (0.0, 15.0, -15.0) +TAU_Y_LIST = (0.15, 0.30) +BDF_ORDERS = (1, 2) + + +# --------------------------------------------------------------------------- +# Build helper +# --------------------------------------------------------------------------- + +def build_ti_stokes(label, theta_deg, tau_y, bdf_order): + """Construct a TI-VEP Stokes problem with an embedded fault. + + Parameters + ---------- + label : str + Used to namespace mesh-variable names. + theta_deg : float + Fault angle from horizontal, in degrees. + tau_y : float + Fault-plane yield stress. + bdf_order : int + BDF time-integration order (1 or 2). 
+ + Returns + ------- + mesh, stokes, V_top_expr, n_vec + """ + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + v = uw.discretisation.MeshVariable( + f"U_{label}", mesh, 2, degree=2, vtype=uw.VarType.VECTOR, + ) + p = uw.discretisation.MeshVariable( + f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=uw.VarType.SCALAR, + ) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta) + n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y, + value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, order=bdf_order, + ) + stokes.constitutive_model = cm + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" # default; smooth and robust + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression( + rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC velocity", + ) + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + 
stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + return mesh, stokes, V_top, np.array([n_x, n_y]) + + +# --------------------------------------------------------------------------- +# Probes +# --------------------------------------------------------------------------- + +def probe_stress(stokes, n_vec, c=np.array([[0.5, 0.5]])): + """Return σ_xy and resolved fault-plane shear at the fault centre.""" + tau = stokes.tau + dists = np.linalg.norm(tau.coords - c, axis=1) + idx = int(np.argmin(dists)) + s_xx, s_yy, s_xy = tau.data[idx, 0], tau.data[idx, 1], tau.data[idx, 2] + n_x, n_y = n_vec + t_x, t_y = n_y, -n_x # fault tangent (perp to normal) + resolved = (s_xx * t_x * n_x + s_xy * (t_x * n_y + t_y * n_x) + + s_yy * t_y * n_y) + return float(s_xy), float(resolved) + + +# --------------------------------------------------------------------------- +# Time-stepping core +# --------------------------------------------------------------------------- + +def _run_one(theta_deg, tau_y, bdf_order, label): + """One run. Returns dict of arrays.""" + mesh, stokes, V_top, n_vec = build_ti_stokes( + label, theta_deg, tau_y, bdf_order, + ) + + # Maxwell relaxation time and steady-state amplitude (sub-yield) + t_r = ETA_1 / MU + De = OMEGA * t_r + # BCs: Top moves at V_top, Bottom fixed → engineering shear rate + # γ̇_0 = V0/H (NOT 2·V0/H — that would be the antisymmetric case + # used by bench_ve_harmonic.py). Steady VE amplitude is then + # σ_∞ = 2η·ε̇/sqrt(1+De²) = η·γ̇_0/sqrt(1+De²) since ε̇ = γ̇/2. + gamma_dot_0 = V0 / H + A_inf = ETA_1 * gamma_dot_0 / np.sqrt(1.0 + De**2) + phi = float(np.arctan(De)) + + # Peak-start: plant ψ*[k] = (resolved shear at t=-k·dt) on the fault + # tangent direction in the SYM_TENSOR slot. For a 2D tensor, with + # the resolved shear along (t_x, t_y) and normal (n_x, n_y), the + # corresponding stress contribution is τ·(t_i n_j + n_i t_j). 
+ n_x, n_y = n_vec + t_x, t_y = n_y, -n_x + n_nodes = stokes.DFDt.psi_star[0].array.shape[0] + history = [] + for k in range(stokes.DFDt.order): + val_k = A_inf * float(np.cos(OMEGA * k * DT)) + # symmetric tensor: σ = τ_resolved * (t⊗n + n⊗t) + arr = np.zeros((n_nodes, 2, 2)) + sxx = val_k * 2.0 * t_x * n_x + syy = val_k * 2.0 * t_y * n_y + sxy = val_k * (t_x * n_y + t_y * n_x) + arr[:, 0, 0] = sxx + arr[:, 1, 1] = syy + arr[:, 0, 1] = sxy + arr[:, 1, 0] = sxy + history.append(arr) + stokes.DFDt.set_initial_history(history, dt=DT) + + times, sxy_h, tres_h, reasons, iters = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step + phi)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sxy, tres = probe_stress(stokes, n_vec) + t_cur = t_end_step + times.append(t_cur); sxy_h.append(sxy); tres_h.append(tres) + reasons.append(int(stokes.snes.getConvergedReason())) + iters.append(int(stokes.snes.getIterationNumber())) + wall = time.time() - t0 + + times = np.array(times); sxy_h = np.array(sxy_h); tres_h = np.array(tres_h) + reasons = np.array(reasons); iters = np.array(iters) + + # Sub-yield analytical: A_∞·cos(ωt). Above yield, this is the VE + # "no-yield" envelope and the actual response should track it until + # |τ| reaches τ_y, then plateau. 
+ sigma_ve = A_inf * np.cos(OMEGA * times) + + return dict( + times=times, sigma_xy=sxy_h, tau_resolved=tres_h, + sigma_ve=sigma_ve, reasons=reasons, iters=iters, + wall=wall, A_inf=A_inf, phi=phi, De=De, gamma_dot_0=gamma_dot_0, + ) + + +def main(): + os.makedirs(OUTPUT_DIR, exist_ok=True) + summary = [] + for theta_deg in ANGLES_DEG: + for tau_y in TAU_Y_LIST: + results = {} + for bdf in BDF_ORDERS: + lbl = f"tivep_o{bdf}_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace( + ".", "p" + ) + print(f"\n--- {lbl}: θ={theta_deg}°, τ_y={tau_y}, BDF-{bdf} ---", + flush=True) + results[bdf] = _run_one(theta_deg, tau_y, bdf, lbl) + r = results[bdf] + ndiv = int((r["reasons"] < 0).sum()) + print(f" wall={r['wall']:.1f}s steps={len(r['times'])} " + f"diverged={ndiv} mean_its={float(r['iters'].mean()):.2f} " + f"peak|τ_resolved|={float(np.abs(r['tau_resolved']).max()):.4f} " + f"peak|σ_xy|={float(np.abs(r['sigma_xy']).max()):.4f}", + flush=True) + summary.append(dict( + label=lbl, theta=theta_deg, tau_y=tau_y, bdf=bdf, + wall=r["wall"], diverged=ndiv, + mean_its=float(r["iters"].mean()), + peak_resolved=float(np.abs(r["tau_resolved"]).max()), + peak_sxy=float(np.abs(r["sigma_xy"]).max()), + )) + + # Save BDF-1 and BDF-2 traces side by side + tag = f"ti_vep_harmonic_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace( + ".", "p" + ) + np.savez( + os.path.join(OUTPUT_DIR, f"{tag}.npz"), + theta_deg=theta_deg, tau_y=tau_y, + times=results[1]["times"], + sigma_xy_bdf1=results[1]["sigma_xy"], + sigma_xy_bdf2=results[2]["sigma_xy"], + tau_resolved_bdf1=results[1]["tau_resolved"], + tau_resolved_bdf2=results[2]["tau_resolved"], + sigma_ve=results[1]["sigma_ve"], + reasons_bdf1=results[1]["reasons"], + reasons_bdf2=results[2]["reasons"], + iters_bdf1=results[1]["iters"], + iters_bdf2=results[2]["iters"], + A_inf=results[1]["A_inf"], De=results[1]["De"], + gamma_dot_0=results[1]["gamma_dot_0"], + wall_bdf1=results[1]["wall"], wall_bdf2=results[2]["wall"], + V0=V0, OMEGA=OMEGA, DT=DT, T_END=T_END, + 
ETA_0=ETA_0, ETA_1=ETA_1, MU=MU, + FAULT_WIDTH=FAULT_WIDTH, FAULT_LENGTH=FAULT_LENGTH, RES=RES, + ) + print(f" saved → {tag}.npz", flush=True) + + print("\n=== summary ===", flush=True) + print(f"{'label':<36} {'θ°':>4} {'τ_y':>5} {'BDF':>4} {'wall':>6} " + f"{'div':>4} {'its':>5} {'peak|τ_res|':>11} {'peak|σ_xy|':>10}", + flush=True) + for s in summary: + print(f"{s['label']:<36} {s['theta']:>4.0f} {s['tau_y']:>5.2f} " + f"{s['bdf']:>4d} {s['wall']:>6.1f} {s['diverged']:>4d} " + f"{s['mean_its']:>5.2f} {s['peak_resolved']:>11.4f} " + f"{s['peak_sxy']:>10.4f}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_ti_vep_harmonic_zeroIC.py b/docs/advanced/benchmarks/bench_ti_vep_harmonic_zeroIC.py new file mode 100644 index 00000000..60dcb54f --- /dev/null +++ b/docs/advanced/benchmarks/bench_ti_vep_harmonic_zeroIC.py @@ -0,0 +1,136 @@ +"""TI-VEP harmonic benchmark — variant with σ=0 initial condition. + +Sanity check on the previous run (peak-start IC) which produced +catastrophic BDF-2 blow-up. Hypothesis: planting σ_xy = A_∞ +uniformly puts the fault yield zone at 3-4× its yield stress at +t=0, and BDF-2's inconsistent ψ*₀/ψ*₁ history then drives an +unstable plastic correction that grows. σ=0 IC avoids that. + +Driving uses a cos forcing that *does* start at peak (V_top = V0 +at t=0), so we expect a transient before settling on the steady +cycle — but the solver should remain stable throughout. + +Same suite (3 angles × 2 τ_y × 2 BDF orders). 
+""" + +import os +import time +import numpy as np +import sympy + +from _bench_helpers import OUTPUT_DIR +from bench_ti_vep_harmonic import ( + V0, OMEGA, DT, T_END, ETA_0, ETA_1, MU, + FAULT_LENGTH, FAULT_WIDTH, RES, + ANGLES_DEG, TAU_Y_LIST, BDF_ORDERS, + build_ti_stokes, probe_stress, +) + + +def _run_one(theta_deg, tau_y, bdf_order, label): + mesh, stokes, V_top, n_vec = build_ti_stokes( + label, theta_deg, tau_y, bdf_order, + ) + # σ=0 IC — let DDt initialise history from current value (which is 0) + # on the first solve. No set_initial_history call. + + t_r = ETA_1 / MU + De = OMEGA * t_r + # BCs: Top moves, Bottom fixed → γ̇_0 = V0/H (not 2·V0/H). + gamma_dot_0 = V0 / 1.0 + A_inf = ETA_1 * gamma_dot_0 / np.sqrt(1.0 + De**2) + phi = float(np.arctan(De)) + + times, sxy_h, tres_h, reasons, iters = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step + phi)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sxy, tres = probe_stress(stokes, n_vec) + t_cur = t_end_step + times.append(t_cur); sxy_h.append(sxy); tres_h.append(tres) + reasons.append(int(stokes.snes.getConvergedReason())) + iters.append(int(stokes.snes.getIterationNumber())) + wall = time.time() - t0 + + times = np.array(times); sxy_h = np.array(sxy_h); tres_h = np.array(tres_h) + reasons = np.array(reasons); iters = np.array(iters) + sigma_ve = A_inf * np.cos(OMEGA * times) + return dict( + times=times, sigma_xy=sxy_h, tau_resolved=tres_h, + sigma_ve=sigma_ve, reasons=reasons, iters=iters, + wall=wall, A_inf=A_inf, phi=phi, De=De, gamma_dot_0=gamma_dot_0, + ) + + +def main(): + os.makedirs(OUTPUT_DIR, exist_ok=True) + summary = [] + for theta_deg in ANGLES_DEG: + for tau_y in TAU_Y_LIST: + results = {} + for bdf in BDF_ORDERS: + lbl = 
f"tivep_zIC_o{bdf}_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace( + ".", "p" + ) + print(f"\n--- {lbl}: θ={theta_deg}°, τ_y={tau_y}, BDF-{bdf} (σ=0 IC) ---", + flush=True) + results[bdf] = _run_one(theta_deg, tau_y, bdf, lbl) + r = results[bdf] + ndiv = int((r["reasons"] < 0).sum()) + print(f" wall={r['wall']:.1f}s steps={len(r['times'])} " + f"diverged={ndiv} mean_its={float(r['iters'].mean()):.2f} " + f"peak|τ_resolved|={float(np.abs(r['tau_resolved']).max()):.4f} " + f"peak|σ_xy|={float(np.abs(r['sigma_xy']).max()):.4f}", + flush=True) + summary.append(dict( + label=lbl, theta=theta_deg, tau_y=tau_y, bdf=bdf, + wall=r["wall"], diverged=ndiv, + mean_its=float(r["iters"].mean()), + peak_resolved=float(np.abs(r["tau_resolved"]).max()), + peak_sxy=float(np.abs(r["sigma_xy"]).max()), + )) + + tag = f"ti_vep_harmonic_zIC_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace( + ".", "p" + ) + np.savez( + os.path.join(OUTPUT_DIR, f"{tag}.npz"), + theta_deg=theta_deg, tau_y=tau_y, + times=results[1]["times"], + sigma_xy_bdf1=results[1]["sigma_xy"], + sigma_xy_bdf2=results[2]["sigma_xy"], + tau_resolved_bdf1=results[1]["tau_resolved"], + tau_resolved_bdf2=results[2]["tau_resolved"], + sigma_ve=results[1]["sigma_ve"], + reasons_bdf1=results[1]["reasons"], + reasons_bdf2=results[2]["reasons"], + iters_bdf1=results[1]["iters"], + iters_bdf2=results[2]["iters"], + A_inf=results[1]["A_inf"], De=results[1]["De"], + gamma_dot_0=results[1]["gamma_dot_0"], + wall_bdf1=results[1]["wall"], wall_bdf2=results[2]["wall"], + V0=V0, OMEGA=OMEGA, DT=DT, T_END=T_END, + ETA_0=ETA_0, ETA_1=ETA_1, MU=MU, + FAULT_WIDTH=FAULT_WIDTH, FAULT_LENGTH=FAULT_LENGTH, RES=RES, + ) + print(f" saved → {tag}.npz", flush=True) + + print("\n=== summary (σ=0 IC) ===", flush=True) + print(f"{'label':<40} {'θ°':>4} {'τ_y':>5} {'BDF':>4} {'wall':>6} " + f"{'div':>4} {'its':>5} {'peak|τ_res|':>11} {'peak|σ_xy|':>11}", + flush=True) + for s in summary: + print(f"{s['label']:<40} {s['theta']:>4.0f} {s['tau_y']:>5.2f} " + 
f"{s['bdf']:>4d} {s['wall']:>6.1f} {s['diverged']:>4d} " + f"{s['mean_its']:>5.2f} {s['peak_resolved']:>11.4f} " + f"{s['peak_sxy']:>11.4f}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_ve_harmonic.py b/docs/advanced/benchmarks/bench_ve_harmonic.py new file mode 100644 index 00000000..c90c11fa --- /dev/null +++ b/docs/advanced/benchmarks/bench_ve_harmonic.py @@ -0,0 +1,163 @@ +"""Benchmark: Maxwell viscoelastic shear under sinusoidal forcing. + +Drives the shear box with :math:`V_{top}(t) = V_0 \\sin(\\omega t)` and +compares the centre-point shear stress against the closed-form +solution. Records amplitude, phase shift, and error norms. + +Closed form +----------- +For Maxwell with constant :math:`\\eta, \\mu` driven by +:math:`\\dot\\gamma(t) = \\dot\\gamma_0 \\sin(\\omega t)`, + +.. math:: + \\sigma(t) = \\frac{\\eta\\dot\\gamma_0}{1 + \\mathrm{De}^2} + \\bigl[\\sin(\\omega t) - \\mathrm{De}\\cos(\\omega t) + + \\mathrm{De}\\,e^{-t/t_r}\\bigr] + +with :math:`\\mathrm{De} = \\omega t_r` (Deborah number). Steady amplitude +:math:`A_{\\infty} = \\eta\\dot\\gamma_0 / \\sqrt{1+\\mathrm{De}^2}`, phase +lag :math:`\\varphi = \\arctan(\\mathrm{De})`. + +Run +--- +``pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_harmonic.py`` + +Output: ``output/benchmarks/ve_harmonic.npz`` containing the simulation +trace, the analytical reference at the same time points, and parameter +metadata. See ``plot_benchmarks.py`` for plotting from the npz. 
+""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, t_relax, build_stokes, probe_centre, + maxwell_oscillatory, save_run, error_metrics, fit_amp_phase, +) + + +# Run-specific parameters +V0 = 0.5 # → γ̇₀ = 2·V0/H = 1.0 in the symmetric strain rate +OMEGA = np.pi / 2.0 # period 4·t_r → De = π/2 ≈ 1.57 +DT = 0.05 # ~80 steps per period; resolves the harmonic +N_PERIODS = 4 # 4 full periods (no warmup needed — see below) +T_END = N_PERIODS * 2.0 * np.pi / OMEGA # 4 periods exactly + +LABEL = "ve_harmonic" + +# Initial condition design: start at a point in the steady-state cycle +# where σ̇ = 0, so σ(0) is consistent with the analytical and there is +# *no* startup transient. Choose BC such that σ_ss(t) = A_∞·cos(ωt), +# i.e. peak at t=0. Working backwards through the Maxwell phase +# response (lag φ = arctan(De)), this requires +# V_top(t) = V_0 · cos(ωt + φ) +# so that ε̇_xy(t) = (V_0/H)·cos(ωt + φ) and the steady-state response +# σ_ss(t) = A_∞·cos(ωt + φ - φ) = A_∞·cos(ωt). +# +# The initial condition σ(0) = A_∞ matches the steady-state at t=0 +# exactly, leaving no homogeneous (decaying) component — so the entire +# recorded trace is on the steady cycle. + + +def _run_one(bdf_order): + """Run the simulation at one BDF order with peak-start initial condition. + + See module docstring above for why σ(0) = A_∞ paired with the cosine + forcing eliminates the startup transient. V_top is sampled at the + *endpoint* of each step (BDF expects the value at the new time). + """ + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes(f"{LABEL}_o{bdf_order}", params) + + t_r = params["eta"] / params["mu"] + De = OMEGA * t_r + gamma_dot_0 = 2.0 * V0 / params["H"] + A_inf = params["eta"] * gamma_dot_0 / np.sqrt(1.0 + De**2) + phi = float(np.arctan(De)) + + # Plant the steady-state cycle as the initial condition. 
σ_ss(t) = + # A_∞·cos(ωt) is the analytical solution under our cos forcing — + # zero homogeneous component, σ̇(0) = 0. History slot k is the + # value at t = -k·Δt, which by cosine evenness is A_∞·cos(k·ω·Δt). + # + # Using the *exact* per-slot value (not just A_∞ for all k) is what + # actually buys the benefit: a constant A_∞ across all slots drops + # O(Δt²) error into ψ*[1], contaminating BDF-2's truncation from + # step 1 — exactly the phase error we are trying to avoid. + n_nodes = stokes.DFDt.psi_star[0].array.shape[0] + history = [] + for k in range(stokes.DFDt.order): + arr = np.zeros((n_nodes, 2, 2)) + val_k = A_inf * float(np.cos(OMEGA * k * DT)) + arr[:, 0, 1] = val_k + arr[:, 1, 0] = val_k + history.append(arr) + stokes.DFDt.set_initial_history(history, dt=DT) + + times, dts, sigmas, gammas, reasons = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + # BC: V_top(t) = V_0·cos(ωt + φ) so σ_ss(t) = A_∞·cos(ωt). 
+ v_now = V0 * float(np.cos(OMEGA * t_end_step + phi)) + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + s = probe_centre(stokes) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); sigmas.append(s) + gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + + t_r = t_relax(params) + De = OMEGA * t_r + gamma_dot_0 = 2.0 * V0 / params["H"] + A_inf = params["eta"] * gamma_dot_0 / np.sqrt(1.0 + De**2) + # Peak-start initial condition + cos(ωt + φ) forcing → no transient, + # so the analytical is the steady-state cycle σ(t) = A_∞·cos(ωt). + sigma_ana = A_inf * np.cos(OMEGA * times1) + + err1 = error_metrics(sig1, sigma_ana) + err2 = error_metrics(sig2, sigma_ana) + A1, phi1 = fit_amp_phase(times1, sig1, OMEGA) + A2, phi2 = fit_amp_phase(times2, sig2, OMEGA) + A_ana = params["eta"] * gamma_dot_0 / np.sqrt(1.0 + De**2) + phi_ana = float(np.arctan(De)) + + print(f"[{LABEL}] steps={len(times1)} De=ω·t_r={De:.4f}") + print(f" BDF-1 wall={wall1:.1f}s max|err|={err1['max_abs']:.4e} rms={err1['rms']:.4e}") + print(f" amp sim={A1:.4f} ana={A_ana:.4f} phi sim={phi1:.4f} ana={phi_ana:.4f}") + print(f" BDF-2 wall={wall2:.1f}s max|err|={err2['max_abs']:.4e} rms={err2['rms']:.4e}") + print(f" amp sim={A2:.4f} ana={A_ana:.4f} phi sim={phi2:.4f} ana={phi_ana:.4f}") + + save_run( + LABEL, + params=params, + params_extra=dict( + V0=V0, omega=OMEGA, gamma_dot_0=gamma_dot_0, De=De, + t_end=T_END, dt_nominal=DT, + A_bdf1=A1, A_bdf2=A2, A_ana=A_ana, + phi_bdf1=phi1, phi_bdf2=phi2, phi_ana=phi_ana, + err_max_bdf1=err1["max_abs"], 
err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_ve_square.py b/docs/advanced/benchmarks/bench_ve_square.py new file mode 100644 index 00000000..c33280d9 --- /dev/null +++ b/docs/advanced/benchmarks/bench_ve_square.py @@ -0,0 +1,97 @@ +"""Benchmark: Maxwell viscoelastic shear under square-wave forcing. + +Drives the shear box with a square-wave :math:`V_{top}(t)` (sign flips +every ``half_period``) and compares the centre-point shear stress +against the closed-form piecewise-exponential solution. + +Closed form +----------- +Within the n-th half-period (sign :math:`s_n = (-1)^n`): + +.. math:: + \\sigma(t) = s_n \\sigma_{\\mathrm{ss}} + + (\\sigma_{0,n} - s_n\\sigma_{\\mathrm{ss}})\\, e^{-(t-t_n)/t_r} + +with :math:`\\sigma_{\\mathrm{ss}} = \\eta\\dot\\gamma_0` and +:math:`\\sigma_{0,n}` the stress at the start of half-period n. + +Run +--- +``pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_square.py`` + +Output: ``output/benchmarks/ve_square.npz``. 
+""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, t_relax, build_stokes, probe_centre, + maxwell_square_wave, save_run, error_metrics, +) + + +V0 = 0.5 +HALF_PERIOD = 2.0 # in units of t_r +N_PERIODS = 4 # → t_end = 4 · 2 · t_r = 8 t_r +DT = 0.10 # 20 steps per half-period +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +LABEL = "ve_square" + + +def _run_one(bdf_order): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes(f"{LABEL}_o{bdf_order}", params) + + times, dts, sigmas, gammas, reasons = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + n_half = int((t_cur + 0.5 * dt) / HALF_PERIOD) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur += dt + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = maxwell_square_wave(times1, params["eta"], params["mu"], + gamma_dot_0, HALF_PERIOD) + err1 = error_metrics(sig1, sigma_ana) + err2 = error_metrics(sig2, sigma_ana) + print(f"[{LABEL}] steps={len(times1)} σ_ss=η·γ̇₀={params['eta']*gamma_dot_0:.4f}") + print(f" BDF-1 wall={wall1:.1f}s max|err|={err1['max_abs']:.4e} rms={err1['rms']:.4e}") + print(f" BDF-2 wall={wall2:.1f}s max|err|={err2['max_abs']:.4e} rms={err2['rms']:.4e}") + + save_run( + LABEL, + params=params, + 
params_extra=dict( + V0=V0, half_period=HALF_PERIOD, n_periods=N_PERIODS, + gamma_dot_0=gamma_dot_0, t_end=T_END, dt_nominal=DT, + err_max_bdf1=err1["max_abs"], err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_ve_square_vardt.py b/docs/advanced/benchmarks/bench_ve_square_vardt.py new file mode 100644 index 00000000..999c03bd --- /dev/null +++ b/docs/advanced/benchmarks/bench_ve_square_vardt.py @@ -0,0 +1,114 @@ +"""Variable-dt VE square-wave benchmark. + +Same physics as :mod:`bench_ve_square` but with a non-uniform timestep +schedule: dt is reduced by a factor of 10 in a small window around each +BC flip and held at the larger value on plateaux. This tests the +projection-snapshot machinery on the exact path that previously +exhibited the implicit-projection drift (see +``tests/test_1052_VEP_stability_regression.py::test_vep_yield_lock_variable_dt``) +and confirms the same robustness on the pure-VE side. 
+ +Schedule (with ``T_{1/2} = 2 t_r`` and a window of ``±0.1 T_{1/2}`` around +each flip): + plateau dt = ``DT_PLATEAU`` + flip-window dt = ``DT_PLATEAU / 10`` + +Run:: + + pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_square_vardt.py +""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + maxwell_square_wave, save_run, error_metrics, +) + + +V0 = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +DT_PLATEAU = 0.10 # plateau dt (same as bench_ve_square) +DT_FINE_RATIO = 0.10 # flip-window dt is 0.10 × plateau +DT_FINE = DT_PLATEAU * DT_FINE_RATIO +WINDOW = 0.1 * HALF_PERIOD # ±0.20 t_r around each flip + +LABEL = "ve_square_vardt" + + +def schedule_dt(t_cur): + """Fine dt within ±WINDOW of any flip; plateau dt elsewhere.""" + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def _run_one(bdf_order): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes(f"{LABEL}_o{bdf_order}", params) + + times, dts, sigmas, gammas, reasons = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + # Don't step past a flip boundary or past T_END + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) 
+ reasons.append(int(stokes.snes.getConvergedReason())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = maxwell_square_wave(times1, params["eta"], params["mu"], + gamma_dot_0, HALF_PERIOD) + err1 = error_metrics(sig1, sigma_ana) + err2 = error_metrics(sig2, sigma_ana) + print(f"[{LABEL}] steps={len(times1)} σ_ss=η·γ̇₀={params['eta']*gamma_dot_0:.4f}") + print(f" schedule: plateau dt={DT_PLATEAU}, fine dt={DT_FINE} (×{DT_FINE_RATIO}), window=±{WINDOW}") + print(f" BDF-1 wall={wall1:.1f}s max|err|={err1['max_abs']:.4e} rms={err1['rms']:.4e}") + print(f" BDF-2 wall={wall2:.1f}s max|err|={err2['max_abs']:.4e} rms={err2['rms']:.4e}") + + save_run( + LABEL, + params=params, + params_extra=dict( + V0=V0, half_period=HALF_PERIOD, n_periods=N_PERIODS, + gamma_dot_0=gamma_dot_0, t_end=T_END, + dt_plateau=DT_PLATEAU, dt_fine=DT_FINE, dt_fine_ratio=DT_FINE_RATIO, + window=WINDOW, + err_max_bdf1=err1["max_abs"], err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_vep_square.py b/docs/advanced/benchmarks/bench_vep_square.py new file mode 100644 index 00000000..dce4177a --- /dev/null +++ b/docs/advanced/benchmarks/bench_vep_square.py @@ -0,0 +1,110 @@ +"""Benchmark: visco-elastic-plastic shear under square-wave forcing. + +Same drive as :mod:`bench_ve_square` but with Min-mode plasticity +(yield stress :math:`\\tau_y < \\eta\\dot\\gamma_0`). 
The closed-form +solution is the *clipped* version of the VE square-wave: within each +half-period the stress evolves under Maxwell exponentially toward +:math:`\\pm\\eta\\dot\\gamma_0`, but is held at :math:`\\pm\\tau_y` while the +material is yielding. When the BC reverses, the next half-period +starts from the (clipped) value :math:`\\pm\\tau_y`. + +Closed form +----------- +.. math:: + \\sigma(t) = \\mathrm{clip}\\bigl(s_n\\sigma_{\\mathrm{ss}} + + (\\sigma_{0,n} - s_n\\sigma_{\\mathrm{ss}})\\, e^{-(t-t_n)/t_r},\\, + -\\tau_y, +\\tau_y\\bigr) + +with :math:`\\sigma_{0,n} = \\mathrm{clip}(\\sigma(t_n), \\pm\\tau_y)` — +i.e.\\ each new half-period starts from the clipped value at the +previous boundary. + +Run +--- +``pixi run -e amr-dev python docs/advanced/benchmarks/bench_vep_square.py`` + +Output: ``output/benchmarks/vep_square.npz``. +""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, t_relax, build_stokes, probe_centre, + vep_square_wave, save_run, error_metrics, +) + + +V0 = 0.5 +TAU_Y = 0.5 # < η·γ̇₀ = 1, so material yields +HALF_PERIOD = 2.0 +N_PERIODS = 4 +DT = 0.10 +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +LABEL = "vep_square" + + +def _run_one(bdf_order): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes( + f"{LABEL}_o{bdf_order}", params, + yield_stress=TAU_Y, yield_mode="min", + ) + times, dts, sigmas, gammas, reasons = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + n_half = int((t_cur + 0.5 * dt) / HALF_PERIOD) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur += dt + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + 
reasons.append(int(stokes.snes.getConvergedReason())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = vep_square_wave(times1, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err1 = error_metrics(sig1, sigma_ana) + err2 = error_metrics(sig2, sigma_ana) + peak1 = float(np.abs(sig1).max()) + peak2 = float(np.abs(sig2).max()) + over1 = int((np.abs(sig1) > 1.001 * TAU_Y).sum()) + over2 = int((np.abs(sig2) > 1.001 * TAU_Y).sum()) + print(f"[{LABEL}] steps={len(times1)} τ_y={TAU_Y} η·γ̇₀={params['eta']*gamma_dot_0:.4f}") + print(f" BDF-1 wall={wall1:.1f}s peak|σ|={peak1:.4f} over={over1} max|err|={err1['max_abs']:.4e} rms={err1['rms']:.4e}") + print(f" BDF-2 wall={wall2:.1f}s peak|σ|={peak2:.4f} over={over2} max|err|={err2['max_abs']:.4e} rms={err2['rms']:.4e}") + + save_run( + LABEL, + params=params, + params_extra=dict( + V0=V0, half_period=HALF_PERIOD, n_periods=N_PERIODS, + tau_y=TAU_Y, gamma_dot_0=gamma_dot_0, t_end=T_END, dt_nominal=DT, + peak_bdf1=peak1, peak_bdf2=peak2, + n_over_bdf1=over1, n_over_bdf2=over2, + err_max_bdf1=err1["max_abs"], err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_vep_square_vardt.py b/docs/advanced/benchmarks/bench_vep_square_vardt.py new file mode 100644 index 00000000..e6f07512 --- /dev/null +++ b/docs/advanced/benchmarks/bench_vep_square_vardt.py @@ -0,0 +1,114 @@ +"""Variable-dt VEP square-wave benchmark. 
+ +VEP analogue of :mod:`bench_ve_square_vardt`. The combination — Min-mode +plasticity, sharp BC discontinuities, and a 10× dt change around each +flip — is the regime that originally exhibited the variable-dt +yield-surface drift before the projection-snapshot fix. This benchmark +verifies that, with the fix in place, the simulation hits the analytical +clipped solution to the same accuracy as a fixed-dt run. + +Run:: + + pixi run -e amr-dev python docs/advanced/benchmarks/bench_vep_square_vardt.py +""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + vep_square_wave, save_run, error_metrics, +) + + +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +DT_PLATEAU = 0.10 +DT_FINE_RATIO = 0.10 +DT_FINE = DT_PLATEAU * DT_FINE_RATIO +WINDOW = 0.1 * HALF_PERIOD + +LABEL = "vep_square_vardt" + + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def _run_one(bdf_order): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes( + f"{LABEL}_o{bdf_order}", params, + yield_stress=TAU_Y, yield_mode="min", + ) + times, dts, sigmas, gammas, reasons = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur 
= t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = vep_square_wave(times1, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err1 = error_metrics(sig1, sigma_ana) + err2 = error_metrics(sig2, sigma_ana) + peak1 = float(np.abs(sig1).max()); peak2 = float(np.abs(sig2).max()) + over1 = int((np.abs(sig1) > 1.001 * TAU_Y).sum()) + over2 = int((np.abs(sig2) > 1.001 * TAU_Y).sum()) + print(f"[{LABEL}] steps={len(times1)} τ_y={TAU_Y} η·γ̇₀={params['eta']*gamma_dot_0:.4f}") + print(f" schedule: plateau dt={DT_PLATEAU}, fine dt={DT_FINE} (×{DT_FINE_RATIO}), window=±{WINDOW}") + print(f" BDF-1 wall={wall1:.1f}s peak|σ|={peak1:.4f} over={over1} max|err|={err1['max_abs']:.4e} rms={err1['rms']:.4e}") + print(f" BDF-2 wall={wall2:.1f}s peak|σ|={peak2:.4f} over={over2} max|err|={err2['max_abs']:.4e} rms={err2['rms']:.4e}") + + save_run( + LABEL, + params=params, + params_extra=dict( + V0=V0, half_period=HALF_PERIOD, n_periods=N_PERIODS, + tau_y=TAU_Y, gamma_dot_0=gamma_dot_0, t_end=T_END, + dt_plateau=DT_PLATEAU, dt_fine=DT_FINE, dt_fine_ratio=DT_FINE_RATIO, + window=WINDOW, + peak_bdf1=peak1, peak_bdf2=peak2, + n_over_bdf1=over1, n_over_bdf2=over2, + err_max_bdf1=err1["max_abs"], err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + ) + + +if __name__ == "__main__": + main() diff --git 
a/docs/advanced/benchmarks/bench_vep_square_vardt_minres_softjac.py b/docs/advanced/benchmarks/bench_vep_square_vardt_minres_softjac.py new file mode 100644 index 00000000..f790339d --- /dev/null +++ b/docs/advanced/benchmarks/bench_vep_square_vardt_minres_softjac.py @@ -0,0 +1,166 @@ +"""VEP variable-dt with Min residual and softmin Jacobian. + +Hypothesis: at the yield kink, Newton stalls because the true Min-mode +Jacobian has a slope discontinuity, so each Newton step is throttled by +line search and DIVERGED_MAX_IT fires (despite the residual already being +below any sensible tolerance). + +Try inexact Newton: keep the residual F1 = ``2·η_min·ε̇ + BDF-history`` +(so the answer lands on the true yield surface), but autodiff a softmin +version of the same expression to build the uu / up Jacobian blocks. +The Jacobian is then continuous; Newton sees no kink; convergence +should be at-or-near 1 iteration per step. + +Counterfactual: bench_vep_square_vardt.py (same problem, full Min for +both residual and Jacobian) recorded 4/413 BDF-1 steps as +DIVERGED_MAX_IT. We expect 0 here. +""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + vep_square_wave, save_run, error_metrics, +) + + +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +DT_PLATEAU = 0.10 +DT_FINE_RATIO = 0.10 +DT_FINE = DT_PLATEAU * DT_FINE_RATIO +WINDOW = 0.1 * HALF_PERIOD +JAC_SOFTNESS = 0.1 + +LABEL = "vep_square_vardt_minres_softjac" + + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def _capture_softmin_F1(stokes, softness): + """Build the alternative F1 that uses softmin viscosity throughout. + + The current ``stokes.F1.sym`` is built from ``cm.flux`` which uses + whichever yield_mode is set on the constitutive model. 
Briefly + flip the mode to ``softmin`` to grab the alternative ``cm.flux`` + expression, then restore. + """ + cm = stokes.constitutive_model + saved_mode = cm._yield_mode + saved_softness = cm._yield_softness + try: + cm._yield_mode = "softmin" + cm._yield_softness = softness + # Replicate F1.sym = stress + penalty * div_u * I, but using + # the freshly-recomputed (softmin) stress. + soft_stress = cm.flux + F1_softmin = soft_stress + stokes.penalty * stokes.div_u * sympy.eye(stokes.mesh.dim) + finally: + cm._yield_mode = saved_mode + cm._yield_softness = saved_softness + return F1_softmin + + +def _run_one(bdf_order): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes( + f"{LABEL}_o{bdf_order}", params, + yield_stress=TAU_Y, yield_mode="min", + ) + # Inexact Newton: softmin Jacobian, Min residual. + # ``set_jacobian_F1_source`` defaults to installing the ``cp`` + # (critical-point) linesearch, which is the right pairing for an + # inexact Jacobian — the default ``bt`` rejects useful steps as + # ``DIVERGED_LINE_SEARCH`` because they don't strictly reduce the + # Min residual (only the softmin one). + F1_jac = _capture_softmin_F1(stokes, JAC_SOFTNESS) + stokes.set_jacobian_F1_source(F1_jac) + + times, dts, sigmas, gammas, reasons, iters = [], [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + # divergence_retries=0 to expose true Newton behaviour. 
+ stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=0) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + iters.append(int(stokes.snes.getIterationNumber())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), np.array(iters), + time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, its1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, its2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = vep_square_wave(times1, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err1 = error_metrics(sig1, sigma_ana) + err2 = error_metrics(sig2, sigma_ana) + peak1 = float(np.abs(sig1).max()); peak2 = float(np.abs(sig2).max()) + over1 = int((np.abs(sig1) > 1.001 * TAU_Y).sum()) + over2 = int((np.abs(sig2) > 1.001 * TAU_Y).sum()) + div1 = int((rea1 < 0).sum()) + div2 = int((rea2 < 0).sum()) + print(f"[{LABEL}] steps={len(times1)} τ_y={TAU_Y} η·γ̇₀={params['eta']*gamma_dot_0:.4f}") + print(f" schedule: plateau dt={DT_PLATEAU}, fine dt={DT_FINE} (×{DT_FINE_RATIO}), window=±{WINDOW}") + print(f" Jacobian: softmin (δ={JAC_SOFTNESS}), residual: Min") + print(f" BDF-1 wall={wall1:.1f}s peak|σ|={peak1:.4f} over={over1} " + f"max|err|={err1['max_abs']:.4e} rms={err1['rms']:.4e} " + f"diverged={div1} mean_its={its1.mean():.2f}") + print(f" BDF-2 wall={wall2:.1f}s peak|σ|={peak2:.4f} over={over2} " + f"max|err|={err2['max_abs']:.4e} rms={err2['rms']:.4e} " + f"diverged={div2} mean_its={its2.mean():.2f}") + + save_run( + LABEL, + params=params, + params_extra=dict( + V0=V0, half_period=HALF_PERIOD, n_periods=N_PERIODS, + tau_y=TAU_Y, gamma_dot_0=gamma_dot_0, t_end=T_END, + dt_plateau=DT_PLATEAU, dt_fine=DT_FINE, dt_fine_ratio=DT_FINE_RATIO, + window=WINDOW, 
jac_softness=JAC_SOFTNESS, + peak_bdf1=peak1, peak_bdf2=peak2, + n_over_bdf1=over1, n_over_bdf2=over2, + n_diverged_bdf1=div1, n_diverged_bdf2=div2, + mean_its_bdf1=float(its1.mean()), mean_its_bdf2=float(its2.mean()), + err_max_bdf1=err1["max_abs"], err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + snes_iters_bdf1=its1, snes_iters_bdf2=its2, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/bench_vep_square_vardt_softmin.py b/docs/advanced/benchmarks/bench_vep_square_vardt_softmin.py new file mode 100644 index 00000000..9e02b2d4 --- /dev/null +++ b/docs/advanced/benchmarks/bench_vep_square_vardt_softmin.py @@ -0,0 +1,113 @@ +"""Variable-dt VEP square-wave benchmark — *softmin* yield mode. + +Counterpart to bench_vep_square_vardt (Min) and +bench_vep_square_vardt_smooth (smooth blend). Same dt schedule and +forcing; the only change is ``yield_mode = "softmin"`` with the +default softness δ = 0.1. + +Softmin replaces ``Min(η_ve, η_pl)`` with the smooth approximation + + η_eff = η_ve / g(f), g(f) = 1 + (f − 1 + √((f−1)² + δ²))/2 − offset + +which converges to true Min as δ → 0. Default δ = 0.1 keeps continuous +derivatives at the yield kink while staying close to Min in magnitude. 
+""" + +import time +import numpy as np +import sympy +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + vep_square_wave, save_run, error_metrics, +) + + +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +DT_PLATEAU = 0.10 +DT_FINE_RATIO = 0.10 +DT_FINE = DT_PLATEAU * DT_FINE_RATIO +WINDOW = 0.1 * HALF_PERIOD + +LABEL = "vep_square_vardt_softmin" + + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def _run_one(bdf_order): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = bdf_order + mesh, stokes, V_top, params = build_stokes( + f"{LABEL}_o{bdf_order}", params, + yield_stress=TAU_Y, yield_mode="softmin", + ) + times, dts, sigmas, gammas, reasons = [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + return (np.array(times), np.array(dts), np.array(sigmas), + np.array(gammas), np.array(reasons), time.time() - t0, params) + + +def main(): + times1, dts1, sig1, gam1, rea1, wall1, params = _run_one(1) + times2, dts2, sig2, gam2, rea2, wall2, _ = _run_one(2) + assert np.allclose(times1, times2) + gamma_dot_0 = 2.0 * V0 / params["H"] + 
sigma_ana_min = vep_square_wave(times1, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err1 = error_metrics(sig1, sigma_ana_min) + err2 = error_metrics(sig2, sigma_ana_min) + peak1 = float(np.abs(sig1).max()); peak2 = float(np.abs(sig2).max()) + print(f"[{LABEL}] steps={len(times1)} τ_y={TAU_Y} η·γ̇₀={params['eta']*gamma_dot_0:.4f}") + print(f" schedule: plateau dt={DT_PLATEAU}, fine dt={DT_FINE} (×{DT_FINE_RATIO}), window=±{WINDOW}") + print(f" yield_mode = softmin (default δ = 0.1)") + print(f" BDF-1 wall={wall1:.1f}s peak|σ|={peak1:.4f} ({100*peak1/TAU_Y:.1f}% of τ_y) max|err vs Min-clip|={err1['max_abs']:.4e} rms={err1['rms']:.4e}") + print(f" BDF-2 wall={wall2:.1f}s peak|σ|={peak2:.4f} ({100*peak2/TAU_Y:.1f}% of τ_y) max|err vs Min-clip|={err2['max_abs']:.4e} rms={err2['rms']:.4e}") + + save_run( + LABEL, + params=params, + params_extra=dict( + V0=V0, half_period=HALF_PERIOD, n_periods=N_PERIODS, + tau_y=TAU_Y, gamma_dot_0=gamma_dot_0, t_end=T_END, + dt_plateau=DT_PLATEAU, dt_fine=DT_FINE, dt_fine_ratio=DT_FINE_RATIO, + window=WINDOW, yield_mode="softmin", yield_softness=0.1, + peak_bdf1=peak1, peak_bdf2=peak2, + err_max_bdf1=err1["max_abs"], err_rms_bdf1=err1["rms"], + err_max_bdf2=err2["max_abs"], err_rms_bdf2=err2["rms"], + wall_bdf1=wall1, wall_bdf2=wall2, + ), + times=times1, dts=dts1, gamma_dot=gam1, sigma_ana=sigma_ana_min, + sigma_bdf1=sig1, sigma_bdf2=sig2, + snes_reasons_bdf1=rea1, snes_reasons_bdf2=rea2, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/check_saved_data.py b/docs/advanced/benchmarks/check_saved_data.py new file mode 100644 index 00000000..66c3b55d --- /dev/null +++ b/docs/advanced/benchmarks/check_saved_data.py @@ -0,0 +1,85 @@ +"""Verify that the on-disk benchmark npz files contain everything +needed for any plot we'd want — without re-running the simulations. 
+ +Lists keys in each .npz and asserts the per-config files have +``sigma_bdf1``, ``sigma_bdf2``, and ``sigma_ana`` (so the BDF-1 vs +BDF-2 overlay is reproducible from saved data alone), and that the +convergence file has ``trace_*`` arrays for every (order, dt) pair +recorded in the metrics arrays (so any per-run trace from the +convergence sweep is reproducible too). + +Run:: + + pixi run -e amr-dev python docs/advanced/benchmarks/check_saved_data.py +""" + +import os +import sys +import numpy as np +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from _bench_helpers import OUTPUT_DIR, load_run + + +def _list_npz_keys(name): + path = f"{OUTPUT_DIR}/{name}.npz" + if not os.path.exists(path): + return None + with np.load(path, allow_pickle=True) as f: + return list(f.keys()) + + +def _check_per_case(name): + arrays, params, extra = load_run(name) + needed_arrays = {"times", "dts", "gamma_dot", "sigma_ana", + "sigma_bdf1", "sigma_bdf2"} + have = set(arrays.keys()) + missing = needed_arrays - have + print(f"\n[{name}.npz]") + print(f" arrays: {sorted(have)}") + print(f" params keys: {sorted(params.keys())}") + print(f" extra keys: {sorted(extra.keys())}") + if missing: + print(f" MISSING: {sorted(missing)}") + return False + print(" OK — has both BDF traces + analytical reference") + return True + + +def _check_convergence(name): + arrays, params, extra = load_run(name) + n_runs = len(arrays["order"]) + print(f"\n[{name}.npz]") + print(f" metrics arrays: order, dt, n_steps, max_abs, rms, wall ({n_runs} runs)") + expected_traces = [] + for order, dt in zip(arrays["order"], arrays["dt"]): + tag = f"o{int(order)}_dt{float(dt):.4f}" + expected_traces += [f"trace_t_{tag}", f"trace_sigma_{tag}", f"trace_ana_{tag}"] + have = set(arrays.keys()) + missing = [t for t in expected_traces if t not in have] + print(f" expected {len(expected_traces)} trace arrays; have {len(expected_traces) - len(missing)}") + if missing: + print(f" MISSING: {missing[:6]}{' …' if 
len(missing) > 6 else ''}") + return False + print(" OK — every (order, dt) trace is on disk") + return True + + +def main(): + ok = True + for name in ("ve_harmonic", "ve_square", "vep_square"): + if _list_npz_keys(name) is None: + print(f"\n[{name}.npz] not on disk — skipping") + continue + ok = _check_per_case(name) and ok + for name in ("convergence_ve_harmonic", "convergence_ve_square", + "convergence_vep_square"): + if _list_npz_keys(name) is None: + print(f"\n[{name}.npz] not on disk — skipping") + continue + ok = _check_convergence(name) and ok + print("\n=== overall:", "OK" if ok else "FAIL", "===") + sys.exit(0 if ok else 1) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/index.md b/docs/advanced/benchmarks/index.md index 56ee9ae0..cc987bad 100644 --- a/docs/advanced/benchmarks/index.md +++ b/docs/advanced/benchmarks/index.md @@ -4,11 +4,58 @@ title: "Benchmarks" # Solver Benchmarks -Validation benchmarks comparing Underworld3 solvers against analytical solutions. +Validation benchmarks comparing Underworld3 solvers against +closed-form analytical solutions. Each benchmark has three pieces: + +* a `bench_*.py` runner that solves the problem and writes a + self-contained `.npz` log to `output/benchmarks/`, +* `plot_benchmarks.py` that reads the logs and produces consistent-style + figures in `docs/advanced/figures/`, +* a Markdown page (this section) that documents the governing + equation, the closed-form solution, the test setup, and the result. + +The runner and the plotter are deliberately decoupled: each runner +saves the per-step trace, both BDF orders, the analytical reference, +and the parameter dict in one self-contained file; re-running the +plot script to tweak style does not re-run the (slow) simulation. 
+A separate `bench_convergence.py` runs each case at a sweep of +timestep sizes (and both BDF orders) and saves all per-run traces so +the convergence figure and any per-(order, dt) replot are equally +reproducible from saved data. + +## Workflow + +```bash +# Run a single per-case benchmark (both BDF orders, ~3-6 min each) +pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_harmonic.py +pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_square.py +pixi run -e amr-dev python docs/advanced/benchmarks/bench_vep_square.py + +# Run the convergence sweep (~30 min, all dts × both orders × all cases) +pixi run -e amr-dev python docs/advanced/benchmarks/bench_convergence.py + +# Replot from saved data — does NOT re-run simulations +pixi run -e amr-dev python docs/advanced/benchmarks/plot_benchmarks.py + +# Verify the on-disk data is complete (used as a sanity check before +# claiming a benchmark suite is "done") +pixi run -e amr-dev python docs/advanced/benchmarks/check_saved_data.py +``` + +## Cases ```{toctree} :maxdepth: 1 -ve-oscillatory-shear -ve-square-wave-shear +ve-harmonic +ve-square +vep-square +vardt-square ``` + +| Case | Driving | Closed form | What it tests | +|---|---|---|---| +| `ve-harmonic` | $V_{\mathrm{top}} = V_0\cos(\omega t + \varphi)$ | $A_\infty\cos\omega t$ | amplitude attenuation, phase lag, peak-start IC | +| `ve-square` | square-wave $V_{\mathrm{top}}$ | piecewise exponential | BDF history at BC discontinuities | +| `vep-square` | square-wave with yield | clipped Maxwell square-wave | Min-mode plasticity, projection-snapshot fix | +| `vardt-square` | square-wave + reduced $\Delta t$ near flips | same as `ve-square` / `vep-square` | snapshot machinery under variable timestep | diff --git a/docs/advanced/benchmarks/jit_cache_vs_recompile.py b/docs/advanced/benchmarks/jit_cache_vs_recompile.py index e98ff3e3..854b9c4a 100644 --- a/docs/advanced/benchmarks/jit_cache_vs_recompile.py +++ 
b/docs/advanced/benchmarks/jit_cache_vs_recompile.py @@ -42,10 +42,10 @@ def main(): v = uw.discretisation.MeshVariable("V", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes( - mesh, velocityField=v, pressureField=p, order=2 + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, ) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel stokes.constitutive_model.Parameters.shear_viscosity_0 = 1.0 stokes.constitutive_model.Parameters.shear_modulus = 1.0 diff --git a/docs/advanced/benchmarks/plot_benchmarks.py b/docs/advanced/benchmarks/plot_benchmarks.py new file mode 100644 index 00000000..42be4537 --- /dev/null +++ b/docs/advanced/benchmarks/plot_benchmarks.py @@ -0,0 +1,371 @@ +"""Plot the VE/VEP benchmark results from on-disk ``.npz`` files. + +Reads ``output/benchmarks/{ve_harmonic,ve_square,vep_square}.npz`` and +produces three figures in ``docs/advanced/figures/``. Style is shared +across the three so the plots can be compared directly. + +Each figure has the same layout: + + Top panel: σ_xy(t) — simulation markers, analytical solid line, + ±τ_y guide for the VEP case, light-blue filled driving + term γ̇(t) for context (rescaled to fit beside σ). + Middle panel: |error| log-scale. + Bottom panel: dt(t) (relevant once we add variable-dt benchmarks). 
+ +Run after one or more of ``bench_*.py`` have produced their npz: + + pixi run -e amr-dev python docs/advanced/benchmarks/plot_benchmarks.py +""" + +import os +import numpy as np +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from _bench_helpers import load_run, FIG_DIR + + +# Shared style --------------------------------------------------------- +plt.rcParams.update({ + "figure.figsize": (11, 8), + "axes.grid": True, + "grid.alpha": 0.3, + "axes.titlesize": 11, + "axes.labelsize": 11, + "legend.fontsize": 9, + "legend.framealpha": 0.92, +}) + +C_BDF1 = "#1F77B4" # blue circles — BDF-1 +C_BDF2 = "#D62728" # red squares — BDF-2 +C_ANA = "black" # solid black — analytical +C_DRIVE = "#1F77B4" # light blue — driving (filled) +C_ERR_BDF1 = "#1F77B4" +C_ERR_BDF2 = "#D62728" +C_DT = "#2CA02C" # green — dt +C_YIELD = "grey" + + +def _plot_three_panel(name, t_ana_grid, sigma_ana_grid, info, *, tau_y=None): + """Common three-panel layout used by all three benchmarks. + + Reads BDF-1 + BDF-2 traces from one npz and overlays both. The + npz's per-step arrays are: ``sigma_bdf1``, ``sigma_bdf2``, + ``sigma_ana``. + """ + arrays, params, extra = info + times = arrays["times"] + sigma_ana = arrays["sigma_ana"] + dts = arrays["dts"] + gamma_dot = arrays["gamma_dot"] + sigma_bdf1 = arrays["sigma_bdf1"] + sigma_bdf2 = arrays["sigma_bdf2"] + + fig, (ax_top, ax_err, ax_dt) = plt.subplots( + 3, 1, sharex=True, + gridspec_kw={"height_ratios": [3.5, 1.5, 1.0]}, + ) + + # If this is a variable-dt run, derive the fine-dt windows from + # the saved dt array (any step with dt < 0.5*max(dt) is "fine"), + # and shade those regions on every panel so the schedule is + # visually unambiguous against the marker density. 
+ fine_thresh = 0.5 * float(np.max(dts)) + fine_mask = dts < fine_thresh + fine_windows = [] # list of (t_start, t_end) + if fine_mask.any() and not fine_mask.all(): + in_window = False + w_start = None + for i, (t, is_fine) in enumerate(zip(times, fine_mask)): + t_start = t - dts[i] + if is_fine and not in_window: + w_start = t_start + in_window = True + elif not is_fine and in_window: + fine_windows.append((w_start, times[i - 1])) + in_window = False + if in_window: + fine_windows.append((w_start, times[-1])) + for (a, b) in fine_windows: + for ax in (ax_top, ax_err, ax_dt): + ax.axvspan(a, b, color="0.85", alpha=0.5, linewidth=0, zorder=0) + + # --- Top: σ(t) + sigma_max = float(np.max(np.abs(sigma_ana_grid))) or 1.0 + gamma_max = float(np.max(np.abs(gamma_dot))) or 1.0 + drive_scale = 0.5 * sigma_max / gamma_max + ax_top.fill_between( + times, 0.0, drive_scale * gamma_dot, + color=C_DRIVE, alpha=0.18, linewidth=0, + label=fr"driving $\dot\gamma(t)$ (×{drive_scale:.2f})", + ) + ax_top.plot(t_ana_grid, sigma_ana_grid, "-", color=C_ANA, lw=1.4, + label="analytical") + ax_top.plot(times, sigma_bdf1, "o", color=C_BDF1, ms=4.2, alpha=0.78, + mec=C_BDF1, mfc="white", mew=1.3, + label="BDF-1") + ax_top.plot(times, sigma_bdf2, "s", color=C_BDF2, ms=3.8, alpha=0.85, + label="BDF-2") + if tau_y is not None: + ax_top.axhline(+tau_y, color=C_YIELD, ls="--", lw=0.9, alpha=0.7, + label=fr"$\pm\tau_y$ = $\pm${tau_y:g}") + ax_top.axhline(-tau_y, color=C_YIELD, ls="--", lw=0.9, alpha=0.7) + ax_top.axhline(0, color="grey", lw=0.4, alpha=0.4) + ax_top.set_ylabel(r"$\sigma_{xy}$") + + # Title with the headline numbers per order + bits = [name] + if "err_max_bdf1" in extra: + bits.append(fr"BDF-1 max|err|={extra['err_max_bdf1']:.2e}") + if "err_max_bdf2" in extra: + bits.append(fr"BDF-2 max|err|={extra['err_max_bdf2']:.2e}") + if "De" in extra: + bits.append(fr"De={extra['De']:.3f}") + if "tau_y" in extra: + bits.append(fr"$\tau_y={extra['tau_y']:g}$") + ax_top.set_title(" 
".join(bits)) + ax_top.legend(loc="lower right", ncol=2) + + # --- Middle: |sigma − sigma_ana| for both orders + err1 = np.abs(sigma_bdf1 - sigma_ana) + err2 = np.abs(sigma_bdf2 - sigma_ana) + eps = 1e-9 + ax_err.semilogy(times, np.maximum(err1, eps), "-", color=C_ERR_BDF1, + lw=0.8, marker="o", ms=2.8, mec=C_ERR_BDF1, mfc="white", + label="BDF-1") + ax_err.semilogy(times, np.maximum(err2, eps), "-", color=C_ERR_BDF2, + lw=0.8, marker="s", ms=2.8, label="BDF-2") + ax_err.set_ylabel(r"$|\sigma_{\mathrm{sim}} - \sigma_{\mathrm{ana}}|$") + ax_err.legend(loc="upper right", ncol=2, fontsize=8) + ax_err.set_ylim(bottom=eps * 0.9) + + # --- Bottom: dt + ax_dt.step(times, dts, where="post", color=C_DT, lw=1.1) + ax_dt.set_xlabel(r"Time $t / t_r$") + ax_dt.set_ylabel(r"$\Delta t$") + ax_dt.set_ylim(0.0, max(dts) * 1.1) + + plt.tight_layout() + return fig + + +def plot_ve_harmonic(): + arrays, params, extra = load_run("ve_harmonic") + eta, mu = params["eta"], params["mu"] + omega = extra["omega"] + De = omega * eta / mu + gd0 = extra["gamma_dot_0"] + A_inf = eta * gd0 / np.sqrt(1.0 + De**2) + # Fine analytical grid for the smooth curve. The bench uses the + # peak-start IC: V_top(t) = V_0·cos(ωt + φ) with φ = arctan(De), so + # the steady-state σ_ss(t) = A_∞·cos(ωt). No transient. 
+ t_grid = np.linspace(0, arrays["times"][-1], 2000) + sigma_grid = A_inf * np.cos(omega * t_grid) + fig = _plot_three_panel("VE harmonic", t_grid, sigma_grid, + (arrays, params, extra)) + ax_top = fig.axes[0] + info = ( + f"Amplitude ana={extra['A_ana']:.4f}\n" + f" BDF-1={extra['A_bdf1']:.4f} BDF-2={extra['A_bdf2']:.4f}\n" + f"Phase lag ana={extra['phi_ana']:.4f}\n" + f" BDF-1={extra['phi_bdf1']:.4f} BDF-2={extra['phi_bdf2']:.4f}" + ) + ax_top.text(0.02, 0.97, info, transform=ax_top.transAxes, + ha="left", va="top", + fontsize=8.5, family="monospace", + bbox=dict(facecolor="white", edgecolor="0.7", alpha=0.92, + boxstyle="round,pad=0.4")) + out = f"{FIG_DIR}/bench_ve_harmonic.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def plot_ve_square(): + arrays, params, extra = load_run("ve_square") + eta, mu = params["eta"], params["mu"] + half_period = extra["half_period"] + gd0 = extra["gamma_dot_0"] + from _bench_helpers import maxwell_square_wave + t_grid = np.linspace(0, arrays["times"][-1], 2000) + sigma_grid = maxwell_square_wave(t_grid, eta, mu, gd0, half_period) + fig = _plot_three_panel("VE square wave", t_grid, sigma_grid, + (arrays, params, extra)) + out = f"{FIG_DIR}/bench_ve_square.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def plot_vep_square(): + arrays, params, extra = load_run("vep_square") + eta, mu = params["eta"], params["mu"] + half_period = extra["half_period"] + tau_y = extra["tau_y"] + gd0 = extra["gamma_dot_0"] + from _bench_helpers import vep_square_wave + t_grid = np.linspace(0, arrays["times"][-1], 2000) + sigma_grid = vep_square_wave(t_grid, eta, mu, gd0, tau_y, half_period) + fig = _plot_three_panel("VEP square wave (Min mode)", t_grid, sigma_grid, + (arrays, params, extra), tau_y=tau_y) + out = f"{FIG_DIR}/bench_vep_square.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def 
plot_ve_square_vardt(): + arrays, params, extra = load_run("ve_square_vardt") + eta, mu = params["eta"], params["mu"] + half_period = extra["half_period"] + gd0 = extra["gamma_dot_0"] + from _bench_helpers import maxwell_square_wave + t_grid = np.linspace(0, arrays["times"][-1], 4000) + sigma_grid = maxwell_square_wave(t_grid, eta, mu, gd0, half_period) + title = ( + f"VE square wave — variable dt " + fr"($\Delta t_{{\rm plat}}={extra['dt_plateau']:g}$, " + fr"$\Delta t_{{\rm fine}}={extra['dt_fine']:g}$, ±{extra['window']:g})" + ) + fig = _plot_three_panel(title, t_grid, sigma_grid, (arrays, params, extra)) + out = f"{FIG_DIR}/bench_ve_square_vardt.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def plot_vep_square_vardt(): + arrays, params, extra = load_run("vep_square_vardt") + eta, mu = params["eta"], params["mu"] + half_period = extra["half_period"] + tau_y = extra["tau_y"] + gd0 = extra["gamma_dot_0"] + from _bench_helpers import vep_square_wave + t_grid = np.linspace(0, arrays["times"][-1], 4000) + sigma_grid = vep_square_wave(t_grid, eta, mu, gd0, tau_y, half_period) + title = ( + f"VEP square wave (Min) — variable dt " + fr"($\Delta t_{{\rm plat}}={extra['dt_plateau']:g}$, " + fr"$\Delta t_{{\rm fine}}={extra['dt_fine']:g}$, ±{extra['window']:g})" + ) + fig = _plot_three_panel(title, t_grid, sigma_grid, + (arrays, params, extra), tau_y=tau_y) + out = f"{FIG_DIR}/bench_vep_square_vardt.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def plot_vep_square_vardt_softmin(): + arrays, params, extra = load_run("vep_square_vardt_softmin") + eta, mu = params["eta"], params["mu"] + half_period = extra["half_period"] + tau_y = extra["tau_y"] + gd0 = extra["gamma_dot_0"] + from _bench_helpers import vep_square_wave + t_grid = np.linspace(0, arrays["times"][-1], 4000) + sigma_grid = vep_square_wave(t_grid, eta, mu, gd0, tau_y, half_period) + title = ( + fr"VEP 
square wave (softmin, $\delta=0.1$) — variable dt " + fr"($\Delta t_{{\rm plat}}={extra['dt_plateau']:g}$, " + fr"$\Delta t_{{\rm fine}}={extra['dt_fine']:g}$, ±{extra['window']:g})" + ) + fig = _plot_three_panel(title, t_grid, sigma_grid, + (arrays, params, extra), tau_y=tau_y) + out = f"{FIG_DIR}/bench_vep_square_vardt_softmin.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def plot_vep_square_vardt_smooth(): + arrays, params, extra = load_run("vep_square_vardt_smooth") + eta, mu = params["eta"], params["mu"] + half_period = extra["half_period"] + tau_y = extra["tau_y"] + gd0 = extra["gamma_dot_0"] + from _bench_helpers import vep_square_wave + t_grid = np.linspace(0, arrays["times"][-1], 4000) + # Reference shown is the Min-clipped solution — useful as a guide, + # but smooth mode is expected to under-clip below ±τ_y. + sigma_grid = vep_square_wave(t_grid, eta, mu, gd0, tau_y, half_period) + title = ( + f"VEP square wave (smooth) — variable dt " + fr"($\Delta t_{{\rm plat}}={extra['dt_plateau']:g}$, " + fr"$\Delta t_{{\rm fine}}={extra['dt_fine']:g}$, ±{extra['window']:g})" + ) + fig = _plot_three_panel(title, t_grid, sigma_grid, + (arrays, params, extra), tau_y=tau_y) + out = f"{FIG_DIR}/bench_vep_square_vardt_smooth.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +def plot_convergence(): + """Three-panel log-log convergence plot, one panel per case.""" + fig, axes = plt.subplots(1, 3, figsize=(14, 4.5), sharey=True) + cases = [ + ("convergence_ve_harmonic", "VE harmonic"), + ("convergence_ve_square", "VE square wave"), + ("convergence_vep_square", "VEP square wave"), + ] + for ax, (name, title) in zip(axes, cases): + try: + arrays, params, extra = load_run(name) + except FileNotFoundError: + ax.set_title(f"{title} — no data") + continue + order = arrays["order"]; dt = arrays["dt"] + max_abs = arrays["max_abs"]; rms = arrays["rms"] + + for o, marker, 
lbl_color in [(1, "o", C_BDF1), (2, "s", C_BDF2)]: + mask = order == o + if not mask.any(): + continue + d = dt[mask]; e = max_abs[mask]; r = rms[mask] + ax.loglog(d, e, marker=marker, color=lbl_color, ms=7, + lw=1.5, label=fr"BDF-{o} max$|\,\mathrm{{err}}\,|$") + ax.loglog(d, r, marker=marker, color=lbl_color, ms=5, + lw=1.0, ls=":", alpha=0.7, + label=fr"BDF-{o} rms") + + # Reference slopes — a guide line through the smallest-dt BDF-2 max-abs + if (order == 2).any(): + mask2 = order == 2 + d_ref = float(dt[mask2].min()) + e_ref = float(max_abs[mask2][np.argmin(dt[mask2])]) + d_grid = np.array([dt.min() * 0.7, dt.max() * 1.3]) + ax.loglog(d_grid, e_ref * (d_grid / d_ref) ** 2, + "k--", lw=0.8, alpha=0.5, label=r"slope 2") + ax.loglog(d_grid, e_ref * (d_grid / d_ref) ** 1, + "k:", lw=0.8, alpha=0.5, label=r"slope 1") + ax.set_title(title) + ax.set_xlabel(r"$\Delta t$") + ax.grid(True, which="both", alpha=0.3) + ax.legend(loc="lower right", fontsize=8) + + axes[0].set_ylabel("error") + plt.tight_layout() + out = f"{FIG_DIR}/bench_convergence.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f" wrote {out}") + + +if __name__ == "__main__": + os.makedirs(FIG_DIR, exist_ok=True) + for plotter, name in [ + (plot_ve_harmonic, "ve_harmonic"), + (plot_ve_square, "ve_square"), + (plot_vep_square, "vep_square"), + (plot_ve_square_vardt, "ve_square_vardt"), + (plot_vep_square_vardt, "vep_square_vardt"), + (plot_vep_square_vardt_softmin, "vep_square_vardt_softmin"), + (plot_vep_square_vardt_smooth, "vep_square_vardt_smooth"), + (plot_convergence, "convergence"), + ]: + try: + plotter() + except FileNotFoundError: + print(f" skipping {name} — no .npz on disk") diff --git a/docs/advanced/benchmarks/plot_ti_vep_harmonic.py b/docs/advanced/benchmarks/plot_ti_vep_harmonic.py new file mode 100644 index 00000000..6edd607b --- /dev/null +++ b/docs/advanced/benchmarks/plot_ti_vep_harmonic.py @@ -0,0 +1,122 @@ +"""Plot TI-VEP harmonic angled-fault benchmark 
traces from saved npz. + +Reads the σ=0-IC results saved by ``bench_ti_vep_harmonic_zeroIC.py`` +and produces one combined figure showing global σ_xy and resolved +fault-plane shear over time, for the three fault angles +(θ ∈ {0°, +15°, -15°}) and two yield stresses (τ_y ∈ {0.15, 0.30}). + +Output: ``docs/advanced/figures/bench_ti_vep_harmonic.png`` +""" + +import os +import numpy as np +import matplotlib +if not os.environ.get('DISPLAY') and not os.environ.get('WAYLAND_DISPLAY'): + matplotlib.use('Agg') +import matplotlib.pyplot as plt + +from _bench_helpers import OUTPUT_DIR, FIG_DIR + + +ANGLES = (0.0, 15.0, -15.0) +TAU_YS = (0.15, 0.30) + + +def _load(theta_deg, tau_y): + tag = f"ti_vep_harmonic_zIC_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace(".", "p") + return np.load(os.path.join(OUTPUT_DIR, f"{tag}.npz")) + + +def main(): + fig, axes = plt.subplots( + len(TAU_YS), len(ANGLES), + figsize=(15, 8), sharex=True, sharey='row', + ) + + for row, ty in enumerate(TAU_YS): + for col, theta in enumerate(ANGLES): + ax = axes[row, col] + d = _load(theta, ty) + t = d['times'] + sxy_1 = d['sigma_xy_bdf1'] + sxy_2 = d['sigma_xy_bdf2'] + tres_1 = d['tau_resolved_bdf1'] + tres_2 = d['tau_resolved_bdf2'] + + # Reconstruct V_top(t) and the VE-no-yield envelope from + # saved scalars. Note: the bench's saved ``sigma_ve`` was + # computed with γ̇_0 = 2·V_0/H — wrong for these BCs (Top + # moves, Bottom fixed → γ̇_0 = V_0/H). Recompute here. 
+ V0 = float(d['V0']) + omega = float(d['OMEGA']) + eta_1 = float(d['ETA_1']); mu = float(d['MU']) + t_r = eta_1 / mu + De = omega * t_r + phi = float(np.arctan(De)) + H = 1.0 # domain height in the bench + gamma_dot_0 = V0 / H + A_inf = 2.0 * eta_1 * (gamma_dot_0 / 2.0) / np.sqrt(1.0 + De**2) + # ↑ σ = 2η ε̇ ↑ ε̇ = γ̇/2 (tensor strain rate) + sigma_ve = A_inf * np.cos(omega * t) + v_top = V0 * np.cos(omega * t + phi) + + # Light-blue filled driving overlay, rescaled to half-peak σ + sig_max = max(float(np.abs(sxy_1).max()), + float(np.abs(sigma_ve).max())) or 1.0 + drive_scale = 0.5 * sig_max / V0 + ax.fill_between( + t, 0.0, drive_scale * v_top, + color="#1F77B4", alpha=0.18, linewidth=0, + label=fr"driving $V_{{\rm top}}(t)$ (×{drive_scale:.2f})", + ) + + # VE no-yield envelope (light grey, dashed) + ax.plot(t, sigma_ve, ':', color='0.4', linewidth=1, + label=r'VE (no yield)') + + # τ_y guidelines + ax.axhline(+ty, color='gray', linestyle=':', alpha=0.6, + linewidth=1, label=rf'$\pm\tau_y={ty}$') + ax.axhline(-ty, color='gray', linestyle=':', alpha=0.6, + linewidth=1) + + # Global σ_xy (BDF-1 line, BDF-2 dots) + ax.plot(t, sxy_1, '-', color='steelblue', linewidth=1.4, + alpha=0.8, label=r'$\sigma_{xy}$ (BDF-1)') + ax.plot(t, sxy_2, 'o', color='steelblue', markersize=2, + markerfacecolor='none', markeredgewidth=0.6, + label=r'$\sigma_{xy}$ (BDF-2)') + + # Resolved fault-plane shear + ax.plot(t, tres_1, '-', color='crimson', linewidth=1.4, + alpha=0.9, label=r'$\tau_{\rm resolved}$ (BDF-1)') + ax.plot(t, tres_2, 's', color='crimson', markersize=2, + markerfacecolor='none', markeredgewidth=0.6, + label=r'$\tau_{\rm resolved}$ (BDF-2)') + + ax.set_title(rf'$\theta = {theta:+.0f}°,\;\tau_y = {ty}$', + fontsize=11) + ax.grid(True, alpha=0.3) + if row == len(TAU_YS) - 1: + ax.set_xlabel(r'Time $t/t_r$') + if col == 0: + ax.set_ylabel(r'Stress') + if row == 0 and col == len(ANGLES) - 1: + ax.legend(fontsize=8, loc='upper right', framealpha=0.9) + + fig.suptitle( + 
"TI-VEP harmonic shear with embedded fault — " + r"$V_{\rm top}(t) = V_0\cos(\omega t + \varphi)$, " + r"$V_0 = 0.5$, $\omega = \pi/2$, $\eta = \mu = 1$, " + r"$\Delta t = 0.05$", + fontsize=12, y=0.995, + ) + + fig.tight_layout(rect=[0, 0, 1, 0.97]) + out_path = os.path.join(FIG_DIR, "bench_ti_vep_harmonic.png") + fig.savefig(out_path, dpi=150) + print(f" wrote {out_path}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/probe_constitutive_eval.py b/docs/advanced/benchmarks/probe_constitutive_eval.py new file mode 100644 index 00000000..7b03a22a --- /dev/null +++ b/docs/advanced/benchmarks/probe_constitutive_eval.py @@ -0,0 +1,162 @@ +"""Evaluate the constitutive expressions DIRECTLY at the centre, using +specified pre-solve psi_star values. No FE solve involved — just plug +numbers into the symbolic stress() and viscosity formulas. If the +formulas give σ = τ_y under Min mode, the formulas are right. If the +SIM gives σ ≠ τ_y at the same input state, the bug is in solve/project +not in the formulas. 
+""" + +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +# Build a fresh stokes problem +mesh = uw.meshing.StructuredQuadBox(elementRes=(16, 8), + minCoords=(-1, -0.5), maxCoords=(1, 0.5)) +v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) +p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) +stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, +) +stokes.constitutive_model.Parameters.shear_viscosity_0 = 1.0 +stokes.constitutive_model.Parameters.shear_modulus = 1.0 +stokes.constitutive_model.Parameters.yield_stress = 0.5 +stokes.constitutive_model.Parameters.strainrate_inv_II_min = 1.0e-6 +stokes.constitutive_model._yield_mode = "min" +cm = stokes.constitutive_model +V_top = expression(R"V_{top}", sympy.Float(0.5), "Top V") +stokes.add_dirichlet_bc((V_top, 0.0), "Top") +stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") +stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") +stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") +stokes.tolerance = 1.0e-6 +stokes.petsc_options["snes_force_iteration"] = True + +# Solve a few steps at dt=0.20 to populate u to uniform shear at yield +for _ in range(5): + cm.Parameters.dt_elastic = 0.20 + stokes.solve(zero_init_guess=False, timestep=0.20, divergence_retries=1) + +centre = np.array([[0.0, 0.0]]) +print(f"\nAfter coarse warm-up:") +print(f" σ at centre = {float(uw.function.evaluate(stokes.tau.sym[0,1], centre).flatten()[0]):.4f}") + +# === EXPERIMENT: directly inject specific psi_star values, then EVALUATE +# the constitutive law without solving. We'll then compare with an +# actual solve at the same state. 
+ +# Set ψ*[0] = 0.5 (yielded), ψ*[1] = 0.4268 (pre-yield from coarse step) +stokes.DFDt.psi_star[0].array[:] = 0 +stokes.DFDt.psi_star[0].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[0].array[:, 1, 0] = 0.5 +stokes.DFDt.psi_star[1].array[:] = 0 +stokes.DFDt.psi_star[1].array[:, 0, 1] = 0.4268 +stokes.DFDt.psi_star[1].array[:, 1, 0] = 0.4268 +# Force dt_history[0] = 0.20 so that the next step at dt=0.10 is "halving" +stokes.DFDt._dt_history[0] = 0.20 +stokes.DFDt._dt_history[1] = 0.20 + +# Set dt = 0.10 and update BDF coefficients +cm.Parameters.dt_elastic = 0.10 +cm._update_bdf_coefficients() + +print(f"\nAfter setting state:") +print(f" ψ*[0] at centre = {float(uw.function.evaluate(stokes.DFDt.psi_star[0].sym[0,1], centre).flatten()[0]):.4f}") +print(f" ψ*[1] at centre = {float(uw.function.evaluate(stokes.DFDt.psi_star[1].sym[0,1], centre).flatten()[0]):.4f}") +print(f" dt_h[0] = {stokes.DFDt._dt_history[0]}, dt_e = {float(cm.Parameters.dt_elastic.sym):.3f}") +print(f" c_0 = {float(cm._bdf_c0.sym):.4f}") +print(f" c_1 = {float(cm._bdf_c1.sym):.4f}") +print(f" c_2 = {float(cm._bdf_c2.sym):.4f}") + +# Now WITHOUT solving, evaluate the symbolic formulas at the centre. 
+print("\n=== Direct evaluation of symbolic constitutive formulas ===") +print("(no FE solve; using the velocity field from the warm-up which is uniform shear)") +edot_xy = float(uw.function.evaluate(cm.grad_u[0,1], centre).flatten()[0]) +print(f" ε̇_xy = {edot_xy:.4f}") +E_eff_xy = float(uw.function.evaluate(cm.E_eff.sym[0,1], centre).flatten()[0]) +E_eff_inv_II = float(uw.function.evaluate(cm.E_eff_inv_II.sym, centre).flatten()[0]) +print(f" E_eff_xy = {E_eff_xy:.4f}") +print(f" E_eff_inv_II = {E_eff_inv_II:.4f}") +eta_ve = float(uw.function.evaluate(cm.Parameters.ve_effective_viscosity.sym, centre).flatten()[0]) +eta_pl = float(uw.function.evaluate(cm._plastic_effective_viscosity, centre).flatten()[0]) +eta_min = float(uw.function.evaluate(cm.viscosity, centre).flatten()[0]) +print(f" η_ve = {eta_ve:.4f}") +print(f" η_pl = {eta_pl:.4f}") +print(f" η = Min(η_ve, η_pl) = {eta_min:.4f} (expected min: {min(eta_ve, eta_pl):.4f})") + +# Direct evaluation of the stress() formula +stress_formula = cm.stress() +sigma_xy_direct = float(uw.function.evaluate(stress_formula[0,1], centre).flatten()[0]) +print(f" σ_xy direct evaluation of stress() formula = {sigma_xy_direct:.4f}") +print(f" Predicted from 2·η·E_eff = {2*eta_min*E_eff_xy:.4f}") + +# === Now solve and see what comes out +print("\n=== Run a SOLVE with ψ*[1] = 0.4268 (pre-yield) ===") +stokes.solve(zero_init_guess=False, timestep=0.10, divergence_retries=1) +sigma_after_solve_a = float(uw.function.evaluate(stokes.tau.sym[0,1], centre).flatten()[0]) +print(f" σ_xy after solve = {sigma_after_solve_a:.4f}") + +# === User's hypothesis: ψ*[1] = pre-yield is the issue. 
Try ψ*[1] = ψ*[0] +# (matches what FINE has — both at yield) +print("\n=== Reset and re-solve with ψ*[1] = ψ*[0] = 0.5 (both at yield) ===") +stokes.DFDt.psi_star[0].array[:] = 0 +stokes.DFDt.psi_star[0].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[0].array[:, 1, 0] = 0.5 +stokes.DFDt.psi_star[1].array[:] = 0 +stokes.DFDt.psi_star[1].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[1].array[:, 1, 0] = 0.5 +stokes.DFDt._dt_history[0] = 0.20 # still dt-halving +cm.Parameters.dt_elastic = 0.10 +cm._update_bdf_coefficients() +stokes.solve(zero_init_guess=False, timestep=0.10, divergence_retries=1) +sigma_after_solve_b = float(uw.function.evaluate(stokes.tau.sym[0,1], centre).flatten()[0]) +print(f" σ_xy after solve = {sigma_after_solve_b:.4f}") + +# === And also: ψ*[1] = ψ*[0] = 0.5 with dt_history = [0.10, 0.10] (consistent) +print("\n=== Reset, ψ*[1] = ψ*[0] = 0.5, dt_history = [0.10, 0.10] (fully consistent) ===") +stokes.DFDt.psi_star[0].array[:] = 0 +stokes.DFDt.psi_star[0].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[0].array[:, 1, 0] = 0.5 +stokes.DFDt.psi_star[1].array[:] = 0 +stokes.DFDt.psi_star[1].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[1].array[:, 1, 0] = 0.5 +stokes.DFDt._dt_history[0] = 0.10 +stokes.DFDt._dt_history[1] = 0.10 +cm.Parameters.dt_elastic = 0.10 +cm._update_bdf_coefficients() + +# Probe the symbolic formula once more to confirm Min selection is correct +sigma_direct = float(uw.function.evaluate(cm.stress()[0,1], centre).flatten()[0]) +eta_ve = float(uw.function.evaluate(cm.Parameters.ve_effective_viscosity.sym, centre).flatten()[0]) +eta_pl = float(uw.function.evaluate(cm._plastic_effective_viscosity, centre).flatten()[0]) +eta_min = float(uw.function.evaluate(cm.viscosity, centre).flatten()[0]) +print(f" Direct symbolic eval BEFORE solve: σ = {sigma_direct:.4f}, η_ve={eta_ve:.4f}, η_pl={eta_pl:.4f}, η_min={eta_min:.4f}") + +stokes.solve(zero_init_guess=False, timestep=0.10, divergence_retries=1) +sigma_after_solve_c = 
float(uw.function.evaluate(stokes.tau.sym[0,1], centre).flatten()[0]) +print(f" σ_xy after solve = {sigma_after_solve_c:.4f}") + +# === Also: solve TWICE (force re-iteration to settle) +print("\n=== Re-solve with same state to see if it self-corrects ===") +stokes.DFDt.psi_star[0].array[:] = 0 +stokes.DFDt.psi_star[0].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[0].array[:, 1, 0] = 0.5 +stokes.DFDt.psi_star[1].array[:] = 0 +stokes.DFDt.psi_star[1].array[:, 0, 1] = 0.5 +stokes.DFDt.psi_star[1].array[:, 1, 0] = 0.5 +stokes.DFDt._dt_history[0] = 0.10 +stokes.DFDt._dt_history[1] = 0.10 +cm.Parameters.dt_elastic = 0.10 +cm._update_bdf_coefficients() +for k in range(5): + stokes.solve(zero_init_guess=False, timestep=0.10, divergence_retries=1) + s = float(uw.function.evaluate(stokes.tau.sym[0,1], centre).flatten()[0]) + print(f" After solve {k+1}: σ_xy = {s:.4f}") + +print(f"\n=== Summary ===") +print(f" (a) ψ*=[0.5, 0.4268], dt_h0=0.20 → σ = {sigma_after_solve_a:.4f} (large overshoot)") +print(f" (b) ψ*=[0.5, 0.5000], dt_h0=0.20 → σ = {sigma_after_solve_b:.4f} (clean halving)") +print(f" (c) ψ*=[0.5, 0.5000], dt_h0=0.10 → σ = {sigma_after_solve_c:.4f} (no dt change)") diff --git a/docs/advanced/benchmarks/probe_lockstep_dt.py b/docs/advanced/benchmarks/probe_lockstep_dt.py new file mode 100644 index 00000000..9544af2d --- /dev/null +++ b/docs/advanced/benchmarks/probe_lockstep_dt.py @@ -0,0 +1,117 @@ +"""Side-by-side: two simulations of the same physical problem, +one at dt = DT, one at dt = DT/2, stepped in lockstep. + +Compare every comparable quantity at common physical times. 
+""" + +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +def make_stokes(label): + mesh = uw.meshing.StructuredQuadBox(elementRes=(16, 8), + minCoords=(-1, -0.5), maxCoords=(1, 0.5)) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, mesh.dim, degree=2) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, + ) + stokes.constitutive_model.Parameters.shear_viscosity_0 = 1.0 + stokes.constitutive_model.Parameters.shear_modulus = 1.0 + stokes.constitutive_model.Parameters.yield_stress = 0.5 + stokes.constitutive_model.Parameters.strainrate_inv_II_min = 1.0e-6 + stokes.constitutive_model._yield_mode = "min" + V_top = expression(rf"V_{{{label},top}}", sympy.Float(0.5), "Top V") + stokes.add_dirichlet_bc((V_top, 0.0), "Top") + stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") + stokes.tolerance = 1.0e-6 + stokes.petsc_options["snes_force_iteration"] = True + return stokes, V_top + + +def probe(stokes, centre): + cm = stokes.constitutive_model + out = {} + out['sigma'] = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + out['edot'] = float(uw.function.evaluate(cm.grad_u[0, 1], centre).flatten()[0]) + out['psi0'] = float(uw.function.evaluate(stokes.DFDt.psi_star[0].sym[0, 1], centre).flatten()[0]) + out['psi1'] = float(uw.function.evaluate(stokes.DFDt.psi_star[1].sym[0, 1], centre).flatten()[0]) + out['eta_ve'] = float(uw.function.evaluate(cm.Parameters.ve_effective_viscosity.sym, centre).flatten()[0]) + out['eta_pl'] = float(uw.function.evaluate(cm._plastic_effective_viscosity, centre).flatten()[0]) + out['eta'] = float(uw.function.evaluate(cm.viscosity, centre).flatten()[0]) + out['Eeff'] = 
float(uw.function.evaluate(cm.E_eff.sym[0, 1], centre).flatten()[0]) + out['EeffII'] = float(uw.function.evaluate(cm.E_eff_inv_II.sym, centre).flatten()[0]) + out['c0'] = float(cm._bdf_c0.sym) + out['c1'] = float(cm._bdf_c1.sym) + out['c2'] = float(cm._bdf_c2.sym) + out['dt_h0'] = stokes.DFDt._dt_history[0] + dt_e = cm.Parameters.dt_elastic + if hasattr(dt_e, 'sym'): + dt_e = dt_e.sym + try: + out['dt_e'] = float(dt_e) + except (TypeError, ValueError): + out['dt_e'] = float('nan') + return out + + +def step_one(stokes, V_top, dt, t_cur): + V_top.sym = sympy.Float(0.5) # constant +V0 — no BC flips, just pure loading + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=1) + + +def fmt_row(label, t, p): + return (f"{label:6s} t={t:.3f} dt_e={p['dt_e']:.3f} dt_h0={str(p['dt_h0']):>5s} " + f"σ={p['sigma']:.4f} ε̇={p['edot']:.4f} ψ*0={p['psi0']:.4f} ψ*1={p['psi1']:.4f} " + f"E={p['Eeff']:.3f} η_ve={p['eta_ve']:.4f} η_pl={p['eta_pl']:.4f} η={p['eta']:.4f} " + f"c012=[{p['c0']:.3f},{p['c1']:.3f},{p['c2']:.3f}]") + + +# Build three independent problems +print("Building COARSE simulation (always dt = 0.20)...") +coarse, V_top_c = make_stokes("coarse") +print("Building FINE simulation (always dt = 0.10)...") +fine, V_top_f = make_stokes("fine") +print("Building SWITCH simulation (dt = 0.20 → 0.10 at outer step 4)...") +switch, V_top_s = make_stokes("switch") + +centre = np.array([[0.0, 0.0]]) +DT_C = 0.20 +DT_F = 0.10 +N_OUTER = 6 # 6 outer steps; halve in SWITCH starting at outer step 4 +HALVE_AT = 4 + +print(f"\nLockstep: t advances by {DT_C} per outer step.") +print(f" COARSE: 1 step at dt={DT_C}") +print(f" FINE: 2 steps at dt={DT_F}") +print(f" SWITCH: 1 step at dt={DT_C} for outer<{HALVE_AT}; then 2 steps at dt={DT_F}\n") + +t_c, t_f, t_s = 0.0, 0.0, 0.0 +for k in range(N_OUTER): + step_one(coarse, V_top_c, DT_C, t_c) + t_c += DT_C + step_one(fine, V_top_f, DT_F, t_f); t_f += DT_F + step_one(fine, 
V_top_f, DT_F, t_f); t_f += DT_F + if k < HALVE_AT: + step_one(switch, V_top_s, DT_C, t_s); t_s += DT_C + else: + step_one(switch, V_top_s, DT_F, t_s); t_s += DT_F + step_one(switch, V_top_s, DT_F, t_s); t_s += DT_F + + pc = probe(coarse, centre) + pf = probe(fine, centre) + ps = probe(switch, centre) + marker = " <-- HALVING NOW" if k == HALVE_AT else "" + print(f"--- outer step {k+1}, t = {t_c:.3f} {marker}") + print(fmt_row("COARSE", t_c, pc)) + print(fmt_row("FINE", t_f, pf)) + print(fmt_row("SWITCH", t_s, ps)) + print(f" σ: coarse={pc['sigma']:.4f} fine={pf['sigma']:.4f} switch={ps['sigma']:.4f} " + f"Δ(switch-fine)={ps['sigma']-pf['sigma']:+.4f} Δ(switch-coarse)={ps['sigma']-pc['sigma']:+.4f}") + print() diff --git a/docs/advanced/benchmarks/probe_projection_drift.py b/docs/advanced/benchmarks/probe_projection_drift.py new file mode 100644 index 00000000..1b159b3a --- /dev/null +++ b/docs/advanced/benchmarks/probe_projection_drift.py @@ -0,0 +1,148 @@ +"""Replace the implicit projection of flux→psi_star[0] with a direct +one-shot pointwise evaluation. Test whether the drift disappears. + +Procedure per step: + 1. Snapshot pre-solve psi_star[0] as `ps0_pre` (this is what ψ*[1] will + become after the shift). + 2. Replace `_psi_star_projection_solver.solve` with a no-op so the main + stokes.solve() does NOT update ψ*[0]. + 3. Run stokes.solve() — Newton finds u, projection is a no-op, then + the shift sets ψ*[1] = ps0_pre. At this point ψ*[0] is still ps0_pre. + 4. Evaluate cm.flux at sample points using the just-solved u and the + frozen pre-solve ψ*[0]. This is a pure forward computation (no + fixed-point feedback). + 5. Assign the evaluated flux to ψ*[0].array. + +For our uniform-shear test, the field is uniform so step 4 just samples +the centre and step 5 assigns uniformly. 
+""" + +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +def make_stokes(label): + mesh = uw.meshing.StructuredQuadBox(elementRes=(16, 8), + minCoords=(-1, -0.5), maxCoords=(1, 0.5)) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, mesh.dim, degree=2) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1) + s = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + s.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + s.Unknowns, order=2, + ) + s.constitutive_model.Parameters.shear_viscosity_0 = 1.0 + s.constitutive_model.Parameters.shear_modulus = 1.0 + s.constitutive_model.Parameters.yield_stress = 0.5 + s.constitutive_model.Parameters.strainrate_inv_II_min = 1.0e-6 + s.constitutive_model._yield_mode = "min" + Vt = expression(rf"V_{{{label}}}", sympy.Float(0.5), "Top V") + s.add_dirichlet_bc((Vt, 0.0), "Top") + s.add_dirichlet_bc((-Vt, 0.0), "Bottom") + s.add_dirichlet_bc((sympy.oo, 0.0), "Left") + s.add_dirichlet_bc((sympy.oo, 0.0), "Right") + s.tolerance = 1.0e-6 + s.petsc_options["snes_force_iteration"] = True + return s, Vt + + +def patched_solve(stokes, dt, V_top, V_sign=1.0): + """Run the standard solve, then OVERWRITE psi_star[0] with the manually + computed σ from the formula evaluated against PRE-solve psi_star.""" + cm = stokes.constitutive_model + ddt = stokes.DFDt + centre = np.array([[0.0, 0.0]]) + + V_top.sym = sympy.Float(V_sign * 0.5) + cm.Parameters.dt_elastic = dt + + # Snapshot pre-solve psi_star (BOTH levels) — these are what the formula + # should use for the implicit step + ps0_pre = np.copy(ddt.psi_star[0].array) + ps1_pre = np.copy(ddt.psi_star[1].array) + + # Run the standard solve as-is. 
This will: + # - Run main Newton (finds u) + # - Run the (buggy) projection that writes into psi_star[0] + # - Shift: psi_star[1] = old (pre-solve) psi_star[0] ← correct + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=1) + + # Now manually compute the analytical σ at this state and overwrite + # the buggy projection's result. Use PRE-solve psi_star values + # in the formula (those are the "history" inputs to the implicit step). + edot_now = float(uw.function.evaluate(cm.grad_u[0, 1], centre).flatten()[0]) + # In our test, fields are uniform so a single node value suffices. + # Take the centre (or any node) of the snapshotted pre-solve arrays. + n_nodes = ps0_pre.shape[0] + ps0_use = float(ps0_pre[n_nodes // 2, 0, 1]) # pre-solve ψ*[0]_xy + ps1_use = float(ps1_pre[n_nodes // 2, 0, 1]) # pre-solve ψ*[1]_xy + c0 = float(cm._bdf_c0.sym); c1 = float(cm._bdf_c1.sym); c2 = float(cm._bdf_c2.sym) + + E_eff_xy = edot_now + (-c1) * ps0_use / (2 * 1 * dt) + (-c2) * ps1_use / (2 * 1 * dt) + eta_ve_manual = 1 * dt / (c0 * 1 + 1 * dt) + eta_pl_manual = 0.5 / (2 * abs(E_eff_xy)) if abs(E_eff_xy) > 1e-12 else 1e9 + eta_min = min(eta_ve_manual, eta_pl_manual) + sigma_manual = 2 * eta_min * E_eff_xy + + # Read what the buggy projection produced (for diagnostic) + sigma_buggy = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + + print(f" [dt={dt:.3f}: pre ψ*=[{ps0_use:.4f},{ps1_use:.4f}] ε̇={edot_now:.4f} " + f"c=[{c0:.3f},{c1:.3f},{c2:.3f}] manual σ={sigma_manual:.4f} buggy σ={sigma_buggy:.4f}]") + + # Overwrite psi_star[0] with the manual σ (uniform in our test problem) + ddt.psi_star[0].array[:] = 0 + ddt.psi_star[0].array[:, 0, 1] = sigma_manual + ddt.psi_star[0].array[:, 1, 0] = sigma_manual + + +def step_one(stokes, V, dt): + V.sym = sympy.Float(0.5) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=1) + + +def probe(stokes, c=np.array([[0.0, 0.0]])): + cm = 
stokes.constitutive_model + return { + 'sigma': float(uw.function.evaluate(stokes.tau.sym[0, 1], c).flatten()[0]), + 'edot': float(uw.function.evaluate(cm.grad_u[0, 1], c).flatten()[0]), + 'psi0': float(uw.function.evaluate(stokes.DFDt.psi_star[0].sym[0, 1], c).flatten()[0]), + 'psi1': float(uw.function.evaluate(stokes.DFDt.psi_star[1].sym[0, 1], c).flatten()[0]), + } + + +def fmt(label, p): + return f"{label:14s} σ={p['sigma']:.4f} ε̇={p['edot']:.4f} ψ*0={p['psi0']:.4f} ψ*1={p['psi1']:.4f}" + + +# === Build two simulations: ORIGINAL (uses implicit projection) and PATCHED. + +print("Building ORIGINAL stokes (implicit projection of flux→ψ*[0])...") +orig, V_o = make_stokes("orig") +print("Building PATCHED stokes (direct ptwise assign instead of projection)...") +patched, V_p = make_stokes("patched") + +print("\n=== Phase 1: Drive both to yield steady state at dt=0.20 ===") +for k in range(5): + step_one(orig, V_o, 0.20) + patched_solve(patched, 0.20, V_p) +print(fmt("ORIG (after warm-up)", probe(orig))) +print(fmt("PATCHED (after warm-up)", probe(patched))) + +print("\n=== Phase 2: switch to dt=0.10 (halving). Take 4 steps ===") +for k in range(4): + step_one(orig, V_o, 0.10) + patched_solve(patched, 0.10, V_p) + print(f"Step {k+1} after halving:") + print(" " + fmt("ORIG", probe(orig))) + print(" " + fmt("PATCHED", probe(patched))) + +print("\n=== Phase 3: switch back to dt=0.20 (doubling). Take 4 steps ===") +for k in range(4): + step_one(orig, V_o, 0.20) + patched_solve(patched, 0.20, V_p) + print(f"Step {k+1} after doubling:") + print(" " + fmt("ORIG", probe(orig))) + print(" " + fmt("PATCHED", probe(patched))) diff --git a/docs/advanced/benchmarks/run_ve_square_wave.py b/docs/advanced/benchmarks/run_ve_square_wave.py deleted file mode 100644 index 56a988b8..00000000 --- a/docs/advanced/benchmarks/run_ve_square_wave.py +++ /dev/null @@ -1,244 +0,0 @@ -"""Variable-timestep VE benchmark: square-wave forcing. 
- -Maxwell material under square-wave shear rate (Fourier series, N harmonics): - - γ̇(t) = (4γ̇₀/π) Σ_{k=1..N} sin((2k-1)ωt) / (2k-1) - -Since the Maxwell equation is linear, the analytical stress is the -superposition of single-frequency solutions: - - σ(t) = Σ_{k=1..N} maxwell_oscillatory(t, η, μ, aₖ, ωₖ) - -where aₖ = 4γ̇₀/(π(2k-1)) and ωₖ = (2k-1)ω. - -The sharp transitions demand small dt at the edges, large dt in the -flat regions — testing variable-dt BDF correctness. - -Usage: - python tests/run_ve_square_wave.py -""" - -import time as timer -import numpy as np -import sympy -import underworld3 as uw -from underworld3.function import expression - - -def maxwell_oscillatory(t, eta, mu, gamma_dot_0, omega): - """Full analytical σ_xy for single-frequency oscillatory Maxwell shear.""" - t_r = eta / mu - De = omega * t_r - prefactor = eta * gamma_dot_0 / (1.0 + De**2) - return prefactor * (np.sin(omega * t) - De * np.cos(omega * t) + De * np.exp(-t / t_r)) - - -def square_wave_analytical(t, eta, mu, gamma_dot_0, omega, n_harmonics=20): - """Analytical stress for square-wave forcing via Fourier superposition.""" - sigma = np.zeros_like(t) - for k in range(1, n_harmonics + 1): - n = 2 * k - 1 # odd harmonics: 1, 3, 5, ... - a_k = 4.0 * gamma_dot_0 / (np.pi * n) - omega_k = n * omega - sigma += maxwell_oscillatory(t, eta, mu, a_k, omega_k) - return sigma - - -def square_wave_shear_rate(t, gamma_dot_0, omega, n_harmonics=20): - """Square-wave shear rate via truncated Fourier series.""" - rate = np.zeros_like(t) - for k in range(1, n_harmonics + 1): - n = 2 * k - 1 - rate += 4.0 * gamma_dot_0 / (np.pi * n) * np.sin(n * omega * t) - return rate - - -def adaptive_dt(t_current, omega, dt_min, dt_max): - """Adaptive timestep: small near square-wave transitions, large on plateaux. - - Transitions occur at t = (2m+1)·π/(2ω) for integer m, i.e. at odd - multiples of quarter-period. We use distance to nearest transition - to interpolate between dt_min and dt_max. 
- """ - half_period = np.pi / omega - # Phase within half-period [0, half_period) - phase = t_current % half_period - # Distance to nearest transition (0 or half_period boundary) - dist = min(phase, half_period - phase) - # Normalise to [0, 1] where 0 = at transition, 1 = mid-plateau - frac = dist / (half_period / 2.0) - # Smooth interpolation - return dt_min + (dt_max - dt_min) * frac**2 - - -def run_square_wave(order, De, n_periods, dt_min_over_tr, dt_max_over_tr, - n_harmonics=20, uniform=False): - """Run VE square-wave shear box with adaptive or uniform timestep.""" - - ETA, MU, H, W = 1.0, 1.0, 1.0, 2.0 - t_r = ETA / MU - omega = De / t_r - V0 = 0.5 - gamma_dot_0 = 2.0 * V0 / H - dt_min = dt_min_over_tr * t_r - dt_max = dt_max_over_tr * t_r - T = 2.0 * np.pi / omega - t_end = n_periods * T - - mesh = uw.meshing.StructuredQuadBox( - elementRes=(16, 8), minCoords=(-W / 2, -H / 2), maxCoords=(W / 2, H / 2), - ) - v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) - p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel - stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA - stokes.constitutive_model.Parameters.shear_modulus = MU - - # Boundary conditions: simple shear driven by top/bottom velocity - # V_top updated numerically each timestep to produce square-wave γ̇ - V_top = expression(R"V_{\mathrm{top}}", sympy.Float(0.0), "Top boundary velocity") - - stokes.add_dirichlet_bc((V_top, 0.0), "Top") - stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") - stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") - stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") - stokes.tolerance = 1.0e-6 - - # Time loop - times = [] - numerical_stress = [] - timesteps_used = [] - t_current = 0.0 - step = 0 - - while t_current < t_end: - # Determine timestep - if uniform: - dt = dt_min - else: - dt 
= adaptive_dt(t_current, omega, dt_min, dt_max) - - t_next = t_current + dt - - # Update boundary velocity: V_top(t) such that γ̇ = 2·V_top/H = square wave - # square_wave_shear_rate returns the actual γ̇, so V_top = γ̇ · H/2 - gamma_dot_t = square_wave_shear_rate( - np.array([t_next]), gamma_dot_0, omega, n_harmonics - )[0] - V_top.sym = sympy.Float(gamma_dot_t * H / 2.0) - - stokes.constitutive_model.Parameters.dt_elastic = dt - - stokes.solve(zero_init_guess=False, timestep=dt) - - t_current += dt - step += 1 - - # Extract stress at mesh centre - centre = np.array([[0.0, 0.0]]) - tau_xy = uw.function.evaluate(stokes.tau.sym[0, 1], centre) - sigma_xy = float(tau_xy.flatten()[0]) - - times.append(t_current) - numerical_stress.append(sigma_xy) - timesteps_used.append(dt) - - if step % 50 == 0: - ana = square_wave_analytical( - np.array([t_current]), ETA, MU, gamma_dot_0, omega, n_harmonics - )[0] - print(f" Step {step:4d}: t/t_r = {t_current / t_r:.3f}, " - f"dt/t_r = {dt / t_r:.4f}, " - f"σ_xy = {sigma_xy:.6f}, ana = {ana:.6f}") - - times = np.array(times) - numerical_stress = np.array(numerical_stress) - timesteps_used = np.array(timesteps_used) - - # Analytical solution - analytical_stress = square_wave_analytical(times, ETA, MU, gamma_dot_0, omega, n_harmonics) - - # Error metrics (skip first period for startup transient) - mask = times > T - if mask.sum() > 0: - l2_err = np.sqrt(np.mean((numerical_stress[mask] - analytical_stress[mask]) ** 2)) - linf_err = np.max(np.abs(numerical_stress[mask] - analytical_stress[mask])) - else: - l2_err = linf_err = np.nan - - return { - "times": times, - "numerical": numerical_stress, - "analytical": analytical_stress, - "timesteps": timesteps_used, - "l2_error": l2_err, - "linf_error": linf_err, - "n_steps": step, - } - - -if __name__ == "__main__": - De = 1.5 - order = 2 - n_periods = 3 - n_harmonics = 10 - - print("=" * 60) - print(f"Square-wave VE benchmark: De={De}, order={order}") - print(f" {n_harmonics} Fourier 
harmonics, {n_periods} periods") - print("=" * 60) - - # Run with adaptive dt - print("\n--- Adaptive timestep ---") - t0 = timer.time() - result_adaptive = run_square_wave( - order=order, De=De, n_periods=n_periods, - dt_min_over_tr=0.02, dt_max_over_tr=0.15, - n_harmonics=n_harmonics, uniform=False, - ) - t_adaptive = timer.time() - t0 - print(f" {result_adaptive['n_steps']} steps in {t_adaptive:.1f}s") - print(f" L2 error: {result_adaptive['l2_error']:.6e}") - print(f" Linf error: {result_adaptive['linf_error']:.6e}") - print(f" dt range: [{result_adaptive['timesteps'].min():.4f}, " - f"{result_adaptive['timesteps'].max():.4f}]") - - # Run with uniform dt (reference) - print("\n--- Uniform timestep (dt_min) ---") - t0 = timer.time() - result_uniform = run_square_wave( - order=order, De=De, n_periods=n_periods, - dt_min_over_tr=0.02, dt_max_over_tr=0.02, - n_harmonics=n_harmonics, uniform=True, - ) - t_uniform = timer.time() - t0 - print(f" {result_uniform['n_steps']} steps in {t_uniform:.1f}s") - print(f" L2 error: {result_uniform['l2_error']:.6e}") - print(f" Linf error: {result_uniform['linf_error']:.6e}") - - # Summary - print("\n" + "=" * 60) - print("Summary:") - print(f" Adaptive: {result_adaptive['n_steps']} steps, " - f"L2 = {result_adaptive['l2_error']:.2e}") - print(f" Uniform: {result_uniform['n_steps']} steps, " - f"L2 = {result_uniform['l2_error']:.2e}") - ratio = result_adaptive['l2_error'] / result_uniform['l2_error'] - print(f" Error ratio (adaptive/uniform): {ratio:.2f}") - print(f" Step savings: {1 - result_adaptive['n_steps'] / result_uniform['n_steps']:.0%}") - - # Save results - np.savez( - f"tests/ve_square_wave_{n_harmonics}h.npz", - adaptive_times=result_adaptive["times"], - adaptive_numerical=result_adaptive["numerical"], - adaptive_analytical=result_adaptive["analytical"], - adaptive_timesteps=result_adaptive["timesteps"], - uniform_times=result_uniform["times"], - uniform_numerical=result_uniform["numerical"], - 
uniform_analytical=result_uniform["analytical"], - uniform_timesteps=result_uniform["timesteps"], - ) - print(f"\nResults saved to tests/ve_square_wave_{n_harmonics}h.npz") diff --git a/docs/advanced/benchmarks/sweep_bdf1_softjac.py b/docs/advanced/benchmarks/sweep_bdf1_softjac.py new file mode 100644 index 00000000..7516e7e3 --- /dev/null +++ b/docs/advanced/benchmarks/sweep_bdf1_softjac.py @@ -0,0 +1,136 @@ +"""Follow-up sweep — BDF-1 inexact-Newton softness sensitivity. + +Sister to ``sweep_bdf2_softjac.py``. In the headline experiment the +BDF-1 case (Min residual, softmin Jacobian δ=0.1) was perfect: 0/413 +divergences, 1.02 mean iter/step, identical answer to pure Min/Min. +That's already optimal so we don't expect changes — but we do want +to confirm that BDF-1 numerics are insensitive to δ. If that holds, +the BDF-2 line-search noise we're chasing is specifically about +residual/Jacobian disagreement at the second history term, not about +"smoother Jacobians help everywhere." + +Sweep: δ ∈ {0.05, 0.10, 0.20, 0.50}, with bt linesearch, no atol. 
+""" + +import os +import time +import numpy as np +import sympy + +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + vep_square_wave, error_metrics, +) + + +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD +DT_PLATEAU = 0.10 +DT_FINE = 0.01 +WINDOW = 0.1 * HALF_PERIOD +OUT_DIR = "../../../output/benchmarks/sweep_bdf2_softjac" + + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def _capture_softmin_F1(stokes, softness): + cm = stokes.constitutive_model + saved_mode, saved_softness = cm._yield_mode, cm._yield_softness + try: + cm._yield_mode = "softmin" + cm._yield_softness = softness + soft_stress = cm.flux + F1_softmin = soft_stress + stokes.penalty * stokes.div_u * sympy.eye(stokes.mesh.dim) + finally: + cm._yield_mode, cm._yield_softness = saved_mode, saved_softness + return F1_softmin + + +def run_variant(label, softness): + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = 1 + mesh, stokes, V_top, params = build_stokes( + f"sweep_{label}", params, yield_stress=TAU_Y, yield_mode="min", + ) + + F1_jac = _capture_softmin_F1(stokes, softness) + stokes.set_jacobian_F1_source(F1_jac) + stokes.petsc_options["snes_linesearch_type"] = "bt" + + times, dts, sigmas, gammas, reasons, iters = [], [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, 
divergence_retries=0) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + iters.append(int(stokes.snes.getIterationNumber())) + wall = time.time() - t0 + + times = np.array(times); sigmas = np.array(sigmas) + reasons = np.array(reasons); iters = np.array(iters) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = vep_square_wave(times, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err = error_metrics(sigmas, sigma_ana) + + out = dict( + label=label, softness=softness, wall=wall, + peak=float(np.abs(sigmas).max()), + diverged=int((reasons < 0).sum()), + mean_its=float(iters.mean()), + max_err=float(err["max_abs"]), rms=float(err["rms"]), + ) + os.makedirs(OUT_DIR, exist_ok=True) + np.savez(os.path.join(OUT_DIR, f"{label}.npz"), **{ + "times": times, "sigmas": sigmas, "sigma_ana": sigma_ana, + "reasons": reasons, "iters": iters, + "softness": softness, "linesearch": "bt", "atol": -1.0, + "wall": wall, + }) + print(f" [{label:<28}] δ={softness:<5} ls=bt atol=None " + f"wall={wall:6.1f}s div={out['diverged']:3d} its={out['mean_its']:5.2f} " + f"peak={out['peak']:.4f} max|err|={out['max_err']:.3e} rms={out['rms']:.3e}", + flush=True) + return out + + +def main(): + print("\n=== sweep_bdf1_softjac (BDF-1, Min residual, softmin Jacobian) ===\n", + flush=True) + print("BDF-1 family: vary softness δ", flush=True) + runs = [run_variant(f"bdf1_delta_{d}", softness=d) for d in (0.05, 0.10, 0.20, 0.50)] + + print("\n\n=== summary ===", flush=True) + print(f"{'label':<28} {'δ':>5} {'wall':>7} {'div':>4} {'its':>5} {'peak|σ|':>7} {'max|err|':>10} {'rms':>10}", + flush=True) + for r in runs: + print(f"{r['label']:<28} {r['softness']:>5} {r['wall']:>7.1f} " + f"{r['diverged']:>4d} {r['mean_its']:>5.2f} " + f"{r['peak']:>7.4f} {r['max_err']:>10.3e} {r['rms']:>10.3e}", + flush=True) + + +if __name__ == "__main__": + 
main()
diff --git a/docs/advanced/benchmarks/sweep_bdf2_softjac.py b/docs/advanced/benchmarks/sweep_bdf2_softjac.py
new file mode 100644
index 00000000..4c93d225
--- /dev/null
+++ b/docs/advanced/benchmarks/sweep_bdf2_softjac.py
@@ -0,0 +1,160 @@
+"""Sweep — BDF-2 inexact-Newton line-search behaviour with softmin Jacobian.
+
+Background: with Min residual + softmin Jacobian (δ=0.1), BDF-2 sees
+195/413 line-search rejections and gives a more accurate answer than
+pure Min/Min. Question: can we keep the accuracy and lose the noise?
+
+Axes:
+ A. Softness δ ∈ {0.05, 0.1, 0.2, 0.5} (Jacobian-only smoothing)
+ B. snes_linesearch_type ∈ {bt, cp, basic, l2} (basic = no LS, accept Newton)
+ C. snes_atol ∈ {None, 1e-5} (early termination if residual tiny)
+
+We DON'T sweep the full cross product — too expensive. Run three
+families in series, each with the unmatched axes fixed at the baseline
+(δ=0.1, bt, atol=None):
+ Family A: vary δ
+ Family B: vary linesearch (cp, basic, l2; bt is the baseline)
+ Family C: try atol=1e-5 with the baseline
+
+Results land in output/benchmarks/sweep_bdf2_softjac/ — one .npz per
+variant — and a summary table is printed at the end.
+""" + +import os +import time +import numpy as np +import sympy + +from _bench_helpers import ( + DEFAULT_PARAMS, build_stokes, probe_centre, + vep_square_wave, error_metrics, +) + + +V0 = 0.5 +TAU_Y = 0.5 +HALF_PERIOD = 2.0 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * HALF_PERIOD + +DT_PLATEAU = 0.10 +DT_FINE = 0.01 +WINDOW = 0.1 * HALF_PERIOD + +OUT_DIR = "../../../output/benchmarks/sweep_bdf2_softjac" + + +def schedule_dt(t_cur): + flip_times = [HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2 - 1)] + for f in flip_times: + if abs(t_cur - f) <= WINDOW: + return DT_FINE + return DT_PLATEAU + + +def _capture_softmin_F1(stokes, softness): + cm = stokes.constitutive_model + saved_mode, saved_softness = cm._yield_mode, cm._yield_softness + try: + cm._yield_mode = "softmin" + cm._yield_softness = softness + soft_stress = cm.flux + F1_softmin = soft_stress + stokes.penalty * stokes.div_u * sympy.eye(stokes.mesh.dim) + finally: + cm._yield_mode, cm._yield_softness = saved_mode, saved_softness + return F1_softmin + + +def run_variant(label, softness, linesearch="bt", atol=None): + """Run one BDF-2 variant. 
Returns dict with metrics + arrays.""" + params = dict(DEFAULT_PARAMS) + params["bdf_order"] = 2 + mesh, stokes, V_top, params = build_stokes( + f"sweep_{label}", params, yield_stress=TAU_Y, yield_mode="min", + ) + + F1_jac = _capture_softmin_F1(stokes, softness) + stokes.set_jacobian_F1_source(F1_jac) + stokes.petsc_options["snes_linesearch_type"] = linesearch + if atol is not None: + stokes.petsc_options["snes_atol"] = atol + + times, dts, sigmas, gammas, reasons, iters = [], [], [], [], [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = schedule_dt(t_cur) + flip_next = next((HALF_PERIOD * (k + 1) for k in range(N_PERIODS * 2) + if HALF_PERIOD * (k + 1) > t_cur + 1e-9), T_END) + dt = min(dt, flip_next - t_cur, T_END - t_cur) + t_end_step = t_cur + dt + n_half = int(t_end_step / HALF_PERIOD - 1e-9) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=0) + sigmas.append(probe_centre(stokes)) + t_cur = t_end_step + times.append(t_cur); dts.append(dt); gammas.append(2.0 * v_now / params["H"]) + reasons.append(int(stokes.snes.getConvergedReason())) + iters.append(int(stokes.snes.getIterationNumber())) + wall = time.time() - t0 + + times = np.array(times); sigmas = np.array(sigmas) + reasons = np.array(reasons); iters = np.array(iters) + gamma_dot_0 = 2.0 * V0 / params["H"] + sigma_ana = vep_square_wave(times, params["eta"], params["mu"], + gamma_dot_0, TAU_Y, HALF_PERIOD) + err = error_metrics(sigmas, sigma_ana) + + out = dict( + label=label, softness=softness, linesearch=linesearch, atol=atol, + wall=wall, peak=float(np.abs(sigmas).max()), + diverged=int((reasons < 0).sum()), mean_its=float(iters.mean()), + max_err=float(err["max_abs"]), rms=float(err["rms"]), + times=times, sigmas=sigmas, sigma_ana=sigma_ana, + reasons=reasons, iters=iters, + ) + os.makedirs(OUT_DIR, 
exist_ok=True) + np.savez(os.path.join(OUT_DIR, f"{label}.npz"), **{ + "times": times, "sigmas": sigmas, "sigma_ana": sigma_ana, + "reasons": reasons, "iters": iters, + "softness": softness, "linesearch": linesearch, + "atol": (atol if atol is not None else -1.0), + "wall": wall, + }) + print(f" [{label:<28}] δ={softness:<5} ls={linesearch:<5} atol={atol!r:<6} " + f"wall={wall:6.1f}s div={out['diverged']:3d} its={out['mean_its']:5.2f} " + f"peak={out['peak']:.4f} max|err|={out['max_err']:.3e} rms={out['rms']:.3e}", + flush=True) + return out + + +def main(): + print("\n=== sweep_bdf2_softjac (Min residual, softmin Jacobian) ===\n", flush=True) + print("Family A: vary softness δ (bt linesearch, no atol)", flush=True) + family_A = [run_variant(f"deltaA_{d}", softness=d) for d in (0.05, 0.10, 0.20, 0.50)] + + print("\nFamily B: vary linesearch (δ=0.10, no atol)", flush=True) + family_B = [] + for ls in ("cp", "basic", "l2"): + family_B.append(run_variant(f"lsB_{ls}", softness=0.10, linesearch=ls)) + + print("\nFamily C: snes_atol=1e-5 with baseline (δ=0.10, bt)", flush=True) + family_C = [run_variant("atolC_1e-5", softness=0.10, linesearch="bt", atol=1e-5)] + + all_runs = family_A + family_B + family_C + print("\n\n=== summary ===", flush=True) + print(f"{'label':<28} {'δ':>5} {'ls':>6} {'atol':>8} {'wall':>7} {'div':>4} {'its':>5} {'peak|σ|':>7} {'max|err|':>10} {'rms':>10}", + flush=True) + for r in all_runs: + atol_str = f"{r['atol']:.0e}" if r['atol'] is not None else "None" + print(f"{r['label']:<28} {r['softness']:>5} {r['linesearch']:>6} {atol_str:>8} " + f"{r['wall']:>7.1f} {r['diverged']:>4d} {r['mean_its']:>5.2f} " + f"{r['peak']:>7.4f} {r['max_err']:>10.3e} {r['rms']:>10.3e}", + flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/advanced/benchmarks/vardt-square.md b/docs/advanced/benchmarks/vardt-square.md new file mode 100644 index 00000000..f320c867 --- /dev/null +++ b/docs/advanced/benchmarks/vardt-square.md @@ -0,0 +1,140 @@ +--- 
+title: "Variable-dt square-wave (VE and VEP)" +--- + +# Square-wave shear with reduced timestep near BC discontinuities + +The VE and VEP square-wave cases concentrate their numerical error in a +small window around each BC flip — the discrete time derivative is +attempting to follow a corner in the analytical $\sigma(t)$. Reducing +$\Delta t$ inside that window and keeping it large on the plateaux is +exactly the kind of variable-timestep schedule that the projection +machinery has to handle robustly. + +This pair of benchmarks runs the VE and VEP square-wave problems with +$\Delta t$ = 0.10·$t_r$ on plateaux and 0.01·$t_r$ within ±0.20·$t_r$ +of every flip — a 10× reduction across the discontinuity. + +## Schedule + +``` +Δt(t) = 0.10·t_r on plateaux (≥ 0.20·t_r away from any flip) + 0.01·t_r within ±0.20·t_r of a flip +``` + +Step boundaries are clamped to the flip time so no step straddles a +discontinuity. + +## What this exercises + +* The DDt's snapshot machinery: every halve/double of $\Delta t$ + exposes a new $\Delta t$ ratio to the implicit projection. Without + the snapshot fix the previous-generation code drifted off the yield + surface by ~30% under exactly this schedule. +* The Picard / `divergence_retries` SNES rescue: VEP's first solve + inside a fine-window after a flip lands close to the yield kink and + occasionally takes a Newton step that fails the tolerance check + within 50 iterations; the retry mechanism recovers without manual + intervention. + +## Results + +### VE + +```{figure} ../figures/bench_ve_square_vardt.png +:width: 100% + +Top: BDF-1 (blue circles) and BDF-2 (red squares) overlaid on the +analytical (black) for the variable-Δt VE square wave. Driving γ̇ +shown in light blue fill. Middle: pointwise absolute error. Bottom: +the Δt schedule with the 10× drop visible at every flip. 
+``` + +| | BDF-1 | BDF-2 | +|---|---|---| +| max\|err\| | 2.38e-02 | 1.42e-02 | +| rms | 1.62e-02 | 6.17e-03 | + +For comparison, the fixed-Δt=0.10 run gave BDF-2 max\|err\| ≈ 8.07e-02 +— so the fine window around the flip is doing exactly what it should +(reducing the dominant per-flip error). + +### VEP (Min mode) + +```{figure} ../figures/bench_vep_square_vardt.png +:width: 100% + +Same layout, with τ_y = 0.5 yield surface guides (dashed grey). +``` + +| | BDF-1 | BDF-2 | +|---|---|---| +| peak\|σ\| | 0.5000 | 0.5004 | +| overshoots > 1.001·τ_y | 0 | 0 | +| max\|err\| | 2.15e-02 | 8.70e-02 | +| rms | 6.68e-03 | 1.53e-02 | + +**The yield surface holds**. With both the snapshot machinery and +the Picard-style SNES retry in place, σ stays clipped to ±τ_y under +the variable-Δt schedule that previously produced a ~30% drift. +Peak\|σ\| matches τ_y = 0.5 to four decimal places (BDF-1) and +within 0.1% (BDF-2 — the 0.0004 excess is a transient at one +loading-onset transition, not a sustained yield-surface violation). + +The BDF-2 max\|err\| being larger than BDF-1's is the same phase-lag +story as in the fixed-dt VEP case: at the loading→yield transition +the 2nd-order step occasionally lags by one fine-Δt step before +catching up. RMS — which is the more honest measure for a sharp +transition — is comparable to BDF-1's. + +### VEP (softmin mode, default δ = 0.1) + +```{figure} ../figures/bench_vep_square_vardt_softmin.png +:width: 100% + +Same problem with `yield_mode="softmin"` — replacing +$\min(\eta_{ve},\eta_{pl})$ with the smooth approximation +$\eta_{ve}/g(f)$, $g(f) = 1 + (f-1+\sqrt{(f-1)^2+\delta^2})/2$. +At δ = 0.1 the kink is differentiable but the plateau still tracks +the yield surface tightly. 
+``` + +| | BDF-1 | BDF-2 | +|---|---|---| +| peak\|σ\| | 0.4894 (97.9% of τ_y) | 0.4853 (97.1% of τ_y) | +| max\|err vs Min-clip\| | 8.39e-02 | 1.11e-01 | +| rms vs Min-clip | 6.12e-02 | 7.76e-02 | + +Softmin keeps the plateau within 2-3% of the true Min yield surface +while smoothing out the kink at $\eta_{ve} = \eta_{pl}$ — Newton sees +a continuous derivative and the SNES never needs the Picard retry +that Min mode occasionally triggers. The "error vs Min-clip" +figure-of-merit penalises the deliberately rounded transitions; it +is not an accuracy gap in any physically meaningful sense. + +### Why `yield_mode="smooth"` was retired + +The third yield-mode option, the harmonic-blend "smooth" formula +$\eta_{eff} = \eta_{ve}\,(1+f)/(1+f+f^2)$, was retired in this same +benchmark suite (commit 5936b46) after the variable-dt run made the +problem unmissable: + +```{figure} ../figures/bench_vep_square_vardt_smooth.png +:width: 100% + +`yield_mode="smooth"` plateaus at |σ| ≈ 0.24 — only ~50% of +τ_y = 0.5 — across every loading half-cycle. Both BDF orders +under-clip identically. This is not a transient; the driving +$\dot\gamma$ holds long enough to reach steady state on every plateau. +``` + +| | BDF-1 | BDF-2 | +|---|---|---| +| peak\|σ\| | 0.2599 (52.0% of τ_y) | 0.2355 (47.1% of τ_y) | +| max\|err vs Min-clip\| | 3.66e-01 | 3.81e-01 | + +The blend formula has the wrong asymptotic behaviour: at $f = 1$ +(the yield kink itself) it gives $\eta_{eff}/\eta_{ve} = 2/3$, and it +keeps reducing $\eta_{eff}$ deep into the plastic regime instead of +saturating at $\eta_{pl}$. `softmin` does not have this defect, so +`smooth` was removed and the setter now redirects users. 
diff --git a/docs/advanced/benchmarks/ve-harmonic.md b/docs/advanced/benchmarks/ve-harmonic.md new file mode 100644 index 00000000..e50a1c45 --- /dev/null +++ b/docs/advanced/benchmarks/ve-harmonic.md @@ -0,0 +1,109 @@ +--- +title: "VE — sinusoidal shear" +--- + +# Maxwell viscoelastic shear under sinusoidal forcing + +A Maxwell material driven by a sinusoidal shear-rate has a closed-form +stress response. This benchmark drives the simple-shear box with +$V_{\mathrm{top}}(t) = V_0 \sin(\omega t)$ and compares the centre-point +shear stress to the analytical solution. The check covers the +amplitude attenuation and phase lag that the Deborah number predicts. + +## Governing equation + +Maxwell constitutive law in shear: + +$$ +\dot\sigma + \frac{\sigma}{t_r} = \mu\,\dot\gamma(t), +\qquad t_r = \frac{\eta}{\mu}. +$$ + +For $\dot\gamma(t) = \dot\gamma_0 \sin(\omega t)$ with $\sigma(0) = 0$, +the closed-form solution is + +$$ +\sigma(t) = \frac{\eta\,\dot\gamma_0}{1 + \mathrm{De}^2} +\left[\sin(\omega t) - \mathrm{De}\,\cos(\omega t) ++ \mathrm{De}\,e^{-t/t_r}\right] +$$ + +with $\mathrm{De} = \omega\,t_r$ the Deborah number. After the +exponential transient (a few $t_r$) the stress oscillates as + +$$ +\sigma_\infty(t) = A_\infty \sin(\omega t - \varphi), +\qquad +A_\infty = \frac{\eta\,\dot\gamma_0}{\sqrt{1+\mathrm{De}^2}}, +\qquad +\varphi = \arctan(\mathrm{De}). 
+$$ + +## Setup + +| | | +|---|---| +| Mesh | `StructuredQuadBox` 16×8 over $\bigl(\pm 1,\pm 0.5\bigr)$ | +| Velocity field | $\mathbb{P}^2$ | +| Pressure field | $\mathbb{P}^1$ | +| Boundary conditions | top/bottom velocity = $\pm V_0\sin(\omega t)$, free at left/right | +| Time integration | BDF-1 *and* BDF-2 at $\Delta t = 0.05\,t_r$, plus a sweep over $\Delta t \in \{0.025, 0.05, 0.10, 0.20, 0.40\}\,t_r$ | +| BC sampling | $V_{\mathrm{top}}$ evaluated at the *endpoint* of each step | +| Shear viscosity | $\eta = 1$ | +| Shear modulus | $\mu = 1$ | +| Top velocity amplitude | $V_0 = 0.5$ → $\dot\gamma_0 = 1$ | +| Forcing frequency | $\omega = \pi/2$ → period $4\,t_r$, $\mathrm{De} = \pi/2 \approx 1.57$ | +| Run length | $4$ full periods | + +The strain rate uses the symmetric tensor convention +$\dot\varepsilon_{xy} = (\partial_y u_x + \partial_x u_y)/2$, so +$\dot\gamma = 2 V_0 / H = 1$ for $V_0 = 0.5$, $H = 1$. + +## Run + +```bash +pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_harmonic.py +pixi run -e amr-dev python docs/advanced/benchmarks/plot_benchmarks.py +``` + +The simulation logs to `output/benchmarks/ve_harmonic.npz` (per-step +trace + analytical reference at the same time points). Re-running the +plot script doesn't re-run the simulation. + +## Results + +```{figure} ../figures/bench_ve_harmonic.png +:width: 100% + +Top: BDF-1 (blue open circles) and BDF-2 (red filled squares) +overlaid on the closed-form solution (black) and the rescaled +sinusoidal forcing (light blue fill) for context. Middle: pointwise +absolute error for both orders. Bottom: time-step. Inset compares +fitted vs analytical amplitude and phase lag. +``` + +At $\mathrm{De} = \pi/2$ the analytical amplitude is +$A_\infty = 1/\sqrt{1+\pi^2/4} \approx 0.537$ and the phase lag is +$\varphi = \arctan(\pi/2) \approx 1.004$ rad. 
At $\Delta t = +0.05\,t_r$ BDF-2 recovers both the amplitude and the phase lag to +within $10^{-3}$ rad; BDF-1 is off by a few percent in the phase +(the residual O($\Delta t$) error of an implicit-Euler scheme). + +```{figure} ../figures/bench_convergence.png +:width: 100% + +Convergence sweep — left panel is the harmonic case. BDF-1 sits on +slope 1 (dotted reference); BDF-2 (rms, lower red dotted line) +hits slope 2 (dashed reference) cleanly between $\Delta t = 0.4$ and +$0.1$ before levelling off at the fine end (where the BDF-2 startup +transient — first one or two steps that effectively run at BDF-1 — +becomes the dominant contribution). +``` + +The benchmark surfaces a subtle but important detail: $V_{\mathrm{top}}$ +is sampled at the step *endpoint* (i.e.\\ the time BDF's implicit step +solves for), not at the midpoint. Midpoint sampling is only +1st-order accurate to the endpoint value; using it would limit BDF-2 +to slope-1 convergence even though the time integrator itself is +2nd-order. Same nominal mesh, dt schedule, and tolerance — only the +BC sampling differs. diff --git a/docs/advanced/benchmarks/ve-oscillatory-shear.md b/docs/advanced/benchmarks/ve-oscillatory-shear.md deleted file mode 100644 index 03e95c43..00000000 --- a/docs/advanced/benchmarks/ve-oscillatory-shear.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -title: "Viscoelastic Oscillatory Shear Benchmark" ---- - -# Maxwell Oscillatory Shear - -This benchmark validates the viscoelastic Stokes solver against the analytical -solution for a Maxwell material under oscillatory simple shear. - -## Problem Setup - -A box with height $H$ and width $2H$ is sheared by imposing time-dependent -velocities on the top and bottom boundaries: - -$$v_x(y=\pm H/2, t) = \pm V_0 \sin(\omega t)$$ - -The left and right boundaries are free-slip (no vertical velocity). The shear -rate is $\dot\gamma(t) = \dot\gamma_0 \sin(\omega t)$ where $\dot\gamma_0 = 2V_0/H$. 
- -## Analytical Solution - -The Maxwell constitutive law gives: - -$$\dot\sigma_{xy} + \frac{\sigma_{xy}}{t_r} = \mu \dot\gamma_0 \sin(\omega t)$$ - -where $t_r = \eta/\mu$ is the relaxation time. With $\sigma(0) = 0$, the full -solution (including the startup transient) is: - -$$\sigma_{xy}(t) = \frac{\eta \dot\gamma_0}{1 + \text{De}^2} -\left[\sin(\omega t) - \text{De}\cos(\omega t) + \text{De}\,e^{-t/t_r}\right]$$ - -where $\text{De} = \omega t_r$ is the Deborah number. - -**Steady-state properties** (after transient decays): - -- Amplitude: $A = \eta \dot\gamma_0 / \sqrt{1 + \text{De}^2}$ -- Phase lag: $\delta = \arctan(\text{De})$ - -At $\text{De} = 0$ (viscous limit): $A = \eta\dot\gamma_0$, $\delta = 0$. -At $\text{De} \to \infty$ (elastic limit): $A \to 0$, $\delta \to 90°$. - -## Convergence with BDF Order - -The VE solver uses BDF-$k$ time integration ($k = 1, 2, 3$). The convergence -study (constant shear, $\text{De} = 1$) shows: - -| $\Delta t / t_r$ | BDF-1 error | BDF-2 error | BDF-3 error | -|-------------------|-------------|-------------|-------------| -| 0.200 | 3.0e-02 | 4.0e-03 | 6.4e-03 | -| 0.100 | 1.5e-02 | 9.3e-04 | 1.7e-03 | -| 0.050 | 7.8e-03 | 2.3e-04 | 4.3e-04 | -| 0.020 | 3.1e-03 | 3.7e-05 | — | - -BDF-2 achieves second-order convergence (~4x error reduction per halving) and -is the recommended default. BDF-1 is first-order. BDF-3 converges at nearly -second order but with a larger error constant. - -## Resolution Study (Oscillatory, De = 5) - -At high Deborah number, the oscillation period is short relative to the -relaxation time, requiring fine time resolution. 
The plot below shows the -effect of timestep size at $\text{De} = 5$ ($\omega t_r = 5$, phase lag = 79°): - -- **63 pts/period** ($\Delta t/t_r = 0.02$): both orders match analytical -- **31 pts/period** ($\Delta t/t_r = 0.04$): O1 shows slight amplitude reduction, O2 still accurate -- **16 pts/period** ($\Delta t/t_r = 0.08$): O1 amplitude visibly damped, O2 remains good - -```{note} -The amplitude reduction at coarse timesteps is numerical dissipation from -the BDF-1 discrete transfer function, not a cumulative error. The discrete -steady-state amplitude is a fixed fraction of the analytical amplitude, -determined by $\omega \Delta t$. -``` - -## Running the Benchmarks - -```bash -# Oscillatory validation (De=1.5, order 1 and 2) -python tests/plot_ve_oscillatory_validation.py - -# Resolution study (De=5, three timestep sizes) -# Saves .npz data files for re-analysis -python tests/plot_ve_oscillatory_validation.py - -# Replot from saved data (no re-running) -python tests/plot_ve_oscillatory_validation.py --replot -``` - -## Notes on `dt_elastic` - -The parameter `dt_elastic` on the constitutive model is the elastic relaxation -timescale used in the BDF discretisation. It controls the effective viscosity -$\eta_{\text{eff}}$ and the stress history weighting: - -- BDF-1: $\eta_{\text{eff}} = \eta\mu\Delta t_e / (\eta + \mu\Delta t_e)$ -- BDF-2: $\eta_{\text{eff}} = 2\eta\mu\Delta t_e / (3\eta + 2\mu\Delta t_e)$ -- BDF-3: $\eta_{\text{eff}} = 6\eta\mu\Delta t_e / (11\eta + 6\mu\Delta t_e)$ - -When `timestep` is passed to `VE_Stokes.solve()`, it controls the advection -step for semi-Lagrangian history transport. It does **not** overwrite -`dt_elastic` — these are independent parameters. - -A running-average approach for accumulating history when $\Delta t \ll \Delta t_e$ -was investigated but found to be extremely diffusive for semi-Lagrangian -transport and is not implemented. To prevent runaway or unstable behaviour -when timesteps become small (e.g. 
due to CFL constraints or failure events), -we advise limiting the minimum effective viscosity, in line with the physics -of the problem. diff --git a/docs/advanced/benchmarks/ve-square-wave-shear.md b/docs/advanced/benchmarks/ve-square-wave-shear.md deleted file mode 100644 index 6d80998a..00000000 --- a/docs/advanced/benchmarks/ve-square-wave-shear.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -title: "Viscoelastic Square-Wave Shear Benchmark" ---- - -# Maxwell Square-Wave Shear - -This benchmark validates the viscoelastic Stokes solver with **variable timesteps** -against an analytical solution for a Maxwell material under square-wave shear forcing. - -It tests both the variable-dt BDF-2 coefficients and the PetscDS constants mechanism -that routes these coefficients to the compiled pointwise functions at runtime. - -## Problem Setup - -Same geometry as the oscillatory shear benchmark: a box with height $H$ and width $2H$, -sheared by top/bottom boundary velocities. The shear rate is a truncated Fourier series -approximation of a square wave: - -$$\dot\gamma(t) = \frac{4\dot\gamma_0}{\pi} -\sum_{k=1}^{N} \frac{\sin\bigl((2k-1)\omega t\bigr)}{2k-1}$$ - -The sharp transitions between positive and negative shear demand small timesteps -near the transition points, while the plateaux can use much larger steps. This -makes it a natural test for adaptive (variable) timestepping. - -## Analytical Solution - -Since the Maxwell equation is linear, the stress is the superposition of -single-frequency Maxwell solutions at each Fourier harmonic: - -$$\sigma_{xy}(t) = \sum_{k=1}^{N} \sigma_k(t)$$ - -where each $\sigma_k$ is the oscillatory Maxwell solution with amplitude -$a_k = 4\dot\gamma_0 / (\pi(2k-1))$ and frequency $\omega_k = (2k-1)\omega$: - -$$\sigma_k(t) = \frac{\eta\, a_k}{1 + \text{De}_k^2} -\left[\sin(\omega_k t) - \text{De}_k\cos(\omega_k t) -+ \text{De}_k\,e^{-t/t_r}\right]$$ - -with $\text{De}_k = \omega_k t_r$. 
- -## Adaptive Timestep Strategy - -The timestep varies between `dt_min` near transitions and `dt_max` on plateaux, -based on distance to the nearest square-wave transition point: - -$$\Delta t = \Delta t_{\min} + (\Delta t_{\max} - \Delta t_{\min})\, f^2$$ - -where $f \in [0,1]$ measures the normalised distance from the nearest transition. - -## Variable-dt BDF Coefficients - -With uniform timesteps, BDF-2 uses constant coefficients $[3/2, -2, 1/2]$. -With variable timesteps (ratio $r = \Delta t_n / \Delta t_{n-1}$), the -coefficients become: - -$$c_0 = \frac{1+2r}{1+r}, \quad c_1 = -(1+r), \quad c_2 = \frac{r^2}{1+r}$$ - -These coefficients are stored as UWexpressions and updated each step via -`_update_bdf_coefficients()`, flowing through PetscDS `constants[]` to the -compiled pointwise functions without JIT recompilation. - -## Results (De=1.5, BDF-2, 10 harmonics) - -| Run | Steps | L2 Error | Ratio | -|-----|-------|----------|-------| -| Adaptive dt | 295 | 9.55e-04 | 1.78x | -| Uniform dt | 629 | 5.35e-04 | 1.0x | - -The adaptive run uses 53% fewer steps at only 1.78x the error. - -## Running the Benchmark - -```bash -pixi run -e default python docs/advanced/benchmarks/run_ve_square_wave.py -``` - -The script runs both adaptive and uniform timestep cases, prints convergence -data, and saves results to `.npz` files. diff --git a/docs/advanced/benchmarks/ve-square.md b/docs/advanced/benchmarks/ve-square.md new file mode 100644 index 00000000..0cbfee07 --- /dev/null +++ b/docs/advanced/benchmarks/ve-square.md @@ -0,0 +1,100 @@ +--- +title: "VE — square-wave shear" +--- + +# Maxwell viscoelastic shear under square-wave forcing + +A Maxwell material driven by a square-wave shear rate also has a +closed-form solution: within each half-period the stress relaxes +exponentially toward the new steady-state value. This benchmark +exercises the BDF-2 stress-history integrator at the BC discontinuities, +where the time derivative has jumps. 
+ +## Governing equation + +Same Maxwell ODE as the harmonic case, + +$$ +\dot\sigma + \frac{\sigma}{t_r} = \mu\,\dot\gamma(t), +$$ + +but now $\dot\gamma(t) = s_n\,\dot\gamma_0$ where $s_n = (-1)^n$ is the +sign during half-period $n$ of length $T_{1/2}$. Within half-period +$n$ (with $t_n = n\,T_{1/2}$ and initial value $\sigma_{0,n}$), + +$$ +\sigma(t) = s_n\sigma_{\mathrm{ss}} ++ \bigl(\sigma_{0,n} - s_n\sigma_{\mathrm{ss}}\bigr)\, +e^{-(t-t_n)/t_r}, +\qquad +\sigma_{\mathrm{ss}} = \eta\,\dot\gamma_0, +$$ + +and the next half-period's initial value is the previous one's end +value: + +$$ +\sigma_{0,n+1} = s_n\sigma_{\mathrm{ss}} ++ \bigl(\sigma_{0,n} - s_n\sigma_{\mathrm{ss}}\bigr)\, +e^{-T_{1/2}/t_r}. +$$ + +After a few periods the response settles into a periodic envelope +between $\pm\sigma_{\mathrm{ss}}\tanh\bigl(T_{1/2}/(2 t_r)\bigr)$. + +## Setup + +| | | +|---|---| +| Mesh | `StructuredQuadBox` 16×8 over $\bigl(\pm 1,\pm 0.5\bigr)$ | +| Velocity field | $\mathbb{P}^2$ | +| Pressure field | $\mathbb{P}^1$ | +| Time integration | BDF-2, $\Delta t = 0.10\,t_r$ | +| Shear viscosity | $\eta = 1$ | +| Shear modulus | $\mu = 1$ | +| Top velocity amplitude | $V_0 = 0.5$ → $\dot\gamma_0 = 1$ | +| Half-period | $T_{1/2} = 2\,t_r$ | +| Run length | 4 full periods (= $8\,T_{1/2}$) | + +## Run + +```bash +pixi run -e amr-dev python docs/advanced/benchmarks/bench_ve_square.py +pixi run -e amr-dev python docs/advanced/benchmarks/plot_benchmarks.py +``` + +Logs to `output/benchmarks/ve_square.npz`. + +## Results + +```{figure} ../figures/bench_ve_square.png +:width: 100% + +Top: BDF-1 (blue open circles) and BDF-2 (red filled squares) +overlaid on the analytical envelope (black) over four periods of the +square-wave forcing (light blue fill). Middle: pointwise absolute +error for both orders on a log scale; the bumps coincide with the BC +flips at $t = 2, 4, 6, \ldots\, t_r$ where the analytical +$\dot\sigma$ has a jump. Bottom: time-step (constant for this run). 
+``` + +Both orders track the analytical envelope. The asymptotic per-period +envelope amplitude is $\sigma_{\mathrm{ss}}\tanh(T_{1/2}/(2t_r)) = +\tanh(1) \approx 0.762$, reached within two periods. + +```{figure} ../figures/bench_convergence.png +:width: 100% + +Convergence sweep — middle panel is the square-wave case. BDF-1 +shows clean slope 1. BDF-2 starts steeper than slope 1 and trends +toward slope 2 as $\Delta t$ shrinks, but at the dt range plotted +the BC discontinuity at every half-period flip injects an +$O(\Delta t)$ contribution that masks BDF-2's slope-2 advantage — +the constant ratio between BDF-1 and BDF-2 errors is the asymptote +the BDF-2 line is bending towards as the BC-flip contribution +becomes subdominant. +``` + +The error decays exponentially within each half-period as the +discrete history catches up with the new ramp; the decay rate matches +the Maxwell relaxation time $t_r$. diff --git a/docs/advanced/benchmarks/vep-square.md b/docs/advanced/benchmarks/vep-square.md new file mode 100644 index 00000000..ad46fe98 --- /dev/null +++ b/docs/advanced/benchmarks/vep-square.md @@ -0,0 +1,122 @@ +--- +title: "VEP — square-wave shear (Min mode)" +--- + +# Visco-elastic-plastic shear under square-wave forcing + +Add a yield surface to the square-wave VE benchmark and the closed-form +solution is just the *clipped* version of the VE square-wave: within each +half-period the stress evolves exponentially toward +$\pm\sigma_{\mathrm{ss}}$ but is held at $\pm\tau_y$ while the material +is yielding. + +This benchmark verifies the implementation of Min-mode plasticity, the +yield-surface clip itself, and — under variable timestep — the +projection-snapshot machinery in `SemiLagrangian` DDt that prevents the +implicit-projection drift at the Min kink (see the regression test in +`tests/test_1052_VEP_stability_regression.py`). 
+ +## Governing equation + +Maxwell evolution with a Min-mode yield surface: + +$$ +\dot\sigma + \frac{\sigma}{t_r} = \mu\,\dot\gamma(t), +\qquad +\eta_{\mathrm{eff}} = \min\bigl(\eta_{\mathrm{ve}},\,\eta_{\mathrm{pl}}\bigr), +\qquad +\eta_{\mathrm{pl}} = \frac{\tau_y}{2\,|\dot\varepsilon_{\mathrm{eff}}|}. +$$ + +Within each half-period the analytical solution is + +$$ +\sigma(t) = \mathrm{clip}\Bigl( +s_n\sigma_{\mathrm{ss}} ++ (\sigma_{0,n} - s_n\sigma_{\mathrm{ss}})\,e^{-(t-t_n)/t_r}, +\;-\tau_y,\;+\tau_y +\Bigr). +$$ + +Because the yielded portion holds $\sigma = \pm\tau_y$ exactly, the +*clipped* value carries forward as the next half-period's initial +condition: + +$$ +\sigma_{0,n+1} = \mathrm{clip}\bigl(\sigma(t_n+T_{1/2}), +\;-\tau_y,\;+\tau_y\bigr). +$$ + +When $\eta\,\dot\gamma_0 > \tau_y$ (yielding occurs) the response +saturates at $\pm\tau_y$ during the second half of each half-period. + +## Setup + +| | | +|---|---| +| Mesh | `StructuredQuadBox` 16×8 over $\bigl(\pm 1,\pm 0.5\bigr)$ | +| Velocity field | $\mathbb{P}^2$ | +| Pressure field | $\mathbb{P}^1$ | +| Time integration | BDF-2, $\Delta t = 0.10\,t_r$ | +| Shear viscosity | $\eta = 1$ | +| Shear modulus | $\mu = 1$ | +| Yield stress | $\tau_y = 0.5$ (so $\eta\dot\gamma_0 / \tau_y = 2$) | +| Yield mode | `min` | +| Top velocity amplitude | $V_0 = 0.5$ → $\dot\gamma_0 = 1$ | +| Half-period | $T_{1/2} = 2\,t_r$ | +| Run length | 4 full periods | + +## Run + +```bash +pixi run -e amr-dev python docs/advanced/benchmarks/bench_vep_square.py +pixi run -e amr-dev python docs/advanced/benchmarks/plot_benchmarks.py +``` + +Logs to `output/benchmarks/vep_square.npz`. + +## Results + +```{figure} ../figures/bench_vep_square.png +:width: 100% + +Top: BDF-1 (blue open circles) and BDF-2 (red filled squares) +overlaid on the analytical clipped solution (black), yield surface +guides $\pm\tau_y$ (dashed grey), and rescaled forcing (light blue +fill). 
Middle: pointwise absolute error for both orders on a log +scale — note the dramatic drop to $\sim 10^{-6}$ during yielded +plateaux where simulation and analytical both sit at $\pm\tau_y$ to +machine precision. Bottom: time-step. +``` + +Two things to read from the per-case plot: + +1. **The yield surface holds for both orders**. Peak $|\sigma|$ + matches $\tau_y = 0.5$ to four decimal places at the canonical dt + for both BDF-1 and BDF-2; the count of overshoots + ($|\sigma| > 1.001\,\tau_y$) is zero in both runs. This is the + regression that the + [variable-dt yield-lock test](../../../tests/test_1052_VEP_stability_regression.py) + protects against re-introduction. + +2. **The error has structure**. During yielded plateaux the + simulation matches the analytical to machine precision (the + $\sim 10^{-6}$ floor is the projection's L2 residual). During the + elastic loading/unloading transients the per-step truncation error + peaks just after each BC flip and decays within the half-period. + +```{figure} ../figures/bench_convergence.png +:width: 100% + +Convergence sweep — right panel is the VEP case. BDF-1 shows clean +slope 1. BDF-2 follows the same trend with a constant ratio above +BDF-1, until the smallest $\Delta t$ where a transient overshoot +arrives one step late on the yield-onset transition; this shows up +in the max-norm but not the rms. Peak $|\sigma|$ stays within 1.3 % +of $\tau_y$ at every $\Delta t$ tested. +``` + +The benchmark's strict accuracy requirement is the yield-surface peak, +not the transient error: any future change that produces $|\sigma| > +\tau_y$ on a fixed-dt yielded plateau by more than the yield-lock +test's tolerance fails the regression suite. 
diff --git a/docs/advanced/figures/bench_convergence.png b/docs/advanced/figures/bench_convergence.png new file mode 100644 index 00000000..3020114b Binary files /dev/null and b/docs/advanced/figures/bench_convergence.png differ diff --git a/docs/advanced/figures/bench_ti_vep_harmonic.png b/docs/advanced/figures/bench_ti_vep_harmonic.png new file mode 100644 index 00000000..1d2a91a3 Binary files /dev/null and b/docs/advanced/figures/bench_ti_vep_harmonic.png differ diff --git a/docs/advanced/figures/bench_ve_harmonic.png b/docs/advanced/figures/bench_ve_harmonic.png new file mode 100644 index 00000000..00ced4f4 Binary files /dev/null and b/docs/advanced/figures/bench_ve_harmonic.png differ diff --git a/docs/advanced/figures/bench_ve_square.png b/docs/advanced/figures/bench_ve_square.png new file mode 100644 index 00000000..f4ebb8a6 Binary files /dev/null and b/docs/advanced/figures/bench_ve_square.png differ diff --git a/docs/advanced/figures/bench_ve_square_vardt.png b/docs/advanced/figures/bench_ve_square_vardt.png new file mode 100644 index 00000000..83c36f19 Binary files /dev/null and b/docs/advanced/figures/bench_ve_square_vardt.png differ diff --git a/docs/advanced/figures/bench_vep_square.png b/docs/advanced/figures/bench_vep_square.png new file mode 100644 index 00000000..dbad9e82 Binary files /dev/null and b/docs/advanced/figures/bench_vep_square.png differ diff --git a/docs/advanced/figures/bench_vep_square_vardt.png b/docs/advanced/figures/bench_vep_square_vardt.png new file mode 100644 index 00000000..60da2589 Binary files /dev/null and b/docs/advanced/figures/bench_vep_square_vardt.png differ diff --git a/docs/advanced/figures/bench_vep_square_vardt_smooth.png b/docs/advanced/figures/bench_vep_square_vardt_smooth.png new file mode 100644 index 00000000..12139df6 Binary files /dev/null and b/docs/advanced/figures/bench_vep_square_vardt_smooth.png differ diff --git a/docs/advanced/figures/bench_vep_square_vardt_softmin.png 
b/docs/advanced/figures/bench_vep_square_vardt_softmin.png new file mode 100644 index 00000000..c0c0705e Binary files /dev/null and b/docs/advanced/figures/bench_vep_square_vardt_softmin.png differ diff --git a/docs/api/systems_ddt.md b/docs/api/systems_ddt.md index 1bc7e942..d9ded7df 100644 --- a/docs/api/systems_ddt.md +++ b/docs/api/systems_ddt.md @@ -8,6 +8,10 @@ and ``update_post_solve(dt)`` after the solve completes. History is initialised automatically on the first solve call, and BDF order ramps from 1 up to the requested ``order`` over the first few timesteps. +For analytical-IC benchmarks (no startup transient) or checkpoint restarts, +``set_initial_history(values, dt=...)`` plants the BDF history directly and +bypasses the order ramp, so the first solve runs at full BDF order. + ## Base Class ### Symbolic diff --git a/docs/developer/CHANGELOG.md b/docs/developer/CHANGELOG.md index 3a2e4239..0d068fd1 100644 --- a/docs/developer/CHANGELOG.md +++ b/docs/developer/CHANGELOG.md @@ -6,6 +6,30 @@ This log tracks significant development work at a conceptual level, suitable for ## 2026 Q2 (April – June) +### DDt.set_initial_history — Public API for BDF Restart (April 2026) + +**New `set_initial_history(values, dt=...)` method on `SemiLagrangian` and +`Eulerian` DDt classes** to plant BDF history at the start of a run. +Two use cases: + +- **Analytical IC for benchmarks** — populate ψ* from a known closed-form + solution so the very first solve runs at full BDF order with no startup + transient. The `bench_ve_harmonic.py` peak-start benchmark used the manual + pattern (poking four private attributes including `psi_star[k].array`, + `_n_solves_completed`, `_dt_history`); the new API wraps that cleanly. +- **Checkpoint restart** — resume a multistep history from disk without + re-ramping `effective_order` from BDF-1 over the first `order` steps. 
+ +Sets `psi_star[0..order-1].array`, marks history initialised, +seeds `_dt_history` for variable-dt BDF coefficients, and warns when +`order >= 2` is called without `dt`. Six unit tests cover bookkeeping, +scalar broadcast, length validation, and the warning path. + +**Files**: `src/underworld3/systems/ddt.py`, +`docs/advanced/benchmarks/bench_ve_harmonic.py`, +`tests/test_1052_ddt_set_initial_history.py`, +`docs/api/systems_ddt.md`. + ### Multi-Component Projection Solver (April 2026) **New `SNES_MultiComponent_Projection` solver** that projects N scalar components in a single PETSc SNES solve sharing one DM, replacing the per-component cycling in `SNES_Tensor_Projection` (which tore down and rebuilt the DM on each inner iteration). The underlying `SNES_MultiComponent` Cython base decouples the FE component count from `mesh.dim` — PETSc's pointwise callback interface accepts any DOF count per node; the new class exposes that directly. diff --git a/docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md b/docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md new file mode 100644 index 00000000..f9dfe42f --- /dev/null +++ b/docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md @@ -0,0 +1,436 @@ +# Exponential Integrator for VE / VEP Constitutive Updates — Implementation Plan + +**Status**: **ETD-1 ships as the recommended default** (2026-04-29, 27 commits). ETD-1 reproduces BDF-1 essentially exactly on the deep-yield TI killer test (σ_∥ peak 1.04·τ_y, |u_y| peak 0.0320, SNES 1.8 mean iters — all identical to BDF-1) AND inherits ETD's analytical exponential factor for the linear-relaxation part. Phase B (ETD-2, single α/φ), Phase D (per-component split), and Phase E (hybrid BDF/ETD) remain on the branch as instructive failures — they don't ship. + +**TL;DR**: +- **The lesson**: the drift/blow-up on VEP+yield is order-driven, not algorithm-driven. 
**First-order methods (BDF-1, ETD-1) are L-stable and damp the high-frequency modes that plastic yield transitions excite**; higher-order methods (BDF-2, ETD-2 lumped/split/hybrid) preserve those modes and let them grow. Recognising this collapses the whole "ETD doesn't work for fault mechanics" narrative — it's *higher-order* ETD that doesn't work, same as higher-order BDF. +- **Production recommendation**: `integrator='etd', order=1` for everything. Single-step like BDF-1, no forcing-history mesh variable, fully L-stable, with the analytical exp factor for the linear part. Killer-test trajectory **byte-identical to BDF-1** in σ_∥ and |u_y|; ~5% slower wall-clock. +- **Higher-order ETD on smooth VE** (no yield): ETD-2 still beats BDF-2 by 4.3× on `bench_ve_harmonic`. Available as `integrator='etd', order=2` for users who know their problem is fully VE. +- **Higher-order anything on tight-yield TI**: don't use. BDF-2, ETD-2 lumped, ETD-2 split + lag + cap, ETD-2 hybrid — all show drift or blow-up of various flavours. + +**Branch**: `feature/exp-integrator-investigation` + +**API (production)**: +- `ViscoElasticPlasticFlowModel(unknowns, integrator='etd', order=1)` — ETD-1 (first-order). **Default-recommended for new code** — BDF-1 stability + analytical exp factor for the linear-relaxation part. +- `TransverseIsotropicVEPFlowModel(unknowns, integrator='etd', order=1)` — same, TI variant. +- `integrator='bdf'` on the same classes (with `order=1` or `2`) — production default, unchanged behaviour. Same accuracy class as ETD-1 but with rational rather than analytical relaxation factor. +- `integrator='etd', order=2` (Phase B ETD-2) — second-order, accurate on smooth VE (4.3× better than BDF-2 on `bench_ve_harmonic`); **avoid in VEP+yield regime** (catastrophic σ/u runaway documented in lessons #7, #9). +- Sibling `MaxwellExponentialFlowModel` / `TransverseIsotropicMaxwellExponentialFlowModel` survive as thin aliases for backwards compat. 
+ +**API (experimental — investigative, not for production)**: +- `TransverseIsotropicVEPSplitFlowModel` (Phase D): per-component split with τ-cap. σ enforcement OK, `|u_y|` ratchets. +- `TransverseIsotropicVEPFlowModel(integrator='hybrid', fault_weight=...)` (Phase E): spatial blend. σ enforcement OK, `|u_y|` drifts. +- Both retained on the branch for reference; docstrings marked EXPERIMENTAL. + +--- + +## TL;DR + +For Maxwell-type viscoelasticity $\dot\sigma + \sigma/\tau = \mu\dot\gamma$, integrate the relaxation operator analytically and approximate only the forcing: + +$$\sigma^{n+1} = \alpha\,\sigma^n + \mu(A\,\dot\gamma^{n+1} + B\,\dot\gamma^n)$$ + +with $\alpha = e^{-\Delta t/\tau}$, $\varphi = (1-\alpha)\tau/\Delta t$, $A = \tau(1-\varphi)$, $B = \tau(\varphi-\alpha)$. + +Numerically validated: **5–12× more accurate than BDF-2** at small Δt, **decisively better at Δt ≈ τ** (where BDF-1/2 over-damp to near-zero output), **structurally avoids the BDF-2 multistep instability** seen in TI-VEP + spatial yield_stress (no second history term to amplify through the autodiff Jacobian). + +The integrator stores one slot of σ-history *and* one slot of γ̇-history. Yield handling via standard return-mapping. The DDt class hierarchy already supports multiple parallel integrator coefficient sets (`_bdf_coeffs`, `_am_coeffs`); adding `_exp_coeffs` and an optional forcing-history storage stream is a peer extension, ~200 lines. + +--- + +## Implementation phasing + +### Phase B — UW3 prototype (next session, est. 3–5 days) + +**Goal**: Match BDF-2's `bench_ve_harmonic` accuracy (1.34e-3) with single-step exponential, in a clean implementation. + +**Tasks** (in order): + +1. **Resolve UWexpression-to-JIT propagation** (~half day) + - The Phase B jury-rig (`_exp_integrator_uw3_jury_rig.py`) hit a JIT propagation snag: setting `cm._exp_alpha.sym = X` per step doesn't reach the JIT-compiled flux. 
The BDF path's `_bdf_c0..c3` *do* propagate via `_update_constants()` — replicate that mechanism for `_exp_alpha`, `_exp_phi`. + - Likely fix: subclass-level `_update_constants` or piggyback on the existing constants-manifest registration in `SolverBaseClass`. + +2. **Extend `SemiLagrangian` DDt with exponential integrator** (~1 day, ~200 lines) + - Add `_exp_coeffs = _create_exp_coefficients(...)` parallel to existing `_bdf_coeffs`/`_am_coeffs` + - Add `with_forcing_history=False` constructor parameter; when True, allocate `forcing_star` MeshVariable and wire projection-snapshot machinery (mirror what's done for `psi_star`) + - Add `update_post_solve` branch that calls `_update_exp_values(dt, tau_eff)` and projects the current strain rate into `forcing_star[0]` (use `SNES_MultiComponent_Projection` — already used for VE-Stokes' tau projection) + - Add `exp_history_term()` peer method to `bdf()` and `adams_moulton_flux()` + +3. **Add `MaxwellExponentialFlowModel`** (~half day, ~150 lines) + - Sibling of `ViscoElasticPlasticFlowModel`. `requires_stress_history = True`, but the auto-DDt creation path uses `with_forcing_history=True` instead of `order=k` + - Stress: `σ = 2η(1-φ)·ε̇ + DFDt.exp_history_term()` + - Yield handling: the `viscosity` property wraps with softmin/min as today, replacing η(1-φ) where it appears + - Lagged-τ: each `_update_constants()` call pulls τ_eff from the most recent post-solve projected stress and uses it for next step's α, φ, A, B + +4. **Validate on existing benchmarks** (~half day) + - `bench_ve_harmonic` — must match BDF-2's max\|err\| = 1.34e-3 at peak-start IC, or be stricter + - `bench_ve_square_vardt` — must match BDF-2's accuracy under variable Δt + - `bench_vep_square` (Min mode) — peak \|σ\| within 1% of τ_y, matching the snapshot-fix BDF-2 baseline + - All 20 existing VE/VEP regression tests still pass + +5. 
**The killer test** (~half day) + - `bench_ti_vep_harmonic` at θ ∈ {0°, ±15°}, τ_y ∈ {0.15, 0.30}, with the spatial yield_stress field + - **Decision gate**: peak \|σ_xy\| must stay bounded (≲ 1.1·τ_y in fault zone, ≲ A_∞ in bulk) for all 6 (θ, τ_y) combinations. BDF-2 currently produces 10⁸ blow-up here; exp should run cleanly. This is the empirical proof of the structural argument. + +### Phase C — Particle / Lagrangian extension (later session) + +`Lagrangian_DDt` and `Lagrangian_Swarm_DDt` are siblings of `SemiLagrangian`; they already share the BDF/AM coefficient API. Mirror the Phase B changes: +- Add `_exp_coeffs` and `exp_history_term()` +- Add forcing-history slot (a swarm variable in the `Lagrangian_Swarm` case) +- The integrator-method API is storage-agnostic; nothing the constitutive model calls needs to change + +### Phase D — Per-component (α_⊥, φ_⊥)/(α_∥, φ_∥) for TI VEP — DONE (2026-04-29) + +The rank-4 TI modulus splits cleanly into two orthogonal projectors: +$$\mathbf{C} = 2\eta_0 \, \mathbf{P}_\perp + 2\eta_\parallel^\text{eff} \, \mathbf{P}_\parallel$$ + +with `P_∥` the director-aligned projector (the `K` kernel of the original `_build_c_tensor`) and `P_⊥ = I_4 - P_∥`. Each projector has its own Maxwell relaxation time during yielding (τ_⊥ = η_0/μ stays at the matrix value while τ_∥ = η_∥_eff/μ collapses). Phase B's single lumped (α, φ) cannot represent both timescales; per-component decomposition can. + +**Validated in 1D cleanroom first** (`_exp_integrator_phase_d_split.py`): two parallel Maxwell branches with disparate τ, sinusoidal forcing, closed-form analytical reference. Per-component matches analytical to discretisation order (slope-2 in Δt, max\|err\|/A_∞ ≈ 5e-6 at Δt=0.005); every lumped variant carries Δt-independent error 7%-142% — the splitting is structurally required when τ_⊥ ≠ τ_∥, not a Δt issue. + +**UW3 implementation** as `TransverseIsotropicVEPSplitFlowModel` (`src/underworld3/constitutive_models.py`): + +1. 
**Sub-moduli**: `_build_split_c_tensors(η_⊥, η_∥)` returns `C_⊥ = 2η_⊥·(I-K)` and `C_∥ = 2η_∥·K` by zeroing one viscosity in the existing `_build_c_tensor` loop. Sum recovers original C. + +2. **Lagged η_∥ via `forcing_star`**: `_eta_par_eff_lagged()` reuses the parent's softmin envelope but reads the rate from `forcing_star.sym` (projected previous-step ε̇) instead of `self.E_eff.sym` (current Newton iterate). Breaks the per-quad-split's 1-iter trivial-Newton failure mode (where α_∥ depends on η_∥ depends on E_eff depends on Newton's u, collapsing to fixed point). + +3. **Explicit-parallel plasticity**: both `α_∥, φ_∥` AND the C_∥ multiplier use the lagged η — fully Picard for the parallel branch. ETD's E_eff has weak σ-history coupling (`α/(2η_1) ≈ 0.5` vs BDF's `1/(2μΔt) ≈ 10`) so the parent's _eta_par_eff would not see the yielded state on the current iterate; using forcing_star sees it because |γ̇*| is large there. BDF-1 effectively does the same Picard treatment via its E_eff magnification. + +4. **Soft cap on x_par** (recommendation #4): `x_eff = (1 - exp(-c·x_natural))/c` keeps `α_∥ ≥ exp(-1/c)`, equivalent to `τ_∥ ≥ c·Δt`. User-tunable via `cm.tau_par_cap_factor` (default c=1.0). This shape pre-evaluates to a finite scalar at codegen-time defaults (dt=∞, μ=∞, Pint(1, "Pa·s") for η) where additive forms hit `oo+Pint` dimensional clashes. + +5. **σ_∥ probe added** to all three killer-test runners: resolved fault-shear `|σ_∥| = √(|σ·n|² - (n·σ·n)²)` measured at fault centre per step. The previously-used `|σ_xy|` global-frame probe overshoots the yield surface in BDF too (2.15·τ_y) — `|σ_∥|` is the right comparator and shows BDF sits at 1.04·τ_y (essentially exact). 
+ +**Killer-test outcome** (θ=+15°, τ_y=0.05, RES=32, 1.5T): + +| metric | BDF-1 | ETD lumped | split (Newton-impl, c=0) | split + cap (c=1.0) | +| --- | --- | --- | --- | --- | +| centre `\|σ_∥\|` peak | **1.04·τ_y** | 2.06·τ_y | 4.15·τ_y | **1.21·τ_y** | +| centre `\|σ_xy\|` peak | 2.15·τ_y | 29.10·τ_y | 4.92·τ_y | 2.47·τ_y | +| global max `\|σ\|_II` | 1.05 | 17.82 | 0.41 | 1.32 | +| global max `\|u_y\|` | **0.032** | 18.49 | 0.070 | 0.681 | +| SNES iters mean / max | 1.8 / 4 | 8.1 / 22 | 1.0 / 1 | 1.0 / 1 | +| wall / step | 1.7 s | 5.6 s | 4.1 s | 1.9 s | + +(τ_y=0.15 sanity check: split + cap gives σ_∥ = 1.03·τ_y, |u_y| = 0.012, SNES 1 iter mean — Phase B regime preserved.) + +**What works**: σ_∥ enforcement to within 21% of τ_y (vs BDF's 4%); no global runaway; physically correct fault-mechanics structure (PyVista plots `output/exp_integrator_phase_d_pyvista_split_*.png` show strain rate localised on fault, σ saturated at yield surface, bipolar u_y indicating along-fault slip). 1-iter Newton (linear in parallel branch) makes per-step cost competitive with BDF. + +**Open**: `|u_y|` is 16-21× BDF-1's. The yield surface is correctly enforced; the difference is in how much slip accumulates per yield cycle. Mechanism (lesson #9 below): BDF's E_eff = ε̇ + σ*/(2μΔt) has built-in elastic damping that absorbs boundary motion into elastic accumulation rather than slip. ETD's E_eff with α_∥ → 0 at yield wipes elastic memory each step; even with the soft cap, the flux structure keeps slip accumulating at near-boundary rate. Not a yield-criterion failure — both integrators sit on the yield surface — but a difference in how the constitutive law is integrated through the yielded regime. + +### Phase E — Hybrid BDF/ETD with spatial fault weight — DONE (2026-04-29) + +User-suggested structural insight: in the TI fault model the user already supplies the fault geometry through `yield_stress(x)`; we know a priori where yielding *can* happen. 
So let each integrator handle its sweet spot: + +- Inside the fault zone (where `τ_y(x)` is reachable): **BDF-1** — its `σ*/(2μΔt)` magnification provides the elastic damping that the cyclic-yield regime needs. +- Outside the fault (where `τ_y(x) → τ_y_bulk` ≫ A_∞ and yielding is structurally unreachable): **ETD-2** — strictly more accurate VE; its lack of plastic damping doesn't matter because plasticity isn't activated. + +**Math**: `σ(x) = w(x)·σ_BDF + (1-w(x))·σ_ETD` with `w(x) = (1/τ_y(x) - 1/τ_y_bulk) / (1/τ_y_fault - 1/τ_y_bulk) ∈ [0, 1]`. + +**Implementation** (in `TransverseIsotropicVEPFlowModel`): +- New `integrator='hybrid'` option; constructor takes `fault_weight` (sympy expression). +- `_eta_for_tensor(integrator_mode, apply_yield)` extracts (η_0, η_1_eff) per integrator/yield combination. +- `_assemble_c_tensor(η_0, η_1_eff)` builds the rank-4 tensor from given values. +- `_build_c_tensor` for `'hybrid'` builds both `_c_bdf` (yield-clipped) and `_c_etd` (raw). +- `_e_eff_for(integrator_mode)` returns the right E_eff form. +- `stress()` for `'hybrid'` blends `w·(C_BDF:E_eff_BDF) + (1-w)·(C_ETD:E_eff_ETD)`. +- Both BDF and ETD coefficients update each step. Single shared psi_star + forcing_star. + +**Killer-test outcome** (θ=+15°, τ_y=0.05, RES=32, 1.5T): + +| metric | BDF-1 | ETD lumped | split + cap | **hybrid** | +| --- | --- | --- | --- | --- | +| centre `\|σ_∥\|` peak | **1.04·τ_y** | 2.06·τ_y | 1.21·τ_y | 1.12·τ_y | +| centre `\|σ_xy\|` peak | 2.15·τ_y | 29·τ_y | 2.47·τ_y | 2.35·τ_y | +| global max `\|σ\|_II` | 1.05 | 17.82 | 1.32 | **0.95** | +| global max `\|u_y\|` | **0.032** | 18.49 | 0.681 | 0.109 | +| SNES iters mean / max | 1.8 / 4 | 8.1 / 22 | 1.0 / 1 | 2.1 / 4 | +| wall / step | 1.7 s | 5.6 s | 1.9 s | 2.3 s | + +(τ_y=0.15 sanity: σ_∥=1.05·τ_y, |u_y|=0.014, SNES 1.5 mean — matches BDF.) 
+ +**What works**: σ_∥ peak 1.12·τ_y (closest to BDF's 1.04 of any ETD variant), |σ|_II peak 0.95 (actually slightly tighter than BDF's 1.05 at this snapshot), Newton iterates normally (2.1 vs split's degenerate 1.0). PyVista field plots show physically clean structure: u_y range ±0.017 at chosen step (no boundary overshoot), strain-rate localised on the fault band, no fault-tip stress concentrations. + +**Why we still don't ship it**: the trajectory plot reveals `|u_y|` ramps monotonically from ~1e-5 to 0.109 over 1.5 periods — slow accumulation, not bounded oscillation like BDF-1 (which oscillates around 0.01-0.03 returning to baseline between yield events). At any single snapshot the field looks BDF-class; over cycles, drift accumulates. + +**Likely cause**: shared σ* history. Both BDF and ETD branches read from the same `psi_star`, but `psi_star` is updated to the *blended* σ each step. Inside the fault, the BDF branch's σ* is "previous step's blended σ" — not "previous step's BDF-pure σ". Bulk's ETD-stored history leaks into the fault's BDF computation, slowly amplifying fault slip over cycles. Fixing this would need two independent history fields with parallel updates — a significant refactor. + +**Decision**: Phase E as committed is the cleanest hybrid we tried, but doesn't deliver BDF-class temporal behaviour and fundamentally can't without the independent-history rework. Keep on branch as documented investigation; not advertised in user-facing API. + +### Phase F — Generic `TimeIntegrator` refactor (deferred — only if needed) + +If we end up with five-plus integrator methods on the DDt class and want to add another (e.g., Crank-Nicolson or higher-order ETD), refactor to separate `HistoryStorage` from a `TimeIntegrator` strategy object. Not needed for current scope. + +--- + +## Open architectural questions to resolve during Phase B + +1. 
**Lagged-τ vs SNES sub-iteration for VEP** + + For yield-active VEP, $\tau_{\text{eff}} = \eta_{\text{eff}}/\mu$ depends on σ (nonlinear). Two strategies: + - *Lagged-τ (Picard)*: Compute α, φ, A, B from previous step's η_eff. First-order in the nonlinear coupling, trivial to implement. **Phase B starts with this.** + - *Self-consistent τ via SNES*: Include τ in the iterate so the inner Newton converges τ↔σ together. More accurate but couples the time-integration to the SNES tolerance. Add only if lagged-τ shows insufficient accuracy. + +2. **Per-quad α, φ when τ is spatial** + + When η_eff is a spatial field (yield zone, weakness map), α = exp(-Δt/τ) becomes a spatial expression. Sympy handles `exp(spatial_expr)` symbolically, but JIT codegen has to evaluate `exp` per quadrature point per residual eval — potentially expensive. + + Mitigation: project (α, φ) onto a scalar mesh variable at the start of each step. They're constant within a step. The JIT then sees a scalar-field reference, not an `exp` to evaluate. ~one extra projection per step. + +3. **Forcing-history projection cost** + + ε̇* needs to be projected into `forcing_star[0]` after each solve. UW3's `SNES_MultiComponent_Projection` (committed in 2026-04 for VE-Stokes' tau projection, see `docs/developer/CHANGELOG.md`) makes this cheap and direct. Memory cost: one extra `SYM_TENSOR` MeshVariable per VE/VEP solver. + +4. **TI-VEP per-component decomposition** + + The TI rank-4 tensor has separate timescales: $\tau_0 = \eta_0/\mu$ for bulk, $\tau_{1,\text{eff}} = \eta_{1,\text{eff}}/\mu$ for fault-tangent. The clean approach is to construct the rank-4 stress tensor with separate $(\alpha_0, \varphi_0)$ for the isotropic part and $(\alpha_1, \varphi_1)$ for the director-aligned correction, matching how `_build_c_tensor` already does separate viscosities. **Validate at Phase B step 5; bug-fix in this session if needed.** + +5. 
**Asymmetric fine-Δt windows around BC flips** + + Phase B 1D evaluation showed that centred fine windows around BC discontinuities waste their pre-flip half (σ is near peak and barely changes) while the post-flip half does the real work. Production benchmarks should use asymmetric windows (small pre, larger post). Affects the `bench_*_vardt` schedule, not the integrator itself. + +--- + +## Validation gates (achieved — Phase B closed) + +| Test | Baseline | ETD-2 result | Status | +|---|---|---|---| +| `bench_ve_harmonic` | BDF-2 max\|err\| = 1.34e-3 | **3.14e-4** | ✅ **4.3× more accurate** than BDF-2 | +| `bench_ve_square` (const-Δt) | BDF-1 2.83e-2, BDF-2 8.07e-2 | 8.72e-2 | ✅ matches BDF-2 within 10% | +| `bench_vep_square` (Min) | peak\|σ\| = 0.5000 | peak\|σ\| = 0.4899, **0/160 violations** | ✅ saturated under τ_y | +| **`bench_ti_vep_harmonic` order=2 (killer test)** | **BDF-2: 10⁵-10⁹ blow-up on every yield-active combo** | **6/6 PASS, σ ≲ 1.12·τ_y at fault centre** | ✅ **decision gate met** | +| 20 existing VE/VEP regression tests | pass | pass | ✅ no BDF regression | + +### Killer-test detail (`bench_ti_vep_harmonic`) + +Centre-probe metrics (apples-to-apples with BDF-1 production), ETD-2 vs BDF-1: + +| θ | τ_y | ETD-2 \|τ_resolved\| | BDF-1 \|τ_resolved\| | ETD-2 \|σ_xy\| | BDF-1 \|σ_xy\| | +|---|---|---|---|---|---| +| 0° | 0.15 | **1.103·τ_y** | 1.122·τ_y | 1.103·τ_y | 1.122·τ_y | +| +15°| 0.15 | **1.118·τ_y** | 1.143·τ_y | 1.410·τ_y | 1.447·τ_y | +| -15°| 0.15 | **1.120·τ_y** | 1.127·τ_y | 1.408·τ_y | 1.440·τ_y | +| 0° | 0.30 | **0.922·τ_y** | 1.150·τ_y | 0.922·τ_y | 1.150·τ_y | +| +15°| 0.30 | **0.804·τ_y** | 1.139·τ_y | 0.929·τ_y | 1.049·τ_y | +| -15°| 0.30 | **0.803·τ_y** | 1.138·τ_y | 0.929·τ_y | 1.047·τ_y | + +ETD-2 **is tighter than BDF-1 production on every probe**. BDF-2 (the higher-order method ETD-2 replaces) blows up to 10⁵-10⁹ on every yield-active combo (τ_y=0.15) — confirming the structural argument empirically. 
+ +Runner: `docs/developer/design/_exp_integrator_phase_b_killer.py`. + +--- + +## Future work (out of scope for Phase B but relevant) + +- **Backtracking timestepping**: when a step contains an event that the integrator can't capture in one piece (e.g., a steep change in γ̇ or yield-onset), back up and retry with smaller Δt. Logically separate from the integrator choice; both BDF and exp would benefit. Useful for adaptive timestep strategies that don't know flip times a priori. + +- **Higher-order ETDs**: ETD-3, ETD-4 would store 2 or 3 forcing-history slots and use cubic/quartic interpolation in the integral. Not needed unless second-order forcing accuracy proves insufficient (unlikely for typical mantle/lithosphere problems). + +- **Higher-order yield treatment**: the lagged-τ approach is first-order in the nonlinear coupling. For sharp yield onset under variable Δt, a self-consistent τ via SNES sub-iteration may be needed. Bridge from Phase B if observed. + +- **Symbolic τ_eff in non-Maxwell rheologies** (Burgers, Maxwell-Voigt, etc.): the exponential framework generalises to any linear relaxation operator. Each relaxation timescale gets its own (α, φ); the rank-4 contraction picks them up via a matrix exponential of the relaxation tensor. Out of scope, but the architecture leaves the door open. + +--- + +## What we learned — deviations from the original plan + +These notes capture decisions taken during Phase B that diverge from or refine the pre-Phase-B plan above. They should inform Phase C and Phase D scope. + +### 1. The "JIT propagation" task was a red herring + +The plan's Task 1 anticipated a UWexpression-to-JIT propagation issue based on the jury-rig's failure. The actual cause was simpler: the jury-rig subclassed `ViscousFlowModel` which has `requires_stress_history = False`, so the Stokes solver took the viscous branch where `cm.flux` is **never compiled** — it builds the flux from `cm.viscosity` instead. 
The custom flux containing `_exp_alpha`, `_exp_phi`, etc. was effectively dead code. Once the model declares `requires_stress_history = True` (as the new sibling did), the existing constants-manifest infrastructure handles α, φ propagation correctly with no new plumbing. + +### 2. Predictor-corrector return mapping (the 1D Phase B's yield approach) is wrong for 2D Stokes + +The 1D Phase B evaluator used predictor-corrector return mapping: solve pure VE, then clip σ to satisfy yield. In 2D Stokes that breaks momentum balance — the SNES finds u that satisfies `∇·σ_VE = body force` (no yield), then we clip σ but leave u unchanged, so the velocity field corresponds to the unclipped stress. **In 2D Stokes-VEP, yield must live inside the SNES residual** via the standard viscosity-wrapping pattern (`viscosity = softmin(η, η_pl)`), the same as the production BDF VEP path. Refactored mid-Phase-B (commit `aba93c2`). + +### 3. Lagged-τ aggregation experiments did not tighten yield-surface saturation + +Multiple lagged-τ approaches were tried (scalar `min η_eff` over yield-active nodes; scalar `median η_eff`; per-node spatial α via projected scalar mesh variables) — all gave **worse** σ overshoot than the raw τ_VE baseline. Analysis showed the ETD-2 history term `2η_raw·(φ-α)·ε̇*` uses raw η (not yield-clipped) — a Picard-style approximation — and produces a non-zero floor on σ under harmonic forcing that is insensitive to τ_eff except via the α·σ* scaling. The effect is geometric, not a τ-choice issue. Reverted to raw τ_VE = η/μ (commit `584dea8`). + +The ETD-2 result at parity-with-BDF-1-production — 1.10-1.14·τ_y at the fault centre — reflects the same kind of overshoot BDF-1 itself shows. Tightening past that is a Phase D concern requiring per-component (α₀, φ₀) for the rank-4 TI tensor, **not** a fix on the lagged-τ aggregation. + +### 4. 
Probe-metric mismatch caused a false alarm + +`max σ_II/τ_y_local` over a fault-zone mask reads larger than `σ_xy at fault centre / τ_y_at_fault` because the Gaussian-weakened τ_y(x) varies sharply across the mask: shoulder nodes have τ_y_local much larger than the centerline value, and σ_II saturates accordingly at those local τ_y, which inflates the ratio when the centerline τ_y_at_fault is used as the denominator. **Use the per-node ratio `max σ_II(x)/τ_y(x)`**, or stick to the centre-probe metric (the one BDF-1 production reports). The killer-test runner now reports both. + +### 5. Architectural collapse landed at the parameter level + +The plan envisaged Phase B with sibling classes (`MaxwellExponentialFlowModel`, TI variant) and Phase D moving integrator state onto the DDt with a strategy parameter. The collapse landed at the **constructor parameter** level instead: `ViscoElasticPlasticFlowModel(unknowns, integrator='etd')` (and the same on TI-VEP). Coefficients still live where the existing infrastructure naturally wants them (`_bdf_c0..c3` on the model, `_exp_coeffs` on the DDt) — the dispatch in `E_eff`, `viscosity`, `_build_c_tensor`, and the uniform `_update_history_*` hooks all branch on `self._integrator`. Sibling classes survive as ~10-line aliases for backwards compatibility (commit `ae79664`). + +### 6. Unit-handling in any new array touchpoints needs explicit care + +A predictor-corrector clip of `psi_star[0].array` via raw numpy initially looked correct in the non-units case (production benches don't use units) but stripped UnitAwareArray wrappers silently. The user flagged this as accumulating tech debt. The audit fix landed in commit `aba93c2`: `forcing_star` allocated `units=None` (ε̇ has different physical dimensions from σ), `update_forcing_history` non-dimensionalises eval results before storing. Future Phase D work touching `.array` should follow the same pattern (see `update_pre_solve` for the canonical example). + +### 7. 
Empirical range of validity: τ_y / A_∞ ≥ ~0.5; below that, Phase B ETD-2 is strictly worse than BDF-1 production + +A direct test by tightening τ_y from 0.15 to **0.05** on `bench_ti_vep_harmonic` (so τ_y / A_∞ = 0.05/0.27 ≈ 0.19) at RES=32 over 1.5 periods produced a **catastrophic step-by-step runaway** for Phase B ETD-2 even though Newton converged on every step. Apples-to-apples comparison with BDF-1 production on the *same* setup (saved at `output/phase_b_th{0,15}_ty0p05.*` and `output/phase_b_bdf_th+15_ty0p05.npz`): + +| metric (θ=+15°, τ_y=0.05) | **ETD-2** (Phase B) | **BDF-1** (production) | +|---|---|---| +| max σ_II in domain | **17.8** | **1.05** | +| u_y range | ±18 | ±0.032 | +| SNES iter mean / max | 8 / 22 | 1.8 / 4 | +| Wall / step | 5.9 s | 1.7 s | +| Centre \|σ_xy\| peak | 17.8 (356·τ_y) | 0.108 (2.15·τ_y) | +| Diverged SNES steps | 0/120 | 0/120 | + +The catastrophe **is specific to the ETD-2 implementation**, not a problem-class issue: BDF-1 production handles τ_y=0.05 cleanly with bounded σ, faster Newton (mean 1.8 iters), and 3.5× faster wall time per step. + +Time-series comparison, both integrators run on the same RES=32 mesh with matching driver and step size: ``output/exp_integrator_phase_b_bdf_vs_etd.png`` (generated by `_plot_phase_b_bdf_vs_etd.py` from `output/phase_b_{bdf,etd}_th+15_ty0p05.npz`). The ETD-2 trace tracks BDF-1 inside the ±τ_y band for the first half-cycle, then breaks loose at the second yield event and runs away through the second period — peak centre |σ_xy| reaches 1.46 (29·τ_y) and global max |u_y| reaches ~18, while BDF-1 stays at 0.11 and 0.03 respectively. The divergence point is the first deep yield, not a steady accumulation. + +The mechanism is the one items 3 and 5 in this list already identified: the ETD-2 history term ``α·σ* + 2η·(φ-α)·ε̇*`` uses raw η (Picard approximation when yield is active). The analytical-floor σ-magnitude under harmonic forcing is ~A_∞, independent of τ_y. 
When A_∞ > τ_y, σ* feeds back through α·σ* on each step and grows without bound; the leading viscous term is yield-clipped but has small (1-φ) coefficient at typical Δt, so it can't dominate the runaway history. + +**Newton's "convergence" reports in this regime are physically meaningless** — Newton finds the residual minimum each step, but the time-integration loop diverges. SNES iteration counts are *not* an early-warning signal (they actually *drop* from typical levels because the residual structure becomes degenerate); the warning is in σ_II / u_y magnitudes themselves. + +Practical implications for Phase B as committed: + +* ``integrator='etd'`` with the raw τ_VE = η/μ in `α, φ` works for **τ_y / A_∞ ≥ ~0.5** — at parity with BDF-1 production for accuracy at ratio 0.55, beats BDF-2 by 4.3× at no-yield ``bench_ve_harmonic``. +* Below that ratio (the **typical fault-mechanics regime**): solution diverges silently (no SNES error). Phase B ETD-2 is **strictly worse than BDF-1** — slower, less accurate, unstable. +* Phase B as currently committed should be treated as a structural-argument demo, not a drop-in replacement for the BDF integrators. Production users should keep ``integrator='bdf'`` (the default) until Phase D lands. +* **Phase D (per-component (α₀, φ₀)/(α₁, φ₁) for TI) is blocking, not "future work"**, for any production use of ETD-2 on tight-yield problems. + +The Phase B design-doc note that "lagged-τ doesn't help" applies in this regime too — the failure is structural to the Picard approximation, not a τ-choice issue. + +### 8. The diagnostic that mattered: |σ_∥| (resolved fault shear), not |σ_xy| + +Throughout Phase B and into the early Phase D iterations, the killer-test trajectories used `|σ_xy|` at fault centre as the yield-surface diagnostic. That was wrong. The yield criterion `|σ_∥| ≤ τ_y` lives in the fault frame; `|σ_xy|` is global-frame and includes contributions that the limiter doesn't constrain (off-fault stress, geometric tilts). 
+ +Adding the resolved fault-plane shear `|σ_∥| = √(|σ·n|² - (n·σ·n)²)` as a per-step probe (commits 59ab769 onwards) revealed that: + +* BDF-1 sits **right on** the yield surface (peak `|σ_∥|` = 1.04·τ_y, essentially exact) despite `|σ_xy|` peaking at 2.15·τ_y. +* Lumped Phase B ETD-2 stays at 2.06·τ_y in `|σ_∥|` even though `|σ_xy|` runs away to 29·τ_y — the catastrophe is off-fault, the *fault* is doing fine. +* Phase D's first split implementations (per-quad and Newton-implicit lag) overshot to 4·τ_y *on the fault plane*; this needed fixing. + +Without `|σ_∥|`, Phase D would have been judged on `|σ_xy|` alone — and the cure (explicit-parallel + cap) would have looked like it just lowered the global-frame number without engaging with the actual yield-criterion physics. + +### 9. The structural BDF-vs-ETD slip-rate difference — physics, not numerics + +After Phase D's σ_∥ enforcement reached BDF parity (1.21 vs 1.04·τ_y), `|u_y|` remained 16-21× BDF-1's at τ_y=0.05. The mechanism is a structural difference in how each integrator handles the yielded regime, not a numerical defect: + +* **BDF**: E_eff = ε̇ + σ*/(2μΔt). At Δt=0.05, μ=1, the σ-history prefactor is **10**. When σ_∥ saturates near τ_y, this term *dominates* E_eff_∥ — boundary motion is preferentially absorbed into elastic accumulation rather than slip. The integrator has built-in elastic damping during yield. +* **ETD (Phase D)**: E_eff_∥ = (1-φ_∥)·ε̇ + α_∥/(2η_∥)·σ* + (φ_∥-α_∥)·ε̇*. At yield with α_∥, φ_∥ → 0 (or even with the soft cap clamping them at 0.37, 0.63), the σ-history coefficient is at most O(1). Boundary motion goes into γ̇_∥ at the imposed BC rate — the fault slips freely. + +Both integrators correctly enforce `|σ_∥| ≤ τ_y` (the limiter works). They just distribute the boundary motion differently between elastic and plastic strain. 
BDF's behaviour is closer to a typical seismic-cycle picture (elastic energy stores and releases episodically); ETD's is closer to steady-flow plasticity (boundary motion drives free slip at yield). Neither is "wrong"; they're modelling different limits of the same constitutive law. + +Implication: when comparing integrators on a tight-yield problem, σ-amplitude is a poor metric (both at τ_y); the meaningful difference is in time-integrated slip per cycle, which depends on the elastic-damping strength and is integrator-specific. + +### 10. Phase D recommendations checklist — what worked, what didn't + +The chatGPT advisor's stabilisation strategy was on the money for the issues we hit: + +| Recommendation | Phase D status | +| --- | --- | +| 1. Lag τ in the exponential — use τⁿ, never τⁿ⁺¹ | **Implemented.** `_eta_par_eff_lagged()` reads forcing_star (previous-step ε̇). Cured the per-quad split's 1-iter trivial-Newton failure mode. | +| 2. Plastic correction *after* VE update (predictor-corrector) | **Rejected.** Tried earlier in Phase B; broke 2D Stokes momentum balance. Yield-in-residual via softmin is the working pattern. | +| 3. Under-relax stress update (ω ~ 0.5) | **Not implemented.** Open follow-up. Would smooth Newton's hop and might tame the slip ratchet (lesson #9) without affecting the yield surface. | +| 4. Cap τ_eff ≥ c·Δt to avoid α_∥ → 0 | **Implemented** as a soft x_par cap `(1-exp(-c·x))/c`. Tunable via `cm.tau_par_cap_factor` (default 1.0). Modestly improves σ_∥ enforcement (1.31 → 1.21·τ_y at τ_y=0.05) but slightly worsens the slip ratchet (0.525 → 0.681) — the inconsistent capping (η_C natural, η_α capped) shrinks the (1-φ_∥)·E term in proportion to the σ*-contribution, so σ_∥ stays controlled but flux balance is more sensitive between yield events. | +| 5. Consistent viscosity in Stokes + constitutive | **Implemented.** Both C_∥ and (α_∥, φ_∥) use the lagged forcing_star-based η. 
Earlier inconsistency (C_∥ on current η, α_∥ on lagged η) had Newton converge in 1 iter and σ_∥ drift to 4·τ_y. | + +Also-tried, rejected: +* **Raw E (current strain rate) as yield-criterion rate input**: adds explicit u-dependence into η_∥_eff, which propagates into C_∥ and produces a singular GAMG operator at u=0 (start-up zero state). Smooth-floor regularisation didn't fix the SNES 0-iter divergence. Reverted — the parent's E_eff-based criterion is the right shape, just needs the right rate input (forcing_star, lesson above). +* **`σ*/(2ε̇*)` back-derivation for lagged η**: appears intuitive (it's the *effective* viscosity from histories alone) but breaks elastic regime (where σ ≈ μ·γ·dt, not η·ε̇). Produced startup spikes. Replaced by the parent-softmin-on-forcing_star pattern. +* **`sympy.Min` cap on η**: catastrophic 29/120 SNES diverged; non-smooth derivative breaks Newton. Replaced by smooth `(1-exp(-c·x))/c`. + +### 11. Phase E (hybrid) drifts because of shared σ* history — and that's structural + +The Phase E hybrid (`σ = w·σ_BDF + (1-w)·σ_ETD`) was conceptually the cleanest fix for the BDF-vs-ETD mismatch — let each integrator handle its sweet spot. Snapshot-by-snapshot the field structure is BDF-class (no boundary overshoot, fault-band strain-rate localisation, σ_∥ within 8% of τ_y). But the time-trajectory shows `|u_y|` ramping monotonically over cycles, ending at ~3× BDF. + +Mechanism: both branches share a single ``psi_star`` history slot, which is updated to the *blended* σ each step. So inside the fault, the BDF branch's σ* is "previous step's blended σ" — contaminated with ETD's looser-history contribution from the bulk. Over many cycles, that contamination amplifies fault slip slightly each pass. + +The fix would require two independent history fields with parallel updates (BDF history fed only by BDF flux, ETD history fed only by ETD flux, plus the spatial blend at flux time). 
That's a real DDt refactor, not a one-line fix, and even then there's no guarantee the slow drift fully closes — the underlying physics-mismatch (lesson #9) lives in how each integrator handles the yielded regime, not just in the history bookkeeping. + +The investigation-level lesson: **patches that share history between BDF and ETD branches will leak the missing damping into temporal drift.** Whether it's a per-quad split (Phase D, Newton-implicit), explicit-parallel split (Phase D with cap), or spatial blend (Phase E), the slow drift keeps reappearing in different magnitudes. ETD-as-designed is a beautiful integrator for VE; trying to retrofit it onto deep-yield VEP without rebuilding from the ground up consistently leaves residual non-physical behaviour. + +### 12. (superseded by #13) + +The conclusion in earlier drafts of this section ("for deep-yield fault mechanics BDF-1 is the right integrator; don't retrofit ETD") was correct *for higher-order ETD*. Lesson #13 below shows it's wrong for ETD generally — first-order ETD works fine. + +### 13. The drift was order-driven, not algorithm-driven — ETD-1 ships + +User's structural insight: "all the integrators have this growing instability except the first order one." ETD-1 (first-order ETD with `φ = α`) confirms it empirically — it reproduces BDF-1 essentially exactly on the killer test: + +| metric (θ=+15°, τ_y=0.05) | BDF-1 | ETD-1 | +| --- | --- | --- | +| centre `\|σ_∥\|` peak | 1.04·τ_y | 1.04·τ_y | +| global max `\|u_y\|` | 0.0320 | 0.0320 | +| SNES iters mean | 1.8 | 1.8 | +| diverged | 0/120 | 0/120 | + +Mechanism: BDF-1 and ETD-1 are both **L-stable** (`|R(z)| ≤ 1` on the entire negative-real-part half-plane → every mode is damped). BDF-2 is only A-stable; ETD-2 is exact for the linear ODE so has *zero* numerical dissipation. 
The plastic yield transitions create effective high-frequency modes (residual structure flips discontinuously when σ crosses τ_y); first-order methods damp them with the same numerical viscosity they apply to everything else, while higher-order methods preserve them and let them grow. + +Same general principle as Crank-Nicolson failing on stiff problems while implicit Euler doesn't. + +This collapses the "ETD doesn't work for fault mechanics" narrative the earlier lessons #7, #9, #11, #12 were converging toward. The actual statement is "*higher-order* ETD doesn't work for fault mechanics," same as higher-order BDF. ETD-1 (single-step, no forcing-history slot, analytical exp factor) is the right shape: BDF-1 stability + ETD's exact treatment of the linear-relaxation part. + +**Production recommendation**: `integrator='etd', order=1` as the default for VEP and TI-VEP. Wall-clock cost ~5% over BDF-1 (one extra `exp` per coefficient update); accuracy is per-iteration the same as BDF-1 (both first-order) but the analytical factor handles the linear-relaxation limit cleanly without the rational-approximation error at large `Δt/τ`. Phase B's ETD-2 (`integrator='etd', order=2`) remains available for users with smooth VE problems who can certify yield is never active — it beats BDF-2 by 4× there. + +The Phase D and Phase E artefacts stay on the branch as instructive failures of the higher-order-ETD idea — useful documentation of what doesn't work and why, but not part of the production API. + +--- + +## Appendix A — Numerical evidence + +### Phase A (1D linear Maxwell, sinusoidal forcing) — DONE + +`_exp_integrator_phase_a.py` solves $\dot\sigma + \sigma/\tau = \mu\dot\gamma$ with $\dot\gamma = \dot\gamma_0 \cos(\omega t)$, $\omega = \pi/2$, $\eta = \mu = \tau = 1$. 
+
+| Δt/τ | Exp max\|err\| | BDF-1 | BDF-2 |
+|---|---|---|---|
+| 0.01 | 1.1e-5 | 4.5e-3 | 7.4e-5 |
+| 0.05 | 2.9e-4 | 2.2e-2 | 1.8e-3 |
+| 0.10 | 1.1e-3 | 4.4e-2 | 6.8e-3 |
+| 0.50 | 2.8e-2 | 1.9e-1 | 1.1e-1 |
+| **1.00** | **1.0e-1** | 3.5e-1 | 3.5e-1 |
+| **2.00** | **5.7e-2** | 3.4e-1 | 3.4e-1 |
+
+Exp shows clean second-order slope at small Δt and stays accurate at Δt ≥ τ where both BDFs collapse to near-zero output. Figure: `exp_integrator_phase_a.png`.
+
+### Phase B (VEP, large Δt, square wave, variable-Δt) — DONE
+
+`_exp_integrator_phase_b_eval.py` extends to:
+
+- **VEP harmonic** ($\omega = \pi/4$, return-mapping yield): both Exp and BDF-1 clip correctly at τ_y; agreement to ~1% at small Δt because yield mechanism dominates over time integrator.
+
+- **Pure VE at large Δt/τ**: at Δt/τ ≤ 1, Exp 5–12× more accurate; at Δt/τ ≥ 2, both struggle but Exp degrades more gracefully (gives bounded under-shoot vs BDF's wrong-shape output).
+
+- **Square wave VE/VEP**: exp consistently ~2× more accurate than BDF-1 for VE; the yield+BC-discontinuity error dominates both for VEP at small Δt.
+
+- **Variable-Δt around BC flips** (correctly scheduled, with fine-zone clamp): improvement of 11–19% in max error for both VE and VEP, both Exp and BDF-1. The exp's plateau-period exactness shows clearly as the per-step error drops to near machine precision once the BC discontinuity is well-resolved.
+
+Figures: `exp_integrator_phase_b_yield.png`, `exp_integrator_phase_b_largedt.png`, `exp_integrator_phase_b_square.png`, `exp_integrator_phase_b_vardt.png`.
+
+### Phase B UW3 jury-rig — partial (propagation snag identified)
+
+`_exp_integrator_uw3_jury_rig.py` attempted to wire ETD-2 into UW3 via a custom `MaxwellExpFlowModel(ViscousFlowModel)` subclass. Hit a JIT propagation issue: `cm._exp_alpha.sym = X` per-step updates don't reach the JIT-compiled flux. 
Minimal incremental test (`_exp_jury_rig_minimal.py`) confirmed the constitutive-model class plumbing works in isolation; the issue is specific to per-step updates of UWexpression coefficients. **First task of Phase B is resolving this**, by replicating the BDF coefficient propagation pattern. + +--- + +## Appendix B — Architecture details + +### What the exponential integrator stores + +| Integrator | psi_star slots | forcing_star slots | Coefficients | +|---|---|---|---| +| BDF-1 | 1 | 0 | c_0, c_1 | +| BDF-2 | 2 | 0 | c_0, c_1, c_2 | +| AM-2 | 1 | 0 | a_0, a_1, a_2 | +| ETD-1 (Lawson) | 1 | 0 | α | +| **ETD-2 (this proposal)** | **1** | **1** | **α, A, B** | +| ETD-3 | 1 | 2 | α, A, B, C | + +The `SemiLagrangian` already maintains parallel `_bdf_coeffs` and `_am_coeffs`. Adding `_exp_coeffs` is the same kind of peer extension. + +### What stays the same vs BDF in the constitutive model + +The factorisation σ = η_eff·γ̇ + (history) is preserved. Yield-mode logic (softmin/min/harmonic) wraps η_eff identically to today. The Stokes weak-form structure is unchanged. What changes: +- Different formula for η_eff_VE: η(1-φ) replaces η Δt/(τ+Δt) +- Different history term: α·σⁿ + 2η(φ-α)·ε̇ⁿ replaces the BDF Σ c_i ψ*_i sum +- New ε̇* storage slot + +### Why this avoids the BDF-2 instability (TI-VEP + spatial yield) + +The instability we documented arises from the c_2·ψ*_{n-1} term in BDF-2's history sum getting autodiff'd into the Jacobian, where it picks up the spatial gradient of η_1_eff (via $\partial\eta_{1,\text{eff}}/\partial\nabla u$), and then gets *directionally amplified* by the rank-4 tensor's $\hat n\otimes\hat n\otimes\hat n\otimes\hat n$ coupling. The amplification compounds across history, exploding |σ| over ~10 t_r. + +Exponential has **no second history term**. There's no c_2·ψ*_1 to amplify. The α·σⁿ contribution is autodiff-trivial (σⁿ is a known mesh variable, treated as constant w.r.t. ∇u). The ε̇ⁿ contribution likewise. 
The Jacobian's only ∇u-dependent term is the leading 2η_eff(1-φ)·ε̇, which is well-behaved. + +This is why the structural argument carries to TI-VEP via the per-component decomposition (Phase B step 5): each component of the rank-4 tensor gets its own (α, φ, A, B), each with single-history-slot relaxation, no cross-component amplification. diff --git a/docs/developer/design/SOLVER_UNIFICATION_DESIGN.md b/docs/developer/design/SOLVER_UNIFICATION_DESIGN.md index 0d1f6875..2d6f65dc 100644 --- a/docs/developer/design/SOLVER_UNIFICATION_DESIGN.md +++ b/docs/developer/design/SOLVER_UNIFICATION_DESIGN.md @@ -1,6 +1,15 @@ # Solver Unification Design -> Status: **Proposed** — for implementation after VEP validation is complete +> Status: **Implemented** (2026-04-28, branch `feature/exp-integrator-investigation`). +> The unification landed alongside Phase B of the exponential integrator — +> see `EXPONENTIAL_VE_INTEGRATOR.md`. ``VE_Stokes`` now emits a runtime +> ``DeprecationWarning`` on construction with a migration template; production +> callsites have been swept; the constitutive model declares its DDt needs via +> ``requires_stress_history`` (existing) plus ``stress_history_ddt_kwargs`` +> (new in Phase B — used by ``integrator='etd'`` to request +> ``with_forcing_history=True`` on the auto-DDt). The lazy-creation path in +> ``Stokes.constitutive_model.setter`` reads both. The text below is preserved +> as the design rationale for future readers. ## Goal diff --git a/docs/developer/design/VEP_TWO_STOKES_OPERATOR_SPLIT.md b/docs/developer/design/VEP_TWO_STOKES_OPERATOR_SPLIT.md new file mode 100644 index 00000000..7b49ccf9 --- /dev/null +++ b/docs/developer/design/VEP_TWO_STOKES_OPERATOR_SPLIT.md @@ -0,0 +1,127 @@ +# VEP Two-Stokes Operator Split — Investigation Plan + +> **Status**: planned, not implemented (2026-04-29). New investigation branching off the ETD integrator work (PR #161). 
Captures architectural context while it's fresh; first session of implementation will likely build the second-stage solver and run a comparison vs ETD-1 + in-residual yield (the production path that ships in PR #161). + +## Motivation + +The exponential-integrator investigation (Phase A–F, see `EXPONENTIAL_VE_INTEGRATOR.md`) eliminated several VE+plasticity strategies and converged on **ETD-1 + yield-in-residual softmin** as the production answer. That works because: + +1. ETD-1 is L-stable (advice §13 in this doc → `EXPONENTIAL_VE_INTEGRATOR.md` lesson #13). +2. The in-residual softmin yield is a coupled Newton solve where σ and u find each other through the residual. + +But the in-residual softmin has known imperfections: +* **Yield surface saturation** is approximate (softmin with finite δ allows σ to drift above τ_y, especially under variable Δt — see project memory `project_vep_variable_dt_yield_violation.md`). +* **Higher-order ETD-2** beats BDF-2 by 4× on smooth VE but blows up under in-residual yield in tight-yield regimes (lesson #11 — fundamental, not patchable). For *fully-VE problems* ETD-2 is shippable; for *VEP*, only ETD-1 is. +* **No clean predictor-corrector path**: the radial-return-after-VE-predictor pattern (Phase F) only worked with ETD-1, and adding it to ETD-2 didn't rescue ETD-2 (lesson: σ-damping in outer Picard isn't sufficient — need to update η_eff between iters too). + +The web-advice doc (`/Users/lmoresi/Downloads/vep_stress_update_full_latex.md`) prescribes the architecture that *is* the standard robust pattern in production geodynamics codes: + +``` +Stokes solve with lagged viscosity → VE exponential predictor → plastic return mapping → damped outer Picard iteration +``` + +What we built in Phase F was the **single-Stokes** version of this: one Stokes solve per Picard iter, σ damped between iters, η_eff fixed at η_VE. That's not the full architecture. 
The advice (and the user's framing in the conversation) is a **two-Stokes** operator split: + +> **Stage 1**: VE Stokes solve — find v, p, σ_VE assuming pure viscoelastic. +> **Stage 2**: plasticity Stokes solve where σ_VE is *fully explicit* (a known stress source) and viscosity plays the role of the plastic multiplier. + +Adding the second momentum-balanced solve is what's missing. With it: +* **ETD-2 might become viable for VEP+yield** because the second Stokes equilibrates the velocity field with the corrected stress field via a proper momentum solve, not just σ damping. +* **Pointwise plastic correction is exact** for J2 (closed-form radial return). The second Stokes restores momentum balance globally given the corrected stress field. +* **Anisotropic case (TI-VEP)** becomes a clean extension: stage 1 uses TI VE; stage 2 uses J2 (or Drucker–Prager) on the resolved fault-shear with the same momentum-balanced equilibration. + +## Architecture + +### Stage 1 (VE predictor) + +Standard Stokes with the existing `ViscoElasticPlasticFlowModel`: +* `integrator='etd', order=2` (or `order=1`) +* `yield_stress = ∞` (no in-residual yield) +* solves `∇·σ_VE - body_force - ∇p_VE = 0` + +After solve: `psi_star.array` holds σ_VE (the VE trial stress, the predictor's output). + +### Stage 2 (plasticity corrector — the new piece) + +A separate Stokes-like solver with: + +* **Constitutive**: pure viscous, `σ_pl = 2η_pl(x)·ε̇(v_pl)`. The `ViscousFlowModel` works for this; `Parameters.shear_viscosity_0` is set to a meshvar field. +* **Body force**: `−∇·σ_VE` so the total stress satisfies momentum: `∇·(σ_VE + 2η_pl·ε̇(v_pl)) = body_force_external`. + * Computed symbolically from `psi_star.sym` — UW3 supports `mesh.vector.divergence(rank-2 sym)` or equivalent. + * The bodyforce expression goes into `stokes_pl.bodyforce`. 
+* **Effective plastic viscosity** `η_pl(x)`: + * **Interpretation (a) — coupled solve**: `η_pl` is determined implicitly so that `|σ_total|_eq ≤ σ_y` everywhere with equality where yielded. Solver iterates Newton on this; `η_pl` is a non-linear function of v_pl. This is the rigorous form, equivalent to a yield-aware viscosity in stage 2. + * **Interpretation (b) — explicit**: compute `η_pl` from stage-1 data: `η_pl = σ_y/(2|γ̇_VE|)` where `|σ_VE|_eq > σ_y`, large value otherwise. Stage 2 is a *linear* viscous solve. Outer Picard iterates over (a) or (b) blend until consistent. + +Phase F tested neither (a) nor (b) properly — it iterated σ via single-Stokes Picard with `η_eff = η_VE` fixed, which doesn't add momentum-balanced equilibration. The first session of this investigation should test (b) as the simpler scaffold; (a) requires either a yield-aware constitutive law for `ViscousFlowModel` or Newton-iterating the linear-viscous-with-spatial-η problem. + +After Stage 2: corrected velocity `v` and stress `σ_total = σ_VE + 2η_pl·ε̇(v_pl)` (or `v_total = v_VE + v_pl` depending on framing). + +### Outer iteration + +Wrap the two stages in a Picard loop with η damping (advice §9, ω_η ≈ 0.3) and σ damping (§10, ω_τ ≈ 0.5): + +``` +for k = 1, 2, ...: + Stage 1 with η_eff_k from previous iter → σ_VE^k + Compute η_pl^k from σ_VE^k (interpretation b) or + Stage 2 (linear or nonlinear) with η_pl^k → v_pl^k, p_pl^k + Damp: + σ ← (1-ω_τ)·σ_old + ω_τ·σ_total^k + η ← (1-ω_η)·η_old + ω_η·η_pl^k + Convergence check: ||σ_k - σ_{k-1}|| / ||σ_k|| < tol +``` + +Within a single timestep. After convergence, `psi_star ← σ_total` (or σ_VE — design choice). + +## Implementation challenges in UW3 + +1. **Two Stokes objects on the same mesh.** Each wants its own velocity/pressure meshvar. Currently the codebase has one Stokes per setup; we'd build a separate `Stokes(mesh, velocityField=v_pl, pressureField=p_pl)` and configure its constitutive model independently. + +2. 
**Spatial η_pl as a meshvar in the constitutive law.** `ViscousFlowModel.Parameters.shear_viscosity_0 = eta_pl_field.sym` should work — model uses the symbolic expression. Need to verify the JIT path handles a meshvar reference correctly (it does for σ_y already). + +3. **Body force = −∇·σ_VE**. `psi_star` is a SYM_TENSOR meshvar. Its symbolic divergence is `sympy.diff(psi_star.sym[i, j], coord_j)` summed over j. The bodyforce expression assembles to a vector. Cost: per-quadrature-point evaluation of three `sympy.diff` expressions on a tensor field — should be cheap. + +4. **What stays in psi_star at end of step.** Three options: + * `σ_total = σ_VE + 2η_pl·ε̇(v_pl)` — full corrected stress. Best for next-step ETD history term `α·σⁿ`. + * `σ_VE` — the VE-only stress. Cleaner separation but loses plasticity history. + * Apply final return-mapping cleanup on `σ_VE + 2η_pl·ε̇(v_pl)` to enforce yield exactly. + +5. **Velocity composition.** Does `v_pl` *replace* v_VE or *add* to it? If stage 2 has body force `−∇·σ_VE`, the `v_pl` from stage 2 is the velocity that, combined with `σ_VE` as "baseline stress", balances momentum. So `v_pl` IS the corrected velocity field at end of step (not v_VE + v_pl). For Lagrangian advection, use `v_pl`. + +6. **Boundary conditions.** Stage 2 inherits the same BCs as stage 1 (kinematic boundary conditions on v_pl). The VE stress σ_VE on the boundary is consistent with v_VE; stage 2 finds v_pl satisfying BCs and the plastic-balanced momentum. + +## Validation plan + +Reuse the Phase F harness (isotropic VEP, localised weak zone, harmonic loading). Compare: + +* **BDF-1 yield-in-residual** — production reference (already validated). +* **ETD-1 + two-Stokes operator split (interpretation b)** — does it match BDF-1? +* **ETD-2 + two-Stokes operator split** — does the second momentum solve rescue ETD-2 from the drift seen in Phase F? 
*This is the headline test.* + +If the answer is yes for ETD-2: we have a robust path to the higher-accuracy integrator for VEP+yield, and the path to TI-VEP fault mechanics is open. + +If no: the residual drift mechanism is structural beyond two-Stokes equilibration, and ETD-1 + softmin yield-in-residual remains the right answer. + +## Code organisation + +Suggested new files (on a branch off `development` after PR #161 merges): + +``` +docs/developer/design/_phase_g_two_stokes.py # runner with stage 1 + stage 2 + outer Picard +docs/developer/design/_plot_phase_g.py # comparison plot, includes Phase F traces +docs/developer/design/_phase_g_*.trace.txt # per-step traces +docs/developer/design/VEP_TWO_STOKES_OPERATOR_SPLIT.md # this document +``` + +No production-API changes expected unless the architecture proves itself for VEP+yield — in which case the second-stage solver might land as a new helper or a method on `ViscoElasticPlasticFlowModel`. + +## Connecting back + +The user's framing closing the ETD investigation: *"the radial return, correctly computed as a sequence of solves ... offers the potential for a very robust VEP solver"*. That's exactly what this branch tests. Reference points: + +* `EXPONENTIAL_VE_INTEGRATOR.md` lesson #13 — first-order dissipation explanation +* Phase F results — what radial return alone (without two-Stokes) achieves and where it fails +* Web advice `/Users/lmoresi/Downloads/vep_stress_update_full_latex.md` §3, §11, §15 — the canonical predictor-corrector + outer Picard architecture + +The two-Stokes investigation is the bridge between the ETD work and a production-quality VEP+yield solver. diff --git a/docs/developer/design/_exp_integrator_phase_a.py b/docs/developer/design/_exp_integrator_phase_a.py new file mode 100644 index 00000000..9e8e4ade --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_a.py @@ -0,0 +1,216 @@ +"""Phase A — 1D linear Maxwell exponential integrator validator. 
"""Phase A — 1D linear Maxwell exponential integrator validator.

Engineering form throughout: σ̇ + σ/τ = μ γ̇ (γ̇ engineering shear rate).
Steady-state under constant γ̇ → σ = η γ̇. Compare:
  - Exponential integrator (proposed)
  - BDF-1
  - BDF-2 (constant Δt)
  - Analytical reference

Forcings:
  - sinusoidal: ε̇ = γ̇₀ cos(ωt) → analytical Maxwell phasor
  - square-wave: ε̇ = ±γ̇₀ → piecewise exponential

Sweep Δt/τ from 0.01 to 10. Output: max|err|, RMS, behaviour at large dt.
"""

import os

import numpy as np

# NOTE(review): matplotlib is imported lazily inside main() so the
# numerical helpers stay importable without a plotting stack.


# ── Parameters (η = μ = 1 for clarity) ─────────────────────────────
ETA = 1.0
MU = 1.0
TAU = ETA / MU              # Maxwell relaxation time = 1
GAMMA_DOT_0 = 1.0
T_END = 8 * TAU


# ── Analytical references ──────────────────────────────────────────

def maxwell_sin(t, omega, gamma_dot_0):
    """σ(t) for ε̇(t) = γ̇₀ cos(ωt) with σ(0) = 0.

    Steady state is A_∞ cos(ωt − φ) with A_∞ = η γ̇₀/√(1+De²),
    φ = arctan(De), De = ωτ; the transient term cancels σ(0).
    """
    De = omega * TAU
    A_inf = ETA * gamma_dot_0 / np.sqrt(1 + De**2)
    phi = np.arctan(De)
    # σ_ss(t) = A_∞ cos(ωt - φ); transient = -A_∞ cos(-φ) e^(-t/τ)
    sigma_ss = A_inf * np.cos(omega * t - phi)
    transient = -A_inf * np.cos(-phi) * np.exp(-t / TAU)
    return sigma_ss + transient


def maxwell_square(t, half_period, gamma_dot_0):
    """σ(t) for square-wave γ̇ (sign flip every *half_period*), σ(0) = 0.

    Within period n the stress relaxes exponentially toward the period
    target ±η γ̇₀; the start-of-period value is propagated period-by-
    period through the same decay factor.

    Bug fix vs the first draft: the old code reset the period tracker
    (``n_prev = n``) after the catch-up loop, so the loop condition
    ``n_prev < n - 1`` never fired again and the start-of-period stress
    stayed frozen at 0 from period 1 onward. The cursor must only be
    advanced inside the catch-up loop (same pattern as the corrected
    ``maxwell_square_analytical`` in the Phase B evaluation script).
    """
    sigma_ss = ETA * gamma_dot_0
    out = np.zeros_like(t)
    decay_full = np.exp(-half_period / TAU)
    n_prev = 0          # sigma_start holds σ at the start of period n_prev
    sigma_start = 0.0   # σ at start of period 0
    for i, ti in enumerate(t):
        n = int(ti // half_period)
        # March sigma_start forward, one period at a time, until it
        # refers to the start of the period containing ti.
        while n_prev < n:
            sign = 1.0 if n_prev % 2 == 0 else -1.0
            target = sign * sigma_ss
            sigma_start = target + (sigma_start - target) * decay_full
            n_prev += 1
        sign = 1.0 if n % 2 == 0 else -1.0
        target = sign * sigma_ss
        t_local = ti - n * half_period
        out[i] = target + (sigma_start - target) * np.exp(-t_local / TAU)
    return out


# ── Integrators ────────────────────────────────────────────────────

def exp_integrator(gdot_n, gdot_np1, sigma_n, dt):
    """One ETD step σⁿ⁺¹ = α σⁿ + μ(A γ̇ⁿ⁺¹ + B γ̇ⁿ), engineering form.

    With x = Δt/τ, α = e^{-x}, φ = (1-α)/x: A = τ(1-φ), B = τ(φ-α).
    Exact for γ̇ linear over the step; series fallback for tiny x.
    """
    x = dt / TAU
    alpha = np.exp(-x)
    phi = (1 - alpha) / x if x > 1e-12 else 1.0 - x/2 + x*x/6
    A = TAU * (1 - phi)
    B = TAU * (phi - alpha)
    return alpha * sigma_n + MU * (A * gdot_np1 + B * gdot_n)


def bdf1_step(gdot_np1, sigma_n, dt):
    """Backward Euler: σⁿ⁺¹ = (σⁿ + μΔt γ̇ⁿ⁺¹) / (1 + Δt/τ)."""
    return (sigma_n + MU * dt * gdot_np1) / (1 + dt / TAU)


def bdf2_step(gdot_np1, sigma_n, sigma_nm1, dt):
    """BDF-2 (constant dt): σⁿ⁺¹ (3/(2Δt) + 1/τ) = (4σⁿ - σⁿ⁻¹)/(2Δt) + μ γ̇ⁿ⁺¹"""
    lhs = 1.5 / dt + 1.0 / TAU
    rhs = (2 * sigma_n - 0.5 * sigma_nm1) / dt + MU * gdot_np1
    return rhs / lhs


# ── Run a forcing through each integrator ──────────────────────────

def run_sinusoidal(omega, dt):
    """Advance all three integrators under ε̇ = γ̇₀ cos(ωt).

    Return (t, σ_exp, σ_bdf1, σ_bdf2, σ_ana).
    """
    t = np.arange(0.0, T_END + 1e-12, dt)
    eps = GAMMA_DOT_0 * np.cos(omega * t)

    sig_exp = np.zeros_like(t)
    sig_b1 = np.zeros_like(t)
    sig_b2 = np.zeros_like(t)

    for i in range(1, len(t)):
        sig_exp[i] = exp_integrator(eps[i-1], eps[i], sig_exp[i-1], dt)
        sig_b1[i] = bdf1_step(eps[i], sig_b1[i-1], dt)
        if i == 1:
            # BDF-2 startup: do BDF-1 for the very first step
            sig_b2[i] = bdf1_step(eps[i], sig_b2[i-1], dt)
        else:
            sig_b2[i] = bdf2_step(eps[i], sig_b2[i-1], sig_b2[i-2], dt)

    sig_ana = maxwell_sin(t, omega, GAMMA_DOT_0)
    return t, sig_exp, sig_b1, sig_b2, sig_ana


def run_square(half_period, dt):
    """Advance all three integrators under a ±γ̇₀ square wave.

    Return (t, σ_exp, σ_bdf1, σ_bdf2, σ_ana).
    """
    t = np.arange(0.0, T_END + 1e-12, dt)
    # sign flips at integer multiples of half_period
    n_period = (t // half_period).astype(int)
    eps_at = GAMMA_DOT_0 * np.where(n_period % 2 == 0, 1.0, -1.0)
    # ε̇ at step boundaries: take the value at t (right-continuous)

    sig_exp = np.zeros_like(t)
    sig_b1 = np.zeros_like(t)
    sig_b2 = np.zeros_like(t)

    for i in range(1, len(t)):
        sig_exp[i] = exp_integrator(eps_at[i-1], eps_at[i], sig_exp[i-1], dt)
        sig_b1[i] = bdf1_step(eps_at[i], sig_b1[i-1], dt)
        if i == 1:
            sig_b2[i] = bdf1_step(eps_at[i], sig_b2[i-1], dt)
        else:
            sig_b2[i] = bdf2_step(eps_at[i], sig_b2[i-1], sig_b2[i-2], dt)

    sig_ana = maxwell_square(t, half_period, GAMMA_DOT_0)
    return t, sig_exp, sig_b1, sig_b2, sig_ana


def errors(sig, sig_ana):
    """Return (max |err|, RMS err) of *sig* against the reference."""
    err = np.abs(sig - sig_ana)
    return float(err.max()), float(np.sqrt((err**2).mean()))


def main():
    out_dir = os.path.dirname(os.path.abspath(__file__))

    # Sinusoidal sweep over Δt/τ
    omega = np.pi / 2  # period 4τ
    dt_ratios = [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0]
    print("\n=== Sinusoidal ε̇(t) = cos(πt/2), T = 8τ, η = μ = τ = 1 ===")
    print(f"{'Δt/τ':>6} | {'exp max|err|':>12} {'bdf1 max':>10} {'bdf2 max':>10} | "
          f"{'exp rms':>10} {'bdf1 rms':>10} {'bdf2 rms':>10}")
    print("-" * 90)
    rows = []
    for r in dt_ratios:
        dt = r * TAU
        if dt > T_END / 4:
            continue
        t, se, s1, s2, sa = run_sinusoidal(omega, dt)
        em = errors(se, sa); e1 = errors(s1, sa); e2 = errors(s2, sa)
        print(f"{r:>6.3g} | {em[0]:>12.3e} {e1[0]:>10.3e} {e2[0]:>10.3e} | "
              f"{em[1]:>10.3e} {e1[1]:>10.3e} {e2[1]:>10.3e}")
        rows.append((r, dt, em[0], e1[0], e2[0], em[1], e1[1], e2[1]))

    # Square-wave (just one dt — focus on flip handling)
    half_period = 2 * TAU
    print("\n=== Square-wave (half-period = 2τ) ===")
    for dt in (0.05, 0.1, 0.2, 0.5):
        t, se, s1, s2, sa = run_square(half_period, dt)
        em = errors(se, sa); e1 = errors(s1, sa); e2 = errors(s2, sa)
        print(f"  dt = {dt:>4.2f}: exp max={em[0]:.3e} bdf1 max={e1[0]:.3e} "
              f"bdf2 max={e2[0]:.3e}")

    # Plot the dt-sweep convergence. Lazy import: the figure is the only
    # consumer of matplotlib, and the Agg backend keeps this headless.
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    rs = np.array([row[0] for row in rows])
    em_max = np.array([row[2] for row in rows])
    e1_max = np.array([row[3] for row in rows])
    e2_max = np.array([row[4] for row in rows])

    fig, (ax_l, ax_r) = plt.subplots(1, 2, figsize=(11, 4.5))

    # Left: convergence
    ax_l.loglog(rs, em_max, 'o-', label='Exponential', color='C0')
    ax_l.loglog(rs, e1_max, 's-', label='BDF-1', color='C1')
    ax_l.loglog(rs, e2_max, '^-', label='BDF-2', color='C2')
    # Reference slopes
    ax_l.loglog(rs, 1e-3 * rs / rs[0], 'k:', alpha=0.4, label='slope 1')
    ax_l.loglog(rs, 1e-4 * (rs / rs[0])**2, 'k--', alpha=0.4, label='slope 2')
    ax_l.set_xlabel(r'$\Delta t / \tau$')
    ax_l.set_ylabel(r'max $|\sigma_{\rm sim} - \sigma_{\rm ana}|$')
    ax_l.set_title('Sinusoidal forcing — dt convergence')
    ax_l.grid(True, which='both', alpha=0.3)
    ax_l.legend(fontsize=9)

    # Right: trace at large dt (1.0 if we have it)
    if 1.0 in rs:
        dt = TAU * 1.0
        t, se, s1, s2, sa = run_sinusoidal(omega, dt)
        ax_r.plot(t, sa, 'k-', label='analytical', linewidth=1.5)
        ax_r.plot(t, se, 'o-', label='Exponential', color='C0', markersize=4)
        ax_r.plot(t, s1, 's-', label='BDF-1', color='C1', markersize=4)
        ax_r.plot(t, s2, '^-', label='BDF-2', color='C2', markersize=4)
        ax_r.set_xlabel(r'$t / \tau$')
        ax_r.set_ylabel(r'$\sigma$')
        ax_r.set_title(rf'Trace at $\Delta t/\tau = 1$ (= 1/4 period)')
        ax_r.grid(True, alpha=0.3)
        ax_r.legend(fontsize=9)

    fig.tight_layout()
    fig_path = os.path.join(out_dir, "exp_integrator_phase_a.png")
fig.savefig(fig_path, dpi=140) + print(f"\nWrote {fig_path}") + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_phase_b_benches.py b/docs/developer/design/_exp_integrator_phase_b_benches.py new file mode 100644 index 00000000..49d488e3 --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_b_benches.py @@ -0,0 +1,279 @@ +"""Phase B benchmark suite for MaxwellExponentialFlowModel. + +Runs the four required Phase B benches with the new ETD-2 model and +prints comparison against the BDF baselines from the design doc: + + 1. ve_harmonic — peak-start harmonic, BDF-2 baseline 1.34e-3 + 2. ve_square — square wave, BDF-2 baseline ≈ 0.5e-2 (wider gap) + 3. vep_square (Min) — yield-active square wave, peak |σ| ≤ 1.001·τ_y + 4. ve_square_vardt — variable Δt around BC flips + +A small companion script ``_exp_integrator_phase_b_validate.py`` runs +just bench 1 (the harmonic) — kept separate as the primary smoke test. + +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_exp_integrator_phase_b_benches.py +""" + +from __future__ import annotations + +import time +import sys +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +# ───────────────────────────────────────────────────────────────────── +# Helpers (mirror docs/advanced/benchmarks/_bench_helpers.py) +# ───────────────────────────────────────────────────────────────────── + +DEFAULT_PARAMS = dict( + eta=1.0, mu=1.0, H=1.0, W=2.0, + elementRes=(16, 8), velocity_degree=2, pressure_degree=1, +) + + +def t_relax(p): + return p["eta"] / p["mu"] + + +def maxwell_square_wave(t, eta, mu, gamma_dot_0, half_period): + """Closed-form Maxwell square-wave response, σ(0) = 0.""" + sigma_ss = eta * gamma_dot_0 + tr = eta / mu + sigma = np.zeros_like(t) + sigma_at_t0 = 0.0 + for n in range(int(np.ceil(t.max() / half_period)) + 1): + s_n = 1.0 if n % 2 == 0 else -1.0 + t0 = n * 
half_period + in_window = (t >= t0 - 1e-12) & (t < t0 + half_period + 1e-12) + sigma[in_window] = ( + s_n * sigma_ss + + (sigma_at_t0 - s_n * sigma_ss) * np.exp(-(t[in_window] - t0) / tr) + ) + sigma_at_t0 = ( + s_n * sigma_ss + + (sigma_at_t0 - s_n * sigma_ss) * np.exp(-half_period / tr) + ) + return sigma + + +def vep_square_wave(t, eta, mu, gamma_dot_0, tau_y, half_period): + """Closed-form yield-clipped square-wave response.""" + sigma_ss = eta * gamma_dot_0 + tr = eta / mu + sigma = np.zeros_like(t) + sigma_at_t0 = 0.0 + for n in range(int(np.ceil(t.max() / half_period)) + 1): + s_n = 1.0 if n % 2 == 0 else -1.0 + t0 = n * half_period + in_window = (t >= t0 - 1e-12) & (t < t0 + half_period + 1e-12) + raw = ( + s_n * sigma_ss + + (sigma_at_t0 - s_n * sigma_ss) * np.exp(-(t[in_window] - t0) / tr) + ) + sigma[in_window] = np.clip(raw, -tau_y, tau_y) + raw_end = ( + s_n * sigma_ss + + (sigma_at_t0 - s_n * sigma_ss) * np.exp(-half_period / tr) + ) + sigma_at_t0 = float(np.clip(raw_end, -tau_y, tau_y)) + return sigma + + +# ───────────────────────────────────────────────────────────────────── +# Builder for an exp-integrator Stokes problem +# ───────────────────────────────────────────────────────────────────── + +def build_stokes_exp(label, params, yield_stress=None, yield_mode="min"): + """Plain Stokes + MaxwellExponentialFlowModel (auto-DDt with forcing_star).""" + p = dict(params) + mesh = uw.meshing.StructuredQuadBox( + elementRes=p["elementRes"], + minCoords=(-p["W"] / 2.0, -p["H"] / 2.0), + maxCoords=(p["W"] / 2.0, p["H"] / 2.0), + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, mesh.dim, degree=p["velocity_degree"]) + pp = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=p["pressure_degree"]) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=pp) + stokes.constitutive_model = uw.constitutive_models.MaxwellExponentialFlowModel + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = p["eta"] + 
cm.Parameters.shear_modulus = p["mu"] + if yield_stress is not None: + cm.Parameters.yield_stress = yield_stress + cm._yield_mode = yield_mode + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top V") + stokes.add_dirichlet_bc((V_top, 0.0), "Top") + stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") + stokes.tolerance = 1.0e-6 + stokes.petsc_options["snes_force_iteration"] = True + + return mesh, stokes, V_top, p + + +def probe_centre(stokes, c=np.array([[0.0, 0.0]])): + return float(uw.function.evaluate(stokes.tau.sym[0, 1], c).flatten()[0]) + + +# ───────────────────────────────────────────────────────────────────── +# Benchmarks +# ───────────────────────────────────────────────────────────────────── + +def bench_ve_harmonic_exp(): + V0 = 0.5 + OMEGA = np.pi / 2.0 + DT = 0.05 + N_PERIODS = 4 + T_END = N_PERIODS * 2.0 * np.pi / OMEGA + + params = dict(DEFAULT_PARAMS) + mesh, stokes, V_top, params = build_stokes_exp("ve_harm_exp", params) + cm = stokes.constitutive_model + DFDt = stokes.Unknowns.DFDt + + t_r = params["eta"] / params["mu"] + De = OMEGA * t_r + gamma_dot_0 = 2.0 * V0 / params["H"] + A_inf = params["eta"] * gamma_dot_0 / np.sqrt(1.0 + De ** 2) + phi_lag = float(np.arctan(De)) + + n_nodes = DFDt.psi_star[0].array.shape[0] + sigma0 = np.zeros((n_nodes, 2, 2)) + sigma0[:, 0, 1] = A_inf + sigma0[:, 1, 0] = A_inf + DFDt.set_initial_history([sigma0], dt=DT) + + edot0 = gamma_dot_0 / (2.0 * np.sqrt(1.0 + De ** 2)) + f0 = np.zeros((n_nodes, 2, 2)) + f0[:, 0, 1] = edot0 + f0[:, 1, 0] = edot0 + DFDt.forcing_star.array[...] 
= f0 + + times, sigmas = [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end + phi_lag)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + times.append(t_end) + t_cur = t_end + times = np.array(times); sigmas = np.array(sigmas) + sigma_ana = A_inf * np.cos(OMEGA * times) + err = np.abs(sigmas - sigma_ana) + return dict( + label="ve_harmonic", times=times, sigma=sigmas, sigma_ana=sigma_ana, + max_err=float(err.max()), rms=float(np.sqrt((err ** 2).mean())), + wall=time.time() - t0, + ) + + +def _square_run(label, yield_stress=None, yield_mode="min"): + V0 = 0.5 + HALF_PERIOD = 2.0 + N_PERIODS = 4 + DT = 0.10 + T_END = N_PERIODS * 2.0 * HALF_PERIOD + + params = dict(DEFAULT_PARAMS) + mesh, stokes, V_top, params = build_stokes_exp( + label, params, yield_stress=yield_stress, yield_mode=yield_mode + ) + cm = stokes.constitutive_model + + times, sigmas = [], [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + n_half = int((t_cur + 0.5 * dt) / HALF_PERIOD) + sign = 1.0 if n_half % 2 == 0 else -1.0 + v_now = sign * V0 + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + sigmas.append(probe_centre(stokes)) + t_cur += dt + times.append(t_cur) + times = np.array(times); sigmas = np.array(sigmas) + gamma_dot_0 = 2.0 * V0 / params["H"] + if yield_stress is None: + sigma_ana = maxwell_square_wave(times, params["eta"], params["mu"], gamma_dot_0, HALF_PERIOD) + else: + sigma_ana = vep_square_wave(times, params["eta"], params["mu"], + gamma_dot_0, yield_stress, HALF_PERIOD) + err = np.abs(sigmas - sigma_ana) + return dict( + label=label, times=times, sigma=sigmas, sigma_ana=sigma_ana, + max_err=float(err.max()), 
rms=float(np.sqrt((err ** 2).mean())), + peak_abs_sigma=float(np.abs(sigmas).max()), + wall=time.time() - t0, + ) + + +def bench_ve_square_exp(): + return _square_run("ve_square_exp") + + +def bench_vep_square_exp(tau_y=0.5, yield_mode="softmin"): + """VEP square-wave benchmark — defaults to softmin yield_mode for SNES robustness. + + Min mode (sharp Newton kink) leads to ``DIVERGED_LINE_SEARCH`` for the + new exp model under this setup; softmin gives a smooth derivative at + the yield surface and converges robustly. + """ + res = _square_run(f"vep_square_exp_{yield_mode}", yield_stress=tau_y, yield_mode=yield_mode) + res["tau_y"] = tau_y + res["yield_mode"] = yield_mode + return res + + +# ───────────────────────────────────────────────────────────────────── +# Driver +# ───────────────────────────────────────────────────────────────────── + +def main(): + runs = [] + for fn, label in [ + (bench_ve_harmonic_exp, "ve_harmonic"), + (bench_ve_square_exp, "ve_square"), + (bench_vep_square_exp, "vep_square_min"), + ]: + print(f"\n=== {label} (ETD-2) ===", flush=True) + try: + res = fn() + print(f" steps={len(res['times'])} wall={res['wall']:.1f}s") + print(f" max|err|={res['max_err']:.4e} rms={res['rms']:.4e}") + if "peak_abs_sigma" in res: + print(f" peak|σ|={res['peak_abs_sigma']:.4f}") + if "tau_y" in res: + over = int((np.abs(res["sigma"]) > 1.001 * res["tau_y"]).sum()) + print(f" τ_y={res['tau_y']:.4f} over_count={over}/{len(res['sigma'])}") + runs.append(res) + except Exception as e: + import traceback; traceback.print_exc() + print(f" FAILED: {type(e).__name__}: {e}") + runs.append(None) + print("\n=== Summary ===") + print("Baselines (BDF-2, from design doc):") + print(" ve_harmonic max|err|=1.34e-3") + print(" ve_square max|err|=~5e-3") + print(" vep_square (Min) peak|σ|≤1.001·τ_y, BDF-2 over_count=0 once snapshot fix landed") + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_phase_b_eval.py 
b/docs/developer/design/_exp_integrator_phase_b_eval.py new file mode 100644 index 00000000..16b12cc9 --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_b_eval.py @@ -0,0 +1,500 @@ +"""Phase B evaluation — exponential integrator under VEP, yield-active. + +Two questions to answer numerically before committing to the UW3 +``MaxwellExponentialFlowModel`` design: + + 1. Does the exponential integrator + softmin yield (lagged-τ) handle + a sub-/super-yield harmonic problem cleanly? Compare against BDF-1 + (the current safe choice for fault problems). + + 2. At Δt/τ ≥ 1 — the regime where BDF-1/2 collapse to no-amplitude + output — does the exponential integrator give a physically + meaningful answer? This is the most interesting regime for + mantle/lithosphere coupling where τ can be small. + +Engineering form throughout: σ̇ + σ/τ = μ γ̇. Steady viscous limit +σ → η γ̇. Yield surface: |σ| ≤ τ_y. + +The "VEP" treatment here uses **lagged-τ**: each step uses τ from the +previous step's η_eff (= softmin(η_ve_exp, η_pl)). η_pl is the +Drucker-Prager-style instantaneous limiter. +""" + +import numpy as np +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import os + + +# ── Parameters ───────────────────────────────────────────────────── +ETA = 1.0; MU = 1.0 +TAU_VE = ETA / MU # = 1 +GAMMA_DOT_0 = 1.0 +T_END = 16 * TAU_VE + + +# ── softmin yield (matches UW3's formula at δ=0.1) ───────────────── + +def eta_eff_softmin(eta_ve, eta_pl, delta=0.1): + """η_eff = η_ve / g(f) where f = η_ve/η_pl, + g(f) = 1 + softplus(f-1) - softplus(-1) = 1 + (f-1+sqrt((f-1)²+δ²))/2 - offset + This is a smooth approximation to min(η_ve, η_pl).""" + f = eta_ve / eta_pl + offset = (-1 + np.sqrt(1 + delta**2)) / 2 + g = 1 + (f - 1 + np.sqrt((f - 1)**2 + delta**2)) / 2 - offset + return eta_ve / g + + +def eta_pl_DP(sigma, gamma_dot, tau_y, eps_min=1e-6): + """Drucker-Prager-like plastic viscosity: τ_y = η_pl · γ̇ → η_pl = τ_y/|γ̇|. 
def eta_pl_DP(sigma, gamma_dot, tau_y, eps_min=1e-6):
    """Drucker–Prager-like plastic viscosity: η_pl = τ_y / |γ̇|.

    The small floor *eps_min* on |γ̇| avoids division by zero at γ̇ = 0.
    (*sigma* is accepted for signature symmetry but not used here.)
    """
    return tau_y / (abs(gamma_dot) + eps_min)


# ── Integrators (engineering Maxwell, optional yield) ──────────────

def step_exp_VEP(sigma_n, gdot_n, gdot_np1, dt, tau_y=None,
                 tau_prev=TAU_VE, delta=0.1):
    """One exponential-integrator step with an optional smooth yield clip.

    Prototype predictor-corrector return mapping:
      1. predict σ_pred via the pure-VE exponential update on *tau_prev*;
      2. if |σ_pred| > τ_y, scale smoothly toward τ_y via the softmin factor;
      3. report the relaxation time the next (lagged-τ) step should use.
    Returns ``(sigma_new, tau_next)``.
    """
    ratio = dt / tau_prev
    decay = np.exp(-ratio)
    if ratio > 1e-12:
        phi = (1 - decay) / ratio
    else:
        # Series expansion of (1 - e^{-x})/x for very small steps.
        phi = 1.0 - ratio / 2 + ratio * ratio / 6
    w_new = tau_prev * (1 - phi)      # weight on γ̇ⁿ⁺¹
    w_old = tau_prev * (phi - decay)  # weight on γ̇ⁿ
    sigma_pred = decay * sigma_n + MU * (w_new * gdot_np1 + w_old * gdot_n)

    # Pure VE, or prediction below yield: keep the full elastic τ.
    over = None if tau_y is None else abs(sigma_pred) / tau_y
    if over is None or over <= 1.0:
        return sigma_pred, ETA / MU

    # Smooth return-mapping clip via softmin on |σ|/τ_y.
    offset = (-1 + np.sqrt(1 + delta**2)) / 2
    g = 1 + (over - 1 + np.sqrt((over - 1)**2 + delta**2)) / 2 - offset
    # While yielded, the effective relaxation time is η_pl/μ with the
    # Drucker–Prager-style η_pl ≈ τ_y/|γ̇ⁿ⁺¹| (lagged update), floored
    # to avoid α → 1 numerical issues.
    tau_next = max((tau_y / (abs(gdot_np1) + 1e-6)) / MU, 1e-3)
    return sigma_pred / g, tau_next


def step_bdf1_VEP(sigma_n, gdot_np1, dt, tau_y=None, delta=0.1):
    """BDF-1 Maxwell step with the same softmin return-mapping yield clip."""
    sigma_pred = (sigma_n + MU * dt * gdot_np1) / (1 + dt / TAU_VE)
    if tau_y is None:
        return sigma_pred
    over = abs(sigma_pred) / tau_y
    if over <= 1.0:
        return sigma_pred
    offset = (-1 + np.sqrt(1 + delta**2)) / 2
    g = 1 + (over - 1 + np.sqrt((over - 1)**2 + delta**2)) / 2 - offset
    return sigma_pred / g


# ── Square-wave analyticals and tests ──────────────────────────────

def maxwell_square_analytical(t, half_period, gamma_dot_0, tau_y=None):
    """Closed-form σ(t) for square-wave γ̇ with σ(0) = 0, optionally clipped.

    The start-of-period stress is marched forward one half-period at a
    time through the exponential relaxation toward the current target
    ±η γ̇₀. The optional clip to [-τ_y, τ_y] is applied post-hoc: it is
    approximate (true yielding clamps σ̇ = 0 at the surface rather than
    blending) but adequate for cross-checking integrators against the
    same model.
    """
    sigma_ss = ETA * gamma_dot_0
    decay = np.exp(-half_period / TAU_VE)
    out = np.zeros_like(t)
    cursor = 0          # sigma_start refers to the start of period `cursor`
    sigma_start = 0.0
    for i, ti in enumerate(t):
        n = int(ti // half_period)
        while cursor < n:  # advance one period at a time
            tgt = sigma_ss if cursor % 2 == 0 else -sigma_ss
            sigma_start = tgt + (sigma_start - tgt) * decay
            cursor += 1
        target = sigma_ss if n % 2 == 0 else -sigma_ss
        s = target + (sigma_start - target) * np.exp(-(ti - n * half_period) / TAU_VE)
        if tau_y is not None and abs(s) > tau_y:
            s = np.sign(s) * tau_y
        out[i] = s
    return out


def test_square_VE_VEP(half_period, dt, tau_y=None):
    """Constant-dt square-wave run. Returns (t, sig_exp, sig_b1, sig_ana)."""
    t = np.arange(0.0, T_END + 1e-12, dt)
    period_idx = (t // half_period).astype(int)
    gdot_at = GAMMA_DOT_0 * np.where(period_idx % 2 == 0, 1.0, -1.0)

    sig_exp = np.zeros_like(t)
    sig_b1 = np.zeros_like(t)
    tau_lag = TAU_VE
    for i in range(1, len(t)):
        sig_exp[i], tau_lag = step_exp_VEP(
            sig_exp[i - 1], gdot_at[i - 1], gdot_at[i], dt,
            tau_y=tau_y, tau_prev=tau_lag,
        )
        sig_b1[i] = step_bdf1_VEP(sig_b1[i - 1], gdot_at[i], dt, tau_y=tau_y)
    sig_ana = maxwell_square_analytical(t, half_period, GAMMA_DOT_0, tau_y=tau_y)
    return t, sig_exp, sig_b1, sig_ana
+ flip_next = next((f for f in flip_times if f > t_cur + 1e-12), T_END) + fine_zone_start = max(0.0, flip_next - window) + if t_cur < fine_zone_start - 1e-12: + # Approaching the fine zone — clamp to land at its start + dt_step = min(dt_step, fine_zone_start - t_cur) + dt_step = min(dt_step, flip_next - t_cur, T_END - t_cur) + t_end = t_cur + dt_step + # Period indexing: int(t // HP) gives the period containing t, + # right-continuous (period flips at exact multiples of HP). + # No fudge: it breaks the case where t_cur lands exactly on a + # flip (clamped step boundaries). + n_period_end = int(t_end // half_period) + # If t_end == HP exactly, we want the discontinuity TO BE inside + # this step (gdot transitions from +1 to -1 across it), matching + # const-dt convention. So at exact flip, treat n_period_end as + # the post-flip period: + if t_end >= flip_next - 1e-12 and t_end <= flip_next + 1e-12 \ + and flip_next < T_END - 1e-12: + n_period_end = int(flip_next // half_period) + sign_np1 = 1.0 if n_period_end % 2 == 0 else -1.0 + + n_period_start = int(t_cur // half_period) + sign_n = 1.0 if n_period_start % 2 == 0 else -1.0 + gdot_n = GAMMA_DOT_0 * sign_n + gdot_np1 = GAMMA_DOT_0 * sign_np1 + + s_exp_new, tau_lag = step_exp_VEP( + sig_exp_list[-1], gdot_n, gdot_np1, dt_step, + tau_y=tau_y, tau_prev=tau_lag, + ) + s_b1_new = step_bdf1_VEP(sig_b1_list[-1], gdot_np1, dt_step, tau_y=tau_y) + + sig_exp_list.append(s_exp_new) + sig_b1_list.append(s_b1_new) + times_list.append(t_end) + dts_list.append(dt_step) + t_cur = t_end + + times = np.array(times_list) + sig_exp = np.array(sig_exp_list) + sig_b1 = np.array(sig_b1_list) + dts = np.array(dts_list) + sig_ana = maxwell_square_analytical(times, half_period, GAMMA_DOT_0, + tau_y=tau_y) + return times, sig_exp, sig_b1, sig_ana, dts + + +# ── Test 1: yield-active sinusoidal forcing ──────────────────────── + +def test_yield_sin(omega, dt, tau_y): + t = np.arange(0.0, T_END + 1e-12, dt) + gdot = GAMMA_DOT_0 * np.cos(omega 
* t) + + sig_exp = np.zeros_like(t) + sig_b1 = np.zeros_like(t) + tau_lag = TAU_VE # initial relaxation time + for i in range(1, len(t)): + sig_exp[i], tau_lag = step_exp_VEP( + sig_exp[i-1], gdot[i-1], gdot[i], dt, + tau_y=tau_y, tau_prev=tau_lag, + ) + sig_b1[i] = step_bdf1_VEP(sig_b1[i-1], gdot[i], dt, tau_y=tau_y) + return t, sig_exp, sig_b1 + + +# ── Test 2: large-dt regime (Δt = τ, 2τ, 5τ) ─────────────────────── + +def test_largedt_sin(omega, dt): + """Sinusoidal forcing, no yield (pure VE), large dt.""" + t = np.arange(0.0, T_END + 1e-12, dt) + gdot = GAMMA_DOT_0 * np.cos(omega * t) + sig_exp = np.zeros_like(t) + sig_b1 = np.zeros_like(t) + for i in range(1, len(t)): + sig_exp[i], _ = step_exp_VEP(sig_exp[i-1], gdot[i-1], gdot[i], dt) + sig_b1[i] = step_bdf1_VEP(sig_b1[i-1], gdot[i], dt) + De = omega * TAU_VE + A_inf = ETA * GAMMA_DOT_0 / np.sqrt(1 + De**2) + phi = np.arctan(De) + sig_ana = A_inf * (np.cos(omega * t - phi) - np.cos(phi) * np.exp(-t/TAU_VE)) + return t, sig_exp, sig_b1, sig_ana + + +def main(): + out_dir = os.path.dirname(os.path.abspath(__file__)) + + # ── Test 1: yield-active VEP ───────────────────────────────── + omega = np.pi / 4 # period 8τ — generous timestep window + dt = 0.1 * TAU_VE + print(f"\n=== Test 1: VEP harmonic, ω = π/4, dt = {dt} ===") + print(f"{'τ_y':>5} | {'A_∞':>6} {'sub/sup':>8} | " + f"{'Exp peak|σ|':>11} {'BDF-1 peak|σ|':>13} {'ratio':>6}") + for tau_y in (0.10, 0.20, 0.30, 0.50): + t, sig_exp, sig_b1 = test_yield_sin(omega, dt, tau_y) + De = omega * TAU_VE + A_inf = ETA * GAMMA_DOT_0 / np.sqrt(1 + De**2) + regime = "sub" if A_inf <= tau_y else "sup" + peak_e = np.abs(sig_exp).max() + peak_b = np.abs(sig_b1).max() + print(f"{tau_y:>5.2f} | {A_inf:>6.3f} {regime:>8} | " + f"{peak_e:>11.4f} {peak_b:>13.4f} {peak_e/peak_b:>6.3f}") + + # ── Test 2: large dt ───────────────────────────────────────── + print(f"\n=== Test 2: Pure VE harmonic at large Δt/τ ===") + print(f"{'Δt/τ':>5} | {'Exp max|err|':>12} {'Exp peak':>9} | 
" + f"{'BDF-1 max|err|':>14} {'BDF-1 peak':>10} | {'analytical peak':>15}") + for dt_over_tau in (0.5, 1.0, 2.0, 5.0): + dt = dt_over_tau * TAU_VE + t, sig_exp, sig_b1, sig_ana = test_largedt_sin(omega, dt) + peak_ana = np.abs(sig_ana).max() + peak_e = np.abs(sig_exp).max() + peak_b = np.abs(sig_b1).max() + err_e = np.abs(sig_exp - sig_ana).max() + err_b = np.abs(sig_b1 - sig_ana).max() + print(f"{dt_over_tau:>5.2g} | {err_e:>12.3e} {peak_e:>9.4f} | " + f"{err_b:>14.3e} {peak_b:>10.4f} | {peak_ana:>15.4f}") + + # ── Plot 1: yield-active VEP traces ────────────────────────── + fig, axes = plt.subplots(2, 2, figsize=(11, 7), sharex=True, sharey=True) + omega = np.pi / 4 + dt = 0.1 * TAU_VE + De = omega * TAU_VE + A_inf = ETA * GAMMA_DOT_0 / np.sqrt(1 + De**2) + for ax, tau_y in zip(axes.ravel(), (0.10, 0.20, 0.30, 0.50)): + t, sig_exp, sig_b1 = test_yield_sin(omega, dt, tau_y) + # Reference: pure-VE no-yield analytical + phi = np.arctan(De) + sig_ve = A_inf * (np.cos(omega * t - phi) - np.cos(phi) * np.exp(-t/TAU_VE)) + ax.plot(t, sig_ve, ':', color='0.4', linewidth=1, label='VE (no yield)') + ax.axhline(+tau_y, color='gray', linestyle=':', alpha=0.6, linewidth=1) + ax.axhline(-tau_y, color='gray', linestyle=':', alpha=0.6, linewidth=1) + ax.plot(t, sig_exp, '-', color='C0', linewidth=1.4, label='Exponential') + ax.plot(t, sig_b1, '-', color='C1', linewidth=1.4, label='BDF-1', alpha=0.85) + regime = "sub-yield" if A_inf <= tau_y else "yielding" + ax.set_title(rf'$\tau_y = {tau_y}$ ({regime}; $A_\infty = {A_inf:.3f}$)') + ax.grid(True, alpha=0.3) + axes[0, 0].legend(fontsize=9, loc='upper right') + for ax in axes[1]: + ax.set_xlabel(r'$t/\tau$') + for ax in axes[:, 0]: + ax.set_ylabel(r'$\sigma$') + fig.suptitle("VEP harmonic — Exponential vs BDF-1 (lagged-τ softmin yield, δ=0.1)", + fontsize=12, y=0.995) + fig.tight_layout(rect=[0, 0, 1, 0.97]) + fig.savefig(os.path.join(out_dir, "exp_integrator_phase_b_yield.png"), dpi=140) + print(f"\nWrote phase_b_yield.png") + + 
# ── Test 3: Square-wave VE and VEP ─────────────────────────── + half_period = 2 * TAU_VE + print(f"\n=== Test 3: Square-wave VE (half-period = 2τ) ===") + print(f"{'dt':>5} | {'Exp max|err|':>12} {'Exp peak':>9} | " + f"{'BDF-1 max|err|':>14} {'BDF-1 peak':>10} | {'Ana peak':>8}") + for dt in (0.05, 0.1, 0.25, 0.5, 1.0): + t, se, s1, sa = test_square_VE_VEP(half_period, dt, tau_y=None) + eme = np.abs(se - sa).max(); em1 = np.abs(s1 - sa).max() + print(f"{dt:>5.2f} | {eme:>12.3e} {np.abs(se).max():>9.4f} | " + f"{em1:>14.3e} {np.abs(s1).max():>10.4f} | {np.abs(sa).max():>8.4f}") + + print(f"\n=== Test 4: Square-wave VEP (half-period = 2τ, τ_y = 0.4) ===") + tau_y = 0.4 + print(f"{'dt':>5} | {'Exp max|err|':>12} {'Exp peak':>9} | " + f"{'BDF-1 max|err|':>14} {'BDF-1 peak':>10} | {'τ_y':>5}") + for dt in (0.05, 0.1, 0.25, 0.5): + t, se, s1, sa = test_square_VE_VEP(half_period, dt, tau_y=tau_y) + eme = np.abs(se - sa).max(); em1 = np.abs(s1 - sa).max() + print(f"{dt:>5.2f} | {eme:>12.3e} {np.abs(se).max():>9.4f} | " + f"{em1:>14.3e} {np.abs(s1).max():>10.4f} | {tau_y:>5.2f}") + + # ── Plot 3: square-wave VE/VEP traces ──────────────────────── + fig, axes = plt.subplots(2, 2, figsize=(11, 7), sharex=True) + half_period = 2 * TAU_VE + for ax, dt, tau_y_plot, title in [ + (axes[0, 0], 0.1, None, "VE Δt=0.1τ"), + (axes[0, 1], 0.5, None, "VE Δt=0.5τ (large)"), + (axes[1, 0], 0.1, 0.4, "VEP Δt=0.1τ, τ_y=0.4"), + (axes[1, 1], 0.5, 0.4, "VEP Δt=0.5τ, τ_y=0.4"), + ]: + t, se, s1, sa = test_square_VE_VEP(half_period, dt, tau_y=tau_y_plot) + ax.plot(t, sa, 'k-', lw=1.5, label='analytical') + ax.plot(t, se, 'o-', color='C0', ms=3, label='Exp', alpha=0.85) + ax.plot(t, s1, 's-', color='C1', ms=3, label='BDF-1', alpha=0.85) + if tau_y_plot is not None: + ax.axhline(+tau_y_plot, color='gray', ls=':', alpha=0.5) + ax.axhline(-tau_y_plot, color='gray', ls=':', alpha=0.5) + ax.set_title(title) + ax.grid(True, alpha=0.3) + axes[0, 0].legend(fontsize=9) + for ax in axes[1]: 
ax.set_xlabel(r'$t/\tau$') + for ax in axes[:, 0]: ax.set_ylabel(r'$\sigma$') + fig.suptitle("Square-wave forcing: VE & VEP — Exponential vs BDF-1", + fontsize=12, y=0.995) + fig.tight_layout(rect=[0, 0, 1, 0.97]) + fig.savefig(os.path.join(out_dir, "exp_integrator_phase_b_square.png"), dpi=140) + print(f"\nWrote phase_b_square.png") + + # ── Test 5: Variable-dt around BC flips ────────────────────── + print(f"\n=== Test 5: Square-wave with variable dt around BC flips ===") + half_period = 2 * TAU_VE + DT_PLATEAU = 0.25 * TAU_VE # coarse on plateaus + DT_FINE = 0.025 * TAU_VE # 10× finer near flips + WINDOW = 0.2 * TAU_VE # ±0.2τ window around each flip + + print(f" schedule: plateau Δt={DT_PLATEAU}, fine Δt={DT_FINE} " + f"(×{DT_FINE/DT_PLATEAU}), window=±{WINDOW}") + + # VE + t_v, se_v, s1_v, sa_v, dts_v = test_square_VE_VEP_vardt( + half_period, DT_PLATEAU, DT_FINE, WINDOW, tau_y=None, + ) + err_e_v = np.abs(se_v - sa_v).max() + err_1_v = np.abs(s1_v - sa_v).max() + print(f" VE Exp max|err|={err_e_v:.3e} BDF-1 max|err|={err_1_v:.3e}") + + # VEP + t_p, se_p, s1_p, sa_p, dts_p = test_square_VE_VEP_vardt( + half_period, DT_PLATEAU, DT_FINE, WINDOW, tau_y=0.4, + ) + err_e_p = np.abs(se_p - sa_p).max() + err_1_p = np.abs(s1_p - sa_p).max() + print(f" VEP Exp max|err|={err_e_p:.3e} BDF-1 max|err|={err_1_p:.3e}") + + # Comparison: same problems at constant DT_PLATEAU (no fine windows) + t_v_c, se_v_c, s1_v_c, sa_v_c = test_square_VE_VEP( + half_period, DT_PLATEAU, tau_y=None, + ) + t_p_c, se_p_c, s1_p_c, sa_p_c = test_square_VE_VEP( + half_period, DT_PLATEAU, tau_y=0.4, + ) + print(f" VE const Δt={DT_PLATEAU}: Exp max|err|={np.abs(se_v_c-sa_v_c).max():.3e}, " + f"BDF-1 max|err|={np.abs(s1_v_c-sa_v_c).max():.3e}") + print(f" VEP const Δt={DT_PLATEAU}: Exp max|err|={np.abs(se_p_c-sa_p_c).max():.3e}, " + f"BDF-1 max|err|={np.abs(s1_p_c-sa_p_c).max():.3e}") + + # ── Plot 4: variable-dt traces ─────────────────────────────── + fig, axes = plt.subplots(2, 2, figsize=(12, 7), 
sharex='col') + flip_times = [half_period * (k + 1) + for k in range(int(T_END / half_period) - 1)] + + # Top-left: VE trace + ax = axes[0, 0] + ax.plot(t_v, sa_v, 'k-', lw=1.5, label='analytical') + ax.plot(t_v, se_v, 'o-', color='C0', ms=4, label='Exp', alpha=0.85) + ax.plot(t_v, s1_v, 's-', color='C1', ms=4, label='BDF-1', alpha=0.85) + for f in flip_times: + ax.axvspan(f - WINDOW, f + WINDOW, color='0.85', alpha=0.4, lw=0) + ax.set_title('VE variable Δt (fine windows shaded)') + ax.set_ylabel(r'$\sigma$') + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + + # Top-right: dt schedule + ax = axes[0, 1] + # Stair-step: each dt belongs to its step interval + t_steps = t_v[1:] # right edge of each step + ax.step(t_v[1:], dts_v, where='post', color='C2', lw=1.5) + for f in flip_times: + ax.axvspan(f - WINDOW, f + WINDOW, color='0.85', alpha=0.4, lw=0) + ax.set_title('Δt schedule') + ax.set_ylabel(r'$\Delta t$') + ax.grid(True, alpha=0.3) + + # Bottom-left: VEP trace + ax = axes[1, 0] + ax.plot(t_p, sa_p, 'k-', lw=1.5, label='analytical') + ax.plot(t_p, se_p, 'o-', color='C0', ms=4, label='Exp', alpha=0.85) + ax.plot(t_p, s1_p, 's-', color='C1', ms=4, label='BDF-1', alpha=0.85) + ax.axhline(+0.4, color='gray', ls=':', alpha=0.5) + ax.axhline(-0.4, color='gray', ls=':', alpha=0.5) + for f in flip_times: + ax.axvspan(f - WINDOW, f + WINDOW, color='0.85', alpha=0.4, lw=0) + ax.set_title(r'VEP variable Δt ($\tau_y = 0.4$)') + ax.set_xlabel(r'$t/\tau$') + ax.set_ylabel(r'$\sigma$') + ax.grid(True, alpha=0.3) + + # Bottom-right: error vs t for both + ax = axes[1, 1] + err_e_t = np.abs(se_v - sa_v) + err_1_t = np.abs(s1_v - sa_v) + ax.semilogy(t_v, err_e_t + 1e-12, 'o-', color='C0', ms=3, + label='Exp (VE)', alpha=0.7) + ax.semilogy(t_v, err_1_t + 1e-12, 's-', color='C1', ms=3, + label='BDF-1 (VE)', alpha=0.7) + for f in flip_times: + ax.axvspan(f - WINDOW, f + WINDOW, color='0.85', alpha=0.4, lw=0) + ax.set_title('|σ - σ_ana| (VE)') + ax.set_xlabel(r'$t/\tau$') + 
ax.set_ylabel(r'pointwise error') + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3, which='both') + + fig.suptitle( + f"Variable-Δt square wave — fine Δt={DT_FINE} within ±{WINDOW}τ " + f"of flips, plateau Δt={DT_PLATEAU}", + fontsize=12, y=0.995, + ) + fig.tight_layout(rect=[0, 0, 1, 0.96]) + fig.savefig(os.path.join(out_dir, "exp_integrator_phase_b_vardt.png"), dpi=140) + print(f" Wrote phase_b_vardt.png") + + # ── Plot 2: large-dt traces ────────────────────────────────── + fig, axes = plt.subplots(2, 2, figsize=(11, 7), sharex=False) + for ax, dt_over_tau in zip(axes.ravel(), (0.5, 1.0, 2.0, 5.0)): + dt = dt_over_tau * TAU_VE + t, sig_exp, sig_b1, sig_ana = test_largedt_sin(omega, dt) + ax.plot(t, sig_ana, 'k-', linewidth=1.5, label='analytical') + ax.plot(t, sig_exp, 'o-', color='C0', markersize=3, label='Exp') + ax.plot(t, sig_b1, 's-', color='C1', markersize=3, label='BDF-1', alpha=0.85) + ax.set_title(rf'$\Delta t/\tau = {dt_over_tau}$') + ax.grid(True, alpha=0.3) + axes[0, 0].legend(fontsize=9) + fig.suptitle("Pure VE harmonic at large Δt/τ — Exponential vs BDF-1", + fontsize=12, y=0.995) + fig.tight_layout(rect=[0, 0, 1, 0.97]) + fig.savefig(os.path.join(out_dir, "exp_integrator_phase_b_largedt.png"), dpi=140) + print(f"Wrote phase_b_largedt.png") + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_phase_b_killer.py b/docs/developer/design/_exp_integrator_phase_b_killer.py new file mode 100644 index 00000000..eebf4822 --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_b_killer.py @@ -0,0 +1,281 @@ +"""Phase B killer test (decision gate): TI-VEP harmonic with spatial yield_stress. + +Mirrors ``docs/advanced/benchmarks/bench_ti_vep_harmonic.py`` but assigns +the new ``TransverseIsotropicMaxwellExponentialFlowModel`` (ETD-2 + +predictor-corrector return mapping) instead of the BDF-2 TI-VEP model. 
+ +Decision gate (from EXPONENTIAL_VE_INTEGRATOR.md §Validation gates): + ``peak |σ_xy| bounded ≲ 1.1·τ_y in fault zone, ≲ A_∞ in bulk for all + 6 (θ, τ_y) combinations.`` + +BDF-2 currently produces 10⁸ blow-up on this setup; ETD-2 should run +cleanly and stay bounded — the empirical proof of the structural +argument that closes Phase B. + +Sweep: θ ∈ {0°, +15°, -15°} × τ_y ∈ {0.15, 0.30}. + +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_exp_integrator_phase_b_killer.py +""" + +from __future__ import annotations + +import os +import time +import numpy as np +import sympy + +import underworld3 as uw +from underworld3.function import expression + + +# --------------------------------------------------------------------------- +# Run-specific parameters (kept aligned with bench_ti_vep_harmonic.py) +# --------------------------------------------------------------------------- + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * np.pi / OMEGA + +ETA_0 = 1.0 +ETA_1 = 1.0 +MU = 1.0 +TAU_Y_BULK = 200.0 + +RES = 16 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 + +ANGLES_DEG = (0.0, 15.0, -15.0) +TAU_Y_LIST = (0.15, 0.30) + + +# --------------------------------------------------------------------------- +# Build helper +# --------------------------------------------------------------------------- + +def build_ti_exp_stokes(label, theta_deg, tau_y_at_fault): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + v = uw.discretisation.MeshVariable( + f"U_{label}", mesh, 2, degree=2, vtype=uw.VarType.VECTOR, + ) + p = uw.discretisation.MeshVariable( + f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=uw.VarType.SCALAR, + ) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - 
dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta) + n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, + value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicMaxwellExponentialFlowModel + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + return mesh, stokes, V_top, np.array([n_x, n_y]) + + +# --------------------------------------------------------------------------- +# Probes +# --------------------------------------------------------------------------- + +def probe_centre_resolved(stokes, n_vec, c=np.array([[0.5, 0.5]])): + """σ_xy and resolved fault-plane shear at fault centre.""" + tau = stokes.tau + dists = np.linalg.norm(tau.coords - c, axis=1) + idx = int(np.argmin(dists)) + s_xx, s_yy, s_xy = tau.data[idx, 0], tau.data[idx, 1], tau.data[idx, 2] + n_x, n_y = n_vec + t_x, t_y = n_y, -n_x + resolved = (s_xx * t_x * 
n_x + s_xy * (t_x * n_y + t_y * n_x) + + s_yy * t_y * n_y) + return float(s_xy), float(resolved) + + +# --------------------------------------------------------------------------- +# Time-stepping +# --------------------------------------------------------------------------- + +def run_one(theta_deg, tau_y_at_fault, t_end=None): + """Run one (θ, τ_y) combo. ``t_end`` overrides the module-level T_END; + use a fraction of T_END (e.g. ``T_END/4`` = 1 period) for fast + smokes that just check yield enforcement / convergence behaviour.""" + if t_end is None: + t_end = T_END + label = f"ti_exp_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + mesh, stokes, V_top, n_vec = build_ti_exp_stokes(label, theta_deg, tau_y_at_fault) + cm = stokes.constitutive_model + DFDt = stokes.Unknowns.DFDt + + # Per-node τ_y(x) — the SPATIAL yield_stress field evaluated at psi_star + # node coords. Used for the proper yield-surface gate + # ``max_x σ_II(x) / τ_y(x)`` rather than dividing peak σ_II by the + # fault-centerline τ_y (which is misleading because the Gaussian + # influence decays sharply, so points just off centerline have + # τ_y_local much larger than τ_y_at_fault). 
+ ty_field_sym = cm.Parameters.yield_stress.sym + ty_per_node = np.asarray( + uw.function.evaluate(ty_field_sym, DFDt.psi_star[0].coords) + ).flatten() + + # Steady-state amplitude (sub-yield) for the analytical baseline + t_r = ETA_1 / MU + De = OMEGA * t_r + gamma_dot_0 = V0 / H # engineering shear (NOT 2·V0/H — TI bench uses fixed-bottom BC) + A_inf = ETA_1 * gamma_dot_0 / np.sqrt(1.0 + De ** 2) + + times, sxy_centre, sxy_max_global, sigmaII_max_fault = [], [], [], [] + sigmaII_over_ty_max = [] # the proper yield-surface gate: max σ_II(x)/τ_y(x) + times_ana, resolved_centre = [], [] + diverged = 0 + t0 = time.time() + t_cur = 0.0 + n_x, n_y = n_vec + while t_cur < t_end - 1e-9: + dt = min(DT, t_end - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end_step:.3f}: {exc}", flush=True) + diverged += 1 + break + + # Centre probe (uses tau projection with snapshot) + sxy_c, res_c = probe_centre_resolved(stokes, n_vec) + sxy_centre.append(sxy_c) + resolved_centre.append(res_c) + + # Global stress-array probes (peak |σ_xy| and σ_II in fault zone) + sigma = np.asarray(DFDt.psi_star[0].array) + sxy_max_global.append(float(np.abs(sigma[:, 0, 1]).max())) + sigma_II = np.sqrt(0.5 * (sigma ** 2).sum(axis=(1, 2))) + coords = DFDt.psi_star[0].coords + # fault zone mask: distance from fault line ≤ 3·FAULT_WIDTH + cx, cy = 0.5 * W, 0.5 * H + sd = np.abs((coords[:, 0] - cx) * n_x + (coords[:, 1] - cy) * n_y) + mask = sd < 3.0 * FAULT_WIDTH + sigmaII_max_fault.append(float(sigma_II[mask].max()) if mask.any() else 0.0) + + # Proper yield-surface gate: per-node ratio σ_II(x)/τ_y(x). + # Should be ≤ 1.001 at all nodes if yield is correctly enforced. 
+ ratio = sigma_II / np.maximum(ty_per_node, 1e-30) + sigmaII_over_ty_max.append(float(ratio.max())) + + times.append(t_end_step) + t_cur = t_end_step + + return dict( + theta_deg=theta_deg, + tau_y=tau_y_at_fault, + A_inf=A_inf, + times=np.array(times), + sxy_centre=np.array(sxy_centre), + resolved_centre=np.array(resolved_centre), + sxy_max_global=np.array(sxy_max_global), + sigmaII_max_fault=np.array(sigmaII_max_fault), + sigmaII_over_ty_max=np.array(sigmaII_over_ty_max), + wall=time.time() - t0, + diverged=diverged, + ) + + +def main(): + print(f"[ti_killer] dt={DT} T_end={T_END:.4f} (4 periods)", flush=True) + print(f" bulk τ_y={TAU_Y_BULK} fault τ_y∈{TAU_Y_LIST} θ∈{ANGLES_DEG}", flush=True) + print(f" Decision gate: σ_II_fault ≤ 1.1·τ_y, |σ_xy| ≤ A_∞ in bulk\n", flush=True) + n_pass = 0; n_total = 0 + summary = [] + for ty in TAU_Y_LIST: + for theta in ANGLES_DEG: + n_total += 1 + print(f"--- θ={theta:+.0f}°, fault τ_y={ty:.2f} ---", flush=True) + res = run_one(theta, ty) + print(f" steps={len(res['times'])} wall={res['wall']:.1f}s " + f"diverged={res['diverged']}", flush=True) + if len(res['times']): + sxy_c = float(np.abs(res['sxy_centre']).max()) + tau_res_c = float(np.abs(res['resolved_centre']).max()) + ratio_sxy = sxy_c / ty + ratio_tau = tau_res_c / ty + print(f" centre probes (apples-to-apples with BDF-1 baseline):") + print(f" peak |σ_xy| = {sxy_c:.4f} ({ratio_sxy:.3f}·τ_y)") + print(f" peak |τ_resolved| = {tau_res_c:.4f} ({ratio_tau:.3f}·τ_y)") + print(f" global probes:") + print(f" peak |σ_xy| any node = {float(res['sxy_max_global'].max()):.4f}") + print(f" peak σ_II any node = {float(res['sigmaII_max_fault'].max()):.4f}") + # Decision gate: τ_resolved at centre ≤ 1.20·τ_y + # (BDF-1 production baseline is 1.12-1.15·τ_y on this setup) + ok_yield = ratio_tau < 1.20 + if ok_yield and res['diverged'] == 0: + print(f" PASS") + n_pass += 1 + summary.append((theta, ty, ratio_tau, "PASS")) + else: + print(f" FAIL (centre |τ_resolved|/τ_y = 
{ratio_tau:.4f}, " + f"diverged={res['diverged']})") + summary.append((theta, ty, ratio_tau, "FAIL")) + else: + print(f" FAIL — no steps completed") + summary.append((theta, ty, float('inf'), "FAIL")) + print() + print(f"\n=== KILLER TEST SUMMARY: {n_pass}/{n_total} PASS ===", flush=True) + print(" metric: peak |τ_resolved| at fault centre / τ_y_at_fault") + print(" BDF-1 production baseline ≈ 1.12-1.15·τ_y (centre)") + print(" BDF-2 (the higher-order method ETD-2 replaces) blows up to 10⁵-10⁹\n") + for theta, ty, ratio, status in summary: + print(f" θ={theta:+.0f}°, τ_y={ty:.2f}: |τ_resolved|/τ_y = {ratio:.4f} [{status}]") + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_phase_b_ti_iso.py b/docs/developer/design/_exp_integrator_phase_b_ti_iso.py new file mode 100644 index 00000000..af0f42ff --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_b_ti_iso.py @@ -0,0 +1,174 @@ +"""Phase B intermediate test: bench_ti_vep_harmonic geometry with the ISOTROPIC +MaxwellExponentialFlowModel + spatial yield_stress field. + +This is NOT the killer test as designed (which uses TransverseIsotropic +rank-4 tensor) — it's a structural sanity check before investing in the +TI extension. Goal: confirm that the predictor-corrector return mapping +on the spatial yield_stress field stays bounded, i.e. σ_II ≤ 1.001·τ_y +everywhere. If yes: the exp framework's structural argument extends to +spatial yield, and TI extension is tensor-bookkeeping. If no: the spatial +yield handling itself needs more work. + +Note: this uses the ``zIC`` (zero IC) variant of the bench, since the +peak-start TI IC requires resolving stress onto the fault tangent — +which is TI-specific. Zero IC + smooth ramp-up is a cleaner test. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_exp_integrator_phase_b_ti_iso.py +""" + +import time +import numpy as np +import sympy + +import underworld3 as uw +from underworld3.function import expression + + +# --------------------------------------------------------------------------- +# Parameters (kept aligned with bench_ti_vep_harmonic.py) +# --------------------------------------------------------------------------- + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * np.pi / OMEGA + +ETA = 1.0 # use single isotropic viscosity (η₀ in TI nomenclature) +MU = 1.0 +TAU_Y_BULK = 200.0 + +RES = 16 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 + +ANGLES_DEG = (0.0,) # start with 0° to keep the iso comparison simple +TAU_Y_LIST = (0.30, 0.15) + + +def build_iso_exp_stokes(label, theta_deg, tau_y_at_fault): + """Plain Stokes + MaxwellExponentialFlowModel + spatial yield_stress field.""" + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, continuous=True) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, + value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.MaxwellExponentialFlowModel + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA + cm.Parameters.shear_modulus = MU + 
cm.Parameters.yield_stress = tau_y_field + cm.Parameters.shear_viscosity_min = ETA * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm._yield_mode = "softmin" + + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top V") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + return mesh, stokes, V_top, tau_y_field + + +def run_iso_zIC(theta_deg, tau_y_at_fault): + label = f"ti_iso_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + mesh, stokes, V_top, ty_field = build_iso_exp_stokes(label, theta_deg, tau_y_at_fault) + cm = stokes.constitutive_model + DFDt = stokes.Unknowns.DFDt + + times, peak_sxy_global, peak_sigmaII_fault = [], [], [] + n_diverged = 0 + t0 = time.time() + t_cur = 0.0 + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end = t_cur + dt + # Forcing: V_top(t) = V0·cos(ωt + φ_lag) — same as bench + v_now = V0 * float(np.cos(OMEGA * t_end)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end:.3f}: {exc}", flush=True) + n_diverged += 1 + break + # Probe + sigma = np.asarray(DFDt.psi_star[0].array) + sxy_global = float(np.abs(sigma[:, 0, 1]).max()) + sigma_II = np.sqrt(0.5 * (sigma ** 2).sum(axis=(1, 2))) + coords = DFDt.psi_star[0].coords + # fault-zone mask (within ~3·FAULT_WIDTH of the centerline) + dist = np.abs(coords[:, 1] - 0.5 * H) # 2D, theta=0 simplification + mask = dist < 3.0 * FAULT_WIDTH + sigmaII_fault = float(sigma_II[mask].max()) if mask.any() else 0.0 + peak_sxy_global.append(sxy_global) + 
peak_sigmaII_fault.append(sigmaII_fault) + times.append(t_end) + t_cur = t_end + return dict( + times=np.array(times), + peak_sxy_global=np.array(peak_sxy_global), + peak_sigmaII_fault=np.array(peak_sigmaII_fault), + wall=time.time() - t0, + n_diverged=n_diverged, + tau_y=tau_y_at_fault, + ) + + +def main(): + print(f"[ti_iso_zIC] dt={DT} T_end={T_END:.4f} (4 periods)", flush=True) + print(f" bulk τ_y={TAU_Y_BULK}, fault τ_y values: {TAU_Y_LIST}\n", flush=True) + for ty in TAU_Y_LIST: + for theta in ANGLES_DEG: + print(f"--- θ={theta:+.0f}°, fault τ_y={ty:.2f} ---", flush=True) + res = run_iso_zIC(theta, ty) + print(f" steps={len(res['times'])} wall={res['wall']:.1f}s " + f"diverged={res['n_diverged']}", flush=True) + sxy = res['peak_sxy_global'] + sii_fault = res['peak_sigmaII_fault'] + if len(sxy): + print(f" peak|σ_xy| (global): {sxy.max():.4f}", flush=True) + print(f" peak σ_II (fault): {sii_fault.max():.4f}", flush=True) + print(f" ratio σ_II_fault/τ_y: {sii_fault.max()/ty:.3f}", flush=True) + if sii_fault.max() < 1.1 * ty: + print(f" PASS (σ_II_fault ≤ 1.1·τ_y)", flush=True) + else: + print(f" FAIL (σ_II_fault = {sii_fault.max():.4f} > 1.1·τ_y = {1.1*ty:.4f})", flush=True) + print() + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_phase_b_validate.py b/docs/developer/design/_exp_integrator_phase_b_validate.py new file mode 100644 index 00000000..b77c5450 --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_b_validate.py @@ -0,0 +1,144 @@ +"""Phase B validator for MaxwellExponentialFlowModel — VE harmonic. + +Mirrors ``docs/advanced/benchmarks/bench_ve_harmonic.py`` but assigns the +new ``MaxwellExponentialFlowModel`` (ETD-2) instead of the BDF-style VEP +model. Decision gate: max|err| must match or beat BDF-2's 1.34e-3 baseline. + +The peak-start IC plants ``σ⁰ = A_∞·cos(0) = A_∞`` and the matching +``ε̇⁰ = γ̇₀/(2√(1+De²))`` so step 1 starts on the analytical steady cycle +with no homogeneous transient. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_exp_integrator_phase_b_validate.py +""" + +import time +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +# Parameters — same as bench_ve_harmonic +ETA = 1.0 +MU = 1.0 +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +N_PERIODS = 4 +T_END = N_PERIODS * 2.0 * np.pi / OMEGA +H = 1.0 +W = 2.0 + + +def run_exp(): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(16, 8), minCoords=(-W / 2, -H / 2), maxCoords=(W / 2, H / 2) + ) + v = uw.discretisation.MeshVariable("U_exp_b", mesh, 2, degree=2) + p = uw.discretisation.MeshVariable("P_exp_b", mesh, 1, degree=1) + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.MaxwellExponentialFlowModel + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA + cm.Parameters.shear_modulus = MU + stokes.tolerance = 1e-7 + stokes.petsc_options["snes_force_iteration"] = True + + # Antisymmetric BCs (matches bench_ve_harmonic) + V_top = expression(r"V_{top}^{exp}", sympy.Float(0.0), "Top BC for exp validator") + stokes.add_essential_bc((V_top, 0.0), "Top") + stokes.add_essential_bc((-V_top, 0.0), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + + t_r = ETA / MU + De = OMEGA * t_r + gamma_dot_0 = 2.0 * V0 / H + A_inf = ETA * gamma_dot_0 / np.sqrt(1.0 + De ** 2) + phi_lag = float(np.arctan(De)) + + DFDt = stokes.Unknowns.DFDt + n_nodes = DFDt.psi_star[0].array.shape[0] + + # Plant σ_xy = A_inf at t=0 (peak-start) + sigma0 = np.zeros((n_nodes, 2, 2)) + sigma0[:, 0, 1] = A_inf + sigma0[:, 1, 0] = A_inf + history = [sigma0] + DFDt.set_initial_history(history, dt=DT) + + # Plant ε̇⁰ = γ̇₀/(2√(1+De²)) (i.e. shear-only) so step 1's history + # term references the analytical ε̇ at t=0, not zero. 
+ edot0 = gamma_dot_0 / (2.0 * np.sqrt(1.0 + De ** 2)) + f0 = np.zeros((n_nodes, 2, 2)) + f0[:, 0, 1] = edot0 + f0[:, 1, 0] = edot0 + DFDt.forcing_star.array[...] = f0 + + times, dts, sigmas, reasons = [], [], [], [] + t_cur = 0.0 + t0_wall = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step + phi_lag)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + + coords = DFDt.psi_star[0].coords + centre = np.array([[0.0, 0.0]]) + idx = int(np.argmin(np.linalg.norm(coords - centre, axis=1))) + sigmas.append(float(DFDt.psi_star[0].array[idx, 0, 1])) + reasons.append(int(stokes.snes.getConvergedReason())) + times.append(t_end_step) + dts.append(dt) + t_cur = t_end_step + + times = np.array(times) + dts = np.array(dts) + sigmas = np.array(sigmas) + reasons = np.array(reasons) + sigma_ana = A_inf * np.cos(OMEGA * times) + err = np.abs(sigmas - sigma_ana) + max_err = float(err.max()) + rms = float(np.sqrt((err ** 2).mean())) + wall = time.time() - t0_wall + diverged = int((reasons < 0).sum()) + return dict( + times=times, + dts=dts, + sigmas=sigmas, + sigma_ana=sigma_ana, + max_err=max_err, + rms=rms, + wall=wall, + diverged=diverged, + A_inf=A_inf, + De=De, + ) + + +def main(): + print(f"[ve_harmonic_exp] dt={DT} T_end={T_END:.4f} (4 periods)", flush=True) + res = run_exp() + print(f" steps={len(res['times'])} A_inf={res['A_inf']:.4f} De={res['De']:.4f}") + print(f" ETD-2 wall={res['wall']:.1f}s max|err|={res['max_err']:.4e} rms={res['rms']:.4e}") + print(f" diverged: {res['diverged']}/{len(res['times'])}") + print(f" baseline (bench_ve_harmonic BDF-2 peak-start): max|err| = 1.34e-3", flush=True) + out = dict( + times=res["times"], dts=res["dts"], + sigma_exp=res["sigmas"], sigma_ana=res["sigma_ana"], + A_inf=res["A_inf"], De=res["De"], + max_err=res["max_err"], rms=res["rms"], + ) + 
np.savez("output/exp_integrator_phase_b_ve_harmonic.npz", **out) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_phase_d_split.py b/docs/developer/design/_exp_integrator_phase_d_split.py new file mode 100644 index 00000000..7d7eadf1 --- /dev/null +++ b/docs/developer/design/_exp_integrator_phase_d_split.py @@ -0,0 +1,253 @@ +"""Phase D — 1D cleanroom bench for the per-component ETD-2 scheme. + +Two parallel Maxwell branches with disparate relaxation times — the +exact analogue of the rank-4 TI tensor split into matrix-aligned (η_⊥) +and director-aligned (η_∥) channels: + + σ̇_⊥ + σ_⊥/τ_⊥ = μ ε̇, τ_⊥ = η_⊥ / μ (slow, matrix) + σ̇_∥ + σ_∥/τ_∥ = μ ε̇, τ_∥ = η_∥ / μ (fast, post-yield clamp) + σ_total = σ_⊥ + σ_∥ + +Both branches see the same engineering shear rate ε̇ = γ̇₀ cos(ωt). +The analytical solution is the sum of two independent Maxwell phasor +responses — fully closed-form, no numerical reference needed. + +Three integrators run on the *total* stress: + + 1. Per-component ETD-2 — propose, integrate σ_⊥ and σ_∥ separately + with their own (α_⊥, φ_⊥) and (α_∥, φ_∥), then sum. (Phase D.) + 2. Lumped-effective ETD-2 — Phase B's current shape, one (α, φ) from + τ_eff = (η_⊥ + η_∥) / μ on the total stress. + 3. Lumped-min ETD-2 — single (α, φ) from τ_min = min(τ_⊥, τ_∥); a + prior lagged-τ experiment we already tried. + +τ_∥ = 0.05 (post-yield-clamp regime), τ_⊥ = 1.0, μ = 1, ω = π/2, +Δt swept from 0.005 to 0.5. Headline metric: max-|err|/A_∞_total. 

Run::

    pixi run -e amr-dev python -u docs/developer/design/_exp_integrator_phase_d_split.py
"""

import os

import numpy as np
import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt


# ── Parameters ───────────────────────────────────────────────────── 
MU = 1.0
ETA_PERP = 1.0 # matrix viscosity (slow)
ETA_PAR = 0.05 # post-yield-clamp director viscosity (fast)
TAU_PERP = ETA_PERP / MU
TAU_PAR = ETA_PAR / MU

GAMMA_DOT_0 = 1.0
OMEGA = np.pi / 2.0
N_PERIODS = 4.0
T_END = N_PERIODS * 2.0 * np.pi / OMEGA

OUT_DIR = "output"


def maxwell_sin(t, tau):
    """σ(t) for ε̇(t) = γ̇₀ cos(ωt), one Maxwell branch with relaxation τ.
    Phasor steady state plus decaying transient at σ(0) = 0."""
    De = OMEGA * tau
    A_inf = (MU * tau) * GAMMA_DOT_0 / np.sqrt(1 + De ** 2)
    phi = np.arctan(De)
    sigma_ss = A_inf * np.cos(OMEGA * t - phi)
    # Transient cancels the steady-state value at t=0 so σ(0) = 0 exactly.
    transient = -A_inf * np.cos(-phi) * np.exp(-t / tau)
    return sigma_ss + transient


def analytical_total(t):
    """Closed-form total stress: sum of the two independent branch responses."""
    return maxwell_sin(t, TAU_PERP) + maxwell_sin(t, TAU_PAR)


# ── Integrators ──────────────────────────────────────────────────── 


def _alpha_phi(dt, tau):
    """ETD coefficients α = exp(-Δt/τ) and φ = (1-α)/(Δt/τ) for one branch."""
    x = dt / tau
    alpha = np.exp(-x)
    if x > 1e-12:
        phi = (1 - alpha) / x
    else:
        # Taylor expansion of (1 - e^{-x})/x for tiny x avoids 0/0.
        phi = 1.0 - x / 2 + x * x / 6
    return alpha, phi


def etd2_step(gdot_n, gdot_np1, sigma_n, dt, tau, eta):
    """Single Maxwell branch: σⁿ⁺¹ = α σⁿ + μ[A γ̇ⁿ⁺¹ + B γ̇ⁿ]."""
    # NOTE(review): `eta` is accepted but unused — MU·tau already equals the
    # branch viscosity (tau = eta/MU above); confirm before removing it.
    alpha, phi = _alpha_phi(dt, tau)
    A = tau * (1 - phi)
    B = tau * (phi - alpha)
    return alpha * sigma_n + MU * (A * gdot_np1 + B * gdot_n)


def run_per_component(dt):
    """Per-component scheme: integrate the two branches separately."""
    t = np.arange(0.0, T_END + 1e-12, dt)
    eps_dot = GAMMA_DOT_0 * np.cos(OMEGA * t)
    sigma_perp = np.zeros_like(t)
    sigma_par = np.zeros_like(t)
    # Both branches see the same ε̇ history but use their own (α, φ) from
    # their own relaxation times.
    for i in range(1, len(t)):
        sigma_perp[i] = etd2_step(
            eps_dot[i - 1], eps_dot[i], sigma_perp[i - 1], dt, TAU_PERP, ETA_PERP
        )
        sigma_par[i] = etd2_step(
            eps_dot[i - 1],
 eps_dot[i], sigma_par[i - 1], dt, TAU_PAR, ETA_PAR
        )
    # Returns (t, total, perp branch, par branch).
    return t, sigma_perp + sigma_par, sigma_perp, sigma_par


def run_lumped(dt, tau_choice):
    """Single-(α, φ) lump applied to the total stress.

    The effective viscosity in the lumped picture is η_⊥ + η_∥ (the
    instantaneous viscous stress is the sum of the two branches at
    γ̇₀), so the model is σ̇ + σ/τ_choice = (η_⊥ + η_∥) γ̇ / τ_choice
    — i.e. μ_eff γ̇ in our shorthand, where μ_eff = (η_⊥ + η_∥)/τ_choice.
    """
    t = np.arange(0.0, T_END + 1e-12, dt)
    eps_dot = GAMMA_DOT_0 * np.cos(OMEGA * t)
    sigma = np.zeros_like(t)
    eta_eff = ETA_PERP + ETA_PAR
    mu_eff = eta_eff / tau_choice
    alpha, phi = _alpha_phi(dt, tau_choice)
    A = tau_choice * (1 - phi)
    B = tau_choice * (phi - alpha)
    # Same ETD-2 recurrence as etd2_step, inlined with the lumped μ_eff.
    for i in range(1, len(t)):
        sigma[i] = (
            alpha * sigma[i - 1]
            + mu_eff * (A * eps_dot[i] + B * eps_dot[i - 1])
        )
    return t, sigma


# ── Main bench ───────────────────────────────────────────────────── 


def main():
    os.makedirs(OUT_DIR, exist_ok=True)

    # Pick one Δt for the trajectory plot; sweep for the error figure.
+ dt_show = 0.05 + dt_sweep = np.array([0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]) + + # --- trajectory at dt_show --- + t_pc, s_pc_total, s_pc_perp, s_pc_par = run_per_component(dt_show) + _, s_lump_eff = run_lumped(dt_show, TAU_PERP + TAU_PAR) # naive sum + _, s_lump_min = run_lumped(dt_show, TAU_PAR) # min-τ + _, s_lump_slow = run_lumped(dt_show, TAU_PERP) # pick-the-slow + + s_ana = analytical_total(t_pc) + s_perp_ana = maxwell_sin(t_pc, TAU_PERP) + s_par_ana = maxwell_sin(t_pc, TAU_PAR) + + A_inf_total = np.max(np.abs(s_ana[len(s_ana) // 2:])) + + # --- err sweep --- + err_pc, err_lump_eff, err_lump_min, err_lump_slow = [], [], [], [] + for dt in dt_sweep: + t, s_pc, _, _ = run_per_component(dt) + _, s_le = run_lumped(dt, TAU_PERP + TAU_PAR) + _, s_lm = run_lumped(dt, TAU_PAR) + _, s_ls = run_lumped(dt, TAU_PERP) + ana = analytical_total(t) + err_pc.append(np.max(np.abs(s_pc - ana)) / A_inf_total) + err_lump_eff.append(np.max(np.abs(s_le - ana)) / A_inf_total) + err_lump_min.append(np.max(np.abs(s_lm - ana)) / A_inf_total) + err_lump_slow.append(np.max(np.abs(s_ls - ana)) / A_inf_total) + + err_pc = np.array(err_pc); err_lump_eff = np.array(err_lump_eff) + err_lump_min = np.array(err_lump_min); err_lump_slow = np.array(err_lump_slow) + + print("Phase D 1D bench — two parallel Maxwell branches", flush=True) + print(f" τ_⊥={TAU_PERP}, τ_∥={TAU_PAR}, η_⊥={ETA_PERP}, η_∥={ETA_PAR}", flush=True) + print(f" ω={OMEGA:.4f}, γ̇₀={GAMMA_DOT_0}, T_END={T_END:.2f}", flush=True) + print(f" A_∞_total ≈ {A_inf_total:.4f}", flush=True) + print(flush=True) + print(f"{'dt':>8s} {'per-comp':>11s} {'lump-eff':>11s} {'lump-min':>11s} {'lump-slow':>11s}", + flush=True) + for i, dt in enumerate(dt_sweep): + print( + f"{dt:8.4f} {err_pc[i]:11.4e} {err_lump_eff[i]:11.4e} " + f"{err_lump_min[i]:11.4e} {err_lump_slow[i]:11.4e}", + flush=True, + ) + + # --- plots --- + fig = plt.figure(figsize=(11, 8.5)) + gs = fig.add_gridspec(2, 2, height_ratios=[1.4, 1.0]) + ax_traj = 
fig.add_subplot(gs[0, :]) + ax_split = fig.add_subplot(gs[1, 0]) + ax_err = fig.add_subplot(gs[1, 1]) + + # Total trajectories + ax_traj.plot(t_pc, s_ana, "-", color="black", lw=2.0, alpha=0.8, + label=f"analytical (total, A∞={A_inf_total:.3f})") + ax_traj.plot(t_pc, s_pc_total, "--", color="#1f77b4", lw=1.5, + label=f"per-component ETD-2 " + f"(max|err|/A∞={err_pc[np.where(dt_sweep==dt_show)[0][0]]:.2e})") + idx = np.where(dt_sweep == dt_show)[0][0] + ax_traj.plot(t_pc, s_lump_eff, "--", color="#d62728", lw=1.2, + label=f"lumped τ=τ_⊥+τ_∥ " + f"(max|err|/A∞={err_lump_eff[idx]:.2e})") + ax_traj.plot(t_pc, s_lump_slow, ":", color="#9467bd", lw=1.2, + label=f"lumped τ=τ_⊥ " + f"(max|err|/A∞={err_lump_slow[idx]:.2e})") + ax_traj.plot(t_pc, s_lump_min, ":", color="#2ca02c", lw=1.2, + label=f"lumped τ=τ_∥ " + f"(max|err|/A∞={err_lump_min[idx]:.2e})") + ax_traj.set_xlabel("time") + ax_traj.set_ylabel(r"σ_total") + ax_traj.set_title(rf"Total stress — Δt={dt_show}, τ_⊥={TAU_PERP}, τ_∥={TAU_PAR}") + ax_traj.legend(loc="upper right", fontsize=8.5, ncol=1) + ax_traj.grid(alpha=0.3) + + # Per-component split + ax_split.plot(t_pc, s_perp_ana, "-", color="black", lw=1.6, + label=r"σ_⊥ analytical") + ax_split.plot(t_pc, s_pc_perp, "--", color="#1f77b4", lw=1.2, + label=r"σ_⊥ ETD-2") + ax_split.plot(t_pc, s_par_ana, "-", color="#444444", lw=1.6, + label=r"σ_∥ analytical") + ax_split.plot(t_pc, s_pc_par, "--", color="#ff7f0e", lw=1.2, + label=r"σ_∥ ETD-2") + ax_split.set_xlabel("time") + ax_split.set_ylabel(r"branch stress") + ax_split.set_title("Per-component branches resolved separately") + ax_split.legend(loc="upper right", fontsize=8) + ax_split.grid(alpha=0.3) + + # Error sweep (log-log) + ax_err.loglog(dt_sweep, err_pc, "o-", color="#1f77b4", label="per-component") + ax_err.loglog(dt_sweep, err_lump_eff, "s--", color="#d62728", label=r"lumped τ_⊥+τ_∥") + ax_err.loglog(dt_sweep, err_lump_slow, "^:", color="#9467bd", label=r"lumped τ_⊥") + ax_err.loglog(dt_sweep, err_lump_min, 
"v:", color="#2ca02c", label=r"lumped τ_∥") + ax_err.set_xlabel(r"Δt") + ax_err.set_ylabel(r"max|err|/A∞_total") + ax_err.set_title(r"Error vs Δt (log-log)") + ax_err.legend(loc="lower right", fontsize=8) + ax_err.grid(alpha=0.3, which="both") + + fig.suptitle( + "Phase D — per-component ETD-2 vs lumped variants " + rf"(parallel Maxwell branches, τ_⊥={TAU_PERP}, τ_∥={TAU_PAR})", + y=0.995, fontsize=11, + ) + fig.tight_layout() + + out_png = os.path.join(OUT_DIR, "exp_integrator_phase_d_split.png") + fig.savefig(out_png, dpi=140) + plt.close(fig) + print(f"\n wrote {out_png}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_integrator_uw3_jury_rig.py b/docs/developer/design/_exp_integrator_uw3_jury_rig.py new file mode 100644 index 00000000..f2867c64 --- /dev/null +++ b/docs/developer/design/_exp_integrator_uw3_jury_rig.py @@ -0,0 +1,362 @@ +"""Jury-rigged exponential integrator in UW3 — proof of concept. + +Tests: + 1. Iso VE harmonic on a 2:1 box (matches bench_ve_harmonic geometry). + Decision gate: must reproduce BDF-2's max|err| ≈ 1.34e-3 or beat it. + 2. Iso VEP harmonic with spatial yield_stress field on the 1:1 mesh + that blew up the BDF-2 consistency test. Decision gate: must + stay bounded with reasonable peak |σ| ≤ 1.1·τ_y. + +Architecture (jury-rig, not production): + + * Custom ``MaxwellExpFlowModel`` subclasses ``ViscousFlowModel`` and + overrides ``flux`` to return: + + σ = 2·η_eff·(1-φ)·ε̇ + α·σⁿ + 2·η_eff·(φ-α)·ε̇ⁿ + + where ``α, φ`` are scalar UWexpressions updated per step. + + * Two new MeshVariables (``sigma_n_var``, ``epsdot_n_var``) hold the + two history streams. After each solve we write the new σ and ε̇ + back to these variables via direct nodal evaluation — not L2 + projection. Adequate for this proof-of-concept; production would + use SNES_Tensor_Projection. + + * Yield handling (Test 2): η_eff = softmin(η(1-φ), η_pl), η_pl = + τ_y/(2·|ε̇_inv|). 
Lagged-τ: τ_eff used in α, φ comes from the + *previous* step's η_eff (Picard-style; full self-consistent + iteration would solve τ↔σ inside the SNES). +""" + +import numpy as np +import sympy +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression +from underworld3.constitutive_models import ViscousFlowModel + + +# ───────────────────────────────────────────────────────────────── +# Custom constitutive model +# ───────────────────────────────────────────────────────────────── + +class MaxwellExpFlowModel(ViscousFlowModel): + """Jury-rigged Maxwell with exponential time integration. + + flux = 2·η_eff·(1-φ)·ε̇ + α·σⁿ + 2·η·(φ-α)·ε̇ⁿ + + For VE: η_eff = η. For VEP: η_eff = softmin(η, η_pl) (set externally + on the model's ``yield_stress`` parameter and recomputed per step). + """ + + def __init__(self, unknowns, sigma_n_var, epsdot_n_var, **kwargs): + super().__init__(unknowns, **kwargs) + self._sigma_n = sigma_n_var + self._epsdot_n = epsdot_n_var + # Coefficients updated per timestep. Initialise to non-degenerate + # values so the JIT compile produces an invertible Jacobian — at + # JIT time, the autodiff bakes in the symbolic structure, but the + # FIRST PetscDS constants[] update uses the *current* UWexpression + # values, so we want 2η(1-φ) > 0 even if update_exp_coeffs hasn't + # been called yet. φ = 0 means "fully viscous (no relaxation)"; + # this is the right "neutral" starting point. 
+ self._exp_alpha = expression(r"{\alpha_{\rm exp}}", sympy.Float(0.0), + "exponential α = exp(-Δt/τ)") + self._exp_phi = expression(r"{\varphi_{\rm exp}}", sympy.Float(0.0), + "exponential φ = (1-α)/(Δt/τ)") + # Yield-stress UWexpression — set externally for VEP, leave at oo for VE + self._tau_y = expression(r"{\tau_y}", sympy.oo, "yield stress") + self._strainrate_min = expression(r"{\dot\varepsilon_{\min}}", + sympy.Float(1e-6), + "strain-rate floor for η_pl") + self._softness = 0.1 # softmin δ + + @property + def K(self): + """Stiffness for saddle preconditioner — use raw η.""" + return self.Parameters.shear_viscosity_0 + + def _eta_eff(self): + """Yield-limited viscosity (softmin).""" + eta = self.Parameters.shear_viscosity_0 + ty = self._tau_y + # If τ_y is ∞, no yield + if hasattr(ty, 'sym') and ty.sym is sympy.oo: + return eta + # η_pl = τ_y / (2·|ε̇_inv|). Use current ε̇ (sym(∇u)) + E = self.Unknowns.E + epsII = sympy.sqrt((E**2).trace() / 2) + eta_pl = ty / (2 * (epsII + self._strainrate_min)) + # softmin(eta, eta_pl) + delta = self._softness + f = eta / eta_pl + import math + offset = (-1 + math.sqrt(1 + delta**2)) / 2 + g = 1 + (f - 1 + sympy.sqrt((f - 1)**2 + delta**2)) / 2 - offset + return eta / g + + @property + def flux(self): + """Stress = 2·η_eff·(1-φ)·ε̇ + α·σⁿ + 2·η·(φ-α)·ε̇ⁿ.""" + eta_eff = self._eta_eff() + eta_raw = self.Parameters.shear_viscosity_0 + E = self.Unknowns.E + return (2 * eta_eff * (1 - self._exp_phi) * E + + self._exp_alpha * self._sigma_n.sym + + 2 * eta_raw * (self._exp_phi - self._exp_alpha) * self._epsdot_n.sym) + + +# ───────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────── + +def update_exp_coeffs(cm, dt, tau_eff): + """Set α, φ on the constitutive model for this step. 
Uses the + *lagged* τ_eff from the previous step's η_eff.""" + x = float(dt) / float(tau_eff) + if x < 1e-10: + alpha, phi = 1.0, 1.0 + else: + alpha = float(np.exp(-x)) + phi = (1.0 - alpha) / x + cm._exp_alpha.sym = sympy.Float(alpha) + cm._exp_phi.sym = sympy.Float(phi) + + +def project_history(stokes, cm, sigma_n_var, epsdot_n_var, + eta_raw, alpha_val, phi_val): + """After a solve, update σⁿ and ε̇ⁿ history variables. + + Strategy that avoids the Matrix-evaluate-with-derivatives issue: + 1. Evaluate ε̇^{n+1} component-wise (scalars) and write to a + temporary array. + 2. Compute σ^{n+1} purely on numpy .array data using the + exponential update formula (mesh-variable reads, no derivs). + 3. Update both history variables. + """ + # Step 1: project ε̇^{n+1} component-wise + E_sym = stokes.Unknowns.E + coords = epsdot_n_var.coords + e_xx = np.asarray(uw.function.evaluate(E_sym[0, 0], coords)).flatten() + e_xy = np.asarray(uw.function.evaluate(E_sym[0, 1], coords)).flatten() + e_yy = np.asarray(uw.function.evaluate(E_sym[1, 1], coords)).flatten() + new_epsdot = np.zeros_like(epsdot_n_var.array) + new_epsdot[:, 0, 0] = e_xx + new_epsdot[:, 1, 1] = e_yy + new_epsdot[:, 0, 1] = e_xy + new_epsdot[:, 1, 0] = e_xy + + # Step 2: σ^{n+1} = α·σⁿ + 2η(1-φ)·ε̇^{n+1} + 2η(φ-α)·ε̇ⁿ + # All quantities are nodal arrays — pure numpy. + a = alpha_val + p = phi_val + sigma_old = np.array(sigma_n_var.array) # σⁿ (snapshot) + epsdot_old = np.array(epsdot_n_var.array) # ε̇ⁿ (snapshot) + new_sigma = (a * sigma_old + + 2 * eta_raw * (1 - p) * new_epsdot + + 2 * eta_raw * (p - a) * epsdot_old) + + # Step 3: write back + sigma_n_var.array[...] = new_sigma + epsdot_n_var.array[...] 
= new_epsdot + + +def probe_centre_xy(sigma_n_var, c=np.array([[0.5, 0.5]])): + """Read σ_xy at the domain centre from sigma_n_var.""" + coords = sigma_n_var.coords + idx = int(np.argmin(np.linalg.norm(coords - c, axis=1))) + return float(sigma_n_var.array[idx, 0, 1]) + + +# ───────────────────────────────────────────────────────────────── +# Test 1 — Iso VE harmonic, 2:1 antisymmetric box, peak-start IC +# ───────────────────────────────────────────────────────────────── + +def test_1_iso_VE_harmonic(): + print("\n=== Test 1: iso VE harmonic — exp integrator vs BDF-2 baseline ===", + flush=True) + ETA = 1.0; MU = 1.0 + V0 = 0.5 + OMEGA = np.pi / 2.0 + DT = 0.05 + T_END = 4 * 2 * np.pi / OMEGA # 4 periods + + H = 1.0; W = 2.0 + mesh = uw.meshing.StructuredQuadBox( + elementRes=(16, 8), minCoords=(-W/2, -H/2), maxCoords=(W/2, H/2), + ) + v = uw.discretisation.MeshVariable("U_exp", mesh, 2, degree=2) + p = uw.discretisation.MeshVariable("P_exp", mesh, 1, degree=1) + sigma_n = uw.discretisation.MeshVariable( + "sigma_n", mesh, 2, degree=2, vtype=VarType.SYM_TENSOR, + ) + epsdot_n = uw.discretisation.MeshVariable( + "epsdot_n", mesh, 2, degree=2, vtype=VarType.SYM_TENSOR, + ) + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = MaxwellExpFlowModel(stokes.Unknowns, sigma_n, epsdot_n) + cm.Parameters.shear_viscosity_0 = ETA + stokes.constitutive_model = cm + stokes.tolerance = 1e-6 + stokes.petsc_options["snes_force_iteration"] = True + + # Antisymmetric BCs (matches bench_ve_harmonic.py) + V_top_expr = expression(r"V_{top}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc((V_top_expr, 0.0), "Top") + stokes.add_essential_bc((-V_top_expr, 0.0), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + + # Peak-start IC: plant σ_xy = A_inf in sigma_n_var so step 1 starts + # at the steady-state cycle peak under cos(ωt+φ) forcing. 
    # Maxwell steady-state amplitude A_inf and phase lag φ for the harmonic
    # forcing: De = ω·t_r with t_r = η/μ, and γ̇₀ = 2·V0/H for this box.
    t_r = ETA / MU
    De = OMEGA * t_r
    gamma_dot_0 = 2.0 * V0 / H
    A_inf = ETA * gamma_dot_0 / np.sqrt(1.0 + De**2)
    phi_lag = float(np.arctan(De))

    # Initialize sigma_n with σ_xy = A_inf, others = 0
    sigma_n.array[:, 0, 0] = 0.0
    sigma_n.array[:, 1, 1] = 0.0
    sigma_n.array[:, 0, 1] = A_inf
    sigma_n.array[:, 1, 0] = A_inf
    # Initialize epsdot_n with ε̇_xy = γ̇₀/2 · cos(-ω·DT + φ_lag) ≈ γ̇₀/2
    # The exact peak-start ε̇ value is small for the harmonic, but use 0
    # initially — the first solve will correct.
    epsdot_n.array[...] = 0.0

    # Pure VE: τ_eff = η/μ = 1
    tau_eff = ETA / MU

    # Time loop: α, φ are refreshed from (DT, τ_eff) before each solve; the
    # forcing cos(ωt + φ_lag) leads the stress so σ_xy tracks A_inf·cos(ωt).
    times, sxys, reasons = [], [], []
    t_cur = 0.0
    while t_cur < T_END - 1e-9:
        update_exp_coeffs(cm, DT, tau_eff)
        t_end = t_cur + DT
        v_now = V0 * float(np.cos(OMEGA * t_end + phi_lag))
        V_top_expr.sym = sympy.Float(v_now)
        stokes.solve(zero_init_guess=False)
        # Pull current α, φ from the model (we just set them via update_exp_coeffs)
        a = float(cm._exp_alpha.sym); p = float(cm._exp_phi.sym)
        project_history(stokes, cm, sigma_n, epsdot_n,
                        eta_raw=ETA, alpha_val=a, phi_val=p)
        # Centre of this 2:1 box is the origin, hence c=(0, 0) for the probe.
        sxys.append(probe_centre_xy(sigma_n, c=np.array([[0.0, 0.0]])))
        reasons.append(int(stokes.snes.getConvergedReason()))
        times.append(t_end)
        t_cur = t_end

    # Error against the closed-form steady response A_inf·cos(ωt).
    times = np.array(times); sxys = np.array(sxys)
    sigma_ana = A_inf * np.cos(OMEGA * times)
    err = np.abs(sxys - sigma_ana)
    print(f" steps={len(times)}, peak|σ_xy|={np.abs(sxys).max():.4f}, "
          f"max|err|={err.max():.4e}, rms={np.sqrt((err**2).mean()):.4e}",
          flush=True)
    print(f" diverged: {(np.array(reasons) < 0).sum()}/{len(reasons)}",
          flush=True)
    print(f" baseline (bench_ve_harmonic BDF-2 peak-start): max|err| = 1.34e-3",
          flush=True)
    return times, sxys, sigma_ana


# ───────────────────────────────────────────────────────────────── 
# Test 2 — Iso VEP with spatial yield_stress (the consistency case)
# ───────────────────────────────────────────────────────────────── 

def test_2_iso_VEP_spatial():
print("\n=== Test 2: iso VEP harmonic w/ spatial τ_y — exp vs BDF-2 (which blew up) ===", + flush=True) + ETA = 1.0; MU = 1.0 + V0 = 0.5 + OMEGA = np.pi / 2.0 + DT = 0.05 + T_END = 16.0 # match the consistency test + TAU_Y_FAULT = 0.30 + TAU_Y_BULK = 200.0 + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(16, 16), minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0), + qdegree=3, + ) + v = uw.discretisation.MeshVariable("U_exp2", mesh, 2, degree=2) + p = uw.discretisation.MeshVariable("P_exp2", mesh, 1, degree=1) + sigma_n = uw.discretisation.MeshVariable( + "sigma_n_2", mesh, 2, degree=2, vtype=VarType.SYM_TENSOR, + ) + epsdot_n = uw.discretisation.MeshVariable( + "epsdot_n_2", mesh, 2, degree=2, vtype=VarType.SYM_TENSOR, + ) + + fault = uw.meshing.Surface( + "fault_exp", mesh, + np.array([[0.2, 0.5], [0.8, 0.5]]), + ) + fault.discretize() + weakness = fault.influence_function( + width=0.06, value_near=1.0/TAU_Y_FAULT, value_far=1.0/TAU_Y_BULK, + profile="gaussian", + ) + ty_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = MaxwellExpFlowModel(stokes.Unknowns, sigma_n, epsdot_n) + cm.Parameters.shear_viscosity_0 = ETA + cm._tau_y.sym = ty_field # spatial yield_stress + stokes.constitutive_model = cm + stokes.tolerance = 1e-6 + stokes.petsc_options["snes_force_iteration"] = True + + V_top_expr = expression(r"V_{top}^{(2)}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc((V_top_expr, 0.0), "Top") + stokes.add_essential_bc((0.0, 0.0), "Bottom") # asymmetric (matches consistency test) + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + + # σ=0 IC + sigma_n.array[...] = 0.0 + epsdot_n.array[...] 
= 0.0 + + # Lagged τ_eff — start at τ_VE = η/μ = 1 + tau_eff = ETA / MU + + times, sxys, reasons = [], [], [] + t_cur = 0.0 + phi_lag = float(np.arctan(OMEGA)) + n_steps = int(T_END / DT) + for step in range(n_steps): + update_exp_coeffs(cm, DT, tau_eff) + t_end = t_cur + DT + v_now = V0 * float(np.cos(OMEGA * t_end + phi_lag)) + V_top_expr.sym = sympy.Float(v_now) + try: + stokes.solve(zero_init_guess=False, divergence_retries=2) + except Exception as exc: + print(f" step {step+1}: solve failed: {exc}", flush=True) + break + # Pull current α, φ from the model (we just set them via update_exp_coeffs) + a = float(cm._exp_alpha.sym); p = float(cm._exp_phi.sym) + project_history(stokes, cm, sigma_n, epsdot_n, + eta_raw=ETA, alpha_val=a, phi_val=p) + sxys.append(probe_centre_xy(sigma_n, c=np.array([[0.5, 0.5]]))) + reasons.append(int(stokes.snes.getConvergedReason())) + times.append(t_end) + t_cur = t_end + + times = np.array(times); sxys = np.array(sxys) + print(f" steps={len(times)}, peak|σ_xy|={np.abs(sxys).max():.4f}, " + f"diverged: {(np.array(reasons) < 0).sum()}/{len(reasons)}", + flush=True) + print(f" baseline (BDF-2 same setup, consistency test): peak|σ_xy| = 13377 ← BLEW UP", + flush=True) + print(f" expected exp result: bounded |σ_xy| ≲ τ_y = 0.30 (yield-clipped)", + flush=True) + return times, sxys + + +def main(): + test_1_iso_VE_harmonic() + test_2_iso_VEP_spatial() + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_exp_jury_rig_minimal.py b/docs/developer/design/_exp_jury_rig_minimal.py new file mode 100644 index 00000000..e5f1be74 --- /dev/null +++ b/docs/developer/design/_exp_jury_rig_minimal.py @@ -0,0 +1,150 @@ +"""Minimal jury-rig: build up the exponential constitutive model term by term. 
+ + Step A: pure Newton fluid (sanity check the custom-class plumbing) + Step B: Newton fluid + constant σⁿ history (additive stress) + Step C: Add α·σⁿ with α<1 (real exp) + Step D: Add ε̇ⁿ history term (full ETD-2) + +If any step diverges, the failing addition is identified. +""" + +import numpy as np +import sympy +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression +from underworld3.constitutive_models import ViscousFlowModel + + +ETA = 1.0; MU = 1.0 +V0 = 0.5 +DT = 0.05 +T_END = 0.5 # short — just need a few steps + + +def setup(): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(8, 4), minCoords=(-1, -0.5), maxCoords=(1, 0.5), + ) + v = uw.discretisation.MeshVariable("U_min", mesh, 2, degree=2) + p = uw.discretisation.MeshVariable("P_min", mesh, 1, degree=1) + sigma_n = uw.discretisation.MeshVariable( + "sigma_n_m", mesh, 2, degree=2, vtype=VarType.SYM_TENSOR, + ) + epsdot_n = uw.discretisation.MeshVariable( + "epsdot_n_m", mesh, 2, degree=2, vtype=VarType.SYM_TENSOR, + ) + # Initialise to zero + sigma_n.array[...] = 0.0 + epsdot_n.array[...] 
= 0.0 + return mesh, v, p, sigma_n, epsdot_n + + +def make_solver(mesh, v, p, custom_flux_fn): + """Build a Stokes solver with a custom-flux constitutive model.""" + + class _Custom(ViscousFlowModel): + @property + def K(self): + return self.Parameters.shear_viscosity_0 + + @property + def flux(self): + return custom_flux_fn(self) + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = _Custom(stokes.Unknowns) + cm.Parameters.shear_viscosity_0 = ETA + stokes.constitutive_model = cm + stokes.tolerance = 1e-6 + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(r"V_t", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc((V_top, 0.0), "Top") + stokes.add_essential_bc((-V_top, 0.0), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + return stokes, cm, V_top + + +def run_a_few_steps(stokes, V_top, label, n_steps=4): + print(f"\n--- {label} ---", flush=True) + diverged = 0 + last_iters = 0 + for step in range(n_steps): + V_top.sym = sympy.Float(V0) + try: + stokes.solve(zero_init_guess=(step == 0)) + except Exception as exc: + print(f" step {step+1}: solve raised: {exc}", flush=True) + diverged += 1 + break + reason = int(stokes.snes.getConvergedReason()) + last_iters = int(stokes.snes.getIterationNumber()) + if reason < 0: + diverged += 1 + print(f" step {step+1}: SNES diverged (reason={reason})", flush=True) + else: + print(f" step {step+1}: converged in {last_iters} its (reason={reason})", + flush=True) + + +def main(): + # Step A: pure Newton fluid + mesh, v, p, sigma_n, epsdot_n = setup() + stokes, cm, V_top = make_solver( + mesh, v, p, + custom_flux_fn=lambda self: 2 * self.Parameters.shear_viscosity_0 * self.Unknowns.E, + ) + run_a_few_steps(stokes, V_top, "A: pure Newton (2η·ε̇)") + + # Step B: Newton + uniform constant σ added (use sigma_n with σ_xy=0.3 baked in) + mesh, v, p, sigma_n, epsdot_n = setup() + sigma_n.array[:, 0, 1] = 0.3 + 
sigma_n.array[:, 1, 0] = 0.3 + stokes, cm, V_top = make_solver( + mesh, v, p, + custom_flux_fn=lambda self: ( + 2 * self.Parameters.shear_viscosity_0 * self.Unknowns.E + + sigma_n.sym + ), + ) + run_a_few_steps(stokes, V_top, "B: Newton + σⁿ (constant uniform σ_xy=0.3)") + + # Step C: scaled-down viscosity with α·σⁿ history (representative of exp) + mesh, v, p, sigma_n, epsdot_n = setup() + sigma_n.array[:, 0, 1] = 0.3 + sigma_n.array[:, 1, 0] = 0.3 + alpha_expr = expression(r"\alpha", sympy.Float(0.95), "α") + phi_expr = expression(r"\varphi", sympy.Float(0.975), "φ") + stokes, cm, V_top = make_solver( + mesh, v, p, + custom_flux_fn=lambda self: ( + 2 * self.Parameters.shear_viscosity_0 * (1 - phi_expr) * self.Unknowns.E + + alpha_expr * sigma_n.sym + ), + ) + run_a_few_steps(stokes, V_top, "C: 2η(1-φ)·ε̇ + α·σⁿ (φ=0.975, α=0.95)") + + # Step D: full ETD-2 form including ε̇ⁿ history + mesh, v, p, sigma_n, epsdot_n = setup() + sigma_n.array[:, 0, 1] = 0.3 + sigma_n.array[:, 1, 0] = 0.3 + epsdot_n.array[:, 0, 1] = 0.5 + epsdot_n.array[:, 1, 0] = 0.5 + alpha_expr = expression(r"\alpha", sympy.Float(0.95), "α") + phi_expr = expression(r"\varphi", sympy.Float(0.975), "φ") + stokes, cm, V_top = make_solver( + mesh, v, p, + custom_flux_fn=lambda self: ( + 2 * self.Parameters.shear_viscosity_0 * (1 - phi_expr) * self.Unknowns.E + + alpha_expr * sigma_n.sym + + 2 * self.Parameters.shear_viscosity_0 + * (phi_expr - alpha_expr) * epsdot_n.sym + ), + ) + run_a_few_steps(stokes, V_top, "D: full ETD-2 form") + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_phase_b_bdf2_at_tight_yield.py b/docs/developer/design/_phase_b_bdf2_at_tight_yield.py new file mode 100644 index 00000000..81bc1a25 --- /dev/null +++ b/docs/developer/design/_phase_b_bdf2_at_tight_yield.py @@ -0,0 +1,238 @@ +"""BDF-2 trajectory at τ_y=0.05, θ=+15° — companion to the BDF-1/ETD/split/hybrid +captures so the user can see the original BDF-2 instability that motivated +the whole ETD 
investigation. + +Same setup as ``_phase_b_bdf_vs_etd_at_tight_yield.py`` but with ``order=2`` +on the BDF integrator. Saves σ_∥ probe + same trajectory metrics. +""" + +import os +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def run_bdf2(theta_deg, tau_y_at_fault, n_periods=1.5): + label = f"bdf2_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + u = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=VarType.VECTOR) + p_sol = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=VarType.SCALAR) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, integrator="bdf", order=2, + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = 
director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + DFDt = stokes.Unknowns.DFDt + + T_END = n_periods * 2.0 * np.pi / OMEGA + iters = []; reasons = [] + sigma_II_max_per_step = [] + u_y_max_per_step = [] + sigma_xy_centre = [] + sigma_par_centre = [] + centre = np.array([[cx, cy]]) + n_x_val = -float(np.sin(theta)) + n_y_val = float(np.cos(theta)) + + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end_step:.3f}: solve raised — {exc}", flush=True) + iters.append(-1) + reasons.append(-99) + break + + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + sigma_II_max_per_step.append(float(sigma_II.max())) + u_arr = np.asarray(u.array).reshape(-1, 2) + u_y_max_per_step.append(float(np.abs(u_arr[:, 1]).max())) + sxy_centre = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + sigma_xy_centre.append(sxy_centre) + sxx_c = float(uw.function.evaluate(stokes.tau.sym[0, 0], 
centre).flatten()[0]) + syy_c = float(uw.function.evaluate(stokes.tau.sym[1, 1], centre).flatten()[0]) + T_x = sxx_c * n_x_val + sxy_centre * n_y_val + T_y = sxy_centre * n_x_val + syy_c * n_y_val + sig_nn = T_x * n_x_val + T_y * n_y_val + sig_par = float(np.sqrt(max(T_x ** 2 + T_y ** 2 - sig_nn ** 2, 0.0))) + sigma_par_centre.append(sig_par) + + # Per-step running output so a runaway is visible immediately. + step_idx = len(iters) + if step_idx <= 5 or step_idx % 5 == 0: + print( + f" step {step_idx:3d}/120 t={t_end_step:5.3f} " + f"V={v_now:+.3f} iters={iters[-1]:2d} " + f"|σ|_II={sigma_II_max_per_step[-1]:.3e} " + f"|u_y|={u_y_max_per_step[-1]:.3e} " + f"|σ_∥|={sig_par:.3e}", + flush=True, + ) + + # Runaway guard — BDF-2 instability on TI-VEP+spatial yield is + # the documented original-investigation gap. Break and save the + # partial trajectory so we can plot the blow-up. + if sigma_II_max_per_step[-1] > 100.0 or u_y_max_per_step[-1] > 10.0: + print( + f" *** runaway detected at step {step_idx}: " + f"|σ|_II={sigma_II_max_per_step[-1]:.3e}, " + f"|u_y|={u_y_max_per_step[-1]:.3e} — breaking ***", + flush=True, + ) + break + + t_cur = t_end_step + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + + print( + f" ran {len(iters)} steps in {time.time()-t0:.1f}s; " + f"BDF-2, order=2, τ_y_fault={tau_y_at_fault}", + flush=True, + ) + if iters_arr.size > 0 and (iters_arr >= 0).any(): + print( + f" SNES iters per step (bdf2): mean={iters_arr[iters_arr>=0].mean():.1f} " + f"median={int(np.median(iters_arr[iters_arr>=0]))} " + f"max={iters_arr[iters_arr>=0].max()} " + f"diverged={int((reasons_arr<0).sum())}/{len(reasons_arr)}", + flush=True, + ) + if sigma_II_max_per_step: + print( + f" max |σ|_II per step: end={sigma_II_max_per_step[-1]:.4f} " + f"global max={max(sigma_II_max_per_step):.4f}", + flush=True, + ) + print( + f" max |u_y| per step: end={u_y_max_per_step[-1]:.4f} " + f"global max={max(u_y_max_per_step):.4f}", + flush=True, + ) + print( + f" 
centre |σ_xy| time series: " + f"end={abs(sigma_xy_centre[-1]):.4f} " + f"peak={max(abs(s) for s in sigma_xy_centre):.4f} " + f"({max(abs(s) for s in sigma_xy_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + print( + f" centre |σ_∥| (resolved): " + f"end={sigma_par_centre[-1]:.4f} " + f"peak={max(sigma_par_centre):.4f} " + f"({max(sigma_par_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + + out_npz = os.path.join( + OUT_DIR, + f"phase_b_bdf2_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + ".npz", + ) + np.savez( + out_npz, + iters=iters_arr, + reasons=reasons_arr, + sigma_II_max_per_step=np.asarray(sigma_II_max_per_step), + u_y_max_per_step=np.asarray(u_y_max_per_step), + sigma_xy_centre=np.asarray(sigma_xy_centre), + sigma_par_centre=np.asarray(sigma_par_centre), + theta_deg=np.array(theta_deg), + tau_y_at_fault=np.array(tau_y_at_fault), + T_END=np.array(T_END), + n_steps=np.array(len(iters)), + wall_seconds=np.array(time.time() - t0), + ) + print(f" saved → {out_npz}", flush=True) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + cache = os.path.join(OUT_DIR, "phase_b_bdf2_th+15_ty0p05.npz") + if os.path.exists(cache): + print(f"=== BDF-2 cache hit: {cache} — skipping run ===", flush=True) + return + print("=== BDF-2: θ=+15°, τ_y=0.05 ===", flush=True) + run_bdf2(15.0, 0.05, n_periods=1.5) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_phase_b_bdf2_th+15_ty0p05.trace.txt b/docs/developer/design/_phase_b_bdf2_th+15_ty0p05.trace.txt new file mode 100644 index 00000000..1555b41a --- /dev/null +++ b/docs/developer/design/_phase_b_bdf2_th+15_ty0p05.trace.txt @@ -0,0 +1,15 @@ +=== BDF-2: θ=+15°, τ_y=0.05 === +Structured box element resolution 32 32 + step 1/120 t=0.050 V=+0.498 iters= 1 |σ|_II=2.383e-02 |u_y|=1.003e-05 |σ_∥|=2.052e-02 + step 2/120 t=0.100 V=+0.494 iters= 1 |σ|_II=4.634e-02 |u_y|=8.526e-05 |σ_∥|=3.989e-02 + step 3/120 t=0.150 V=+0.486 iters= 3 |σ|_II=7.835e-02 |u_y|=9.166e-03 
|σ_∥|=5.149e-02 + step 4/120 t=0.200 V=+0.476 iters= 2 |σ|_II=1.296e-01 |u_y|=2.537e-02 |σ_∥|=5.081e-02 + step 5/120 t=0.250 V=+0.462 iters= 2 |σ|_II=2.099e-01 |u_y|=2.466e-02 |σ_∥|=5.069e-02 + step 10/120 t=0.500 V=+0.354 iters= 2 |σ|_II=6.482e-01 |u_y|=2.372e-02 |σ_∥|=5.034e-02 + step 15/120 t=0.750 V=+0.191 iters= 2 |σ|_II=8.816e-01 |u_y|=2.833e-02 |σ_∥|=5.033e-02 + step 20/120 t=1.000 V=-0.000 iters= 4 |σ|_II=8.373e-01 |u_y|=1.561e-02 |σ_∥|=4.854e-02 + step 25/120 t=1.250 V=-0.191 iters= 1 |σ|_II=6.041e-01 |u_y|=1.319e-02 |σ_∥|=1.914e-02 + step 30/120 t=1.500 V=-0.354 iters= 2 |σ|_II=7.496e-01 |u_y|=3.150e-02 |σ_∥|=3.976e-02 + step 35/120 t=1.750 V=-0.462 iters= 2 |σ|_II=1.872e+00 |u_y|=7.211e-02 |σ_∥|=4.820e-02 + step 40/120 t=2.000 V=-0.500 iters= 4 |σ|_II=1.423e+01 |u_y|=8.612e-01 |σ_∥|=4.884e-02 + step 45/120 t=2.250 V=-0.462 iters=10 |σ|_II=6.761e+01 |u_y|=9.766e+00 |σ_∥|=8.159e-02 diff --git a/docs/developer/design/_phase_b_bdf_vs_etd_at_tight_yield.py b/docs/developer/design/_phase_b_bdf_vs_etd_at_tight_yield.py new file mode 100644 index 00000000..f0d2fef7 --- /dev/null +++ b/docs/developer/design/_phase_b_bdf_vs_etd_at_tight_yield.py @@ -0,0 +1,252 @@ +"""Phase B: BDF-1 vs ETD-2 trajectory comparison at τ_y=0.05. + +The user asked whether the catastrophic ETD-2 runaway at τ_y=0.05 is +specific to ETD-2 or a problem-class issue affecting BDF as well. This +script runs the bench_ti_vep_harmonic geometry at τ_y=0.05 with both +``integrator='bdf'`` (production BDF-1) and ``integrator='etd'`` +(ETD-2 trial) and saves matching time series so we can plot them on +the same axes. + +Running θ=+15° (the more demanding angled case) for 1.5 periods at +RES=32 — same setup as ``output/phase_b_th+15_ty0p05.*``. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_phase_b_bdf_vs_etd_at_tight_yield.py +""" + +import os +import sys +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +# Match the ETD-2 demo parameters +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def run_case(theta_deg, tau_y_at_fault, integrator, n_periods=1.5): + """Run one (integrator, θ, τ_y) trajectory and save a time-series npz. + + integrator: 'bdf' (BDF-1) or 'etd' (ETD-2). + """ + if integrator == "bdf": + order = 1 + elif integrator == "etd": + order = 2 + else: + raise ValueError(f"unknown integrator '{integrator}'") + + label = f"{integrator}_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + u = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=VarType.VECTOR) + p_sol = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=VarType.SCALAR) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) + stokes.constitutive_model = 
uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, integrator=integrator, order=order, + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + DFDt = stokes.Unknowns.DFDt + sigma_coords = DFDt.psi_star[0].coords + + T_END = n_periods * 2.0 * np.pi / OMEGA + iters = []; reasons = [] + sigma_II_max_per_step = [] + u_y_max_per_step = [] + sigma_xy_centre = [] # at fault centre, time series + sigma_par_centre = [] # resolved fault-plane shear at fault centre + centre = np.array([[cx, cy]]) + n_x_val = -float(np.sin(theta)) + n_y_val = float(np.cos(theta)) + + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end_step:.3f}: solve raised — {exc}", flush=True) + iters.append(-1) + reasons.append(-99) + break + + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + 
sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + sigma_II_max_per_step.append(float(sigma_II.max())) + u_arr = np.asarray(u.array).reshape(-1, 2) + u_y_max_per_step.append(float(np.abs(u_arr[:, 1]).max())) + sxy_centre = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + sigma_xy_centre.append(sxy_centre) + # Resolved fault-plane shear |σ_∥| at fault centre. + sxx_c = float(uw.function.evaluate(stokes.tau.sym[0, 0], centre).flatten()[0]) + syy_c = float(uw.function.evaluate(stokes.tau.sym[1, 1], centre).flatten()[0]) + T_x = sxx_c * n_x_val + sxy_centre * n_y_val + T_y = sxy_centre * n_x_val + syy_c * n_y_val + sig_nn = T_x * n_x_val + T_y * n_y_val + sig_par = float(np.sqrt(max(T_x ** 2 + T_y ** 2 - sig_nn ** 2, 0.0))) + sigma_par_centre.append(sig_par) + + t_cur = t_end_step + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + + integrator_label = "BDF-1" if integrator == "bdf" else "ETD-2" + print( + f" ran {len(iters)} steps in {time.time()-t0:.1f}s; " + f"{integrator_label}, integrator='{integrator}', τ_y_fault={tau_y_at_fault}", + flush=True, + ) + if iters_arr.size > 0 and (iters_arr >= 0).any(): + print( + f" SNES iters per step ({integrator}): mean={iters_arr[iters_arr>=0].mean():.1f} " + f"median={int(np.median(iters_arr[iters_arr>=0]))} " + f"max={iters_arr[iters_arr>=0].max()} " + f"diverged={int((reasons_arr<0).sum())}/{len(reasons_arr)}", + flush=True, + ) + if sigma_II_max_per_step: + print( + f" max |σ|_II per step: end={sigma_II_max_per_step[-1]:.4f} " + f"global max={max(sigma_II_max_per_step):.4f}", + flush=True, + ) + print( + f" max |u_y| per step: end={u_y_max_per_step[-1]:.4f} " + f"global max={max(u_y_max_per_step):.4f}", + flush=True, + ) + print( + f" centre |σ_xy| time series: " + f"end={abs(sigma_xy_centre[-1]):.4f} " + f"peak={max(abs(s) for s in sigma_xy_centre):.4f} " + f"({max(abs(s) for s in 
sigma_xy_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + print( + f" centre |σ_∥| (resolved): " + f"end={sigma_par_centre[-1]:.4f} " + f"peak={max(sigma_par_centre):.4f} " + f"({max(sigma_par_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + + # Save the time series so we can replot/compare without rerunning + out_npz = os.path.join( + OUT_DIR, + f"phase_b_{integrator}_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + ".npz", + ) + np.savez( + out_npz, + iters=iters_arr, + reasons=reasons_arr, + sigma_II_max_per_step=np.asarray(sigma_II_max_per_step), + u_y_max_per_step=np.asarray(u_y_max_per_step), + sigma_xy_centre=np.asarray(sigma_xy_centre), + sigma_par_centre=np.asarray(sigma_par_centre), + theta_deg=np.array(theta_deg), + tau_y_at_fault=np.array(tau_y_at_fault), + T_END=np.array(T_END), + n_steps=np.array(len(iters)), + wall_seconds=np.array(time.time() - t0), + ) + print(f" saved → {out_npz}", flush=True) + return out_npz + + +def _cache_path(integrator, theta_deg, tau_y): + return os.path.join( + OUT_DIR, + f"phase_b_{integrator}_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace(".", "p") + ".npz", + ) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + theta_deg = 15.0 + tau_y = 0.05 + + for integrator in ("bdf", "etd"): + cache = _cache_path(integrator, theta_deg, tau_y) + if os.path.exists(cache): + print(f"=== {integrator.upper()} cache hit: {cache} — skipping run ===", flush=True) + continue + print(f"=== {integrator.upper()}: θ={theta_deg:+.0f}°, τ_y={tau_y} ===", flush=True) + run_case(theta_deg, tau_y, integrator, n_periods=1.5) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_phase_b_etd1_at_tight_yield.py b/docs/developer/design/_phase_b_etd1_at_tight_yield.py new file mode 100644 index 00000000..9efa414b --- /dev/null +++ b/docs/developer/design/_phase_b_etd1_at_tight_yield.py @@ -0,0 +1,244 @@ +"""ETD (order=1) trajectory at τ_y=0.05, θ=+15°. 
+ +Hypothesis (after the lesson from BDF-2/ETD-2 lumped/split/hybrid): +all higher-order time integrators show some flavour of growing +instability on this tight-yield TI fault problem; only first-order +BDF stays stable. The cause is L-stability / numerical dissipation, +not algorithm specifics. ETD-1 is the first-order ETD analogue: + + σ^{n+1} = α·σ^n + 2η(1-α)·ε̇^{n+1}, α = exp(-Δt/τ) + +Single step, no φ, no ε̇* history. Selected via +``integrator='etd', order=1`` on the constitutive model — implemented +as ETD-2 with φ = α (set in _update_history_coefficients), which +zeroes the (φ-α)·ε̇* term and turns (1-φ)·ε̇ into (1-α)·ε̇. + +Per-step logging every 5 steps + runaway guard, per the project +memory on per-step diagnostics. + +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_phase_b_etd1_at_tight_yield.py +""" + +import os +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def run_etd1(theta_deg, tau_y_at_fault, n_periods=1.5): + label = f"etd1_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + u = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=VarType.VECTOR) + p_sol = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=VarType.SCALAR) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = 
-np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, integrator="etd", order=1, + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + DFDt = stokes.Unknowns.DFDt + + T_END = n_periods * 2.0 * np.pi / OMEGA + iters = []; reasons = [] + sigma_II_max_per_step = [] + u_y_max_per_step = [] + sigma_xy_centre = [] + sigma_par_centre = [] + centre = np.array([[cx, cy]]) + n_x_val = -float(np.sin(theta)) + n_y_val = float(np.cos(theta)) + + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" 
step at t={t_end_step:.3f}: solve raised — {exc}", flush=True) + iters.append(-1) + reasons.append(-99) + break + + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + sigma_II_max_per_step.append(float(sigma_II.max())) + u_arr = np.asarray(u.array).reshape(-1, 2) + u_y_max_per_step.append(float(np.abs(u_arr[:, 1]).max())) + sxy_centre = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + sigma_xy_centre.append(sxy_centre) + sxx_c = float(uw.function.evaluate(stokes.tau.sym[0, 0], centre).flatten()[0]) + syy_c = float(uw.function.evaluate(stokes.tau.sym[1, 1], centre).flatten()[0]) + T_x = sxx_c * n_x_val + sxy_centre * n_y_val + T_y = sxy_centre * n_x_val + syy_c * n_y_val + sig_nn = T_x * n_x_val + T_y * n_y_val + sig_par = float(np.sqrt(max(T_x ** 2 + T_y ** 2 - sig_nn ** 2, 0.0))) + sigma_par_centre.append(sig_par) + + # Per-step logging + step_idx = len(iters) + if step_idx <= 5 or step_idx % 5 == 0: + print( + f" step {step_idx:3d}/120 t={t_end_step:5.3f} " + f"V={v_now:+.3f} iters={iters[-1]:2d} " + f"|σ|_II={sigma_II_max_per_step[-1]:.3e} " + f"|u_y|={u_y_max_per_step[-1]:.3e} " + f"|σ_∥|={sig_par:.3e}", + flush=True, + ) + + # Runaway guard + if sigma_II_max_per_step[-1] > 100.0 or u_y_max_per_step[-1] > 10.0: + print( + f" *** runaway detected at step {step_idx}: " + f"|σ|_II={sigma_II_max_per_step[-1]:.3e}, " + f"|u_y|={u_y_max_per_step[-1]:.3e} — breaking ***", + flush=True, + ) + break + + t_cur = t_end_step + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + + print( + f" ran {len(iters)} steps in {time.time()-t0:.1f}s; " + f"ETD-1, τ_y_fault={tau_y_at_fault}", + flush=True, + ) + if iters_arr.size > 0 and (iters_arr >= 0).any(): + print( + f" SNES iters per step (etd1): mean={iters_arr[iters_arr>=0].mean():.1f} " + 
f"median={int(np.median(iters_arr[iters_arr>=0]))} " + f"max={iters_arr[iters_arr>=0].max()} " + f"diverged={int((reasons_arr<0).sum())}/{len(reasons_arr)}", + flush=True, + ) + if sigma_II_max_per_step: + print( + f" max |σ|_II per step: end={sigma_II_max_per_step[-1]:.4f} " + f"global max={max(sigma_II_max_per_step):.4f}", + flush=True, + ) + print( + f" max |u_y| per step: end={u_y_max_per_step[-1]:.4f} " + f"global max={max(u_y_max_per_step):.4f}", + flush=True, + ) + print( + f" centre |σ_∥| (resolved): " + f"end={sigma_par_centre[-1]:.4f} " + f"peak={max(sigma_par_centre):.4f} " + f"({max(sigma_par_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + + out_npz = os.path.join( + OUT_DIR, + f"phase_b_etd1_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + ".npz", + ) + np.savez( + out_npz, + iters=iters_arr, + reasons=reasons_arr, + sigma_II_max_per_step=np.asarray(sigma_II_max_per_step), + u_y_max_per_step=np.asarray(u_y_max_per_step), + sigma_xy_centre=np.asarray(sigma_xy_centre), + sigma_par_centre=np.asarray(sigma_par_centre), + theta_deg=np.array(theta_deg), + tau_y_at_fault=np.array(tau_y_at_fault), + T_END=np.array(T_END), + n_steps=np.array(len(iters)), + wall_seconds=np.array(time.time() - t0), + ) + print(f" saved → {out_npz}", flush=True) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + cache = os.path.join(OUT_DIR, "phase_b_etd1_th+15_ty0p05.npz") + if os.path.exists(cache): + print(f"=== ETD-1 cache hit: {cache} — skipping run ===", flush=True) + return + print("=== ETD-1: θ=+15°, τ_y=0.05 ===", flush=True) + run_etd1(15.0, 0.05, n_periods=1.5) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_phase_d_killer_split.py b/docs/developer/design/_phase_d_killer_split.py new file mode 100644 index 00000000..f78d636c --- /dev/null +++ b/docs/developer/design/_phase_d_killer_split.py @@ -0,0 +1,228 @@ +"""Phase D: split-history ETD-2 vs Phase B lumped + BDF-1 baseline. 
+ +Same setup as ``_phase_b_bdf_vs_etd_at_tight_yield.py``: bench_ti_vep_harmonic +geometry at θ=+15°, τ_y=0.05, RES=32, 1.5 periods. Compares the new +``TransverseIsotropicVEPSplitFlowModel`` (per-component (α_⊥, φ_⊥)/ +(α_∥, φ_∥)) against the existing BDF-1 trajectory cache. + +Saves the split-ETD trajectory to ``output/phase_b_etd-split_th+15_ty0p05.npz`` +and reports the same metrics as the BDF/lumped-ETD captures. + +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_phase_d_killer_split.py +""" + +import os +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def run_split(theta_deg, tau_y_at_fault, n_periods=1.5): + label = f"etd-split_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + u = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=VarType.VECTOR) + p_sol = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=VarType.SCALAR) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) 
+ # *** Phase D split-history ETD-2 *** + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPSplitFlowModel( + stokes.Unknowns, + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + DFDt = stokes.Unknowns.DFDt + + T_END = n_periods * 2.0 * np.pi / OMEGA + iters = []; reasons = [] + sigma_II_max_per_step = [] + u_y_max_per_step = [] + sigma_xy_centre = [] + sigma_par_centre = [] + centre = np.array([[cx, cy]]) + n_x_val = -float(np.sin(theta)) + n_y_val = float(np.cos(theta)) + + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end_step:.3f}: solve raised — {exc}", flush=True) + iters.append(-1) + reasons.append(-99) + break + + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * 
(sigma_arr ** 2).sum(axis=(1, 2))) + sigma_II_max_per_step.append(float(sigma_II.max())) + u_arr = np.asarray(u.array).reshape(-1, 2) + u_y_max_per_step.append(float(np.abs(u_arr[:, 1]).max())) + sxy_centre = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + sigma_xy_centre.append(sxy_centre) + sxx_c = float(uw.function.evaluate(stokes.tau.sym[0, 0], centre).flatten()[0]) + syy_c = float(uw.function.evaluate(stokes.tau.sym[1, 1], centre).flatten()[0]) + T_x = sxx_c * n_x_val + sxy_centre * n_y_val + T_y = sxy_centre * n_x_val + syy_c * n_y_val + sig_nn = T_x * n_x_val + T_y * n_y_val + sig_par = float(np.sqrt(max(T_x ** 2 + T_y ** 2 - sig_nn ** 2, 0.0))) + sigma_par_centre.append(sig_par) + + t_cur = t_end_step + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + + print( + f" ran {len(iters)} steps in {time.time()-t0:.1f}s; " + f"split-ETD-2, τ_y_fault={tau_y_at_fault}", + flush=True, + ) + if iters_arr.size > 0 and (iters_arr >= 0).any(): + print( + f" SNES iters per step (split): mean={iters_arr[iters_arr>=0].mean():.1f} " + f"median={int(np.median(iters_arr[iters_arr>=0]))} " + f"max={iters_arr[iters_arr>=0].max()} " + f"diverged={int((reasons_arr<0).sum())}/{len(reasons_arr)}", + flush=True, + ) + if sigma_II_max_per_step: + print( + f" max |σ|_II per step: end={sigma_II_max_per_step[-1]:.4f} " + f"global max={max(sigma_II_max_per_step):.4f}", + flush=True, + ) + print( + f" max |u_y| per step: end={u_y_max_per_step[-1]:.4f} " + f"global max={max(u_y_max_per_step):.4f}", + flush=True, + ) + print( + f" centre |σ_xy| time series: " + f"end={abs(sigma_xy_centre[-1]):.4f} " + f"peak={max(abs(s) for s in sigma_xy_centre):.4f} " + f"({max(abs(s) for s in sigma_xy_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + print( + f" centre |σ_∥| (resolved): " + f"end={sigma_par_centre[-1]:.4f} " + f"peak={max(sigma_par_centre):.4f} " + f"({max(sigma_par_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + + out_npz = 
os.path.join( + OUT_DIR, + f"phase_b_etd-split_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + ".npz", + ) + np.savez( + out_npz, + iters=iters_arr, + reasons=reasons_arr, + sigma_II_max_per_step=np.asarray(sigma_II_max_per_step), + u_y_max_per_step=np.asarray(u_y_max_per_step), + sigma_xy_centre=np.asarray(sigma_xy_centre), + sigma_par_centre=np.asarray(sigma_par_centre), + theta_deg=np.array(theta_deg), + tau_y_at_fault=np.array(tau_y_at_fault), + T_END=np.array(T_END), + n_steps=np.array(len(iters)), + wall_seconds=np.array(time.time() - t0), + ) + print(f" saved → {out_npz}", flush=True) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + cases = [(15.0, 0.05), (15.0, 0.15)] # tight + Phase B working regime + for theta_deg, tau_y in cases: + cache_name = ( + f"phase_b_etd-split_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace(".", "p") + + ".npz" + ) + cache = os.path.join(OUT_DIR, cache_name) + if os.path.exists(cache): + print(f"=== split-ETD-2 cache hit: {cache} — skipping run ===", flush=True) + continue + print(f"=== Phase D split-ETD-2: θ=+{theta_deg:.0f}°, τ_y={tau_y} ===", flush=True) + run_split(theta_deg, tau_y, n_periods=1.5) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_phase_e_killer_hybrid.py b/docs/developer/design/_phase_e_killer_hybrid.py new file mode 100644 index 00000000..6cc9a766 --- /dev/null +++ b/docs/developer/design/_phase_e_killer_hybrid.py @@ -0,0 +1,238 @@ +"""Phase E: hybrid BDF/ETD integrator with spatial fault weight. + +σ(x) = w(x)·σ_BDF + (1-w(x))·σ_ETD + +w(x) = (1/τ_y(x) - 1/τ_y_bulk) / (1/τ_y_fault - 1/τ_y_bulk) ∈ [0, 1] + +Inside the fault zone (where yielding can happen), w → 1 and BDF +takes over (its built-in elastic damping during yield is the right +physics, lesson #9). Outside the fault (where τ_y(x) → τ_y_bulk and +yielding is structurally unreachable), w → 0 and ETD takes over (its +4× accuracy advantage on smooth VE matters in the bulk). 
+ +Same setup as ``_phase_d_killer_split.py`` (θ=+15°, RES=32, τ_y values +{0.05, 0.15}, 1.5 periods). Saves time-series with σ_∥ probe. + +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_phase_e_killer_hybrid.py +""" + +import os +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def run_hybrid(theta_deg, tau_y_at_fault, n_periods=1.5): + label = f"hybrid_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + u = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2, + vtype=VarType.VECTOR) + p_sol = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=VarType.SCALAR) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + # Fault weight: 0 in bulk (where weakness = 1/τ_y_bulk), + # 1 inside fault (where weakness = 1/τ_y_fault). 
+ weakness_min = 1.0 / TAU_Y_BULK + weakness_max = 1.0 / tau_y_at_fault + fault_weight = (weakness - weakness_min) / (weakness_max - weakness_min) + + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, integrator="hybrid", fault_weight=fault_weight, + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + DFDt = stokes.Unknowns.DFDt + + T_END = n_periods * 2.0 * np.pi / OMEGA + iters = []; reasons = [] + sigma_II_max_per_step = [] + u_y_max_per_step = [] + sigma_xy_centre = [] + sigma_par_centre = [] + centre = np.array([[cx, cy]]) + n_x_val = -float(np.sin(theta)) + n_y_val = float(np.cos(theta)) + + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end_step:.3f}: solve raised — {exc}", flush=True) + iters.append(-1) + 
reasons.append(-99) + break + + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + sigma_II_max_per_step.append(float(sigma_II.max())) + u_arr = np.asarray(u.array).reshape(-1, 2) + u_y_max_per_step.append(float(np.abs(u_arr[:, 1]).max())) + sxy_centre = float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + sigma_xy_centre.append(sxy_centre) + sxx_c = float(uw.function.evaluate(stokes.tau.sym[0, 0], centre).flatten()[0]) + syy_c = float(uw.function.evaluate(stokes.tau.sym[1, 1], centre).flatten()[0]) + T_x = sxx_c * n_x_val + sxy_centre * n_y_val + T_y = sxy_centre * n_x_val + syy_c * n_y_val + sig_nn = T_x * n_x_val + T_y * n_y_val + sig_par = float(np.sqrt(max(T_x ** 2 + T_y ** 2 - sig_nn ** 2, 0.0))) + sigma_par_centre.append(sig_par) + + t_cur = t_end_step + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + + print( + f" ran {len(iters)} steps in {time.time()-t0:.1f}s; " + f"hybrid (BDF/ETD), τ_y_fault={tau_y_at_fault}", + flush=True, + ) + if iters_arr.size > 0 and (iters_arr >= 0).any(): + print( + f" SNES iters per step (hybrid): mean={iters_arr[iters_arr>=0].mean():.1f} " + f"median={int(np.median(iters_arr[iters_arr>=0]))} " + f"max={iters_arr[iters_arr>=0].max()} " + f"diverged={int((reasons_arr<0).sum())}/{len(reasons_arr)}", + flush=True, + ) + if sigma_II_max_per_step: + print( + f" max |σ|_II per step: end={sigma_II_max_per_step[-1]:.4f} " + f"global max={max(sigma_II_max_per_step):.4f}", + flush=True, + ) + print( + f" max |u_y| per step: end={u_y_max_per_step[-1]:.4f} " + f"global max={max(u_y_max_per_step):.4f}", + flush=True, + ) + print( + f" centre |σ_xy| time series: " + f"end={abs(sigma_xy_centre[-1]):.4f} " + f"peak={max(abs(s) for s in sigma_xy_centre):.4f} " + f"({max(abs(s) for s in sigma_xy_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + 
flush=True, + ) + print( + f" centre |σ_∥| (resolved): " + f"end={sigma_par_centre[-1]:.4f} " + f"peak={max(sigma_par_centre):.4f} " + f"({max(sigma_par_centre)/tau_y_at_fault:.2f}·τ_y_fault)", + flush=True, + ) + + out_npz = os.path.join( + OUT_DIR, + f"phase_b_hybrid_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + ".npz", + ) + np.savez( + out_npz, + iters=iters_arr, + reasons=reasons_arr, + sigma_II_max_per_step=np.asarray(sigma_II_max_per_step), + u_y_max_per_step=np.asarray(u_y_max_per_step), + sigma_xy_centre=np.asarray(sigma_xy_centre), + sigma_par_centre=np.asarray(sigma_par_centre), + theta_deg=np.array(theta_deg), + tau_y_at_fault=np.array(tau_y_at_fault), + T_END=np.array(T_END), + n_steps=np.array(len(iters)), + wall_seconds=np.array(time.time() - t0), + ) + print(f" saved → {out_npz}", flush=True) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + cases = [(15.0, 0.05), (15.0, 0.15)] + for theta_deg, tau_y in cases: + cache_name = ( + f"phase_b_hybrid_th{theta_deg:+.0f}_ty{tau_y:.2f}".replace(".", "p") + + ".npz" + ) + cache = os.path.join(OUT_DIR, cache_name) + if os.path.exists(cache): + print(f"=== hybrid cache hit: {cache} — skipping run ===", flush=True) + continue + print(f"=== Phase E hybrid BDF/ETD: θ=+{theta_deg:.0f}°, τ_y={tau_y} ===", flush=True) + run_hybrid(theta_deg, tau_y, n_periods=1.5) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_phase_f_bdf1_iso.trace.txt b/docs/developer/design/_phase_f_bdf1_iso.trace.txt new file mode 100644 index 00000000..29afd73d --- /dev/null +++ b/docs/developer/design/_phase_f_bdf1_iso.trace.txt @@ -0,0 +1,123 @@ +# Phase F predictor-corrector trace: bdf1_iso +# integrator='bdf' order=1 apply_radial_return=False in_residual_yield=True +# columns: step, t, V_top, snes_iters_total, picard_iters, sigma_eq_max, sigma_eq_max_after_correction, u_y_max, yielded_fraction + 1 0.0500 +0.4985 1 1 4.127208e-02 4.127208e-02 9.910768e-06 0.000000 + 2 0.1000 +0.4938 1 1 
8.038114e-02 8.038114e-02 1.736993e-04 0.046832 + 3 0.1500 +0.4862 28 1 1.506698e-01 1.506698e-01 9.614702e-03 0.059688 + 4 0.2000 +0.4755 23 1 2.390412e-01 2.390412e-01 1.853175e-02 0.067034 + 5 0.2500 +0.4619 20 1 3.495552e-01 3.495552e-01 2.103261e-02 0.074380 + 6 0.3000 +0.4455 15 1 4.846955e-01 4.846955e-01 2.298590e-02 0.074380 + 7 0.3500 +0.4263 15 1 6.096305e-01 6.096305e-01 2.423472e-02 0.074380 + 8 0.4000 +0.4045 14 1 7.198707e-01 7.198707e-01 2.396950e-02 0.076217 + 9 0.4500 +0.3802 14 1 8.120357e-01 8.120357e-01 2.274448e-02 0.079890 + 10 0.5000 +0.3536 14 1 8.836892e-01 8.836892e-01 2.156902e-02 0.081726 + 11 0.5500 +0.3247 13 1 9.348462e-01 9.348462e-01 2.019526e-02 0.081726 + 12 0.6000 +0.2939 14 1 9.672433e-01 9.672433e-01 1.857488e-02 0.081726 + 13 0.6500 +0.2612 16 1 9.830292e-01 9.830292e-01 1.680594e-02 0.081726 + 14 0.7000 +0.2270 17 1 9.840694e-01 9.840694e-01 1.495069e-02 0.081726 + 15 0.7500 +0.1913 19 1 9.717716e-01 9.717716e-01 1.282147e-02 0.081726 + 16 0.8000 +0.1545 21 1 9.473505e-01 9.473505e-01 1.052256e-02 0.081726 + 17 0.8500 +0.1167 25 1 9.120627e-01 9.120627e-01 9.265141e-03 0.081726 + 18 0.9000 +0.0782 30 1 8.675153e-01 8.675153e-01 8.563905e-03 0.081726 + 19 0.9500 +0.0392 34 1 8.158198e-01 8.158198e-01 7.559992e-03 0.081726 + 20 1.0000 -0.0000 19 1 7.630093e-01 7.630093e-01 6.323271e-03 0.074380 + 21 1.0500 -0.0392 8 1 7.288530e-01 7.288530e-01 5.154964e-03 0.074380 + 22 1.1000 -0.0782 7 1 6.886855e-01 6.886855e-01 4.095200e-03 0.072544 + 23 1.1500 -0.1167 5 1 6.462222e-01 6.462222e-01 3.139607e-03 0.054178 + 24 1.2000 -0.1545 4 1 6.024623e-01 6.024623e-01 2.466041e-03 0.009183 + 25 1.2500 -0.1913 3 1 5.579514e-01 5.579514e-01 2.282205e-03 0.000000 + 26 1.3000 -0.2270 3 1 5.130849e-01 5.130849e-01 2.190943e-03 0.000000 + 27 1.3500 -0.2612 2 1 4.667991e-01 4.667991e-01 2.080320e-03 0.000918 + 28 1.4000 -0.2939 1 1 4.197769e-01 4.197769e-01 1.930754e-03 0.021120 + 29 1.4500 -0.3247 2 1 3.722971e-01 3.722971e-01 1.743052e-03 
0.033058 + 30 1.5000 -0.3536 12 1 3.225307e-01 3.225307e-01 3.181995e-03 0.048669 + 31 1.5500 -0.3802 25 1 2.627879e-01 2.627879e-01 9.008150e-03 0.056015 + 32 1.6000 -0.4045 22 1 2.470817e-01 2.470817e-01 1.384321e-02 0.056015 + 33 1.6500 -0.4263 23 1 2.617071e-01 2.617071e-01 1.795609e-02 0.063361 + 34 1.7000 -0.4455 20 1 3.171074e-01 3.171074e-01 2.084389e-02 0.066116 + 35 1.7500 -0.4619 16 1 3.720918e-01 3.720918e-01 2.375286e-02 0.072544 + 36 1.8000 -0.4755 15 1 4.180842e-01 4.180842e-01 2.593457e-02 0.074380 + 37 1.8500 -0.4862 14 1 5.144363e-01 5.144363e-01 2.754916e-02 0.074380 + 38 1.9000 -0.4938 13 1 6.484323e-01 6.484323e-01 3.122790e-02 0.077135 + 39 1.9500 -0.4985 12 1 7.692478e-01 7.692478e-01 3.421500e-02 0.079890 + 40 2.0000 -0.5000 11 1 8.776854e-01 8.776854e-01 3.577072e-02 0.081726 + 41 2.0500 -0.4985 10 1 9.753243e-01 9.753243e-01 3.613523e-02 0.082645 + 42 2.1000 -0.4938 9 1 1.063180e+00 1.063180e+00 3.558671e-02 0.083563 + 43 2.1500 -0.4862 8 1 1.141797e+00 1.141797e+00 3.439847e-02 0.084481 + 44 2.2000 -0.4755 8 1 1.211360e+00 1.211360e+00 3.344108e-02 0.084481 + 45 2.2500 -0.4619 8 1 1.278016e+00 1.278016e+00 3.406503e-02 0.086318 + 46 2.3000 -0.4455 8 1 1.339710e+00 1.339710e+00 3.373125e-02 0.086318 + 47 2.3500 -0.4263 8 1 1.389034e+00 1.389034e+00 3.234658e-02 0.088154 + 48 2.4000 -0.4045 8 1 1.425182e+00 1.425182e+00 3.025302e-02 0.089073 + 49 2.4500 -0.3802 10 1 1.447622e+00 1.447622e+00 2.745848e-02 0.089073 + 50 2.5000 -0.3536 11 1 1.456493e+00 1.456493e+00 2.508182e-02 0.090909 + 51 2.5500 -0.3247 12 1 1.452144e+00 1.452144e+00 2.321883e-02 0.090909 + 52 2.6000 -0.2939 13 1 1.435099e+00 1.435099e+00 2.117630e-02 0.089073 + 53 2.6500 -0.2612 14 1 1.408897e+00 1.408897e+00 1.892825e-02 0.089073 + 54 2.7000 -0.2270 15 1 1.382260e+00 1.382260e+00 1.693459e-02 0.088154 + 55 2.7500 -0.1913 17 1 1.375198e+00 1.375198e+00 1.496980e-02 0.089073 + 56 2.8000 -0.1545 20 1 1.361218e+00 1.361218e+00 1.361402e-02 0.089073 + 57 2.8500 -0.1167 23 1 
1.338628e+00 1.338628e+00 1.213324e-02 0.088154 + 58 2.9000 -0.0782 27 1 1.307413e+00 1.307413e+00 1.046934e-02 0.086318 + 59 2.9500 -0.0392 33 1 1.267677e+00 1.267677e+00 8.977656e-03 0.082645 + 60 3.0000 -0.0000 24 1 1.220795e+00 1.220795e+00 7.933386e-03 0.081726 + 61 3.0500 +0.0392 10 1 1.171966e+00 1.171966e+00 6.959463e-03 0.080808 + 62 3.1000 +0.0782 7 1 1.112697e+00 1.112697e+00 5.897244e-03 0.071625 + 63 3.1500 +0.1167 5 1 1.047772e+00 1.047772e+00 4.924303e-03 0.056015 + 64 3.2000 +0.1545 5 1 9.822068e-01 9.822068e-01 4.021409e-03 0.016529 + 65 3.2500 +0.1913 4 1 9.166450e-01 9.166450e-01 3.436732e-03 0.001837 + 66 3.3000 +0.2270 3 1 8.520661e-01 8.520661e-01 3.068158e-03 0.000000 + 67 3.3500 +0.2612 2 1 7.883038e-01 7.883038e-01 2.865541e-03 0.001837 + 68 3.4000 +0.2939 2 1 7.247583e-01 7.247583e-01 2.728022e-03 0.019284 + 69 3.4500 +0.3247 2 1 6.614330e-01 6.614330e-01 2.528061e-03 0.032140 + 70 3.5000 +0.3536 11 1 5.970971e-01 5.970971e-01 3.228570e-03 0.048669 + 71 3.5500 +0.3802 25 1 5.245090e-01 5.245090e-01 8.312550e-03 0.056015 + 72 3.6000 +0.4045 23 1 4.415680e-01 4.415680e-01 1.276039e-02 0.056015 + 73 3.6500 +0.4263 23 1 3.740465e-01 3.740465e-01 1.660880e-02 0.062443 + 74 3.7000 +0.4455 19 1 3.769107e-01 3.769107e-01 1.956283e-02 0.065197 + 75 3.7500 +0.4619 16 1 3.849742e-01 3.849742e-01 2.212255e-02 0.069789 + 76 3.8000 +0.4755 15 1 4.314492e-01 4.314492e-01 2.585272e-02 0.073462 + 77 3.8500 +0.4862 13 1 4.569590e-01 4.569590e-01 2.882846e-02 0.074380 + 78 3.9000 +0.4938 13 1 5.536645e-01 5.536645e-01 3.087855e-02 0.076217 + 79 3.9500 +0.4985 12 1 6.987001e-01 6.987001e-01 3.270765e-02 0.076217 + 80 4.0000 +0.5000 11 1 8.300400e-01 8.300400e-01 3.425436e-02 0.078972 + 81 4.0500 +0.4985 10 1 9.481211e-01 9.481211e-01 3.628425e-02 0.081726 + 82 4.1000 +0.4938 9 1 1.052744e+00 1.052744e+00 3.772184e-02 0.082645 + 83 4.1500 +0.4862 9 1 1.143134e+00 1.143134e+00 3.843999e-02 0.082645 + 84 4.2000 +0.4755 8 1 1.218388e+00 1.218388e+00 3.838852e-02 
0.083563 + 85 4.2500 +0.4619 8 1 1.277964e+00 1.277964e+00 3.765011e-02 0.085399 + 86 4.3000 +0.4455 8 1 1.322205e+00 1.322205e+00 3.624003e-02 0.086318 + 87 4.3500 +0.4263 8 1 1.352417e+00 1.352417e+00 3.423680e-02 0.087236 + 88 4.4000 +0.4045 9 1 1.370249e+00 1.370249e+00 3.178397e-02 0.087236 + 89 4.4500 +0.3802 10 1 1.378066e+00 1.378066e+00 2.894993e-02 0.087236 + 90 4.5000 +0.3536 11 1 1.378157e+00 1.378157e+00 2.593349e-02 0.087236 + 91 4.5500 +0.3247 12 1 1.371909e+00 1.371909e+00 2.340301e-02 0.088154 + 92 4.6000 +0.2939 13 1 1.359741e+00 1.359741e+00 2.156856e-02 0.089073 + 93 4.6500 +0.2612 14 1 1.342647e+00 1.342647e+00 1.961159e-02 0.089073 + 94 4.7000 +0.2270 16 1 1.344674e+00 1.344674e+00 1.743032e-02 0.089073 + 95 4.7500 +0.1913 18 1 1.336877e+00 1.336877e+00 1.502347e-02 0.087236 + 96 4.8000 +0.1545 20 1 1.320058e+00 1.320058e+00 1.392546e-02 0.087236 + 97 4.8500 +0.1167 23 1 1.294622e+00 1.294622e+00 1.297610e-02 0.085399 + 98 4.9000 +0.0782 28 1 1.260405e+00 1.260405e+00 1.166777e-02 0.083563 + 99 4.9500 +0.0392 33 1 1.217751e+00 1.217751e+00 1.010702e-02 0.081726 + 100 5.0000 +0.0000 22 1 1.167237e+00 1.167237e+00 8.388419e-03 0.081726 + 101 5.0500 -0.0392 9 1 1.111585e+00 1.111585e+00 7.391111e-03 0.081726 + 102 5.1000 -0.0782 7 1 1.048879e+00 1.048879e+00 5.923332e-03 0.071625 + 103 5.1500 -0.1167 5 1 9.869192e-01 9.869192e-01 4.702023e-03 0.055096 + 104 5.2000 -0.1545 4 1 9.244225e-01 9.244225e-01 3.691150e-03 0.013774 + 105 5.2500 -0.1913 3 1 8.618489e-01 8.618489e-01 3.237655e-03 0.000918 + 106 5.3000 -0.2270 3 1 8.000484e-01 8.000484e-01 3.031333e-03 0.000000 + 107 5.3500 -0.2612 2 1 7.381037e-01 7.381037e-01 2.863970e-03 0.001837 + 108 5.4000 -0.2939 2 1 6.763055e-01 6.763055e-01 2.714261e-03 0.021120 + 109 5.4500 -0.3247 2 1 6.147766e-01 6.147766e-01 2.540707e-03 0.033058 + 110 5.5000 -0.3536 12 1 5.518377e-01 5.518377e-01 3.367459e-03 0.048669 + 111 5.5500 -0.3802 25 1 4.792515e-01 4.792515e-01 8.877937e-03 0.056015 + 112 5.6000 -0.4045 
23 1 3.948926e-01 3.948926e-01 1.378900e-02 0.056933 + 113 5.6500 -0.4263 23 1 3.537057e-01 3.537057e-01 1.782485e-02 0.063361 + 114 5.7000 -0.4455 20 1 3.527766e-01 3.527766e-01 2.055281e-02 0.065197 + 115 5.7500 -0.4619 16 1 3.959641e-01 3.959641e-01 2.324016e-02 0.068871 + 116 5.8000 -0.4755 15 1 4.349951e-01 4.349951e-01 2.537614e-02 0.073462 + 117 5.8500 -0.4862 14 1 4.724881e-01 4.724881e-01 2.728828e-02 0.074380 + 118 5.9000 -0.4938 13 1 5.735626e-01 5.735626e-01 3.131495e-02 0.076217 + 119 5.9500 -0.4985 12 1 6.955473e-01 6.955473e-01 3.435077e-02 0.077135 + 120 6.0000 -0.5000 11 1 8.061232e-01 8.061232e-01 3.596882e-02 0.079890 diff --git a/docs/developer/design/_phase_f_etd1_pc1.trace.txt b/docs/developer/design/_phase_f_etd1_pc1.trace.txt new file mode 100644 index 00000000..9c1186c8 --- /dev/null +++ b/docs/developer/design/_phase_f_etd1_pc1.trace.txt @@ -0,0 +1,123 @@ +# Phase F predictor-corrector trace: etd1_pc1 +# integrator='etd' order=1 apply_radial_return=True in_residual_yield=False +# columns: step, t, V_top, snes_iters_total, picard_iters, sigma_eq_max, sigma_eq_max_after_correction, u_y_max, yielded_fraction + 1 0.0500 +0.4985 1 1 4.227008e-02 4.227008e-02 7.408887e-06 0.000000 + 2 0.1000 +0.4938 1 1 8.219118e-02 8.219118e-02 1.875342e-05 0.048669 + 3 0.1500 +0.4862 1 1 1.312133e-01 1.312133e-01 3.499493e-03 0.061524 + 4 0.2000 +0.4755 1 1 1.847484e-01 1.847484e-01 8.601088e-03 0.072544 + 5 0.2500 +0.4619 1 1 2.349787e-01 2.349787e-01 1.262241e-02 0.073462 + 6 0.3000 +0.4455 1 1 2.842488e-01 2.842488e-01 1.517815e-02 0.081726 + 7 0.3500 +0.4263 1 1 3.303034e-01 3.303034e-01 1.652152e-02 0.081726 + 8 0.4000 +0.4045 1 1 3.759653e-01 3.759653e-01 1.709990e-02 0.081726 + 9 0.4500 +0.3802 1 1 4.155843e-01 4.155843e-01 1.717741e-02 0.081726 + 10 0.5000 +0.3536 1 1 4.482948e-01 4.482948e-01 1.675827e-02 0.083563 + 11 0.5500 +0.3247 1 1 4.744306e-01 4.744306e-01 1.604408e-02 0.083563 + 12 0.6000 +0.2939 1 1 4.942698e-01 4.942698e-01 1.508513e-02 
0.083563 + 13 0.6500 +0.2612 1 1 5.083255e-01 5.083255e-01 1.444807e-02 0.083563 + 14 0.7000 +0.2270 1 1 5.170730e-01 5.170730e-01 1.364595e-02 0.081726 + 15 0.7500 +0.1913 1 1 5.209158e-01 5.209158e-01 1.265520e-02 0.078972 + 16 0.8000 +0.1545 1 1 5.200673e-01 5.200673e-01 1.151499e-02 0.074380 + 17 0.8500 +0.1167 1 1 5.146865e-01 5.146865e-01 1.027186e-02 0.073462 + 18 0.9000 +0.0782 1 1 5.048434e-01 5.048434e-01 8.898937e-03 0.068871 + 19 0.9500 +0.0392 1 1 4.906089e-01 4.906089e-01 7.427466e-03 0.065197 + 20 1.0000 -0.0000 1 1 4.734378e-01 4.734378e-01 5.897057e-03 0.059688 + 21 1.0500 -0.0392 1 1 4.519180e-01 4.519180e-01 4.329136e-03 0.047750 + 22 1.1000 -0.0782 1 1 4.264055e-01 4.264055e-01 2.576242e-03 0.021120 + 23 1.1500 -0.1167 1 1 3.971031e-01 3.971031e-01 1.405211e-03 0.001837 + 24 1.2000 -0.1545 1 1 3.654972e-01 3.654972e-01 1.189141e-03 0.000000 + 25 1.2500 -0.1913 1 1 3.321918e-01 3.321918e-01 1.065001e-03 0.000000 + 26 1.3000 -0.2270 1 1 2.973628e-01 2.973628e-01 9.223411e-04 0.001837 + 27 1.3500 -0.2612 1 1 2.611811e-01 2.611811e-01 8.630974e-04 0.006428 + 28 1.4000 -0.2939 1 1 2.234569e-01 2.234569e-01 1.135203e-03 0.029385 + 29 1.4500 -0.3247 1 1 1.831775e-01 1.831775e-01 2.125745e-03 0.046832 + 30 1.5000 -0.3536 1 1 1.385713e-01 1.385713e-01 4.241876e-03 0.054178 + 31 1.5500 -0.3802 1 1 1.181696e-01 1.181696e-01 6.389454e-03 0.059688 + 32 1.6000 -0.4045 1 1 1.372726e-01 1.372726e-01 8.507503e-03 0.067952 + 33 1.6500 -0.4263 1 1 1.598838e-01 1.598838e-01 1.072879e-02 0.072544 + 34 1.7000 -0.4455 1 1 1.810420e-01 1.810420e-01 1.306631e-02 0.074380 + 35 1.7500 -0.4619 1 1 2.238122e-01 2.238122e-01 1.512466e-02 0.078053 + 36 1.8000 -0.4755 1 1 2.638722e-01 2.638722e-01 1.687399e-02 0.081726 + 37 1.8500 -0.4862 1 1 3.133549e-01 3.133549e-01 1.829238e-02 0.081726 + 38 1.9000 -0.4938 1 1 3.615901e-01 3.615901e-01 1.939778e-02 0.081726 + 39 1.9500 -0.4985 1 1 4.063672e-01 4.063672e-01 2.028848e-02 0.084481 + 40 2.0000 -0.5000 1 1 4.473514e-01 
4.473514e-01 2.105355e-02 0.087236 + 41 2.0500 -0.4985 1 1 4.842762e-01 4.842762e-01 2.163985e-02 0.089073 + 42 2.1000 -0.4938 1 1 5.282834e-01 5.282834e-01 2.193861e-02 0.089073 + 43 2.1500 -0.4862 1 1 5.772664e-01 5.772664e-01 2.206459e-02 0.090909 + 44 2.2000 -0.4755 1 1 6.219866e-01 6.219866e-01 2.213212e-02 0.091827 + 45 2.2500 -0.4619 1 1 6.619491e-01 6.619491e-01 2.224729e-02 0.092746 + 46 2.3000 -0.4455 1 1 6.969564e-01 6.969564e-01 2.214004e-02 0.092746 + 47 2.3500 -0.4263 1 1 7.266837e-01 7.266837e-01 2.194797e-02 0.092746 + 48 2.4000 -0.4045 1 1 7.510734e-01 7.510734e-01 2.152773e-02 0.092746 + 49 2.4500 -0.3802 1 1 7.702274e-01 7.702274e-01 2.089505e-02 0.092746 + 50 2.5000 -0.3536 1 1 7.843171e-01 7.843171e-01 2.006671e-02 0.092746 + 51 2.5500 -0.3247 1 1 7.935401e-01 7.935401e-01 1.905951e-02 0.091827 + 52 2.6000 -0.2939 1 1 7.981186e-01 7.981186e-01 1.788337e-02 0.090909 + 53 2.6500 -0.2612 1 1 7.981725e-01 7.981725e-01 1.685370e-02 0.089073 + 54 2.7000 -0.2270 1 1 7.938056e-01 7.938056e-01 1.579015e-02 0.085399 + 55 2.7500 -0.1913 1 1 7.851243e-01 7.851243e-01 1.454144e-02 0.083563 + 56 2.8000 -0.1545 1 1 7.721168e-01 7.721168e-01 1.317253e-02 0.078053 + 57 2.8500 -0.1167 1 1 7.547879e-01 7.547879e-01 1.169327e-02 0.073462 + 58 2.9000 -0.0782 1 1 7.332337e-01 7.332337e-01 1.027793e-02 0.068871 + 59 2.9500 -0.0392 1 1 7.075753e-01 7.075753e-01 8.729184e-03 0.065197 + 60 3.0000 -0.0000 1 1 6.779846e-01 6.779846e-01 7.188479e-03 0.059688 + 61 3.0500 +0.0392 1 1 6.465370e-01 6.465370e-01 5.643225e-03 0.053260 + 62 3.1000 +0.0782 1 1 6.116895e-01 6.116895e-01 3.843751e-03 0.028466 + 63 3.1500 +0.1167 1 1 5.744340e-01 5.744340e-01 2.409459e-03 0.004591 + 64 3.2000 +0.1545 1 1 5.350145e-01 5.350145e-01 1.969759e-03 0.001837 + 65 3.2500 +0.1913 1 1 4.938750e-01 4.938750e-01 1.888315e-03 0.001837 + 66 3.3000 +0.2270 1 1 4.513645e-01 4.513645e-01 1.724640e-03 0.004591 + 67 3.3500 +0.2612 1 1 4.077729e-01 4.077729e-01 1.635097e-03 0.009183 + 68 3.4000 +0.2939 
1 1 3.628311e-01 3.628311e-01 1.949322e-03 0.030303 + 69 3.4500 +0.3247 1 1 3.157002e-01 3.157002e-01 2.924769e-03 0.044995 + 70 3.5000 +0.3536 1 1 2.653372e-01 2.653372e-01 4.546049e-03 0.055096 + 71 3.5500 +0.3802 1 1 2.125265e-01 2.125265e-01 6.607843e-03 0.059688 + 72 3.6000 +0.4045 1 1 1.559260e-01 1.559260e-01 8.480549e-03 0.067034 + 73 3.6500 +0.4263 1 1 1.598838e-01 1.598838e-01 1.051411e-02 0.072544 + 74 3.7000 +0.4455 1 1 1.662188e-01 1.662188e-01 1.269594e-02 0.074380 + 75 3.7500 +0.4619 1 1 1.986397e-01 1.986397e-01 1.494535e-02 0.078053 + 76 3.8000 +0.4755 1 1 2.455196e-01 2.455196e-01 1.685755e-02 0.081726 + 77 3.8500 +0.4862 1 1 2.903839e-01 2.903839e-01 1.850313e-02 0.081726 + 78 3.9000 +0.4938 1 1 3.370835e-01 3.370835e-01 1.973221e-02 0.081726 + 79 3.9500 +0.4985 1 1 3.880890e-01 3.880890e-01 2.074312e-02 0.083563 + 80 4.0000 +0.5000 1 1 4.353984e-01 4.353984e-01 2.153348e-02 0.086318 + 81 4.0500 +0.4985 1 1 4.785444e-01 4.785444e-01 2.212614e-02 0.089073 + 82 4.1000 +0.4938 1 1 5.167683e-01 5.167683e-01 2.247357e-02 0.089073 + 83 4.1500 +0.4862 1 1 5.504927e-01 5.504927e-01 2.259279e-02 0.089073 + 84 4.2000 +0.4755 1 1 5.946387e-01 5.946387e-01 2.243399e-02 0.089073 + 85 4.2500 +0.4619 1 1 6.372919e-01 6.372919e-01 2.205612e-02 0.090909 + 86 4.3000 +0.4455 1 1 6.734048e-01 6.734048e-01 2.166966e-02 0.092746 + 87 4.3500 +0.4263 1 1 7.027984e-01 7.027984e-01 2.121580e-02 0.092746 + 88 4.4000 +0.4045 1 1 7.257176e-01 7.257176e-01 2.071868e-02 0.092746 + 89 4.4500 +0.3802 1 1 7.426603e-01 7.426603e-01 2.025395e-02 0.092746 + 90 4.5000 +0.3536 1 1 7.542223e-01 7.542223e-01 1.969271e-02 0.091827 + 91 4.5500 +0.3247 1 1 7.609657e-01 7.609657e-01 1.900148e-02 0.089991 + 92 4.6000 +0.2939 1 1 7.634485e-01 7.634485e-01 1.813807e-02 0.089991 + 93 4.6500 +0.2612 1 1 7.620670e-01 7.620670e-01 1.713053e-02 0.088154 + 94 4.7000 +0.2270 1 1 7.570430e-01 7.570430e-01 1.594556e-02 0.085399 + 95 4.7500 +0.1913 1 1 7.484863e-01 7.484863e-01 1.460595e-02 0.083563 + 
96 4.8000 +0.1545 1 1 7.363491e-01 7.363491e-01 1.314961e-02 0.076217 + 97 4.8500 +0.1167 1 1 7.204547e-01 7.204547e-01 1.171749e-02 0.072544 + 98 4.9000 +0.0782 1 1 7.013981e-01 7.013981e-01 1.021931e-02 0.068871 + 99 4.9500 +0.0392 1 1 6.798032e-01 6.798032e-01 8.675941e-03 0.065197 + 100 5.0000 +0.0000 1 1 6.539379e-01 6.539379e-01 7.059479e-03 0.059688 + 101 5.0500 -0.0392 1 1 6.240594e-01 6.240594e-01 5.518673e-03 0.048669 + 102 5.1000 -0.0782 1 1 5.904925e-01 5.904925e-01 3.551771e-03 0.024793 + 103 5.1500 -0.1167 1 1 5.535297e-01 5.535297e-01 2.128064e-03 0.001837 + 104 5.2000 -0.1545 1 1 5.145025e-01 5.145025e-01 1.647455e-03 0.000000 + 105 5.2500 -0.1913 1 1 4.740482e-01 4.740482e-01 1.587837e-03 0.001837 + 106 5.3000 -0.2270 1 1 4.323105e-01 4.323105e-01 1.406939e-03 0.002755 + 107 5.3500 -0.2612 1 1 3.894801e-01 3.894801e-01 1.331123e-03 0.009183 + 108 5.4000 -0.2939 1 1 3.453496e-01 3.453496e-01 1.591105e-03 0.031221 + 109 5.4500 -0.3247 1 1 2.989171e-01 2.989171e-01 2.527376e-03 0.044995 + 110 5.5000 -0.3536 1 1 2.484559e-01 2.484559e-01 4.543657e-03 0.054178 + 111 5.5500 -0.3802 1 1 1.957117e-01 1.957117e-01 6.666710e-03 0.059688 + 112 5.6000 -0.4045 1 1 1.414415e-01 1.414415e-01 8.680619e-03 0.067034 + 113 5.6500 -0.4263 1 1 1.598838e-01 1.598838e-01 1.084531e-02 0.072544 + 114 5.7000 -0.4455 1 1 1.660375e-01 1.660375e-01 1.310024e-02 0.074380 + 115 5.7500 -0.4619 1 1 1.951370e-01 1.951370e-01 1.524265e-02 0.078053 + 116 5.8000 -0.4755 1 1 2.367414e-01 2.367414e-01 1.697206e-02 0.081726 + 117 5.8500 -0.4862 1 1 2.792419e-01 2.792419e-01 1.839754e-02 0.081726 + 118 5.9000 -0.4938 1 1 3.291106e-01 3.291106e-01 1.942385e-02 0.081726 + 119 5.9500 -0.4985 1 1 3.755431e-01 3.755431e-01 2.029558e-02 0.083563 + 120 6.0000 -0.5000 1 1 4.181972e-01 4.181972e-01 2.101870e-02 0.086318 diff --git a/docs/developer/design/_phase_f_etd1_pc_picard.trace.txt b/docs/developer/design/_phase_f_etd1_pc_picard.trace.txt new file mode 100644 index 00000000..4938e56c --- 
/dev/null +++ b/docs/developer/design/_phase_f_etd1_pc_picard.trace.txt @@ -0,0 +1,123 @@ +# Phase F predictor-corrector trace: etd1_pc_picard +# integrator='etd' order=1 apply_radial_return=True in_residual_yield=False +# columns: step, t, V_top, snes_iters_total, picard_iters, sigma_eq_max, sigma_eq_max_after_correction, u_y_max, yielded_fraction + 1 0.0500 +0.4985 2 6 4.161077e-02 4.161077e-02 5.099459e-11 0.000000 + 2 0.1000 +0.4938 2 6 8.093501e-02 8.093501e-02 1.349445e-05 0.046832 + 3 0.1500 +0.4862 3 6 1.287508e-01 1.287508e-01 3.351302e-03 0.061524 + 4 0.2000 +0.4755 4 6 1.811023e-01 1.811023e-01 8.347638e-03 0.072544 + 5 0.2500 +0.4619 6 6 2.302776e-01 2.302776e-01 1.234744e-02 0.073462 + 6 0.3000 +0.4455 4 6 2.775334e-01 2.775334e-01 1.493068e-02 0.081726 + 7 0.3500 +0.4263 6 6 3.230740e-01 3.230740e-01 1.627572e-02 0.081726 + 8 0.4000 +0.4045 6 6 3.674049e-01 3.674049e-01 1.692438e-02 0.081726 + 9 0.4500 +0.3802 6 6 4.058537e-01 4.058537e-01 1.703386e-02 0.081726 + 10 0.5000 +0.3536 6 6 4.377588e-01 4.377588e-01 1.663614e-02 0.082645 + 11 0.5500 +0.3247 6 6 4.635655e-01 4.635655e-01 1.593578e-02 0.083563 + 12 0.6000 +0.2939 5 5 4.832436e-01 4.832436e-01 1.509389e-02 0.083563 + 13 0.6500 +0.2612 5 5 4.976155e-01 4.976155e-01 1.456039e-02 0.083563 + 14 0.7000 +0.2270 4 4 5.067653e-01 5.067653e-01 1.378263e-02 0.081726 + 15 0.7500 +0.1913 4 4 5.113699e-01 5.113699e-01 1.287287e-02 0.078053 + 16 0.8000 +0.1545 5 5 5.115385e-01 5.115385e-01 1.179448e-02 0.074380 + 17 0.8500 +0.1167 5 5 5.072425e-01 5.072425e-01 1.050574e-02 0.073462 + 18 0.9000 +0.0782 6 6 4.983598e-01 4.983598e-01 9.114029e-03 0.068871 + 19 0.9500 +0.0392 6 6 4.859537e-01 4.859537e-01 7.607633e-03 0.065197 + 20 1.0000 -0.0000 6 6 4.694749e-01 4.694749e-01 6.055486e-03 0.059688 + 21 1.0500 -0.0392 6 6 4.485793e-01 4.485793e-01 4.131973e-03 0.047750 + 22 1.1000 -0.0782 6 6 4.235467e-01 4.235467e-01 2.559961e-03 0.022957 + 23 1.1500 -0.1167 6 6 3.948545e-01 3.948545e-01 1.422857e-03 0.000918 + 
24 1.2000 -0.1545 6 6 3.638478e-01 3.638478e-01 1.121558e-03 0.000000 + 25 1.2500 -0.1913 6 6 3.311189e-01 3.311189e-01 1.010840e-03 0.000000 + 26 1.3000 -0.2270 6 6 2.968686e-01 2.968686e-01 8.951009e-04 0.001837 + 27 1.3500 -0.2612 6 6 2.612707e-01 2.612707e-01 8.206035e-04 0.004591 + 28 1.4000 -0.2939 6 6 2.241692e-01 2.241692e-01 1.015615e-03 0.029385 + 29 1.4500 -0.3247 6 6 1.846865e-01 1.846865e-01 1.975358e-03 0.046832 + 30 1.5000 -0.3536 6 6 1.409845e-01 1.409845e-01 4.005596e-03 0.054178 + 31 1.5500 -0.3802 6 6 1.166370e-01 1.166370e-01 6.163564e-03 0.059688 + 32 1.6000 -0.4045 6 6 1.336565e-01 1.336565e-01 8.330606e-03 0.066116 + 33 1.6500 -0.4263 6 6 1.594410e-01 1.594410e-01 1.050882e-02 0.072544 + 34 1.7000 -0.4455 6 6 1.765654e-01 1.765654e-01 1.282975e-02 0.073462 + 35 1.7500 -0.4619 6 6 2.194027e-01 2.194027e-01 1.493627e-02 0.078053 + 36 1.8000 -0.4755 6 6 2.595226e-01 2.595226e-01 1.670650e-02 0.081726 + 37 1.8500 -0.4862 5 6 3.063497e-01 3.063497e-01 1.817514e-02 0.081726 + 38 1.9000 -0.4938 5 6 3.544942e-01 3.544942e-01 1.928309e-02 0.081726 + 39 1.9500 -0.4985 6 6 3.991369e-01 3.991369e-01 2.019328e-02 0.084481 + 40 2.0000 -0.5000 5 6 4.398635e-01 4.398635e-01 2.094383e-02 0.086318 + 41 2.0500 -0.4985 5 6 4.766056e-01 4.766056e-01 2.155494e-02 0.089073 + 42 2.1000 -0.4938 5 6 5.164216e-01 5.164216e-01 2.186644e-02 0.089073 + 43 2.1500 -0.4862 5 6 5.647899e-01 5.647899e-01 2.200411e-02 0.089991 + 44 2.2000 -0.4755 6 6 6.088865e-01 6.088865e-01 2.205423e-02 0.091827 + 45 2.2500 -0.4619 6 6 6.482230e-01 6.482230e-01 2.215879e-02 0.091827 + 46 2.3000 -0.4455 6 6 6.827193e-01 6.827193e-01 2.204627e-02 0.092746 + 47 2.3500 -0.4263 6 6 7.121618e-01 7.121618e-01 2.181352e-02 0.092746 + 48 2.4000 -0.4045 5 5 7.360911e-01 7.360911e-01 2.145828e-02 0.092746 + 49 2.4500 -0.3802 5 5 7.551158e-01 7.551158e-01 2.084470e-02 0.092746 + 50 2.5000 -0.3536 4 4 7.689838e-01 7.689838e-01 2.003559e-02 0.092746 + 51 2.5500 -0.3247 4 4 7.784161e-01 7.784161e-01 
1.908171e-02 0.089991 + 52 2.6000 -0.2939 4 4 7.835573e-01 7.835573e-01 1.817218e-02 0.089073 + 53 2.6500 -0.2612 4 4 7.845050e-01 7.845050e-01 1.729180e-02 0.089073 + 54 2.7000 -0.2270 4 4 7.813407e-01 7.813407e-01 1.624477e-02 0.085399 + 55 2.7500 -0.1913 5 5 7.738206e-01 7.738206e-01 1.499546e-02 0.083563 + 56 2.8000 -0.1545 5 5 7.620874e-01 7.620874e-01 1.361492e-02 0.077135 + 57 2.8500 -0.1167 5 5 7.461220e-01 7.461220e-01 1.206496e-02 0.073462 + 58 2.9000 -0.0782 6 6 7.256369e-01 7.256369e-01 1.060674e-02 0.068871 + 59 2.9500 -0.0392 6 6 7.010363e-01 7.010363e-01 9.047866e-03 0.065197 + 60 3.0000 -0.0000 6 6 6.738234e-01 6.738234e-01 7.415888e-03 0.059688 + 61 3.0500 +0.0392 6 6 6.433692e-01 6.433692e-01 5.543277e-03 0.053260 + 62 3.1000 +0.0782 6 6 6.099471e-01 6.099471e-01 3.871836e-03 0.029385 + 63 3.1500 +0.1167 6 6 5.734248e-01 5.734248e-01 2.376721e-03 0.004591 + 64 3.2000 +0.1545 6 6 5.346290e-01 5.346290e-01 1.916660e-03 0.001837 + 65 3.2500 +0.1913 6 6 4.940989e-01 4.940989e-01 1.827304e-03 0.001837 + 66 3.3000 +0.2270 6 6 4.522099e-01 4.522099e-01 1.683525e-03 0.004591 + 67 3.3500 +0.2612 6 6 4.092209e-01 4.092209e-01 1.594976e-03 0.009183 + 68 3.4000 +0.2939 6 6 3.649311e-01 3.649311e-01 1.932707e-03 0.030303 + 69 3.4500 +0.3247 6 6 3.186272e-01 3.186272e-01 2.831886e-03 0.044995 + 70 3.5000 +0.3536 6 6 2.685705e-01 2.685705e-01 4.340578e-03 0.054178 + 71 3.5500 +0.3802 6 6 2.166092e-01 2.166092e-01 6.361989e-03 0.059688 + 72 3.6000 +0.4045 6 6 1.610261e-01 1.610261e-01 8.260349e-03 0.066116 + 73 3.6500 +0.4263 6 6 1.595580e-01 1.595580e-01 1.034355e-02 0.072544 + 74 3.7000 +0.4455 6 6 1.632820e-01 1.632820e-01 1.248723e-02 0.074380 + 75 3.7500 +0.4619 6 6 1.940748e-01 1.940748e-01 1.479202e-02 0.078053 + 76 3.8000 +0.4755 6 6 2.410678e-01 2.410678e-01 1.669484e-02 0.081726 + 77 3.8500 +0.4862 6 6 2.858449e-01 2.858449e-01 1.840059e-02 0.081726 + 78 3.9000 +0.4938 6 6 3.294513e-01 3.294513e-01 1.965186e-02 0.081726 + 79 3.9500 +0.4985 6 6 
3.802306e-01 3.802306e-01 2.068451e-02 0.082645 + 80 4.0000 +0.5000 6 6 4.271951e-01 4.271951e-01 2.145156e-02 0.085399 + 81 4.0500 +0.4985 6 6 4.698095e-01 4.698095e-01 2.204894e-02 0.089073 + 82 4.1000 +0.4938 5 6 5.072628e-01 5.072628e-01 2.239363e-02 0.089073 + 83 4.1500 +0.4862 5 6 5.400127e-01 5.400127e-01 2.251380e-02 0.089073 + 84 4.2000 +0.4755 5 6 5.800820e-01 5.800820e-01 2.235144e-02 0.089073 + 85 4.2500 +0.4619 6 6 6.211326e-01 6.211326e-01 2.199080e-02 0.090909 + 86 4.3000 +0.4455 6 6 6.558938e-01 6.558938e-01 2.156490e-02 0.092746 + 87 4.3500 +0.4263 6 6 6.843123e-01 6.843123e-01 2.102370e-02 0.092746 + 88 4.4000 +0.4045 5 5 7.063759e-01 7.063759e-01 2.062574e-02 0.092746 + 89 4.4500 +0.3802 5 5 7.231025e-01 7.231025e-01 2.032786e-02 0.092746 + 90 4.5000 +0.3536 5 5 7.350305e-01 7.350305e-01 1.985498e-02 0.091827 + 91 4.5500 +0.3247 4 4 7.424922e-01 7.424922e-01 1.919538e-02 0.089991 + 92 4.6000 +0.2939 4 4 7.461228e-01 7.461228e-01 1.844436e-02 0.089073 + 93 4.6500 +0.2612 4 4 7.462011e-01 7.462011e-01 1.746419e-02 0.088154 + 94 4.7000 +0.2270 4 4 7.428746e-01 7.428746e-01 1.629555e-02 0.085399 + 95 4.7500 +0.1913 5 5 7.358811e-01 7.358811e-01 1.502104e-02 0.083563 + 96 4.8000 +0.1545 5 5 7.253639e-01 7.253639e-01 1.351238e-02 0.076217 + 97 4.8500 +0.1167 5 5 7.110578e-01 7.110578e-01 1.204830e-02 0.072544 + 98 4.9000 +0.0782 6 6 6.946220e-01 6.946220e-01 1.054086e-02 0.068871 + 99 4.9500 +0.0392 6 6 6.739125e-01 6.739125e-01 8.925269e-03 0.065197 + 100 5.0000 +0.0000 6 6 6.489214e-01 6.489214e-01 7.307332e-03 0.059688 + 101 5.0500 -0.0392 6 6 6.198678e-01 6.198678e-01 5.249013e-03 0.050505 + 102 5.1000 -0.0782 6 6 5.868589e-01 5.868589e-01 3.540912e-03 0.026630 + 103 5.1500 -0.1167 6 6 5.506593e-01 5.506593e-01 2.155882e-03 0.001837 + 104 5.2000 -0.1545 6 6 5.124078e-01 5.124078e-01 1.573846e-03 0.000000 + 105 5.2500 -0.1913 6 6 4.726878e-01 4.726878e-01 1.526428e-03 0.001837 + 106 5.3000 -0.2270 6 6 4.316547e-01 4.316547e-01 1.350858e-03 0.002755 
+ 107 5.3500 -0.2612 6 6 3.895309e-01 3.895309e-01 1.259554e-03 0.008264 + 108 5.4000 -0.2939 6 6 3.461341e-01 3.461341e-01 1.468956e-03 0.030303 + 109 5.4500 -0.3247 6 6 3.006082e-01 3.006082e-01 2.371470e-03 0.044995 + 110 5.5000 -0.3536 6 6 2.510723e-01 2.510723e-01 4.310133e-03 0.054178 + 111 5.5500 -0.3802 6 6 1.995589e-01 1.995589e-01 6.431268e-03 0.059688 + 112 5.6000 -0.4045 6 6 1.452066e-01 1.452066e-01 8.472646e-03 0.066116 + 113 5.6500 -0.4263 6 6 1.594970e-01 1.594970e-01 1.064486e-02 0.072544 + 114 5.7000 -0.4455 6 6 1.630191e-01 1.630191e-01 1.287049e-02 0.074380 + 115 5.7500 -0.4619 6 6 1.902722e-01 1.902722e-01 1.505430e-02 0.077135 + 116 5.8000 -0.4755 6 6 2.319192e-01 2.319192e-01 1.681173e-02 0.081726 + 117 5.8500 -0.4862 6 6 2.719600e-01 2.719600e-01 1.828151e-02 0.081726 + 118 5.9000 -0.4938 5 6 3.215692e-01 3.215692e-01 1.932832e-02 0.081726 + 119 5.9500 -0.4985 6 6 3.678246e-01 3.678246e-01 2.020076e-02 0.083563 + 120 6.0000 -0.5000 6 6 4.102416e-01 4.102416e-01 2.091711e-02 0.086318 diff --git a/docs/developer/design/_phase_f_etd2_pc1.trace.txt b/docs/developer/design/_phase_f_etd2_pc1.trace.txt new file mode 100644 index 00000000..d4f42070 --- /dev/null +++ b/docs/developer/design/_phase_f_etd2_pc1.trace.txt @@ -0,0 +1,123 @@ +# Phase F predictor-corrector trace: etd2_pc1 +# integrator='etd' order=2 apply_radial_return=True in_residual_yield=False +# columns: step, t, V_top, snes_iters_total, picard_iters, sigma_eq_max, sigma_eq_max_after_correction, u_y_max, yielded_fraction + 1 0.0500 +0.4985 1 1 2.131127e-02 2.131127e-02 9.914503e-06 0.000000 + 2 0.1000 +0.4938 1 1 6.239662e-02 6.239662e-02 2.606727e-05 0.033976 + 3 0.1500 +0.4862 1 1 1.049722e-01 1.049722e-01 1.715476e-03 0.056933 + 4 0.2000 +0.4755 1 1 1.573794e-01 1.573794e-01 1.068129e-02 0.067034 + 5 0.2500 +0.4619 1 1 2.099182e-01 2.099182e-01 1.096408e-02 0.072544 + 6 0.3000 +0.4455 1 1 2.570379e-01 2.570379e-01 1.714362e-02 0.078972 + 7 0.3500 +0.4263 1 1 3.056119e-01 
3.056119e-01 1.463398e-02 0.081726 + 8 0.4000 +0.4045 1 1 3.528050e-01 3.528050e-01 1.893268e-02 0.081726 + 9 0.4500 +0.3802 1 1 3.950287e-01 3.950287e-01 1.531541e-02 0.081726 + 10 0.5000 +0.3536 1 1 4.303910e-01 4.303910e-01 1.860162e-02 0.081726 + 11 0.5500 +0.3247 1 1 4.596525e-01 4.596525e-01 1.503352e-02 0.083563 + 12 0.6000 +0.2939 1 1 4.823697e-01 4.823697e-01 1.744548e-02 0.083563 + 13 0.6500 +0.2612 1 1 4.997832e-01 4.997832e-01 1.443183e-02 0.083563 + 14 0.7000 +0.2270 1 1 5.114560e-01 5.114560e-01 1.609903e-02 0.081726 + 15 0.7500 +0.1913 1 1 5.187145e-01 5.187145e-01 1.290046e-02 0.078972 + 16 0.8000 +0.1545 1 1 5.207510e-01 5.207510e-01 1.444387e-02 0.076217 + 17 0.8500 +0.1167 1 1 5.186851e-01 5.186851e-01 1.077858e-02 0.074380 + 18 0.9000 +0.0782 1 1 5.115076e-01 5.115076e-01 1.346888e-02 0.070707 + 19 0.9500 +0.0392 1 1 5.015462e-01 5.015462e-01 9.097274e-03 0.065197 + 20 1.0000 -0.0000 1 1 4.867079e-01 4.867079e-01 1.245325e-02 0.062443 + 21 1.0500 -0.0392 1 1 4.675201e-01 4.675201e-01 1.021491e-02 0.052342 + 22 1.1000 -0.0782 1 1 4.434251e-01 4.434251e-01 1.038060e-02 0.038567 + 23 1.1500 -0.1167 1 1 4.164446e-01 4.164446e-01 1.173302e-02 0.006428 + 24 1.2000 -0.1545 1 1 3.849434e-01 3.849434e-01 1.269477e-02 0.000000 + 25 1.2500 -0.1913 1 1 3.531562e-01 3.531562e-01 1.352314e-02 0.000000 + 26 1.3000 -0.2270 1 1 3.179293e-01 3.179293e-01 1.607292e-02 0.000918 + 27 1.3500 -0.2612 1 1 2.833158e-01 2.833158e-01 1.672074e-02 0.003673 + 28 1.4000 -0.2939 1 1 2.450061e-01 2.450061e-01 1.905884e-02 0.016529 + 29 1.4500 -0.3247 1 1 2.071696e-01 2.071696e-01 2.134301e-02 0.039486 + 30 1.5000 -0.3536 1 1 1.635172e-01 1.635172e-01 2.036617e-02 0.050505 + 31 1.5500 -0.3802 1 1 1.178158e-01 1.178158e-01 2.706291e-02 0.059688 + 32 1.6000 -0.4045 1 1 1.280774e-01 1.280774e-01 2.330717e-02 0.064279 + 33 1.6500 -0.4263 1 1 1.517607e-01 1.517607e-01 3.185378e-02 0.068871 + 34 1.7000 -0.4455 1 1 1.598838e-01 1.598838e-01 2.657179e-02 0.072544 + 35 1.7500 -0.4619 1 
1 2.045166e-01 2.045166e-01 3.583165e-02 0.076217 + 36 1.8000 -0.4755 1 1 2.435566e-01 2.435566e-01 2.884058e-02 0.081726 + 37 1.8500 -0.4862 1 1 2.918784e-01 2.918784e-01 3.794113e-02 0.081726 + 38 1.9000 -0.4938 1 1 3.363541e-01 3.363541e-01 2.983185e-02 0.081726 + 39 1.9500 -0.4985 1 1 3.886665e-01 3.886665e-01 3.713012e-02 0.083563 + 40 2.0000 -0.5000 1 1 4.241066e-01 4.241066e-01 2.952963e-02 0.086318 + 41 2.0500 -0.4985 1 1 4.703930e-01 4.703930e-01 3.455771e-02 0.089073 + 42 2.1000 -0.4938 1 1 5.022749e-01 5.022749e-01 2.753084e-02 0.089073 + 43 2.1500 -0.4862 1 1 5.534926e-01 5.534926e-01 3.177891e-02 0.089991 + 44 2.2000 -0.4755 1 1 5.979954e-01 5.979954e-01 2.425708e-02 0.091827 + 45 2.2500 -0.4619 1 1 6.417369e-01 6.417369e-01 3.041661e-02 0.091827 + 46 2.3000 -0.4455 1 1 6.769766e-01 6.769766e-01 2.647030e-02 0.091827 + 47 2.3500 -0.4263 1 1 7.102783e-01 7.102783e-01 3.046915e-02 0.092746 + 48 2.4000 -0.4045 1 1 7.360519e-01 7.360519e-01 2.877922e-02 0.092746 + 49 2.4500 -0.3802 1 1 7.581262e-01 7.581262e-01 2.953757e-02 0.092746 + 50 2.5000 -0.3536 1 1 7.747097e-01 7.747097e-01 3.010213e-02 0.092746 + 51 2.5500 -0.3247 1 1 7.862858e-01 7.862858e-01 2.827722e-02 0.089991 + 52 2.6000 -0.2939 1 1 7.943083e-01 7.943083e-01 3.064881e-02 0.089073 + 53 2.6500 -0.2612 1 1 7.961581e-01 7.961581e-01 2.784177e-02 0.089073 + 54 2.7000 -0.2270 1 1 7.958760e-01 7.958760e-01 3.067474e-02 0.087236 + 55 2.7500 -0.1913 1 1 7.885778e-01 7.885778e-01 2.694955e-02 0.084481 + 56 2.8000 -0.1545 1 1 7.800704e-01 7.800704e-01 3.004956e-02 0.081726 + 57 2.8500 -0.1167 1 1 7.639095e-01 7.639095e-01 2.975656e-02 0.074380 + 58 2.9000 -0.0782 1 1 7.467074e-01 7.467074e-01 2.920571e-02 0.070707 + 59 2.9500 -0.0392 1 1 7.223785e-01 7.223785e-01 3.390057e-02 0.067034 + 60 3.0000 -0.0000 1 1 6.973502e-01 6.973502e-01 3.174960e-02 0.062443 + 61 3.0500 +0.0392 1 1 6.667682e-01 6.667682e-01 3.857958e-02 0.056015 + 62 3.1000 +0.0782 1 1 6.348044e-01 6.348044e-01 3.902196e-02 0.046832 + 63 
3.1500 +0.1167 1 1 5.979348e-01 5.979348e-01 4.217937e-02 0.015611 + 64 3.2000 +0.1545 1 1 5.606210e-01 5.606210e-01 4.461844e-02 0.002755 + 65 3.2500 +0.1913 1 1 5.185266e-01 5.185266e-01 4.670081e-02 0.001837 + 66 3.3000 +0.2270 1 1 4.780786e-01 4.780786e-01 5.137915e-02 0.004591 + 67 3.3500 +0.2612 1 1 4.326961e-01 4.326961e-01 5.301539e-02 0.007346 + 68 3.4000 +0.2939 1 1 3.901218e-01 3.901218e-01 5.731251e-02 0.012856 + 69 3.4500 +0.3247 1 1 3.425765e-01 3.425765e-01 5.724200e-02 0.041322 + 70 3.5000 +0.3536 1 1 2.949486e-01 2.949486e-01 6.371193e-02 0.050505 + 71 3.5500 +0.3802 1 1 2.421294e-01 2.421294e-01 5.811950e-02 0.059688 + 72 3.6000 +0.4045 1 1 1.881670e-01 1.881670e-01 6.840521e-02 0.064279 + 73 3.6500 +0.4263 1 1 1.555694e-01 1.555694e-01 5.713230e-02 0.067952 + 74 3.7000 +0.4455 1 1 1.598838e-01 1.598838e-01 7.112147e-02 0.072544 + 75 3.7500 +0.4619 1 1 1.791295e-01 1.791295e-01 5.442180e-02 0.076217 + 76 3.8000 +0.4755 1 1 2.257324e-01 2.257324e-01 7.161664e-02 0.079890 + 77 3.8500 +0.4862 1 1 2.690861e-01 2.690861e-01 5.083165e-02 0.081726 + 78 3.9000 +0.4938 1 1 3.166168e-01 3.166168e-01 7.074444e-02 0.081726 + 79 3.9500 +0.4985 1 1 3.635441e-01 3.635441e-01 4.568633e-02 0.081726 + 80 4.0000 +0.5000 1 1 4.151861e-01 4.151861e-01 6.803058e-02 0.084481 + 81 4.0500 +0.4985 1 1 4.566118e-01 4.566118e-01 4.174742e-02 0.088154 + 82 4.1000 +0.4938 1 1 5.005229e-01 5.005229e-01 6.408209e-02 0.089073 + 83 4.1500 +0.4862 1 1 5.305149e-01 5.305149e-01 4.089872e-02 0.089073 + 84 4.2000 +0.4755 1 1 5.707071e-01 5.707071e-01 5.870990e-02 0.089073 + 85 4.2500 +0.4619 1 1 6.126959e-01 6.126959e-01 3.954730e-02 0.090909 + 86 4.3000 +0.4455 1 1 6.540621e-01 6.540621e-01 5.234383e-02 0.092746 + 87 4.3500 +0.4263 1 1 6.829757e-01 6.829757e-01 3.965990e-02 0.092746 + 88 4.4000 +0.4045 1 1 7.113745e-01 7.113745e-01 4.544573e-02 0.092746 + 89 4.4500 +0.3802 1 1 7.282133e-01 7.282133e-01 3.923397e-02 0.092746 + 90 4.5000 +0.3536 1 1 7.450524e-01 7.450524e-01 
4.160950e-02 0.091827 + 91 4.5500 +0.3247 1 1 7.526353e-01 7.526353e-01 3.734183e-02 0.089991 + 92 4.6000 +0.2939 1 1 7.598689e-01 7.598689e-01 4.341982e-02 0.089073 + 93 4.6500 +0.2612 1 1 7.603876e-01 7.603876e-01 3.460950e-02 0.089073 + 94 4.7000 +0.2270 1 1 7.590870e-01 7.590870e-01 5.127460e-02 0.087236 + 95 4.7500 +0.1913 1 1 7.533418e-01 7.533418e-01 4.776523e-02 0.083563 + 96 4.8000 +0.1545 1 1 7.438568e-01 7.438568e-01 6.608833e-02 0.079890 + 97 4.8500 +0.1167 1 1 7.321045e-01 7.321045e-01 6.812981e-02 0.075298 + 98 4.9000 +0.0782 1 1 7.178501e-01 7.178501e-01 8.677898e-02 0.070707 + 99 4.9500 +0.0392 1 1 6.975008e-01 6.975008e-01 9.291052e-02 0.067034 + 100 5.0000 +0.0000 1 1 6.742254e-01 6.742254e-01 1.110457e-01 0.061524 + 101 5.0500 -0.0392 1 1 6.448974e-01 6.448974e-01 1.211323e-01 0.052342 + 102 5.1000 -0.0782 1 1 6.150097e-01 6.150097e-01 1.369753e-01 0.041322 + 103 5.1500 -0.1167 1 1 5.758267e-01 5.758267e-01 1.497804e-01 0.010101 + 104 5.2000 -0.1545 1 1 5.411679e-01 5.411679e-01 1.634744e-01 0.002755 + 105 5.2500 -0.1913 1 1 4.970565e-01 4.970565e-01 1.772986e-01 0.001837 + 106 5.3000 -0.2270 1 1 4.599766e-01 4.599766e-01 1.902496e-01 0.001837 + 107 5.3500 -0.2612 1 1 4.129634e-01 4.129634e-01 2.035024e-01 0.006428 + 108 5.4000 -0.2939 1 1 3.736203e-01 3.736203e-01 2.145889e-01 0.011019 + 109 5.4500 -0.3247 1 1 3.237010e-01 3.237010e-01 2.273457e-01 0.041322 + 110 5.5000 -0.3536 1 1 2.798159e-01 2.798159e-01 2.378508e-01 0.050505 + 111 5.5500 -0.3802 1 1 2.228483e-01 2.228483e-01 2.586419e-01 0.058770 + 112 5.6000 -0.4045 1 1 1.766384e-01 1.766384e-01 2.637201e-01 0.064279 + 113 5.6500 -0.4263 1 1 1.547365e-01 1.547365e-01 2.853110e-01 0.068871 + 114 5.7000 -0.4455 1 1 1.598838e-01 1.598838e-01 2.825352e-01 0.073462 + 115 5.7500 -0.4619 1 1 2.088551e-01 2.088551e-01 3.047250e-01 0.076217 + 116 5.8000 -0.4755 1 1 2.197518e-01 2.197518e-01 2.940441e-01 0.081726 + 117 5.8500 -0.4862 1 1 2.587456e-01 2.587456e-01 3.149978e-01 0.081726 + 118 5.9000 
-0.4938 1 1 3.094653e-01 3.094653e-01 2.954414e-01 0.081726 + 119 5.9500 -0.4985 1 1 3.424852e-01 3.424852e-01 3.132496e-01 0.082645 + 120 6.0000 -0.5000 1 1 4.068626e-01 4.068626e-01 2.862348e-01 0.084481 diff --git a/docs/developer/design/_phase_f_etd2_pc_picard.trace.txt b/docs/developer/design/_phase_f_etd2_pc_picard.trace.txt new file mode 100644 index 00000000..4c1aac47 --- /dev/null +++ b/docs/developer/design/_phase_f_etd2_pc_picard.trace.txt @@ -0,0 +1,63 @@ +# Phase F predictor-corrector trace: etd2_pc_picard +# integrator='etd' order=2 apply_radial_return=True in_residual_yield=False +# columns: step, t, V_top, snes_iters_total, picard_iters, sigma_eq_max, sigma_eq_max_after_correction, u_y_max, yielded_fraction + 1 0.0500 +0.4985 6 6 4.128329e-02 4.128329e-02 9.032622e-06 0.000000 + 2 0.1000 +0.4938 6 6 8.062246e-02 8.062246e-02 8.778956e-06 0.046832 + 3 0.1500 +0.4862 6 6 1.282990e-01 1.282990e-01 4.488153e-04 0.061524 + 4 0.2000 +0.4755 6 6 1.806463e-01 1.806463e-01 1.659196e-03 0.071625 + 5 0.2500 +0.4619 6 6 2.299833e-01 2.299833e-01 3.567426e-03 0.073462 + 6 0.3000 +0.4455 6 6 2.763849e-01 2.763849e-01 6.038144e-03 0.081726 + 7 0.3500 +0.4263 6 6 3.231831e-01 3.231831e-01 8.951024e-03 0.081726 + 8 0.4000 +0.4045 6 6 3.676827e-01 3.676827e-01 1.271368e-02 0.081726 + 9 0.4500 +0.3802 6 6 4.063513e-01 4.063513e-01 2.101424e-02 0.081726 + 10 0.5000 +0.3536 6 6 4.385887e-01 4.385887e-01 3.332654e-02 0.082645 + 11 0.5500 +0.3247 6 6 4.648292e-01 4.648292e-01 5.096470e-02 0.083563 + 12 0.6000 +0.2939 6 6 4.853601e-01 4.853601e-01 7.694427e-02 0.083563 + 13 0.6500 +0.2612 6 6 5.006052e-01 5.006052e-01 1.156709e-01 0.083563 + 14 0.7000 +0.2270 6 6 5.111314e-01 5.111314e-01 1.678903e-01 0.081726 + 15 0.7500 +0.1913 6 6 5.168793e-01 5.168793e-01 2.345889e-01 0.078053 + 16 0.8000 +0.1545 6 6 5.186714e-01 5.186714e-01 3.194702e-01 0.075298 + 17 0.8500 +0.1167 6 6 5.165864e-01 5.165864e-01 4.280691e-01 0.068871 + 18 0.9000 +0.0782 6 6 5.234129e-01 5.234129e-01 
5.854402e-01 0.059688 + 19 0.9500 +0.0392 6 6 5.867592e-01 5.867592e-01 7.774405e-01 0.053260 + 20 1.0000 -0.0000 6 6 6.367049e-01 6.367049e-01 9.775868e-01 0.041322 + 21 1.0500 -0.0392 6 6 6.757299e-01 6.757299e-01 1.155523e+00 0.028466 + 22 1.1000 -0.0782 6 6 6.955732e-01 6.955732e-01 1.281643e+00 0.011019 + 23 1.1500 -0.1167 6 6 7.025940e-01 7.025940e-01 1.366677e+00 0.000000 + 24 1.2000 -0.1545 6 6 7.152548e-01 7.152548e-01 1.413835e+00 0.000000 + 25 1.2500 -0.1913 6 6 7.402345e-01 7.402345e-01 1.468174e+00 0.001837 + 26 1.3000 -0.2270 6 6 7.759602e-01 7.759602e-01 1.538369e+00 0.004591 + 27 1.3500 -0.2612 6 6 8.198429e-01 8.198429e-01 1.592886e+00 0.008264 + 28 1.4000 -0.2939 6 6 8.827329e-01 8.827329e-01 1.635576e+00 0.025712 + 29 1.4500 -0.3247 6 6 9.305599e-01 9.305599e-01 1.671694e+00 0.036731 + 30 1.5000 -0.3536 6 6 9.176821e-01 9.176821e-01 1.689775e+00 0.045914 + 31 1.5500 -0.3802 6 6 1.097916e+00 1.097916e+00 1.674682e+00 0.048669 + 32 1.6000 -0.4045 6 6 1.214428e+00 1.214428e+00 1.637503e+00 0.049587 + 33 1.6500 -0.4263 6 6 1.339736e+00 1.339736e+00 1.619010e+00 0.054178 + 34 1.7000 -0.4455 6 6 1.569842e+00 1.569842e+00 1.712545e+00 0.053260 + 35 1.7500 -0.4619 6 6 1.768136e+00 1.768136e+00 2.102587e+00 0.061524 + 36 1.8000 -0.4755 6 6 2.032997e+00 2.032997e+00 2.454732e+00 0.062443 + 37 1.8500 -0.4862 6 6 2.176244e+00 2.176244e+00 2.849964e+00 0.061524 + 38 1.9000 -0.4938 6 6 2.602614e+00 2.602614e+00 3.596606e+00 0.059688 + 39 1.9500 -0.4985 6 6 2.625967e+00 2.625967e+00 4.030853e+00 0.049587 + 40 2.0000 -0.5000 6 6 2.560084e+00 2.560084e+00 3.803991e+00 0.049587 + 41 2.0500 -0.4985 6 6 2.421060e+00 2.421060e+00 4.127847e+00 0.032140 + 42 2.1000 -0.4938 6 6 2.439370e+00 2.439370e+00 4.099199e+00 0.034894 + 43 2.1500 -0.4862 6 6 2.701228e+00 2.701228e+00 4.153585e+00 0.032140 + 44 2.2000 -0.4755 6 6 2.955110e+00 2.955110e+00 4.392278e+00 0.021120 + 45 2.2500 -0.4619 6 6 3.186634e+00 3.186634e+00 4.594595e+00 0.011019 + 46 2.3000 -0.4455 6 6 
3.102073e+00 3.102073e+00 4.461982e+00 0.015611 + 47 2.3500 -0.4263 6 6 2.950377e+00 2.950377e+00 4.048292e+00 0.015611 + 48 2.4000 -0.4045 6 6 2.916105e+00 2.916105e+00 4.480963e+00 0.015611 + 49 2.4500 -0.3802 6 6 2.552756e+00 2.552756e+00 4.639798e+00 0.012856 + 50 2.5000 -0.3536 6 6 2.372077e+00 2.372077e+00 4.676110e+00 0.013774 + 51 2.5500 -0.3247 6 6 2.319044e+00 2.319044e+00 5.137358e+00 0.012856 + 52 2.6000 -0.2939 6 6 2.363969e+00 2.363969e+00 5.114993e+00 0.013774 + 53 2.6500 -0.2612 6 6 2.210504e+00 2.210504e+00 5.205307e+00 0.012856 + 54 2.7000 -0.2270 6 6 2.292182e+00 2.292182e+00 5.493810e+00 0.016529 + 55 2.7500 -0.1913 6 6 2.216241e+00 2.216241e+00 5.507382e+00 0.020202 + 56 2.8000 -0.1545 6 6 2.484720e+00 2.484720e+00 5.237860e+00 0.013774 + 57 2.8500 -0.1167 6 6 2.584487e+00 2.584487e+00 4.986766e+00 0.018365 + 58 2.9000 -0.0782 6 6 2.634986e+00 2.634986e+00 4.819767e+00 0.010101 + 59 2.9500 -0.0392 6 6 2.773244e+00 2.773244e+00 5.291063e+00 0.012856 + 60 3.0000 -0.0000 6 6 1.821914e+01 1.821914e+01 1.022878e+01 0.000000 diff --git a/docs/developer/design/_phase_f_predictor_corrector.py b/docs/developer/design/_phase_f_predictor_corrector.py new file mode 100644 index 00000000..03225da8 --- /dev/null +++ b/docs/developer/design/_phase_f_predictor_corrector.py @@ -0,0 +1,437 @@ +"""Phase F: ETD-2 VE predictor + J2 radial-return corrector (isotropic). + +Implements the predictor-corrector architecture from the web advice +(``vep_stress_update_full_latex.md`` §15 Stage 4) on the isotropic VEP +case. The TI extension is the end goal but isotropic is a cleaner first +test of the architecture. + +Per-step structure: + 1. Stokes solve with constitutive_model = isotropic Maxwell ETD-2 + (yield_stress = ∞, so no in-residual yield clipping). + 2. Read psi_star (= unclipped VE trial stress). + 3. J2 radial return at each quadrature node: if |σ|_eq > σ_y, scale + σ ← (σ_y/|σ|_eq)·σ. Overwrite psi_star. + 4. 
   The corrected psi_star becomes σⁿ for the next timestep's
      history term (α·σⁿ in the ETD-2 update).

This is "predictor-corrector without outer Picard" — single-shot
correction per timestep. If stability fails, add an outer Picard
loop with stress damping (advice §10, ω_τ ≈ 0.5).

Comparison: same harmonic shear box geometry as the existing killer
test but isotropic (no fault, no director, uniform σ_y). Compare
trajectory against BDF-1 and ETD-1 baselines run in this script
in the same setup.

Per-step diagnostics every 5 steps; runaway guard.

Run::

    pixi run -e amr-dev python -u docs/developer/design/_phase_f_predictor_corrector.py
"""

import os
import time

import numpy as np
import sympy

import underworld3 as uw
from underworld3 import VarType
from underworld3.function import expression


# Forcing and discretisation constants shared by every case below.
V0 = 0.5                 # peak top-boundary velocity amplitude
OMEGA = np.pi / 2.0      # angular frequency of the harmonic forcing
DT = 0.05                # elastic timestep
H = 1.0; W = 1.0         # unit box: maxCoords=(W, H)
FAULT_LENGTH = 0.6       # length of the weak-zone layer axis
FAULT_WIDTH = 0.06       # Gaussian half-width of the weak zone
ETA = 1.0; MU = 1.0      # shear viscosity / shear modulus (t_relax = 1)
# Spatially-varying σ_y(x): small in the fault influence zone, large in
# the bulk. Isotropic von Mises elsewhere — no director, no rank-4
# projector, just the same fault influence-function used to localise
# yielding. Constant σ_y everywhere would fail everywhere (the correct
# solution); the localised weak zone gives partial yielding.
TAU_Y_FAULT = 0.05  # yield stress in the weak zone
TAU_Y_BULK = 200.0  # effectively no yield outside
THETA_DEG = 15.0  # weak-zone tilt (no director use; just the geometry)
RES = 32  # elements per side of the structured quad mesh

OUT_DIR = "output"


def _build_isotropic_stokes(label: str, integrator: str, order: int,
                            yield_stress_value):
    """Common Stokes + isotropic VEP setup with spatial yield-stress
    field via the fault influence function (isotropic von Mises;
    director NOT used).

    label: unique suffix for mesh-variable / expression names so several
        cases can coexist in one process.
    integrator / order: forwarded to ViscoElasticPlasticFlowModel.
    yield_stress_value: pass the spatial sympy expression to apply the
        localised yield zone, or sympy.oo to disable yielding entirely
        (predictor-corrector path applies J2 return mapping externally).

    Returns a dict with keys mesh, stokes, u, p, V_top, cm (see the
    ``return dict(...)`` at the end of this function).
    """
    mesh = uw.meshing.StructuredQuadBox(
        elementRes=(RES, RES),
        minCoords=(0.0, 0.0), maxCoords=(W, H),
        qdegree=3,
    )
    # Build the localised weak zone using the fault influence-function
    # (same geometry as the killer test). σ_y(x) small near the layer
    # axis, large in the bulk. We're NOT using the director — this is
    # just a spatial yield-stress field that happens to be drawn from
    # the fault helper.
    cx, cy = 0.5 * W, 0.5 * H
    theta = np.radians(THETA_DEG)
    # Half-extent of the layer axis projected onto x and y.
    dx_layer = 0.5 * FAULT_LENGTH * np.cos(theta)
    dy_layer = 0.5 * FAULT_LENGTH * np.sin(theta)
    fault = uw.meshing.Surface(
        f"layer_{label}", mesh,
        np.array([[cx - dx_layer, cy - dy_layer],
                  [cx + dx_layer, cy + dy_layer]]),
        symbol=f"L{label}",
    )
    fault.discretize()
    # influence_function interpolates between value_near (on the layer)
    # and value_far (bulk); built on 1/σ_y so it can be inverted below.
    weakness = fault.influence_function(
        width=FAULT_WIDTH,
        value_near=1.0 / TAU_Y_FAULT,
        value_far=1.0 / TAU_Y_BULK,
        profile="gaussian",
    )
    tau_y_field = 1.0 / weakness

    u = uw.discretisation.MeshVariable(f"U_{label}", mesh, 2, degree=2,
                                       vtype=VarType.VECTOR)
    p_sol = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1,
                                           continuous=True, vtype=VarType.SCALAR)

    stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol)
    stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel(
        stokes.Unknowns, integrator=integrator, order=order,
    )
    cm = stokes.constitutive_model
    cm.Parameters.shear_viscosity_0 = ETA
    cm.Parameters.shear_modulus = MU
    # Use the spatial yield-stress field by default; sympy.oo overrides
    # for the no-yield-in-residual predictor-corrector path.
+ cm.Parameters.yield_stress = ( + yield_stress_value if yield_stress_value is not None else tau_y_field + ) + cm.Parameters.shear_viscosity_min = ETA * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + return dict(mesh=mesh, stokes=stokes, u=u, p=p_sol, V_top=V_top, cm=cm) + + +def _j2_radial_return(sigma_arr, sigma_y): + """Apply J2 radial return at each quadrature node. + + sigma_arr: shape (n_nodes, dim, dim) symmetric tensor field. + Returns the corrected array (in-place not allowed — return new copy + so the caller can decide whether to overwrite). + """ + # Equivalent stress: σ_eq = sqrt(3/2 · σ:σ) for deviatoric σ. + sig_dot_sig = (sigma_arr * sigma_arr).sum(axis=(1, 2)) # σ_ij σ_ij + sigma_eq = np.sqrt(1.5 * sig_dot_sig) + # Avoid divide-by-zero + safe_eq = np.where(sigma_eq > 1.0e-12, sigma_eq, 1.0) + scale = np.where(sigma_eq > sigma_y, sigma_y / safe_eq, 1.0) + return sigma_arr * scale[:, None, None] + + +def run_case(label, integrator, order, n_periods=1.5, + apply_radial_return=False, in_residual_yield=True, + max_picard_iters=1, picard_tol=1.0e-3, omega_tau=0.5): + """Generic runner. + + integrator/order: passed to constitutive model. + apply_radial_return: if True, apply J2 radial return post-solve. + in_residual_yield: if False, set yield_stress=∞ in the model so it + doesn't clip in-residual; only post-solve return + maps. If True, use the spatial yield_stress + field (small near layer, large in bulk). 
+ max_picard_iters: outer Picard within a timestep. With value 1 the + Stokes solve runs once (no equilibration after + correction). With value > 1 we save σ_n at step + start, run Stokes, apply correction, damp σ, and + re-solve with the corrected psi_star until σ + converges. + omega_tau: stress damping coefficient inside the Picard loop + (advice §10; ω_τ ~ 0.5 is the standard value). + """ + # None → use the spatial weak-zone field built inside _build_..; oo + # → disable in-residual yielding (predictor-corrector path). + yield_stress_value = None if in_residual_yield else sympy.oo + obj = _build_isotropic_stokes(label, integrator, order, yield_stress_value) + mesh = obj["mesh"]; stokes = obj["stokes"] + u = obj["u"]; V_top = obj["V_top"]; cm = obj["cm"] + DFDt = stokes.Unknowns.DFDt + + # Per-step trace file — updated each step (flush every line). Lets a + # killed run still leave usable data, and lets the plot script parse + # it without rerunning. Pattern from feedback_per_step_logging.md. + trace_path = os.path.join( + os.path.dirname(__file__), f"_phase_f_{label}.trace.txt" + ) + trace_fh = open(trace_path, "w") + trace_fh.write( + f"# Phase F predictor-corrector trace: {label}\n" + f"# integrator={integrator!r} order={order} " + f"apply_radial_return={apply_radial_return} " + f"in_residual_yield={in_residual_yield}\n" + f"# columns: step, t, V_top, snes_iters_total, picard_iters, " + f"sigma_eq_max, sigma_eq_max_after_correction, u_y_max, yielded_fraction\n" + ) + trace_fh.flush() + + # Evaluate spatial σ_y(x) at psi_star coords ONCE — the yield stress + # field doesn't change in time (just space). Used by the radial + # return corrector AND by the yielded-fraction diagnostic. + sigma_coords = DFDt.psi_star[0].coords + cx, cy = 0.5 * W, 0.5 * H + theta = np.radians(THETA_DEG) + n_x_l = -np.sin(theta); n_y_l = np.cos(theta) + # Re-derive the same Gaussian as in _build (signed-distance to the + # layer axis). 
Avoids needing to evaluate the cm's yield_stress + # symbolically for every node — which can be expensive. + sd = np.abs((sigma_coords[:, 0] - cx) * n_x_l + + (sigma_coords[:, 1] - cy) * n_y_l) + half_length = 0.5 * FAULT_LENGTH + along = (sigma_coords[:, 0] - cx) * n_y_l - (sigma_coords[:, 1] - cy) * n_x_l + in_extent = np.abs(along) <= half_length + # The fault influence_function uses a Gaussian normal to the layer + # axis, restricted to the layer extent. value_near at sd=0 (and + # within extent), value_far at large sd (or outside extent). + weakness_arr = np.where( + in_extent, + (1.0 / TAU_Y_FAULT) * np.exp(-(sd / FAULT_WIDTH) ** 2) + + (1.0 / TAU_Y_BULK) + * (1.0 - np.exp(-(sd / FAULT_WIDTH) ** 2)), + 1.0 / TAU_Y_BULK * np.ones_like(sd), + ) + sigma_y_at_nodes = 1.0 / weakness_arr + + T_END = n_periods * 2.0 * np.pi / OMEGA + iters = []; reasons = [] + sigma_eq_max_per_step = [] + sigma_eq_centre_per_step = [] + u_y_max_per_step = [] + yielded_fraction_per_step = [] + centre = np.array([[0.5 * W, 0.5 * H]]) + + t_cur = 0.0 + t0 = time.time() + picard_iters_per_step = [] # diagnostic: how many Picard iters used + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + + # Save σ_n at start of step — needed if we Picard-iterate within + # the step (psi_star has to read σ_n for E_eff history each + # solve, but the post-solve correction overwrites it). + sigma_n = np.asarray(DFDt.psi_star[0].array).copy() + sigma_iter = sigma_n.copy() # current best estimate of σ_{n+1} + + snes_iters_total = 0 + snes_reason_last = 0 + picard_k_used = max_picard_iters + for picard_k in range(max_picard_iters): + # Restore start-of-step state so model sees σ_n as history + DFDt.psi_star[0].array[...] 
= sigma_n + try: + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + except Exception as exc: + print(f" step at t={t_end_step:.3f} picard={picard_k}: " + f"solve raised — {exc}", flush=True) + snes_iters_total = -1 + snes_reason_last = -99 + picard_k_used = picard_k + break + snes_iters_total += int(stokes.snes.getIterationNumber()) + snes_reason_last = int(stokes.snes.getConvergedReason()) + + sigma_trial = np.asarray(DFDt.psi_star[0].array).copy() + if apply_radial_return: + sigma_corrected = _j2_radial_return(sigma_trial, sigma_y_at_nodes) + else: + sigma_corrected = sigma_trial + + if max_picard_iters == 1: + # Single-shot mode: just accept the correction + sigma_iter = sigma_corrected + picard_k_used = 1 + break + + # Damp the σ update for outer-Picard convergence (advice §10) + sigma_new = (1.0 - omega_tau) * sigma_iter + omega_tau * sigma_corrected + # Convergence check + denom = max(np.linalg.norm(sigma_new), 1e-12) + diff = np.linalg.norm(sigma_new - sigma_iter) / denom + sigma_iter = sigma_new + if diff < picard_tol: + picard_k_used = picard_k + 1 + break + + # Final accepted state + DFDt.psi_star[0].array[...] 
= sigma_iter + picard_iters_per_step.append(picard_k_used) + + if snes_iters_total < 0: + iters.append(-1); reasons.append(snes_reason_last) + break + iters.append(snes_iters_total) + reasons.append(snes_reason_last) + + # Diagnostics on the FINAL accepted σ + sig_dot_sig = (sigma_iter * sigma_iter).sum(axis=(1, 2)) + sigma_eq = np.sqrt(1.5 * sig_dot_sig) + sigma_eq_max_per_step.append(float(sigma_eq.max())) + n_yielded = int((sigma_eq > sigma_y_at_nodes * 0.99).sum()) + yielded_fraction_per_step.append(n_yielded / sigma_eq.size) + # σ_eq AFTER correction (same as final state since sigma_iter is corrected) + sigma_eq_centre_per_step.append(float(sigma_eq.max())) + + u_arr = np.asarray(u.array).reshape(-1, 2) + u_y_max_per_step.append(float(np.abs(u_arr[:, 1]).max())) + + step_idx = len(iters) + # Persistent per-step trace — written EVERY step, flushed + trace_fh.write( + f"{step_idx:4d} {t_end_step:7.4f} {v_now:+.4f} " + f"{iters[-1]:3d} {picard_k_used:2d} " + f"{sigma_eq_max_per_step[-1]:.6e} " + f"{sigma_eq_centre_per_step[-1]:.6e} " + f"{u_y_max_per_step[-1]:.6e} " + f"{yielded_fraction_per_step[-1]:.6f}\n" + ) + trace_fh.flush() + if step_idx <= 5 or step_idx % 5 == 0: + picard_str = f" pic={picard_k_used:d}" if max_picard_iters > 1 else "" + print( + f" step {step_idx:3d}/120 t={t_end_step:5.3f} " + f"V={v_now:+.3f} iters={iters[-1]:2d}{picard_str} " + f"|σ|_eq_max={sigma_eq_max_per_step[-1]:.3e} " + f"|u_y|={u_y_max_per_step[-1]:.3e} " + f"yielded={yielded_fraction_per_step[-1]:.2%}", + flush=True, + ) + + if sigma_eq_max_per_step[-1] > 100.0 or u_y_max_per_step[-1] > 10.0: + print(f" *** runaway at step {step_idx} — breaking ***", flush=True) + break + + t_cur = t_end_step + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + print( + f" ran {len(iters)} steps in {time.time()-t0:.1f}s; " + f"{label} (integrator={integrator}, order={order}, " + f"radial_return={apply_radial_return}, in_residual_yield={in_residual_yield})", + flush=True, + ) + if 
iters_arr.size > 0 and (iters_arr >= 0).any(): + print( + f" SNES iters mean={iters_arr[iters_arr>=0].mean():.1f} " + f"max={iters_arr[iters_arr>=0].max()} " + f"diverged={int((reasons_arr<0).sum())}/{len(reasons_arr)}", + flush=True, + ) + if sigma_eq_max_per_step: + print( + f" σ_eq_max: end={sigma_eq_max_per_step[-1]:.4f} " + f"global max={max(sigma_eq_max_per_step):.4f} " + f"({max(sigma_eq_max_per_step)/TAU_Y_FAULT:.2f}·τ_y_fault)", + flush=True, + ) + print( + f" |u_y|_max: end={u_y_max_per_step[-1]:.4f} " + f"global max={max(u_y_max_per_step):.4f}", + flush=True, + ) + print( + f" yielded fraction: end={yielded_fraction_per_step[-1]:.2%} " + f"max={max(yielded_fraction_per_step):.2%}", + flush=True, + ) + + out_npz = os.path.join(OUT_DIR, f"phase_f_{label}.npz") + np.savez( + out_npz, + iters=iters_arr, reasons=reasons_arr, + sigma_eq_max_per_step=np.asarray(sigma_eq_max_per_step), + sigma_eq_centre_per_step=np.asarray(sigma_eq_centre_per_step), + u_y_max_per_step=np.asarray(u_y_max_per_step), + yielded_fraction_per_step=np.asarray(yielded_fraction_per_step), + T_END=np.array(T_END), + n_steps=np.array(len(iters)), + wall_seconds=np.array(time.time() - t0), + ) + print(f" saved → {out_npz}", flush=True) + + +def main(): + os.makedirs(OUT_DIR, exist_ok=True) + + # Each tuple: (label, integrator, order, apply_pc, max_picard_iters) + # apply_pc=True → yield_stress=∞ in model, radial-return correction + # max_picard_iters=1 → single shot; >1 → outer Picard equilibrate + cases = [ + # Baseline: BDF-1 yield-in-residual via softmin (works) + ("bdf1_iso", "bdf", 1, False, 1), + # NOTE: ETD-1 / ETD-2 with yield_stress=spatial-field + + # yield_mode=softmin in the parent ViscoElasticPlasticFlowModel + # currently produces SNES line-search divergence (separate bug, + # filed for follow-up). Skipping those baselines here — the + # predictor-corrector path below sets yield_stress=∞ in the + # model so it never enters that broken in-residual code path. 
+ # Predictor-corrector: single shot + ("etd2_pc1", "etd", 2, True, 1), + ("etd1_pc1", "etd", 1, True, 1), + # Predictor-corrector with outer Picard equilibration (ω_τ=0.5) + ("etd2_pc_picard", "etd", 2, True, 6), + ("etd1_pc_picard", "etd", 1, True, 6), + ] + for label, integrator, order, apply_pc, max_picard in cases: + cache = os.path.join(OUT_DIR, f"phase_f_{label}.npz") + if os.path.exists(cache): + print(f"\n=== {label}: cache hit, skipping ===", flush=True) + continue + if apply_pc: + mode = ("predictor-corrector single-shot" + if max_picard == 1 + else f"predictor-corrector + outer Picard ({max_picard} iters)") + print( + f"\n=== {label}: integrator={integrator!r}, order={order}, " + f"{mode} ===", + flush=True, + ) + run_case(label, integrator, order, apply_radial_return=True, + in_residual_yield=False, max_picard_iters=max_picard) + else: + print( + f"\n=== {label}: integrator={integrator!r}, order={order}, " + f"yield-in-residual (softmin in model) ===", + flush=True, + ) + run_case(label, integrator, order, apply_radial_return=False, + in_residual_yield=True, max_picard_iters=1) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_b_bdf_vs_etd.py b/docs/developer/design/_plot_phase_b_bdf_vs_etd.py new file mode 100644 index 00000000..e74ce3a9 --- /dev/null +++ b/docs/developer/design/_plot_phase_b_bdf_vs_etd.py @@ -0,0 +1,238 @@ +"""Plot BDF-1 vs ETD-2 (lumped) vs split-ETD-2 trajectories at +τ_y=0.05, θ=+15°. + +Loads time series saved by ``_phase_b_bdf_vs_etd_at_tight_yield.py`` +(BDF-1, lumped ETD-2) and ``_phase_d_killer_split.py`` (split ETD-2); +produces a 3-panel figure on shared time axes: + + 1. centre σ_xy(t) + 2. global max |σ|_II(t) + 3. global max |u_y|(t) + +τ_y=0.05 reference lines are drawn on panels 1 and 2. The split +trace is expected to sit inside the lumped runaway and ideally close +to the BDF-1 baseline. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_plot_phase_b_bdf_vs_etd.py +""" + +import os + +import numpy as np +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt + + +OMEGA = np.pi / 2.0 +DT = 0.05 +THETA = 15.0 +TAU_Y = 0.05 +OUT_DIR = "output" + + +def _path(integrator): + return os.path.join( + OUT_DIR, + f"phase_b_{integrator}_th{THETA:+.0f}_ty{TAU_Y:.2f}".replace(".", "p") + ".npz", + ) + + +def _load(integrator): + path = _path(integrator) + if not os.path.exists(path): + raise FileNotFoundError( + f"missing trajectory cache {path} — run " + f"_phase_b_bdf_vs_etd_at_tight_yield.py first" + ) + return np.load(path) + + +def _load_bdf2_from_log(log_path=None): + """Parse the per-step BDF-2 trace from the runner's stdout log. + Returns ``(t, sigma_II, u_y, sigma_par)`` numpy arrays of the + sparse sample points (every 5 steps after the first 5) up to the + runaway. None if the log doesn't exist. + """ + if log_path is None: + # Tracked trace lives next to this script (git won't track *.log). 
+ candidates = [ + os.path.join( + os.path.dirname(__file__), + "_phase_b_bdf2_th+15_ty0p05.trace.txt", + ), + os.path.join(OUT_DIR, "phase_b_bdf2_th+15_ty0p05.log"), + ] + log_path = next((p for p in candidates if os.path.exists(p)), None) + if log_path is None: + return None + elif not os.path.exists(log_path): + return None + import re + pat = re.compile( + r"step\s+(\d+)/\d+\s+t=([\d.+\-eE]+)\s+V=[\d.+\-eE]+\s+iters=\s*\d+\s+" + r"\|σ\|_II=([\d.+\-eE]+)\s+\|u_y\|=([\d.+\-eE]+)\s+\|σ_∥\|=([\d.+\-eE]+)" + ) + rows = [] + with open(log_path) as f: + for line in f: + m = pat.search(line) + if m: + rows.append((int(m.group(1)), float(m.group(2)), + float(m.group(3)), float(m.group(4)), + float(m.group(5)))) + if not rows: + return None + rows.sort(key=lambda r: r[0]) + return ( + np.array([r[1] for r in rows]), + np.array([r[2] for r in rows]), + np.array([r[3] for r in rows]), + np.array([r[4] for r in rows]), + ) + + +def main(): + bdf = _load("bdf") + etd = _load("etd") + try: + split = _load("etd-split") + except FileNotFoundError: + split = None + print(" (no split-ETD trajectory cached — skipping)", flush=True) + try: + hybrid = _load("hybrid") + except FileNotFoundError: + hybrid = None + print(" (no hybrid trajectory cached — skipping)", flush=True) + + bdf2 = _load_bdf2_from_log() + if bdf2 is None: + print(" (no BDF-2 log found — skipping)", flush=True) + + try: + etd1 = _load("etd1") + except FileNotFoundError: + etd1 = None + print(" (no ETD-1 trajectory cached — skipping)", flush=True) + + n_bdf = int(bdf["n_steps"]) + n_etd = int(etd["n_steps"]) + t_bdf = (np.arange(n_bdf) + 1) * DT + t_etd = (np.arange(n_etd) + 1) * DT + period = 2.0 * np.pi / OMEGA + if split is not None: + n_split = int(split["n_steps"]) + t_split = (np.arange(n_split) + 1) * DT + if hybrid is not None: + n_hybrid = int(hybrid["n_steps"]) + t_hybrid = (np.arange(n_hybrid) + 1) * DT + if etd1 is not None: + n_etd1 = int(etd1["n_steps"]) + t_etd1 = (np.arange(n_etd1) + 1) * DT + + 
fig, axes = plt.subplots(3, 1, figsize=(8.5, 9.5), sharex=True) + + # Panel 1 — fault-resolved |σ_∥|(t) at fault centre. + ax = axes[0] + if "sigma_par_centre" in bdf.files: + ax.plot(t_bdf / period, bdf["sigma_par_centre"], "-", color="#1f77b4", + label=f"BDF-1 (peak |σ_∥|={bdf['sigma_par_centre'].max():.3f})") + if "sigma_par_centre" in etd.files: + ax.plot(t_etd / period, etd["sigma_par_centre"], "-", color="#d62728", + label=f"ETD-2 lumped (peak |σ_∥|={etd['sigma_par_centre'].max():.3f})") + if split is not None and "sigma_par_centre" in split.files: + ax.plot(t_split / period, split["sigma_par_centre"], "-", color="#2ca02c", + label=f"ETD-2 split (peak |σ_∥|={split['sigma_par_centre'].max():.3f})") + if hybrid is not None and "sigma_par_centre" in hybrid.files: + ax.plot(t_hybrid / period, hybrid["sigma_par_centre"], "-", color="#9467bd", + label=f"hybrid (peak |σ_∥|={hybrid['sigma_par_centre'].max():.3f})") + if etd1 is not None and "sigma_par_centre" in etd1.files: + ax.plot(t_etd1 / period, etd1["sigma_par_centre"], "-", color="#17becf", lw=2.0, + label=f"ETD-1 (peak |σ_∥|={etd1['sigma_par_centre'].max():.3f}) ★") + if bdf2 is not None: + t_b2, sII_b2, uy_b2, spar_b2 = bdf2 + ax.plot(t_b2 / period, spar_b2, "x-", color="#ff7f0e", + label=f"BDF-2 → blow-up (peak |σ_∥|={spar_b2.max():.3f})") + ax.axhline(TAU_Y, color="#888888", lw=0.8, linestyle="--", + label=rf"$\tau_y={TAU_Y}$") + ax.set_ylabel(r"centre $|\sigma_\parallel|$ (resolved fault shear)") + ax.legend(loc="upper right", fontsize=9) + ax.grid(alpha=0.3) + + # Panel 2 — global max |σ|_II(t) + ax = axes[1] + ax.semilogy(t_bdf / period, np.abs(bdf["sigma_II_max_per_step"]), + "-", color="#1f77b4", + label=f"BDF-1 (peak={bdf['sigma_II_max_per_step'].max():.3f})") + ax.semilogy(t_etd / period, np.abs(etd["sigma_II_max_per_step"]), + "-", color="#d62728", + label=f"ETD-2 lumped (peak={etd['sigma_II_max_per_step'].max():.3f})") + if split is not None: + ax.semilogy(t_split / period, 
np.abs(split["sigma_II_max_per_step"]), + "-", color="#2ca02c", + label=f"ETD-2 split (peak={split['sigma_II_max_per_step'].max():.3f})") + if hybrid is not None: + ax.semilogy(t_hybrid / period, np.abs(hybrid["sigma_II_max_per_step"]), + "-", color="#9467bd", + label=f"hybrid (peak={hybrid['sigma_II_max_per_step'].max():.3f})") + if etd1 is not None: + ax.semilogy(t_etd1 / period, np.abs(etd1["sigma_II_max_per_step"]), + "-", color="#17becf", lw=2.0, + label=f"ETD-1 (peak={etd1['sigma_II_max_per_step'].max():.3f}) ★") + if bdf2 is not None: + t_b2, sII_b2, uy_b2, spar_b2 = bdf2 + ax.semilogy(t_b2 / period, sII_b2, "x-", color="#ff7f0e", + label=f"BDF-2 → blow-up (last sample={sII_b2[-1]:.2e})") + ax.axhline(TAU_Y, color="#888888", lw=0.8, linestyle="--", + label=rf"$\tau_y={TAU_Y}$") + ax.set_ylabel(r"max $|\sigma|_{II}$ (log)") + ax.legend(loc="upper left", fontsize=9) + ax.grid(alpha=0.3, which="both") + + # Panel 3 — global max |u_y|(t) + ax = axes[2] + ax.semilogy(t_bdf / period, bdf["u_y_max_per_step"], + "-", color="#1f77b4", + label=f"BDF-1 (peak={bdf['u_y_max_per_step'].max():.3f})") + ax.semilogy(t_etd / period, etd["u_y_max_per_step"], + "-", color="#d62728", + label=f"ETD-2 lumped (peak={etd['u_y_max_per_step'].max():.3f})") + if split is not None: + ax.semilogy(t_split / period, split["u_y_max_per_step"], + "-", color="#2ca02c", + label=f"ETD-2 split (peak={split['u_y_max_per_step'].max():.3f})") + if hybrid is not None: + ax.semilogy(t_hybrid / period, hybrid["u_y_max_per_step"], + "-", color="#9467bd", + label=f"hybrid (peak={hybrid['u_y_max_per_step'].max():.3f})") + if etd1 is not None: + ax.semilogy(t_etd1 / period, etd1["u_y_max_per_step"], + "-", color="#17becf", lw=2.0, + label=f"ETD-1 (peak={etd1['u_y_max_per_step'].max():.3f}) ★") + if bdf2 is not None: + t_b2, sII_b2, uy_b2, spar_b2 = bdf2 + ax.semilogy(t_b2 / period, uy_b2, "x-", color="#ff7f0e", + label=f"BDF-2 → blow-up (last sample={uy_b2[-1]:.2e})") + ax.set_ylabel(r"max $|u_y|$ (log)") 
+ ax.set_xlabel(r"time $t / T$ (periods)") + ax.legend(loc="upper left", fontsize=9) + ax.grid(alpha=0.3, which="both") + + fig.suptitle( + rf"BDF-1 vs ETD-2 lumped/split/hybrid at $\theta=+15^\circ$, $\tau_y={TAU_Y}$, RES=32", + y=0.995, + ) + fig.tight_layout() + + out_png = os.path.join(OUT_DIR, "exp_integrator_phase_b_bdf_vs_etd.png") + fig.savefig(out_png, dpi=140) + plt.close(fig) + print(f" wrote {out_png}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_b_fields.py b/docs/developer/design/_plot_phase_b_fields.py new file mode 100644 index 00000000..0f7d7a95 --- /dev/null +++ b/docs/developer/design/_plot_phase_b_fields.py @@ -0,0 +1,248 @@ +"""Phase B field plots — velocity / strain-rate / stress for yield-active cases. + +Runs the bench_ti_vep_harmonic geometry with ETD-2 for one period at one +or more yield-active (θ, τ_y) combinations, captures the full mesh-variable +fields at peak forcing, and plots velocity vectors + strain-rate magnitude ++ stress magnitude with the spatial τ_y(x) yield zone overlaid. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_plot_phase_b_fields.py +""" + +import os +import time +import sys + +import numpy as np +import sympy +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.tri import Triangulation, LinearTriInterpolator + +import underworld3 as uw +from underworld3.function import expression + + +# Add the design dir to path so we can reuse the killer-test build helpers +_DESIGN_DIR = os.path.dirname(os.path.abspath(__file__)) +if _DESIGN_DIR not in sys.path: + sys.path.insert(0, _DESIGN_DIR) +from _exp_integrator_phase_b_killer import build_ti_exp_stokes # noqa: E402 + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0 +W = 1.0 +FAULT_WIDTH = 0.06 + + +def run_capture_at_yield_peak(theta_deg, tau_y_at_fault, n_periods=2): + """Run ``n_periods`` of the harmonic forcing and capture fields at + the step where the yield-zone σ_II reaches its peak — i.e. when yield + is most active in the fault zone. + + Strategy: run forward, after each step record σ_II_max in the + fault-zone mask AND a snapshot of the full state; at the end pick + the step with the largest in-fault σ_II to plot. 
+ """ + label = f"fields_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + mesh, stokes, V_top, n_vec = build_ti_exp_stokes(label, theta_deg, tau_y_at_fault) + cm = stokes.constitutive_model + DFDt = stokes.Unknowns.DFDt + + E_sym = stokes.Unknowns.E + n_x, n_y = n_vec + cx, cy = 0.5 * W, 0.5 * H + sigma_coords = DFDt.psi_star[0].coords + # fault-zone mask: signed distance to fault line ≤ 1.5·FAULT_WIDTH + sd = np.abs((sigma_coords[:, 0] - cx) * n_x + (sigma_coords[:, 1] - cy) * n_y) + fault_mask = sd < 1.5 * FAULT_WIDTH + + # τ_y(x) field — constant in time, evaluate once + ty_field = np.asarray( + uw.function.evaluate(cm.Parameters.yield_stress.sym, sigma_coords) + ).flatten() + + T_END = n_periods * 2.0 * np.pi / OMEGA + snapshots = [] # list of dicts captured each step + snapshot_metrics = [] # [(t, sigma_II_in_fault_max), ...] + t_cur = 0.0 + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + # Track σ_II in fault zone, and discount the initial transient + # (first half-period where σ is still ramping from zero) by + # only considering steps after t > T_period. 
+ t_recordable = t_end_step > 2.0 * np.pi / OMEGA + in_fault_max = float(sigma_II[fault_mask].max()) if fault_mask.any() else 0.0 + snapshot_metrics.append((t_end_step, in_fault_max, t_recordable)) + + # Capture every step (cheap) — snapshot for the chosen step at end + # Strain rate (eval is the expensive bit, do it inline) + edot_xx = np.asarray(uw.function.evaluate(E_sym[0, 0], sigma_coords)).flatten() + edot_xy = np.asarray(uw.function.evaluate(E_sym[0, 1], sigma_coords)).flatten() + edot_yy = np.asarray(uw.function.evaluate(E_sym[1, 1], sigma_coords)).flatten() + edot_II = np.sqrt(0.5 * (edot_xx ** 2 + edot_yy ** 2 + 2 * edot_xy ** 2)) + u_arr = np.asarray(stokes.u.array) + snapshots.append(dict( + t=t_end_step, + v_top=v_now, + u_coords=stokes.u.coords.copy(), + u=u_arr.reshape(-1, 2).copy(), + sigma_arr=sigma_arr.copy(), + sigma_II=sigma_II.copy(), + edot_II=edot_II, + edot_xy=edot_xy, + )) + t_cur = t_end_step + + # Pick the step (after the first period) with the biggest in-fault σ_II. 
+ candidates = [(t, m) for (t, m, ok) in snapshot_metrics if ok] + if not candidates: + # Fallback: take the last step + chosen = snapshots[-1] + else: + idx = int(np.argmax([m for (_, m) in candidates])) + # candidates index → snapshot index (count of recordable + initial transient) + first_recordable = next( + i for i, sm in enumerate(snapshot_metrics) if sm[2] + ) + chosen = snapshots[first_recordable + idx] + chosen.update( + theta_deg=theta_deg, + tau_y_at_fault=tau_y_at_fault, + n_vec=n_vec, + sigma_coords=sigma_coords, + ty_field=ty_field, + fault_mask=fault_mask, + T_END=T_END, + ) + chosen["yield_ratio"] = chosen["sigma_II"] / np.maximum(ty_field, 1e-30) + chosen["sigma_xy"] = chosen["sigma_arr"][:, 0, 1] + chosen["sigma_xx"] = chosen["sigma_arr"][:, 0, 0] + chosen["sigma_yy"] = chosen["sigma_arr"][:, 1, 1] + + print( + f" picked step at t={chosen['t']:.3f} (V_top={chosen['v_top']:+.4f}); " + f"max in-fault σ_II = {float(chosen['sigma_II'][fault_mask].max()):.4f} " + f"(τ_y_centre={tau_y_at_fault}, ratio " + f"{float(chosen['sigma_II'][fault_mask].max())/tau_y_at_fault:.3f}·τ_y)", + flush=True, + ) + return chosen + + +def plot_one(snapshot, out_path): + th = snapshot["theta_deg"] + ty_fault = snapshot["tau_y_at_fault"] + n_x, n_y = snapshot["n_vec"] + cx, cy = 0.5 * W, 0.5 * H + + sx, sy = snapshot["sigma_coords"][:, 0], snapshot["sigma_coords"][:, 1] + tri = Triangulation(sx, sy) + + ux, uy = snapshot["u_coords"][:, 0], snapshot["u_coords"][:, 1] + u_x, u_y = snapshot["u"][:, 0], snapshot["u"][:, 1] + + fig, axes = plt.subplots(2, 2, figsize=(13, 11), sharex=True, sharey=True) + + # ---- Top-left: velocity field with fault overlay ------------------ + ax = axes[0, 0] + speed = np.sqrt(u_x ** 2 + u_y ** 2) + ax.tricontourf( + Triangulation(ux, uy), speed, levels=24, cmap="Blues", alpha=0.7, + ) + # Subsample for arrows (~every 2nd node) + sub = slice(None, None, 4) + ax.quiver( + ux[sub], uy[sub], u_x[sub], u_y[sub], + scale=8.0, width=0.0035, 
color="0.2", alpha=0.85, + ) + _overlay_fault(ax, snapshot) + ax.set_title("velocity field (arrows + |u| heatmap)") + ax.set_aspect("equal") + + # ---- Top-right: |ε̇|_II --------------------------------------------- + ax = axes[0, 1] + cax = ax.tricontourf(tri, snapshot["edot_II"], levels=20, cmap="viridis") + fig.colorbar(cax, ax=ax, fraction=0.040, pad=0.02) + _overlay_fault(ax, snapshot) + ax.set_title(r"$|\dot\varepsilon|_{II}$ (strain-rate 2nd invariant)") + ax.set_aspect("equal") + + # ---- Bottom-left: |σ|_II with τ_y(x) contour ------------------------ + ax = axes[1, 0] + cax = ax.tricontourf(tri, snapshot["sigma_II"], levels=20, cmap="magma") + fig.colorbar(cax, ax=ax, fraction=0.040, pad=0.02) + # Contour the τ_y(x) field at a few values to show the fault + tri_full = Triangulation(sx, sy) + ax.tricontour( + tri_full, snapshot["ty_field"], + levels=[0.5, 1.0, 5.0, 50.0], colors="cyan", linewidths=0.7, alpha=0.7, + ) + _overlay_fault(ax, snapshot, color="white") + ax.set_title(r"$|\sigma|_{II}$ — cyan: $\tau_y(x)$ contours (0.5, 1, 5, 50)") + ax.set_aspect("equal") + + # ---- Bottom-right: yield ratio σ_II / τ_y(x) ------------------------ + ax = axes[1, 1] + ratio = snapshot["yield_ratio"] + levels = np.linspace(0, 1.2, 25) + cax = ax.tricontourf(tri, np.clip(ratio, 0, 1.2), levels=levels, cmap="RdYlGn_r") + fig.colorbar(cax, ax=ax, fraction=0.040, pad=0.02, label=r"$|\sigma|_{II}/\tau_y(x)$") + ax.tricontour(tri, ratio, levels=[1.0], colors="black", linewidths=1.2) + _overlay_fault(ax, snapshot) + ax.set_title(r"yield activation: $|\sigma|_{II}/\tau_y(x)$ (black contour at 1)") + ax.set_aspect("equal") + + fig.suptitle( + f"ETD-2 fields at yield-active step " + f"(t={snapshot['t']:.2f}, V_top={snapshot['v_top']:+.3f}) — " + f"θ={th:+.0f}°, fault τ_y={ty_fault}, " + f"max |σ_II/τ_y| = {ratio.max():.3f}", + fontsize=12, y=0.995, + ) + fig.tight_layout() + fig.savefig(out_path, dpi=130, bbox_inches="tight") + plt.close(fig) + print(f"wrote {out_path}", 
flush=True) + + +def _overlay_fault(ax, snap, color="red"): + """Draw the fault line and the 1·FAULT_WIDTH band.""" + n_x, n_y = snap["n_vec"] + cx, cy = 0.5 * W, 0.5 * H + # Fault segment endpoints (length 0.6 like FAULT_LENGTH in the bench) + L = 0.6 + t_x, t_y = n_y, -n_x # tangent + p1 = (cx - 0.5 * L * t_x, cy - 0.5 * L * t_y) + p2 = (cx + 0.5 * L * t_x, cy + 0.5 * L * t_y) + ax.plot([p1[0], p2[0]], [p1[1], p2[1]], color=color, lw=1.5, alpha=0.85) + + +def main(): + os.makedirs("output", exist_ok=True) + cases = [(0.0, 0.15), (15.0, 0.15)] + for theta, ty in cases: + print(f"\n=== θ={theta:+.0f}°, τ_y={ty:.2f} ===", flush=True) + t0 = time.time() + snap = run_capture_at_yield_peak(theta, ty, n_periods=2) + print(f" ran in {time.time()-t0:.1f}s, max per-node yield ratio = {snap['yield_ratio'].max():.3f}") + out = f"output/exp_integrator_phase_b_fields_th{theta:+.0f}_ty{ty:.2f}".replace(".", "p") + ".png" + plot_one(snap, out) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_b_pyvista.py b/docs/developer/design/_plot_phase_b_pyvista.py new file mode 100644 index 00000000..8de371f2 --- /dev/null +++ b/docs/developer/design/_plot_phase_b_pyvista.py @@ -0,0 +1,492 @@ +"""Phase B PyVista field plots — high-resolution snapshot at yield-active step. + +Runs the bench_ti_vep_harmonic geometry with ETD-2 at **RES=32** for one +yielding cycle and **checkpoints** the snapshot via ``mesh.write_timestep`` +(HDF5 + XDMF, ParaView-compatible) so we can replot without re-running. +Renders 4-panel PyVista figures using the UW3 ``visualisation`` API. + +Capture-or-load pattern: each case checkpoints to +``output/phase_b_.{mesh, U, sigma, edot_II, ty, sigma_II, yield_ratio}.00000.h5``. +If those files exist, skip the simulation and read back from disk. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_plot_phase_b_pyvista.py + +Force re-capture:: + + rm output/phase_b_*.h5 output/phase_b_*.xdmf + pixi run -e amr-dev python -u docs/developer/design/_plot_phase_b_pyvista.py +""" + +import os +import sys +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression +import underworld3.visualisation as vis + + +# Geometric parameters (kept aligned with the killer test, but at RES=32) +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def _key(theta_deg, tau_y_at_fault): + return f"phase_b_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + +def _meta_path(key): + return os.path.join(OUT_DIR, key + ".meta.npz") + + +# --------------------------------------------------------------------------- +# Build a fresh model + plotting variables for a given (θ, τ_y). +# Used by both the capture path and the load path so the mesh+var +# discretisation is byte-identical. 
+# --------------------------------------------------------------------------- + +def build_model(theta_deg, tau_y_at_fault, label_suffix=""): + label = _key(theta_deg, tau_y_at_fault) + label_suffix + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + + # Solver variables (degree=2 / degree=1) + u = uw.discretisation.MeshVariable( + f"U_{label}", mesh, 2, degree=2, vtype=VarType.VECTOR, + ) + p_sol = uw.discretisation.MeshVariable( + f"P_{label}", mesh, 1, degree=1, + continuous=True, vtype=VarType.SCALAR, + ) + + # Fault geometry / spatial yield_stress field + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + # Scalar mesh variables for the four post-solve plottable fields. + # Same degree=1 / continuous so they share the canonical mesh nodes. 
+ edot_II_var = uw.discretisation.MeshVariable( + f"edotII_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + tau_y_var = uw.discretisation.MeshVariable( + f"tauy_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + sigma_II_var = uw.discretisation.MeshVariable( + f"sigmaII_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + yield_ratio_var = uw.discretisation.MeshVariable( + f"yieldRatio_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + + # Solver (only built when capturing) + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, integrator="etd", + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + # Iteration monitoring (only printed during the capture phase; load + # path doesn't run the solver). 
+ if os.environ.get("UW_SNES_MONITOR", "0") == "1": + stokes.petsc_options["snes_monitor"] = None + stokes.petsc_options["snes_converged_reason"] = None + stokes.petsc_options["snes_max_it"] = 50 + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + return dict( + mesh=mesh, stokes=stokes, + u=u, V_top=V_top, + edot_II_var=edot_II_var, tau_y_var=tau_y_var, + sigma_II_var=sigma_II_var, yield_ratio_var=yield_ratio_var, + n_vec=np.array([n_x, n_y]), + ) + + +# --------------------------------------------------------------------------- +# Capture: run the sim, project plottable fields, write_timestep +# --------------------------------------------------------------------------- + +def capture(theta_deg, tau_y_at_fault, n_periods=1.5): + """Run + checkpoint a yield-active snapshot via mesh.write_timestep.""" + obj = build_model(theta_deg, tau_y_at_fault, label_suffix="_cap") + mesh = obj["mesh"]; stokes = obj["stokes"] + u = obj["u"] + V_top = obj["V_top"] + edot_II_var = obj["edot_II_var"] + tau_y_var = obj["tau_y_var"] + sigma_II_var = obj["sigma_II_var"] + yield_ratio_var = obj["yield_ratio_var"] + cm = stokes.constitutive_model + DFDt = stokes.Unknowns.DFDt + + sigma_coords = DFDt.psi_star[0].coords + n_x, n_y = obj["n_vec"]; cx, cy = 0.5 * W, 0.5 * H + sd = np.abs((sigma_coords[:, 0] - cx) * n_x + (sigma_coords[:, 1] - cy) * n_y) + fault_mask = sd < 1.5 * FAULT_WIDTH + E_sym = stokes.Unknowns.E + ty_at_psi_coords = np.asarray( + uw.function.evaluate(cm.Parameters.yield_stress.sym, sigma_coords) + ).flatten() + + T_END = n_periods * 2.0 * np.pi / OMEGA + best = None # (in_fault_max, step_index) + saved = [] # full state history so we can rewind to the chosen step + iters = [] 
# SNES iteration count per step + reasons = [] # SNES convergence reason per step + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + # post-transient window + recordable = t_end_step > 0.5 * 2.0 * np.pi / OMEGA + in_fault_max = float(sigma_II[fault_mask].max()) if fault_mask.any() else 0.0 + if recordable and (best is None or in_fault_max > best[0]): + # Snapshot current state + best = (in_fault_max, len(saved)) + # Snapshot of solver+history state (cheap: arrays) + saved.append(dict( + t=t_end_step, v_top=v_now, + u_arr=np.asarray(u.array).copy(), + sigma_arr=sigma_arr.copy(), + sigma_II=sigma_II.copy(), + )) + t_cur = t_end_step + + if best is None: + # Edge case: didn't reach the post-transient window + best = (saved[-1]["sigma_II"].max(), len(saved) - 1) + chosen = saved[best[1]] + + # Replant chosen state (so subsequent eval calls see the right u for ε̇) + u.array[...] = chosen["u_arr"] + DFDt.psi_star[0].array[...] = chosen["sigma_arr"] + u._sync_lvec_to_gvec() + + # Project the four scalar fields onto the plotting mesh variables. + # Use direct nodal evaluation (degree=1 continuous nodes). 
+ plot_coords = edot_II_var.coords + edot_xx = np.asarray(uw.function.evaluate(E_sym[0, 0], plot_coords)).flatten() + edot_xy = np.asarray(uw.function.evaluate(E_sym[0, 1], plot_coords)).flatten() + edot_yy = np.asarray(uw.function.evaluate(E_sym[1, 1], plot_coords)).flatten() + edot_II_at_plot = np.sqrt(0.5 * (edot_xx ** 2 + edot_yy ** 2 + 2 * edot_xy ** 2)) + edot_II_var.array[:, 0, 0] = edot_II_at_plot + + ty_at_plot = np.asarray( + uw.function.evaluate(cm.Parameters.yield_stress.sym, plot_coords) + ).flatten() + tau_y_var.array[:, 0, 0] = ty_at_plot + + # σ_II at plot nodes — interpolate from psi_star[0] coords via kd-tree. + # psi_star[0] degree = u.degree-1 = 1 → typically same nodes as plot + # mesh, but in general we use uw.function.evaluate on psi_star[0].sym + # for safety. + sigma_sym_II = sympy.sqrt((DFDt.psi_star[0].sym * DFDt.psi_star[0].sym).trace() / 2) + try: + sigma_II_at_plot = np.asarray( + uw.function.evaluate(sigma_sym_II, plot_coords) + ).flatten() + except Exception: + # Fallback — kd-tree interpolation from psi_star coords + from underworld3.kdtree import KDTree + tree = KDTree(np.asarray(sigma_coords)) + sigma_II_at_plot = tree.rbf_interpolator_local( + plot_coords, chosen["sigma_II"][:, None], 4, 2, + ).flatten() + sigma_II_var.array[:, 0, 0] = sigma_II_at_plot + + yield_ratio_var.array[:, 0, 0] = sigma_II_at_plot / np.maximum(ty_at_plot, 1e-30) + + # Write the checkpoint + key = _key(theta_deg, tau_y_at_fault) + os.makedirs(OUT_DIR, exist_ok=True) + mesh.write_timestep( + key, index=0, outputPath=OUT_DIR, + meshVars=[u, edot_II_var, tau_y_var, sigma_II_var, yield_ratio_var], + create_xdmf=True, + ) + # Also the raw stress (rank-2 sym tensor) — psi_star[0] is on the + # solver's DDt, save by writing its underlying mesh-variable + DFDt.psi_star[0].write( + os.path.join(OUT_DIR, key + ".mesh.sigma.00000.h5") + ) + + iters_arr = np.array(iters) + reasons_arr = np.array(reasons) + metadata = dict( + theta_deg=theta_deg, + 
tau_y_at_fault=tau_y_at_fault, + n_x=float(n_x), n_y=float(n_y), + t=float(chosen["t"]), v_top=float(chosen["v_top"]), + T_END=float(T_END), RES=int(RES), + wall_seconds=float(time.time() - t0), + max_in_fault_sigma_II=float(best[0]), + n_steps=len(saved), + iters=iters_arr, # SNES iteration count per step + reasons=reasons_arr, # SNES convergence reason per step (>0 OK) + ) + np.savez(os.path.join(OUT_DIR, key + ".meta.npz"), **metadata) + + n_diverged = int((reasons_arr < 0).sum()) + print( + f" ran {len(saved)} steps in {metadata['wall_seconds']:.1f}s; " + f"chose t={metadata['t']:.3f}, V_top={metadata['v_top']:+.4f}; " + f"max in-fault σ_II = {metadata['max_in_fault_sigma_II']:.4f} " + f"({metadata['max_in_fault_sigma_II']/tau_y_at_fault:.3f}·τ_y_centre); " + f"checkpointed → {OUT_DIR}/{key}.*", + flush=True, + ) + print( + f" SNES iterations per step: mean={iters_arr.mean():.1f} " + f"median={int(np.median(iters_arr))} max={iters_arr.max()} " + f"diverged_steps={n_diverged}/{len(reasons_arr)}", + flush=True, + ) + + +# --------------------------------------------------------------------------- +# Load: rebuild mesh + variables, read_timestep into them, return them +# --------------------------------------------------------------------------- + +def load_into_fresh_model(theta_deg, tau_y_at_fault): + obj = build_model(theta_deg, tau_y_at_fault, label_suffix="_load") + key = _key(theta_deg, tau_y_at_fault) + # The plotting mesh variables — ``read_timestep`` interpolates from the + # checkpointed coords to the current mesh's coords (kd-tree RBF). 
+ obj["edot_II_var"].read_timestep( + key, obj["edot_II_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["tau_y_var"].read_timestep( + key, obj["tau_y_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["sigma_II_var"].read_timestep( + key, obj["sigma_II_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["yield_ratio_var"].read_timestep( + key, obj["yield_ratio_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["u"].read_timestep( + key, obj["u"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + meta = dict(np.load(os.path.join(OUT_DIR, key + ".meta.npz"))) + obj["meta"] = {k: (v.item() if v.ndim == 0 else v) for k, v in meta.items()} + return obj + + +# --------------------------------------------------------------------------- +# Plot via UW3 visualisation (PyVista) +# --------------------------------------------------------------------------- + +def plot_panels(obj, out_path, off_screen=True): + import pyvista as pv + + pv.global_theme.background = "white" + pv.global_theme.anti_aliasing = "ssaa" + + mesh = obj["mesh"] + u = obj["u"] + sII = obj["sigma_II_var"] + eII = obj["edot_II_var"] + yr = obj["yield_ratio_var"] + ty = obj["tau_y_var"] + meta = obj["meta"] + + # Build PV mesh once + add scalar fields as point_data + pvmesh = vis.mesh_to_pv_mesh(mesh) + pvmesh.point_data["sigma_II"] = vis.scalar_fn_to_pv_points(pvmesh, sII.sym) + pvmesh.point_data["edot_II"] = vis.scalar_fn_to_pv_points(pvmesh, eII.sym) + pvmesh.point_data["yield_ratio"] = np.clip( + vis.scalar_fn_to_pv_points(pvmesh, yr.sym), 0.0, 1.5, + ) + pvmesh.point_data["tau_y"] = vis.scalar_fn_to_pv_points(pvmesh, ty.sym) + + # Velocity arrows from the velocity-degree variable + u_cloud = vis.meshVariable_to_pv_cloud(u) + u_cloud.point_data["u"] = vis.vector_fn_to_pv_points(u_cloud, u.sym) + u_speed = np.linalg.norm(u_cloud.point_data["u"][:, 
:2], axis=1) + u_cloud.point_data["|u|"] = u_speed + # u_y as a separate scalar — the BC drives a horizontal shear so u_x + # is dominant everywhere; u_y concentrates where the fault forces a + # rotation of the velocity field toward the fault tangent direction. + # Plot u_y as the panel-1 colormap to make that pattern visible. + pvmesh.point_data["u_y"] = vis.scalar_fn_to_pv_points(pvmesh, u.sym[1]) + pvmesh.point_data["|u|"] = vis.scalar_fn_to_pv_points( + pvmesh, sympy.sqrt(u.sym.dot(u.sym)) + ) + + # Fault line for overlay + n_x = float(meta["n_x"]); n_y = float(meta["n_y"]) + cx, cy = 0.5 * W, 0.5 * H + L = FAULT_LENGTH + t_x, t_y = n_y, -n_x + fault_line = pv.Line( + (cx - 0.5 * L * t_x, cy - 0.5 * L * t_y, 0.0), + (cx + 0.5 * L * t_x, cy + 0.5 * L * t_y, 0.0), + ) + + pl = pv.Plotter(off_screen=off_screen, shape=(2, 2), + window_size=(1500, 1400), border=True) + + def _common(p): + p.view_xy() + p.camera.parallel_projection = True + p.add_mesh(fault_line, color="red", line_width=4) + + # Velocity — u_y heatmap (small but reveals fault-induced rotation) + # with full-vector arrows on top. 
+ pl.subplot(0, 0) + uy_max = float(np.max(np.abs(pvmesh.point_data["u_y"]))) + pl.add_mesh( + pvmesh, scalars="u_y", cmap="seismic", + clim=(-uy_max, uy_max), + show_scalar_bar=True, scalar_bar_args={"title": "u_y"}, + ) + sub = max(1, len(u_cloud.points) // 250) + pl.add_arrows(u_cloud.points[::sub], u_cloud.point_data["u"][::sub], + mag=0.35, color="#333333") + pl.add_text( + "velocity: u_y heatmap (+arrows show full u)", + position="upper_edge", font_size=11, color="black", + ) + _common(pl) + + # |ε̇|_II + pl.subplot(0, 1) + pl.add_mesh(pvmesh, scalars="edot_II", cmap="viridis", + show_scalar_bar=True, scalar_bar_args={"title": "|ε̇|_II"}) + pl.add_text("|ε̇|_II", position="upper_edge", font_size=12, color="black") + _common(pl) + + # |σ|_II + pl.subplot(1, 0) + pl.add_mesh(pvmesh, scalars="sigma_II", cmap="magma", + show_scalar_bar=True, scalar_bar_args={"title": "|σ|_II"}) + ty_levels = [meta["tau_y_at_fault"] * f for f in (4.0, 20.0, 100.0)] + contours = pvmesh.contour(isosurfaces=ty_levels, scalars="tau_y") + if contours.n_points > 0: + pl.add_mesh(contours, color="cyan", line_width=1.2) + pl.add_text("|σ|_II — cyan: τ_y(x) contours", + position="upper_edge", font_size=12, color="black") + _common(pl) + + # σ/τ_y ratio with active surface contour + pl.subplot(1, 1) + pl.add_mesh(pvmesh, scalars="yield_ratio", cmap="RdYlGn_r", + clim=(0.0, 1.2), + show_scalar_bar=True, scalar_bar_args={"title": "|σ|_II / τ_y(x)"}) + yc = pvmesh.contour(isosurfaces=[1.0], scalars="yield_ratio") + if yc.n_points > 0: + pl.add_mesh(yc, color="black", line_width=2.0) + pl.add_text("yield activation", + position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.add_text( + f"ETD-2, RES={int(meta['RES'])}, θ={meta['theta_deg']:+.0f}°, " + f"τ_y_fault={meta['tau_y_at_fault']} " + f"(t={meta['t']:.2f}, V_top={meta['v_top']:+.3f})", + position="lower_edge", font_size=10, color="black", + ) + + pl.screenshot(out_path, scale=1.5) + pl.close() + print(f" wrote 
{out_path}", flush=True) + + +# --------------------------------------------------------------------------- +# Driver +# --------------------------------------------------------------------------- + +def capture_or_load(theta_deg, tau_y_at_fault, n_periods=1.5): + if os.path.exists(_meta_path(_key(theta_deg, tau_y_at_fault))): + print(f" cache hit: {_key(theta_deg, tau_y_at_fault)}.* — skipping run", + flush=True) + else: + print(f" cache miss: running capture", flush=True) + capture(theta_deg, tau_y_at_fault, n_periods=n_periods) + return load_into_fresh_model(theta_deg, tau_y_at_fault) + + +def main(): + cases = [(0.0, 0.15), (15.0, 0.15), (0.0, 0.05), (15.0, 0.05)] + for theta, ty in cases: + print(f"\n=== θ={theta:+.0f}°, τ_y={ty:.2f} ===", flush=True) + obj = capture_or_load(theta, ty, n_periods=1.5) + out = os.path.join( + OUT_DIR, + f"exp_integrator_phase_b_pyvista_th{theta:+.0f}_ty{ty:.2f}".replace(".", "p") + + ".png", + ) + plot_panels(obj, out) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_b_results.py b/docs/developer/design/_plot_phase_b_results.py new file mode 100644 index 00000000..fc1f7ab3 --- /dev/null +++ b/docs/developer/design/_plot_phase_b_results.py @@ -0,0 +1,153 @@ +"""Phase B comparison plots — ETD-2 vs BDF-1 vs BDF-2. + +Produces side-by-side panels reading the saved npz traces: +- ``output/benchmarks/ve_harmonic.npz`` (BDF-1, BDF-2 + analytical) +- ``output/exp_integrator_phase_b_ve_harmonic.npz`` (ETD-2 + analytical) + +Outputs PNGs in ``output/exp_integrator_phase_b_*.png``. 
+ +Run:: + + pixi run -e amr-dev python -u docs/developer/design/_plot_phase_b_results.py +""" + +import os +import numpy as np +import matplotlib + +matplotlib.use("Agg") # non-interactive — write files only +import matplotlib.pyplot as plt + + +C_ANA = "#222222" +C_BDF1 = "#1f77b4" +C_BDF2 = "#d62728" +C_ETD = "#2ca02c" + + +def plot_ve_harmonic(): + bdf = np.load("output/benchmarks/ve_harmonic.npz", allow_pickle=True) + etd = np.load("output/exp_integrator_phase_b_ve_harmonic.npz", allow_pickle=True) + + # Both runs share the same time grid and analytical reference; sanity-check. + t_bdf = bdf["arr_times"] + t_etd = etd["times"] + sigma_ana_bdf = bdf["arr_sigma_ana"] + sigma_ana_etd = etd["sigma_ana"] + sigma_bdf1 = bdf["arr_sigma_bdf1"] + sigma_bdf2 = bdf["arr_sigma_bdf2"] + sigma_etd2 = etd["sigma_exp"] + + assert np.allclose(t_bdf, t_etd), "Time grids differ between runs" + + err_bdf1 = np.abs(sigma_bdf1 - sigma_ana_bdf) + err_bdf2 = np.abs(sigma_bdf2 - sigma_ana_bdf) + err_etd2 = np.abs(sigma_etd2 - sigma_ana_etd) + + A_inf = float(etd["A_inf"]) + + fig, (ax_s, ax_e) = plt.subplots( + 2, 1, figsize=(10.5, 7.0), sharex=True, + gridspec_kw={"height_ratios": [2.0, 1.2]}, + ) + + # σ trace panel + ax_s.plot(t_bdf, sigma_ana_bdf, "-", color=C_ANA, lw=1.5, label="analytical") + ax_s.plot(t_bdf, sigma_bdf1, ".", color=C_BDF1, ms=4, alpha=0.7, + label=f"BDF-1 (max|err|={err_bdf1.max():.2e})") + ax_s.plot(t_bdf, sigma_bdf2, ".", color=C_BDF2, ms=4, alpha=0.7, + label=f"BDF-2 (max|err|={err_bdf2.max():.2e})") + ax_s.plot(t_etd, sigma_etd2, ".", color=C_ETD, ms=4, alpha=0.85, + label=f"ETD-2 (max|err|={err_etd2.max():.2e})") + ax_s.set_ylabel(r"$\sigma_{xy}$ at centre") + ax_s.axhline(0, color="0.7", lw=0.6, zorder=0) + ax_s.axhline(+A_inf, color="0.6", lw=0.6, ls="--", zorder=0) + ax_s.axhline(-A_inf, color="0.6", lw=0.6, ls="--", zorder=0) + ax_s.set_title( + "bench_ve_harmonic (peak-start IC, ω=π/2, dt=0.05) — ETD-2 vs BDF-1, BDF-2" + ) + ax_s.legend(loc="upper 
right", fontsize=9, framealpha=0.85) + ax_s.grid(True, alpha=0.3) + + # |error| panel (semilog) + ax_e.semilogy(t_bdf, err_bdf1 + 1e-16, "-", color=C_BDF1, lw=0.9, alpha=0.85, label="BDF-1") + ax_e.semilogy(t_bdf, err_bdf2 + 1e-16, "-", color=C_BDF2, lw=0.9, alpha=0.85, label="BDF-2") + ax_e.semilogy(t_etd, err_etd2 + 1e-16, "-", color=C_ETD, lw=1.1, alpha=0.95, label="ETD-2") + ax_e.set_xlabel("t") + ax_e.set_ylabel(r"$|\sigma - \sigma_\mathrm{ana}|$") + ax_e.legend(loc="upper right", fontsize=9, framealpha=0.85) + ax_e.grid(True, alpha=0.3, which="both") + + fig.tight_layout() + out = "output/exp_integrator_phase_b_ve_harmonic.png" + fig.savefig(out, dpi=130, bbox_inches="tight") + plt.close(fig) + print(f"wrote {out}", flush=True) + print(f" BDF-1 max|err|={err_bdf1.max():.4e} rms={np.sqrt((err_bdf1**2).mean()):.4e}") + print(f" BDF-2 max|err|={err_bdf2.max():.4e} rms={np.sqrt((err_bdf2**2).mean()):.4e}") + print(f" ETD-2 max|err|={err_etd2.max():.4e} rms={np.sqrt((err_etd2**2).mean()):.4e}") + + +def plot_killer_summary(): + """Killer-test summary: bar chart of |τ_resolved|/τ_y per (θ, τ_y) for ETD-2 and BDF-1.""" + # Hard-coded from the BDF-1 production npz files (already validated centre probes + # earlier in the session) and the latest ETD-2 sweep. 
+ cases = [ + # (theta_deg, tau_y, etd_tau_res_ratio, bdf1_tau_res_ratio, bdf2_tau_res_ratio_log10) + (0, 0.15, 1.103, 1.122, np.log10(5.689)), + (15, 0.15, 1.118, 1.143, np.log10(2.157e9)), + (-15, 0.15, 1.120, 1.127, np.log10(6.889e7)), + (0, 0.30, 0.922, 1.150, np.log10(9.620)), + (15, 0.30, 0.804, 1.139, np.log10(9.091e9)), + (-15, 0.30, 0.803, 1.138, np.log10(1.859e8)), + ] + labels = [f"θ={c[0]:+}°,\nτ_y={c[1]}" for c in cases] + etd_ratios = [c[2] for c in cases] + bdf1_ratios = [c[3] for c in cases] + bdf2_log10 = [c[4] for c in cases] + x = np.arange(len(cases)) + + fig, (ax_main, ax_bdf2) = plt.subplots( + 2, 1, figsize=(10.5, 7.0), sharex=True, + gridspec_kw={"height_ratios": [2.2, 1.0]}, + ) + + width = 0.36 + ax_main.bar(x - width / 2, bdf1_ratios, width, color=C_BDF1, alpha=0.85, label="BDF-1 (production)") + ax_main.bar(x + width / 2, etd_ratios, width, color=C_ETD, alpha=0.9, label="ETD-2 (this work)") + ax_main.axhline(1.0, color="0.4", lw=0.8, ls="--", zorder=0, label=r"$\tau_y$") + ax_main.axhline(1.20, color="0.7", lw=0.8, ls=":", zorder=0, label=r"gate (1.20·$\tau_y$)") + ax_main.set_ylabel(r"$|\tau_\mathrm{resolved}|$ at fault centre / $\tau_y$") + ax_main.set_title( + "bench_ti_vep_harmonic killer test — ETD-2 vs BDF-1 (centre probe, 6/6 PASS)" + ) + ax_main.legend(loc="upper right", fontsize=9, framealpha=0.85) + ax_main.grid(True, alpha=0.3, axis="y") + ax_main.set_ylim(0.0, 1.4) + + # BDF-2 |σ_xy| log-blow-up panel — BDF-2 is the integrator ETD-2 *replaces* + ax_bdf2.bar(x, bdf2_log10, color=C_BDF2, alpha=0.85, label=r"BDF-2 $\log_{10}|\sigma_{xy}|$ (centre)") + ax_bdf2.axhline(np.log10(1.5), color="0.4", lw=0.8, ls="--", zorder=0, + label=r"$\log_{10}(1.5\cdot\tau_y\sim O(1))$") + ax_bdf2.set_xticks(x) + ax_bdf2.set_xticklabels(labels, fontsize=9) + ax_bdf2.set_ylabel(r"$\log_{10}|\sigma_{xy}|$ at fault centre") + ax_bdf2.legend(loc="upper right", fontsize=9, framealpha=0.85) + ax_bdf2.grid(True, alpha=0.3, axis="y") + 
ax_bdf2.set_title("BDF-2: blows up to 10⁵–10⁹ on every yield-active combo") + + fig.tight_layout() + out = "output/exp_integrator_phase_b_killer_summary.png" + fig.savefig(out, dpi=130, bbox_inches="tight") + plt.close(fig) + print(f"wrote {out}", flush=True) + + +def main(): + os.makedirs("output", exist_ok=True) + plot_ve_harmonic() + plot_killer_summary() + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_d_pyvista_split.py b/docs/developer/design/_plot_phase_d_pyvista_split.py new file mode 100644 index 00000000..d9d135b9 --- /dev/null +++ b/docs/developer/design/_plot_phase_d_pyvista_split.py @@ -0,0 +1,413 @@ +"""Phase D PyVista field plots — split-ETD-2 with explicit-parallel η_∥. + +Identical pattern to ``_plot_phase_b_pyvista.py`` but uses +``TransverseIsotropicVEPSplitFlowModel`` (Phase D split + lag, with the +forcing_star-based η_∥ used for both α_∥/φ_∥ and the C_∥ multiplier). + +Captures the yield-active step from a 1.5-period run at θ=+15°, τ_y=0.05 +(also τ_y=0.15 for the easier baseline) and renders the same 4-panel +PyVista figure (u_y heatmap, |ε̇|_II, |σ|_II, yield_ratio) so we can +compare the field structure directly to the BDF/lumped Phase B plots. 

Run::

    pixi run -e amr-dev python -u docs/developer/design/_plot_phase_d_pyvista_split.py
"""

import os
import time

import numpy as np
import sympy

import underworld3 as uw
from underworld3 import VarType
from underworld3.function import expression
import underworld3.visualisation as vis


# Forcing / geometry / material parameters (match the Phase B scripts).
V0 = 0.5
OMEGA = np.pi / 2.0
DT = 0.05
H = 1.0; W = 1.0
FAULT_LENGTH = 0.6
FAULT_WIDTH = 0.06
ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0
TAU_Y_BULK = 200.0
RES = 32

OUT_DIR = "output"


def _key(theta_deg, tau_y_at_fault):
    # Filesystem-safe case label, e.g. "phase_d_split_th+15_ty0p05".
    return f"phase_d_split_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p")


def _meta_path(key):
    # Per-case metadata archive; its existence is the cache sentinel.
    return os.path.join(OUT_DIR, key + ".meta.npz")


def build_model(theta_deg, tau_y_at_fault, label_suffix=""):
    # Assemble the Stokes model with the Phase D split-ETD-2 constitutive law:
    # unit box, an embedded fault segment at angle theta_deg (degrees) with a
    # Gaussian weak zone, and time-varying shear driving on the top boundary.
    # label_suffix keeps variable names distinct between the capture ("_cap")
    # and reload ("_load") model instances.
    label = _key(theta_deg, tau_y_at_fault) + label_suffix

    mesh = uw.meshing.StructuredQuadBox(
        elementRes=(RES, RES),
        minCoords=(0.0, 0.0), maxCoords=(W, H),
        qdegree=3,
    )

    u = uw.discretisation.MeshVariable(
        f"U_{label}", mesh, 2, degree=2, vtype=VarType.VECTOR,
    )
    p_sol = uw.discretisation.MeshVariable(
        f"P_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR,
    )

    # Fault segment through the box centre, rotated by theta.
    theta = np.radians(theta_deg)
    cx, cy = 0.5 * W, 0.5 * H
    dx = 0.5 * FAULT_LENGTH * np.cos(theta)
    dy = 0.5 * FAULT_LENGTH * np.sin(theta)
    fault = uw.meshing.Surface(
        f"fault_{label}", mesh,
        np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]),
        symbol=f"F{label}",
    )
    fault.discretize()

    # Unit normal to the fault (the TI director). The weakness field blends
    # 1/τ_y from the fault value to the (strong) bulk value with a Gaussian
    # profile of width FAULT_WIDTH; inverting gives the yield-stress field.
    n_x = -np.sin(theta); n_y = np.cos(theta)
    director = sympy.Matrix([n_x, n_y])
    weakness = fault.influence_function(
        width=FAULT_WIDTH,
        value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK,
        profile="gaussian",
    )
    tau_y_field = 1.0 / weakness

    # Degree-1 diagnostic fields populated after the run for plotting.
    edot_II_var = uw.discretisation.MeshVariable(
        f"edotII_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR,
    )
    tau_y_var = uw.discretisation.MeshVariable(
        f"tauy_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR,
    )
    sigma_II_var = uw.discretisation.MeshVariable(
        f"sigmaII_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR,
    )
    yield_ratio_var = uw.discretisation.MeshVariable(
        f"yieldRatio_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR,
    )

    stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol)
    # *** Phase D split-ETD-2 (explicit-parallel) ***
    stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPSplitFlowModel(
        stokes.Unknowns,
    )
    cm = stokes.constitutive_model
    cm.Parameters.shear_viscosity_0 = ETA_0
    cm.Parameters.shear_viscosity_1 = ETA_1
    cm.Parameters.shear_modulus = MU
    cm.Parameters.yield_stress = tau_y_field
    cm.Parameters.director = director
    cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3
    cm.Parameters.strainrate_inv_II_min = 1.0e-6
    cm.yield_mode = "softmin"

    stokes.saddle_preconditioner = 1.0 / cm.K
    stokes.tolerance = 1.0e-4
    stokes.petsc_options["ksp_type"] = "fgmres"
    stokes.petsc_options["snes_force_iteration"] = True

    # Top velocity is a symbolic expression updated every time step by the
    # driver. Left/Right pin only v_y to zero; sympy.oo marks the x
    # component as unconstrained.
    V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC")
    stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top")
    stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom")
    stokes.add_essential_bc((sympy.oo, 0.0), "Left")
    stokes.add_essential_bc((sympy.oo, 0.0), "Right")
    stokes.bodyforce = sympy.Matrix([0.0, 0.0])

    return dict(
        mesh=mesh, stokes=stokes,
        u=u, V_top=V_top,
        edot_II_var=edot_II_var, tau_y_var=tau_y_var,
        sigma_II_var=sigma_II_var, yield_ratio_var=yield_ratio_var,
        n_vec=np.array([n_x, n_y]),
    )


def capture(theta_deg, tau_y_at_fault, n_periods=1.5):
    # Time-step the model for n_periods of the forcing; the best (maximum
    # in-fault σ_II) recordable step is checkpointed for later plotting.
    obj = build_model(theta_deg, tau_y_at_fault, label_suffix="_cap")
    mesh = obj["mesh"]; stokes = obj["stokes"]
    u = obj["u"]; V_top = obj["V_top"]
    edot_II_var = obj["edot_II_var"]
    tau_y_var = obj["tau_y_var"]
    sigma_II_var = obj["sigma_II_var"]
    yield_ratio_var = obj["yield_ratio_var"]
    cm = stokes.constitutive_model
    DFDt
= stokes.Unknowns.DFDt + + sigma_coords = DFDt.psi_star[0].coords + n_x, n_y = obj["n_vec"]; cx, cy = 0.5 * W, 0.5 * H + sd = np.abs((sigma_coords[:, 0] - cx) * n_x + (sigma_coords[:, 1] - cy) * n_y) + fault_mask = sd < 1.5 * FAULT_WIDTH + E_sym = stokes.Unknowns.E + + T_END = n_periods * 2.0 * np.pi / OMEGA + best = None + saved = [] + iters = []; reasons = [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + recordable = t_end_step > 0.5 * 2.0 * np.pi / OMEGA + in_fault_max = float(sigma_II[fault_mask].max()) if fault_mask.any() else 0.0 + if recordable and (best is None or in_fault_max > best[0]): + best = (in_fault_max, len(saved)) + saved.append(dict( + t=t_end_step, v_top=v_now, + u_arr=np.asarray(u.array).copy(), + sigma_arr=sigma_arr.copy(), + sigma_II=sigma_II.copy(), + )) + t_cur = t_end_step + + if best is None: + best = (saved[-1]["sigma_II"].max(), len(saved) - 1) + chosen = saved[best[1]] + u.array[...] = chosen["u_arr"] + DFDt.psi_star[0].array[...] 
= chosen["sigma_arr"] + u._sync_lvec_to_gvec() + + plot_coords = edot_II_var.coords + edot_xx = np.asarray(uw.function.evaluate(E_sym[0, 0], plot_coords)).flatten() + edot_xy = np.asarray(uw.function.evaluate(E_sym[0, 1], plot_coords)).flatten() + edot_yy = np.asarray(uw.function.evaluate(E_sym[1, 1], plot_coords)).flatten() + edot_II_at_plot = np.sqrt(0.5 * (edot_xx ** 2 + edot_yy ** 2 + 2 * edot_xy ** 2)) + edot_II_var.array[:, 0, 0] = edot_II_at_plot + + ty_at_plot = np.asarray( + uw.function.evaluate(cm.Parameters.yield_stress.sym, plot_coords) + ).flatten() + tau_y_var.array[:, 0, 0] = ty_at_plot + + sigma_sym_II = sympy.sqrt( + (DFDt.psi_star[0].sym * DFDt.psi_star[0].sym).trace() / 2 + ) + try: + sigma_II_at_plot = np.asarray( + uw.function.evaluate(sigma_sym_II, plot_coords) + ).flatten() + except Exception: + from underworld3.kdtree import KDTree + tree = KDTree(np.asarray(sigma_coords)) + sigma_II_at_plot = tree.rbf_interpolator_local( + plot_coords, chosen["sigma_II"][:, None], 4, 2, + ).flatten() + sigma_II_var.array[:, 0, 0] = sigma_II_at_plot + + yield_ratio_var.array[:, 0, 0] = sigma_II_at_plot / np.maximum(ty_at_plot, 1e-30) + + key = _key(theta_deg, tau_y_at_fault) + os.makedirs(OUT_DIR, exist_ok=True) + mesh.write_timestep( + key, index=0, outputPath=OUT_DIR, + meshVars=[u, edot_II_var, tau_y_var, sigma_II_var, yield_ratio_var], + create_xdmf=True, + ) + DFDt.psi_star[0].write( + os.path.join(OUT_DIR, key + ".mesh.sigma.00000.h5") + ) + + iters_arr = np.array(iters); reasons_arr = np.array(reasons) + metadata = dict( + theta_deg=theta_deg, + tau_y_at_fault=tau_y_at_fault, + n_x=float(n_x), n_y=float(n_y), + t=float(chosen["t"]), v_top=float(chosen["v_top"]), + T_END=float(T_END), RES=int(RES), + wall_seconds=float(time.time() - t0), + max_in_fault_sigma_II=float(best[0]), + n_steps=len(saved), + iters=iters_arr, reasons=reasons_arr, + ) + np.savez(os.path.join(OUT_DIR, key + ".meta.npz"), **metadata) + n_diverged = int((reasons_arr < 0).sum()) + 
print( + f" ran {len(saved)} steps in {metadata['wall_seconds']:.1f}s; " + f"chose t={metadata['t']:.3f}, V_top={metadata['v_top']:+.4f}; " + f"max in-fault σ_II = {metadata['max_in_fault_sigma_II']:.4f} " + f"({metadata['max_in_fault_sigma_II']/tau_y_at_fault:.3f}·τ_y_centre); " + f"checkpointed → {OUT_DIR}/{key}.*", + flush=True, + ) + print( + f" SNES iters per step: mean={iters_arr.mean():.1f} " + f"median={int(np.median(iters_arr))} max={iters_arr.max()} " + f"diverged_steps={n_diverged}/{len(reasons_arr)}", + flush=True, + ) + + +def load_into_fresh_model(theta_deg, tau_y_at_fault): + obj = build_model(theta_deg, tau_y_at_fault, label_suffix="_load") + key = _key(theta_deg, tau_y_at_fault) + obj["edot_II_var"].read_timestep( + key, obj["edot_II_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["tau_y_var"].read_timestep( + key, obj["tau_y_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["sigma_II_var"].read_timestep( + key, obj["sigma_II_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["yield_ratio_var"].read_timestep( + key, obj["yield_ratio_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["u"].read_timestep( + key, obj["u"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + meta = dict(np.load(os.path.join(OUT_DIR, key + ".meta.npz"))) + obj["meta"] = {k: (v.item() if v.ndim == 0 else v) for k, v in meta.items()} + return obj + + +def plot_panels(obj, out_path, off_screen=True): + import pyvista as pv + + pv.global_theme.background = "white" + pv.global_theme.anti_aliasing = "ssaa" + + mesh = obj["mesh"] + u = obj["u"] + sII = obj["sigma_II_var"] + eII = obj["edot_II_var"] + yr = obj["yield_ratio_var"] + ty = obj["tau_y_var"] + meta = obj["meta"] + + pvmesh = vis.mesh_to_pv_mesh(mesh) + pvmesh.point_data["sigma_II"] = vis.scalar_fn_to_pv_points(pvmesh, sII.sym) + pvmesh.point_data["edot_II"] = 
vis.scalar_fn_to_pv_points(pvmesh, eII.sym) + pvmesh.point_data["yield_ratio"] = np.clip( + vis.scalar_fn_to_pv_points(pvmesh, yr.sym), 0.0, 1.5, + ) + pvmesh.point_data["tau_y"] = vis.scalar_fn_to_pv_points(pvmesh, ty.sym) + + u_cloud = vis.meshVariable_to_pv_cloud(u) + u_cloud.point_data["u"] = vis.vector_fn_to_pv_points(u_cloud, u.sym) + u_speed = np.linalg.norm(u_cloud.point_data["u"][:, :2], axis=1) + u_cloud.point_data["|u|"] = u_speed + pvmesh.point_data["u_y"] = vis.scalar_fn_to_pv_points(pvmesh, u.sym[1]) + pvmesh.point_data["|u|"] = vis.scalar_fn_to_pv_points( + pvmesh, sympy.sqrt(u.sym.dot(u.sym)) + ) + + n_x = float(meta["n_x"]); n_y = float(meta["n_y"]) + cx, cy = 0.5 * W, 0.5 * H + L = FAULT_LENGTH + t_x, t_y = n_y, -n_x + fault_line = pv.Line( + (cx - 0.5 * L * t_x, cy - 0.5 * L * t_y, 0.0), + (cx + 0.5 * L * t_x, cy + 0.5 * L * t_y, 0.0), + ) + + pl = pv.Plotter(off_screen=off_screen, shape=(2, 2), + window_size=(1500, 1400), border=True) + + def _common(p): + p.view_xy() + p.camera.parallel_projection = True + p.add_mesh(fault_line, color="red", line_width=4) + + pl.subplot(0, 0) + uy_max = float(np.max(np.abs(pvmesh.point_data["u_y"]))) + pl.add_mesh( + pvmesh, scalars="u_y", cmap="seismic", + clim=(-uy_max, uy_max), + show_scalar_bar=True, scalar_bar_args={"title": "u_y"}, + ) + sub = max(1, len(u_cloud.points) // 250) + pl.add_arrows(u_cloud.points[::sub], u_cloud.point_data["u"][::sub], + mag=0.35, color="#333333") + pl.add_text( + "velocity: u_y heatmap (+arrows show full u)", + position="upper_edge", font_size=11, color="black", + ) + _common(pl) + + pl.subplot(0, 1) + pl.add_mesh(pvmesh, scalars="edot_II", cmap="viridis", + show_scalar_bar=True, scalar_bar_args={"title": "|ε̇|_II"}) + pl.add_text("|ε̇|_II", position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.subplot(1, 0) + pl.add_mesh(pvmesh, scalars="sigma_II", cmap="magma", + show_scalar_bar=True, scalar_bar_args={"title": "|σ|_II"}) + ty_levels = 
[meta["tau_y_at_fault"] * f for f in (4.0, 20.0, 100.0)] + contours = pvmesh.contour(isosurfaces=ty_levels, scalars="tau_y") + if contours.n_points > 0: + pl.add_mesh(contours, color="cyan", line_width=1.2) + pl.add_text("|σ|_II — cyan: τ_y(x) contours", + position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.subplot(1, 1) + pl.add_mesh(pvmesh, scalars="yield_ratio", cmap="RdYlGn_r", + clim=(0.0, 1.2), + show_scalar_bar=True, scalar_bar_args={"title": "|σ|_II / τ_y(x)"}) + yc = pvmesh.contour(isosurfaces=[1.0], scalars="yield_ratio") + if yc.n_points > 0: + pl.add_mesh(yc, color="black", line_width=2.0) + pl.add_text("yield activation", + position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.add_text( + f"Phase D split-ETD-2, RES={int(meta['RES'])}, " + f"θ={meta['theta_deg']:+.0f}°, τ_y_fault={meta['tau_y_at_fault']} " + f"(t={meta['t']:.2f}, V_top={meta['v_top']:+.3f})", + position="lower_edge", font_size=10, color="black", + ) + + pl.screenshot(out_path, scale=1.5) + pl.close() + print(f" wrote {out_path}", flush=True) + + +def capture_or_load(theta_deg, tau_y_at_fault, n_periods=1.5): + if os.path.exists(_meta_path(_key(theta_deg, tau_y_at_fault))): + print(f" cache hit: {_key(theta_deg, tau_y_at_fault)}.* — skipping run", + flush=True) + else: + print(f" cache miss: running capture", flush=True) + capture(theta_deg, tau_y_at_fault, n_periods=n_periods) + return load_into_fresh_model(theta_deg, tau_y_at_fault) + + +def main(): + cases = [(15.0, 0.05), (15.0, 0.15)] + for theta, ty in cases: + print(f"\n=== θ={theta:+.0f}°, τ_y={ty:.2f} ===", flush=True) + obj = capture_or_load(theta, ty, n_periods=1.5) + out_path = os.path.join( + OUT_DIR, f"exp_integrator_phase_d_pyvista_split_th{theta:+.0f}_ty{ty:.2f}".replace(".", "p") + ".png", + ) + plot_panels(obj, out_path) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_d_uy_diagnosis.py b/docs/developer/design/_plot_phase_d_uy_diagnosis.py 
new file mode 100644 index 00000000..e39ddd1f --- /dev/null +++ b/docs/developer/design/_plot_phase_d_uy_diagnosis.py @@ -0,0 +1,69 @@ +"""Diagnose: is split-ETD's |u_y| growing without bound, or settling? + +Plots BDF-1 / lumped-ETD / split-ETD u_y(t) and σ_∥(t) on shared time +axes — answers whether the 16× higher |u_y| peak in split-ETD is a +stable accumulation matching the elastic-loading/plastic-yielding +cycle, or unbounded drift. +""" + +import os +import numpy as np +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +DT = 0.05 +OMEGA = np.pi / 2.0 +PERIOD = 2.0 * np.pi / OMEGA +TAU_Y = 0.05 +OUT_DIR = "output" + + +def main(): + bdf = np.load(os.path.join(OUT_DIR, "phase_b_bdf_th+15_ty0p05.npz")) + etd = np.load(os.path.join(OUT_DIR, "phase_b_etd_th+15_ty0p05.npz")) + split = np.load(os.path.join(OUT_DIR, "phase_b_etd-split_th+15_ty0p05.npz")) + + t_b = (np.arange(int(bdf["n_steps"])) + 1) * DT / PERIOD + t_e = (np.arange(int(etd["n_steps"])) + 1) * DT / PERIOD + t_s = (np.arange(int(split["n_steps"])) + 1) * DT / PERIOD + + fig, axes = plt.subplots(2, 1, figsize=(9, 7), sharex=True) + + ax = axes[0] + ax.plot(t_b, bdf["u_y_max_per_step"], "-", color="#1f77b4", label="BDF-1") + ax.plot(t_e, etd["u_y_max_per_step"], "-", color="#d62728", + label="ETD lumped", alpha=0.5) + ax.plot(t_s, split["u_y_max_per_step"], "-", color="#2ca02c", + label="split (explicit-parallel)") + ax.set_yscale("log") + ax.set_ylabel(r"max $|u_y|$ (log)") + ax.legend(loc="lower right") + ax.grid(alpha=0.3, which="both") + ax.set_title( + r"split-ETD vs BDF-1 / lumped-ETD: $|u_y|$ and $|\sigma_\parallel|$ " + r"($\tau_y=0.05$, $\theta=+15^\circ$)" + ) + + ax = axes[1] + ax.plot(t_b, bdf["sigma_par_centre"], "-", color="#1f77b4", label="BDF-1") + ax.plot(t_e, etd["sigma_par_centre"], "-", color="#d62728", + label="ETD lumped", alpha=0.5) + ax.plot(t_s, split["sigma_par_centre"], "-", color="#2ca02c", + label="split (explicit-parallel)") + ax.axhline(TAU_Y, 
color="black", lw=0.7, linestyle="--", + label=rf"$\tau_y={TAU_Y}$") + ax.set_xlabel(r"time $t/T$ (periods)") + ax.set_ylabel(r"centre $|\sigma_\parallel|$") + ax.legend(loc="upper right") + ax.grid(alpha=0.3) + + fig.tight_layout() + out_png = os.path.join(OUT_DIR, "exp_integrator_phase_d_uy_diagnosis.png") + fig.savefig(out_png, dpi=140) + print(f" wrote {out_png}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_e_pyvista_hybrid.py b/docs/developer/design/_plot_phase_e_pyvista_hybrid.py new file mode 100644 index 00000000..ee6cda57 --- /dev/null +++ b/docs/developer/design/_plot_phase_e_pyvista_hybrid.py @@ -0,0 +1,408 @@ +"""Phase E PyVista field plot — hybrid BDF/ETD integrator. + +Same structure as ``_plot_phase_b_pyvista.py`` but uses +``TransverseIsotropicVEPFlowModel(integrator='hybrid', fault_weight=...)``. +Captures the yield-active step at θ=+15°, τ_y ∈ {0.05, 0.15} and +renders the 4-panel field figure (u_y, |ε̇|_II, |σ|_II, yield_ratio). 
+""" + +import os +import time + +import numpy as np +import sympy + +import underworld3 as uw +from underworld3 import VarType +from underworld3.function import expression +import underworld3.visualisation as vis + + +V0 = 0.5 +OMEGA = np.pi / 2.0 +DT = 0.05 +H = 1.0; W = 1.0 +FAULT_LENGTH = 0.6 +FAULT_WIDTH = 0.06 +ETA_0 = 1.0; ETA_1 = 1.0; MU = 1.0 +TAU_Y_BULK = 200.0 +RES = 32 + +OUT_DIR = "output" + + +def _key(theta_deg, tau_y_at_fault): + return f"phase_e_hybrid_th{theta_deg:+.0f}_ty{tau_y_at_fault:.2f}".replace(".", "p") + + +def _meta_path(key): + return os.path.join(OUT_DIR, key + ".meta.npz") + + +def build_model(theta_deg, tau_y_at_fault, label_suffix=""): + label = _key(theta_deg, tau_y_at_fault) + label_suffix + + mesh = uw.meshing.StructuredQuadBox( + elementRes=(RES, RES), + minCoords=(0.0, 0.0), maxCoords=(W, H), + qdegree=3, + ) + + u = uw.discretisation.MeshVariable( + f"U_{label}", mesh, 2, degree=2, vtype=VarType.VECTOR, + ) + p_sol = uw.discretisation.MeshVariable( + f"P_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + + theta = np.radians(theta_deg) + cx, cy = 0.5 * W, 0.5 * H + dx = 0.5 * FAULT_LENGTH * np.cos(theta) + dy = 0.5 * FAULT_LENGTH * np.sin(theta) + fault = uw.meshing.Surface( + f"fault_{label}", mesh, + np.array([[cx - dx, cy - dy], [cx + dx, cy + dy]]), + symbol=f"F{label}", + ) + fault.discretize() + + n_x = -np.sin(theta); n_y = np.cos(theta) + director = sympy.Matrix([n_x, n_y]) + weakness = fault.influence_function( + width=FAULT_WIDTH, + value_near=1.0 / tau_y_at_fault, value_far=1.0 / TAU_Y_BULK, + profile="gaussian", + ) + tau_y_field = 1.0 / weakness + + weakness_min = 1.0 / TAU_Y_BULK + weakness_max = 1.0 / tau_y_at_fault + fault_weight = (weakness - weakness_min) / (weakness_max - weakness_min) + + edot_II_var = uw.discretisation.MeshVariable( + f"edotII_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + tau_y_var = uw.discretisation.MeshVariable( + f"tauy_{label}", mesh, 
1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + sigma_II_var = uw.discretisation.MeshVariable( + f"sigmaII_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + yield_ratio_var = uw.discretisation.MeshVariable( + f"yieldRatio_{label}", mesh, 1, degree=1, continuous=True, vtype=VarType.SCALAR, + ) + + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p_sol) + stokes.constitutive_model = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, integrator="hybrid", fault_weight=fault_weight, + ) + cm = stokes.constitutive_model + cm.Parameters.shear_viscosity_0 = ETA_0 + cm.Parameters.shear_viscosity_1 = ETA_1 + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = tau_y_field + cm.Parameters.director = director + cm.Parameters.shear_viscosity_min = ETA_0 * 1.0e-3 + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm.yield_mode = "softmin" + + stokes.saddle_preconditioner = 1.0 / cm.K + stokes.tolerance = 1.0e-4 + stokes.petsc_options["ksp_type"] = "fgmres" + stokes.petsc_options["snes_force_iteration"] = True + + V_top = expression(rf"V_{{top,{label}}}", sympy.Float(0.0), "Top BC") + stokes.add_essential_bc(sympy.Matrix([V_top, 0.0]), "Top") + stokes.add_essential_bc(sympy.Matrix([0.0, 0.0]), "Bottom") + stokes.add_essential_bc((sympy.oo, 0.0), "Left") + stokes.add_essential_bc((sympy.oo, 0.0), "Right") + stokes.bodyforce = sympy.Matrix([0.0, 0.0]) + + return dict( + mesh=mesh, stokes=stokes, + u=u, V_top=V_top, + edot_II_var=edot_II_var, tau_y_var=tau_y_var, + sigma_II_var=sigma_II_var, yield_ratio_var=yield_ratio_var, + n_vec=np.array([n_x, n_y]), + ) + + +def capture(theta_deg, tau_y_at_fault, n_periods=1.5): + obj = build_model(theta_deg, tau_y_at_fault, label_suffix="_cap") + mesh = obj["mesh"]; stokes = obj["stokes"] + u = obj["u"]; V_top = obj["V_top"] + edot_II_var = obj["edot_II_var"] + tau_y_var = obj["tau_y_var"] + sigma_II_var = obj["sigma_II_var"] + yield_ratio_var = 
obj["yield_ratio_var"] + cm = stokes.constitutive_model + DFDt = stokes.Unknowns.DFDt + + sigma_coords = DFDt.psi_star[0].coords + n_x, n_y = obj["n_vec"]; cx, cy = 0.5 * W, 0.5 * H + sd = np.abs((sigma_coords[:, 0] - cx) * n_x + (sigma_coords[:, 1] - cy) * n_y) + fault_mask = sd < 1.5 * FAULT_WIDTH + E_sym = stokes.Unknowns.E + + T_END = n_periods * 2.0 * np.pi / OMEGA + best = None + saved = [] + iters = []; reasons = [] + t_cur = 0.0 + t0 = time.time() + while t_cur < T_END - 1e-9: + dt = min(DT, T_END - t_cur) + t_end_step = t_cur + dt + v_now = V0 * float(np.cos(OMEGA * t_end_step)) + V_top.sym = sympy.Float(v_now) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + iters.append(int(stokes.snes.getIterationNumber())) + reasons.append(int(stokes.snes.getConvergedReason())) + + sigma_arr = np.asarray(DFDt.psi_star[0].array) + sigma_II = np.sqrt(0.5 * (sigma_arr ** 2).sum(axis=(1, 2))) + recordable = t_end_step > 0.5 * 2.0 * np.pi / OMEGA + in_fault_max = float(sigma_II[fault_mask].max()) if fault_mask.any() else 0.0 + if recordable and (best is None or in_fault_max > best[0]): + best = (in_fault_max, len(saved)) + saved.append(dict( + t=t_end_step, v_top=v_now, + u_arr=np.asarray(u.array).copy(), + sigma_arr=sigma_arr.copy(), + sigma_II=sigma_II.copy(), + )) + t_cur = t_end_step + + if best is None: + best = (saved[-1]["sigma_II"].max(), len(saved) - 1) + chosen = saved[best[1]] + u.array[...] = chosen["u_arr"] + DFDt.psi_star[0].array[...] 
= chosen["sigma_arr"] + u._sync_lvec_to_gvec() + + plot_coords = edot_II_var.coords + edot_xx = np.asarray(uw.function.evaluate(E_sym[0, 0], plot_coords)).flatten() + edot_xy = np.asarray(uw.function.evaluate(E_sym[0, 1], plot_coords)).flatten() + edot_yy = np.asarray(uw.function.evaluate(E_sym[1, 1], plot_coords)).flatten() + edot_II_at_plot = np.sqrt(0.5 * (edot_xx ** 2 + edot_yy ** 2 + 2 * edot_xy ** 2)) + edot_II_var.array[:, 0, 0] = edot_II_at_plot + + ty_at_plot = np.asarray( + uw.function.evaluate(cm.Parameters.yield_stress.sym, plot_coords) + ).flatten() + tau_y_var.array[:, 0, 0] = ty_at_plot + + sigma_sym_II = sympy.sqrt( + (DFDt.psi_star[0].sym * DFDt.psi_star[0].sym).trace() / 2 + ) + try: + sigma_II_at_plot = np.asarray( + uw.function.evaluate(sigma_sym_II, plot_coords) + ).flatten() + except Exception: + from underworld3.kdtree import KDTree + tree = KDTree(np.asarray(sigma_coords)) + sigma_II_at_plot = tree.rbf_interpolator_local( + plot_coords, chosen["sigma_II"][:, None], 4, 2, + ).flatten() + sigma_II_var.array[:, 0, 0] = sigma_II_at_plot + yield_ratio_var.array[:, 0, 0] = sigma_II_at_plot / np.maximum(ty_at_plot, 1e-30) + + key = _key(theta_deg, tau_y_at_fault) + os.makedirs(OUT_DIR, exist_ok=True) + mesh.write_timestep( + key, index=0, outputPath=OUT_DIR, + meshVars=[u, edot_II_var, tau_y_var, sigma_II_var, yield_ratio_var], + create_xdmf=True, + ) + DFDt.psi_star[0].write( + os.path.join(OUT_DIR, key + ".mesh.sigma.00000.h5") + ) + + iters_arr = np.array(iters); reasons_arr = np.array(reasons) + metadata = dict( + theta_deg=theta_deg, + tau_y_at_fault=tau_y_at_fault, + n_x=float(n_x), n_y=float(n_y), + t=float(chosen["t"]), v_top=float(chosen["v_top"]), + T_END=float(T_END), RES=int(RES), + wall_seconds=float(time.time() - t0), + max_in_fault_sigma_II=float(best[0]), + n_steps=len(saved), + iters=iters_arr, reasons=reasons_arr, + ) + np.savez(os.path.join(OUT_DIR, key + ".meta.npz"), **metadata) + n_diverged = int((reasons_arr < 0).sum()) + 
print( + f" ran {len(saved)} steps in {metadata['wall_seconds']:.1f}s; " + f"chose t={metadata['t']:.3f}, V_top={metadata['v_top']:+.4f}; " + f"max in-fault σ_II = {metadata['max_in_fault_sigma_II']:.4f} " + f"({metadata['max_in_fault_sigma_II']/tau_y_at_fault:.3f}·τ_y_centre); " + f"checkpointed → {OUT_DIR}/{key}.*", + flush=True, + ) + print( + f" SNES iters per step: mean={iters_arr.mean():.1f} " + f"median={int(np.median(iters_arr))} max={iters_arr.max()} " + f"diverged_steps={n_diverged}/{len(reasons_arr)}", + flush=True, + ) + + +def load_into_fresh_model(theta_deg, tau_y_at_fault): + obj = build_model(theta_deg, tau_y_at_fault, label_suffix="_load") + key = _key(theta_deg, tau_y_at_fault) + obj["edot_II_var"].read_timestep( + key, obj["edot_II_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["tau_y_var"].read_timestep( + key, obj["tau_y_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["sigma_II_var"].read_timestep( + key, obj["sigma_II_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["yield_ratio_var"].read_timestep( + key, obj["yield_ratio_var"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + obj["u"].read_timestep( + key, obj["u"].clean_name.replace("_load", "_cap"), + index=0, outputPath=OUT_DIR, + ) + meta = dict(np.load(os.path.join(OUT_DIR, key + ".meta.npz"))) + obj["meta"] = {k: (v.item() if v.ndim == 0 else v) for k, v in meta.items()} + return obj + + +def plot_panels(obj, out_path, off_screen=True): + import pyvista as pv + + pv.global_theme.background = "white" + pv.global_theme.anti_aliasing = "ssaa" + + mesh = obj["mesh"] + u = obj["u"] + sII = obj["sigma_II_var"] + eII = obj["edot_II_var"] + yr = obj["yield_ratio_var"] + ty = obj["tau_y_var"] + meta = obj["meta"] + + pvmesh = vis.mesh_to_pv_mesh(mesh) + pvmesh.point_data["sigma_II"] = vis.scalar_fn_to_pv_points(pvmesh, sII.sym) + pvmesh.point_data["edot_II"] = 
vis.scalar_fn_to_pv_points(pvmesh, eII.sym) + pvmesh.point_data["yield_ratio"] = np.clip( + vis.scalar_fn_to_pv_points(pvmesh, yr.sym), 0.0, 1.5, + ) + pvmesh.point_data["tau_y"] = vis.scalar_fn_to_pv_points(pvmesh, ty.sym) + + u_cloud = vis.meshVariable_to_pv_cloud(u) + u_cloud.point_data["u"] = vis.vector_fn_to_pv_points(u_cloud, u.sym) + u_speed = np.linalg.norm(u_cloud.point_data["u"][:, :2], axis=1) + u_cloud.point_data["|u|"] = u_speed + pvmesh.point_data["u_y"] = vis.scalar_fn_to_pv_points(pvmesh, u.sym[1]) + pvmesh.point_data["|u|"] = vis.scalar_fn_to_pv_points( + pvmesh, sympy.sqrt(u.sym.dot(u.sym)) + ) + + n_x = float(meta["n_x"]); n_y = float(meta["n_y"]) + cx, cy = 0.5 * W, 0.5 * H + L = FAULT_LENGTH + t_x, t_y = n_y, -n_x + fault_line = pv.Line( + (cx - 0.5 * L * t_x, cy - 0.5 * L * t_y, 0.0), + (cx + 0.5 * L * t_x, cy + 0.5 * L * t_y, 0.0), + ) + + pl = pv.Plotter(off_screen=off_screen, shape=(2, 2), + window_size=(1500, 1400), border=True) + + def _common(p): + p.view_xy() + p.camera.parallel_projection = True + p.add_mesh(fault_line, color="red", line_width=4) + + pl.subplot(0, 0) + uy_max = float(np.max(np.abs(pvmesh.point_data["u_y"]))) + pl.add_mesh( + pvmesh, scalars="u_y", cmap="seismic", + clim=(-uy_max, uy_max), + show_scalar_bar=True, scalar_bar_args={"title": "u_y"}, + ) + sub = max(1, len(u_cloud.points) // 250) + pl.add_arrows(u_cloud.points[::sub], u_cloud.point_data["u"][::sub], + mag=0.35, color="#333333") + pl.add_text( + "velocity: u_y heatmap (+arrows show full u)", + position="upper_edge", font_size=11, color="black", + ) + _common(pl) + + pl.subplot(0, 1) + pl.add_mesh(pvmesh, scalars="edot_II", cmap="viridis", + show_scalar_bar=True, scalar_bar_args={"title": "|ε̇|_II"}) + pl.add_text("|ε̇|_II", position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.subplot(1, 0) + pl.add_mesh(pvmesh, scalars="sigma_II", cmap="magma", + show_scalar_bar=True, scalar_bar_args={"title": "|σ|_II"}) + ty_levels = 
[meta["tau_y_at_fault"] * f for f in (4.0, 20.0, 100.0)] + contours = pvmesh.contour(isosurfaces=ty_levels, scalars="tau_y") + if contours.n_points > 0: + pl.add_mesh(contours, color="cyan", line_width=1.2) + pl.add_text("|σ|_II — cyan: τ_y(x) contours", + position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.subplot(1, 1) + pl.add_mesh(pvmesh, scalars="yield_ratio", cmap="RdYlGn_r", + clim=(0.0, 1.2), + show_scalar_bar=True, scalar_bar_args={"title": "|σ|_II / τ_y(x)"}) + yc = pvmesh.contour(isosurfaces=[1.0], scalars="yield_ratio") + if yc.n_points > 0: + pl.add_mesh(yc, color="black", line_width=2.0) + pl.add_text("yield activation", + position="upper_edge", font_size=12, color="black") + _common(pl) + + pl.add_text( + f"Phase E hybrid BDF/ETD, RES={int(meta['RES'])}, " + f"θ={meta['theta_deg']:+.0f}°, τ_y_fault={meta['tau_y_at_fault']} " + f"(t={meta['t']:.2f}, V_top={meta['v_top']:+.3f})", + position="lower_edge", font_size=10, color="black", + ) + + pl.screenshot(out_path, scale=1.5) + pl.close() + print(f" wrote {out_path}", flush=True) + + +def capture_or_load(theta_deg, tau_y_at_fault, n_periods=1.5): + if os.path.exists(_meta_path(_key(theta_deg, tau_y_at_fault))): + print(f" cache hit: {_key(theta_deg, tau_y_at_fault)}.* — skipping run", + flush=True) + else: + print(f" cache miss: running capture", flush=True) + capture(theta_deg, tau_y_at_fault, n_periods=n_periods) + return load_into_fresh_model(theta_deg, tau_y_at_fault) + + +def main(): + cases = [(15.0, 0.05), (15.0, 0.15)] + for theta, ty in cases: + print(f"\n=== θ={theta:+.0f}°, τ_y={ty:.2f} ===", flush=True) + obj = capture_or_load(theta, ty, n_periods=1.5) + out_path = os.path.join( + OUT_DIR, + f"exp_integrator_phase_e_pyvista_hybrid_th{theta:+.0f}_ty{ty:.2f}".replace(".", "p") + ".png", + ) + plot_panels(obj, out_path) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/_plot_phase_f.py b/docs/developer/design/_plot_phase_f.py new file mode 100644 
index 00000000..dcf0d86c --- /dev/null +++ b/docs/developer/design/_plot_phase_f.py @@ -0,0 +1,109 @@ +"""Plot Phase F results — predictor-corrector experiments on isotropic VEP. + +Reads the per-step trace files (text format, written each step) so the +plot reproduces from a fresh clone if the npz files are absent. +""" + +import os +import numpy as np +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +OUT_DIR = "output" +TRACE_DIR = "docs/developer/design" +DT = 0.05 +OMEGA = np.pi / 2.0 +PERIOD = 2.0 * np.pi / OMEGA +TAU_Y_FAULT = 0.05 + + +def _load_trace(label): + path = os.path.join(TRACE_DIR, f"_phase_f_{label}.trace.txt") + if not os.path.exists(path): + return None + rows = np.loadtxt(path, comments="#") + if rows.size == 0: + return None + if rows.ndim == 1: + rows = rows.reshape(1, -1) + # Columns: step, t, V_top, snes_iters, picard_iters, sigma_eq_max, + # sigma_eq_max_after_correction, u_y_max, yielded_fraction + return dict( + step=rows[:, 0], + t=rows[:, 1], + V=rows[:, 2], + snes_iters=rows[:, 3], + picard_iters=rows[:, 4], + sigma_eq_max=rows[:, 5], + sigma_eq_max_corrected=rows[:, 6], + u_y_max=rows[:, 7], + yielded_fraction=rows[:, 8], + ) + + +def main(): + cases = [ + ("bdf1_iso", "BDF-1 (yield-in-residual)", "#1f77b4", "-"), + ("etd1_pc1", "ETD-1 + RR single-shot", "#2ca02c", "-"), + ("etd1_pc_picard", "ETD-1 + RR + Picard", "#17becf", "--"), + ("etd2_pc1", "ETD-2 + RR single-shot", "#ff7f0e", "-"), + ("etd2_pc_picard", "ETD-2 + RR + Picard", "#d62728", ":"), + ] + traces = {label: _load_trace(label) for label, _, _, _ in cases} + + fig, axes = plt.subplots(3, 1, figsize=(10, 10), sharex=True) + + # Panel 1 — σ_eq_max (log) + ax = axes[0] + for label, name, color, ls in cases: + tr = traces[label] + if tr is None: + continue + ax.semilogy(tr["t"] / PERIOD, tr["sigma_eq_max"], ls, color=color, + label=f"{name} (peak={tr['sigma_eq_max'].max():.3f})") + ax.axhline(TAU_Y_FAULT, color="#888888", lw=0.7, linestyle="--", 
+ label=rf"$\tau_y^{{fault}}={TAU_Y_FAULT}$") + ax.set_ylabel(r"max $|\sigma|_{eq}$ (log)") + ax.legend(loc="upper left", fontsize=8) + ax.grid(alpha=0.3, which="both") + ax.set_title( + rf"Phase F: predictor-corrector on isotropic VEP, " + rf"localised weak zone (τ_y_fault={TAU_Y_FAULT}, τ_y_bulk=200)" + ) + + # Panel 2 — |u_y|_max (log) + ax = axes[1] + for label, name, color, ls in cases: + tr = traces[label] + if tr is None: + continue + ax.semilogy(tr["t"] / PERIOD, tr["u_y_max"], ls, color=color, + label=f"{name} (peak={tr['u_y_max'].max():.3e})") + ax.set_ylabel(r"max $|u_y|$ (log)") + ax.legend(loc="upper left", fontsize=8) + ax.grid(alpha=0.3, which="both") + + # Panel 3 — yielded fraction + ax = axes[2] + for label, name, color, ls in cases: + tr = traces[label] + if tr is None: + continue + ax.plot(tr["t"] / PERIOD, tr["yielded_fraction"] * 100, ls, color=color, + label=name) + ax.set_ylabel(r"yielded fraction (%)") + ax.set_xlabel(r"time $t / T$ (periods)") + ax.legend(loc="upper left", fontsize=8) + ax.grid(alpha=0.3) + + fig.tight_layout() + out_png = os.path.join(OUT_DIR, "exp_integrator_phase_f_predictor_corrector.png") + fig.savefig(out_png, dpi=140) + plt.close(fig) + print(f" wrote {out_png}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/docs/developer/design/exp_integrator_phase_a.png b/docs/developer/design/exp_integrator_phase_a.png new file mode 100644 index 00000000..cfbbeb16 Binary files /dev/null and b/docs/developer/design/exp_integrator_phase_a.png differ diff --git a/docs/developer/design/exp_integrator_phase_b_largedt.png b/docs/developer/design/exp_integrator_phase_b_largedt.png new file mode 100644 index 00000000..8886782b Binary files /dev/null and b/docs/developer/design/exp_integrator_phase_b_largedt.png differ diff --git a/docs/developer/design/exp_integrator_phase_b_square.png b/docs/developer/design/exp_integrator_phase_b_square.png new file mode 100644 index 00000000..4fce7217 Binary files /dev/null and 
b/docs/developer/design/exp_integrator_phase_b_square.png differ diff --git a/docs/developer/design/exp_integrator_phase_b_vardt.png b/docs/developer/design/exp_integrator_phase_b_vardt.png new file mode 100644 index 00000000..ce653943 Binary files /dev/null and b/docs/developer/design/exp_integrator_phase_b_vardt.png differ diff --git a/docs/developer/design/exp_integrator_phase_b_yield.png b/docs/developer/design/exp_integrator_phase_b_yield.png new file mode 100644 index 00000000..dddb3ace Binary files /dev/null and b/docs/developer/design/exp_integrator_phase_b_yield.png differ diff --git a/docs/examples/WIP/VEP_Fault_Investigation.py b/docs/examples/WIP/VEP_Fault_Investigation.py index 97cab167..3ea9dd16 100644 --- a/docs/examples/WIP/VEP_Fault_Investigation.py +++ b/docs/examples/WIP/VEP_Fault_Investigation.py @@ -87,7 +87,7 @@ cm.Parameters.shear_viscosity_0 = ETA cm.Parameters.shear_modulus = MU cm.Parameters.yield_stress = tau_y_field -# yield_mode="smooth" is the default (corrected harmonic, no Min/Max) +# yield_mode="softmin" is the default (smooth approximation to Min) cm.Parameters.shear_viscosity_min = ETA * 1.0e-2 cm.Parameters.strainrate_inv_II_min = 1.0e-5 diff --git a/docs/examples/solid_mechanics/advanced/Ex_Sheared_Layer_Elastic.py b/docs/examples/solid_mechanics/advanced/Ex_Sheared_Layer_Elastic.py index dcf4637f..c99a8154 100644 --- a/docs/examples/solid_mechanics/advanced/Ex_Sheared_Layer_Elastic.py +++ b/docs/examples/solid_mechanics/advanced/Ex_Sheared_Layer_Elastic.py @@ -202,19 +202,25 @@ def mesh_return_coords_to_bounds(coords): # %% [markdown] """ -## VE_Stokes Solver with Viscoelastic Rheology +## Stokes Solver with Viscoelastic Rheology + +The plain ``Stokes`` solver creates the stress-history infrastructure +automatically when a ``ViscoElasticPlasticFlowModel`` is assigned — +no need for the legacy ``VE_Stokes`` wrapper. 
""" # %% -stokes = uw.systems.VE_Stokes( - mesh1, velocityField=v_soln, pressureField=p_soln, verbose=False, order=1 +stokes = uw.systems.Stokes( + mesh1, velocityField=v_soln, pressureField=p_soln, verbose=False, ) stokes.petsc_options["snes_monitor"] = None stokes.petsc_options["ksp_monitor"] = None # Viscoelastic-plastic constitutive model -stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=1, +) stokes.constitutive_model.Parameters.shear_viscosity_0 = params.uw_shear_viscosity stokes.constitutive_model.Parameters.shear_modulus = params.uw_shear_modulus diff --git a/publications/blog-posts/constitutive-models.md b/publications/blog-posts/constitutive-models.md index 82ff20de..404c0e05 100644 --- a/publications/blog-posts/constitutive-models.md +++ b/publications/blog-posts/constitutive-models.md @@ -202,7 +202,7 @@ The stress history lives on particles via the solver's `DFDt` (flux time derivat If you don't want to use particles for -For VEP problems, the viscoelastic effective strain rate includes contributions from the stress history, and the plastic yield criterion is evaluated against this total deformation rate. The `bdf_blend` parameter controls blending between BDF-1 and BDF-2 near the yield surface, where pure BDF-2 can produce oscillations. The model auto-detects the appropriate blend: pure VE problems get full BDF-2 accuracy, while VEP problems get a stable near-optimal blend. +For VEP problems, the viscoelastic effective strain rate includes contributions from the stress history, and the plastic yield criterion is evaluated against this total deformation rate. The Min-mode plasticity has a non-smooth kink at the yield surface, which historically caused twitchy SNES behaviour at yield onset. 
Two safeguards keep the solver well-behaved there: a Picard-style retry on SNES divergence (`divergence_retries`), and a snapshot-based projection in the stress-history machinery that prevents an implicit feedback loop between the projection's source and target under variable timestep. Recent work has extended the anisotropic model to `TransverseIsotropicVEPFlowModel`, combining directional weakness with viscoelastic stress memory and plastic yielding. The yield criterion is evaluated on the resolved shear stress on the fault plane, computed from the full stress tensor and the director orientation. In UW3, this is a class that inherits from the VEP model and overrides the stress computation with additional director terms. The Jacobian follows automatically. In UW2, it would have been extremely difficult to implement. diff --git a/src/underworld3/constitutive_models.py b/src/underworld3/constitutive_models.py index bfda444c..fbec591a 100644 --- a/src/underworld3/constitutive_models.py +++ b/src/underworld3/constitutive_models.py @@ -592,6 +592,37 @@ def requires_stress_history(self): """ return False + @property + def stress_history_ddt_kwargs(self): + """Extra kwargs passed to the auto-DDt creation when this model + triggers it via ``requires_stress_history = True``. + + Default: empty dict (BDF-only models). ETD-2 / exponential models + override to inject ``with_forcing_history=True`` so the DDt + allocates a forcing-history slot. + """ + return {} + + def _update_history_coefficients(self): + """Uniform pre-solve hook the Stokes solver calls before each solve. + + Default: no-op (non-stress-history models have nothing to update). + BDF-style stress-history subclasses (VEP, TI-VEP) override to + delegate to ``_update_bdf_coefficients``. ETD-2 / exponential + subclasses (e.g. ``MaxwellExponentialFlowModel``) override to + update α, φ on the DDt. The solver dispatches uniformly through + this method — no ``isinstance`` checks at the solver layer. 
+ """ + return + + def _update_history_post_solve(self): + """Uniform post-solve hook the Stokes solver calls after each solve. + + Default: no-op. Subclasses that store extra integrator state for + the next step (e.g. ETD-2 storing ε̇ⁿ in ``forcing_star``) override. + """ + return + @property def plastic_fraction(self): """Fraction of strain rate that is plastic (0 for non-plastic models). @@ -1075,11 +1106,54 @@ class ViscoElasticPlasticFlowModel(ViscousFlowModel): """ - def __init__(self, unknowns, order=1, material_name: str = None): + def __init__(self, unknowns, order=1, integrator: str = "bdf", + material_name: str = None): + """Construct a viscoelastic-plastic flow model. - ## We just need to add the expressions for the stress history terms in here.\ - ## They are properties to hold expressions that are persistent for this instance - ## (i.e. we only update the value, not the object) + Parameters + ---------- + unknowns : Unknowns + The solver unknowns (velocity, pressure). + order : int, default 1 + Time-integration order. Combines with ``integrator``: + + - ``integrator='bdf', order=1``: BDF-1 (backward Euler). + - ``integrator='bdf', order=2``: BDF-2. + - ``integrator='etd', order=1``: ETD-1 (single-step, + fully L-stable, recommended default for VEP+yield). + - ``integrator='etd', order=2``: ETD-2 (single-step + with linear-quadrature forcing history; accurate on + smooth VE but **unstable in tight-yield VEP** — + produces global runaway, see EXPONENTIAL_VE_INTEGRATOR.md + lessons #7, #9). + integrator : str, default "bdf" + Time-integration scheme: + + - ``"bdf"``: backward differentiation formula on the + deviatoric-stress rate equation. Production default. + - ``"etd"``: exponential time-differencing — integrates the + Maxwell relaxation operator analytically (``α = exp(-Δt/τ)``). + ``order=1`` is the recommended default for new code: same + stability as BDF-1, exact handling of the relaxation factor + at large ``Δt/τ``. 
``order=2`` adds linear quadrature on + the forcing history for higher accuracy on smooth VE + (4.3× more accurate than BDF-2 on ``bench_ve_harmonic``) + but blows up under active yield in tight-yield TI faults. + See ``docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md``. + material_name : str, optional + Name identifier for this material. + """ + if integrator not in ("bdf", "etd"): + raise ValueError( + f"integrator must be 'bdf' or 'etd', got '{integrator!r}'" + ) + if integrator == "etd" and order not in (1, 2): + raise ValueError( + f"integrator='etd' supports order=1 (ETD-1, default-recommended) " + f"or order=2 (ETD-2, accurate for smooth VE; avoid in tight-yield " + f"VEP where it produces global runaway). Got order={order}." + ) + self._integrator = integrator # Store material_name before creating expressions (needed by create_unique_symbol) self._material_name = material_name @@ -1115,7 +1189,6 @@ def __init__(self, unknowns, order=1, material_name: str = None): self._order = order self._yield_mode = "softmin" # "min", "harmonic", "smooth", or "softmin" self._yield_softness = 0.1 # δ parameter for "softmin" mode - self._bdf_blend = None # auto: 1.0 for VE, 0.75 for VEP # Timestep — set by the solver before each solve(). Not a user parameter. # Initialised to oo (viscous limit). 
The solver overwrites this with the @@ -1363,18 +1436,6 @@ def _update_bdf_coefficients(self): pass # symbolic dt — can't evaluate, keep requested order coeffs = _bdf_coefficients(order, dt_current, dt_history) - - # Blend with O1 coefficients for stability - # 0 = pure O1, 0.5 = balanced (default), 1 = pure requested order - alpha = self.bdf_blend # property resolves None → auto-detect - if 0 < alpha < 1 and order >= 2: - coeffs_o1 = _bdf_coefficients(1, dt_current, dt_history) - while len(coeffs_o1) < len(coeffs): - coeffs_o1.append(sympy.Integer(0)) - coeffs = [ - (1 - alpha) * c1 + alpha * ck - for c1, ck in zip(coeffs_o1, coeffs) - ] else: coeffs = _bdf_coefficients(order, None, []) @@ -1387,6 +1448,69 @@ def _update_bdf_coefficients(self): self._bdf_c2.sym = coeffs[2] self._bdf_c3.sym = coeffs[3] + def _update_history_coefficients(self): + """Pre-solve hook: refresh integrator coefficients. + + Dispatches on ``(self._integrator, self._order)``: + - ``"bdf"`` (order 1 or 2): updates BDF c-coefficients via + :py:meth:`_update_bdf_coefficients`. + - ``"etd"`` order=2 (Phase B ETD-2): updates α, φ on the DDt + from ``τ_VE = η/μ``; forcing-history slot active. + - ``"etd"`` order=1 (ETD-1): updates α, φ as for ETD-2 then + forces ``φ = α`` so the ``(φ-α)·ε̇*`` term zeros out — fully + L-stable single-step, no forcing-history slot needed. 
+ """ + if self._integrator == "etd": + if self.Unknowns.DFDt is None: + return + params = self.Parameters + if params.shear_modulus.sym is sympy.oo: + tau_eff = sympy.oo + else: + try: + eta_val = float(params.shear_viscosity_0.sym) + mu_val = float(params.shear_modulus.sym) + tau_eff = eta_val / mu_val if mu_val > 0 else sympy.oo + except (TypeError, ValueError): + tau_eff = None + try: + dt_val = ( + float(params.dt_elastic.sym) + if params.dt_elastic.sym is not sympy.oo + else None + ) + except (TypeError, ValueError): + dt_val = None + self.Unknowns.DFDt.update_exp_coefficients(dt_val, tau_eff) + if self._order == 1: + # ETD-1 reduction: φ = α makes the (φ-α)·ε̇* term zero + # AND turns (1-φ)·ε̇ into (1-α)·ε̇. + self.Unknowns.DFDt._exp_phi.sym = self.Unknowns.DFDt._exp_alpha.sym + else: + self._update_bdf_coefficients() + + def _update_history_post_solve(self): + """Post-solve hook. + + - BDF / ETD-1: no-op (no forcing-history slot). + - ETD-2: refresh ``forcing_star`` from the just-solved ε̇ for + the next step's history term. + """ + if self._integrator == "etd" and self._order == 2 and self.Unknowns.DFDt is not None: + if self.Unknowns.DFDt.forcing_star is not None: + self.Unknowns.DFDt.update_forcing_history(forcing_fn=self.Unknowns.E) + + @property + def stress_history_ddt_kwargs(self): + """SemiLagrangian DDt kwargs based on integrator selection. + + ETD-2 (order=2) needs the forcing-history slot; BDF and ETD-1 + (order=1) do not. + """ + if self._integrator == "etd" and self._order == 2: + return {"with_forcing_history": True} + return {} + # The following should have no setters @property def stress_star(self): @@ -1412,26 +1536,60 @@ def stress_2star(self): @property def E_eff(self): - r"""Effective strain rate including elastic contribution. + r"""Effective strain rate including elastic-history coupling. + + For BDF integration: .. 
math:: - \dot{\varepsilon}_{\mathrm{eff}} = \dot{\varepsilon} + \frac{\boldsymbol{\sigma}^*}{2 G \Delta t} - """ - E = self.Unknowns.E + \dot{\varepsilon}_\mathrm{eff} = \dot{\varepsilon} + - \sum_i c_i \frac{\sigma^{*(i)}}{2 \mu \Delta t} - if self.Unknowns.DFDt is not None: + For ETD-2 (exponential) integration: - if self.is_elastic: - mu_dt = self.Parameters.dt_elastic * self.Parameters.shear_modulus - # BDF history coefficients as UWexpressions (route through constants[]) - bdf_cs = [self._bdf_c1, self._bdf_c2, self._bdf_c3] + .. math:: + \dot{\varepsilon}_\mathrm{eff} = (1-\varphi)\,\dot{\varepsilon} + + \frac{\alpha}{2\eta}\,\sigma^* + + (\varphi-\alpha)\,\dot{\varepsilon}^* + + Both forms reduce to bare ``ε̇`` when no elastic history is + active. The yield criterion ``η_pl = τ_y/(2|E_eff|_II)`` is the + same expression structure for both — it adapts naturally to the + integrator's ``E_eff``. + """ + E = self.Unknowns.E - # History contribution: -Σ cᵢ·σ_star[i-1] / (2·μ·dt) - for i in range(self.Unknowns.DFDt.order): - E += -bdf_cs[i] * self.Unknowns.DFDt.psi_star[i].sym / (2 * mu_dt) + if self.Unknowns.DFDt is None or not self.is_elastic: + self._E_eff.sym = E + return self._E_eff + + DDt = self.Unknowns.DFDt + + if self._integrator == "etd": + # ETD-2 effective strain rate carrying α·σ*/(2η) and (φ-α)·ε̇*. + # ETD-1 (order=1): φ = α (set in _update_history_coefficients), + # so the (φ-α)·ε̇* term zeros out and (1-φ)·ε̇ → (1-α)·ε̇ — same + # expression tree, no separate code path needed. 
+ alpha = DDt._exp_alpha + phi = DDt._exp_phi + sigma_star = DDt.psi_star[0].sym + if DDt.forcing_star is not None: + edot_star = DDt.forcing_star.sym + else: + edot_star = sympy.zeros(*E.shape) + eta_raw = self.Parameters.shear_viscosity_0 + self._E_eff.sym = ( + (1 - phi) * E + + (alpha / (2 * eta_raw)) * sigma_star + + (phi - alpha) * edot_star + ) + return self._E_eff + # BDF default + mu_dt = self.Parameters.dt_elastic * self.Parameters.shear_modulus + bdf_cs = [self._bdf_c1, self._bdf_c2, self._bdf_c3] + for i in range(DDt.order): + E += -bdf_cs[i] * DDt.psi_star[i].sym / (2 * mu_dt) self._E_eff.sym = E - return self._E_eff @property @@ -1447,6 +1605,20 @@ def K(self): """Effective stiffness parameter (viscosity for visco-elastic-plastic flow).""" return self.viscosity + @property + def _unclipped_ve_viscosity(self): + """Unclipped viscoelastic viscosity (no yield wrap), depends on integrator. + + - BDF: ``ve_effective_viscosity = η·μΔt/(c₀·η + μΔt)`` — + baked-in time-integration factor for backward differentiation. + - ETD-2: ``η`` (raw) — the time-integration factor ``(1-φ)`` is + carried symbolically in :py:attr:`E_eff`, not folded into + this viscosity. + """ + if self._integrator == "etd": + return self.Parameters.shear_viscosity_0 + return self.Parameters.ve_effective_viscosity + @property def viscosity(self): r"""Effective viscosity combining visco-elastic and plastic limits. @@ -1460,30 +1632,24 @@ def viscosity(self): when η_ve is small relative to η_pl. - ``"min"``: sharp ``Min(η_ve, η_pl)``. Exact yield stress but can cause SNES divergence with higher-order BDF time integration. + + The unclipped η_ve depends on ``self._integrator`` — + :py:attr:`_unclipped_ve_viscosity` returns the correct base for + BDF or ETD-2 (raw η for ETD-2 since the time factor lives in + ``E_eff``; ``ve_effective_viscosity`` for BDF). 
""" inner_self = self.Parameters if inner_self.yield_stress.sym == sympy.oo: - return inner_self.ve_effective_viscosity + return self._unclipped_ve_viscosity - effective_viscosity = inner_self.ve_effective_viscosity + effective_viscosity = self._unclipped_ve_viscosity if self.is_viscoplastic: vp_effective_viscosity = self._plastic_effective_viscosity if self._yield_mode == "harmonic": effective_viscosity = 1 / (1 / effective_viscosity + 1 / vp_effective_viscosity) - elif self._yield_mode == "smooth": - # Corrected harmonic: cancels the excess 1/η_ve contribution - # at deep yielding while staying smooth everywhere. - # η_eff = η_ve · (1+f) / (1 + f + f²) - # where f = η_ve/η_pl measures yield overshoot. - # - # f → 0 (elastic): η_eff → η_ve (no correction) - # f → ∞ (yielding): η_eff → η_pl (exact yield) - # No Min/Max — just arithmetic. Continuous derivatives. - f = effective_viscosity / vp_effective_viscosity - effective_viscosity = effective_viscosity * (1 + f) / (1 + f + f**2) elif self._yield_mode == "softmin": # Smooth approximation to Min(η_ve, η_pl): # η_eff = η_ve / g(f) @@ -1500,12 +1666,12 @@ def viscosity(self): else: effective_viscosity = sympy.Min(effective_viscosity, vp_effective_viscosity) - # Apply viscosity floor — but skip for smooth/harmonic yield modes + # Apply viscosity floor — but skip for smooth-blend yield modes # where the outer Max creates a nested Min/Max that breaks the # BDF-2 Jacobian. Those modes are already smooth and bounded. if inner_self.shear_viscosity_min.sym != -sympy.oo: - if self.is_viscoplastic and self._yield_mode in ("harmonic", "smooth", "softmin"): + if self.is_viscoplastic and self._yield_mode in ("harmonic", "softmin"): return effective_viscosity else: return sympy.Max( @@ -1645,24 +1811,33 @@ def stress_projection(self): return stress def stress(self): - """Viscoelastic(-plastic) deviatoric stress for the weak form.""" + """Viscoelastic(-plastic) deviatoric stress for the weak form. 
- edot = self.grad_u - - stress = 2 * self.viscosity * edot - - if self.Unknowns.DFDt is not None: - - if self.is_elastic: - mu_dt = self.Parameters.dt_elastic * self.Parameters.shear_modulus - bdf_cs = [self._bdf_c1, self._bdf_c2, self._bdf_c3] - - for i in range(self.Unknowns.DFDt.order): - stress += 2 * self.viscosity * ( - -bdf_cs[i] * self.Unknowns.DFDt.psi_star[i].sym / (2 * mu_dt) - ) + Both BDF and ETD-2 are written as ``σ = 2·viscosity·E_eff``. + :py:attr:`E_eff` carries the integrator-specific elastic-history + coupling, and :py:attr:`viscosity` returns the appropriate yield- + wrapped effective viscosity (``ve_effective_viscosity`` for BDF, + raw ``η`` for ETD-2 since the time factor is in ``E_eff``). + """ + if not self.is_elastic or self.Unknowns.DFDt is None: + return 2 * self.viscosity * self.grad_u + + # ETD-1 (order=1) uses the same E_eff machinery but with φ=α, so + # forcing_star is not required (the (φ-α)·ε̇* term zeros out). + # Only ETD-2 (order=2) needs forcing_star. + if ( + self._integrator == "etd" + and self._order == 2 + and self.Unknowns.DFDt.forcing_star is None + ): + raise RuntimeError( + "integrator='etd' requires a SemiLagrangian DDt with " + "with_forcing_history=True. The auto-DDt creation path " + "reads stress_history_ddt_kwargs — re-create the solver/" + "model so the kwargs propagate." + ) - return stress + return 2 * self.viscosity * self.E_eff.sym # def eff_edot(self): @@ -1722,24 +1897,38 @@ def _object_viewer(self): def yield_mode(self): r"""How to combine VE and plastic viscosities. - ``"smooth"`` (default): corrected harmonic — - ``η_ve · (1+f) / (1+f+f²)`` where ``f = η_ve/η_pl``. - Smooth, no Min/Max. Best balance of accuracy and robustness. - ``"softmin"``: smooth approximation to Min — + ``"softmin"`` (default): smooth approximation to Min — ``η_ve / g(f)`` where ``g(f) ≈ max(1, f)`` with smoothing - parameter δ (``yield_softness``, default 0.5). - Closer to exact yield than ``"smooth"`` but less robust. 
+ parameter δ (``yield_softness``, default 0.1). Approaches + exact Min as δ → 0; smooth derivatives at the kink. + Recommended default: gets within ~2 % of the true yield + surface while avoiding the SNES kink penalties of ``"min"``. ``"harmonic"``: parallel blending — ``1/(1/η_ve + 1/η_pl)``. Smooth but undershoots τ_y for soft materials. ``"min"``: sharp cutoff — ``Min(η_ve, η_pl)``. - Exact yield but can cause SNES divergence with BDF-2. + Exact yield but can cause SNES divergence with BDF-2 and + BDF-2 phase-lag at BC discontinuities (see benchmarks). + + Note: the previous ``"smooth"`` mode (corrected harmonic + ``η_ve·(1+f)/(1+f+f²)``) was retired — it under-clipped the + yield surface by ~50 % under realistic forcing, with no + compensating benefit over ``softmin``. Recover from git + history if needed (commit message keyword: ``smooth_yield``). """ return self._yield_mode @yield_mode.setter def yield_mode(self, value): - if value not in ("min", "harmonic", "smooth", "softmin"): - raise ValueError(f"yield_mode must be 'min', 'harmonic', 'smooth', or 'softmin', got '{value}'") + if value == "smooth": + raise ValueError( + "yield_mode='smooth' has been retired — it under-clipped " + "the yield surface by ~50%. Use 'softmin' instead " + "(default; close to exact Min with smooth derivatives)." + ) + if value not in ("min", "harmonic", "softmin"): + raise ValueError( + f"yield_mode must be 'min', 'harmonic', or 'softmin', got '{value}'" + ) self._yield_mode = value self._reset() @@ -1761,25 +1950,6 @@ def yield_softness(self, value): self._yield_softness = value self._reset() - @property - def bdf_blend(self): - r"""Blending parameter α for BDF history coefficients. - - Blends O1 and O2 BDF coefficients: ``c = (1-α)·c_O1 + α·c_O2``. 
- - - ``α = 0``: pure BDF-1 (most stable, first-order accurate) - - ``α = 0.75``: default for VEP (stable, near-optimal accuracy) - - ``α = 1``: pure BDF-2 (default for pure VE, second-order accurate) - - ``None`` (default): auto-detect — 1.0 for VE, 0.75 for VEP - """ - if self._bdf_blend is None: - return 0.75 if self.is_viscoplastic else 1.0 - return self._bdf_blend - - @bdf_blend.setter - def bdf_blend(self, value): - self._bdf_blend = value - @property def requires_stress_history(self): """VEP models always require stress history tracking.""" @@ -1815,6 +1985,30 @@ def is_viscoplastic(self): ### +class MaxwellExponentialFlowModel(ViscoElasticPlasticFlowModel): + r"""Thin alias: ``ViscoElasticPlasticFlowModel(integrator='etd', order=1)``. + + .. deprecated:: Phase B + Use the canonical form ``ViscoElasticPlasticFlowModel(unknowns, + integrator='etd', order=1)`` directly. This sibling class + survives as a thin scaffold so existing scripts continue to + work; defaults to ETD-1 (recommended). + + See ``docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md`` for the + formulation. + """ + + def __init__(self, unknowns, material_name=None): + super().__init__( + unknowns, order=1, integrator="etd", + material_name=material_name, + ) + + + +### + + class DiffusionModel(Constitutive_Model): r""" Diffusion (Fourier/Fick) constitutive model for scalar transport. @@ -2436,7 +2630,86 @@ class TransverseIsotropicVEPFlowModel(TransverseIsotropicFlowModel): ViscoElasticPlasticFlowModel : Isotropic VEP model. """ - def __init__(self, unknowns, order=1, material_name: str = None): + def __init__(self, unknowns, order=1, integrator: str = "bdf", + fault_weight=None, + material_name: str = None): + """Construct a transversely isotropic VEP flow model. + + Parameters + ---------- + unknowns : Unknowns + Solver unknowns (velocity, pressure). + order : int, default 1 + Time-integration order. Combines with ``integrator``: + + - ``integrator='bdf', order=1``: BDF-1 (backward Euler). 
+ - ``integrator='bdf', order=2``: BDF-2. + - ``integrator='etd', order=1``: ETD-1 (single-step, + fully L-stable, **recommended default for VEP+yield**). + Reproduces BDF-1 byte-identically on the killer test; + wins at large ``Δt/τ`` where the analytical relaxation + factor matters. + - ``integrator='etd', order=2``: ETD-2 (linear-quadrature + forcing history). 4× more accurate than BDF-2 on smooth + VE but blows up under active yield in tight-yield TI + faults — see lessons #7, #9 in EXPONENTIAL_VE_INTEGRATOR.md. + + ``integrator='hybrid'`` pins ``order=1``. + integrator : str, default "bdf" + Time integration scheme: + + - ``'bdf'``: Backward differentiation (default, robust for VEP). + - ``'etd'``: Exponential time-differencing — analytical + relaxation factor ``α = exp(-Δt/τ)``. Pair with ``order=1`` + for the recommended default; ``order=2`` is available but + unsafe under active yield (see lessons #7, #9). + - ``'hybrid'``: **EXPERIMENTAL — DO NOT USE FOR PRODUCTION.** + Spatial blend of BDF (inside fault) and ETD (outside + fault). Phase E investigation: σ enforcement is + BDF-class but |u_y| drifts monotonically over cycles + from shared-history coupling between the two branches. + See ``docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md`` + lesson #11. Use ``'bdf'`` for deep-yield fault problems. + Requires ``fault_weight`` parameter. + fault_weight : sympy expression, optional + Spatial weight ``w(x) ∈ [0, 1]`` selecting BDF (``w=1``) vs + ETD (``w=0``) per quadrature point. Required when + ``integrator='hybrid'``. Typically built from the + ``influence_function`` used to construct ``yield_stress``, + normalised so that ``w=1`` inside the fault zone (where + yielding can happen) and ``w=0`` in the bulk (where + ``τ_y → τ_y_bulk`` and yielding is structurally + unreachable). The flux blend is + ``σ = w·σ_BDF + (1-w)·σ_ETD``. + material_name : str, optional + Name identifier for this material. 
+ """ + if integrator not in ("bdf", "etd", "hybrid"): + raise ValueError( + f"integrator must be 'bdf', 'etd', or 'hybrid', " + f"got '{integrator!r}'" + ) + if integrator == "etd" and order not in (1, 2): + raise ValueError( + f"integrator='etd' supports order=1 (ETD-1, default-recommended) " + f"or order=2 (ETD-2; avoid in tight-yield VEP). " + f"Got order={order}." + ) + self._integrator = integrator + if integrator == "hybrid" and order != 1: + import warnings + warnings.warn( + f"integrator='hybrid' uses one stress history slot; " + f"``order`` is pinned to 1 (you passed order={order}).", + UserWarning, stacklevel=2, + ) + order = 1 + if integrator == "hybrid" and fault_weight is None: + raise ValueError( + "integrator='hybrid' requires a ``fault_weight`` sympy " + "expression in [0, 1] (1 inside fault → BDF; 0 outside → ETD)." + ) + self._fault_weight = fault_weight self._material_name = material_name @@ -2461,7 +2734,17 @@ def __init__(self, unknowns, order=1, material_name: str = None): self._order = order self._yield_mode = "softmin" self._yield_softness = 0.1 - self._bdf_blend = 0.5 + # BDF order-blending α ∈ [0, 1]. α=1 → pure BDF-2 (default); + # α=0 → pure BDF-1 coefficients; intermediate → linear blend. + # + # NOTE: TI-VEP at order=2 with a spatially varying ``yield_stress`` + # field (e.g. ``influence_function``-localised faults) is unstable + # for α > ~0.25. The recommended user-facing fix is ``order=1`` + # (the class default). This knob is left as an *explicit* damping + # option for users who must use order=2 — it doesn't paper over + # the bug at the default. Empirical stability threshold: + # α ≤ 0.25 stable, α ≥ 0.5 blow-up. Investigated 2026-04. 
+ self._bdf_blend = 1.0 self._max_dt_ratio_for_higher_order = 2.0 # Timestep (set by solver) @@ -2627,6 +2910,78 @@ def effective_order(self): return min(self._order, ddt_eff) return self._order + def _update_etd_coefficients(self): + """Refresh DDt's (α, φ) UWexpressions from τ_eff = η_1/μ.""" + if self.Unknowns.DFDt is None: + return + params = self.Parameters + if params.shear_modulus.sym is sympy.oo: + tau_eff = sympy.oo + else: + try: + eta_val = float(params.shear_viscosity_1.sym) + mu_val = float(params.shear_modulus.sym) + tau_eff = eta_val / mu_val if mu_val > 0 else sympy.oo + except (TypeError, ValueError): + tau_eff = None + try: + dt_val = ( + float(params.dt_elastic.sym) + if params.dt_elastic.sym is not sympy.oo + else None + ) + except (TypeError, ValueError): + dt_val = None + self.Unknowns.DFDt.update_exp_coefficients(dt_val, tau_eff) + + def _update_history_coefficients(self): + r"""Pre-solve hook — dispatches BDF, ETD (order 1/2), or hybrid. + + BDF: refresh ``_bdf_c0..c3``. ETD-2 (order=2): refresh ``α, φ`` + on the DDt from ``η_1/μ``. ETD-1 (order=1): same plus force + ``φ = α`` so the ``(φ-α)·ε̇*`` history term vanishes. Hybrid: + refresh both BDF and ETD-2 — flux uses both per-spatial weight. + """ + if self._integrator == "etd": + self._update_etd_coefficients() + if self._order == 1: + # ETD-1 reduction: φ = α zeros (φ-α)·ε̇* and turns + # (1-φ)·ε̇ into (1-α)·ε̇. + DDt = self.Unknowns.DFDt + if DDt is not None: + DDt._exp_phi.sym = DDt._exp_alpha.sym + elif self._integrator == "hybrid": + # Hybrid uses BOTH integrators with spatial blend; update + # both coefficient sets each step. + self._update_bdf_coefficients() + self._update_etd_coefficients() + else: + self._update_bdf_coefficients() + + def _update_history_post_solve(self): + """Post-solve hook — refresh forcing_star for ETD-2 / hybrid. + + ETD-1 (order=1) doesn't need this; the (φ-α)·ε̇* term zeros out. 
+ """ + needs_forcing = ( + (self._integrator == "etd" and self._order == 2) + or self._integrator == "hybrid" + ) + if needs_forcing and self.Unknowns.DFDt is not None: + if self.Unknowns.DFDt.forcing_star is not None: + self.Unknowns.DFDt.update_forcing_history(forcing_fn=self.Unknowns.E) + + @property + def stress_history_ddt_kwargs(self): + """ETD-2 (order=2) and hybrid need the forcing-history slot; + BDF and ETD-1 do not. + """ + if self._integrator == "hybrid": + return {"with_forcing_history": True} + if self._integrator == "etd" and self._order == 2: + return {"with_forcing_history": True} + return {} + def _update_bdf_coefficients(self): """Update BDF coefficient UWexpressions with blending.""" order = self.effective_order @@ -2646,19 +3001,24 @@ def _update_bdf_coefficients(self): pass coeffs = _bdf_coefficients(order, dt_current, dt_history) - - alpha = self._bdf_blend - if 0 < alpha < 1 and order >= 2: - coeffs_o1 = _bdf_coefficients(1, dt_current, dt_history) - while len(coeffs_o1) < len(coeffs): - coeffs_o1.append(sympy.Integer(0)) - coeffs = [ - (1 - alpha) * c1 + alpha * ck - for c1, ck in zip(coeffs_o1, coeffs) - ] else: coeffs = _bdf_coefficients(order, None, []) + # BDF order blending — see ``self._bdf_blend`` docstring. + # Linear mix of BDF-1 and the requested-order coefficients. + # α=0 → pure BDF-1; α=1 → no blend (skip). 
+ alpha = self._bdf_blend + if alpha < 1 and order >= 2: + coeffs_o1 = _bdf_coefficients(1, dt_current, dt_history) \ + if (self.Unknowns is not None and self.Unknowns.DFDt is not None) \ + else _bdf_coefficients(1, None, []) + while len(coeffs_o1) < len(coeffs): + coeffs_o1.append(sympy.Integer(0)) + coeffs = [ + (1 - alpha) * c1 + alpha * ck + for c1, ck in zip(coeffs_o1, coeffs) + ] + while len(coeffs) < 4: coeffs.append(sympy.Integer(0)) @@ -2667,6 +3027,32 @@ def _update_bdf_coefficients(self): self._bdf_c2.sym = coeffs[2] self._bdf_c3.sym = coeffs[3] + @property + def bdf_blend(self): + r"""BDF coefficient blending α ∈ [0, 1]. **Damping knob.** + + Linearly mixes BDF-1 and the requested-order coefficients: + ``c = (1-α)·c_BDF1 + α·c_requested_order``. α=1 is no blend. + + **For TI-VEP fault simulations, prefer ``order=1`` over tuning + this knob.** At ``order=2`` with a spatially varying + ``yield_stress`` field, the simulation drifts unstably: + |σ_xy| → 10⁸ over ~10 t_r. Empirically the instability is + gated by the magnitude of the ψ*_{n-1} weight: α ≤ 0.25 stable, + α ≥ 0.5 blow-up. Lower α values stabilise but throw away + most of BDF-2's accuracy advantage — at α=0.10 the trace + difference vs ``order=1`` is ~0.1% of peak, for ~50% wall-time + overhead. Use this knob only if you specifically need + order-2 behaviour on a problem with uniform yield_stress. 
+ """ + return self._bdf_blend + + @bdf_blend.setter + def bdf_blend(self, value): + if not (0.0 <= float(value) <= 1.0): + raise ValueError(f"bdf_blend must be in [0, 1], got {value}") + self._bdf_blend = float(value) + @property def stress_star(self): r"""Previous timestep stress from history.""" @@ -2674,18 +3060,52 @@ def stress_star(self): self._stress_star.sym = self.Unknowns.DFDt.psi_star[0].sym return self._stress_star - @property - def E_eff(self): - r"""Effective strain rate including elastic history.""" + def _e_eff_for(self, integrator_mode): + r"""Build E_eff for a given integrator mode (without storing on + ``self._E_eff``). Used by both the public :py:attr:`E_eff` and + the hybrid ``stress()`` path which needs both forms in one + evaluation. + """ E = self.Unknowns.E + if self.Unknowns.DFDt is None or not self.is_elastic: + return E + DDt = self.Unknowns.DFDt + + if integrator_mode == "etd": + alpha = DDt._exp_alpha + phi = DDt._exp_phi + sigma_star = DDt.psi_star[0].sym + if DDt.forcing_star is not None: + edot_star = DDt.forcing_star.sym + else: + edot_star = sympy.zeros(*E.shape) + eta_1 = self.Parameters.shear_viscosity_1 + return ( + (1 - phi) * E + + (alpha / (2 * eta_1)) * sigma_star + + (phi - alpha) * edot_star + ) - if self.Unknowns.DFDt is not None and self.is_elastic: - mu_dt = self.Parameters.dt_elastic * self.Parameters.shear_modulus - bdf_cs = [self._bdf_c1, self._bdf_c2, self._bdf_c3] - for i in range(self.Unknowns.DFDt.order): - E += -bdf_cs[i] * self.Unknowns.DFDt.psi_star[i].sym / (2 * mu_dt) + # BDF default + mu_dt = self.Parameters.dt_elastic * self.Parameters.shear_modulus + bdf_cs = [self._bdf_c1, self._bdf_c2, self._bdf_c3] + out = E + for i in range(DDt.order): + out = out - bdf_cs[i] * DDt.psi_star[i].sym / (2 * mu_dt) + return out - self._E_eff.sym = E + @property + def E_eff(self): + r"""Effective strain rate including elastic-history coupling. + + BDF: ``E_eff = ε̇ - Σc_i·σ*/(2μΔt)``. 
+ ETD-2: ``E_eff = (1-φ)·ε̇ + α·σ*/(2η₁) + (φ-α)·ε̇*``. + Hybrid: returns the BDF form (E_eff is consumed by yield-clip + code that should see the BDF rate metric); the actual flux uses + both forms inside :py:meth:`stress`. + """ + mode = "bdf" if self._integrator in ("bdf", "hybrid") else "etd" + self._E_eff.sym = self._e_eff_for(mode) return self._E_eff @property @@ -2699,9 +3119,9 @@ def E_eff_inv_II(self): def viscosity(self): r"""Effective viscosity for the fault-plane shear component. - Applies the yield mode (smooth/softmin/min/harmonic) to η₁, - leaving η₀ (bulk) unchanged. The anisotropic tensor handles - the directional dependence. + Applies the yield mode (softmin/min/harmonic) to η₁, leaving + η₀ (bulk) unchanged. The anisotropic tensor handles the + directional dependence. """ inner_self = self.Parameters @@ -2715,9 +3135,6 @@ def viscosity(self): vp_eff = self._plastic_effective_viscosity if self._yield_mode == "harmonic": eta_1_eff = 1 / (1 / eta_1_eff + 1 / vp_eff) - elif self._yield_mode == "smooth": - f = eta_1_eff / vp_eff - eta_1_eff = eta_1_eff * (1 + f) / (1 + f + f**2) elif self._yield_mode == "softmin": delta = self._yield_softness f = eta_1_eff / vp_eff @@ -2777,59 +3194,63 @@ def _plastic_effective_viscosity(self): return viscosity_yield - def _build_c_tensor(self): - """Build the anisotropic tensor with VE effective viscosities. - - Both η₀ and η₁ are replaced by their VE effective values: - η₀_ve = η₀·μ·dt / (c₀·η₀ + μ·dt) - η₁_ve = η₁·μ·dt / (c₀·η₁ + μ·dt) - Then η₁_ve is further yield-limited to η₁_eff. This ensures - Δ = η₀_ve - η₁_eff = 0 when η₁ = η₀ and yield is inactive. + def _eta_for_tensor(self, integrator_mode, apply_yield): + """Return ``(eta_0, eta_1_eff)`` for tensor build, parameterised + by integrator mode and whether to apply yield clipping. + + - ``integrator_mode='bdf'``: η₀, η₁ are VE-effective (c₀-baked + Δt scaling — needed for BDF's E_eff structure). 
+ - ``integrator_mode='etd'``: η₀, η₁ are raw (time factor lives + in α/φ symbolically). Used for both ETD-1 and ETD-2 — the + C tensor is identical; only the symbolic E_eff differs (via + ``self._order``). + - ``apply_yield=True``: softmin/min/harmonic clip on η₁_eff. + - ``apply_yield=False``: no clipping (use this for the ETD-VE + branch of the hybrid integrator, where the bulk is + structurally non-yieldable so clipping is a no-op anyway). """ + if integrator_mode == "etd": + eta_0 = self.Parameters.shear_viscosity_0.sym + eta_1_eff = self.Parameters.shear_viscosity_1 + else: # bdf + eta_0_raw = self.Parameters.shear_viscosity_0 + mu = self.Parameters.shear_modulus + dt_e = self.Parameters.dt_elastic + c0 = self._bdf_c0 + mu_val = mu.sym if hasattr(mu, 'sym') else mu + if mu_val is sympy.oo: + eta_0 = eta_0_raw.sym if hasattr(eta_0_raw, 'sym') else eta_0_raw + else: + eta_0 = eta_0_raw * mu * dt_e / (c0 * eta_0_raw + mu * dt_e) + eta_1_eff = self.Parameters.ve_effective_viscosity - if self._is_setup: - return - - d = self.dim - - # η₀: VE effective (no yield) - eta_0_raw = self.Parameters.shear_viscosity_0 - mu = self.Parameters.shear_modulus - dt_e = self.Parameters.dt_elastic - c0 = self._bdf_c0 - - mu_val = mu.sym if hasattr(mu, 'sym') else mu - if mu_val is sympy.oo: - eta_0 = eta_0_raw.sym if hasattr(eta_0_raw, 'sym') else eta_0_raw - else: - eta_0 = eta_0_raw * mu * dt_e / (c0 * eta_0_raw + mu * dt_e) - - # η₁: VE effective + yield limited - eta_1_eff = self.Parameters.ve_effective_viscosity - - if self.is_viscoplastic: + if apply_yield and self.is_viscoplastic: vp_eff = self._plastic_effective_viscosity if self._yield_mode == "harmonic": eta_1_eff = 1 / (1 / eta_1_eff + 1 / vp_eff) - elif self._yield_mode == "smooth": - f = eta_1_eff / vp_eff - eta_1_eff = eta_1_eff * (1 + f) / (1 + f + f**2) elif self._yield_mode == "softmin": delta = self._yield_softness f = eta_1_eff / vp_eff - import math # float offset avoids sympy expression blowup in tensor + 
import math offset = (-1 + math.sqrt(1 + delta**2)) / 2 g = 1 + (f - 1 + sympy.sqrt((f - 1)**2 + delta**2)) / 2 - offset eta_1_eff = eta_1_eff / g else: eta_1_eff = sympy.Min(eta_1_eff, vp_eff) + return eta_0, eta_1_eff + + def _assemble_c_tensor(self, eta_0, eta_1_eff): + """Build the anisotropic rank-4 tensor from ``(eta_0, eta_1_eff)``. + Loop body identical to :py:meth:`_build_c_tensor_ve`; refactored + into a helper so the hybrid path can call it twice (BDF tensor + with yield clip, ETD tensor without) without code duplication. + """ + d = self.dim n = self.Parameters.director.sym Delta = eta_0 - eta_1_eff - identity = uw.maths.tensor.rank4_identity(d) lambda_mat = sympy.MutableDenseNDimArray.zeros(d, d, d, d) - for i in range(d): for j in range(d): for k in range(d): @@ -2848,9 +3269,31 @@ def _build_c_tensor(self): if hasattr(val, '__getitem__') and not isinstance(val, (sympy.MatrixBase, sympy.NDimArray)): val = sympy.Mul(sympy.S.One, val, evaluate=False) lambda_mat[i, j, k, l] = val - lambda_mat = uw.maths.tensor.rank4_to_mandel(lambda_mat, d) - self._c = uw.maths.tensor.mandel_to_rank4(lambda_mat, d) + return uw.maths.tensor.mandel_to_rank4(lambda_mat, d) + + def _build_c_tensor(self): + """Build the anisotropic tensor(s) for the active integrator. + + - ``'bdf'`` / ``'etd'``: single tensor ``self._c``. + - ``'hybrid'``: two tensors ``self._c_bdf`` (yield-clipped) and + ``self._c_etd`` (no clip — bulk is non-yieldable). The flux + blend ``w·σ_BDF + (1-w)·σ_ETD`` happens in :py:meth:`stress`. 
+ """ + if self._is_setup: + return + + if self._integrator == "hybrid": + eta_0_bdf, eta_1_bdf = self._eta_for_tensor("bdf", apply_yield=True) + self._c_bdf = self._assemble_c_tensor(eta_0_bdf, eta_1_bdf) + eta_0_etd, eta_1_etd = self._eta_for_tensor("etd", apply_yield=False) + self._c_etd = self._assemble_c_tensor(eta_0_etd, eta_1_etd) + self._c = self._c_bdf # default for any callers reading self._c + else: + eta_0, eta_1_eff = self._eta_for_tensor( + self._integrator, apply_yield=self.is_viscoplastic + ) + self._c = self._assemble_c_tensor(eta_0, eta_1_eff) self._is_setup = True self._solver_is_setup = False @@ -2921,9 +3364,25 @@ def stress(self): C(η₁_eff) handles anisotropy; the history uses the same η₁_eff as a scalar multiplier (consistent with how isotropic VEP uses self.viscosity for both). + + Hybrid path: ``σ = w·σ_BDF + (1-w)·σ_ETD`` with the spatial + weight from ``self._fault_weight``. Each branch contracts its + own E_eff with its own C-tensor; the blend lives at the flux + level so neither integrator's structure is compromised. """ self._build_c_tensor() + if self._integrator == "hybrid": + # σ_BDF: BDF flux with yield-clipped C tensor + edot_eff_bdf = self._e_eff_for("bdf") + sigma_bdf = self._q_with(self._c_bdf, edot_eff_bdf) + # σ_ETD: ETD flux with no-yield C tensor + edot_eff_etd = self._e_eff_for("etd") + sigma_etd = self._q_with(self._c_etd, edot_eff_etd) + # Spatial blend + w = self._fault_weight + return w * sigma_bdf + (1 - w) * sigma_etd + # Apply the anisotropic tensor to the effective strain rate # (current + VE history): σ = C(η₀_ve, η₁_eff) : ε̇_eff # This is the correct VE formula — the tensor handles anisotropy @@ -2933,19 +3392,44 @@ def stress(self): return stress + def _q_with(self, c, edot): + """Apply a given rank-4 tensor to a strain rate (helper for + the hybrid flux that needs to contract two distinct C tensors + in one ``stress()`` call). 
+ """ + rank = len(c.shape) + if rank == 2: + flux = c * edot + else: + flux = sympy.tensorcontraction( + sympy.tensorcontraction(sympy.tensorproduct(c, edot), (1, 5)), + (0, 3), + ) + return sympy.Matrix(flux) + @property def yield_mode(self): r"""How to apply yield limiting to the fault-plane viscosity. Same options as :class:`ViscoElasticPlasticFlowModel`: - ``"smooth"`` (default), ``"softmin"``, ``"harmonic"``, ``"min"``. + ``"softmin"`` (default), ``"harmonic"``, ``"min"``. The + ``"smooth"`` option was retired (under-clipped by ~50 %); see + the parent class's :attr:`yield_mode` docstring for details. """ return self._yield_mode @yield_mode.setter def yield_mode(self, value): - if value not in ("min", "harmonic", "smooth", "softmin"): - raise ValueError(f"yield_mode must be 'min', 'harmonic', 'smooth', or 'softmin', got '{value}'") + if value == "smooth": + raise ValueError( + "yield_mode='smooth' has been retired — it under-clipped " + "the yield surface by ~50%. Use 'softmin' instead " + "(default; close to exact Min with smooth derivatives)." + ) + if value not in ("min", "harmonic", "softmin"): + raise ValueError( + f"yield_mode must be 'min', 'harmonic', or 'softmin', got '{value}'" + ) self._yield_mode = value self._reset() @@ -2959,15 +3443,6 @@ def yield_softness(self, value): self._yield_softness = value self._reset() - @property - def bdf_blend(self): - """BDF coefficient blending: 0=pure O1, 0.5=default, 1=pure O2.""" - return self._bdf_blend - - @bdf_blend.setter - def bdf_blend(self, value): - self._bdf_blend = value - @property def requires_stress_history(self): """Transverse isotropic VEP requires stress history tracking.""" @@ -2983,6 +3458,407 @@ def plastic_fraction(self): return sympy.Max(0, 1 - eta_1_eff / eta_1_ve.sym if hasattr(eta_1_ve, 'sym') else 0) +class TransverseIsotropicMaxwellExponentialFlowModel(TransverseIsotropicVEPFlowModel): + r"""Thin alias: ``TransverseIsotropicVEPFlowModel(integrator='etd', order=1)``. + + .. 
deprecated:: Phase B + Use the canonical form ``TransverseIsotropicVEPFlowModel(unknowns, + integrator='etd', order=1)`` directly. This sibling class + survives as a thin scaffold for existing scripts; defaults to + ETD-1 (recommended). + """ + + def __init__(self, unknowns, material_name=None): + super().__init__( + unknowns, order=1, integrator="etd", + material_name=material_name, + ) + + +class TransverseIsotropicVEPSplitFlowModel(TransverseIsotropicVEPFlowModel): + r"""**EXPERIMENTAL — DO NOT USE FOR PRODUCTION.** + + Phase D investigation artefact. The σ_∥ enforcement reaches BDF-class + fidelity (1.21·τ_y vs BDF's 1.04·τ_y at τ_y=0.05) but the velocity + field overshoots the boundary value and ratchets monotonically over + cycles to ~21× BDF-1's |u_y|. The fault-tip stress concentrations + in the PyVista field plot are non-physical for this loading. + + Retained on the branch for reproducibility of the investigation; see + ``docs/developer/design/EXPONENTIAL_VE_INTEGRATOR.md`` lessons #9 and + #10 for why this doesn't ship. + + For deep-yield TI fault problems use + ``TransverseIsotropicVEPFlowModel(integrator='bdf')``. + + -- + + Phase D — per-component ``(α_⊥, φ_⊥)/(α_∥, φ_∥)`` ETD-2 for TI VEP. + + The rank-4 modulus splits into two orthogonal projectors: + + .. math:: + C(\eta_0, \eta_\parallel) = 2\eta_0 \, \mathbf{P}_\perp + + 2\eta_\parallel \, \mathbf{P}_\parallel + + where :math:`\mathbf{P}_\parallel` is the director-aligned projector + (the ``K`` kernel built in :py:meth:`_build_c_tensor`) and + :math:`\mathbf{P}_\perp = \mathbf{I}_4 - \mathbf{P}_\parallel`. + Each branch has its own Maxwell relaxation time: + + .. math:: + \tau_\perp = \eta_0 / \mu, \qquad + \tau_\parallel = \eta_\parallel^\text{eff} / \mu + + so the analytical exponential factors differ: + + .. math:: + \alpha_k = e^{-\Delta t / \tau_k}, \qquad + \varphi_k = (1 - \alpha_k) \tau_k / \Delta t + + The split flux integrates each branch independently and sums: + + .. 
math:: + \sigma^{n+1} = (\alpha_\perp \mathbf{P}_\perp + \alpha_\parallel + \mathbf{P}_\parallel) : \sigma^* + + 2[\eta_0(1-\varphi_\perp) \mathbf{P}_\perp + + \eta_\parallel^\text{eff}(1-\varphi_\parallel) + \mathbf{P}_\parallel] : \dot\varepsilon^{n+1} + + 2[\eta_0(\varphi_\perp - \alpha_\perp) \mathbf{P}_\perp + + \eta_\parallel^\text{eff}(\varphi_\parallel - \alpha_\parallel) + \mathbf{P}_\parallel] : \dot\varepsilon^* + + Phase B uses a single lumped ``(α, φ)`` from ``η_∥_eff/μ`` for the + whole tensor — empirically blows up at tight yield surfaces because + the matrix branch has no business being yielded. The split scheme + relaxes each channel on its proper timescale; the analytical floor + on ``σ_∥`` is then ``≲ τ_y`` by construction. + + Implementation choice: ``α_⊥, φ_⊥`` come from the DDt's existing + scalar ``_exp_coeffs`` (matrix viscosity is fixed and spatially + uniform — a single per-step scalar is right). ``α_∥, φ_∥`` are + inlined as sympy expressions of the yield-clipped ``η_∥_eff`` so + the JIT evaluates them per quadrature point (spatial heterogeneity + captured automatically). No DDt changes, no solver changes. + """ + + # Default cap on τ_∥/Δt — recommendation 4 from the practical + # stabilisation strategy. α_∥ ≥ exp(-1/c); c=1 → α_∥ ≥ 0.37 + # (37% of σ* retained each step, matches BDF-style elastic + # damping). Set to 0 to disable (recovers the un-capped behaviour + # where boundary motion goes straight into plastic slip). + _tau_par_cap_factor = 1.0 + + def __init__(self, unknowns, material_name=None): + super().__init__( + unknowns, order=1, integrator="etd", + material_name=material_name, + ) + + @property + def tau_par_cap_factor(self): + r"""Lower bound ``c`` such that ``τ_∥ ≥ c·Δt`` in the parallel + branch's exponential factor. + + Caps how aggressively the parallel-branch's elastic memory is + relaxed during yielding. 
Without this cap, ``η_∥_eff → 0`` at + deep yield drives ``α_∥ = exp(-Δt/τ_∥) → 0`` — boundary + motion goes straight into slip each step with no elastic + spring-back, ratcheting fault displacement at the BC rate. + With ``c=1`` (default), ``α_∥ ≥ 1/e``; with ``c=2``, + ``α_∥ ≥ exp(-0.5)``. + + Set to ``0`` to disable (recovers the explicit-plasticity + behaviour). The cap applies only to the (α_∥, φ_∥) factors — + ``C_∥`` keeps the natural yield-clipped η_∥ so ``σ_∥`` still + sits at the yield surface. + """ + return self._tau_par_cap_factor + + @tau_par_cap_factor.setter + def tau_par_cap_factor(self, value): + self._tau_par_cap_factor = float(value) + + def _update_history_coefficients(self): + r"""Update ``(α_⊥, φ_⊥)`` only — the matrix branch. + + ``(α_∥, φ_∥)`` are inlined per-quadrature in :py:meth:`stress`. + Picks ``τ_⊥ = η_0/μ`` (raw matrix viscosity). + """ + if self._integrator != "etd" or self.Unknowns.DFDt is None: + return super()._update_history_coefficients() + params = self.Parameters + if params.shear_modulus.sym is sympy.oo: + tau_perp = sympy.oo + else: + try: + eta_val = float(params.shear_viscosity_0.sym) + mu_val = float(params.shear_modulus.sym) + tau_perp = eta_val / mu_val if mu_val > 0 else sympy.oo + except (TypeError, ValueError): + tau_perp = None + try: + dt_val = ( + float(params.dt_elastic.sym) + if params.dt_elastic.sym is not sympy.oo + else None + ) + except (TypeError, ValueError): + dt_val = None + self.Unknowns.DFDt.update_exp_coefficients(dt_val, tau_perp) + + def _eta_par_eff(self): + """Yield-clipped ``η_∥_eff`` — same softmin/min/harmonic as parent. + + For ETD the base is the raw ``η_1`` (no VE pre-clip); the yield + envelope is then applied via the configured yield_mode. 
+ """ + params = self.Parameters + eta_par = params.shear_viscosity_1 + if hasattr(eta_par, 'sym'): + eta_par = eta_par.sym + + if not self.is_viscoplastic or params.yield_stress.sym is sympy.oo: + return eta_par + + vp_eff = self._plastic_effective_viscosity + if self._yield_mode == "harmonic": + return 1 / (1 / eta_par + 1 / vp_eff) + elif self._yield_mode == "softmin": + delta = self._yield_softness + f = eta_par / vp_eff + import math + offset = (-1 + math.sqrt(1 + delta**2)) / 2 + g = 1 + (f - 1 + sympy.sqrt((f - 1) ** 2 + delta ** 2)) / 2 - offset + return eta_par / g + else: + return sympy.Min(eta_par, vp_eff) + + def _eta_par_eff_lagged(self): + """Yield-clipped ``η_∥_eff`` using the **lagged** strain rate + (``forcing_star``, projected post-solve) instead of the current + ``E_eff``. + + Same softmin/harmonic/min envelope as :py:meth:`_eta_par_eff`, + but the plastic estimate ``vp_eff_lag = τ_y/(2|γ̇*|)`` uses the + *previous step's* fault-plane shear magnitude. This is the + proper "lag η_∥_eff": elastic regime → η_1_raw (low |γ̇*|); + yielded regime → ``τ_y/(2|γ̇*|)`` (saturated stress, large rate). + Using the parent's E_eff-based formula here would couple α_∥ + back into the current Newton iterate, which collapses to a + 1-iteration trivial residual (over-damping). Holding the rate + fixed at the post-solve value breaks that feedback. 
+ """ + params = self.Parameters + DDt = self.Unknowns.DFDt + + eta_par = params.shear_viscosity_1 + if hasattr(eta_par, 'sym'): + eta_par = eta_par.sym + + if not self.is_viscoplastic or params.yield_stress.sym is sympy.oo: + return eta_par + if DDt is None or DDt.forcing_star is None: + return self._eta_par_eff() # no history yet — fall back to current + + # Lagged plastic estimate: |γ̇*_∥| from forcing_star + director + Edot_lag = DDt.forcing_star.sym + n = params.director.sym + T_lag = Edot_lag * n + edot_n_lag = (n.T * T_lag)[0, 0] + T_sq_lag = (T_lag.T * T_lag)[0, 0] + gamma_dot_sq_lag = T_sq_lag - edot_n_lag ** 2 + gamma_dot_abs_lag = sympy.sqrt(sympy.Max(gamma_dot_sq_lag, 0)) + + # Strip Pint from the ε_min floor (forcing_star is unitless storage) + edot_min_raw = params.strainrate_inv_II_min.sym + if hasattr(edot_min_raw, 'magnitude'): + edot_min_val = float(edot_min_raw.magnitude) + else: + try: + edot_min_val = float(edot_min_raw) + except (TypeError, ValueError): + edot_min_val = 1.0e-6 + + tau_y_sym = params.yield_stress.sym + vp_eff_lag = tau_y_sym / ( + 2 * (gamma_dot_abs_lag + sympy.Float(edot_min_val)) + ) + + if self._yield_mode == "harmonic": + return 1 / (1 / eta_par + 1 / vp_eff_lag) + elif self._yield_mode == "softmin": + delta = self._yield_softness + f = eta_par / vp_eff_lag + import math + offset = (-1 + math.sqrt(1 + delta ** 2)) / 2 + g = 1 + (f - 1 + sympy.sqrt((f - 1) ** 2 + delta ** 2)) / 2 - offset + return eta_par / g + else: + return sympy.Min(eta_par, vp_eff_lag) + + def _build_split_c_tensors(self, eta_perp, eta_par): + r"""Build ``C_⊥ = 2·η_⊥·P_⊥`` and ``C_∥ = 2·η_∥·P_∥``. + + Identical loop structure to :py:meth:`_build_c_tensor`, but + each tensor isolates one projector by zeroing the other + viscosity coefficient. 
+ """ + d = self.dim + n = self.Parameters.director.sym + identity = uw.maths.tensor.rank4_identity(d) + + c_perp_arr = sympy.MutableDenseNDimArray.zeros(d, d, d, d) + c_par_arr = sympy.MutableDenseNDimArray.zeros(d, d, d, d) + + for i in range(d): + for j in range(d): + for k in range(d): + for l in range(d): + I_ijkl = identity[i, j, k, l] + K_ijkl = ( + (n[i] * n[k] * int(j == l) + + n[j] * n[k] * int(l == i) + + n[i] * n[l] * int(j == k) + + n[j] * n[l] * int(k == i)) / 2 + - 2 * n[i] * n[j] * n[k] * n[l] + ) + # 2·η_⊥·P_⊥ = 2·η_⊥·(I - K) + v_perp = 2 * eta_perp * (I_ijkl - K_ijkl) + # 2·η_∥·P_∥ = 2·η_∥·K + v_par = 2 * eta_par * K_ijkl + # Same guard as parent _build_c_tensor — sympy + # NDimArray.__setitem__ refuses iterable RHS. + if hasattr(v_perp, '__getitem__') and not isinstance( + v_perp, (sympy.MatrixBase, sympy.NDimArray) + ): + v_perp = sympy.Mul(sympy.S.One, v_perp, evaluate=False) + if hasattr(v_par, '__getitem__') and not isinstance( + v_par, (sympy.MatrixBase, sympy.NDimArray) + ): + v_par = sympy.Mul(sympy.S.One, v_par, evaluate=False) + c_perp_arr[i, j, k, l] = v_perp + c_par_arr[i, j, k, l] = v_par + + c_perp = uw.maths.tensor.mandel_to_rank4( + uw.maths.tensor.rank4_to_mandel(c_perp_arr, d), d) + c_par = uw.maths.tensor.mandel_to_rank4( + uw.maths.tensor.rank4_to_mandel(c_par_arr, d), d) + return c_perp, c_par + + def stress(self): + r"""Per-component ETD-2 flux with **lagged** ``(α_∥, φ_∥)``. + + Each branch's E_eff is built and contracted with its own + sub-modulus; the two are summed. + + ``α_∥, φ_∥`` are derived from a *lagged* parallel viscosity + computed from the projected stress and strain-rate histories: + + .. math:: + \eta_\parallel^{\,\mathrm{lag}} + = \frac{|\sigma^*_\parallel|} + {2\,\max(|\dot\varepsilon^*_\parallel|, \dot\varepsilon_\min)} + + where ``|·|_∥`` is the Pythagorean fault-plane shear magnitude + (same pattern as :py:meth:`_plastic_effective_viscosity`). 
This + sits naturally on the yield surface during yielding (because + ``|σ^*_∥|`` saturates near ``τ_y`` while ``|ε̇^*_∥|`` is large) + and tracks the elastic VE response otherwise. Critically the + expression depends only on previous-step storage — no + plasticity-clip recursion through ``E_eff`` — keeping the JIT + codegen tree shallow. + + The C_∥ sub-modulus still uses the **current** yield-clipped + ``η_∥_eff`` (preserves Newton's nonlinear plasticity Jacobian + through the multiplicative response weights). + """ + params = self.Parameters + DDt = self.Unknowns.DFDt + E = self.Unknowns.E + + # Matrix branch: scalar (α_⊥, φ_⊥) from DDt + alpha_perp = DDt._exp_alpha + phi_perp = DDt._exp_phi + eta_perp = params.shear_viscosity_0 + eta_perp_sym = eta_perp.sym + + # Histories + sigma_star = DDt.psi_star[0].sym + if DDt.forcing_star is not None: + edot_star = DDt.forcing_star.sym + else: + edot_star = sympy.zeros(*E.shape) + + # ── Lagged η_∥_eff via parent's softmin envelope, evaluated + # against forcing_star (previous-step ε̇) instead of E_eff + # (current Newton iterate). Breaks the per-quad-split's + # 1-iteration trivial-Newton failure mode. + eta_par_lag_chain = self._eta_par_eff_lagged() + if hasattr(eta_par_lag_chain, 'sym'): + eta_par_lagged = eta_par_lag_chain.sym + else: + eta_par_lagged = eta_par_lag_chain + + mu_sym = params.shear_modulus.sym + dt_sym = params.dt_elastic.sym if hasattr(params.dt_elastic, 'sym') else params.dt_elastic + # x_par = Δt/τ_∥ — natural value (no cap) + tau_par_natural = eta_par_lagged / mu_sym + x_par_natural = dt_sym / tau_par_natural + # Soft cap on x_par: x_eff = (1 - exp(-c·x))/c + # • x → 0: x_eff → x (elastic, no cap) + # • x → ∞: x_eff → 1/c (capped → α_∥ ≥ exp(-1/c)) + # • smooth derivatives everywhere; pre-evaluates to a finite + # scalar at codegen-time defaults (dt=∞, μ=∞, η=Pint) where + # x_natural = oo, exp(-c·oo) = 0, x_eff = 1/c — avoids the + # oo-vs-Pint dimensional clash that breaks sympy.Max/+ caps. 
+ # Equivalent to capping τ_∥ ≥ c·Δt (recommendation #4). + if self._tau_par_cap_factor > 0.0: + c = sympy.Float(self._tau_par_cap_factor) + x_par = (1 - sympy.exp(-c * x_par_natural)) / c + else: + x_par = x_par_natural + alpha_par = sympy.exp(-x_par) + phi_par = (1 - alpha_par) / x_par + + # C_∥ uses the natural lagged η (no cap) — preserves the + # σ_∥ ≈ τ_y enforcement on the yield surface. The cap only + # tames the elastic-memory factor (α_∥, φ_∥) so the parallel + # branch retains some spring-back instead of fully releasing + # in one step. Recovers BDF-style behaviour where boundary + # motion is partly absorbed by elastic accumulation rather + # than dumped entirely into slip. + eta_par_current = eta_par_lagged + + # Build the split sub-moduli with the lagged η_∥ + c_perp, c_par = self._build_split_c_tensors(eta_perp_sym, eta_par_current) + + # E_eff_⊥ = (1-φ_⊥)·ε̇ + α_⊥/(2η_⊥)·σ* + (φ_⊥-α_⊥)·ε̇* + e_eff_perp = ( + (1 - phi_perp) * E + + (alpha_perp / (2 * eta_perp)) * sigma_star + + (phi_perp - alpha_perp) * edot_star + ) + # E_eff_∥ uses lagged α_∥, φ_∥ but current η_∥_eff in the + # σ*-projection denominator (so the projected history reads + # off the same modulus the next step's flux is built on). + e_eff_par = ( + (1 - phi_par) * E + + (alpha_par / (2 * eta_par_current)) * sigma_star + + (phi_par - alpha_par) * edot_star + ) + + def _contract(c, x): + if len(c.shape) == 2: + return sympy.Matrix(c * x) + return sympy.Matrix(sympy.tensorcontraction( + sympy.tensorcontraction(sympy.tensorproduct(c, x), (1, 5)), + (0, 3), + )) + + return _contract(c_perp, e_eff_perp) + _contract(c_par, e_eff_par) + + class MultiMaterialConstitutiveModel(Constitutive_Model): r""" Multi-material constitutive model using level-set weighted flux averaging. 
diff --git a/src/underworld3/cython/petsc_generic_snes_solvers.pyx b/src/underworld3/cython/petsc_generic_snes_solvers.pyx index 1aa27dc5..57c20a56 100644 --- a/src/underworld3/cython/petsc_generic_snes_solvers.pyx +++ b/src/underworld3/cython/petsc_generic_snes_solvers.pyx @@ -534,6 +534,46 @@ class SolverBaseClass(uw_object): f" to investigate.\n" ) + def _snes_solve_with_retries(self, gvec, divergence_retries=0, verbose=False): + """Call self.snes.solve(None, gvec) with warm-start retries on divergence. + + Useful for nonlinear problems whose residual has kinks (e.g. VEP at + the yield surface Min/softmin cutoff) where Newton can land on a + bad iterate that trips DIVERGED_MAX_IT or DIVERGED_LINE_SEARCH. A + single warm-start re-solve from the just-computed iterate commonly + steps off the kink. + + The first solve is always performed. If the SNES converged reason is + negative, up to ``divergence_retries`` additional calls to + ``snes.solve(None, gvec)`` are made; ``gvec`` is retained between + calls so each retry is a warm start. Returns once the SNES reports + converged or the retry budget is exhausted. + + Parameters + ---------- + gvec : PETSc.Vec + Global solution vector (modified in place by snes.solve). + divergence_retries : int, default=0 + Maximum retries on DIVERGED. 0 preserves legacy behaviour. + Typically 1 is enough for VEP kink-related divergence. + verbose : bool, default=False + Log each retry on rank 0. 
+ """ + self.snes.solve(None, gvec) + if divergence_retries <= 0: + return + for _r in range(divergence_retries): + reason = self.snes.getConvergedReason() + if reason >= 0: + return + if verbose and uw.mpi.rank == 0: + print( + f"SNES DIVERGED (reason={reason}); " + f"warm-start retry {_r + 1}/{divergence_retries}", + flush=True, + ) + self.snes.solve(None, gvec) + @timing.routine_timer_decorator def _build(self, verbose: bool = False, @@ -2027,7 +2067,8 @@ class SNES_Scalar(SolverBaseClass): verbose: bool=False, debug: bool=False, debug_name: str=None, - time=None, ): + time=None, + divergence_retries: int=0, ): """ Solve the system of equations. @@ -2055,6 +2096,10 @@ class SNES_Scalar(SolverBaseClass): pointwise functions. Expressions using ``mesh.t`` evaluate at this time. Non-dimensionalised when scaling is active. Default: None (petsc_t unchanged). + divergence_retries : int, default=0 + If SNES reports DIVERGED after the solve, re-call it with warm + start up to this many times. A single retry rescues most VEP + yield-surface kink divergences. 0 preserves legacy behaviour. Returns ------- @@ -2133,7 +2178,7 @@ class SNES_Scalar(SolverBaseClass): self._update_constants() # solve - self.snes.solve(None, gvec) + self._snes_solve_with_retries(gvec, divergence_retries, verbose) lvec = self.dm.getLocalVec() cdef Vec clvec = lvec @@ -3021,6 +3066,7 @@ class SNES_Vector(SolverBaseClass): verbose=False, debug=False, debug_name=None, + divergence_retries: int=0, ): """ Solve the vector field system of equations. @@ -3041,6 +3087,9 @@ class SNES_Vector(SolverBaseClass): Enable debug output. debug_name : str, optional Name prefix for debug output files. + divergence_retries : int, default=0 + If SNES reports DIVERGED after the solve, re-call it with warm + start up to this many times. 0 preserves legacy behaviour. 
Returns ------- @@ -3119,7 +3168,7 @@ class SNES_Vector(SolverBaseClass): self._update_constants() # solve - self.snes.solve(None,gvec) + self._snes_solve_with_retries(gvec, divergence_retries, verbose) lvec = self.dm.getLocalVec() cdef Vec clvec = lvec @@ -3765,11 +3814,18 @@ class SNES_MultiComponent(SolverBaseClass): verbose=False, debug=False, debug_name=None, + divergence_retries: int=0, ): """Solve the multi-component SNES problem. Collective across all MPI ranks. The solution is written back to ``self.u.vec`` and made available through ``self.u.array``. + + Parameters + ---------- + divergence_retries : int, default=0 + If SNES reports DIVERGED after the solve, re-call it with warm + start up to this many times. 0 preserves legacy behaviour. """ if _force_setup: @@ -3796,7 +3852,7 @@ class SNES_MultiComponent(SolverBaseClass): self._update_constants() - self.snes.solve(None, gvec) + self._snes_solve_with_retries(gvec, divergence_retries, verbose) lvec = self.dm.getLocalVec() cdef Vec clvec = lvec @@ -5007,10 +5063,17 @@ class SNES_Stokes_SaddlePt(SolverBaseClass): U = sympy.Array(self.u.sym).reshape(dim) P = sympy.Array(self.p.sym).reshape(1) + # Optional override: differentiate an alternative F1 to build the + # uu and up Jacobian blocks while leaving the residual F1 + # unchanged. Used for inexact Newton (e.g. softmin Jacobian with + # Min residual at a yield kink). When None, autodiff F1 itself. 
+ F1_jac_src = getattr(self, "_F1_jacobian_source", None) + F1_for_jac = sympy.Array(F1_jac_src) if F1_jac_src is not None else F1 + G0 = sympy.derive_by_array(F0, self.u.sym) G1 = sympy.derive_by_array(F0, self.Unknowns.L) - G2 = sympy.derive_by_array(F1, self.u.sym) - G3 = sympy.derive_by_array(F1, self.Unknowns.L) + G2 = sympy.derive_by_array(F1_for_jac, self.u.sym) + G3 = sympy.derive_by_array(F1_for_jac, self.Unknowns.L) # reorganise indices from sympy to petsc orssdering / reshape to Matrix form # ijkl -> LJKI (hence 3120) @@ -5035,8 +5098,8 @@ class SNES_Stokes_SaddlePt(SolverBaseClass): G0 = sympy.derive_by_array(F0, self.p.sym) G1 = sympy.derive_by_array(F0, self._G) - G2 = sympy.derive_by_array(F1, self.p.sym) - G3 = sympy.derive_by_array(F1, self._G) + G2 = sympy.derive_by_array(F1_for_jac, self.p.sym) + G3 = sympy.derive_by_array(F1_for_jac, self._G) self._up_G0 = sympy.ImmutableMatrix(G0.reshape(dim)) # zero in tests self._up_G1 = sympy.ImmutableMatrix(sympy.permutedims(G1, permutation).reshape(dim,dim)) # zero in stokes tests @@ -5637,7 +5700,8 @@ class SNES_Stokes_SaddlePt(SolverBaseClass): debug=False, debug_name=None, _force_setup: bool =False, - time=None, ): + time=None, + divergence_retries: int = 0, ): """ Solve the Stokes system for velocity and pressure. @@ -5669,6 +5733,10 @@ class SNES_Stokes_SaddlePt(SolverBaseClass): pointwise residual and Jacobian functions. Expressions using ``mesh.t`` will evaluate at this time. Non-dimensionalised automatically when scaling is active. Default: None (petsc_t=0). + divergence_retries : int, default=0 + If the final SNES solve reports DIVERGED, re-call it with warm + start up to this many times. A single retry rescues most VEP + yield-surface kink divergences. 0 preserves legacy behaviour. 
Returns ------- @@ -5787,7 +5855,7 @@ class SNES_Stokes_SaddlePt(SolverBaseClass): self.petsc_options.setValue("snes_max_it", snes_max_it) self.snes.setFromOptions() self._attach_stokes_nullspace() - self.snes.solve(None, gvec) + self._snes_solve_with_retries(gvec, divergence_retries, verbose) else: # Standard Newton solve @@ -5797,7 +5865,7 @@ class SNES_Stokes_SaddlePt(SolverBaseClass): self.petsc_options.setValue("snes_max_it", snes_max_it) self.snes.setFromOptions() self._attach_stokes_nullspace() - self.snes.solve(None, gvec) + self._snes_solve_with_retries(gvec, divergence_retries, verbose) cdef DM dm = self.dm cdef Vec clvec = self.dm.getLocalVec() diff --git a/src/underworld3/systems/ddt.py b/src/underworld3/systems/ddt.py index 289c4758..39bf5692 100644 --- a/src/underworld3/systems/ddt.py +++ b/src/underworld3/systems/ddt.py @@ -276,6 +276,65 @@ def _update_am_values(coeffs, effective_order, theta=0.5): coeffs[i].sym = 0.0 +def _create_exp_coefficients(instance_id): + """Create UWexpression objects for the ETD-2 coefficients ``[α, φ]``. + + These are named to render as ``α_exp`` and ``φ_exp`` (with the + DDt instance id appended) so they remain visually distinct from + BDF/AM coefficients in symbolic output. + """ + alpha = _UWexpression( + rf"{{\alpha^{{\mathrm{{exp}}}}_{{[{instance_id}]}}}}", + sym=0.0, + description=f"Exp integrator α = exp(-Δt/τ) (DDt instance {instance_id})", + _unique_name_generation=True, + ) + phi = _UWexpression( + rf"{{\varphi^{{\mathrm{{exp}}}}_{{[{instance_id}]}}}}", + sym=0.0, + description=f"Exp integrator φ = (1-α)/(Δt/τ) (DDt instance {instance_id})", + _unique_name_generation=True, + ) + return [alpha, phi] + + +def _update_exp_values(coeffs, dt, tau_eff): + r"""Update exponential-integrator coefficient values for current state. + + Computes :math:`\alpha = e^{-\Delta t/\tau_\mathrm{eff}}` and + :math:`\varphi = (1-\alpha)/(\Delta t/\tau_\mathrm{eff})` and stores + them in ``coeffs[0]``, ``coeffs[1]`` respectively. 
The viscous limit + (:math:`\Delta t/\tau \to \infty`) gives ``α=0, φ=0``; the elastic + limit (:math:`\Delta t/\tau \to 0`) gives ``α=1, φ=1``. + + Parameters + ---------- + coeffs : list of UWexpression + Two-element list ``[α, φ]`` to update. + dt : float or None + Current timestep. + tau_eff : float or None + Maxwell relaxation time (η_eff/μ). When None or non-positive, + defaults to the viscous limit. + """ + dt_f = _as_float(dt) + tau_f = _as_float(tau_eff) + if dt_f is None or tau_f is None or tau_f <= 0.0 or dt_f <= 0.0: + alpha, phi = 0.0, 0.0 # viscous limit + else: + x = dt_f / tau_f + if x < 1e-12: + alpha, phi = 1.0, 1.0 # elastic limit + elif x > 50.0: + alpha = 0.0 + phi = 1.0 / x # well-defined small phi, exact for large x + else: + alpha = float(np.exp(-x)) + phi = (1.0 - alpha) / x + coeffs[0].sym = alpha + coeffs[1].sym = phi + + def _build_weighted_sum(coeffs, psi_fn, psi_star_syms): """Build a fixed-structure weighted sum: c0*psi + c1*psi_star[0] + ... @@ -404,9 +463,14 @@ def __init__( # BDF/AM coefficient UWexpressions — routed through PetscDS constants[] self._bdf_coeffs = _create_coefficients(order, r"c^{\mathrm{BDF}}", self.instance_number) self._am_coeffs = _create_coefficients(order, r"a^{\mathrm{AM}}", self.instance_number) - # Initialise to order-1 values + # ETD-2 (exponential) coefficients [α, φ] for Maxwell-relaxation integration. + # Treated as a peer to the BDF/AM coefficient sets; values are pushed via + # PetscDSSetConstants every step in update_exp_coefficients(). 
+ self._exp_coeffs = _create_exp_coefficients(self.instance_number) + # Initialise to order-1 / viscous values _update_bdf_values(self._bdf_coeffs, 1, None, []) _update_am_values(self._am_coeffs, 1, self.theta) + _update_exp_values(self._exp_coeffs, None, None) return @@ -560,6 +624,18 @@ def adams_moulton_flux(self, order: Optional[int] = None): """ return _build_weighted_sum(self._am_coeffs, self.psi_fn, self.psi_star) + def update_exp_coefficients(self, dt, tau_eff): + r"""Update the ETD-2 (exponential) coefficient values for this step. + + Sets ``self._exp_coeffs[0].sym = α`` and ``self._exp_coeffs[1].sym = φ`` + from current ``dt`` and ``tau_eff`` (Maxwell relaxation time + :math:`\tau = \eta_\mathrm{eff}/\mu`). Called by the constitutive + model (which owns τ_eff) before each solve, peer to the BDF/AM + coefficient updates that happen automatically in + ``update_pre_solve``. + """ + _update_exp_values(self._exp_coeffs, dt, tau_eff) + class Eulerian(uw_object): r""" @@ -700,9 +776,11 @@ def __init__( # BDF/AM coefficient UWexpressions — routed through PetscDS constants[] self._bdf_coeffs = _create_coefficients(order, r"c^{\mathrm{BDF}}", self.instance_number) self._am_coeffs = _create_coefficients(order, r"a^{\mathrm{AM}}", self.instance_number) - # Initialise to order-1 values + self._exp_coeffs = _create_exp_coefficients(self.instance_number) + # Initialise to order-1 / viscous values _update_bdf_values(self._bdf_coeffs, 1, None, []) _update_am_values(self._am_coeffs, 1, self.theta) + _update_exp_values(self._exp_coeffs, None, None) return @@ -834,6 +912,49 @@ def initialise_history(self): self._history_initialised = True return + def set_initial_history(self, values, dt=None): + r"""Plant history values for BDF restart or analytical IC. + + Bypasses the automatic ``effective_order`` ramp so the very + first solve runs at the full BDF order rather than starting at + BDF-1. Use this when you have known values at :math:`t` and + past times — e.g. 
an analytical periodic solution, or a + checkpointed history loaded from disk. + + Parameters + ---------- + values : sequence of length ``self.order`` + ``values[k]`` is :math:`\psi` at :math:`t - k\,\Delta t`, + i.e. ``values[0]`` is the current state. Each entry must + be assignable to ``psi_star[k].array`` — either an array + of matching shape, or a scalar that broadcasts. + dt : float, optional + Uniform timestep assumed between history slots. Required + for ``order >= 2`` to seed correct multistep coefficients + on the first solve. Ignored for ``order = 1``. + """ + if len(values) != self.order: + raise ValueError( + f"set_initial_history requires {self.order} value(s) " + f"(one per history slot, including the current state); " + f"got {len(values)}." + ) + for k, val in enumerate(values): + self.psi_star[k].array[...] = val + self._history_initialised = True + self._n_solves_completed = self.order + if dt is not None: + self._dt_history = [float(dt)] * self.order + elif self.order >= 2: + import warnings + warnings.warn( + "set_initial_history called with order >= 2 but no " + "dt — variable-dt BDF coefficients will be wrong on " + "the first solve. Pass dt= to suppress.", + stacklevel=2, + ) + return + def initiate_history_fn(self): """Deprecated: use ``initialise_history`` instead.""" self.initialise_history() @@ -956,6 +1077,10 @@ def adams_moulton_flux(self, order=None): """ return _build_weighted_sum(self._am_coeffs, self.psi_fn, [ps.sym for ps in self.psi_star]) + def update_exp_coefficients(self, dt, tau_eff): + r"""Update the ETD-2 (exponential) coefficient values for this step.""" + _update_exp_values(self._exp_coeffs, dt, tau_eff) + class SemiLagrangian(uw_object): r""" @@ -1005,6 +1130,15 @@ class SemiLagrangian(uw_object): Smoothing parameter for projections. preserve_moments : bool, default=False Use moment-preserving projection (experimental). 
+ with_forcing_history : bool, default=False + When True, allocate an additional ``forcing_star`` MeshVariable + (matching ``psi_star[0]``'s shape, vtype, degree, continuity) to + store one history slot for the strain-rate forcing. Required by + ETD-2 exponential integration of the Maxwell relaxation operator; + ignored for BDF/AM. Populated each step via + :meth:`update_forcing_history` (direct nodal evaluation of + ``forcing_fn`` — typically the constitutive model's strain-rate + symbol). Notes ----- @@ -1037,6 +1171,7 @@ def __init__( order=1, smoothing=0.0, preserve_moments=False, + with_forcing_history: bool = False, ): super().__init__() @@ -1049,6 +1184,16 @@ def __init__( self.V_fn = V_fn self.order = order self.preserve_moments = preserve_moments + self.with_forcing_history = with_forcing_history + + # Forcing-history storage. Allocated only if requested. Populated + # each step via update_forcing_history(forcing_fn) — used by ETD-2 + # exponential integration of the Maxwell relaxation operator to + # supply the ε̇ⁿ history term in the constitutive flux. + self.forcing_star = None + self._forcing_fn = None # set by the constitutive model + self._forcing_vtype = None + self._forcing_indep_indices = None # History tracking: deferred initialization and effective order self._history_initialised = False @@ -1056,6 +1201,18 @@ def __init__( self._dt = None # current timestep (set by solver or update_pre_solve) self._dt_history = [None] * order # previous timesteps for variable-dt BDF + # Source snapshot machinery (opt-in via enable_source_snapshot()). + # Used when psi_fn references psi_star[0] itself (e.g. VE/VEP stress + # history where flux = 2·viscosity·E_eff and E_eff contains psi_star[0] + # via its history term). Without a snapshot the projection becomes + # implicit in psi_star[0] and Min-mode at yield admits the wrong fixed + # point. 
With snapshot, psi_star[0] symbols in the source are + # substituted with a frozen snapshot variable that's refreshed each + # step from psi_star[0]'s data array. The projection becomes a true + # one-shot Galerkin projection. + self._psi_snapshot_enabled = False + self._psi_snapshot = None + if swarm_degree is None: self.swarm_degree = degree else: @@ -1106,12 +1263,40 @@ def __init__( ) ) - # BDF/AM coefficient UWexpressions — routed through PetscDS constants[] + # Forcing-history slot (only allocated when ETD-2 / exponential + # integration is engaged). Mirrors psi_star[0] in shape/vtype/ + # discretisation; populated each step in update_forcing_history() + # via direct nodal evaluation of forcing_fn (typically the model's + # strain-rate symbol). + # + # Units: deliberately ``units=None``. The forcing field is the + # strain rate (1/time), distinct from psi_star's stress units + # (Pa·s × ε̇ = Pa). We don't know strain-rate units at construction + # time (forcing_fn is supplied later by the constitutive model). + # ``update_forcing_history`` non-dimensionalises the evaluated + # forcing before storing, matching the codebase convention that + # variable storage holds non-dimensional values internally and + # units are re-attached at the .data interface. 
+ self._forcing_vtype = vtype + if with_forcing_history: + self.forcing_star = uw.discretisation.MeshVariable( + f"forcing_star_sl_{self.instance_number}", + self.mesh, + vtype=vtype, + degree=self.degree, + continuous=self.continuous, + varsymbol=rf"{{ {varsymbol}_{{F}}^{{ * }} }}", + units=None, + ) + + # BDF/AM/exp coefficient UWexpressions — routed through PetscDS constants[] self._bdf_coeffs = _create_coefficients(order, r"c^{\mathrm{BDF}}", self.instance_number) self._am_coeffs = _create_coefficients(order, r"a^{\mathrm{AM}}", self.instance_number) - # Initialise to order-1 values + self._exp_coeffs = _create_exp_coefficients(self.instance_number) + # Initialise to order-1 / viscous values _update_bdf_values(self._bdf_coeffs, 1, None, []) _update_am_values(self._am_coeffs, 1, 0.5) + _update_exp_values(self._exp_coeffs, None, None) # Working variable that has a potentially different discretisation from psi_star # We project from this to psi_star and we use this variable to define the @@ -1204,16 +1389,101 @@ def psi_fn(self): @psi_fn.setter def psi_fn(self, new_fn): - """Set the tracked expression.""" + """Set the tracked expression and propagate to the projection's source. + + When :meth:`enable_source_snapshot` has been called, ``psi_star[0]`` + symbols in ``new_fn`` are transparently substituted with the + snapshot variable's symbols before the source is pushed to the + projection solver — so the projection becomes a true one-shot + Galerkin projection regardless of whether ``new_fn`` references + ``psi_star[0]``. + """ self._psi_fn = new_fn + self._psi_star_projection_solver.uw_function = self._build_projection_source(new_fn) + return + + def _build_projection_source(self, source_fn): + """Construct the row matrix used as the projection's ``uw_function``. + + Applies snapshot substitution (psi_star[0] → snap) when enabled. + Used by both ``psi_fn.setter`` and the ``initialise_history`` + fallback path so substitution semantics are consistent. 
+ """ if getattr(self, '_psi_star_use_multicomponent', False): import sympy indep = self._psi_star_indep_indices - row = sympy.Matrix([[new_fn[i, j] for (i, j) in indep]]) - self._psi_star_projection_solver.uw_function = row + row = sympy.Matrix([[source_fn[i, j] for (i, j) in indep]]) + if self._psi_snapshot_enabled and self._psi_snapshot is not None: + ps0 = self.psi_star[0] + psi_snapshot = self._psi_snapshot + substitutions = { + ps0.sym[i, j]: psi_snapshot.sym[i, j] + for i in range(self.mesh.dim) + for j in range(self.mesh.dim) + } + row = row.subs(substitutions) + return row else: - self._psi_star_projection_solver.uw_function = self._psi_fn - return + # Scalar / vector path: psi_star[0] is a scalar/vector field. If + # snapshot is needed for these vtypes, extend here similarly. + return source_fn + + def enable_source_snapshot(self): + """Enable snapshot substitution in the projection's source field. + + Call this once when the source expression (``psi_fn``) references + ``psi_star[0]`` itself — without it the projection's residual + ``(target − flux(psi_star[0]))·weight`` is implicit in the target + because target and source share the same data field. With Min-mode + plasticity at the yield kink, the implicit projection admits two + fixed points (elastic and yield branches); under timestep change the + iteration drifts to the elastic-branch fixed point and σ violates + the yield surface. + + The snapshot is a separate mesh variable matching ``psi_star[0]``'s + shape/vtype/degree. Each call to ``update_pre_solve`` copies + ``psi_star[0].array → psi_snapshot.array``, freezing the source's + input for the upcoming projection. Substitution makes the + projection's compiled C code read from ``psi_snapshot.array`` + instead of ``psi_star[0].array`` — there's no recompile per step, + just a memcpy. + + Idempotent: safe to call more than once. 
+ """ + if not getattr(self, '_psi_star_use_multicomponent', False): + # Currently only wired for tensor projections (the case that + # exposed the bug). Scalar/vector extension is straightforward + # if needed later. + return + + if self._psi_snapshot is None: + ps0 = self.psi_star[0] + # NOTE: this currently registers a persistent MeshVariable in the + # mesh DM, which is overkill for a transient buffer that's only + # read by this DDt's projection. A future improvement would be + # a transient/scratch-variable mechanism (likely backed by + # PETSc's auxiliary Vec machinery — already used elsewhere in + # the codebase via DMSetAuxiliaryVec_UW) so the snapshot doesn't + # accumulate in the DM across DDt creations. See: + # docs/developer/ai-notes/historical-notes.md for the + # variable-deletion limitation context. + self._psi_snapshot = uw.discretisation.MeshVariable( + f"psi_snapshot_{self.instance_number}", + self.mesh, + ps0.shape, + vtype=ps0.vtype, + degree=ps0.degree, + continuous=ps0.continuous, + ) + # Initialise psi_snapshot's data to current psi_star[0]'s data + # so the source evaluates consistently before the first refresh. + self._psi_snapshot.data[...] = ps0.data[...] + + self._psi_snapshot_enabled = True + + # Re-run the psi_fn setter so the substitution is applied to the + # currently-installed projection source. + self.psi_fn = self._psi_fn def _object_viewer(self): from IPython.display import Latex, Markdown, display @@ -1260,23 +1530,19 @@ def initialise_history(self): eval_result = UnitAwareArray(eval_result, units=psi_units) self.psi_star[0].array[...] = eval_result except Exception: + # Fallback: project psi_fn onto psi_star[0] via the SNES projector. + # Route through the shared builder so snapshot substitution + # semantics are consistent. 
+ self._psi_star_projection_solver.uw_function = self._build_projection_source(self.psi_fn) + self._psi_star_projection_solver.smoothing = 0.0 + self._psi_star_projection_solver.solve() if getattr(self, '_psi_star_use_multicomponent', False): - import sympy - indep = self._psi_star_indep_indices - row = sympy.Matrix([[self.psi_fn[i, j] for (i, j) in indep]]) - self._psi_star_projection_solver.uw_function = row - self._psi_star_projection_solver.smoothing = 0.0 - self._psi_star_projection_solver.solve() # Fan out flat result to tensor psi_star[0] - for k, (i, j) in enumerate(indep): + for k, (i, j) in enumerate(self._psi_star_indep_indices): vals = self._psi_star_flat_var.array[:, 0, k] self.psi_star[0].array[:, i, j] = vals if i != j: self.psi_star[0].array[:, j, i] = vals - else: - self._psi_star_projection_solver.uw_function = self.psi_fn - self._psi_star_projection_solver.smoothing = 0.0 - self._psi_star_projection_solver.solve() # Copy to all other history slots for i in range(1, self.order): @@ -1285,6 +1551,49 @@ def initialise_history(self): self._history_initialised = True return + def set_initial_history(self, values, dt=None): + r"""Plant history values for BDF restart or analytical IC. + + Bypasses the automatic ``effective_order`` ramp so the very + first solve runs at the full BDF order rather than starting + at BDF-1. Use this when you have known values at :math:`t` + and past times — e.g. an analytical periodic solution, or a + checkpointed history loaded from disk. + + Parameters + ---------- + values : sequence of length ``self.order`` + ``values[k]`` is :math:`\psi` at :math:`t - k\,\Delta t`, + i.e. ``values[0]`` is the current state. Each entry must + be assignable to ``psi_star[k].array`` — either an array + of matching shape, or a scalar that broadcasts. + dt : float, optional + Uniform timestep assumed between history slots. Required + for ``order >= 2`` to seed correct multistep coefficients + on the first solve. Ignored for ``order = 1``. 
+ """ + if len(values) != self.order: + raise ValueError( + f"set_initial_history requires {self.order} value(s) " + f"(one per history slot, including the current state); " + f"got {len(values)}." + ) + for k, val in enumerate(values): + self.psi_star[k].array[...] = val + self._history_initialised = True + self._n_solves_completed = self.order + if dt is not None: + self._dt_history = [float(dt)] * self.order + elif self.order >= 2: + import warnings + warnings.warn( + "set_initial_history called with order >= 2 but no " + "dt — variable-dt BDF coefficients will be wrong on " + "the first solve. Pass dt= to suppress.", + stacklevel=2, + ) + return + def initiate_history_fn(self): """Deprecated: use ``initialise_history`` instead.""" self.initialise_history() @@ -1349,6 +1658,17 @@ def update_pre_solve( if not self._history_initialised: self.initialise_history() + # Refresh the source-snapshot variable so the projection's source + # field captures psi_star[0]'s state from BEFORE this step's solve. + # Per-step memcpy keeps the snapshot machinery aligned with + # psi_star[0] without recompiling the projection. Routes through + # ``.data`` rather than ``.array`` to skip unit conversion (both + # variables already live in non-dimensional space) while keeping + # the callback sync that pushes values into the underlying PETSc + # local Vec. + if self._psi_snapshot_enabled and self._psi_snapshot is not None: + self._psi_snapshot.data[...] = self.psi_star[0].data[...] + # Update coefficient values for current effective_order and dt _update_bdf_values(self._bdf_coeffs, self.effective_order, self._dt, self._dt_history) _update_am_values(self._am_coeffs, self.effective_order, 0.5) @@ -1731,6 +2051,124 @@ def adams_moulton_flux(self, order=None): """ return _build_weighted_sum(self._am_coeffs, self.psi_fn, [ps.sym for ps in self.psi_star]) + def update_exp_coefficients(self, dt, tau_eff): + r"""Update the scalar ETD-2 (exponential) coefficient UWexpressions. 
+ + Sets ``self._exp_coeffs[0].sym = α = exp(-Δt/τ_eff)`` and + ``self._exp_coeffs[1].sym = φ = (1-α)/(Δt/τ_eff)`` so the next solve + uses the correct exponential coefficients via PetscDSSetConstants + on the next ``_update_constants`` call. + """ + _update_exp_values(self._exp_coeffs, dt, tau_eff) + + @property + def _exp_alpha(self): + """Convenience accessor for the ETD-2 ``α`` coefficient UWexpression.""" + return self._exp_coeffs[0] + + @property + def _exp_phi(self): + """Convenience accessor for the ETD-2 ``φ`` coefficient UWexpression.""" + return self._exp_coeffs[1] + + def update_forcing_history(self, forcing_fn=None, evalf=False, verbose=False): + r"""Refresh ``forcing_star`` from ``forcing_fn`` via direct nodal evaluation. + + Used by ETD-2 exponential integration to store the current + strain-rate field as :math:`\dot\varepsilon^{n}` for the next + step's history term. Called by the constitutive model from the + solver's post-solve hook. Direct nodal evaluation (rather than an + L2 projection through SNES) is sufficient because the strain rate + is :math:`\nabla\mathbf{u}`, well-defined at nodes, with no + history-coupled term that would make a projection implicit. + + Unit handling + ------------- + ``forcing_star`` is allocated with ``units=None`` (see + ``__init__``). When the model is unit-aware, ``forcing_fn`` is a + symbolic expression of the velocity field whose evaluation + returns a ``UnitAwareArray`` carrying strain-rate units (1/time). + We non-dimensionalise that result via the active scaling system + before assigning to ``forcing_star.array``, which keeps the + stored values consistent with the rest of the variable storage + (codebase convention: variable storage is non-dimensional; + units are re-attached at the ``.data`` interface). When the + model is not unit-aware, the evaluation returns a plain ndarray + and assignment is a straight numpy copy. 
+ + Parameters + ---------- + forcing_fn : sympy expression, optional + Symbolic strain-rate field to evaluate at each node. If + None, falls back to ``self._forcing_fn`` (set by the + constitutive model at solve-attach time). No-op if neither + is set or ``with_forcing_history=False``. + evalf : bool, optional + Forwarded to ``uw.function.evaluate`` (forces numerical + evaluation when True). + verbose : bool, optional + Enable verbose output. + """ + if not self.with_forcing_history or self.forcing_star is None: + return + if forcing_fn is None: + forcing_fn = self._forcing_fn + if forcing_fn is None: + return # constitutive model hasn't wired the forcing source yet + + from underworld3.utilities.unit_aware_array import UnitAwareArray + + coords = self.forcing_star.coords + # Use non-dimensional coords for evaluate() (mirrors the psi_star + # path in update_pre_solve) + if hasattr(coords, "magnitude"): + coords_nd = uw.non_dimensionalise(coords) + if isinstance(coords_nd, UnitAwareArray): + coords_nd = np.array(coords_nd) + elif hasattr(coords_nd, 'magnitude'): + coords_nd = coords_nd.magnitude + else: + coords_nd = coords + + def _eval_nd(component_expr): + """Evaluate component at coords and non-dimensionalise to a + plain 1-D float array suitable for nodal storage.""" + result = uw.function.evaluate(component_expr, coords_nd, evalf=evalf) + # If the evaluation returned units (model is unit-aware), + # non-dimensionalise before storing — keeps forcing_star's + # internal storage non-dimensional like psi_star. 
+ if isinstance(result, UnitAwareArray) or hasattr(result, "magnitude"): + result = uw.non_dimensionalise(result) + if isinstance(result, UnitAwareArray): + result = np.array(result) + elif hasattr(result, "magnitude"): + result = result.magnitude + return np.asarray(result).flatten() + + vtype = self._forcing_vtype + if vtype == uw.VarType.SYM_TENSOR or vtype == uw.VarType.TENSOR: + dim = self.mesh.dim + indep = ( + [(i, j) for i in range(dim) for j in range(i, dim)] + if vtype == uw.VarType.SYM_TENSOR + else [(i, j) for i in range(dim) for j in range(dim)] + ) + new_arr = np.zeros_like(np.asarray(self.forcing_star.array)) + for (i, j) in indep: + vals = _eval_nd(forcing_fn[i, j]) + new_arr[:, i, j] = vals + if i != j: + new_arr[:, j, i] = vals + self.forcing_star.array[...] = new_arr + elif vtype == uw.VarType.VECTOR: + dim = self.mesh.dim + new_arr = np.zeros_like(np.asarray(self.forcing_star.array)) + for i in range(dim): + new_arr[:, i] = _eval_nd(forcing_fn[i]) + self.forcing_star.array[...] = new_arr + else: # SCALAR + self.forcing_star.array[:] = _eval_nd(forcing_fn) + ## Consider Deprecating this one - it is the same as the Lagrangian_Swarm but ## sets up the swarm for itself. This does not have a practical use-case - the swarm version diff --git a/src/underworld3/systems/solvers.py b/src/underworld3/systems/solvers.py index 3ac2c760..f55f9089 100644 --- a/src/underworld3/systems/solvers.py +++ b/src/underworld3/systems/solvers.py @@ -505,6 +505,7 @@ def solve( timestep: float = None, verbose: bool = False, _force_setup: bool = False, + divergence_retries: int = 0, ): r"""Solve the Darcy flow system. @@ -521,6 +522,9 @@ def solve( If True, print solver progress information. _force_setup : bool, optional Force re-setup of solver even if already configured. + divergence_retries : int, optional + If SNES reports DIVERGED, retry with warm start up to this + many times. 0 preserves legacy behaviour. 
Notes ----- @@ -535,7 +539,8 @@ def solve( # Solve pressure - super().solve(zero_init_guess, _force_setup) + super().solve(zero_init_guess, _force_setup, + divergence_retries=divergence_retries) # Now solve flow field: v = -flux = -K(grad(h) - s) @@ -770,6 +775,7 @@ def solve( timestep=None, _force_setup: bool = False, verbose=False, + divergence_retries: int = 0, ): r""" Solve the transient Darcy system for one timestep. @@ -784,6 +790,9 @@ def solve( Force re-setup of solver. verbose : bool, optional Print solver progress. + divergence_retries : int, optional + If SNES reports DIVERGED, retry with warm start up to this + many times. 0 preserves legacy behaviour. """ if timestep is not None and timestep != self.delta_t: self.delta_t = timestep @@ -802,7 +811,8 @@ def solve( self.DFDt.update_pre_solve(timestep, verbose=verbose) # Solve PDE (bypass SNES_Darcy.solve to avoid double setup/projection) - SNES_Scalar.solve(self, zero_init_guess, _force_setup) + SNES_Scalar.solve(self, zero_init_guess, _force_setup, + divergence_retries=divergence_retries) # Invalidate cached data views target_var = getattr(self.u, "_base_var", self.u) @@ -1137,19 +1147,70 @@ def __init__( self._constitutive_model = None + # Optional: alternative F1 expression to autodiff for the + # Jacobian. When None, autodiff F1 itself (default). Used for + # inexact-Newton tricks like a smooth-Jacobian / sharp-residual + # split at a VEP yield kink. See ``set_jacobian_F1_source``. + self._F1_jacobian_source = None + return + def set_jacobian_F1_source(self, F1_source, linesearch="cp"): + r"""Override the F1 expression used to build the Jacobian blocks. + + By default, the Stokes Jacobian's uu / up G2, G3 blocks are + autodiff'd from the residual F1. Some problems benefit from + differentiating a *different* but related expression — e.g. 
a + smooth (softmin) viscosity formula for the Jacobian while the + residual F1 keeps a sharp Min, so Newton sees a continuous + derivative even when the iterate sits exactly on the yield kink. + + Setting ``F1_source`` triggers a JIT recompile (the Jacobian + symbols change). Pass ``None`` to revert to autodiff of F1. + + Parameters + ---------- + F1_source : sympy.Matrix or None + Alternative expression of the same shape as ``F1.sym``. + linesearch : str or None, default ``"cp"`` + SNES linesearch type to install when ``F1_source`` is set. + Defaults to ``"cp"`` (critical-point) because inexact-Newton + steps don't reliably reduce the residual norm and PETSc's + default ``bt`` (backtracking) consequently rejects useful + steps with ``DIVERGED_LINE_SEARCH``. ``cp`` accepts the + predicted step at the optimum of the local linearisation and + converges cleanly on the same problems where ``bt`` flails. + Set to ``None`` to leave the linesearch type untouched (e.g. + if you've already configured one via ``petsc_options``). + Has no effect when ``F1_source is None``. + """ + self._F1_jacobian_source = F1_source + self.is_setup = False + if F1_source is not None and linesearch is not None: + self.petsc_options["snes_linesearch_type"] = linesearch + def _create_stress_history_ddt(self, order=2): """Create DFDt for stress history tracking (VE/VEP models). Called automatically when a constitutive model with ``requires_stress_history = True`` is assigned. Can also be called explicitly to pre-create the DFDt with a specific order. + + Constitutive models can inject extra SemiLagrangian kwargs via the + ``stress_history_ddt_kwargs`` property — used e.g. by + ``MaxwellExponentialFlowModel`` to set ``with_forcing_history=True``. """ if self.Unknowns.DFDt is not None: return # already created self._order = order + # Constitutive model may request extra SemiLagrangian kwargs (e.g. + # with_forcing_history for ETD-2 integration). 
+ cm = getattr(self, "constitutive_model", None) + ddt_kwargs = {} + if cm is not None: + ddt_kwargs = dict(getattr(cm, "stress_history_ddt_kwargs", {})) + self.Unknowns.DFDt = uw.systems.ddt.SemiLagrangian( self.mesh, sympy.Matrix.zeros(self.mesh.dim, self.mesh.dim), @@ -1162,7 +1223,13 @@ def _create_stress_history_ddt(self, order=2): bcs=None, order=order, smoothing=0.0001, + **ddt_kwargs, ) + # Stress flux = 2·viscosity·E_eff references psi_star[0] in E_eff's + # history term — without snapshot substitution the projection of + # flux→psi_star[0] becomes implicit in psi_star[0] and Min-mode at + # yield admits the wrong fixed point under timestep change. + self.Unknowns.DFDt.enable_source_snapshot() @timing.routine_timer_decorator def solve( @@ -1174,6 +1241,7 @@ def solve( evalf=False, order=None, picard: int = 0, + divergence_retries: int = 0, ): """Solve the Stokes system, with optional viscoelastic stress history. @@ -1199,6 +1267,15 @@ def solve( Number of Picard iterations before switching to Newton. Picard uses a simplified Jacobian and can help convergence for strongly nonlinear problems like VEP at yield onset. + divergence_retries : int, default=0 + If SNES returns a DIVERGED reason after the main solve, re-call + the underlying Newton up to this many times with a warm start + (``zero_init_guess=False``) to try to rescue. Each retry uses + the just-computed iterate plus the freshly-advected stress + history, which is often enough for VEP at yield onset (Min/softmin + kinks) to step off a bad Newton iterate. ``0`` preserves legacy + behaviour (divergence is terminal). Typical useful value is 1. + Only applies in the VE/VEP branch (``DFDt is not None``). 
""" has_stress_history = self.Unknowns.DFDt is not None @@ -1246,7 +1323,11 @@ def solve( self.DFDt.update_pre_solve(timestep, verbose=verbose, evalf=evalf, store_result=False) - self.constitutive_model._update_bdf_coefficients() + # Uniform pre-solve coefficient hook: VEP delegates to + # _update_bdf_coefficients(); MaxwellExponentialFlowModel updates + # α, φ on the DDt via _update_exp_coefficients(). No isinstance + # checks at the solver layer. + self.constitutive_model._update_history_coefficients() # 2. SOLVE if uw.mpi.rank == 0 and verbose: @@ -1257,6 +1338,7 @@ def solve( _force_setup=_force_setup, verbose=verbose, picard=picard, + divergence_retries=divergence_retries, ) # 3. PROJECT actual stress and SHIFT history @@ -1268,17 +1350,17 @@ def solve( _advected_sigma_star = np.copy(self.DFDt.psi_star[0].array[...]) if getattr(self.DFDt, '_psi_star_use_multicomponent', False): - # Multi-component projection: solve all components at once. - # Only set uw_function on first call — the flux expression - # structure is stable; constant values flow through PetscDS. - if not getattr(self.DFDt, '_psi_star_projector_initialised', False) or not self.constitutive_model._solver_is_setup: - import sympy - flux = self.constitutive_model.flux - indep = self.DFDt._psi_star_indep_indices - row = sympy.Matrix([[flux[i, j] for (i, j) in indep]]) - self.DFDt._psi_star_projection_solver.uw_function = row - self.DFDt._psi_star_projection_solver.smoothing = 0.0 - self.DFDt._psi_star_projector_initialised = True + # Multi-component projection of flux → psi_star[0]. + # + # The DFDt's source-snapshot machinery (enabled once in + # _create_stress_history_ddt) intercepts psi_fn assignment + # to substitute psi_star[0] symbols with a frozen + # psi_snapshot variable, refreshed each step in + # update_pre_solve. So the projection's compiled source + # reads from psi_snapshot (not psi_star[0] itself) and is a + # true one-shot Galerkin projection — no implicit + # fixed-point iteration. 
+ self.DFDt._psi_star_projection_solver.smoothing = 0.0 self.DFDt._psi_star_projection_solver.solve(verbose=verbose) # Fan flat result back to psi_star[0] tensor variable for k, (i, j) in enumerate(self.DFDt._psi_star_indep_indices): @@ -1299,6 +1381,12 @@ def solve( self.DFDt.update_post_solve(timestep, verbose=verbose, evalf=evalf) + # Uniform post-solve hook for any extra integrator-state storage. + # VEP: no-op. ETD-2 / MaxwellExponentialFlowModel: refresh + # forcing_star with current ε̇^{n+1} so the next step's history + # term has access to ε̇ⁿ. + self.constitutive_model._update_history_post_solve() + self.is_setup = True self.constitutive_model._solver_is_setup = True @@ -1308,6 +1396,7 @@ def solve( zero_init_guess, _force_setup=_force_setup, verbose=verbose, + divergence_retries=divergence_retries, ) @property @@ -1767,11 +1856,24 @@ class SNES_VE_Stokes(SNES_Stokes): Use ``uw.systems.Stokes`` directly with a ``ViscoElasticPlasticFlowModel`` constitutive model. The Stokes solver now creates stress history infrastructure automatically - when the constitutive model requires it. + when the constitutive model is assigned (the lazy-creation + pathway also reads ``stress_history_ddt_kwargs`` from the model + — required for ``integrator='etd'`` to allocate + ``forcing_star``). VE_Stokes pre-creates the DDt at solver + ``__init__`` time, before the model exists, so it can't see + those kwargs and is incompatible with ``integrator='etd'``. + + Migration: replace:: + + stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=2) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel - This wrapper pre-creates the DFDt with a specific ``order`` parameter, - which is useful when you want to control the BDF order before assigning - the constitutive model. 
+ with:: + + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, + ) Parameters ---------- @@ -1797,6 +1899,17 @@ def __init__( DuDt: Union[SemiLagrangian_DDt, Lagrangian_DDt] = None, DFDt: Union[SemiLagrangian_DDt, Lagrangian_DDt] = None, ): + import warnings + warnings.warn( + "VE_Stokes is deprecated. Use uw.systems.Stokes(...) directly and " + "assign the constitutive model afterwards — the Stokes solver creates " + "DDt infrastructure lazily when the model is assigned, and the lazy " + "path correctly forwards stress_history_ddt_kwargs from the model " + "(required for integrator='etd' to allocate forcing_star). " + "VE_Stokes pre-creates DDt at __init__ time, before the model exists, " + "so it cannot use ETD-2. See the class docstring for migration.", + DeprecationWarning, stacklevel=2, + ) super().__init__( mesh, velocityField, @@ -2154,8 +2267,16 @@ def __init__( ## Need to over-ride solve method to run over all components @timing.routine_timer_decorator - def solve(self, verbose=False): - """Solve by projecting each tensor component sequentially.""" + def solve(self, verbose=False, divergence_retries: int = 0): + """Solve by projecting each tensor component sequentially. + + Parameters + ---------- + verbose : bool + divergence_retries : int, default=0 + Forwarded to each per-component SNES solve. 0 preserves + legacy behaviour. + """ # Loop over the components of the tensor. If this is a symmetric # tensor, we'll usually be given the 1d form to prevent duplication @@ -2791,6 +2912,7 @@ def solve( _force_setup: bool = False, _evalf=False, verbose=False, + divergence_retries: int = 0, ): """ Generates solution to constructed system. @@ -2800,6 +2922,9 @@ def solve( zero_init_guess: If `True`, a zero initial guess will be used for the system solution. Otherwise, the current values of `self.u` will be used. 
+ divergence_retries: + If SNES reports DIVERGED, retry with warm start up to this + many times. 0 preserves legacy behaviour. """ if timestep is not None and timestep != self.delta_t: @@ -2823,7 +2948,8 @@ def solve( self.DuDt.update_pre_solve(timestep, verbose=verbose, evalf=_evalf) self.DFDt.update_pre_solve(timestep, verbose=verbose, evalf=_evalf) - super().solve(zero_init_guess, _force_setup) + super().solve(zero_init_guess, _force_setup, + divergence_retries=divergence_retries) # Invalidate cached data views - PETSc may have replaced underlying buffers # This ensures .data and .array properties return fresh data from PETSc @@ -3162,6 +3288,7 @@ def solve( evalf: bool = False, _force_setup: bool = False, verbose=False, + divergence_retries: int = 0, ): """ Generates solution to constructed system. @@ -3171,6 +3298,9 @@ def solve( zero_init_guess: If `True`, a zero initial guess will be used for the system solution. Otherwise, the current values of `self.u` will be used. + divergence_retries: + If SNES reports DIVERGED, retry with warm start up to this + many times. 0 preserves legacy behaviour. """ if timestep is not None and timestep != self.delta_t: @@ -3195,7 +3325,8 @@ def solve( self.DuDt.update_pre_solve(timestep, evalf=evalf, verbose=verbose) self.DFDt.update_pre_solve(timestep, evalf=evalf, verbose=verbose) - super().solve(zero_init_guess, _force_setup) + super().solve(zero_init_guess, _force_setup, + divergence_retries=divergence_retries) # Invalidate cached data views - PETSc may have replaced underlying buffers target_var = getattr(self.u, "_base_var", self.u) @@ -3579,6 +3710,7 @@ def solve( verbose=False, _evalf=False, order=None, + divergence_retries: int = 0, ): """ Generates solution to constructed system. @@ -3588,6 +3720,9 @@ def solve( zero_init_guess: If `True`, a zero initial guess will be used for the system solution. Otherwise, the current values of `self.u` will be used. 
+ divergence_retries: + If SNES reports DIVERGED, retry with warm start up to this + many times. 0 preserves legacy behaviour. """ if order is None or order > self._order: @@ -3629,6 +3764,7 @@ def solve( _force_setup=_force_setup, verbose=verbose, picard=0, + divergence_retries=divergence_retries, ) if uw.mpi.rank == 0 and verbose: diff --git a/src/underworld3/utilities/mathematical_mixin.py b/src/underworld3/utilities/mathematical_mixin.py index 2bb87ba2..8054610d 100644 --- a/src/underworld3/utilities/mathematical_mixin.py +++ b/src/underworld3/utilities/mathematical_mixin.py @@ -642,8 +642,14 @@ def to(self, target_units: str): def __getattr__(self, name): """Enhanced method delegation with signature handling.""" - # Prevent recursion if _validate_sym is being accessed - if name == "_validate_sym" or name.startswith("_"): + # Prevent recursion. "sym" is guarded because _validate_sym below calls + # self.sym — if the sym @property getter raises AttributeError (e.g. + # accessing internal state not yet set during __init__), Python falls + # back to __getattr__, which re-enters _validate_sym and loops. Names + # starting with "_" are guarded so internal attribute probing (e.g. + # hasattr checks during early init) fails fast rather than through the + # sym-delegation path. 
+ if name in ("_validate_sym", "sym") or name.startswith("_"): raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") try: diff --git a/tests/minimal_vep_timing.py b/tests/minimal_vep_timing.py index e03f2937..797e34eb 100644 --- a/tests/minimal_vep_timing.py +++ b/tests/minimal_vep_timing.py @@ -6,8 +6,7 @@ import time import sympy import underworld3 as uw -from underworld3.systems import VE_Stokes - +from underworld3.systems import Stokes t0 = time.time() # --- Mesh --- @@ -24,8 +23,10 @@ print(f"Variables: {time.time() - t0:.1f}s") # --- Solver + constitutive model --- -stokes = VE_Stokes(mesh, velocityField=v, pressureField=p, order=1) -stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes = Stokes(mesh, velocityField=v, pressureField=p) +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=1, +) stokes.constitutive_model.Parameters.shear_viscosity_0 = 1.0 stokes.constitutive_model.Parameters.shear_modulus = 1.0 stokes.constitutive_model.Parameters.shear_viscosity_min = 1.0e-3 @@ -64,8 +65,10 @@ print(f"Second solve (cached): {time.time() - t2:.1f}s") # --- Compare: pure VE (no yield) --- -stokes2 = VE_Stokes(mesh, velocityField=v, pressureField=p, order=1) -stokes2.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes2 = Stokes(mesh, velocityField=v, pressureField=p) +stokes2.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes2.Unknowns, order=1, +) stokes2.constitutive_model.Parameters.shear_viscosity_0 = 1.0 stokes2.constitutive_model.Parameters.shear_modulus = 1.0 # yield_stress defaults to sympy.oo — pure VE diff --git a/tests/parallel/test_0780_ve_stokes_first_solve_mpi.py b/tests/parallel/test_0780_ve_stokes_first_solve_mpi.py index bc0c3983..78640f0b 100644 --- a/tests/parallel/test_0780_ve_stokes_first_solve_mpi.py +++ b/tests/parallel/test_0780_ve_stokes_first_solve_mpi.py @@ -43,11 
+43,11 @@ def test_ve_stokes_first_solve_does_not_deadlock(): v = uw.discretisation.MeshVariable("V", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes( - mesh, velocityField=v, pressureField=p, order=2 + stokes = uw.systems.Stokes( + mesh, velocityField=v, pressureField=p, ) - stokes.constitutive_model = ( - uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, ) stokes.constitutive_model.Parameters.shear_viscosity_0 = 1.0 stokes.constitutive_model.Parameters.shear_modulus = 1.0 diff --git a/tests/plot_ve_oscillatory_validation.py b/tests/plot_ve_oscillatory_validation.py index f3ac103f..86d0683e 100644 --- a/tests/plot_ve_oscillatory_validation.py +++ b/tests/plot_ve_oscillatory_validation.py @@ -44,8 +44,10 @@ def run_oscillatory(order, n_steps, dt, V0, H, ETA, MU, omega, save_prefix=None) v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/profile_jit_phases.py b/tests/profile_jit_phases.py index ecd12eba..c1c04a7d 100644 --- a/tests/profile_jit_phases.py +++ b/tests/profile_jit_phases.py @@ -9,8 +9,7 @@ import time import sympy import underworld3 as uw -from underworld3.systems import VE_Stokes - +from underworld3.systems import Stokes # ── Setup (fast) 
────────────────────────────────────────────────────────────── mesh = uw.meshing.UnstructuredSimplexBox( @@ -22,8 +21,10 @@ p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1, continuous=True, vtype=uw.VarType.SCALAR) -stokes = VE_Stokes(mesh, velocityField=v, pressureField=p, order=1) -stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes = Stokes(mesh, velocityField=v, pressureField=p) +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=1, +) stokes.constitutive_model.Parameters.shear_viscosity_0 = 1.0 stokes.constitutive_model.Parameters.shear_modulus = 1.0 stokes.constitutive_model.Parameters.shear_viscosity_min = 1.0e-3 diff --git a/tests/run_ve_order2_debug.py b/tests/run_ve_order2_debug.py index 286b7359..e1fc82e1 100644 --- a/tests/run_ve_order2_debug.py +++ b/tests/run_ve_order2_debug.py @@ -15,8 +15,10 @@ v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) -stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=2) -stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, +) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_ve_oscillatory.py b/tests/run_ve_oscillatory.py index 2cf7e21d..71ac56f7 100644 --- a/tests/run_ve_oscillatory.py +++ b/tests/run_ve_oscillatory.py @@ -42,8 +42,10 @@ def run_oscillatory(order, n_steps, dt_over_tr, De): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, 
order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_ve_shear_order2_quick.py b/tests/run_ve_shear_order2_quick.py index fb40d71f..5192d4cb 100644 --- a/tests/run_ve_shear_order2_quick.py +++ b/tests/run_ve_shear_order2_quick.py @@ -15,8 +15,10 @@ v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) -stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=2) -stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, +) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_ve_shear_quick.py b/tests/run_ve_shear_quick.py index debced41..efa07c5a 100644 --- a/tests/run_ve_shear_quick.py +++ b/tests/run_ve_shear_quick.py @@ -15,8 +15,10 @@ v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) -stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=1) -stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel +stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) +stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=1, +) 
stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_ve_shear_validation.py b/tests/run_ve_shear_validation.py index 759616ed..db219505 100644 --- a/tests/run_ve_shear_validation.py +++ b/tests/run_ve_shear_validation.py @@ -28,8 +28,10 @@ def run(order, n_steps, dt_ratio): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_ve_vep_oscillatory_checkpoint.py b/tests/run_ve_vep_oscillatory_checkpoint.py index 8ae07f01..aa0bcd48 100644 --- a/tests/run_ve_vep_oscillatory_checkpoint.py +++ b/tests/run_ve_vep_oscillatory_checkpoint.py @@ -27,8 +27,10 @@ def run_oscillatory(order, n_steps, dt, omega, V0, tau_y=None): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU 
stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_ve_vep_oscillatory_plot.py b/tests/run_ve_vep_oscillatory_plot.py index b5ee4907..052f8072 100644 --- a/tests/run_ve_vep_oscillatory_plot.py +++ b/tests/run_ve_vep_oscillatory_plot.py @@ -35,8 +35,10 @@ def run_oscillatory(order, n_steps, dt, omega, V0, t0, tau_y=None): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt @@ -78,7 +80,7 @@ def run_oscillatory(order, n_steps, dt, omega, V0, t0, tau_y=None): time_phys += dt V_bc.sym = V0 * np.sin(omega * time_phys) - stokes.solve(zero_init_guess=False, evalf=False) + stokes.solve(zero_init_guess=False, timestep=dt, evalf=False) val = uw.function.evaluate(stokes.tau.sym[0, 1], centre) sigma_xy = float(val.flatten()[0]) diff --git a/tests/run_vep_oscillatory.py b/tests/run_vep_oscillatory.py index e76e212e..8d27803d 100644 --- a/tests/run_vep_oscillatory.py +++ b/tests/run_vep_oscillatory.py @@ -39,8 +39,10 @@ def run_vep_oscillatory(order, n_steps, dt_over_tr, De, tau_y): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = 
uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/run_vep_shear_box.py b/tests/run_vep_shear_box.py index 7d4fabe3..4ec49e77 100644 --- a/tests/run_vep_shear_box.py +++ b/tests/run_vep_shear_box.py @@ -41,8 +41,10 @@ def run_vep_shear(order, n_steps, dt_over_tr, tau_y): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=v, pressureField=p, order=order) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt diff --git a/tests/test_1050_VEstokesCart.py b/tests/test_1050_VEstokesCart.py index 4fb5edf5..f27f3968 100644 --- a/tests/test_1050_VEstokesCart.py +++ b/tests/test_1050_VEstokesCart.py @@ -52,8 +52,10 @@ def test_stokes_boxmesh(mesh): ) p = uw.discretisation.MeshVariable(r"mathbf{p}", mesh, 1, vtype=uw.VarType.SCALAR, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=u, pressureField=p, order=1) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=1, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = 1 stokes.constitutive_model.Parameters.shear_modulus = 1 stokes.constitutive_model.Parameters.dt_elastic = 
sympy.sympify(1) / 10 @@ -229,8 +231,10 @@ def test_stokes_boxmesh_bc_failure(mesh): ) p = uw.discretisation.MeshVariable(r"mathbf{p}", mesh, 1, vtype=uw.VarType.SCALAR, degree=1) - stokes = uw.systems.VE_Stokes(mesh, velocityField=u, pressureField=p, order=1) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes = uw.systems.Stokes(mesh, velocityField=u, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=1, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = 1 stokes.constitutive_model.Parameters.shear_modulus = 1 stokes.constitutive_model.Parameters.dt_elastic = sympy.sympify(1) / 10 diff --git a/tests/test_1051_VE_shear_box.py b/tests/test_1051_VE_shear_box.py index f169b75c..cf8425ac 100644 --- a/tests/test_1051_VE_shear_box.py +++ b/tests/test_1051_VE_shear_box.py @@ -69,15 +69,16 @@ def _run_ve_shear(order, n_steps, dt_over_tr): v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=2) p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1) - stokes = uw.systems.VE_Stokes( - mesh, velocityField=v, pressureField=p, order=order, verbose=False, + stokes = uw.systems.Stokes( + mesh, velocityField=v, pressureField=p, verbose=False, ) - stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=order, + ) stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA stokes.constitutive_model.Parameters.shear_modulus = MU stokes.constitutive_model.Parameters.dt_elastic = dt - # bdf_blend auto-detects: 1.0 for pure VE, 0.75 for VEP stokes.add_dirichlet_bc((V0, 0.0), "Top") stokes.add_dirichlet_bc((-V0, 0.0), "Bottom") diff --git a/tests/test_1052_VEP_stability_regression.py b/tests/test_1052_VEP_stability_regression.py new file mode 100644 index 00000000..7bf44738 --- /dev/null +++ 
b/tests/test_1052_VEP_stability_regression.py @@ -0,0 +1,316 @@ +"""VEP stability regressions — yield-surface lock under fixed and variable dt. + +These tests exercise code paths that historically had subtle bugs and +would catch their re-introduction. They are slower than the level_1 +suite (~30s each) so are tagged level_2; run with:: + + pytest -m "level_2 and tier_a" tests/test_1052_VEP_stability_regression.py + +Bugs each test would have caught +-------------------------------- + +1. ``test_vep_yield_lock_fixed_dt`` — under sustained loading at constant + dt with ε̇ above yield, σ should hold at τ_y. Catches: any failure of + Min-mode to clip stress correctly (e.g. softmin under-clip masquerading + as Min, JIT mis-compilation of Min(η_ve, η_pl)). + +2. ``test_vep_yield_lock_variable_dt`` — halving and doubling dt at yield + should not knock σ off the yield surface. Catches: the implicit- + projection drift fixed by the psi_snapshot machinery (see commits + 8f2b0dd, 31abad1, 380ab4b). Pre-fix this test would have shown peak + |σ| ≈ 0.65 against τ_y = 0.5. + +3. ``test_vep_snes_no_divergence_loading_through_yield`` — loading from + σ=0 through the yield onset should not produce SNES divergences (with + the divergence_retries safety net). Catches: regressions in the + Picard-retry mechanism or in the BDF-2 stability at the kink (the + reason ``bdf_blend`` was retired). + +4. ``test_pure_ve_variable_dt_accuracy`` — pure-VE accuracy under variable + dt against the analytical Maxwell square-wave solution. Catches: + regressions in variable-dt BDF-2 coefficients or in the snapshot + refresh ordering. Threshold loose enough (max_err < 0.10) that it + doesn't false-trip on minor BDF-coefficient tuning. 
+""" + +import pytest +import numpy as np +import sympy +import underworld3 as uw +from underworld3.function import expression + + +pytestmark = [pytest.mark.level_2, pytest.mark.tier_a] + + +# --------------------------------------------------------------------------- +# Common setup: shear box at unit Maxwell time, τ_y = 0.5, ε̇ = 0.5 (above yield) +# --------------------------------------------------------------------------- + +ETA = 1.0 +MU = 1.0 +TAU_Y = 0.5 +V0 = 0.5 # gives ε̇_xy = 0.5 in the symmetric strain rate convention +T_R = ETA / MU +HALF_PERIOD = 2.0 * T_R + + +def _build_stokes(label, yield_mode="min", yield_stress=TAU_Y): + mesh = uw.meshing.StructuredQuadBox( + elementRes=(16, 8), minCoords=(-1.0, -0.5), maxCoords=(1.0, 0.5), + ) + v = uw.discretisation.MeshVariable(f"U_{label}", mesh, mesh.dim, degree=2) + p = uw.discretisation.MeshVariable(f"P_{label}", mesh, 1, degree=1) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + stokes.constitutive_model = uw.constitutive_models.ViscoElasticPlasticFlowModel( + stokes.Unknowns, order=2, + ) + stokes.constitutive_model.Parameters.shear_viscosity_0 = ETA + stokes.constitutive_model.Parameters.shear_modulus = MU + stokes.constitutive_model.Parameters.yield_stress = yield_stress + stokes.constitutive_model.Parameters.strainrate_inv_II_min = 1.0e-6 + stokes.constitutive_model._yield_mode = yield_mode + V_top = expression(rf"V_{{{label}}}", sympy.Float(V0), "Top V") + stokes.add_dirichlet_bc((V_top, 0.0), "Top") + stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") + stokes.tolerance = 1.0e-6 + stokes.petsc_options["snes_force_iteration"] = True + return mesh, stokes, V_top + + +def _step(stokes, V_top, dt, V_sign=1.0): + V_top.sym = sympy.Float(V_sign * V0) + stokes.constitutive_model.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + 
return int(stokes.snes.getConvergedReason()) + + +def _sigma(stokes, c=np.array([[0.0, 0.0]])): + return float(uw.function.evaluate(stokes.tau.sym[0, 1], c).flatten()[0]) + + +# --------------------------------------------------------------------------- +# Test 1: yield surface lock under fixed dt +# --------------------------------------------------------------------------- + +def test_vep_yield_lock_fixed_dt(): + """Under sustained constant-ε̇ loading above yield, σ should clamp at τ_y. + + Catches: Min-mode mis-compilation, viscosity-formula drift, projection + failure. Pre-fix the bug surfaced as σ slowly drifting upward each step + once at yield. + """ + _, stokes, V_top = _build_stokes("yield_lock_fixed") + dt = 0.20 * T_R + sigmas = [] + for _ in range(20): + _step(stokes, V_top, dt) + sigmas.append(_sigma(stokes)) + sigmas = np.array(sigmas) + # After ~4 steps σ should be at yield. Take the second half as the + # plateau and verify it sits exactly on τ_y. + plateau = sigmas[10:] + assert np.abs(plateau).max() <= TAU_Y * 1.001, ( + f"yield surface violation under fixed dt: peak={np.abs(plateau).max():.4f} " + f"vs τ_y={TAU_Y}" + ) + assert np.abs(plateau).min() >= TAU_Y * 0.999, ( + f"yield surface under-clip: min={np.abs(plateau).min():.4f} " + f"vs τ_y={TAU_Y}" + ) + + +# --------------------------------------------------------------------------- +# Test 2: yield surface lock under variable dt (the snapshot fix) +# --------------------------------------------------------------------------- + +def test_vep_yield_lock_variable_dt(): + """Halving and doubling dt at yield must not push σ off the yield surface. + + Catches: implicit-projection drift fixed by psi_snapshot. Pre-fix this + test would have shown peak |σ| ≈ 0.65 (30% violation). 
+ """ + _, stokes, V_top = _build_stokes("yield_lock_var") + dt_max, dt_min = 0.20 * T_R, 0.10 * T_R + # Phase 1: warm up to yield at dt_max + for _ in range(5): + _step(stokes, V_top, dt_max) + assert abs(_sigma(stokes) - TAU_Y) < TAU_Y * 0.001, "warm-up did not reach yield" + # Phase 2: halve dt, take 4 steps — these are where the bug surfaced + sigmas_after_halve = [] + for _ in range(4): + _step(stokes, V_top, dt_min) + sigmas_after_halve.append(_sigma(stokes)) + # Phase 3: double dt back, take 4 steps + sigmas_after_double = [] + for _ in range(4): + _step(stokes, V_top, dt_max) + sigmas_after_double.append(_sigma(stokes)) + sigmas = np.array(sigmas_after_halve + sigmas_after_double) + assert np.abs(sigmas).max() <= TAU_Y * 1.01, ( + f"variable-dt yield violation: peak={np.abs(sigmas).max():.4f} " + f"vs τ_y={TAU_Y}. Pre-fix this would have been ~0.65." + ) + + +# --------------------------------------------------------------------------- +# Test 3: SNES convergence through yield onset +# --------------------------------------------------------------------------- + +def test_vep_snes_no_divergence_loading_through_yield(): + """Loading from σ=0 through yield onset must not produce SNES divergences. + + Catches: regressions in divergence_retries, Picard machinery, or BDF-2 + stability at the Min kink (the reason ``bdf_blend`` existed). Allows + a small number of soft divergences that the retry machinery rescues. + """ + _, stokes, V_top = _build_stokes("snes_loading") + dt = 0.20 * T_R + final_reasons = [] + for _ in range(15): + reason = _step(stokes, V_top, dt) + final_reasons.append(reason) + final_reasons = np.array(final_reasons) + # All steps should converge once retries are accounted for + n_diverged = int((final_reasons < 0).sum()) + assert n_diverged == 0, ( + f"SNES diverged on {n_diverged}/{len(final_reasons)} steps loading through yield. 
" + f"Reasons: {final_reasons.tolist()}" + ) + + +# --------------------------------------------------------------------------- +# Test 4: pure VE accuracy under variable dt (analytical comparison) +# --------------------------------------------------------------------------- + +def _step_square_analytical(t, eta, mu, gamma_dot, half_period): + t_r = eta / mu + sigma_ss = eta * gamma_dot + out = np.zeros_like(t, dtype=float) + for i, ti in enumerate(t): + n = int(ti / half_period) + t_local = ti - n * half_period + sigma_start = 0.0 + for j in range(n): + sign = 1.0 if j % 2 == 0 else -1.0 + target = sign * sigma_ss + sigma_start = target + (sigma_start - target) * np.exp(-half_period / t_r) + sign = 1.0 if n % 2 == 0 else -1.0 + target = sign * sigma_ss + out[i] = target + (sigma_start - target) * np.exp(-t_local / t_r) + return out + + +def test_ti_vep_yield_lock_variable_dt(): + """TransverseIsotropicVEPFlowModel should inherit the same yield-lock + behaviour as VEP under variable dt — both share SemiLagrangian DDt and + its snapshot machinery via SNES_Stokes._create_stress_history_ddt. + + Catches: regressions in the TI-VEP code path that would prevent the + snapshot substitution from applying (e.g. changes to psi_star[0]'s + symbolic structure under TI flux, or accidental override of the DDt + setup). 
+ """ + mesh = uw.meshing.StructuredQuadBox( + elementRes=(16, 8), minCoords=(-1.0, -0.5), maxCoords=(1.0, 0.5), + ) + v = uw.discretisation.MeshVariable("U_ti", mesh, mesh.dim, degree=2) + p = uw.discretisation.MeshVariable("P_ti", mesh, 1, degree=1) + stokes = uw.systems.Stokes(mesh, velocityField=v, pressureField=p) + cm = uw.constitutive_models.TransverseIsotropicVEPFlowModel( + stokes.Unknowns, order=2, + ) + stokes.constitutive_model = cm + cm.Parameters.shear_viscosity_0 = ETA + cm.Parameters.shear_viscosity_1 = ETA + cm.Parameters.shear_modulus = MU + cm.Parameters.yield_stress = TAU_Y + cm.Parameters.director = sympy.Matrix([0.0, 1.0]) # horizontal fault + cm.Parameters.strainrate_inv_II_min = 1.0e-6 + cm._yield_mode = "min" + + V_top = expression(R"V_{top_ti}", sympy.Float(V0), "Top V") + stokes.add_dirichlet_bc((V_top, 0.0), "Top") + stokes.add_dirichlet_bc((-V_top, 0.0), "Bottom") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Left") + stokes.add_dirichlet_bc((sympy.oo, 0.0), "Right") + stokes.tolerance = 1.0e-6 + stokes.petsc_options["snes_force_iteration"] = True + + # Sanity check: snapshot machinery is wired in + assert stokes.Unknowns.DFDt is not None + assert stokes.Unknowns.DFDt._psi_snapshot_enabled + assert stokes.Unknowns.DFDt._psi_snapshot is not None + + centre = np.array([[0.0, 0.0]]) + + def _step_ti(dt): + V_top.sym = sympy.Float(V0) + cm.Parameters.dt_elastic = dt + stokes.solve(zero_init_guess=False, timestep=dt, divergence_retries=2) + + def _sigma_ti(): + return float(uw.function.evaluate(stokes.tau.sym[0, 1], centre).flatten()[0]) + + # Phase 1: warm up to yield at dt = 0.20 + for _ in range(6): + _step_ti(0.20 * T_R) + assert abs(_sigma_ti() - TAU_Y) < TAU_Y * 0.01, "TI-VEP did not reach yield" + # Phase 2 + 3: halve and double, σ must hold + sigmas = [] + for _ in range(4): + _step_ti(0.10 * T_R); sigmas.append(_sigma_ti()) + for _ in range(4): + _step_ti(0.20 * T_R); sigmas.append(_sigma_ti()) + sigmas = np.array(sigmas) + assert 
np.abs(sigmas).max() <= TAU_Y * 1.01, ( + f"TI-VEP variable-dt yield violation: peak={np.abs(sigmas).max():.4f} " + f"vs τ_y={TAU_Y}" + ) + + +def test_pure_ve_variable_dt_accuracy(): + """Pure VE under variable dt should match the analytical square-wave + solution within a loose tolerance. + + Catches: regressions in variable-dt BDF-2 coefficients or in the + snapshot refresh ordering. The threshold (max_err < 0.10) is loose + enough that minor BDF-coefficient tuning won't false-trip it but + tight enough to catch large drifts. Current head produces ~0.06. + """ + # Pure VE: yield_stress -> infinity + _, stokes, V_top = _build_stokes("pure_ve_acc", yield_stress=1.0e6) + flip_times = (HALF_PERIOD * 1, HALF_PERIOD * 2, HALF_PERIOD * 3) + window = 0.40 * T_R + dt_max, dt_min = 0.20 * T_R, 0.10 * T_R + + def schedule_dt(t): + for f in flip_times: + if abs(t - f) <= window or 0 <= f - t <= window: + return dt_min + return dt_max + + times, taus = [], [] + t_cur = 0.0 + t_end = 4 * HALF_PERIOD # 8 t_r + while t_cur < t_end - 1e-9: + dt = min(schedule_dt(t_cur), t_end - t_cur) + n_half = int((t_cur + 0.5 * dt) / HALF_PERIOD) + V_sign = 1.0 if n_half % 2 == 0 else -1.0 + _step(stokes, V_top, dt, V_sign=V_sign) + t_cur += dt + times.append(t_cur) + taus.append(_sigma(stokes)) + times = np.array(times) + taus = np.array(taus) + + sigma_analytical = _step_square_analytical( + times, ETA, MU, 2.0 * V0, HALF_PERIOD + ) + max_err = float(np.max(np.abs(taus - sigma_analytical))) + assert max_err < 0.10, ( + f"pure-VE accuracy regressed under variable dt: " + f"max|err|={max_err:.4f} (threshold 0.10; current head ~0.06)" + ) diff --git a/tests/test_1052_ddt_set_initial_history.py b/tests/test_1052_ddt_set_initial_history.py new file mode 100644 index 00000000..ae5eefc3 --- /dev/null +++ b/tests/test_1052_ddt_set_initial_history.py @@ -0,0 +1,88 @@ +"""Regression tests for SemiLagrangian / Eulerian DDt.set_initial_history. 
+ +The method is the supported entry point for planting BDF history at the +start of a run — used both for analytical-IC benchmarks (no startup +transient) and for checkpoint/restart (resume the multistep history +without an order ramp). + +We instantiate a SemiLagrangian DDt directly (no Stokes solver) and +check the bookkeeping after set_initial_history. +""" + +import warnings +import numpy as np +import pytest +import sympy + +import underworld3 as uw +from underworld3.systems import ddt as ddt_module + +pytestmark = [pytest.mark.level_1, pytest.mark.tier_a] + + +def _make_semilagrangian(order): + """Build a tiny SemiLagrangian DDt for SYM_TENSOR fields.""" + mesh = uw.meshing.StructuredQuadBox( + elementRes=(4, 4), minCoords=(0.0, 0.0), maxCoords=(1.0, 1.0) + ) + v = uw.discretisation.MeshVariable("U", mesh, mesh.dim, degree=1) + psi = sympy.zeros(2, 2) # zero stress placeholder + return ddt_module.SemiLagrangian( + mesh, + psi_fn=psi, + V_fn=v.sym, + vtype=uw.VarType.SYM_TENSOR, + degree=2, + continuous=False, + order=order, + ) + + +class TestSetInitialHistory: + + def test_order1_marks_initialised(self): + d = _make_semilagrangian(order=1) + n_nodes = d.psi_star[0].array.shape[0] + arr = np.zeros((n_nodes, 2, 2)) + arr[:, 0, 1] = arr[:, 1, 0] = 0.4 + d.set_initial_history([arr]) + assert d._history_initialised is True + assert d._n_solves_completed == 1 + assert np.allclose(d.psi_star[0].array[:, 0, 1], 0.4) + + def test_order2_planted_history_and_dt(self): + d = _make_semilagrangian(order=2) + n_nodes = d.psi_star[0].array.shape[0] + a = np.zeros((n_nodes, 2, 2)); a[:, 0, 1] = a[:, 1, 0] = 0.5 + b = np.zeros((n_nodes, 2, 2)); b[:, 0, 1] = b[:, 1, 0] = 0.4 + d.set_initial_history([a, b], dt=0.05) + assert d._history_initialised is True + assert d._n_solves_completed == 2 + assert d._dt_history == [0.05, 0.05] + assert np.allclose(d.psi_star[0].array[:, 0, 1], 0.5) + assert np.allclose(d.psi_star[1].array[:, 0, 1], 0.4) + + def test_scalar_broadcast(self): + 
"""Scalar values broadcast to whole field.""" + d = _make_semilagrangian(order=1) + d.set_initial_history([0.7]) + assert np.allclose(d.psi_star[0].array, 0.7) + + def test_wrong_length_raises(self): + d = _make_semilagrangian(order=2) + with pytest.raises(ValueError, match="requires 2 value"): + d.set_initial_history([0.0]) + + def test_order2_no_dt_warns(self): + d = _make_semilagrangian(order=2) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + d.set_initial_history([0.0, 0.0]) + assert any("variable-dt" in str(rec.message) for rec in w) + + def test_order1_no_dt_silent(self): + d = _make_semilagrangian(order=1) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + d.set_initial_history([0.0]) + assert not any("variable-dt" in str(rec.message) for rec in w)