In [None]:
import socket, struct, threading, time, random, types, contextlib
import torch, numpy as np, torch.nn as nn
from functools import wraps
from torch.nn.modules.module import register_module_forward_hook



HOST, PORT_BASE, GLOBAL_SEED = "127.0.0.1", 11234, 42
random.seed(GLOBAL_SEED); torch.manual_seed(GLOBAL_SEED)

orig_matmul = torch.matmul
orig_tensor_matmul = torch.Tensor.__matmul__
orig_tensor_rmatmul = torch.Tensor.__rmatmul__



def _verifier_server(B_public, m, n, layer_idx, n_rows, n_cols):
    """Bare-bones TCP listener; exits after one blob."""
    def recvall(sock, n):
        buf = bytearray()
        while len(buf) < n:
            chunk = sock.recv(n - len(buf))
            if not chunk:
                raise RuntimeError("socket closed")
            buf.extend(chunk)
        return bytes(buf)

    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind((HOST, PORT_BASE + layer_idx))
    srv.listen(1)
    conn, _ = srv.accept()

    hdr = recvall(conn, 8)
    layer, rows, cols = struct.unpack("<HHH2x", hdr)
    print(f"[Verifier] Received header: layer_idx={layer}, n_rows={rows}, n_cols={cols}")

    k = B_public.shape[0]
    bytes_A = rows * k * 4
    bytes_C = rows * cols * 4
    payload = recvall(conn, bytes_A + bytes_C)
    print(f"[Verifier] Received payload: {len(payload)} bytes (A: {bytes_A}, C: {bytes_C})")

    A_rows = np.frombuffer(payload[:bytes_A], dtype=np.float32).reshape(rows, k)
    C_vals = np.frombuffer(payload[bytes_A:], dtype=np.float32).reshape(rows, cols)

    rng = random.Random(GLOBAL_SEED)
    row_idx_v = torch.tensor(rng.sample(range(m), n_rows))
    col_idx_v = torch.tensor(rng.sample(range(n), n_cols))

    print(f"[Verifier] row_idx shape: {row_idx_v.shape}, first few: {row_idx_v[:5]}")
    print(f"[Verifier] col_idx shape: {col_idx_v.shape}, first few: {col_idx_v[:5]}")

    # disable hooks **inside verifier** to avoid recursion
    _THREAD.no_hook = True
    try:
        B_sub      = B_public[:, col_idx_v]                    # k × cols
        recomputed = orig_tensor_matmul(torch.from_numpy(A_rows.copy()), B_sub)
    finally:
        _THREAD.no_hook = False

    diff = torch.abs(recomputed - torch.from_numpy(C_vals.copy()))
    ok = torch.allclose(recomputed, torch.from_numpy(C_vals), atol=1e-3, rtol=1e-3)

    print(f"[Verifier] Max diff: {diff.max().item()}, mean diff: {diff.mean().item()}")
    print(f"[Verifier] layer {layer} passed? {ok}\n")
    conn.close(); srv.close()


def _prover_send(sampled_A, sampled_C, layer_idx):
    n_rows = sampled_A.shape[0]
    n_cols = sampled_C.shape[1]
    print(f"[Prover] Sending: n_rows={n_rows}, n_cols={n_cols}")

    buf_A = sampled_A.detach().cpu().numpy().astype(np.float32).tobytes()
    buf_C = sampled_C.detach().cpu().numpy().astype(np.float32).tobytes()
    hdr = struct.pack("<HHH2x", layer_idx, n_rows, n_cols)

    with socket.create_connection((HOST, PORT_BASE + layer_idx)) as s:
        s.sendall(hdr + buf_A + buf_C)
        print(f"[Prover] Sent {len(hdr)+len(buf_A)+len(buf_C)} total bytes\n")


def audit_protocol(A, B, layer_idx):
    if getattr(_THREAD,"in_audit",False):
        return

    if B.dim() != 2:
      raise ValueError("B must be a 2D tensor")

    A_flat = A.reshape(-1,A.shape[-1])

    _THREAD.in_audit = True
    try:
        # use the *un-patched* implementation exactly once
        C = orig_matmul(A, B)
        C_flat = C.reshape(-1,C.shape[-1])

        m = A_flat.shape[0]
        n    = C_flat.shape[1]

        rng = random.Random(GLOBAL_SEED)
        row_idx = torch.tensor(rng.sample(range(m), max(1, int(m * 0.001))))
        col_idx = torch.tensor(rng.sample(range(n), max(1, int(n * 0.01))))

        sampled_A = A_flat[row_idx]
        sampled_C = C_flat[row_idx][:, col_idx]

        print(f"[Prover] row_idx shape: {row_idx.shape}, first few: {row_idx[:5]}")
        print(f"[Prover] col_idx shape: {col_idx.shape}, first few: {col_idx[:5]}")
        print(f"[Prover] sampled_A shape: {sampled_A.shape}")
        print(f"[Prover] sampled_C shape: {sampled_C.shape}")

        th = threading.Thread(
            target=_verifier_server,
            args=(B, m, n, layer_idx,
                  sampled_A.shape[0], sampled_C.shape[1]),
            daemon=True)
        th.start()
        time.sleep(0.05)
        _prover_send(sampled_A, sampled_C, layer_idx)
        th.join()
    finally:
        _THREAD.in_audit = False

# -----------------  monkey-patch + public context  ------------------
_THREAD = threading.local()

def _wrap_fn(fn, op_name, cfg):
    @wraps(fn)
    def wrapper(*args, **kw):
        if getattr(_THREAD, "no_hook", False):
            return fn(*args, **kw)          # bypass while flag is set
        out = fn(*args, **kw)
        if random.random() <= cfg.sample_rate:
            cfg.counter += 1
            audit_protocol(args[0], args[1], cfg.counter)
        return out
    return wrapper


@contextlib.contextmanager
def verification(sample_rate=0.1):
    cfg = types.SimpleNamespace(sample_rate=sample_rate, counter=0)
    _THREAD.records = []

    patched = []
    for name in ("mm", "matmul", "bmm"):
        orig = getattr(torch, name)
        setattr(torch, name, _wrap_fn(orig, name, cfg))
        patched.append((torch, name, orig))

     # ------------ patch tensor @-operator methods ------------------
    def _make_tensor_patch(orig_meth):
        @wraps(orig_meth)
        def _tensor_mm(self, other):
            if getattr(_THREAD, "no_hook", False):
                return orig_meth(self, other)
            out = orig_meth(self, other)
            if random.random() <= cfg.sample_rate:
                cfg.counter += 1
                audit_protocol(self, other, cfg.counter)
            return out
        return _tensor_mm

    for meth_name, orig_meth in (("__matmul__", orig_tensor_matmul),
                                 ("__rmatmul__", orig_tensor_rmatmul)):
        setattr(torch.Tensor, meth_name, _make_tensor_patch(orig_meth))
        patched.append((torch.Tensor, meth_name, orig_meth))

    def _linear_hook(module, inputs, output):
        if isinstance(module, nn.Linear) and random.random() <= cfg.sample_rate:
            cfg.counter += 1
            audit_protocol(inputs[0], module.weight.t(), cfg.counter)

    hook_handle = register_module_forward_hook(_linear_hook)

    try:
        yield
    finally:
        for tgt, name, orig in patched:
            setattr(tgt, name, orig)
        hook_handle.remove()


# ----------------------------  demo  --------------------------------
if __name__ == "__main__":
    A = torch.randn(10000, 10000)
    B = torch.randn(10000, 1000)

    with verification(sample_rate=1.0):
        _ = A @ B
        _ = torch.matmul(A,B)
        torch.matmul(torch.randn(10000,10000), torch.randn(10000,1000))

[Prover] row_idx shape: torch.Size([10]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([432,  32,  30,  95, 223])
[Prover] sampled_A shape: torch.Size([10, 10000])
[Prover] sampled_C shape: torch.Size([10, 10])
[Prover] Sending: n_rows=10, n_cols=10
[Prover] Sent 400408 total bytes

[Verifier] Received header: layer_idx=1, n_rows=10, n_cols=10
[Verifier] Received payload: 400400 bytes (A: 400000, C: 400)
[Verifier] row_idx shape: torch.Size([10]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([432,  32,  30,  95, 223])
[Verifier] Max diff: 0.0001220703125, mean diff: 2.4724602553760633e-05
[Verifier] layer 1 passed? True

[Prover] row_idx shape: torch.Size([10]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([432,  32,  30,  95, 223])
[Prover] sampled_A shape: torch.Size([10, 10000])
[Prover] s

In [None]:
import time, torch

torch.manual_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"

d = 8192
A = torch.randn(d, d, device=device)
B = torch.randn(d, d // 2, device=device)    # non-square keeps sizes realistic

def time_one(fn):
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    fn()
    torch.cuda.synchronize()
    return (time.perf_counter() - t0) * 1000  # ms

# warm-up
for _ in range(5): (A @ B)

baseline = time_one(lambda: (A @ B))

with verification(sample_rate=1.0):          # audit *every* matmul
    audited = time_one(lambda: (A @ B))

print(f"baseline: {baseline:.2f} ms  |  audited: {audited:.2f} ms  "
      f"|  overhead: {audited - baseline:.2f} ms  ({(audited/baseline-1)*100:.1f} %)")


Exception in thread Thread-18 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=1, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
baseline: 144.18 ms  |  audited: 363.44 ms  |  overhead: 219.26 ms  (152.1 %)


In [None]:
torch.manual_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"

d = 8192
A = torch.randn(d, d, device=device)
B = torch.randn(d, d // 2, device=device)

def _sync():
    if device == "cuda":
        torch.cuda.synchronize()

def time_matmuls(iters: int, *, sample_rate: float | None = None) -> float:
    ctx = verification(sample_rate=sample_rate) if sample_rate is not None else contextlib.nullcontext()

    _sync()
    t0 = time.perf_counter()
    with ctx:
        for _ in range(iters):
            _ = A @ B
    _sync()
    return (time.perf_counter() - t0) * 1000.0   # ms


iters = 100
baseline_ms = time_matmuls(iters)                    # no auditing
audited_ms  = time_matmuls(iters, sample_rate=0.10)  # 10 % audits

print(f"{iters} matmuls — baseline: {baseline_ms:.2f} ms | "
      f"audited (10 %): {audited_ms:.2f} ms | "
      f"overhead: {audited_ms - baseline_ms:.2f} ms "
      f"({(audited_ms / baseline_ms - 1) * 100:.1f} %)")


Exception in thread Thread-19 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=1, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-20 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=2, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-21 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=3, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-22 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=4, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-23 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=5, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-24 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=6, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-25 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received header: layer_idx=7, n_rows=8, n_cols=40
[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-26 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received header: layer_idx=8, n_rows=8, n_cols=40
[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-27 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=9, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-28 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=10, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-29 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=11, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-30 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=12, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])


Exception in thread Thread-31 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Prover] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
[Prover] sampled_A shape: torch.Size([8, 8192])
[Prover] sampled_C shape: torch.Size([8, 40])
[Prover] Sending: n_rows=8, n_cols=40
[Verifier] Received header: layer_idx=13, n_rows=8, n_cols=40
[Prover] Sent 263432 total bytes

[Verifier] Received payload: 263424 bytes (A: 262144, C: 1280)
[Verifier] row_idx shape: torch.Size([8]), first few: tensor([1824,  409, 4506, 4012, 3657])
[Verifier] col_idx shape: torch.Size([40]), first few: tensor([3456,  260,  244,  767, 1791])
100 matmuls — baseline: 14436.20 ms | audited (10 %): 17786.70 ms | overhead: 3350.50 ms (23.2 %)


In [27]:
import torch, torch.nn as nn, torch.nn.functional as F

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(10000, 50000)
        self.l2 = nn.Linear(50000, 1000)
    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))

model = Tiny().cuda()
x = torch.randn(1000, 10000, device="cuda")

# Add time analysis for the Tiny model's forward pass
def time_model_forward(model, x, *, sample_rate: float | None = None) -> float:
    ctx = verification(sample_rate=sample_rate) if sample_rate is not None else contextlib.nullcontext()

    _sync()
    t0 = time.perf_counter()
    with ctx:
        for _ in range(100):
          _ = model(x)
    _sync()
    return (time.perf_counter() - t0) * 1000.0

# Warm-up for model forward pass
for _ in range(10): model(x)

baseline_model_ms = time_model_forward(model, x)
audited_model_ms = time_model_forward(model, x, sample_rate=0.1) # Audit every linear layer

print(f"Tiny model forward pass - baseline: {baseline_model_ms:.2f} ms | "
      f"audited (10 %): {audited_model_ms:.2f} ms | "
      f"overhead: {audited_model_ms - baseline_model_ms:.2f} ms "
      f"({(audited_model_ms / baseline_model_ms - 1) * 100:.1f} %)")


Exception in thread Thread-518 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received header: layer_idx=1, n_rows=1, n_cols=10
[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-519 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
[Prover] sampled_A shape: torch.Size([1, 10000])
[Prover] sampled_C shape: torch.Size([1, 500])
[Prover] Sending: n_rows=1, n_cols=500
[Verifier] Received header: layer_idx=2, n_rows=1, n_cols=500
[Verifier] Received payload: 42000 bytes (A: 40000, C: 2000)
[Prover] Sent 42008 total bytes

[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])


Exception in thread Thread-520 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received header: layer_idx=3, n_rows=1, n_cols=10
[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-521 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=4, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-522 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=5, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-523 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=6, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-524 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
[Prover] sampled_A shape: torch.Size([1, 10000])
[Prover] sampled_C shape: torch.Size([1, 500])
[Prover] Sending: n_rows=1, n_cols=500
[Verifier] Received header: layer_idx=7, n_rows=1, n_cols=500
[Verifier] Received payload: 42000 bytes (A: 40000, C: 2000)
[Prover] Sent 42008 total bytes

[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])


Exception in thread Thread-525 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
[Prover] sampled_A shape: torch.Size([1, 10000])
[Prover] sampled_C shape: torch.Size([1, 500])
[Prover] Sending: n_rows=1, n_cols=500
[Verifier] Received header: layer_idx=8, n_rows=1, n_cols=500
[Prover] Sent 42008 total bytes

[Verifier] Received payload: 42000 bytes (A: 40000, C: 2000)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])


Exception in thread Thread-526 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
[Prover] sampled_A shape: torch.Size([1, 10000])
[Prover] sampled_C shape: torch.Size([1, 500])
[Prover] Sending: n_rows=1, n_cols=500
[Prover] Sent 42008 total bytes

[Verifier] Received header: layer_idx=9, n_rows=1, n_cols=500
[Verifier] Received payload: 42000 bytes (A: 40000, C: 2000)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])


Exception in thread Thread-527 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=10, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-528 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=11, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-529 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=12, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-530 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
[Prover] sampled_A shape: torch.Size([1, 10000])
[Prover] sampled_C shape: torch.Size([1, 500])
[Prover] Sending: n_rows=1, n_cols=500
[Prover] Sent 42008 total bytes

[Verifier] Received header: layer_idx=13, n_rows=1, n_cols=500
[Verifier] Received payload: 42000 bytes (A: 40000, C: 2000)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])


Exception in thread Thread-531 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received header: layer_idx=14, n_rows=1, n_cols=10
[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-532 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])
[Prover] sampled_A shape: torch.Size([1, 50000])
[Prover] sampled_C shape: torch.Size([1, 10])
[Prover] Sending: n_rows=1, n_cols=10
[Verifier] Received header: layer_idx=15, n_rows=1, n_cols=10
[Prover] Sent 200048 total bytes

[Verifier] Received payload: 200040 bytes (A: 200000, C: 40)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([10]), first few: tensor([114,  25, 759, 281, 250])


Exception in thread Thread-533 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([654])
[Prover] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
[Prover] sampled_A shape: torch.Size([1, 10000])
[Prover] sampled_C shape: torch.Size([1, 500])
[Prover] Sending: n_rows=1, n_cols=500
[Verifier] Received header: layer_idx=16, n_rows=1, n_cols=500
[Verifier] Received payload: 42000 bytes (A: 40000, C: 2000)
[Prover] Sent 42008 total bytes

[Verifier] row_idx shape: torch.Size([1]), first few: tensor([654])
[Verifier] col_idx shape: torch.Size([500]), first few: tensor([ 7296,  1639, 48598, 18024, 16049])
Tiny model forward pass - baseline: 26023.58 ms | audited (10 %): 29357.54 ms | overhead: 3333.97 ms (12.8 %)


In [20]:
import torch, time, transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.eval()

BATCH   = 16
SEQ_LEN = 32

prompt_text = "The quick brown fox jumps over the lazy dog. " * 4
tokens  = tokenizer(prompt_text, return_tensors="pt")["input_ids"][0][:SEQ_LEN]
inputs  = tokens.unsqueeze(0).repeat(BATCH, 1).to(device)

@torch.no_grad()
def timed_forward(model, inputs, iters=20):
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        _ = model(inputs).logits
    torch.cuda.synchronize()
    return (time.perf_counter() - t0) * 1000 / iters   # ms / batch

iters = 100

# baseline
base_ms = timed_forward(model, inputs, iters)
print(f"Baseline  (no POMM): {base_ms:6.2f} ms / batch")

# using the verification
with verification(sample_rate=0.1) as records:
    pomm_ms = timed_forward(model, inputs, iters)

print(f"With Audit: {pomm_ms:6.2f} ms / batch")
print(f"Overhead: {(pomm_ms - base_ms) / base_ms * 100:5.1f}%")

Exception in thread Thread-235 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


Baseline  (no POMM):  37.65 ms / batch
[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=1, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-236 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)
Exception in thread Thread-237 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when check

[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=2, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=3, n_rows=1, n_

Exception in thread Thread-238 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)
Exception in thread Thread-239 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when check

[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=4, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=5, n_rows=1, n_

Exception in thread Thread-240 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)
Exception in thread Thread-241 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when check

[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=6, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=7, n_rows=1, n_

Exception in thread Thread-242 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=8, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-243 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=9, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-244 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=10, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-245 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Verifier] Received header: layer_idx=11, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Prover] Sent 5088 total bytes

[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-246 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=12, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])


Exception in thread Thread-247 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=13, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-248 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=14, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])


Exception in thread Thread-249 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)
Exception in thread Thread-250 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when check

[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=15, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=16, n_rows=1, 

Exception in thread Thread-251 (_verifier_server):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-1e95f59c15d3>", line 57, in _verifier_server
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)


[Prover] row_idx shape: torch.Size([1]), first few: tensor([114])
[Prover] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
[Prover] sampled_A shape: torch.Size([1, 768])
[Prover] sampled_C shape: torch.Size([1, 502])
[Prover] Sending: n_rows=1, n_cols=502
[Prover] Sent 5088 total bytes

[Verifier] Received header: layer_idx=17, n_rows=1, n_cols=502
[Verifier] Received payload: 5080 bytes (A: 3072, C: 2008)
[Verifier] row_idx shape: torch.Size([1]), first few: tensor([114])
[Verifier] col_idx shape: torch.Size([502]), first few: tensor([ 1639, 48598, 18024, 16049, 14628])
With Audit:  45.97 ms / batch
Overhead:  22.1%
