From b11f2f2ec4c8d6c5acfb07228789cf5a45ed7fec Mon Sep 17 00:00:00 2001
From: Dave Lucia <davelucianyc@gmail.com>
Date: Thu, 21 May 2026 16:16:42 -0700
Subject: [PATCH] build(bench): add quick mode + multi-n inputs to benchmark
 harness

Adds benchmarks/helpers.exs exposing a shared Bench module with two
pre-canned Benchee profiles selected by the LUA_BENCH_MODE env var:

  * default ("quick") - 1 s warmup, 3 s measurement, memory_time off.
    Each benchmarked job takes ~4 s; the full mix lua.bench suite is
    ~80 s instead of ~17 min. For "did my change move the needle?" loops.
  * "full" - 2 s warmup, 10 s measurement, memory_time on, plus a
    sweep of input sizes (n=10, 100, 1000) for the table workloads.
    For any numbers we publish.

Each script loads helpers.exs via Code.require_file/2 and calls
Bench.opts() in place of an inline keyword list. table_ops.exs is
restructured to feed Bench.table_inputs/0 through Benchee's `inputs:`
option so all sizes of a workload are measured in one Benchee.run.

Quick mode trades measurement precision (higher deviation bands) for
iteration speed. Full mode is the source of truth for published
numbers.

The mix lua.bench task forwards the parent process env, so
LUA_BENCH_MODE set in the user's shell propagates to the child
mix run automatically.
---
 benchmarks/closures.exs   |  10 ++-
 benchmarks/fibonacci.exs  |   6 +-
 benchmarks/helpers.exs    |  67 +++++++++++++++++++
 benchmarks/oop.exs        |   6 +-
 benchmarks/string_ops.exs |  14 ++--
 benchmarks/table_ops.exs  | 134 ++++++++++++++++----------------------
 tasks/lua.bench.ex        |  17 ++++-
 7 files changed, 159 insertions(+), 95 deletions(-)
 create mode 100644 benchmarks/helpers.exs

diff --git a/benchmarks/closures.exs b/benchmarks/closures.exs
index f282707..8910e60 100644
--- a/benchmarks/closures.exs
+++ b/benchmarks/closures.exs
@@ -16,6 +16,12 @@
 #   ./benchmarks/setup_luaport.sh           # idempotent; patches + builds
 #   MIX_ENV=benchmark mix run benchmarks/closures.exs
 # If luaport fails to start, the benchmark prints a notice and skips it.
+#
+# Run modes (see benchmarks/helpers.exs):
+#   default                — quick mode (~4 s per benchmarked job)
+#   LUA_BENCH_MODE=full    — long windows + memory_time, for publishable numbers
+
+Code.require_file("helpers.exs", __DIR__)
 
 Application.ensure_all_started(:luerl)
 
@@ -83,9 +89,7 @@ Benchee.run(
     },
     c_lua_benchmarks
   ),
-  time: 10,
-  warmup: 2,
-  memory_time: 1
+  Bench.opts()
 )
 
 c_lua_cleanup.()
diff --git a/benchmarks/fibonacci.exs b/benchmarks/fibonacci.exs
index 6ba154f..502f889 100644
--- a/benchmarks/fibonacci.exs
+++ b/benchmarks/fibonacci.exs
@@ -13,6 +13,8 @@
 #   MIX_ENV=benchmark mix run benchmarks/fibonacci.exs
 # If luaport fails to start, the benchmark prints a notice and skips it.
 
+Code.require_file("helpers.exs", __DIR__)
+
 Application.ensure_all_started(:luerl)
 
 fib_def = """
@@ -61,9 +63,7 @@ Benchee.run(
     },
     c_lua_benchmarks
   ),
-  time: 10,
-  warmup: 2,
-  memory_time: 1
+  Bench.opts()
 )
 
 c_lua_cleanup.()
diff --git a/benchmarks/helpers.exs b/benchmarks/helpers.exs
new file mode 100644
index 0000000..eb719c9
--- /dev/null
+++ b/benchmarks/helpers.exs
@@ -0,0 +1,67 @@
+# Shared configuration for the benchmark scripts under `benchmarks/`.
+#
+# Each script `Code.require_file/2`s this file at the top so the harness
+# stays consistent across workloads. There is one knob — the
+# `LUA_BENCH_MODE` env var — which selects between two pre-canned
+# Benchee profiles:
+#
+#   * `quick` (default) — short windows for iteration during development.
+#     Each benchmarked job takes ~4 seconds. Memory measurement is off.
+#     Five workloads × 4 implementations ≈ 80 seconds wall clock.
+#
+#   * `full` — longer windows + memory measurement, suitable for
+#     end-of-cycle definitive numbers and for the figures we paste into
+#     PR descriptions or ROADMAP.md.
+#
+# Usage:
+#
+#     mix run benchmarks/fibonacci.exs                       # quick
+#     LUA_BENCH_MODE=full mix run benchmarks/fibonacci.exs   # full
+#     mix lua.bench                                          # quick across all
+#     LUA_BENCH_MODE=full mix lua.bench                      # full across all
+#
+# Quick mode is intended for "did my change move the needle?" loops.
+# Full mode is the source of truth for any number we publish.
+
+defmodule Bench do
+  @moduledoc false
+
+  @doc """
+  Returns the Benchee options keyword list for the current run mode.
+
+  The mode comes from the `LUA_BENCH_MODE` environment variable; any value
+  other than `"full"` means quick mode. Keys in `extra` override the profile.
+  """
+  def opts(extra \\ []), do: Keyword.merge(profile(mode()), extra)
+
+  @doc """
+  Returns the input sizes for the table benchmarks as `{label, n}` tuples.
+
+  Quick mode returns a single representative size (n=100) to keep iteration
+  cheap. Full mode returns a sweep (n=10, 100, 1000) so we can see how a
+  workload's perf curve changes with input size.
+  """
+  def table_inputs do
+    case mode() do
+      :full -> [{"small (n=10)", 10}, {"medium (n=100)", 100}, {"large (n=1000)", 1000}]
+      :quick -> [{"medium (n=100)", 100}]
+    end
+  end
+
+  @doc """
+  Prints a `=== name (mode: quick|full) ===` header for a benchmark section
+  so the run output records which mode produced the numbers.
+  """
+  def banner(name), do: IO.puts("\n=== #{name} (mode: #{mode()}) ===\n")
+
+  # Single place that decides the run mode, so `opts/1`, `table_inputs/0` and
+  # `banner/1` cannot disagree about what counts as "full".
+  defp mode do
+    if System.get_env("LUA_BENCH_MODE") == "full", do: :full, else: :quick
+  end
+
+  # Benchee timing profile per mode (seconds); `memory_time: 0` disables
+  # memory measurement.
+  defp profile(:full), do: [time: 10, warmup: 2, memory_time: 1]
+  defp profile(:quick), do: [time: 3, warmup: 1, memory_time: 0]
+end
diff --git a/benchmarks/oop.exs b/benchmarks/oop.exs
index 25ac5a3..e6790e5 100644
--- a/benchmarks/oop.exs
+++ b/benchmarks/oop.exs
@@ -23,6 +23,8 @@
 #   MIX_ENV=benchmark mix run benchmarks/oop.exs
 # If luaport fails to start, the benchmark prints a notice and skips it.
 
+Code.require_file("helpers.exs", __DIR__)
+
 Application.ensure_all_started(:luerl)
 
 oop_def = """
@@ -94,9 +96,7 @@ Benchee.run(
     },
     c_lua_benchmarks
   ),
-  time: 10,
-  warmup: 2,
-  memory_time: 1
+  Bench.opts()
 )
 
 c_lua_cleanup.()
diff --git a/benchmarks/string_ops.exs b/benchmarks/string_ops.exs
index cbdc3aa..50e6688 100644
--- a/benchmarks/string_ops.exs
+++ b/benchmarks/string_ops.exs
@@ -17,6 +17,8 @@
 #   MIX_ENV=benchmark mix run benchmarks/string_ops.exs
 # If luaport fails to start, the benchmark prints a notice and skips it.
 
+Code.require_file("helpers.exs", __DIR__)
+
 Application.ensure_all_started(:luerl)
 
 string_def = """
@@ -68,7 +70,7 @@ luerl_state = :luerl.init()
       {%{}, %{}, fn -> :ok end}
   end
 
-IO.puts("\n=== String Concatenation via table.concat (n=100) ===\n")
+Bench.banner("String Concatenation via table.concat (n=100)")
 
 Benchee.run(
   Map.merge(
@@ -79,12 +81,10 @@ Benchee.run(
     },
     c_lua_concat
   ),
-  time: 10,
-  warmup: 2,
-  memory_time: 1
+  Bench.opts()
 )
 
-IO.puts("\n=== String Formatting via string.format (n=100) ===\n")
+Bench.banner("String Formatting via string.format (n=100)")
 
 Benchee.run(
   Map.merge(
@@ -95,9 +95,7 @@ Benchee.run(
     },
     c_lua_format
   ),
-  time: 10,
-  warmup: 2,
-  memory_time: 1
+  Bench.opts()
 )
 
 c_lua_cleanup.()
diff --git a/benchmarks/table_ops.exs b/benchmarks/table_ops.exs
index dd93659..6dd3243 100644
--- a/benchmarks/table_ops.exs
+++ b/benchmarks/table_ops.exs
@@ -19,6 +19,8 @@
 #   MIX_ENV=benchmark mix run benchmarks/table_ops.exs
 # If luaport fails to start, the benchmark prints a notice and skips it.
 
+Code.require_file("helpers.exs", __DIR__)
+
 Application.ensure_all_started(:luerl)
 
 table_def = """
@@ -68,105 +70,83 @@ function run_table_map_reduce(n)
 end
 """
 
-n = 500
-
-call_build = "return run_table_build(#{n})"
-call_sort = "return run_table_sort(#{n})"
-call_sum = "return run_table_sum(#{n})"
-call_map_reduce = "return run_table_map_reduce(#{n})"
-
 # --- This Lua implementation ---
 lua = Lua.new()
 {_, lua} = Lua.eval!(lua, table_def)
-{build_chunk, _} = Lua.load_chunk!(lua, call_build)
-{sort_chunk, _} = Lua.load_chunk!(lua, call_sort)
-{sum_chunk, _} = Lua.load_chunk!(lua, call_sum)
-{map_reduce_chunk, _} = Lua.load_chunk!(lua, call_map_reduce)
+
+# Pre-compile a chunk per (operation, n) pair so "lua (chunk)" never compiles
+# during measurement. Inputs stay an ordered {label, value} list (Benchee's
+# `inputs:` takes a map or such a list), so sizes report in sweep order.
+sizes = Bench.table_inputs()
+
+build_chunks =
+  Enum.map(sizes, fn {label, n} ->
+    {chunk, _} = Lua.load_chunk!(lua, "return run_table_build(#{n})")
+    {label, {chunk, "return run_table_build(#{n})", n}}
+  end)
+
+sort_chunks =
+  Enum.map(sizes, fn {label, n} ->
+    {chunk, _} = Lua.load_chunk!(lua, "return run_table_sort(#{n})")
+    {label, {chunk, "return run_table_sort(#{n})", n}}
+  end)
+
+sum_chunks =
+  Enum.map(sizes, fn {label, n} ->
+    {chunk, _} = Lua.load_chunk!(lua, "return run_table_sum(#{n})")
+    {label, {chunk, "return run_table_sum(#{n})", n}}
+  end)
+
+map_reduce_chunks =
+  Enum.map(sizes, fn {label, n} ->
+    {chunk, _} = Lua.load_chunk!(lua, "return run_table_map_reduce(#{n})")
+    {label, {chunk, "return run_table_map_reduce(#{n})", n}}
+  end)
 
 # --- Luerl ---
 luerl_state = :luerl.init()
 {:ok, _, luerl_state} = :luerl.do(table_def, luerl_state)
 
 # --- C Lua via luaport (optional) ---
-{c_lua_build, c_lua_sort, c_lua_sum, c_lua_map_reduce, c_lua_cleanup} =
+{c_lua_call, c_lua_cleanup} =
   case Application.ensure_all_started(:luaport) do
     {:ok, _} ->
       scripts_dir = Path.join(__DIR__, "scripts")
       {:ok, port_pid, _} = :luaport.spawn(:table_bench, to_charlist(scripts_dir))
       :luaport.load(port_pid, table_def)
 
-      mk = fn func -> %{"C Lua (luaport)" => fn -> :luaport.call(port_pid, func, [n]) end} end
-
       {
-        mk.(:run_table_build),
-        mk.(:run_table_sort),
-        mk.(:run_table_sum),
-        mk.(:run_table_map_reduce),
+        fn func, n -> :luaport.call(port_pid, func, [n]) end,
         fn -> :luaport.despawn(:table_bench) end
       }
 
     {:error, reason} ->
       IO.puts("luaport not available (#{inspect(reason)}) — skipping C Lua benchmarks")
-      empty = %{}
-      {empty, empty, empty, empty, fn -> :ok end}
+      {nil, fn -> :ok end}
   end
 
-benchee_opts = [time: 10, warmup: 2, memory_time: 1]
-
-IO.puts("\n=== Table Build (n=#{n}) ===\n")
-
-Benchee.run(
-  Map.merge(
-    %{
-      "lua (eval)" => fn -> Lua.eval!(lua, call_build) end,
-      "lua (chunk)" => fn -> Lua.eval!(lua, build_chunk) end,
-      "luerl" => fn -> :luerl.do(call_build, luerl_state) end
-    },
-    c_lua_build
-  ),
-  benchee_opts
-)
-
-IO.puts("\n=== Table Sort (n=#{n}) ===\n")
-
-Benchee.run(
-  Map.merge(
-    %{
-      "lua (eval)" => fn -> Lua.eval!(lua, call_sort) end,
-      "lua (chunk)" => fn -> Lua.eval!(lua, sort_chunk) end,
-      "luerl" => fn -> :luerl.do(call_sort, luerl_state) end
-    },
-    c_lua_sort
-  ),
-  benchee_opts
-)
-
-IO.puts("\n=== Table Iterate/Sum (n=#{n}) ===\n")
-
-Benchee.run(
-  Map.merge(
-    %{
-      "lua (eval)" => fn -> Lua.eval!(lua, call_sum) end,
-      "lua (chunk)" => fn -> Lua.eval!(lua, sum_chunk) end,
-      "luerl" => fn -> :luerl.do(call_sum, luerl_state) end
-    },
-    c_lua_sum
-  ),
-  benchee_opts
-)
-
-IO.puts("\n=== Table Map + Reduce (n=#{n}) ===\n")
-
-Benchee.run(
-  Map.merge(
-    %{
-      "lua (eval)" => fn -> Lua.eval!(lua, call_map_reduce) end,
-      "lua (chunk)" => fn -> Lua.eval!(lua, map_reduce_chunk) end,
-      "luerl" => fn -> :luerl.do(call_map_reduce, luerl_state) end
-    },
-    c_lua_map_reduce
-  ),
-  benchee_opts
-)
+# Runs one table workload: every implementation is a Benchee job that receives
+# a `{chunk, call_str, n}` input and uses the element it needs.
+bench = fn name, inputs, lua_func ->
+  Bench.banner(name)
+
+  jobs = %{
+    "lua (eval)" => fn {_chunk, call_str, _n} -> Lua.eval!(lua, call_str) end,
+    "lua (chunk)" => fn {chunk, _call_str, _n} -> Lua.eval!(lua, chunk) end,
+    "luerl" => fn {_chunk, call_str, _n} -> :luerl.do(call_str, luerl_state) end
+  }
+
+  # luaport is optional; `c_lua_call` is nil when it failed to start.
+  c_lua_job = fn {_chunk, _call_str, n} -> c_lua_call.(lua_func, n) end
+  jobs = if c_lua_call, do: Map.put(jobs, "C Lua (luaport)", c_lua_job), else: jobs
+
+  # `inputs` goes through the `extra` argument of `Bench.opts/1`.
+  Benchee.run(jobs, Bench.opts(inputs: inputs))
+end
+
+bench.("Table Build", build_chunks, :run_table_build)
+bench.("Table Sort", sort_chunks, :run_table_sort)
+bench.("Table Iterate/Sum", sum_chunks, :run_table_sum)
+bench.("Table Map + Reduce", map_reduce_chunks, :run_table_map_reduce)
 
 c_lua_cleanup.()
diff --git a/tasks/lua.bench.ex b/tasks/lua.bench.ex
index 3a7f894..20a618a 100644
--- a/tasks/lua.bench.ex
+++ b/tasks/lua.bench.ex
@@ -22,11 +22,12 @@ defmodule Mix.Tasks.Lua.Bench do
 
   ## Usage
 
-      mix lua.bench                          # run all workloads
+      mix lua.bench                          # run all workloads (quick mode)
       mix lua.bench --workload fibonacci     # run one
       mix lua.bench --list                   # print available workloads
       mix lua.bench --workload fibonacci --workload closures
                                              # run several
+      LUA_BENCH_MODE=full mix lua.bench      # long runs + memory_time + n-sweep
 
   ## Options
 
@@ -35,6 +36,20 @@ defmodule Mix.Tasks.Lua.Bench do
       every workload is run.
     * `--list` — Print the available workloads and exit.
 
+  ## Run modes
+
+  The benchmark scripts read the `LUA_BENCH_MODE` environment variable
+  (see `benchmarks/helpers.exs`):
+
+    * **default (`quick`)** — short Benchee windows (1 s warmup, 3 s
+      measurement, memory_time off) for fast development iteration.
+      Each workload takes ~16 s; the full suite is ~80 s wall clock.
+    * **`full`** — long windows (2 s warmup, 10 s measurement, memory
+      time on) plus a sweep of multiple input sizes for the table
+      workloads. Use this for any numbers you publish (PR descriptions,
+      ROADMAP.md). Each workload takes a minute or two; the full suite
+      runs ~15+ minutes.
+
   ## Notes
 
   This task shells out to `mix run` in the `:benchmark` env so the