diff --git a/.changeset/large-ads-report.md b/.changeset/large-ads-report.md
new file mode 100644
index 000000000..991e8717b
--- /dev/null
+++ b/.changeset/large-ads-report.md
@@ -0,0 +1,5 @@
+---
+"@workflow/world-local": patch
+---
+
+Silently ignore stream already closed errors
diff --git a/.github/actions/render-benchmarks/action.yml b/.github/actions/render-benchmarks/action.yml
index 59983bc93..df6150754 100644
--- a/.github/actions/render-benchmarks/action.yml
+++ b/.github/actions/render-benchmarks/action.yml
@@ -8,12 +8,19 @@ inputs:
     description: 'Name of the app being benchmarked'
     required: true
   backend:
-    description: 'Backend type (local, postgres, vercel)'
+    description: 'World type (local, postgres, vercel)'
     required: true
+  baseline-file:
+    description: 'Path to the baseline benchmark JSON file for comparison (optional)'
+    required: false

 runs:
   using: 'composite'
   steps:
     - name: Render benchmark results
       shell: bash
       run: |
-        node ${{ github.action_path }}/render.js "${{ inputs.benchmark-file }}" "${{ inputs.app-name }}" "${{ inputs.backend }}" >> $GITHUB_STEP_SUMMARY
+        BASELINE_ARG=""
+        if [ -n "${{ inputs.baseline-file }}" ] && [ -f "${{ inputs.baseline-file }}" ]; then
+          BASELINE_ARG="--baseline ${{ inputs.baseline-file }}"
+        fi
+        node ${{ github.action_path }}/render.js "${{ inputs.benchmark-file }}" "${{ inputs.app-name }}" "${{ inputs.backend }}" $BASELINE_ARG >> $GITHUB_STEP_SUMMARY
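Review note: render.js below reads a vitest bench results file plus an optional sibling timings file. A minimal sketch of the shapes it assumes (field names inferred from the reads in this diff; real vitest output carries more fields):

```js
// bench-results-<app>-<world>.json (vitest bench output, assumed minimal shape)
const exampleResults = {
  files: [
    {
      groups: [
        {
          benchmarks: [
            { name: 'workflow with 1 step', mean: 1234.5, sampleCount: 5 },
          ],
        },
      ],
    },
  ],
};

// bench-timings-<app>-<world>.json (written by bench.bench.ts, see below)
const exampleTimings = {
  summary: {
    'workflow with 1 step': {
      avgExecutionTimeMs: 1100.2,
      minExecutionTimeMs: 1050,
      maxExecutionTimeMs: 1180,
      samples: 5,
      avgFirstByteTimeMs: 140.3, // present only for stream benchmarks
    },
  },
};
```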
diff --git a/.github/actions/render-benchmarks/render.js b/.github/actions/render-benchmarks/render.js
index 8861ad493..b1059908e 100644
--- a/.github/actions/render-benchmarks/render.js
+++ b/.github/actions/render-benchmarks/render.js
@@ -1,16 +1,35 @@
 #!/usr/bin/env node

 const fs = require('fs');
+const path = require('path');
+
+// Parse arguments: render.js <benchmark-file> <app-name> <backend> [--baseline <file>]
+const args = process.argv.slice(2);
+let benchmarkFile = null;
+let appName = null;
+let backend = null;
+let baselineFile = null;

-const [, , benchmarkFile, appName, backend] = process.argv;
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--baseline' && args[i + 1]) {
+    baselineFile = args[i + 1];
+    i++;
+  } else if (!benchmarkFile) {
+    benchmarkFile = args[i];
+  } else if (!appName) {
+    appName = args[i];
+  } else if (!backend) {
+    backend = args[i];
+  }
+}

 if (!benchmarkFile || !appName || !backend) {
-  console.error('Usage: render.js <benchmark-file> <app-name> <backend>');
+  console.error(
+    'Usage: render.js <benchmark-file> <app-name> <backend> [--baseline <baseline-file>]'
+  );
   process.exit(1);
 }

-const path = require('path');
-
 // Try to load workflow timing data
 let workflowTimings = null;
 // Only replace filename, not directory name
@@ -28,14 +47,76 @@ if (fs.existsSync(timingFile)) {
   }
 }

+// Try to load baseline data
+let baselineData = null;
+let baselineTimings = null;
+if (baselineFile && fs.existsSync(baselineFile)) {
+  try {
+    baselineData = JSON.parse(fs.readFileSync(baselineFile, 'utf-8'));
+    // Also try to load baseline timings
+    const baselineTimingFile = path.join(
+      path.dirname(baselineFile),
+      path.basename(baselineFile).replace('bench-results-', 'bench-timings-')
+    );
+    if (fs.existsSync(baselineTimingFile)) {
+      baselineTimings = JSON.parse(
+        fs.readFileSync(baselineTimingFile, 'utf-8')
+      );
+    }
+  } catch (e) {
+    console.error(`Warning: Could not parse baseline file: ${e.message}`);
+  }
+}
+
+// Build baseline lookup map: benchName -> { wallTime, workflowTime, ttfb }
+const baselineLookup = {};
+if (baselineData) {
+  for (const file of baselineData.files || []) {
+    for (const group of file.groups || []) {
+      for (const bench of group.benchmarks || []) {
+        if (bench.mean !== undefined && bench.mean !== null) {
+          baselineLookup[bench.name] = {
+            wallTime: bench.mean,
+            workflowTime:
+              baselineTimings?.summary?.[bench.name]?.avgExecutionTimeMs ??
+              null,
+            ttfb:
+              baselineTimings?.summary?.[bench.name]?.avgFirstByteTimeMs ??
+              null,
+          };
+        }
+      }
+    }
+  }
+}
+
 // Format number with consistent width
 function formatSec(ms, decimals = 3) {
   return (ms / 1000).toFixed(decimals);
 }

-// Get backend emoji
-function getBackendEmoji(backend) {
-  switch (backend) {
+// Format delta between current and baseline values
+function formatDelta(current, baseline) {
+  if (
+    baseline === null ||
+    baseline === undefined ||
+    current === null ||
+    current === undefined
+  ) {
+    return '';
+  }
+  const percentChange = ((current - baseline) / baseline) * 100;
+  if (Math.abs(percentChange) < 0.5) {
+    return ' (~)';
+  }
+  const sign = percentChange > 0 ? '+' : '';
+  const emoji = percentChange > 5 ? ' 🔺' : percentChange < -5 ? ' 🟢' : '';
+  return ` (${sign}${percentChange.toFixed(1)}%${emoji})`;
+}
+
+// Get world emoji
+function getWorldEmoji(world) {
+  switch (world) {
     case 'vercel':
       return '▲';
     case 'postgres':
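Review note: a quick sanity check of the delta thresholds above (illustrative values, assuming the function is in scope):

```js
formatDelta(1001, 1000); // ' (~)'          : |0.1%| < 0.5%, treated as noise
formatDelta(1040, 1000); // ' (+4.0%)'      : slower, but under the 5% emoji threshold
formatDelta(1200, 1000); // ' (+20.0% 🔺)'  : more than 5% slower
formatDelta(900, 1000);  // ' (-10.0% 🟢)'  : more than 5% faster
formatDelta(950, null);  // ''              : no baseline, no annotation
```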
@@ -50,56 +131,140 @@ try {
   const data = JSON.parse(fs.readFileSync(benchmarkFile, 'utf-8'));

-  const emoji = getBackendEmoji(backend);
-  console.log(`## ${emoji} Benchmark Results: ${appName} (${backend})\n`);
+  const emoji = getWorldEmoji(backend);
+  console.log(`## ${emoji} Benchmark Results: ${appName} (${backend} world)\n`);
+
+  // Show baseline comparison note if baseline data is available
+  if (Object.keys(baselineLookup).length > 0) {
+    console.log(
+      '> 📈 _Comparing against baseline from `main` branch. Green 🟢 = faster, Red 🔺 = slower._\n'
+    );
+  }

   for (const file of data.files) {
     for (const group of file.groups) {
-      // Workflow Time is primary metric, Wall Time is secondary
-      console.log(
-        '| Benchmark | Workflow Time (avg) | Min | Max | Wall Time | Overhead | Samples |'
-      );
-      console.log(
-        '|:----------|--------------------:|----:|----:|----------:|---------:|--------:|'
-      );
+      // Separate regular and stream benchmarks
+      const regularBenchmarks = [];
+      const streamBenchmarks = [];

       for (const bench of group.benchmarks) {
-        // Skip benchmarks without valid timing data (failed or timed out)
-        if (bench.mean === undefined || bench.mean === null) {
-          console.log(`| ${bench.name} | ⚠️ No data | - | - | - | - | 0 |`);
-          continue;
+        const summary = workflowTimings?.summary?.[bench.name];
+        if (summary?.avgFirstByteTimeMs !== undefined) {
+          streamBenchmarks.push(bench);
+        } else {
+          regularBenchmarks.push(bench);
         }
+      }

-        const wallTimeSec = formatSec(bench.mean);
+      // Render regular benchmarks
+      if (regularBenchmarks.length > 0) {
+        console.log(
+          '| Benchmark | Workflow Time (avg) | Min | Max | Wall Time | Overhead | Samples |'
+        );
+        console.log(
+          '|:----------|--------------------:|----:|----:|----------:|---------:|--------:|'
+        );

-        // Get workflow execution time if available
-        let workflowTimeSec = '-';
-        let minTimeSec = '-';
-        let maxTimeSec = '-';
-        let overheadSec = '-';
+        for (const bench of regularBenchmarks) {
+          // Skip benchmarks without valid timing data (failed or timed out)
+          if (bench.mean === undefined || bench.mean === null) {
+            console.log(`| ${bench.name} | ⚠️ No data | - | - | - | - | 0 |`);
+            continue;
+          }

-        if (workflowTimings?.summary?.[bench.name]) {
-          const summary = workflowTimings.summary[bench.name];
-          workflowTimeSec = formatSec(summary.avgExecutionTimeMs);
+          const baseline = baselineLookup[bench.name];
+          const wallTimeSec = formatSec(bench.mean);
+          const wallDelta = formatDelta(bench.mean, baseline?.wallTime);
+          let workflowTimeSec = '-';
+          let workflowDelta = '';
+          let minTimeSec = '-';
+          let maxTimeSec = '-';
+          let overheadSec = '-';

-          // Get min/max if available
-          if (summary.minExecutionTimeMs !== undefined) {
-            minTimeSec = formatSec(summary.minExecutionTimeMs);
-          }
-          if (summary.maxExecutionTimeMs !== undefined) {
-            maxTimeSec = formatSec(summary.maxExecutionTimeMs);
+          if (workflowTimings?.summary?.[bench.name]) {
+            const summary = workflowTimings.summary[bench.name];
+            workflowTimeSec = formatSec(summary.avgExecutionTimeMs);
+            workflowDelta = formatDelta(
+              summary.avgExecutionTimeMs,
+              baseline?.workflowTime
+            );
+            if (summary.minExecutionTimeMs !== undefined) {
+              minTimeSec = formatSec(summary.minExecutionTimeMs);
+            }
+            if (summary.maxExecutionTimeMs !== undefined) {
+              maxTimeSec = formatSec(summary.maxExecutionTimeMs);
+            }
+            const overheadMs = bench.mean - summary.avgExecutionTimeMs;
+            overheadSec = formatSec(overheadMs);
           }

-          // Calculate overhead (wall time - workflow time)
-          const overheadMs = bench.mean - summary.avgExecutionTimeMs;
-          overheadSec = formatSec(overheadMs);
+          console.log(
+            `| ${bench.name} | ${workflowTimeSec}s${workflowDelta} | ${minTimeSec}s | ${maxTimeSec}s | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${bench.sampleCount} |`
+          );
         }
+        console.log('');
+      }

+      // Render stream benchmarks with TTFB column
+      if (streamBenchmarks.length > 0) {
+        console.log('**Stream Benchmarks**\n');
+        console.log(
+          '| Benchmark | Workflow Time (avg) | TTFB | Min | Max | Wall Time | Overhead | Samples |'
+        );
         console.log(
-          `| ${bench.name} | ${workflowTimeSec}s | ${minTimeSec}s | ${maxTimeSec}s | ${wallTimeSec}s | ${overheadSec}s | ${bench.sampleCount} |`
+          '|:----------|--------------------:|-----:|----:|----:|----------:|---------:|--------:|'
         );
+
+        for (const bench of streamBenchmarks) {
+          // Skip benchmarks without valid timing data (failed or timed out)
+          if (bench.mean === undefined || bench.mean === null) {
+            console.log(
+              `| ${bench.name} | ⚠️ No data | - | - | - | - | - | 0 |`
+            );
+            continue;
+          }
+
+          const baseline = baselineLookup[bench.name];
+          const wallTimeSec = formatSec(bench.mean);
+          const wallDelta = formatDelta(bench.mean, baseline?.wallTime);
+          let workflowTimeSec = '-';
+          let workflowDelta = '';
+          let minTimeSec = '-';
+          let maxTimeSec = '-';
+          let overheadSec = '-';
+          let ttfbSec = '-';
+          let ttfbDelta = '';
+
+          if (workflowTimings?.summary?.[bench.name]) {
+            const summary = workflowTimings.summary[bench.name];
+            workflowTimeSec = formatSec(summary.avgExecutionTimeMs);
+            workflowDelta = formatDelta(
+              summary.avgExecutionTimeMs,
+              baseline?.workflowTime
+            );
+            if (summary.minExecutionTimeMs !== undefined) {
+              minTimeSec = formatSec(summary.minExecutionTimeMs);
+            }
+            if (summary.maxExecutionTimeMs !== undefined) {
+              maxTimeSec = formatSec(summary.maxExecutionTimeMs);
+            }
+            if (summary.avgFirstByteTimeMs !== undefined) {
+              ttfbSec = formatSec(summary.avgFirstByteTimeMs);
+              ttfbDelta = formatDelta(
+                summary.avgFirstByteTimeMs,
+                baseline?.ttfb
+              );
+            }
+            const overheadMs = bench.mean - summary.avgExecutionTimeMs;
+            overheadSec = formatSec(overheadMs);
+          }
+
+          console.log(
+            `| ${bench.name} | ${workflowTimeSec}s${workflowDelta} | ${ttfbSec}s${ttfbDelta} | ${minTimeSec}s | ${maxTimeSec}s | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${bench.sampleCount} |`
+          );
+        }
+        console.log('');
       }
-      console.log('');
     }
   }
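Review note: with a baseline available, a regular row and a stream row come out roughly like this (numbers invented for illustration):

```js
// Regular table:
// | workflow with 1 step | 1.100s (-8.2% 🟢) | 1.050s | 1.180s | 1.235s (+1.1%) | 0.135s | 5 |
// Stream table (extra TTFB column):
// | workflow with stream | 1.900s (~) | 0.140s (-12.5% 🟢) | 1.850s | 1.960s | 2.100s (+0.6%) | 0.200s | 5 |
```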
@@ -116,6 +281,9 @@ try {
   );
   console.log('- **Overhead**: Testbench overhead (Wall Time - Workflow Time)');
   console.log('- **Samples**: Number of benchmark iterations run');
+  console.log(
+    '- **TTFB**: Time to First Byte - time from workflow start until first stream byte received (stream benchmarks only)'
+  );
   console.log('');
 } catch (error) {
   console.error(`Error rendering benchmark results: ${error.message}`);
diff --git a/.github/scripts/aggregate-benchmarks.js b/.github/scripts/aggregate-benchmarks.js
index 23881f438..f6b3c2dc9 100644
--- a/.github/scripts/aggregate-benchmarks.js
+++ b/.github/scripts/aggregate-benchmarks.js
@@ -3,10 +3,22 @@
 const fs = require('fs');
 const path = require('path');

-const [, , resultsDir = '.'] = process.argv;
+// Parse command line arguments
+const args = process.argv.slice(2);
+let resultsDir = '.';
+let baselineDir = null;
+
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--baseline' && args[i + 1]) {
+    baselineDir = args[i + 1];
+    i++;
+  } else if (!args[i].startsWith('--')) {
+    resultsDir = args[i];
+  }
+}

-// Backend display config
-const backendConfig = {
+// World display config
+const worldConfig = {
   local: { emoji: '💻', label: 'Local' },
   postgres: { emoji: '🐘', label: 'Postgres' },
   vercel: { emoji: '▲', label: 'Vercel' },
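Review note: the two accepted invocations and what the loop above parses them into (paths illustrative):

```js
// node .github/scripts/aggregate-benchmarks.js benchmark-results
//   -> resultsDir = 'benchmark-results', baselineDir = null
// node .github/scripts/aggregate-benchmarks.js benchmark-results --baseline baseline-results
//   -> resultsDir = 'benchmark-results', baselineDir = 'baseline-results'
```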
@@ -15,7 +27,7 @@
 // Framework display config
 const frameworkConfig = {
   'nextjs-turbopack': { label: 'Next.js (Turbopack)' },
-  nitro: { label: 'Nitro' },
+  'nitro-v3': { label: 'Nitro' },
   express: { label: 'Express' },
 };

@@ -24,6 +36,26 @@
 function formatSec(ms, decimals = 3) {
   return (ms / 1000).toFixed(decimals);
 }

+// Format delta between current and baseline values
+// Returns string like "+12.3%" (slower) or "-5.2%" (faster) or "" if no baseline
+function formatDelta(current, baseline) {
+  if (
+    baseline === null ||
+    baseline === undefined ||
+    current === null ||
+    current === undefined
+  ) {
+    return '';
+  }
+  const percentChange = ((current - baseline) / baseline) * 100;
+  if (Math.abs(percentChange) < 0.5) {
+    return ' (~)';
+  }
+  const sign = percentChange > 0 ? '+' : '';
+  const emoji = percentChange > 5 ? ' 🔺' : percentChange < -5 ? ' 🟢' : '';
+  return ` (${sign}${percentChange.toFixed(1)}%${emoji})`;
+}
+
 // Find all benchmark result files
 function findBenchmarkFiles(dir) {
   const files = [];
@@ -75,7 +107,7 @@ function loadTimingData(benchmarkFile) {
 // Collect all benchmark data
 function collectBenchmarkData(resultFiles) {
-  // Structure: { [benchmarkName]: { [app]: { [backend]: { wallTime, workflowTime, overhead, min, max, samples } } } }
+  // Structure: { [benchmarkName]: { [app]: { [backend]: { wallTime, workflowTime, overhead, min, max, samples, firstByteTime } } } }
   const data = {};

   for (const file of resultFiles) {
@@ -107,8 +139,13 @@
       // Get workflow timing if available
       let workflowTimeMs = null;
+      let firstByteTimeMs = null;
       if (timings?.summary?.[benchName]) {
         workflowTimeMs = timings.summary[benchName].avgExecutionTimeMs;
+        // Get TTFB for stream benchmarks
+        if (timings.summary[benchName].avgFirstByteTimeMs !== undefined) {
+          firstByteTimeMs = timings.summary[benchName].avgFirstByteTimeMs;
+        }
       }

       data[benchName][app][backend] = {
@@ -119,6 +156,7 @@
         min: bench.min,
         max: bench.max,
         samples: bench.sampleCount,
+        firstByteTime: firstByteTimeMs,
       };
     }
   }
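Review note: a concrete instance of the structure collectBenchmarkData now builds (values invented):

```js
const exampleData = {
  'workflow with 1 step': {
    'nextjs-turbopack': {
      local: {
        wallTime: 1235.4,     // vitest mean, in ms
        workflowTime: 1100.2, // completedAt - createdAt, null without timing data
        overhead: 135.2,      // wallTime - workflowTime
        min: 1050,
        max: 1180,
        samples: 5,
        firstByteTime: null,  // set only for stream benchmarks
      },
    },
  },
};
```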
@@ -159,174 +197,345 @@ function getAppsAndBackends(data) {
   return { apps: sortedApps, backends: sortedBackends };
 }

-// Render the comparison tables
-function renderComparison(data) {
-  const { apps, backends } = getAppsAndBackends(data);
+// Check if a benchmark has TTFB data (is a stream benchmark)
+function isStreamBenchmark(benchData, apps, backends) {
+  for (const app of apps) {
+    for (const backend of backends) {
+      if (benchData[app]?.[backend]?.firstByteTime !== null) {
+        return true;
+      }
+    }
+  }
+  return false;
+}

-  if (Object.keys(data).length === 0) {
-    console.log('No benchmark data found.\n');
+// Render a single benchmark table
+function renderBenchmarkTable(
+  benchName,
+  benchData,
+  baselineBenchData,
+  apps,
+  backends,
+  isStream
+) {
+  console.log(`## ${benchName}\n`);
+
+  // Collect all data points (including missing ones) for all app/backend combinations
+  const dataPoints = [];
+  const validDataPoints = [];
+  for (const app of apps) {
+    for (const backend of backends) {
+      const metrics = benchData[app]?.[backend];
+      const baseline = baselineBenchData?.[app]?.[backend] || null;
+      const dataPoint = { app, backend, metrics: metrics || null, baseline };
+      dataPoints.push(dataPoint);
+      if (metrics) {
+        validDataPoints.push(dataPoint);
+      }
+    }
+  }
+
+  if (validDataPoints.length === 0) {
+    console.log('_No data available_\n');
     return;
   }

-  console.log('# 📊 Benchmark Comparison\n');
-  console.log(
-    'Cross-matrix comparison of workflow performance across frameworks and backends.\n'
-  );
+  // Sort valid data points by workflow time for ranking
+  validDataPoints.sort((a, b) => {
+    const aTime = a.metrics.workflowTime ?? a.metrics.wallTime;
+    const bTime = b.metrics.workflowTime ?? b.metrics.wallTime;
+    return aTime - bTime;
+  });
+  const fastest = validDataPoints[0];
+  const fastestTime = fastest.metrics.workflowTime ?? fastest.metrics.wallTime;
+
+  // Sort all data points: valid ones first (by time), then missing ones
+  dataPoints.sort((a, b) => {
+    // Missing data goes to the end
+    if (!a.metrics && !b.metrics) return 0;
+    if (!a.metrics) return 1;
+    if (!b.metrics) return -1;
+    // Valid data sorted by time
+    const aTime = a.metrics.workflowTime ?? a.metrics.wallTime;
+    const bTime = b.metrics.workflowTime ?? b.metrics.wallTime;
+    return aTime - bTime;
+  });
+
+  // Render table - different columns for stream vs regular benchmarks
+  if (isStream) {
+    console.log(
+      '| World | Framework | Workflow Time | TTFB | Wall Time | Overhead | vs Fastest |'
+    );
+    console.log(
+      '|:------|:----------|--------------:|-----:|----------:|---------:|-----------:|'
+    );
+  } else {
+    console.log(
+      '| World | Framework | Workflow Time | Wall Time | Overhead | vs Fastest |'
+    );
+    console.log(
+      '|:------|:----------|--------------:|----------:|---------:|-----------:|'
+    );
+  }

-  // For each benchmark, create a comparison table
-  for (const [benchName, benchData] of Object.entries(data)) {
-    console.log(`## ${benchName}\n`);
+  for (const { app, backend, metrics, baseline } of dataPoints) {
+    const worldInfo = worldConfig[backend] || {
+      emoji: '',
+      label: backend,
+    };
+    const frameworkInfo = frameworkConfig[app] || { label: app };

-    // Collect all data points with their wall times for ranking
-    const dataPoints = [];
-    for (const app of apps) {
-      for (const backend of backends) {
-        const metrics = benchData[app]?.[backend];
-        if (metrics) {
-          dataPoints.push({ app, backend, metrics });
-        }
+    // Handle missing data
+    if (!metrics) {
+      if (isStream) {
+        console.log(
+          `| ${worldInfo.emoji} ${worldInfo.label} | ${frameworkInfo.label} | ⚠️ _missing_ | - | - | - | - |`
+        );
+      } else {
+        console.log(
+          `| ${worldInfo.emoji} ${worldInfo.label} | ${frameworkInfo.label} | ⚠️ _missing_ | - | - | - |`
+        );
       }
-    }
-
-    if (dataPoints.length === 0) {
-      console.log('_No data available_\n');
       continue;
     }

-    // Sort by workflow time (primary metric), fall back to wall time if workflow time unavailable
-    dataPoints.sort((a, b) => {
-      const aTime = a.metrics.workflowTime ?? a.metrics.wallTime;
-      const bTime = b.metrics.workflowTime ?? b.metrics.wallTime;
-      return aTime - bTime;
-    });
-    const fastest = dataPoints[0];
-    const fastestTime =
-      fastest.metrics.workflowTime ?? fastest.metrics.wallTime;
-
-    // Render table - Workflow Time is primary metric
-    console.log(
-      '| Backend | Framework | Workflow Time | Wall Time | Overhead | vs Fastest |'
+    const isFastest = metrics === fastest.metrics;
+    const medal = isFastest ? '🥇 ' : '';
+
+    // Format workflow time with delta
+    const workflowTimeSec =
+      metrics.workflowTime !== null ? formatSec(metrics.workflowTime) : '-';
+    const workflowDelta = formatDelta(
+      metrics.workflowTime,
+      baseline?.workflowTime
+    );
+
+    // Format wall time with delta
+    const wallTimeSec = formatSec(metrics.wallTime);
+    const wallDelta = formatDelta(metrics.wallTime, baseline?.wallTime);
+
+    // Format overhead (no delta needed, it's derived)
+    const overheadSec =
+      metrics.overhead !== null ? formatSec(metrics.overhead) : '-';
+
+    // Format TTFB with delta for stream benchmarks
+    const firstByteSec =
+      metrics.firstByteTime !== null ? formatSec(metrics.firstByteTime) : '-';
+    const ttfbDelta = formatDelta(
+      metrics.firstByteTime,
+      baseline?.firstByteTime
     );
+
+    const currentTime = metrics.workflowTime ?? metrics.wallTime;
+    const factor = isFastest
+      ? '1.00x'
+      : `${(currentTime / fastestTime).toFixed(2)}x`;
+
+    if (isStream) {
+      console.log(
+        `| ${worldInfo.emoji} ${worldInfo.label} | ${medal}${frameworkInfo.label} | ${workflowTimeSec}s${workflowDelta} | ${firstByteSec}s${ttfbDelta} | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${factor} |`
+      );
+    } else {
+      console.log(
+        `| ${worldInfo.emoji} ${worldInfo.label} | ${medal}${frameworkInfo.label} | ${workflowTimeSec}s${workflowDelta} | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${factor} |`
+      );
+    }
+  }
   console.log('');
+}
+
+// Render the comparison tables
+function renderComparison(data, baselineData) {
+  const { apps, backends } = getAppsAndBackends(data);
+
+  if (Object.keys(data).length === 0) {
+    console.log('No benchmark data found.\n');
+    return;
+  }
+
+  console.log('\n');
+  console.log('## 📊 Benchmark Results\n');
+
+  // Show baseline comparison note if baseline data is available
+  if (baselineData && Object.keys(baselineData).length > 0) {
     console.log(
-      '|:--------|:----------|--------------:|----------:|---------:|-----------:|'
+      '> 📈 _Comparing against baseline from `main` branch. Green 🟢 = faster, Red 🔺 = slower._\n'
     );
+  }

-    for (const { app, backend, metrics } of dataPoints) {
-      const backendInfo = backendConfig[backend] || {
-        emoji: '',
-        label: backend,
-      };
-      const frameworkInfo = frameworkConfig[app] || { label: app };
+  // Separate benchmarks into regular and stream categories
+  const regularBenchmarks = [];
+  const streamBenchmarks = [];

-      const isFastest = metrics === fastest.metrics;
-      const medal = isFastest ? '🥇 ' : '';
+  for (const [benchName, benchData] of Object.entries(data)) {
+    if (isStreamBenchmark(benchData, apps, backends)) {
+      streamBenchmarks.push([benchName, benchData]);
+    } else {
+      regularBenchmarks.push([benchName, benchData]);
+    }
+  }

-      const workflowTimeSec =
-        metrics.workflowTime !== null ? formatSec(metrics.workflowTime) : '-';
-      const wallTimeSec = formatSec(metrics.wallTime);
-      const overheadSec =
-        metrics.overhead !== null ? formatSec(metrics.overhead) : '-';
+  // Render regular benchmarks first
+  if (regularBenchmarks.length > 0) {
+    for (const [benchName, benchData] of regularBenchmarks) {
+      const baselineBenchData = baselineData?.[benchName] || null;
+      renderBenchmarkTable(
+        benchName,
+        benchData,
+        baselineBenchData,
+        apps,
+        backends,
+        false
+      );
+    }
+  }

-      const currentTime = metrics.workflowTime ?? metrics.wallTime;
-      const factor = isFastest
-        ? '1.00x'
-        : `${(currentTime / fastestTime).toFixed(2)}x`;
+  // Render stream benchmarks in a separate section
+  if (streamBenchmarks.length > 0) {
+    console.log('---\n');
+    console.log('### Stream Benchmarks\n');
+    console.log(
+      '_Stream benchmarks include Time to First Byte (TTFB) metrics._\n'
+    );

-      console.log(
-        `| ${backendInfo.emoji} ${backendInfo.label} | ${medal}${frameworkInfo.label} | ${workflowTimeSec}s | ${wallTimeSec}s | ${overheadSec}s | ${factor} |`
+    for (const [benchName, benchData] of streamBenchmarks) {
+      const baselineBenchData = baselineData?.[benchName] || null;
+      renderBenchmarkTable(
+        benchName,
+        benchData,
+        baselineBenchData,
+        apps,
+        backends,
+        true
       );
     }
-    console.log('');
   }

-  // Summary: Best framework per backend (by Workflow Time)
-  console.log('## Summary: Fastest Framework by Backend\n');
-  console.log('| Backend | Fastest Framework | Workflow Time |');
-  console.log('|:--------|:------------------|---------------:|');
+  // Summary: Count wins per framework (within each world) and per world (within each framework)
+  const allBenchmarks = [...regularBenchmarks, ...streamBenchmarks];

-  for (const backend of backends) {
-    const backendInfo = backendConfig[backend] || { emoji: '', label: backend };
-    let fastestApp = null;
-    let fastestTime = Infinity;
+  // Count wins: for each world, which framework wins most benchmarks
+  const frameworkWinsByWorld = {}; // { backend: { app: count } }
+  // Count wins: for each framework, which world wins most benchmarks
+  const worldWinsByFramework = {}; // { app: { backend: count } }

-    // Average workflow time across all benchmarks for this backend
-    const appTotals = {};
-    const appCounts = {};
+  for (const [benchName, benchData] of allBenchmarks) {
+    // For each world, find the fastest framework
+    for (const backend of backends) {
+      let fastestApp = null;
+      let fastestTime = Infinity;

-    for (const benchData of Object.values(data)) {
       for (const app of apps) {
         const metrics = benchData[app]?.[backend];
         if (metrics) {
           const time = metrics.workflowTime ?? metrics.wallTime;
-          appTotals[app] = (appTotals[app] || 0) + time;
-          appCounts[app] = (appCounts[app] || 0) + 1;
+          if (time < fastestTime) {
+            fastestTime = time;
+            fastestApp = app;
+          }
         }
       }
+
+      if (fastestApp) {
+        if (!frameworkWinsByWorld[backend]) {
+          frameworkWinsByWorld[backend] = {};
+        }
+        frameworkWinsByWorld[backend][fastestApp] =
+          (frameworkWinsByWorld[backend][fastestApp] || 0) + 1;
+      }
     }

+    // For each framework, find the fastest world
     for (const app of apps) {
-      if (appCounts[app] > 0) {
-        const avgTime = appTotals[app] / appCounts[app];
-        if (avgTime < fastestTime) {
-          fastestTime = avgTime;
-          fastestApp = app;
+      let fastestBackend = null;
+      let fastestTime = Infinity;
+
+      for (const backend of backends) {
+        const metrics = benchData[app]?.[backend];
+        if (metrics) {
+          const time = metrics.workflowTime ?? metrics.wallTime;
+          if (time < fastestTime) {
+            fastestTime = time;
+            fastestBackend = backend;
+          }
+        }
+      }
+
+      if (fastestBackend) {
+        if (!worldWinsByFramework[app]) {
+          worldWinsByFramework[app] = {};
         }
+        worldWinsByFramework[app][fastestBackend] =
+          (worldWinsByFramework[app][fastestBackend] || 0) + 1;
       }
     }
+  }

-    if (fastestApp) {
-      const frameworkInfo = frameworkConfig[fastestApp] || {
-        label: fastestApp,
-      };
+  // Summary: Best framework per world (by wins)
+  console.log('---\n');
+  console.log('## Summary: Fastest Framework by World\n');
+  console.log(`_Winner determined by most benchmark wins_\n`);
+  console.log('| World | 🥇 Fastest Framework | Wins |');
+  console.log('|:------|:---------------------|-----:|');
+
+  for (const backend of backends) {
+    const worldInfo = worldConfig[backend] || { emoji: '', label: backend };
+    const frameworkWins = frameworkWinsByWorld[backend] || {};
+
+    // Find framework with most wins
+    let bestApp = null;
+    let bestWins = 0;
+    for (const [app, wins] of Object.entries(frameworkWins)) {
+      if (wins > bestWins) {
+        bestWins = wins;
+        bestApp = app;
+      }
+    }
+
+    if (bestApp) {
+      const frameworkInfo = frameworkConfig[bestApp] || { label: bestApp };
+      // Count total benchmarks for this world (benchmarks that have data for this world)
+      const totalForWorld = allBenchmarks.filter(([, bd]) =>
+        apps.some((a) => bd[a]?.[backend])
+      ).length;
       console.log(
-        `| ${backendInfo.emoji} ${backendInfo.label} | ${frameworkInfo.label} | ${formatSec(fastestTime)}s (avg) |`
+        `| ${worldInfo.emoji} ${worldInfo.label} | ${frameworkInfo.label} | ${bestWins}/${totalForWorld} |`
       );
     }
   }
   console.log('');

-  // Summary: Best backend per framework (by Workflow Time)
-  console.log('## Summary: Fastest Backend by Framework\n');
-  console.log('| Framework | Fastest Backend | Workflow Time |');
-  console.log('|:----------|:----------------|---------------:|');
+  // Summary: Best world per framework (by wins)
+  console.log('## Summary: Fastest World by Framework\n');
+  console.log(`_Winner determined by most benchmark wins_\n`);
+  console.log('| Framework | 🥇 Fastest World | Wins |');
+  console.log('|:----------|:-----------------|-----:|');

   for (const app of apps) {
     const frameworkInfo = frameworkConfig[app] || { label: app };
-    let fastestBackend = null;
-    let fastestTime = Infinity;
-
-    // Average workflow time across all benchmarks for this app
-    const backendTotals = {};
-    const backendCounts = {};
-
-    for (const benchData of Object.values(data)) {
-      for (const backend of backends) {
-        const metrics = benchData[app]?.[backend];
-        if (metrics) {
-          const time = metrics.workflowTime ?? metrics.wallTime;
-          backendTotals[backend] = (backendTotals[backend] || 0) + time;
-          backendCounts[backend] = (backendCounts[backend] || 0) + 1;
-        }
+    const worldWins = worldWinsByFramework[app] || {};
+
+    // Find world with most wins
+    let bestBackend = null;
+    let bestWins = 0;
+    for (const [backend, wins] of Object.entries(worldWins)) {
+      if (wins > bestWins) {
+        bestWins = wins;
+        bestBackend = backend;
       }
     }

-    for (const backend of backends) {
-      if (backendCounts[backend] > 0) {
-        const avgTime = backendTotals[backend] / backendCounts[backend];
-        if (avgTime < fastestTime) {
-          fastestTime = avgTime;
-          fastestBackend = backend;
-        }
-      }
-    }
-
-    if (fastestBackend) {
-      const backendInfo = backendConfig[fastestBackend] || {
+    if (bestBackend) {
+      const worldInfo = worldConfig[bestBackend] || {
         emoji: '',
-        label: fastestBackend,
+        label: bestBackend,
       };
+      // Count total benchmarks for this framework (benchmarks that have data for this framework)
+      const totalForFramework = allBenchmarks.filter(([, bd]) =>
+        backends.some((b) => bd[app]?.[b])
+      ).length;
       console.log(
-        `| ${frameworkInfo.label} | ${backendInfo.emoji} ${backendInfo.label} | ${formatSec(fastestTime)}s (avg) |`
+        `| ${frameworkInfo.label} | ${worldInfo.emoji} ${worldInfo.label} | ${bestWins}/${totalForFramework} |`
       );
     }
   }
@@ -338,6 +547,9 @@ function renderComparison(data) {
   console.log(
     '- **Workflow Time**: Runtime reported by workflow (completedAt - createdAt) - *primary metric*'
   );
+  console.log(
+    '- **TTFB**: Time to First Byte - time from workflow start until first stream byte received (stream benchmarks only)'
+  );
   console.log(
     '- **Wall Time**: Total testbench time (trigger workflow + poll for result)'
   );
@@ -346,10 +558,10 @@
     '- **vs Fastest**: How much slower compared to the fastest configuration for this benchmark'
   );
   console.log('');
-  console.log('**Backends:**');
-  console.log('- 💻 Local: In-memory filesystem backend');
-  console.log('- 🐘 Postgres: PostgreSQL database backend');
-  console.log('- ▲ Vercel: Vercel production backend');
+  console.log('**Worlds:**');
+  console.log('- 💻 Local: In-memory filesystem world');
+  console.log('- 🐘 Postgres: PostgreSQL database world');
+  console.log('- ▲ Vercel: Vercel production world');
   console.log('');
 }

@@ -362,4 +574,14 @@ if (resultFiles.length === 0) {
 }

 const data = collectBenchmarkData(resultFiles);
-renderComparison(data);
+
+// Load baseline data if provided
+let baselineData = null;
+if (baselineDir) {
+  const baselineFiles = findBenchmarkFiles(baselineDir);
+  if (baselineFiles.length > 0) {
+    baselineData = collectBenchmarkData(baselineFiles);
+  }
+}
+
+renderComparison(data, baselineData);
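Review note: the summaries now count per-benchmark wins instead of averaging times across benchmarks, so one slow benchmark can no longer dominate the ranking. A toy tally (numbers assumed):

```js
// Workflow times in seconds for the local world across three benchmarks:
//                     express   nitro-v3
// no steps              0.9       1.1     -> express wins
// 1 step                1.2       1.0     -> nitro-v3 wins
// 10 sequential steps   4.8       4.1     -> nitro-v3 wins
//
// frameworkWinsByWorld.local = { express: 1, 'nitro-v3': 2 }
// Summary row: | 💻 Local | Nitro | 2/3 |
```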
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index b993c2f6a..bdcc437ce 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -12,7 +12,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  # Phase 0: Create placeholder PR comment (so it's pinned to the top)
+  # Phase 0: Update PR comment to show benchmarks are running
   pr-comment-start:
     name: Create PR Comment
     runs-on: ubuntu-latest
@@ -20,11 +20,86 @@ jobs:
     timeout-minutes: 2

     steps:
-      - name: Create initial benchmark comment
+      - name: Find existing benchmark comment
+        uses: peter-evans/find-comment@v3
+        id: find-comment
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-author: 'github-actions[bot]'
+          body-includes: ''
+
+      - name: Get existing comment body
+        if: steps.find-comment.outputs.comment-id != ''
+        id: get-comment
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const comment = await github.rest.issues.getComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: ${{ steps.find-comment.outputs.comment-id }}
+            });
+            // Extract the results section (everything after the header and running message)
+            const body = comment.data.body;
+            // Remove any existing stale warning and running message
+            let resultsSection = body
+              .replace(/\n## 📊 Benchmark Results\n\n> ⚠️ \*\*Results below are stale\*\*[^\n]*\n\n/g, '')
+              .replace(/\n## 📊 Benchmark Results\n\n/g, '')
+              .replace(/⏳ \*\*Benchmarks are running\.\.\.\*\*\n\n---\n_Started at:[^_]*_\n\n---\n\n/g, '')
+              .replace(/⏳ \*\*Benchmarks are running\.\.\.\*\*\n\n---\n_Started at:[^_]*_/g, '')
+              .trim();
+
+            // If there's actual content left (benchmark tables), save it
+            if (resultsSection && resultsSection.includes('|')) {
+              core.setOutput('has-results', 'true');
+              core.setOutput('previous-results', resultsSection);
+            } else {
+              core.setOutput('has-results', 'false');
+            }
+
+      - name: Create new benchmark comment
+        if: steps.find-comment.outputs.comment-id == ''
         uses: marocchino/sticky-pull-request-comment@v2
         with:
           header: benchmark-results
           message: |
+
+            ## 📊 Benchmark Results
+
+            ⏳ **Benchmarks are running...**
+
+            This comment will be updated with the results when the benchmarks complete.
+
+            ---
+            _Started at: ${{ github.event.pull_request.updated_at }}_
+
+      - name: Update existing benchmark comment with stale warning
+        if: steps.find-comment.outputs.comment-id != '' && steps.get-comment.outputs.has-results == 'true'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: benchmark-results
+          message: |
+
+            ## 📊 Benchmark Results
+
+            > ⚠️ **Results below are stale** and not from the latest commit. This comment will be updated when CI completes on the latest run.
+
+            ⏳ **Benchmarks are running...**
+
+            ---
+            _Started at: ${{ github.event.pull_request.updated_at }}_
+
+            ---
+
+            ${{ steps.get-comment.outputs.previous-results }}
+
+      - name: Update existing benchmark comment without results
+        if: steps.find-comment.outputs.comment-id != '' && steps.get-comment.outputs.has-results != 'true'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: benchmark-results
+          message: |
+
             ## 📊 Benchmark Results

             ⏳ **Benchmarks are running...**
@@ -107,6 +182,19 @@ jobs:
       - name: Install dependencies
         run: pnpm install --frozen-lockfile

+      # For PRs, download baseline results from main branch
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Build workbench
         run: pnpm turbo run build --filter='./workbench/${{ matrix.app }}'

@@ -128,6 +216,7 @@ jobs:
           benchmark-file: bench-results-${{ matrix.app }}-local.json
           app-name: ${{ matrix.app }}
           backend: local
+          baseline-file: baseline-results/bench-results-${{ matrix.app }}-local.json

       - name: Upload benchmark results
         uses: actions/upload-artifact@v4
@@ -190,6 +279,19 @@ jobs:
       - name: Install dependencies
         run: pnpm install --frozen-lockfile

+      # For PRs, download baseline results from main branch
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Setup PostgreSQL database
         run: ./packages/world-postgres/bin/setup.js

@@ -215,6 +317,7 @@ jobs:
          benchmark-file: bench-results-${{ matrix.app }}-postgres.json
          app-name: ${{ matrix.app }}
          backend: postgres
+          baseline-file: baseline-results/bench-results-${{ matrix.app }}-postgres.json

       - name: Upload benchmark results
         uses: actions/upload-artifact@v4
@@ -261,6 +364,19 @@ jobs:
       - name: Install dependencies
         run: pnpm install --frozen-lockfile

+      # For PRs, download baseline results from main branch
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Wait for Vercel deployment
         id: waitForDeployment
         uses: ./.github/actions/wait-for-vercel-project
@@ -289,6 +405,7 @@ jobs:
           benchmark-file: bench-results-${{ matrix.app.name }}-vercel.json
           app-name: ${{ matrix.app.name }}
           backend: vercel
+          baseline-file: baseline-results/bench-results-${{ matrix.app.name }}-vercel.json

       - name: Upload benchmark results
         uses: actions/upload-artifact@v4
@@ -319,11 +436,32 @@ jobs:
       - name: List downloaded files
         run: find benchmark-results -type f -name "*.json" | sort

+      # For PRs, try to download baseline results from the latest main branch run
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Aggregate and compare benchmarks
         id: aggregate
         run: |
+          # Check if baseline results exist
+          BASELINE_ARG=""
+          if [ -d "baseline-results" ] && [ "$(ls -A baseline-results 2>/dev/null)" ]; then
+            echo "Found baseline results from main branch"
+            BASELINE_ARG="--baseline baseline-results"
+          else
+            echo "No baseline results found, showing results without comparison"
+          fi
           # Capture output to both file and step summary
-          node .github/scripts/aggregate-benchmarks.js benchmark-results | tee benchmark-summary.md >> $GITHUB_STEP_SUMMARY
+          node .github/scripts/aggregate-benchmarks.js benchmark-results $BASELINE_ARG | tee benchmark-summary.md >> $GITHUB_STEP_SUMMARY

       - name: Check benchmark job statuses
         id: check-status
@@ -365,3 +503,12 @@ jobs:
           - Vercel: ${{ needs.benchmark-vercel.result }}

           Check the [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for details.
+
+      # On main branch, save results as baseline for future PR comparisons
+      - name: Upload baseline results
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-benchmark-results
+          path: benchmark-results/
+          retention-days: 90
diff --git a/packages/core/e2e/bench.bench.ts b/packages/core/e2e/bench.bench.ts
index a887f0bcb..ce57115a2 100644
--- a/packages/core/e2e/bench.bench.ts
+++ b/packages/core/e2e/bench.bench.ts
@@ -17,9 +17,16 @@ const workflowTimings: Record<
   string,
   {
     startedAt?: string;
    completedAt?: string;
    executionTimeMs?: number;
+    firstByteTimeMs?: number;
   }[]
 > = {};

+// Buffered timing data keyed by task name, flushed in teardown
+const bufferedTimings: Map<
+  string,
+  { run: any; extra?: { firstByteTimeMs?: number } }[]
+> = new Map();
+
 async function triggerWorkflow(
   workflow: string | { workflowFile: string; workflowFn: string },
   args: any[]
@@ -123,6 +130,9 @@ function writeTimingFile() {
       minExecutionTimeMs: number;
       maxExecutionTimeMs: number;
       samples: number;
+      avgFirstByteTimeMs?: number;
+      minFirstByteTimeMs?: number;
+      maxFirstByteTimeMs?: number;
     }
   > = {};

   for (const [benchName, timings] of Object.entries(workflowTimings)) {
@@ -139,6 +149,18 @@
       maxExecutionTimeMs: max,
       samples: validTimings.length,
     };
+
+    // Add first byte stats if available
+    const firstByteTimings = timings.filter(
+      (t) => t.firstByteTimeMs !== undefined
+    );
+    if (firstByteTimings.length > 0) {
+      const firstByteTimes = firstByteTimings.map((t) => t.firstByteTimeMs!);
+      summary[benchName].avgFirstByteTimeMs =
+        firstByteTimes.reduce((sum, t) => sum + t, 0) / firstByteTimes.length;
+      summary[benchName].minFirstByteTimeMs = Math.min(...firstByteTimes);
+      summary[benchName].maxFirstByteTimeMs = Math.max(...firstByteTimes);
+    }
   }
 }
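Review note: only iterations that actually recorded a first byte contribute to the TTFB stats. A standalone check with the same expressions (toy data):

```js
const timings = [
  { executionTimeMs: 1000, firstByteTimeMs: 120 },
  { executionTimeMs: 980 }, // this iteration recorded no first byte
  { executionTimeMs: 1040, firstByteTimeMs: 180 },
];
const firstByteTimes = timings
  .filter((t) => t.firstByteTimeMs !== undefined)
  .map((t) => t.firstByteTimeMs);
console.log(
  firstByteTimes.reduce((sum, t) => sum + t, 0) / firstByteTimes.length, // 150
  Math.min(...firstByteTimes), // 120
  Math.max(...firstByteTimes)  // 180
);
```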
@@ -148,39 +170,67 @@ function writeTimingFile() {
   );
 }

-function recordWorkflowTiming(benchName: string, run: any) {
-  if (!workflowTimings[benchName]) {
-    workflowTimings[benchName] = [];
+// Buffer timing data (called during each iteration)
+function stageTiming(
+  benchName: string,
+  run: any,
+  extra?: { firstByteTimeMs?: number }
+) {
+  if (!bufferedTimings.has(benchName)) {
+    bufferedTimings.set(benchName, []);
   }
+  bufferedTimings.get(benchName)!.push({ run, extra });
+}

-  const timing: any = {
-    createdAt: run.createdAt,
-    startedAt: run.startedAt,
-    completedAt: run.completedAt,
-  };
-
-  // Calculate execution time if timestamps are available (completedAt - createdAt)
-  if (run.createdAt && run.completedAt) {
-    const created = new Date(run.createdAt).getTime();
-    const completed = new Date(run.completedAt).getTime();
-    timing.executionTimeMs = completed - created;
-  }
+// Teardown: on warmup, clear buffer; on run, flush to file then clear
+const teardown = (task: { name: string }, mode: 'warmup' | 'run') => {
+  const buffered = bufferedTimings.get(task.name) || [];

-  workflowTimings[benchName].push(timing);
+  if (mode === 'run') {
+    // Flush all buffered timings to workflowTimings
+    for (const { run, extra } of buffered) {
+      if (!workflowTimings[task.name]) {
+        workflowTimings[task.name] = [];
+      }

-  // Write timing file after each recording (overwrites previous)
-  writeTimingFile();
-}
+      const timing: any = {
+        createdAt: run.createdAt,
+        startedAt: run.startedAt,
+        completedAt: run.completedAt,
+      };
+
+      // Calculate execution time if timestamps are available (completedAt - createdAt)
+      if (run.createdAt && run.completedAt) {
+        const created = new Date(run.createdAt).getTime();
+        const completed = new Date(run.completedAt).getTime();
+        timing.executionTimeMs = completed - created;
+      }

-describe.concurrent('Workflow Performance Benchmarks', () => {
+      // Add extra metrics if provided
+      if (extra?.firstByteTimeMs !== undefined) {
+        timing.firstByteTimeMs = extra.firstByteTimeMs;
+      }
+
+      workflowTimings[task.name].push(timing);
+    }
+
+    // Write timing file after flushing
+    writeTimingFile();
+  }
+
+  // Clear buffer (both warmup and run)
+  bufferedTimings.delete(task.name);
+};
+
+describe('Workflow Performance Benchmarks', () => {
   bench(
     'workflow with no steps',
     async () => {
       const { runId } = await triggerWorkflow('noStepsWorkflow', [42]);
       const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with no steps', run);
+      stageTiming('workflow with no steps', run);
     },
-    { time: 5000 }
+    { time: 5000, warmupIterations: 1, teardown }
   );

@@ -188,9 +238,9 @@ describe.concurrent('Workflow Performance Benchmarks', () => {
     async () => {
       const { runId } = await triggerWorkflow('oneStepWorkflow', [100]);
       const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with 1 step', run);
+      stageTiming('workflow with 1 step', run);
     },
-    { time: 5000 }
+    { time: 5000, warmupIterations: 1, teardown }
   );

@@ -198,9 +248,9 @@
     async () => {
      const { runId } = await triggerWorkflow('tenSequentialStepsWorkflow', []);
      const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with 10 sequential steps', run);
+      stageTiming('workflow with 10 sequential steps', run);
     },
-    { time: 5000 }
+    { time: 5000, iterations: 5, warmupIterations: 1, teardown }
   );
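Review note: the stage/flush split exists so warmup iterations never reach the timing file. A standalone toy walk-through of the buffer lifecycle (simplified; the real teardown receives the vitest task and mode as shown above):

```js
const buffer = new Map();
const flushed = {};
const stage = (name, sample) => {
  if (!buffer.has(name)) buffer.set(name, []);
  buffer.get(name).push(sample);
};
const tearDown = (name, mode) => {
  if (mode === 'run') {
    flushed[name] = [...(flushed[name] || []), ...(buffer.get(name) || [])];
  }
  buffer.delete(name); // cleared in both modes
};

stage('bench', 'warmup-sample');
tearDown('bench', 'warmup'); // warmup sample discarded
stage('bench', 'run-1');
stage('bench', 'run-2');
tearDown('bench', 'run');    // run samples flushed
console.log(flushed.bench);  // ['run-1', 'run-2']
```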
@@ -208,8 +258,33 @@
     async () => {
       const { runId } = await triggerWorkflow('tenParallelStepsWorkflow', []);
       const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with 10 parallel steps', run);
+      stageTiming('workflow with 10 parallel steps', run);
+    },
+    { time: 5000, iterations: 5, warmupIterations: 1, teardown }
+  );
+
+  bench(
+    'workflow with stream',
+    async () => {
+      const { runId } = await triggerWorkflow('streamWorkflow', []);
+      const { run, value } = await getWorkflowReturnValue(runId);
+      // Consume the entire stream and track time-to-first-byte from workflow startedAt
+      let firstByteTimeMs: number | undefined;
+      if (value instanceof ReadableStream) {
+        const reader = value.getReader();
+        let isFirstChunk = true;
+        while (true) {
+          const { done } = await reader.read();
+          if (isFirstChunk && !done && run.startedAt) {
+            const startedAt = new Date(run.startedAt).getTime();
+            firstByteTimeMs = Date.now() - startedAt;
+            isFirstChunk = false;
+          }
+          if (done) break;
+        }
+      }
+      stageTiming('workflow with stream', run, { firstByteTimeMs });
     },
-    { time: 5000 }
+    { time: 5000, warmupIterations: 1, teardown }
   );
 });
diff --git a/packages/world-local/src/streamer.ts b/packages/world-local/src/streamer.ts
index 1ff8d0026..d416b5cb3 100644
--- a/packages/world-local/src/streamer.ts
+++ b/packages/world-local/src/streamer.ts
@@ -216,7 +216,11 @@ export function createStreamer(basedir: string): Streamer {
       if (isComplete) {
         removeListeners();
-        controller.close();
+        try {
+          controller.close();
+        } catch (e) {
+          // Ignore if controller is already closed (e.g., from closeListener event)
+        }
         return;
       }
     },
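Review note: closing a ReadableStream controller that is already closed throws a TypeError, which is why the fix above wraps close() in try/catch. If this pattern recurs, it could be extracted; a hedged sketch (helper name is hypothetical, not part of this diff):

```js
// Hypothetical helper; the diff inlines this logic instead.
function safeClose(controller) {
  try {
    controller.close();
  } catch {
    // Already closed or errored (e.g., via the close listener): nothing left to do,
    // since no more data can be enqueued either way.
  }
}
```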
diff --git a/workbench/example/api/trigger.ts b/workbench/example/api/trigger.ts
index 2cfe032f9..48405e87c 100644
--- a/workbench/example/api/trigger.ts
+++ b/workbench/example/api/trigger.ts
@@ -84,13 +84,25 @@ export async function GET(req: Request) {
     const run = getRun(runId);
     const returnValue = await run.returnValue;
     console.log('Return value:', returnValue);
+
+    // Include run metadata in headers
+    const [createdAt, startedAt, completedAt] = await Promise.all([
+      run.createdAt,
+      run.startedAt,
+      run.completedAt,
+    ]);
+    const headers: HeadersInit =
+      returnValue instanceof ReadableStream
+        ? { 'Content-Type': 'application/octet-stream' }
+        : {};
+
+    headers['X-Workflow-Run-Created-At'] = createdAt?.toISOString() || '';
+    headers['X-Workflow-Run-Started-At'] = startedAt?.toISOString() || '';
+    headers['X-Workflow-Run-Completed-At'] = completedAt?.toISOString() || '';
+
     return returnValue instanceof ReadableStream
-      ? new Response(returnValue, {
-          headers: {
-            'Content-Type': 'application/octet-stream',
-          },
-        })
-      : Response.json(returnValue);
+      ? new Response(returnValue, { headers })
+      : Response.json(returnValue, { headers });
   } catch (error) {
     if (error instanceof Error) {
       if (WorkflowRunNotCompletedError.is(error)) {
diff --git a/workbench/example/workflows/97_bench.ts b/workbench/example/workflows/97_bench.ts
index 4b2b496c6..d47a93595 100644
--- a/workbench/example/workflows/97_bench.ts
+++ b/workbench/example/workflows/97_bench.ts
@@ -38,3 +38,51 @@ export async function tenParallelStepsWorkflow() {
   const results = await Promise.all(promises);
   return results.reduce((sum, val) => sum + val, 0);
 }
+
+// Step that generates a stream with 10 chunks
+async function genBenchStream(): Promise<ReadableStream<Uint8Array>> {
+  'use step';
+  const encoder = new TextEncoder();
+  return new ReadableStream({
+    async start(controller) {
+      for (let i = 0; i < 10; i++) {
+        controller.enqueue(encoder.encode(`${i}\n`));
+        // Small delay to avoid synchronous close issues on local world
+        await new Promise((resolve) => setTimeout(resolve, 10));
+      }
+      controller.close();
+    },
+  });
+}
+
+// Step that transforms a stream by doubling each number
+async function doubleNumbers(
+  stream: ReadableStream<Uint8Array>
+): Promise<ReadableStream<Uint8Array>> {
+  'use step';
+  const decoder = new TextDecoder();
+  const encoder = new TextEncoder();
+
+  const transformStream = new TransformStream({
+    transform(chunk, controller) {
+      const text = decoder.decode(chunk, { stream: true });
+      const lines = text.split('\n');
+      for (const line of lines) {
+        if (line.trim()) {
+          const num = parseInt(line, 10);
+          controller.enqueue(encoder.encode(`${num * 2}\n`));
+        }
+      }
+    },
+  });
+
+  return stream.pipeThrough(transformStream);
+}
+
+// Workflow that generates and transforms a stream
+export async function streamWorkflow() {
+  'use workflow';
+  const stream = await genBenchStream();
+  const doubled = await doubleNumbers(stream);
+  return doubled;
+}
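Review note: with the trigger.ts change, a client can recover run timings from headers even when the body is a stream. A sketch of consuming the endpoint (path and query parameter are assumptions, not shown in this diff):

```js
// Hypothetical client for the trigger endpoint above.
async function fetchRunWithTimings(runId) {
  const res = await fetch(`/api/trigger?runId=${encodeURIComponent(runId)}`);
  const createdAt = new Date(res.headers.get('X-Workflow-Run-Created-At'));
  const completedAt = new Date(res.headers.get('X-Workflow-Run-Completed-At'));
  console.log('workflow time (ms):', completedAt.getTime() - createdAt.getTime());

  // Stream bodies arrive as application/octet-stream; time the first chunk.
  const reader = res.body.getReader();
  const t0 = Date.now();
  const { value } = await reader.read();
  console.log('client TTFB (ms):', Date.now() - t0, 'first chunk bytes:', value?.length);
  while (!(await reader.read()).done) {
    // drain the remaining doubled-number chunks
  }
}
```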