diff --git a/.changeset/large-ads-report.md b/.changeset/large-ads-report.md
new file mode 100644
index 000000000..991e8717b
--- /dev/null
+++ b/.changeset/large-ads-report.md
@@ -0,0 +1,5 @@
+---
+"@workflow/world-local": patch
+---
+
+Silently ignore stream already closed errors
diff --git a/.github/actions/render-benchmarks/action.yml b/.github/actions/render-benchmarks/action.yml
index 59983bc93..df6150754 100644
--- a/.github/actions/render-benchmarks/action.yml
+++ b/.github/actions/render-benchmarks/action.yml
@@ -8,12 +8,19 @@ inputs:
     description: 'Name of the app being benchmarked'
     required: true
   backend:
-    description: 'Backend type (local, postgres, vercel)'
+    description: 'World type (local, postgres, vercel)'
     required: true
+  baseline-file:
+    description: 'Path to the baseline benchmark JSON file for comparison (optional)'
+    required: false

 runs:
   using: 'composite'
   steps:
     - name: Render benchmark results
       shell: bash
       run: |
-        node ${{ github.action_path }}/render.js "${{ inputs.benchmark-file }}" "${{ inputs.app-name }}" "${{ inputs.backend }}" >> $GITHUB_STEP_SUMMARY
+        BASELINE_ARG=""
+        if [ -n "${{ inputs.baseline-file }}" ] && [ -f "${{ inputs.baseline-file }}" ]; then
+          BASELINE_ARG="--baseline ${{ inputs.baseline-file }}"
+        fi
+        node ${{ github.action_path }}/render.js "${{ inputs.benchmark-file }}" "${{ inputs.app-name }}" "${{ inputs.backend }}" $BASELINE_ARG >> $GITHUB_STEP_SUMMARY
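Review note: render.js below reads a vitest bench results file plus an optional sibling timings file. A minimal sketch of the shapes it assumes (field names inferred from the reads in this diff; real vitest output carries more fields):

```js
// bench-results-<app>-<world>.json (vitest bench output, assumed minimal shape)
const exampleResults = {
  files: [
    {
      groups: [
        {
          benchmarks: [
            { name: 'workflow with 1 step', mean: 1234.5, sampleCount: 5 },
          ],
        },
      ],
    },
  ],
};

// bench-timings-<app>-<world>.json (written by bench.bench.ts, see below)
const exampleTimings = {
  summary: {
    'workflow with 1 step': {
      avgExecutionTimeMs: 1100.2,
      minExecutionTimeMs: 1050,
      maxExecutionTimeMs: 1180,
      samples: 5,
      avgFirstByteTimeMs: 140.3, // present only for stream benchmarks
    },
  },
};
```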
diff --git a/.github/actions/render-benchmarks/render.js b/.github/actions/render-benchmarks/render.js
index 8861ad493..b1059908e 100644
--- a/.github/actions/render-benchmarks/render.js
+++ b/.github/actions/render-benchmarks/render.js
@@ -1,16 +1,35 @@
 #!/usr/bin/env node

 const fs = require('fs');
+const path = require('path');
+
+// Parse arguments: render.js <benchmark-file> <app-name> <backend> [--baseline <file>]
+const args = process.argv.slice(2);
+let benchmarkFile = null;
+let appName = null;
+let backend = null;
+let baselineFile = null;

-const [, , benchmarkFile, appName, backend] = process.argv;
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--baseline' && args[i + 1]) {
+    baselineFile = args[i + 1];
+    i++;
+  } else if (!benchmarkFile) {
+    benchmarkFile = args[i];
+  } else if (!appName) {
+    appName = args[i];
+  } else if (!backend) {
+    backend = args[i];
+  }
+}

 if (!benchmarkFile || !appName || !backend) {
-  console.error('Usage: render.js <benchmark-file> <app-name> <backend>');
+  console.error(
+    'Usage: render.js <benchmark-file> <app-name> <backend> [--baseline <baseline-file>]'
+  );
   process.exit(1);
 }

-const path = require('path');
-
 // Try to load workflow timing data
 let workflowTimings = null;
 // Only replace filename, not directory name
@@ -28,14 +47,76 @@ if (fs.existsSync(timingFile)) {
   }
 }

+// Try to load baseline data
+let baselineData = null;
+let baselineTimings = null;
+if (baselineFile && fs.existsSync(baselineFile)) {
+  try {
+    baselineData = JSON.parse(fs.readFileSync(baselineFile, 'utf-8'));
+    // Also try to load baseline timings
+    const baselineTimingFile = path.join(
+      path.dirname(baselineFile),
+      path.basename(baselineFile).replace('bench-results-', 'bench-timings-')
+    );
+    if (fs.existsSync(baselineTimingFile)) {
+      baselineTimings = JSON.parse(
+        fs.readFileSync(baselineTimingFile, 'utf-8')
+      );
+    }
+  } catch (e) {
+    console.error(`Warning: Could not parse baseline file: ${e.message}`);
+  }
+}
+
+// Build baseline lookup map: benchName -> { wallTime, workflowTime, ttfb }
+const baselineLookup = {};
+if (baselineData) {
+  for (const file of baselineData.files || []) {
+    for (const group of file.groups || []) {
+      for (const bench of group.benchmarks || []) {
+        if (bench.mean !== undefined && bench.mean !== null) {
+          baselineLookup[bench.name] = {
+            wallTime: bench.mean,
+            workflowTime:
+              baselineTimings?.summary?.[bench.name]?.avgExecutionTimeMs ??
+              null,
+            ttfb:
+              baselineTimings?.summary?.[bench.name]?.avgFirstByteTimeMs ??
+              null,
+          };
+        }
+      }
+    }
+  }
+}
+
 // Format number with consistent width
 function formatSec(ms, decimals = 3) {
   return (ms / 1000).toFixed(decimals);
 }

-// Get backend emoji
-function getBackendEmoji(backend) {
-  switch (backend) {
+// Format delta between current and baseline values
+function formatDelta(current, baseline) {
+  if (
+    baseline === null ||
+    baseline === undefined ||
+    current === null ||
+    current === undefined
+  ) {
+    return '';
+  }
+  const percentChange = ((current - baseline) / baseline) * 100;
+  if (Math.abs(percentChange) < 0.5) {
+    return ' (~)';
+  }
+  const sign = percentChange > 0 ? '+' : '';
+  const emoji = percentChange > 5 ? ' 🔺' : percentChange < -5 ? ' 🟢' : '';
+  return ` (${sign}${percentChange.toFixed(1)}%${emoji})`;
+}
+
+// Get world emoji
+function getWorldEmoji(world) {
+  switch (world) {
     case 'vercel':
       return '▲';
     case 'postgres':
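Review note: a quick sanity check of the delta thresholds above (illustrative values, assuming the function is in scope):

```js
formatDelta(1001, 1000); // ' (~)'          : |0.1%| < 0.5%, treated as noise
formatDelta(1040, 1000); // ' (+4.0%)'      : slower, but under the 5% emoji threshold
formatDelta(1200, 1000); // ' (+20.0% 🔺)'  : more than 5% slower
formatDelta(900, 1000);  // ' (-10.0% 🟢)'  : more than 5% faster
formatDelta(950, null);  // ''              : no baseline, no annotation
```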
@@ -50,56 +131,140 @@ try {
   const data = JSON.parse(fs.readFileSync(benchmarkFile, 'utf-8'));

-  const emoji = getBackendEmoji(backend);
-  console.log(`## ${emoji} Benchmark Results: ${appName} (${backend})\n`);
+  const emoji = getWorldEmoji(backend);
+  console.log(`## ${emoji} Benchmark Results: ${appName} (${backend} world)\n`);
+
+  // Show baseline comparison note if baseline data is available
+  if (Object.keys(baselineLookup).length > 0) {
+    console.log(
+      '> 📈 _Comparing against baseline from `main` branch. Green 🟢 = faster, Red 🔺 = slower._\n'
+    );
+  }

   for (const file of data.files) {
     for (const group of file.groups) {
-      // Workflow Time is primary metric, Wall Time is secondary
-      console.log(
-        '| Benchmark | Workflow Time (avg) | Min | Max | Wall Time | Overhead | Samples |'
-      );
-      console.log(
-        '|:----------|--------------------:|----:|----:|----------:|---------:|--------:|'
-      );
+      // Separate regular and stream benchmarks
+      const regularBenchmarks = [];
+      const streamBenchmarks = [];

       for (const bench of group.benchmarks) {
-        // Skip benchmarks without valid timing data (failed or timed out)
-        if (bench.mean === undefined || bench.mean === null) {
-          console.log(`| ${bench.name} | ⚠️ No data | - | - | - | - | 0 |`);
-          continue;
+        const summary = workflowTimings?.summary?.[bench.name];
+        if (summary?.avgFirstByteTimeMs !== undefined) {
+          streamBenchmarks.push(bench);
+        } else {
+          regularBenchmarks.push(bench);
         }
+      }

-        const wallTimeSec = formatSec(bench.mean);
+      // Render regular benchmarks
+      if (regularBenchmarks.length > 0) {
+        console.log(
+          '| Benchmark | Workflow Time (avg) | Min | Max | Wall Time | Overhead | Samples |'
+        );
+        console.log(
+          '|:----------|--------------------:|----:|----:|----------:|---------:|--------:|'
+        );

-        // Get workflow execution time if available
-        let workflowTimeSec = '-';
-        let minTimeSec = '-';
-        let maxTimeSec = '-';
-        let overheadSec = '-';
+        for (const bench of regularBenchmarks) {
+          // Skip benchmarks without valid timing data (failed or timed out)
+          if (bench.mean === undefined || bench.mean === null) {
+            console.log(`| ${bench.name} | ⚠️ No data | - | - | - | - | 0 |`);
+            continue;
+          }

-        if (workflowTimings?.summary?.[bench.name]) {
-          const summary = workflowTimings.summary[bench.name];
-          workflowTimeSec = formatSec(summary.avgExecutionTimeMs);
+          const baseline = baselineLookup[bench.name];
+          const wallTimeSec = formatSec(bench.mean);
+          const wallDelta = formatDelta(bench.mean, baseline?.wallTime);
+          let workflowTimeSec = '-';
+          let workflowDelta = '';
+          let minTimeSec = '-';
+          let maxTimeSec = '-';
+          let overheadSec = '-';

-          // Get min/max if available
-          if (summary.minExecutionTimeMs !== undefined) {
-            minTimeSec = formatSec(summary.minExecutionTimeMs);
-          }
-          if (summary.maxExecutionTimeMs !== undefined) {
-            maxTimeSec = formatSec(summary.maxExecutionTimeMs);
+          if (workflowTimings?.summary?.[bench.name]) {
+            const summary = workflowTimings.summary[bench.name];
+            workflowTimeSec = formatSec(summary.avgExecutionTimeMs);
+            workflowDelta = formatDelta(
+              summary.avgExecutionTimeMs,
+              baseline?.workflowTime
+            );
+            if (summary.minExecutionTimeMs !== undefined) {
+              minTimeSec = formatSec(summary.minExecutionTimeMs);
+            }
+            if (summary.maxExecutionTimeMs !== undefined) {
+              maxTimeSec = formatSec(summary.maxExecutionTimeMs);
+            }
+            const overheadMs = bench.mean - summary.avgExecutionTimeMs;
+            overheadSec = formatSec(overheadMs);
           }

-          // Calculate overhead (wall time - workflow time)
-          const overheadMs = bench.mean - summary.avgExecutionTimeMs;
-          overheadSec = formatSec(overheadMs);
+          console.log(
+            `| ${bench.name} | ${workflowTimeSec}s${workflowDelta} | ${minTimeSec}s | ${maxTimeSec}s | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${bench.sampleCount} |`
+          );
         }
+        console.log('');
+      }

+      // Render stream benchmarks with TTFB column
+      if (streamBenchmarks.length > 0) {
+        console.log('**Stream Benchmarks**\n');
+        console.log(
+          '| Benchmark | Workflow Time (avg) | TTFB | Min | Max | Wall Time | Overhead | Samples |'
+        );
         console.log(
-          `| ${bench.name} | ${workflowTimeSec}s | ${minTimeSec}s | ${maxTimeSec}s | ${wallTimeSec}s | ${overheadSec}s | ${bench.sampleCount} |`
+          '|:----------|--------------------:|-----:|----:|----:|----------:|---------:|--------:|'
         );
+
+        for (const bench of streamBenchmarks) {
+          // Skip benchmarks without valid timing data (failed or timed out)
+          if (bench.mean === undefined || bench.mean === null) {
+            console.log(
+              `| ${bench.name} | ⚠️ No data | - | - | - | - | - | 0 |`
+            );
+            continue;
+          }
+
+          const baseline = baselineLookup[bench.name];
+          const wallTimeSec = formatSec(bench.mean);
+          const wallDelta = formatDelta(bench.mean, baseline?.wallTime);
+          let workflowTimeSec = '-';
+          let workflowDelta = '';
+          let minTimeSec = '-';
+          let maxTimeSec = '-';
+          let overheadSec = '-';
+          let ttfbSec = '-';
+          let ttfbDelta = '';
+
+          if (workflowTimings?.summary?.[bench.name]) {
+            const summary = workflowTimings.summary[bench.name];
+            workflowTimeSec = formatSec(summary.avgExecutionTimeMs);
+            workflowDelta = formatDelta(
+              summary.avgExecutionTimeMs,
+              baseline?.workflowTime
+            );
+            if (summary.minExecutionTimeMs !== undefined) {
+              minTimeSec = formatSec(summary.minExecutionTimeMs);
+            }
+            if (summary.maxExecutionTimeMs !== undefined) {
+              maxTimeSec = formatSec(summary.maxExecutionTimeMs);
+            }
+            if (summary.avgFirstByteTimeMs !== undefined) {
+              ttfbSec = formatSec(summary.avgFirstByteTimeMs);
+              ttfbDelta = formatDelta(
+                summary.avgFirstByteTimeMs,
+                baseline?.ttfb
+              );
+            }
+            const overheadMs = bench.mean - summary.avgExecutionTimeMs;
+            overheadSec = formatSec(overheadMs);
+          }
+
+          console.log(
+            `| ${bench.name} | ${workflowTimeSec}s${workflowDelta} | ${ttfbSec}s${ttfbDelta} | ${minTimeSec}s | ${maxTimeSec}s | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${bench.sampleCount} |`
+          );
+        }
+        console.log('');
       }
-      console.log('');
     }
   }
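Review note: with a baseline available, a regular row and a stream row come out roughly like this (numbers invented for illustration):

```js
// Regular table:
// | workflow with 1 step | 1.100s (-8.2% 🟢) | 1.050s | 1.180s | 1.235s (+1.1%) | 0.135s | 5 |
// Stream table (extra TTFB column):
// | workflow with stream | 1.900s (~) | 0.140s (-12.5% 🟢) | 1.850s | 1.960s | 2.100s (+0.6%) | 0.200s | 5 |
```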
@@ -116,6 +281,9 @@ try {
   );
   console.log('- **Overhead**: Testbench overhead (Wall Time - Workflow Time)');
   console.log('- **Samples**: Number of benchmark iterations run');
+  console.log(
+    '- **TTFB**: Time to First Byte - time from workflow start until first stream byte received (stream benchmarks only)'
+  );
   console.log('');
 } catch (error) {
   console.error(`Error rendering benchmark results: ${error.message}`);
diff --git a/.github/scripts/aggregate-benchmarks.js b/.github/scripts/aggregate-benchmarks.js
index 23881f438..f6b3c2dc9 100644
--- a/.github/scripts/aggregate-benchmarks.js
+++ b/.github/scripts/aggregate-benchmarks.js
@@ -3,10 +3,22 @@
 const fs = require('fs');
 const path = require('path');

-const [, , resultsDir = '.'] = process.argv;
+// Parse command line arguments
+const args = process.argv.slice(2);
+let resultsDir = '.';
+let baselineDir = null;
+
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--baseline' && args[i + 1]) {
+    baselineDir = args[i + 1];
+    i++;
+  } else if (!args[i].startsWith('--')) {
+    resultsDir = args[i];
+  }
+}

-// Backend display config
-const backendConfig = {
+// World display config
+const worldConfig = {
   local: { emoji: '💻', label: 'Local' },
   postgres: { emoji: '🐘', label: 'Postgres' },
   vercel: { emoji: '▲', label: 'Vercel' },
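Review note: the two accepted invocations and what the loop above parses them into (paths illustrative):

```js
// node .github/scripts/aggregate-benchmarks.js benchmark-results
//   -> resultsDir = 'benchmark-results', baselineDir = null
// node .github/scripts/aggregate-benchmarks.js benchmark-results --baseline baseline-results
//   -> resultsDir = 'benchmark-results', baselineDir = 'baseline-results'
```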
@@ -15,7 +27,7 @@
 // Framework display config
 const frameworkConfig = {
   'nextjs-turbopack': { label: 'Next.js (Turbopack)' },
-  nitro: { label: 'Nitro' },
+  'nitro-v3': { label: 'Nitro' },
   express: { label: 'Express' },
 };

@@ -24,6 +36,26 @@
 function formatSec(ms, decimals = 3) {
   return (ms / 1000).toFixed(decimals);
 }

+// Format delta between current and baseline values
+// Returns string like "+12.3%" (slower) or "-5.2%" (faster) or "" if no baseline
+function formatDelta(current, baseline) {
+  if (
+    baseline === null ||
+    baseline === undefined ||
+    current === null ||
+    current === undefined
+  ) {
+    return '';
+  }
+  const percentChange = ((current - baseline) / baseline) * 100;
+  if (Math.abs(percentChange) < 0.5) {
+    return ' (~)';
+  }
+  const sign = percentChange > 0 ? '+' : '';
+  const emoji = percentChange > 5 ? ' 🔺' : percentChange < -5 ? ' 🟢' : '';
+  return ` (${sign}${percentChange.toFixed(1)}%${emoji})`;
+}
+
 // Find all benchmark result files
 function findBenchmarkFiles(dir) {
   const files = [];
@@ -75,7 +107,7 @@ function loadTimingData(benchmarkFile) {
 // Collect all benchmark data
 function collectBenchmarkData(resultFiles) {
-  // Structure: { [benchmarkName]: { [app]: { [backend]: { wallTime, workflowTime, overhead, min, max, samples } } } }
+  // Structure: { [benchmarkName]: { [app]: { [backend]: { wallTime, workflowTime, overhead, min, max, samples, firstByteTime } } } }
   const data = {};

   for (const file of resultFiles) {
@@ -107,8 +139,13 @@
       // Get workflow timing if available
       let workflowTimeMs = null;
+      let firstByteTimeMs = null;
       if (timings?.summary?.[benchName]) {
         workflowTimeMs = timings.summary[benchName].avgExecutionTimeMs;
+        // Get TTFB for stream benchmarks
+        if (timings.summary[benchName].avgFirstByteTimeMs !== undefined) {
+          firstByteTimeMs = timings.summary[benchName].avgFirstByteTimeMs;
+        }
       }

       data[benchName][app][backend] = {
@@ -119,6 +156,7 @@
         min: bench.min,
         max: bench.max,
         samples: bench.sampleCount,
+        firstByteTime: firstByteTimeMs,
       };
     }
   }
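Review note: a concrete instance of the structure collectBenchmarkData now builds (values invented):

```js
const exampleData = {
  'workflow with 1 step': {
    'nextjs-turbopack': {
      local: {
        wallTime: 1235.4,     // vitest mean, in ms
        workflowTime: 1100.2, // completedAt - createdAt, null without timing data
        overhead: 135.2,      // wallTime - workflowTime
        min: 1050,
        max: 1180,
        samples: 5,
        firstByteTime: null,  // set only for stream benchmarks
      },
    },
  },
};
```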
@@ -159,174 +197,345 @@ function getAppsAndBackends(data) {
   return { apps: sortedApps, backends: sortedBackends };
 }

-// Render the comparison tables
-function renderComparison(data) {
-  const { apps, backends } = getAppsAndBackends(data);
+// Check if a benchmark has TTFB data (is a stream benchmark)
+function isStreamBenchmark(benchData, apps, backends) {
+  for (const app of apps) {
+    for (const backend of backends) {
+      if (benchData[app]?.[backend]?.firstByteTime !== null) {
+        return true;
+      }
+    }
+  }
+  return false;
+}

-  if (Object.keys(data).length === 0) {
-    console.log('No benchmark data found.\n');
+// Render a single benchmark table
+function renderBenchmarkTable(
+  benchName,
+  benchData,
+  baselineBenchData,
+  apps,
+  backends,
+  isStream
+) {
+  console.log(`## ${benchName}\n`);
+
+  // Collect all data points (including missing ones) for all app/backend combinations
+  const dataPoints = [];
+  const validDataPoints = [];
+  for (const app of apps) {
+    for (const backend of backends) {
+      const metrics = benchData[app]?.[backend];
+      const baseline = baselineBenchData?.[app]?.[backend] || null;
+      const dataPoint = { app, backend, metrics: metrics || null, baseline };
+      dataPoints.push(dataPoint);
+      if (metrics) {
+        validDataPoints.push(dataPoint);
+      }
+    }
+  }
+
+  if (validDataPoints.length === 0) {
+    console.log('_No data available_\n');
     return;
   }

-  console.log('# 📊 Benchmark Comparison\n');
-  console.log(
-    'Cross-matrix comparison of workflow performance across frameworks and backends.\n'
-  );
+  // Sort valid data points by workflow time for ranking
+  validDataPoints.sort((a, b) => {
+    const aTime = a.metrics.workflowTime ?? a.metrics.wallTime;
+    const bTime = b.metrics.workflowTime ?? b.metrics.wallTime;
+    return aTime - bTime;
+  });
+  const fastest = validDataPoints[0];
+  const fastestTime = fastest.metrics.workflowTime ?? fastest.metrics.wallTime;
+
+  // Sort all data points: valid ones first (by time), then missing ones
+  dataPoints.sort((a, b) => {
+    // Missing data goes to the end
+    if (!a.metrics && !b.metrics) return 0;
+    if (!a.metrics) return 1;
+    if (!b.metrics) return -1;
+    // Valid data sorted by time
+    const aTime = a.metrics.workflowTime ?? a.metrics.wallTime;
+    const bTime = b.metrics.workflowTime ?? b.metrics.wallTime;
+    return aTime - bTime;
+  });
+
+  // Render table - different columns for stream vs regular benchmarks
+  if (isStream) {
+    console.log(
+      '| World | Framework | Workflow Time | TTFB | Wall Time | Overhead | vs Fastest |'
+    );
+    console.log(
+      '|:------|:----------|--------------:|-----:|----------:|---------:|-----------:|'
+    );
+  } else {
+    console.log(
+      '| World | Framework | Workflow Time | Wall Time | Overhead | vs Fastest |'
+    );
+    console.log(
+      '|:------|:----------|--------------:|----------:|---------:|-----------:|'
+    );
+  }

-  // For each benchmark, create a comparison table
-  for (const [benchName, benchData] of Object.entries(data)) {
-    console.log(`## ${benchName}\n`);
+  for (const { app, backend, metrics, baseline } of dataPoints) {
+    const worldInfo = worldConfig[backend] || {
+      emoji: '',
+      label: backend,
+    };
+    const frameworkInfo = frameworkConfig[app] || { label: app };

-    // Collect all data points with their wall times for ranking
-    const dataPoints = [];
-    for (const app of apps) {
-      for (const backend of backends) {
-        const metrics = benchData[app]?.[backend];
-        if (metrics) {
-          dataPoints.push({ app, backend, metrics });
-        }
+    // Handle missing data
+    if (!metrics) {
+      if (isStream) {
+        console.log(
+          `| ${worldInfo.emoji} ${worldInfo.label} | ${frameworkInfo.label} | ⚠️ _missing_ | - | - | - | - |`
+        );
+      } else {
+        console.log(
+          `| ${worldInfo.emoji} ${worldInfo.label} | ${frameworkInfo.label} | ⚠️ _missing_ | - | - | - |`
+        );
       }
-    }
-
-    if (dataPoints.length === 0) {
-      console.log('_No data available_\n');
       continue;
     }

-    // Sort by workflow time (primary metric), fall back to wall time if workflow time unavailable
-    dataPoints.sort((a, b) => {
-      const aTime = a.metrics.workflowTime ?? a.metrics.wallTime;
-      const bTime = b.metrics.workflowTime ?? b.metrics.wallTime;
-      return aTime - bTime;
-    });
-    const fastest = dataPoints[0];
-    const fastestTime =
-      fastest.metrics.workflowTime ?? fastest.metrics.wallTime;
-
-    // Render table - Workflow Time is primary metric
-    console.log(
-      '| Backend | Framework | Workflow Time | Wall Time | Overhead | vs Fastest |'
+    const isFastest = metrics === fastest.metrics;
+    const medal = isFastest ? '🥇 ' : '';
+
+    // Format workflow time with delta
+    const workflowTimeSec =
+      metrics.workflowTime !== null ? formatSec(metrics.workflowTime) : '-';
+    const workflowDelta = formatDelta(
+      metrics.workflowTime,
+      baseline?.workflowTime
+    );
+
+    // Format wall time with delta
+    const wallTimeSec = formatSec(metrics.wallTime);
+    const wallDelta = formatDelta(metrics.wallTime, baseline?.wallTime);
+
+    // Format overhead (no delta needed, it's derived)
+    const overheadSec =
+      metrics.overhead !== null ? formatSec(metrics.overhead) : '-';
+
+    // Format TTFB with delta for stream benchmarks
+    const firstByteSec =
+      metrics.firstByteTime !== null ? formatSec(metrics.firstByteTime) : '-';
+    const ttfbDelta = formatDelta(
+      metrics.firstByteTime,
+      baseline?.firstByteTime
     );
+
+    const currentTime = metrics.workflowTime ?? metrics.wallTime;
+    const factor = isFastest
+      ? '1.00x'
+      : `${(currentTime / fastestTime).toFixed(2)}x`;
+
+    if (isStream) {
+      console.log(
+        `| ${worldInfo.emoji} ${worldInfo.label} | ${medal}${frameworkInfo.label} | ${workflowTimeSec}s${workflowDelta} | ${firstByteSec}s${ttfbDelta} | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${factor} |`
+      );
+    } else {
+      console.log(
+        `| ${worldInfo.emoji} ${worldInfo.label} | ${medal}${frameworkInfo.label} | ${workflowTimeSec}s${workflowDelta} | ${wallTimeSec}s${wallDelta} | ${overheadSec}s | ${factor} |`
+      );
+    }
+  }
   console.log('');
+}
+
+// Render the comparison tables
+function renderComparison(data, baselineData) {
+  const { apps, backends } = getAppsAndBackends(data);
+
+  if (Object.keys(data).length === 0) {
+    console.log('No benchmark data found.\n');
+    return;
+  }
+
+  console.log('\n');
+  console.log('## 📊 Benchmark Results\n');
+
+  // Show baseline comparison note if baseline data is available
+  if (baselineData && Object.keys(baselineData).length > 0) {
     console.log(
-      '|:--------|:----------|--------------:|----------:|---------:|-----------:|'
+      '> 📈 _Comparing against baseline from `main` branch. Green 🟢 = faster, Red 🔺 = slower._\n'
     );
+  }

-    for (const { app, backend, metrics } of dataPoints) {
-      const backendInfo = backendConfig[backend] || {
-        emoji: '',
-        label: backend,
-      };
-      const frameworkInfo = frameworkConfig[app] || { label: app };
+  // Separate benchmarks into regular and stream categories
+  const regularBenchmarks = [];
+  const streamBenchmarks = [];

-      const isFastest = metrics === fastest.metrics;
-      const medal = isFastest ? '🥇 ' : '';
+  for (const [benchName, benchData] of Object.entries(data)) {
+    if (isStreamBenchmark(benchData, apps, backends)) {
+      streamBenchmarks.push([benchName, benchData]);
+    } else {
+      regularBenchmarks.push([benchName, benchData]);
+    }
+  }

-      const workflowTimeSec =
-        metrics.workflowTime !== null ? formatSec(metrics.workflowTime) : '-';
-      const wallTimeSec = formatSec(metrics.wallTime);
-      const overheadSec =
-        metrics.overhead !== null ? formatSec(metrics.overhead) : '-';
+  // Render regular benchmarks first
+  if (regularBenchmarks.length > 0) {
+    for (const [benchName, benchData] of regularBenchmarks) {
+      const baselineBenchData = baselineData?.[benchName] || null;
+      renderBenchmarkTable(
+        benchName,
+        benchData,
+        baselineBenchData,
+        apps,
+        backends,
+        false
+      );
+    }
+  }

-      const currentTime = metrics.workflowTime ?? metrics.wallTime;
-      const factor = isFastest
-        ? '1.00x'
-        : `${(currentTime / fastestTime).toFixed(2)}x`;
+  // Render stream benchmarks in a separate section
+  if (streamBenchmarks.length > 0) {
+    console.log('---\n');
+    console.log('### Stream Benchmarks\n');
+    console.log(
+      '_Stream benchmarks include Time to First Byte (TTFB) metrics._\n'
+    );

-      console.log(
-        `| ${backendInfo.emoji} ${backendInfo.label} | ${medal}${frameworkInfo.label} | ${workflowTimeSec}s | ${wallTimeSec}s | ${overheadSec}s | ${factor} |`
+    for (const [benchName, benchData] of streamBenchmarks) {
+      const baselineBenchData = baselineData?.[benchName] || null;
+      renderBenchmarkTable(
+        benchName,
+        benchData,
+        baselineBenchData,
+        apps,
+        backends,
+        true
       );
     }
-    console.log('');
   }

-  // Summary: Best framework per backend (by Workflow Time)
-  console.log('## Summary: Fastest Framework by Backend\n');
-  console.log('| Backend | Fastest Framework | Workflow Time |');
-  console.log('|:--------|:------------------|---------------:|');
+  // Summary: Count wins per framework (within each world) and per world (within each framework)
+  const allBenchmarks = [...regularBenchmarks, ...streamBenchmarks];

-  for (const backend of backends) {
-    const backendInfo = backendConfig[backend] || { emoji: '', label: backend };
-    let fastestApp = null;
-    let fastestTime = Infinity;
+  // Count wins: for each world, which framework wins most benchmarks
+  const frameworkWinsByWorld = {}; // { backend: { app: count } }
+  // Count wins: for each framework, which world wins most benchmarks
+  const worldWinsByFramework = {}; // { app: { backend: count } }

-    // Average workflow time across all benchmarks for this backend
-    const appTotals = {};
-    const appCounts = {};
+  for (const [benchName, benchData] of allBenchmarks) {
+    // For each world, find the fastest framework
+    for (const backend of backends) {
+      let fastestApp = null;
+      let fastestTime = Infinity;

-    for (const benchData of Object.values(data)) {
       for (const app of apps) {
         const metrics = benchData[app]?.[backend];
         if (metrics) {
           const time = metrics.workflowTime ?? metrics.wallTime;
-          appTotals[app] = (appTotals[app] || 0) + time;
-          appCounts[app] = (appCounts[app] || 0) + 1;
+          if (time < fastestTime) {
+            fastestTime = time;
+            fastestApp = app;
+          }
         }
       }
+
+      if (fastestApp) {
+        if (!frameworkWinsByWorld[backend]) {
+          frameworkWinsByWorld[backend] = {};
+        }
+        frameworkWinsByWorld[backend][fastestApp] =
+          (frameworkWinsByWorld[backend][fastestApp] || 0) + 1;
+      }
     }

+    // For each framework, find the fastest world
     for (const app of apps) {
-      if (appCounts[app] > 0) {
-        const avgTime = appTotals[app] / appCounts[app];
-        if (avgTime < fastestTime) {
-          fastestTime = avgTime;
-          fastestApp = app;
+      let fastestBackend = null;
+      let fastestTime = Infinity;
+
+      for (const backend of backends) {
+        const metrics = benchData[app]?.[backend];
+        if (metrics) {
+          const time = metrics.workflowTime ?? metrics.wallTime;
+          if (time < fastestTime) {
+            fastestTime = time;
+            fastestBackend = backend;
+          }
+        }
+      }
+
+      if (fastestBackend) {
+        if (!worldWinsByFramework[app]) {
+          worldWinsByFramework[app] = {};
         }
+        worldWinsByFramework[app][fastestBackend] =
+          (worldWinsByFramework[app][fastestBackend] || 0) + 1;
       }
     }
+  }

-    if (fastestApp) {
-      const frameworkInfo = frameworkConfig[fastestApp] || {
-        label: fastestApp,
-      };
+  // Summary: Best framework per world (by wins)
+  console.log('---\n');
+  console.log('## Summary: Fastest Framework by World\n');
+  console.log(`_Winner determined by most benchmark wins_\n`);
+  console.log('| World | 🥇 Fastest Framework | Wins |');
+  console.log('|:------|:---------------------|-----:|');
+
+  for (const backend of backends) {
+    const worldInfo = worldConfig[backend] || { emoji: '', label: backend };
+    const frameworkWins = frameworkWinsByWorld[backend] || {};
+
+    // Find framework with most wins
+    let bestApp = null;
+    let bestWins = 0;
+    for (const [app, wins] of Object.entries(frameworkWins)) {
+      if (wins > bestWins) {
+        bestWins = wins;
+        bestApp = app;
+      }
+    }
+
+    if (bestApp) {
+      const frameworkInfo = frameworkConfig[bestApp] || { label: bestApp };
+      // Count total benchmarks for this world (benchmarks that have data for this world)
+      const totalForWorld = allBenchmarks.filter(([, bd]) =>
+        apps.some((a) => bd[a]?.[backend])
+      ).length;
       console.log(
-        `| ${backendInfo.emoji} ${backendInfo.label} | ${frameworkInfo.label} | ${formatSec(fastestTime)}s (avg) |`
+        `| ${worldInfo.emoji} ${worldInfo.label} | ${frameworkInfo.label} | ${bestWins}/${totalForWorld} |`
       );
     }
   }
   console.log('');

-  // Summary: Best backend per framework (by Workflow Time)
-  console.log('## Summary: Fastest Backend by Framework\n');
-  console.log('| Framework | Fastest Backend | Workflow Time |');
-  console.log('|:----------|:----------------|---------------:|');
+  // Summary: Best world per framework (by wins)
+  console.log('## Summary: Fastest World by Framework\n');
+  console.log(`_Winner determined by most benchmark wins_\n`);
+  console.log('| Framework | 🥇 Fastest World | Wins |');
+  console.log('|:----------|:-----------------|-----:|');

   for (const app of apps) {
     const frameworkInfo = frameworkConfig[app] || { label: app };
-    let fastestBackend = null;
-    let fastestTime = Infinity;
-
-    // Average workflow time across all benchmarks for this app
-    const backendTotals = {};
-    const backendCounts = {};
-
-    for (const benchData of Object.values(data)) {
-      for (const backend of backends) {
-        const metrics = benchData[app]?.[backend];
-        if (metrics) {
-          const time = metrics.workflowTime ?? metrics.wallTime;
-          backendTotals[backend] = (backendTotals[backend] || 0) + time;
-          backendCounts[backend] = (backendCounts[backend] || 0) + 1;
-        }
+    const worldWins = worldWinsByFramework[app] || {};
+
+    // Find world with most wins
+    let bestBackend = null;
+    let bestWins = 0;
+    for (const [backend, wins] of Object.entries(worldWins)) {
+      if (wins > bestWins) {
+        bestWins = wins;
+        bestBackend = backend;
       }
     }

-    for (const backend of backends) {
-      if (backendCounts[backend] > 0) {
-        const avgTime = backendTotals[backend] / backendCounts[backend];
-        if (avgTime < fastestTime) {
-          fastestTime = avgTime;
-          fastestBackend = backend;
-        }
-      }
-    }
-
-    if (fastestBackend) {
-      const backendInfo = backendConfig[fastestBackend] || {
+    if (bestBackend) {
+      const worldInfo = worldConfig[bestBackend] || {
         emoji: '',
-        label: fastestBackend,
+        label: bestBackend,
       };
+      // Count total benchmarks for this framework (benchmarks that have data for this framework)
+      const totalForFramework = allBenchmarks.filter(([, bd]) =>
+        backends.some((b) => bd[app]?.[b])
+      ).length;
       console.log(
-        `| ${frameworkInfo.label} | ${backendInfo.emoji} ${backendInfo.label} | ${formatSec(fastestTime)}s (avg) |`
+        `| ${frameworkInfo.label} | ${worldInfo.emoji} ${worldInfo.label} | ${bestWins}/${totalForFramework} |`
       );
     }
   }
@@ -338,6 +547,9 @@ function renderComparison(data) {
   console.log(
     '- **Workflow Time**: Runtime reported by workflow (completedAt - createdAt) - *primary metric*'
   );
+  console.log(
+    '- **TTFB**: Time to First Byte - time from workflow start until first stream byte received (stream benchmarks only)'
+  );
   console.log(
     '- **Wall Time**: Total testbench time (trigger workflow + poll for result)'
   );
@@ -346,10 +558,10 @@
     '- **vs Fastest**: How much slower compared to the fastest configuration for this benchmark'
   );
   console.log('');
-  console.log('**Backends:**');
-  console.log('- 💻 Local: In-memory filesystem backend');
-  console.log('- 🐘 Postgres: PostgreSQL database backend');
-  console.log('- ▲ Vercel: Vercel production backend');
+  console.log('**Worlds:**');
+  console.log('- 💻 Local: In-memory filesystem world');
+  console.log('- 🐘 Postgres: PostgreSQL database world');
+  console.log('- ▲ Vercel: Vercel production world');
   console.log('');
 }

@@ -362,4 +574,14 @@ if (resultFiles.length === 0) {
 }

 const data = collectBenchmarkData(resultFiles);
-renderComparison(data);
+
+// Load baseline data if provided
+let baselineData = null;
+if (baselineDir) {
+  const baselineFiles = findBenchmarkFiles(baselineDir);
+  if (baselineFiles.length > 0) {
+    baselineData = collectBenchmarkData(baselineFiles);
+  }
+}
+
+renderComparison(data, baselineData);
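Review note: the summaries now count per-benchmark wins instead of averaging times across benchmarks, so one slow benchmark can no longer dominate the ranking. A toy tally (numbers assumed):

```js
// Workflow times in seconds for the local world across three benchmarks:
//                     express   nitro-v3
// no steps              0.9       1.1     -> express wins
// 1 step                1.2       1.0     -> nitro-v3 wins
// 10 sequential steps   4.8       4.1     -> nitro-v3 wins
//
// frameworkWinsByWorld.local = { express: 1, 'nitro-v3': 2 }
// Summary row: | 💻 Local | Nitro | 2/3 |
```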
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index b993c2f6a..bdcc437ce 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -12,7 +12,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  # Phase 0: Create placeholder PR comment (so it's pinned to the top)
+  # Phase 0: Update PR comment to show benchmarks are running
   pr-comment-start:
     name: Create PR Comment
     runs-on: ubuntu-latest
@@ -20,11 +20,86 @@ jobs:
     timeout-minutes: 2

     steps:
-      - name: Create initial benchmark comment
+      - name: Find existing benchmark comment
+        uses: peter-evans/find-comment@v3
+        id: find-comment
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-author: 'github-actions[bot]'
+          body-includes: ''
+
+      - name: Get existing comment body
+        if: steps.find-comment.outputs.comment-id != ''
+        id: get-comment
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const comment = await github.rest.issues.getComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: ${{ steps.find-comment.outputs.comment-id }}
+            });
+            // Extract the results section (everything after the header and running message)
+            const body = comment.data.body;
+            // Remove any existing stale warning and running message
+            let resultsSection = body
+              .replace(/\n## 📊 Benchmark Results\n\n> ⚠️ \*\*Results below are stale\*\*[^\n]*\n\n/g, '')
+              .replace(/\n## 📊 Benchmark Results\n\n/g, '')
+              .replace(/⏳ \*\*Benchmarks are running\.\.\.\*\*\n\n---\n_Started at:[^_]*_\n\n---\n\n/g, '')
+              .replace(/⏳ \*\*Benchmarks are running\.\.\.\*\*\n\n---\n_Started at:[^_]*_/g, '')
+              .trim();
+
+            // If there's actual content left (benchmark tables), save it
+            if (resultsSection && resultsSection.includes('|')) {
+              core.setOutput('has-results', 'true');
+              core.setOutput('previous-results', resultsSection);
+            } else {
+              core.setOutput('has-results', 'false');
+            }
+
+      - name: Create new benchmark comment
+        if: steps.find-comment.outputs.comment-id == ''
         uses: marocchino/sticky-pull-request-comment@v2
         with:
           header: benchmark-results
           message: |
+
+            ## 📊 Benchmark Results
+
+            ⏳ **Benchmarks are running...**
+
+            This comment will be updated with the results when the benchmarks complete.
+
+            ---
+            _Started at: ${{ github.event.pull_request.updated_at }}_
+
+      - name: Update existing benchmark comment with stale warning
+        if: steps.find-comment.outputs.comment-id != '' && steps.get-comment.outputs.has-results == 'true'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: benchmark-results
+          message: |
+
+            ## 📊 Benchmark Results
+
+            > ⚠️ **Results below are stale** and not from the latest commit. This comment will be updated when CI completes on the latest run.
+
+            ⏳ **Benchmarks are running...**
+
+            ---
+            _Started at: ${{ github.event.pull_request.updated_at }}_
+
+            ---
+
+            ${{ steps.get-comment.outputs.previous-results }}
+
+      - name: Update existing benchmark comment without results
+        if: steps.find-comment.outputs.comment-id != '' && steps.get-comment.outputs.has-results != 'true'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: benchmark-results
+          message: |
+
             ## 📊 Benchmark Results

             ⏳ **Benchmarks are running...**
@@ -107,6 +182,19 @@ jobs:
       - name: Install dependencies
         run: pnpm install --frozen-lockfile

+      # For PRs, download baseline results from main branch
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Build workbench
         run: pnpm turbo run build --filter='./workbench/${{ matrix.app }}'

@@ -128,6 +216,7 @@ jobs:
           benchmark-file: bench-results-${{ matrix.app }}-local.json
           app-name: ${{ matrix.app }}
           backend: local
+          baseline-file: baseline-results/bench-results-${{ matrix.app }}-local.json

       - name: Upload benchmark results
         uses: actions/upload-artifact@v4
@@ -190,6 +279,19 @@ jobs:
       - name: Install dependencies
         run: pnpm install --frozen-lockfile

+      # For PRs, download baseline results from main branch
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Setup PostgreSQL database
         run: ./packages/world-postgres/bin/setup.js

@@ -215,6 +317,7 @@ jobs:
          benchmark-file: bench-results-${{ matrix.app }}-postgres.json
          app-name: ${{ matrix.app }}
          backend: postgres
+          baseline-file: baseline-results/bench-results-${{ matrix.app }}-postgres.json

       - name: Upload benchmark results
         uses: actions/upload-artifact@v4
@@ -261,6 +364,19 @@ jobs:
       - name: Install dependencies
         run: pnpm install --frozen-lockfile

+      # For PRs, download baseline results from main branch
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Wait for Vercel deployment
         id: waitForDeployment
         uses: ./.github/actions/wait-for-vercel-project
@@ -289,6 +405,7 @@ jobs:
           benchmark-file: bench-results-${{ matrix.app.name }}-vercel.json
           app-name: ${{ matrix.app.name }}
           backend: vercel
+          baseline-file: baseline-results/bench-results-${{ matrix.app.name }}-vercel.json

       - name: Upload benchmark results
         uses: actions/upload-artifact@v4
@@ -319,11 +436,32 @@ jobs:
       - name: List downloaded files
         run: find benchmark-results -type f -name "*.json" | sort

+      # For PRs, try to download baseline results from the latest main branch run
+      - name: Download baseline from main branch
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          workflow: benchmarks.yml
+          branch: main
+          name: baseline-benchmark-results
+          path: baseline-results
+          search_artifacts: true
+          if_no_artifact_found: warn
+
       - name: Aggregate and compare benchmarks
         id: aggregate
         run: |
+          # Check if baseline results exist
+          BASELINE_ARG=""
+          if [ -d "baseline-results" ] && [ "$(ls -A baseline-results 2>/dev/null)" ]; then
+            echo "Found baseline results from main branch"
+            BASELINE_ARG="--baseline baseline-results"
+          else
+            echo "No baseline results found, showing results without comparison"
+          fi
           # Capture output to both file and step summary
-          node .github/scripts/aggregate-benchmarks.js benchmark-results | tee benchmark-summary.md >> $GITHUB_STEP_SUMMARY
+          node .github/scripts/aggregate-benchmarks.js benchmark-results $BASELINE_ARG | tee benchmark-summary.md >> $GITHUB_STEP_SUMMARY

       - name: Check benchmark job statuses
         id: check-status
@@ -365,3 +503,12 @@ jobs:
           - Vercel: ${{ needs.benchmark-vercel.result }}

           Check the [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for details.
+
+      # On main branch, save results as baseline for future PR comparisons
+      - name: Upload baseline results
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-benchmark-results
+          path: benchmark-results/
+          retention-days: 90
diff --git a/packages/core/e2e/bench.bench.ts b/packages/core/e2e/bench.bench.ts
index a887f0bcb..ce57115a2 100644
--- a/packages/core/e2e/bench.bench.ts
+++ b/packages/core/e2e/bench.bench.ts
@@ -17,9 +17,16 @@ const workflowTimings: Record<
   string,
   {
     startedAt?: string;
    completedAt?: string;
    executionTimeMs?: number;
+    firstByteTimeMs?: number;
   }[]
 > = {};

+// Buffered timing data keyed by task name, flushed in teardown
+const bufferedTimings: Map<
+  string,
+  { run: any; extra?: { firstByteTimeMs?: number } }[]
+> = new Map();
+
 async function triggerWorkflow(
   workflow: string | { workflowFile: string; workflowFn: string },
   args: any[]
@@ -123,6 +130,9 @@ function writeTimingFile() {
       minExecutionTimeMs: number;
       maxExecutionTimeMs: number;
       samples: number;
+      avgFirstByteTimeMs?: number;
+      minFirstByteTimeMs?: number;
+      maxFirstByteTimeMs?: number;
     }
   > = {};

   for (const [benchName, timings] of Object.entries(workflowTimings)) {
@@ -139,6 +149,18 @@
       maxExecutionTimeMs: max,
       samples: validTimings.length,
     };
+
+    // Add first byte stats if available
+    const firstByteTimings = timings.filter(
+      (t) => t.firstByteTimeMs !== undefined
+    );
+    if (firstByteTimings.length > 0) {
+      const firstByteTimes = firstByteTimings.map((t) => t.firstByteTimeMs!);
+      summary[benchName].avgFirstByteTimeMs =
+        firstByteTimes.reduce((sum, t) => sum + t, 0) / firstByteTimes.length;
+      summary[benchName].minFirstByteTimeMs = Math.min(...firstByteTimes);
+      summary[benchName].maxFirstByteTimeMs = Math.max(...firstByteTimes);
+    }
   }
 }
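Review note: only iterations that actually recorded a first byte contribute to the TTFB stats. A standalone check with the same expressions (toy data):

```js
const timings = [
  { executionTimeMs: 1000, firstByteTimeMs: 120 },
  { executionTimeMs: 980 }, // this iteration recorded no first byte
  { executionTimeMs: 1040, firstByteTimeMs: 180 },
];
const firstByteTimes = timings
  .filter((t) => t.firstByteTimeMs !== undefined)
  .map((t) => t.firstByteTimeMs);
console.log(
  firstByteTimes.reduce((sum, t) => sum + t, 0) / firstByteTimes.length, // 150
  Math.min(...firstByteTimes), // 120
  Math.max(...firstByteTimes)  // 180
);
```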
@@ -148,39 +170,67 @@ function writeTimingFile() {
   );
 }

-function recordWorkflowTiming(benchName: string, run: any) {
-  if (!workflowTimings[benchName]) {
-    workflowTimings[benchName] = [];
+// Buffer timing data (called during each iteration)
+function stageTiming(
+  benchName: string,
+  run: any,
+  extra?: { firstByteTimeMs?: number }
+) {
+  if (!bufferedTimings.has(benchName)) {
+    bufferedTimings.set(benchName, []);
   }
+  bufferedTimings.get(benchName)!.push({ run, extra });
+}

-  const timing: any = {
-    createdAt: run.createdAt,
-    startedAt: run.startedAt,
-    completedAt: run.completedAt,
-  };
-
-  // Calculate execution time if timestamps are available (completedAt - createdAt)
-  if (run.createdAt && run.completedAt) {
-    const created = new Date(run.createdAt).getTime();
-    const completed = new Date(run.completedAt).getTime();
-    timing.executionTimeMs = completed - created;
-  }
+// Teardown: on warmup, clear buffer; on run, flush to file then clear
+const teardown = (task: { name: string }, mode: 'warmup' | 'run') => {
+  const buffered = bufferedTimings.get(task.name) || [];

-  workflowTimings[benchName].push(timing);
+  if (mode === 'run') {
+    // Flush all buffered timings to workflowTimings
+    for (const { run, extra } of buffered) {
+      if (!workflowTimings[task.name]) {
+        workflowTimings[task.name] = [];
+      }

-  // Write timing file after each recording (overwrites previous)
-  writeTimingFile();
-}
+      const timing: any = {
+        createdAt: run.createdAt,
+        startedAt: run.startedAt,
+        completedAt: run.completedAt,
+      };
+
+      // Calculate execution time if timestamps are available (completedAt - createdAt)
+      if (run.createdAt && run.completedAt) {
+        const created = new Date(run.createdAt).getTime();
+        const completed = new Date(run.completedAt).getTime();
+        timing.executionTimeMs = completed - created;
+      }

-describe.concurrent('Workflow Performance Benchmarks', () => {
+      // Add extra metrics if provided
+      if (extra?.firstByteTimeMs !== undefined) {
+        timing.firstByteTimeMs = extra.firstByteTimeMs;
+      }
+
+      workflowTimings[task.name].push(timing);
+    }
+
+    // Write timing file after flushing
+    writeTimingFile();
+  }
+
+  // Clear buffer (both warmup and run)
+  bufferedTimings.delete(task.name);
+};
+
+describe('Workflow Performance Benchmarks', () => {
   bench(
     'workflow with no steps',
     async () => {
       const { runId } = await triggerWorkflow('noStepsWorkflow', [42]);
       const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with no steps', run);
+      stageTiming('workflow with no steps', run);
     },
-    { time: 5000 }
+    { time: 5000, warmupIterations: 1, teardown }
   );

@@ -188,9 +238,9 @@ describe.concurrent('Workflow Performance Benchmarks', () => {
     async () => {
       const { runId } = await triggerWorkflow('oneStepWorkflow', [100]);
       const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with 1 step', run);
+      stageTiming('workflow with 1 step', run);
     },
-    { time: 5000 }
+    { time: 5000, warmupIterations: 1, teardown }
   );

@@ -198,9 +248,9 @@
     async () => {
      const { runId } = await triggerWorkflow('tenSequentialStepsWorkflow', []);
      const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with 10 sequential steps', run);
+      stageTiming('workflow with 10 sequential steps', run);
     },
-    { time: 5000 }
+    { time: 5000, iterations: 5, warmupIterations: 1, teardown }
   );
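Review note: the stage/flush split exists so warmup iterations never reach the timing file. A standalone toy walk-through of the buffer lifecycle (simplified; the real teardown receives the vitest task and mode as shown above):

```js
const buffer = new Map();
const flushed = {};
const stage = (name, sample) => {
  if (!buffer.has(name)) buffer.set(name, []);
  buffer.get(name).push(sample);
};
const tearDown = (name, mode) => {
  if (mode === 'run') {
    flushed[name] = [...(flushed[name] || []), ...(buffer.get(name) || [])];
  }
  buffer.delete(name); // cleared in both modes
};

stage('bench', 'warmup-sample');
tearDown('bench', 'warmup'); // warmup sample discarded
stage('bench', 'run-1');
stage('bench', 'run-2');
tearDown('bench', 'run');    // run samples flushed
console.log(flushed.bench);  // ['run-1', 'run-2']
```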
@@ -208,8 +258,33 @@
     async () => {
       const { runId } = await triggerWorkflow('tenParallelStepsWorkflow', []);
       const { run } = await getWorkflowReturnValue(runId);
-      recordWorkflowTiming('workflow with 10 parallel steps', run);
+      stageTiming('workflow with 10 parallel steps', run);
+    },
+    { time: 5000, iterations: 5, warmupIterations: 1, teardown }
+  );
+
+  bench(
+    'workflow with stream',
+    async () => {
+      const { runId } = await triggerWorkflow('streamWorkflow', []);
+      const { run, value } = await getWorkflowReturnValue(runId);
+      // Consume the entire stream and track time-to-first-byte from workflow startedAt
+      let firstByteTimeMs: number | undefined;
+      if (value instanceof ReadableStream) {
+        const reader = value.getReader();
+        let isFirstChunk = true;
+        while (true) {
+          const { done } = await reader.read();
+          if (isFirstChunk && !done && run.startedAt) {
+            const startedAt = new Date(run.startedAt).getTime();
+            firstByteTimeMs = Date.now() - startedAt;
+            isFirstChunk = false;
+          }
+          if (done) break;
+        }
+      }
+      stageTiming('workflow with stream', run, { firstByteTimeMs });
     },
-    { time: 5000 }
+    { time: 5000, warmupIterations: 1, teardown }
   );
 });
diff --git a/packages/world-local/src/streamer.ts b/packages/world-local/src/streamer.ts
index 1ff8d0026..d416b5cb3 100644
--- a/packages/world-local/src/streamer.ts
+++ b/packages/world-local/src/streamer.ts
@@ -216,7 +216,11 @@ export function createStreamer(basedir: string): Streamer {
       if (isComplete) {
         removeListeners();
-        controller.close();
+        try {
+          controller.close();
+        } catch (e) {
+          // Ignore if controller is already closed (e.g., from closeListener event)
+        }
         return;
       }
     },
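Review note: closing a ReadableStream controller that is already closed throws a TypeError, which is why the fix above wraps close() in try/catch. If this pattern recurs, it could be extracted; a hedged sketch (helper name is hypothetical, not part of this diff):

```js
// Hypothetical helper; the diff inlines this logic instead.
function safeClose(controller) {
  try {
    controller.close();
  } catch {
    // Already closed or errored (e.g., via the close listener): nothing left to do,
    // since no more data can be enqueued either way.
  }
}
```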
diff --git a/workbench/example/api/trigger.ts b/workbench/example/api/trigger.ts
index 2cfe032f9..48405e87c 100644
--- a/workbench/example/api/trigger.ts
+++ b/workbench/example/api/trigger.ts
@@ -84,13 +84,25 @@ export async function GET(req: Request) {
     const run = getRun(runId);
     const returnValue = await run.returnValue;
     console.log('Return value:', returnValue);
+
+    // Include run metadata in headers
+    const [createdAt, startedAt, completedAt] = await Promise.all([
+      run.createdAt,
+      run.startedAt,
+      run.completedAt,
+    ]);
+    const headers: HeadersInit =
+      returnValue instanceof ReadableStream
+        ? { 'Content-Type': 'application/octet-stream' }
+        : {};
+
+    headers['X-Workflow-Run-Created-At'] = createdAt?.toISOString() || '';
+    headers['X-Workflow-Run-Started-At'] = startedAt?.toISOString() || '';
+    headers['X-Workflow-Run-Completed-At'] = completedAt?.toISOString() || '';
+
     return returnValue instanceof ReadableStream
-      ? new Response(returnValue, {
-          headers: {
-            'Content-Type': 'application/octet-stream',
-          },
-        })
-      : Response.json(returnValue);
+      ? new Response(returnValue, { headers })
+      : Response.json(returnValue, { headers });
   } catch (error) {
     if (error instanceof Error) {
       if (WorkflowRunNotCompletedError.is(error)) {
diff --git a/workbench/example/workflows/97_bench.ts b/workbench/example/workflows/97_bench.ts
index 4b2b496c6..d47a93595 100644
--- a/workbench/example/workflows/97_bench.ts
+++ b/workbench/example/workflows/97_bench.ts
@@ -38,3 +38,51 @@ export async function tenParallelStepsWorkflow() {
   const results = await Promise.all(promises);
   return results.reduce((sum, val) => sum + val, 0);
 }
+
+// Step that generates a stream with 10 chunks
+async function genBenchStream(): Promise<ReadableStream<Uint8Array>> {
+  'use step';
+  const encoder = new TextEncoder();
+  return new ReadableStream({
+    async start(controller) {
+      for (let i = 0; i < 10; i++) {
+        controller.enqueue(encoder.encode(`${i}\n`));
+        // Small delay to avoid synchronous close issues on local world
+        await new Promise((resolve) => setTimeout(resolve, 10));
+      }
+      controller.close();
+    },
+  });
+}
+
+// Step that transforms a stream by doubling each number
+async function doubleNumbers(
+  stream: ReadableStream<Uint8Array>
+): Promise<ReadableStream<Uint8Array>> {
+  'use step';
+  const decoder = new TextDecoder();
+  const encoder = new TextEncoder();
+
+  const transformStream = new TransformStream({
+    transform(chunk, controller) {
+      const text = decoder.decode(chunk, { stream: true });
+      const lines = text.split('\n');
+      for (const line of lines) {
+        if (line.trim()) {
+          const num = parseInt(line, 10);
+          controller.enqueue(encoder.encode(`${num * 2}\n`));
+        }
+      }
+    },
+  });
+
+  return stream.pipeThrough(transformStream);
+}
+
+// Workflow that generates and transforms a stream
+export async function streamWorkflow() {
+  'use workflow';
+  const stream = await genBenchStream();
+  const doubled = await doubleNumbers(stream);
+  return doubled;
+}
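Review note: with the trigger.ts change, a client can recover run timings from headers even when the body is a stream. A sketch of consuming the endpoint (path and query parameter are assumptions, not shown in this diff):

```js
// Hypothetical client for the trigger endpoint above.
async function fetchRunWithTimings(runId) {
  const res = await fetch(`/api/trigger?runId=${encodeURIComponent(runId)}`);
  const createdAt = new Date(res.headers.get('X-Workflow-Run-Created-At'));
  const completedAt = new Date(res.headers.get('X-Workflow-Run-Completed-At'));
  console.log('workflow time (ms):', completedAt.getTime() - createdAt.getTime());

  // Stream bodies arrive as application/octet-stream; time the first chunk.
  const reader = res.body.getReader();
  const t0 = Date.now();
  const { value } = await reader.read();
  console.log('client TTFB (ms):', Date.now() - t0, 'first chunk bytes:', value?.length);
  while (!(await reader.read()).done) {
    // drain the remaining doubled-number chunks
  }
}
```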