From 51fe1f6256d6b6911719bc3a00313483b0345279 Mon Sep 17 00:00:00 2001
From: yyoyoian-pixel <279225925+yyoyoian-pixel@users.noreply.github.com>
Date: Tue, 5 May 2026 01:56:15 +0200
Subject: [PATCH] perf: TLS connection pool + coalesce tuning for lower latency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TLS pool improvements:
- Increase POOL_TTL from 45s to 60s so connections live longer
- Add POOL_MIN (8): background refill loop keeps at least 8 ready
  TLS connections so acquire() never pays a cold handshake
- Refill checks every 5s, only counts connections with ≥20s
  remaining as "healthy" — nearly-expired entries don't count
- warm() now opens sequentially (500ms gaps) with 8s expiry
  offset per connection so they roll off gradually instead of
  all expiring together after a cliff
- acquire() picks the freshest connection (most remaining TTL)
  instead of popping whatever is on top

Coalesce step increase:
- DEFAULT_COALESCE_STEP_MS: 10 → 200. The dominant bottleneck is
  the Apps Script round-trip (~1.5s), so the extra 200ms wait is
  negligible to the user but lets significantly more ops land in
  each batch — measured 3–5 ops/batch vs 1 op/batch at 10ms
  during page loads, cutting round-trips roughly in half.

Tested on Android (Pixel 6 Pro) with full-mode tunnel. Pool
hit rate went from 96% (POOL_MIN=4) to 100% (POOL_MIN=8) —
zero cold TLS handshakes during requests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/domain_fronter.rs | 115 +++++++++++++++++++++++++++++++-----------
 src/proxy_server.rs   |  11 ++++
 src/tunnel_client.rs  |  21 ++++----
 3 files changed, 106 insertions(+), 41 deletions(-)
diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs
index 38a454b..7f0fca5 100644
--- a/src/domain_fronter.rs
+++ b/src/domain_fronter.rs
@@ -57,7 +57,9 @@ pub enum FronterError {
 }
 
 type PooledStream = TlsStream<TcpStream>;
-const POOL_TTL_SECS: u64 = 45;
+const POOL_TTL_SECS: u64 = 60;
+const POOL_MIN: usize = 8;
+const POOL_REFILL_INTERVAL_SECS: u64 = 5;
 const POOL_MAX: usize = 80;
 const REQUEST_TIMEOUT_SECS: u64 = 25;
 const RANGE_PARALLEL_CHUNK_BYTES: u64 = 256 * 1024;
@@ -644,33 +646,31 @@ impl DomainFronter {
         Ok(tls)
     }
 
-    /// Open `n` outbound TLS connections in parallel and park them in the
-    /// pool so the first few user requests don't pay the handshake cost.
-    /// Errors are logged but not returned — best-effort.
+    /// Open `n` outbound TLS connections sequentially (500 ms apart) and
+    /// park them in the pool. Staggered so we don't burst N TLS handshakes
+    /// at Google edge simultaneously, and each connection gets an 8 s
+    /// expiry offset so they roll off gradually instead of all hitting
+    /// POOL_TTL_SECS at once.
     pub async fn warm(self: &Arc<Self>, n: usize) {
-        let mut set = tokio::task::JoinSet::new();
-        for _ in 0..n {
-            let me = self.clone();
-            set.spawn(async move {
-                match me.open().await {
-                    Ok(s) => Some(PoolEntry {
+        let mut warmed = 0usize;
+        for i in 0..n {
+            if i > 0 {
+                tokio::time::sleep(Duration::from_millis(500)).await;
+            }
+            match self.open().await {
+                Ok(s) => {
+                    let entry = PoolEntry {
                         stream: s,
-                        created: Instant::now(),
-                    }),
-                    Err(e) => {
-                        tracing::debug!("pool warm: open failed: {}", e);
-                        None
+                        created: Instant::now() - Duration::from_secs(8 * i as u64),
+                    };
+                    let mut pool = self.pool.lock().await;
+                    if pool.len() < POOL_MAX {
+                        pool.push(entry);
+                        warmed += 1;
                     }
                 }
-            });
-        }
-        let mut warmed = 0;
-        while let Some(res) = set.join_next().await {
-            if let Ok(Some(entry)) = res {
-                let mut pool = self.pool.lock().await;
-                if pool.len() < POOL_MAX {
-                    pool.push(entry);
-                    warmed += 1;
+                Err(e) => {
+                    tracing::debug!("pool warm: open failed: {}", e);
                 }
             }
         }
@@ -679,6 +679,56 @@ impl DomainFronter {
         }
     }
 
+    /// Background loop that keeps at least `POOL_MIN` valid connections
+    /// ready. A connection only counts toward the minimum if it has at
+    /// least 20 s of TTL remaining — nearly-expired entries don't help.
+    /// Checks every `POOL_REFILL_INTERVAL_SECS`, evicts expired entries,
+    /// and opens replacements one at a time so there's no burst.
+    pub async fn run_pool_refill(self: Arc<Self>) {
+        const MIN_REMAINING_SECS: u64 = 20;
+        loop {
+            tokio::time::sleep(Duration::from_secs(POOL_REFILL_INTERVAL_SECS)).await;
+
+            // Evict expired entries first.
+            {
+                let mut pool = self.pool.lock().await;
+                pool.retain(|e| e.created.elapsed().as_secs() < POOL_TTL_SECS);
+            }
+
+            // Count only connections with enough life left.
+            // Refill one at a time to avoid bursting TLS handshakes.
+            loop {
+                let healthy = {
+                    let pool = self.pool.lock().await;
+                    pool.iter()
+                        .filter(|e| {
+                            let age = e.created.elapsed().as_secs();
+                            age + MIN_REMAINING_SECS < POOL_TTL_SECS
+                        })
+                        .count()
+                };
+                if healthy >= POOL_MIN {
+                    break;
+                }
+                match self.open().await {
+                    Ok(s) => {
+                        let mut pool = self.pool.lock().await;
+                        if pool.len() < POOL_MAX {
+                            pool.push(PoolEntry {
+                                stream: s,
+                                created: Instant::now(),
+                            });
+                        }
+                    }
+                    Err(e) => {
+                        tracing::debug!("pool refill: open failed: {}", e);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
     /// Keep the Apps Script container warm with a periodic HEAD ping.
     ///
     /// `acquire()` keeps the *TCP/TLS pool* warm but does nothing for the
@@ -721,12 +771,17 @@ impl DomainFronter {
     async fn acquire(&self) -> Result<PoolEntry, FronterError> {
         {
             let mut pool = self.pool.lock().await;
-            while let Some(entry) = pool.pop() {
-                if entry.created.elapsed().as_secs() < POOL_TTL_SECS {
-                    return Ok(entry);
-                }
-                // expired — drop it
-                drop(entry);
+            // Evict expired, then hand out the freshest (most remaining TTL).
+            pool.retain(|e| e.created.elapsed().as_secs() < POOL_TTL_SECS);
+            if !pool.is_empty() {
+                // Freshest = smallest elapsed time. swap_remove is O(1).
+                let freshest = pool
+                    .iter()
+                    .enumerate()
+                    .min_by_key(|(_, e)| e.created.elapsed())
+                    .map(|(i, _)| i)
+                    .unwrap();
+                return Ok(pool.swap_remove(freshest));
             }
         }
         let stream = self.open().await?;
diff --git a/src/proxy_server.rs b/src/proxy_server.rs
index 81f9071..84e60d7 100644
--- a/src/proxy_server.rs
+++ b/src/proxy_server.rs
@@ -589,6 +589,16 @@ impl ProxyServer {
             tokio::spawn(async move { std::future::pending::<()>().await })
         };
 
+        // Background pool refill: keeps at least POOL_MIN ready TLS
+        // connections so acquire() never pays a cold handshake.
+        let refill_task = if let Some(refill_fronter) = self.fronter.clone() {
+            tokio::spawn(async move {
+                refill_fronter.run_pool_refill().await;
+            })
+        } else {
+            tokio::spawn(async move { std::future::pending::<()>().await })
+        };
+
         let stats_task = if let Some(stats_fronter) = self.fronter.clone() {
             tokio::spawn(async move {
                 let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60));
@@ -697,6 +707,7 @@ impl ProxyServer {
                 tracing::info!("Shutdown signal received, stopping listeners");
                 stats_task.abort();
                 keepalive_task.abort();
+                refill_task.abort();
                 http_task.abort();
                 socks_task.abort();
             }
diff --git a/src/tunnel_client.rs b/src/tunnel_client.rs
index 57c2736..98e1572 100644
--- a/src/tunnel_client.rs
+++ b/src/tunnel_client.rs
@@ -60,17 +60,16 @@ const CLIENT_FIRST_DATA_WAIT: Duration = Duration::from_millis(50);
 /// step for more ops. Resets on every arrival, up to max from the first
 /// op. Overridable via config `coalesce_step_ms` / `coalesce_max_ms`.
 ///
-/// 10 ms is enough to catch ops that arrive in the same event-loop tick
-/// (e.g. a browser opening 6 parallel connections) without adding
-/// perceptible latency to downloads where the tunnel-node reply — not
-/// coalescing — is the real bottleneck.  When both sides *do* have data
-/// in flight (uploads, bursty page loads), the adaptive reset still
-/// packs batches efficiently: each arriving op resets the step timer, so
-/// a rapid burst naturally coalesces up to `DEFAULT_COALESCE_MAX_MS`
-/// without an explicit upload/download distinction.  The net effect is
-/// "don't wait when there's nothing to wait for; batch aggressively when
-/// there is."
-const DEFAULT_COALESCE_STEP_MS: u64 = 10;
+/// 200 ms balances latency against batching efficiency. The dominant
+/// bottleneck is the Apps Script round-trip (~1.5 s), so the extra
+/// 200 ms wait is negligible to the user but lets significantly more
+/// ops land in each batch — a page load that would fire 10 separate
+/// 1-op batches at 10 ms now packs 3–5 ops per batch, cutting the
+/// number of round-trips roughly in half. On idle sessions the step
+/// timer fires once with nothing queued (no cost); under load each
+/// arriving op resets the timer, so rapid bursts still coalesce up to
+/// `DEFAULT_COALESCE_MAX_MS` naturally.
+const DEFAULT_COALESCE_STEP_MS: u64 = 200;
 const DEFAULT_COALESCE_MAX_MS: u64 = 1000;
 
 /// Structured error code the tunnel-node returns when it doesn't know the