From 51c9b0ac1885e9a40ef9ea725f9eaf4a253b0c7a Mon Sep 17 00:00:00 2001 From: Max042004 Date: Fri, 5 Jun 2026 21:31:06 +0800 Subject: [PATCH] Grow the page-table pool for multi-isolate guests The guest page-table pool was a fixed 960 KiB arena (240 x 4 KiB L3 pages) at the bottom of the 4 MiB infra reserve. It is a bump allocator that never reclaims on munmap, and each 2 MiB block that needs mixed permissions (e.g. V8 JIT W^X) draws one L3 page, so the pool budgeted only ~480 MiB of split address space over a process lifetime. A single guest stays well under that, but every extra V8 isolate a Node worker_threads pool or cluster spins up reserves its own committed regions; the third isolate exhausted the pool, after which even munmap (which must split a block to invalidate a sub-range) failed and V8 hard-aborted on CHECK(0 == munmap) instead of getting a clean ENOMEM. Grow INFRA_RESERVE 4 MiB -> 16 MiB and tighten the shim code slot from a round 1 MiB to 40 KiB (INFRA_SHIM_SLOT, ~6x the ~7 KiB shim blob) so the freed space falls through to the pool, which becomes ~13.9 MiB (3558 pages, ~7 GiB of split address space) -- enough for an os.cpus()-sized worker pool on a 24-core Ultra. The reserve is demand-paged and sits in the ~4 GiB dead zone below interp_base, so the unused pool costs no host RAM and the larger virtual reserve is free. The layout keeps every invariant: shim data still occupies the top 2 MiB block (shim_data_base + 2 MiB == interp_base), shim code still shares the PT pool's last 2 MiB block, and the pool still ends where the shim slot begins. Only the constants in core/guest.h change; consumers derive their addresses from g->shim_base / pt_pool_base, and fork copies only the used pool, so nothing else moves. Because the shim slot is now tight, a _Static_assert in main.c and a runtime re-check in bootstrap.c fail loudly if the shim ever outgrows INFRA_SHIM_SLOT instead of letting it silently overlap the shim-data block. 
Surfaced while bringing up node:alpine through the OCI image work, where worker_threads pools of 8 and 16 isolates now run to completion; the fix is in the core guest runtime and is independent of that. The full guest test suite passes, including test-mremap-infra (infra-reserve boundary) and test-mprotect-mt (W^X L3 splitting). --- src/core/bootstrap.c | 5 ++-- src/core/guest.c | 2 +- src/core/guest.h | 64 +++++++++++++++++++++++++++++-------------- src/core/shim.S | 4 +-- src/main.c | 22 +++++++++++++++ src/runtime/forkipc.c | 2 +- 6 files changed, 72 insertions(+), 27 deletions(-) diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index ea3eb8f..6e202cb 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -462,8 +462,9 @@ int guest_bootstrap_prepare(guest_t *g, startup_trace_step("load_interpreter", t0); } - if (shim_bin_len > BLOCK_2MIB) { - log_error("shim binary too large (%zu bytes)", shim_bin_len); + if (shim_bin_len > INFRA_SHIM_SLOT) { + log_error("shim binary too large (%zu bytes, slot %llu)", shim_bin_len, + (unsigned long long) INFRA_SHIM_SLOT); return -1; } diff --git a/src/core/guest.c b/src/core/guest.c index 21407d5..ec8f7e6 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -136,7 +136,7 @@ static void guest_region_clip_overlay(guest_region_t *r) /* Compute infra reserve placement from guest_size and store derived fields in * @g. Called from guest_init and guest_init_from_shm. * - * Layout: a 4MiB region anchored at [interp_base - INFRA_RESERVE, interp_base) + * Layout: a 16MiB region anchored at [interp_base - INFRA_RESERVE, interp_base) * sits in the dead zone between mmap_limit and interp_base. PT pool, shim, and * shim data fall at fixed offsets within the reserve (see guest.h). * diff --git a/src/core/guest.h b/src/core/guest.h index 9e40ae3..7545ffb 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -30,31 +30,53 @@ * = 64GiB on M2, 40-bit = 1TiB on M3+). 
See guest.c for the runtime probe that * selects the correct size. * - * Infrastructure layout (page-table pool, shim code, shim data): a 4MiB reserve - * placed just below g->interp_base, in the dead zone between g->mmap_limit and - * g->interp_base. The exact base is computed at guest_init time and stored in - * guest_t.pt_pool_base / pt_pool_end / shim_base / shim_data_base. EL0 user - * binaries are therefore free to load at low addresses (down to 64KiB) without - * colliding with the runtime. + * Infrastructure layout (page-table pool, shim code, shim data): a 16MiB + * reserve placed just below g->interp_base, in the dead zone between + * g->mmap_limit and g->interp_base. The exact base is computed at guest_init + * time and stored in guest_t.pt_pool_base / pt_pool_end / shim_base / + * shim_data_base. EL0 user binaries are therefore free to load at low + * addresses (down to 64KiB) without colliding with the runtime. * - * Internal layout within the 4MiB reserve: - * +0x000000 .. +0x010000 unused (64KiB null guard) - * +0x010000 .. +0x100000 page-table pool (960KiB, RW) - * +0x100000 .. +0x200000 shim code slot (1MiB, RX). Sits in the same - * 2MiB L2 block as the PT pool, so that block - * is split into 4KiB L3 pages (mixed RX/RW). - * +0x200000 .. +0x400000 shim data + EL1 stack (full 2MiB L2 block, RW) + * The reserve is demand-paged (MAP_ANON): unused page-table-pool pages cost no + * host RAM, and the whole reserve consumes a negligible slice of the ~4 GiB + * dead zone, so the pool is sized generously and gets every byte not spoken for + * by the null guard, the shim code, and the shim data. Each split 2MiB block + * draws one 4KiB L3 page from the pool and the bump allocator never reclaims + * it, so a ~13.9MiB pool (3558 pages, ~7 GiB of split address space) hosts the + * many V8 isolates a Node worker_threads pool / cluster spins up; a 960KiB pool + * exhausted after only ~3 isolates and hard-aborted the guest. See + * issue-pt-pool-exhaustion.md. 
+ * + * The shim code slot is tight (40KiB, ~6x the ~7KiB shim blob) rather than a + * round 1MiB so the freed space falls through to the pool; main.c + * _Static_asserts the real shim blob fits, and bootstrap.c re-checks at load. + * + * Internal layout within the 16MiB reserve: + * +0x0000000 .. +0x0010000 unused (64KiB null guard) + * +0x0010000 .. +0x0DF6000 page-table pool (~13.9MiB, RW) + * +0x0DF6000 .. +0x0E00000 shim code slot (40KiB, RX). Sits in the same + * 2MiB L2 block as the PT pool tail, so that block + * is split into 4KiB L3 pages (mixed RX/RW). + * +0x0E00000 .. +0x1000000 shim data + EL1 stack (full 2MiB L2 block, RW) + * + * Invariant: shim_data occupies the top 2MiB block of the reserve, so + * INFRA_SHIM_DATA_OFF == INFRA_RESERVE - BLOCK_2MIB and + * shim_data_base + BLOCK_2MIB == interp_base. The PT pool ends where the shim + * code slot begins (INFRA_PT_POOL_END_OFF == INFRA_SHIM_OFF), and the slot is + * INFRA_SHIM_SLOT == INFRA_SHIM_DATA_OFF - INFRA_SHIM_OFF wide. */ /* Total size of the runtime infrastructure reserve. Shifted to * [g->interp_base - INFRA_RESERVE, g->interp_base) at guest_init. */ -#define INFRA_RESERVE 0x00400000ULL /* 4MiB */ -#define INFRA_PT_POOL_OFF 0x00010000ULL /* offset of PT pool */ -#define INFRA_PT_POOL_END_OFF 0x00100000ULL /* PT pool end (960KiB) */ -#define INFRA_SHIM_OFF 0x00100000ULL /* offset of shim code slot */ -#define INFRA_SHIM_DATA_OFF 0x00200000ULL /* offset of shim data slot */ -#define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ +#define INFRA_RESERVE 0x01000000ULL /* 16MiB */ +#define INFRA_PT_POOL_OFF 0x00010000ULL /* PT pool start */ +#define INFRA_PT_POOL_END_OFF 0x00DF6000ULL /* pool end == shim base */ +#define INFRA_SHIM_OFF 0x00DF6000ULL /* shim code slot base */ +#define INFRA_SHIM_DATA_OFF 0x00E00000ULL /* shim data block base */ +/* Shim slot width; main.c _Static_asserts the shim blob fits. 
*/ +#define INFRA_SHIM_SLOT (INFRA_SHIM_DATA_OFF - INFRA_SHIM_OFF) +#define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ #define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MiB) */ #define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MiB) */ @@ -701,7 +723,7 @@ static inline uint64_t guest_ipa(const guest_t *g, uint64_t offset) } /* True iff [start, end) overlaps the runtime infra reserve - * [interp_base - INFRA_RESERVE, interp_base). Covers the full 4 MiB + * [interp_base - INFRA_RESERVE, interp_base). Covers the full * reserve including the 64 KiB null-guard slot at the bottom (which * has no PT entries but must not become semantically reachable from * guest mmap state). Used by sys_mmap (MAP_FIXED), sys_munmap, and @@ -719,7 +741,7 @@ static inline bool guest_range_hits_infra(const guest_t *g, /* True iff a single address (PC, hint, etc.) falls inside the infra reserve. * Used by rt_sigreturn to reject forged frames that would redirect EL0 PC into - * EL1 shim or page-table memory. Covers the full 4 MiB reserve, matching + * EL1 shim or page-table memory. Covers the full reserve, matching * guest_range_hits_infra. */ static inline bool guest_addr_in_infra(const guest_t *g, uint64_t addr) diff --git a/src/core/shim.S b/src/core/shim.S index 4bc9a20..2fac83f 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -4,8 +4,8 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Loaded at g->shim_base (a 4MiB infra reserve placed just below g->interp_base; - * computed at guest_init time). Runs at EL1. + * Loaded at g->shim_base (a 16MiB infra reserve placed just below + * g->interp_base; computed at guest_init time). Runs at EL1. * All system registers (VBAR, MAIR, TCR, TTBR0, SCTLR, etc.) are configured by * the host before vCPU start. The shim entry point transitions to EL0 via ERET. 
* Exception vectors handle SVC #0 (Linux syscall) forwarding to the host via diff --git a/src/main.c b/src/main.c index ea1d17e..a6d2baf 100644 --- a/src/main.c +++ b/src/main.c @@ -123,6 +123,28 @@ static void cleanup_main_resources(guest_t *g, /* Embedded shim binary (generated by xxd -i from shim.bin) */ #include "shim_blob.h" +/* The shim code slot in the infra reserve is sized tight (INFRA_SHIM_SLOT, a + * few x the current blob) so the rest of the reserve goes to the page-table + * pool. If the shim ever outgrows the slot it would overlap the shim-data + * block; fail the build loudly rather than corrupt memory at boot. Enlarge + * the slot (lower INFRA_SHIM_OFF and INFRA_PT_POOL_END_OFF) if this fires. */ +_Static_assert(sizeof(shim_bin) <= INFRA_SHIM_SLOT, + "shim blob exceeds its infra slot; bump INFRA_SHIM_SLOT"); + +/* The infra-reserve layout invariants documented in guest.h are derived from + * raw offset constants, so a future edit that grows the pool by shifting one + * offset without the others would silently overlap two regions. Enforce them + * at build time rather than trusting the comment. */ +_Static_assert(INFRA_PT_POOL_END_OFF == INFRA_SHIM_OFF, + "PT pool must end exactly where the shim slot begins"); +_Static_assert(INFRA_SHIM_DATA_OFF + BLOCK_2MIB == INFRA_RESERVE, + "shim_data must occupy the top 2MiB block of the reserve"); +_Static_assert((INFRA_SHIM_DATA_OFF & (BLOCK_2MIB - 1)) == 0, + "shim_data must be 2MiB-aligned"); +_Static_assert((INFRA_PT_POOL_OFF & 0xFFF) == 0 && + (INFRA_PT_POOL_END_OFF & 0xFFF) == 0, + "PT pool offsets must be page-aligned"); + /* Build-time version string (generated by make into build/version.h) */ #include "version.h" diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 9cb8ad1..c54c837 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -129,7 +129,7 @@ int fork_child_main(int ipc_fd, * pt_at. Reject impossible layouts up front. 
* * Lower bound: guest_size must leave room for both mmap_limit - * (size - 8 GiB) and interp_base (size - 4 GiB) plus the 4 MiB infra + * (size - 8 GiB) and interp_base (size - 4 GiB) plus the 16 MiB infra * reserve below it. 8 GiB satisfies all three with margin. * Upper bound: guest_size must fit in the negotiated IPA width. * IPA bits: 36 (M1/M2) and 40 (M3+) for native aarch64; 48 for