Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/core/guest.c
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,8 @@ void guest_reset(guest_t *g)
g->mmap_end = MMAP_INITIAL_END;
g->mmap_rx_next = MMAP_RX_BASE;
g->mmap_rx_end = MMAP_RX_INITIAL_END;
g->mmap_rw_gap_hint = 0;
g->mmap_rx_gap_hint = 0;
g->ttbr0 = 0;
g->need_tlbi = false;

Expand Down
20 changes: 15 additions & 5 deletions src/core/guest.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,22 +127,32 @@ typedef struct {
typedef struct {
void *host_base; /* Host pointer to allocated guest memory */
int shm_fd; /* File fd backing host_base for CoW fork (-1 if MAP_ANON) */

uint64_t guest_size; /* Total size (determined by IPA capacity) */
uint64_t ipa_base; /* IPA base for hv_vm_map (GUEST_IPA_BASE) */
uint64_t mmap_limit; /* Max mmap address (computed from guest_size) */
uint64_t
interp_base; /* Dynamic linker load base (computed from guest_size) */

uint64_t interp_base; /* Dynamic linker load base (from guest_size) */
uint64_t pt_pool_next; /* Next free page table page in pool */
uint64_t brk_base; /* Initial brk (set after ELF load) */
uint64_t brk_current; /* Current brk position */
uint64_t stack_base; /* Bottom of stack region (dynamic, above brk) */
uint64_t stack_top; /* Top of stack (stack grows down from here) */
uint64_t mmap_next; /* RW mmap high-water mark for fork IPC snapshots */
uint64_t mmap_end; /* Current page-table-covered RW mmap limit */

uint64_t mmap_next; /* RW mmap high-water mark for fork IPC snapshots */
uint64_t mmap_end; /* Current page-table-covered RW mmap limit */
/* RX mmap high-water mark serialized through fork IPC. */
uint64_t mmap_rx_next;
uint64_t mmap_rx_end; /* Current page-table-covered RX mmap limit */
uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */
/* Gap-finder allocator hints. First free GPA past the last successful mmap
* in each region; munmap and mremap rewind the hint when a lower address is
* freed. mprotect does not, since permission changes do not free address
* space. Per-guest so multi-guest test harnesses (or any future second VM
* in the same process) cannot cross-pollute each other's allocator state.
*/
uint64_t mmap_rw_gap_hint, mmap_rx_gap_hint;

uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */
bool need_tlbi; /* Signal shim to flush TLB after page table changes */
hv_vcpu_t vcpu; /* vCPU handle */
hv_vcpu_exit_t *exit; /* vCPU exit info */
Expand Down
1 change: 0 additions & 1 deletion src/runtime/fork-state.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,6 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g)
}

syscall_init();
mmap_reset_hints();

if (num_fds > FD_TABLE_SIZE) {
log_error("fork-child: num_fds %u exceeds FD_TABLE_SIZE", num_fds);
Expand Down
3 changes: 0 additions & 3 deletions src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -688,9 +688,6 @@ static inline void sock_opt_clear(fd_entry_t *e)
/* Initialize the syscall subsystem (FD table, etc.) */
void syscall_init(void);

/* Reset mmap gap-finder hints after execve. */
void mmap_reset_hints(void);

/* Dispatch a syscall. Reads X8 (nr) and X0-X5 (args) from vCPU registers.
* Writes result back to X0. Sets *exit_code if the process should exit.
* Returns 0 to continue, 1 to exit.
Expand Down
1 change: 0 additions & 1 deletion src/syscall/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,6 @@ int64_t sys_execve(hv_vcpu_t vcpu,
* kernel exec failure after its point of no return.
*/
guest_reset(g);
mmap_reset_hints();

/* The replacement image must not inherit process-wide shutdown requests
* from the old thread group.
Expand Down
65 changes: 29 additions & 36 deletions src/syscall/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,19 @@ pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 1 */

/* Gap-finding allocator for mmap.
 *
 * find_free_gap_inner() scans guest_t.regions[] (sorted) for the first free gap
 * of length bytes within [min_addr, max_addr). It replaces a bump allocator so
 * munmap'd ranges become reusable (critical for runtimes that reserve, trim,
 * and re-reserve in the same address window).
 *
 * Per-guest hints (mmap_rw_gap_hint / mmap_rx_gap_hint in guest_t) amortize the
 * O(n) scan to O(1) for sequential allocations: after each success the hint is
 * set to the end of the allocation. munmap and mremap rewind the hint when a
 * lower address is freed. The hints are stored in guest_t so multiple guest
 * instances in one process (test harnesses, future multi-VM use) cannot
 * cross-pollute each other's allocator state. They are reset to 0 by
 * guest_init, guest_init_from_shm (via memset), and guest_reset.
 */
static uint64_t mmap_rw_gap_hint = 0, mmap_rx_gap_hint = 0;

typedef struct {
uint64_t start, end;
Expand Down Expand Up @@ -103,16 +106,6 @@ static int dup_region_backing_fd(const guest_region_t *region)
return dup(region->backing_fd);
}

/* Clear both cached gap-finder hints (RW and RX) in one step.
 *
 * Meant to run after execve: with a stale hint the gap-finder would begin its
 * search past the previous binary's allocations, wasting address space and
 * potentially causing issues with the new dynamic linker.
 */
void mmap_reset_hints(void)
{
    mmap_rw_gap_hint = mmap_rx_gap_hint = 0;
}

static uint64_t find_free_gap_inner(const guest_t *g,
uint64_t length,
uint64_t min_addr,
Expand Down Expand Up @@ -152,14 +145,14 @@ static uint64_t find_free_gap_inner(const guest_t *g,
* mmap activity. A miss falls back to the region base so holes reopened by
* munmap are still reusable.
*/
static uint64_t find_free_gap(const guest_t *g,
static uint64_t find_free_gap(guest_t *g,
uint64_t length,
uint64_t min_addr,
uint64_t max_addr)
{
/* RX and RW mappings advance independently, so keep separate hints. */
uint64_t *hint =
(min_addr < MMAP_BASE) ? &mmap_rx_gap_hint : &mmap_rw_gap_hint;
(min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint;

/* Try cached hint first (only if within the valid range) */
if (*hint >= min_addr && *hint < max_addr) {
Expand Down Expand Up @@ -771,10 +764,10 @@ int64_t sys_mremap(guest_t *g,
memset((uint8_t *) g->host_base + tail_off, 0, tail_end - tail_off);
guest_region_remove(g, tail_off, tail_end);
guest_invalidate_ptes(g, tail_off, tail_end);
if (tail_off < mmap_rw_gap_hint)
mmap_rw_gap_hint = tail_off;
if (tail_off < mmap_rx_gap_hint)
mmap_rx_gap_hint = tail_off;
if (tail_off < g->mmap_rw_gap_hint)
g->mmap_rw_gap_hint = tail_off;
if (tail_off < g->mmap_rx_gap_hint)
g->mmap_rx_gap_hint = tail_off;
return (int64_t) old_addr;
}

Expand Down Expand Up @@ -844,10 +837,10 @@ int64_t sys_mremap(guest_t *g,
memset((uint8_t *) g->host_base + old_off, 0, old_size);
guest_region_remove(g, old_off, old_off + old_size);
guest_invalidate_ptes(g, old_off, old_off + old_size);
if (old_off < mmap_rw_gap_hint)
mmap_rw_gap_hint = old_off;
if (old_off < mmap_rx_gap_hint)
mmap_rx_gap_hint = old_off;
if (old_off < g->mmap_rw_gap_hint)
g->mmap_rw_gap_hint = old_off;
if (old_off < g->mmap_rx_gap_hint)
g->mmap_rx_gap_hint = old_off;
}

if (guest_region_add_ex_owned(
Expand Down Expand Up @@ -987,10 +980,10 @@ int64_t sys_mremap(guest_t *g,
memset((uint8_t *) g->host_base + old_off, 0, old_size);
guest_region_remove(g, old_off, old_off + old_size);
guest_invalidate_ptes(g, old_off, old_off + old_size);
if (old_off < mmap_rw_gap_hint)
mmap_rw_gap_hint = old_off;
if (old_off < mmap_rx_gap_hint)
mmap_rx_gap_hint = old_off;
if (old_off < g->mmap_rw_gap_hint)
g->mmap_rw_gap_hint = old_off;
if (old_off < g->mmap_rx_gap_hint)
g->mmap_rx_gap_hint = old_off;

/* Track new region */
if (guest_region_add_ex_owned(
Expand Down Expand Up @@ -1211,10 +1204,10 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length)
memset((uint8_t *) g->host_base + zstart, 0, zend - zstart);
}
guest_region_remove(g, unmap_off, end);
if (unmap_off < mmap_rw_gap_hint)
mmap_rw_gap_hint = unmap_off;
if (unmap_off < mmap_rx_gap_hint)
mmap_rx_gap_hint = unmap_off;
if (unmap_off < g->mmap_rw_gap_hint)
g->mmap_rw_gap_hint = unmap_off;
if (unmap_off < g->mmap_rx_gap_hint)
g->mmap_rx_gap_hint = unmap_off;
}
}
return 0;
Expand Down
14 changes: 12 additions & 2 deletions src/syscall/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,25 @@ static int cached_ngroups = -1;
/* Fixed utsname field values for the emulated system.
 * NOTE(review): presumably what the guest's uname syscall reports -- confirm
 * against the handler that reads this table.
 */
static const linux_utsname_t cached_uname = {
.sysname = "Linux",
.nodename = "elfuse",
/* Kernel version: match the lima aarch64 VM kernel to avoid version-gated
 * feature detection mismatches in userspace.
 */
.release = "6.17.0-20-generic",
.version = "#20-Ubuntu SMP PREEMPT_DYNAMIC",
.machine = "aarch64",
.domainname = "(none)",
};
/* Two read-only 256-byte tables. cached_affinity_mask has only its first byte
 * set to 1 (the remaining bytes are zero-initialized); zero_block is all zeros.
 * NOTE(review): presumably byte 0 == 1 marks CPU 0 as the only CPU in the
 * affinity mask -- confirm against the code that copies it out.
 */
static const uint8_t cached_affinity_mask[256] = {1}, zero_block[256] = {0};

/* sysinfo cache.
*
* Process-scoped by intent: the cache mirrors the host's view (totalram from
* sysctl(HW_MEMSIZE), free pages from host_statistics64, getloadavg). Even if
* multiple guest_t instances ever coexist in one process they share the same
* host stats, so a single rwlock-protected cache refreshed at most once per
* second is the right shape. Audited under TODO "Static state testability
* audit" -- intentionally NOT moved into guest_t.
*/
static pthread_once_t sysinfo_once = PTHREAD_ONCE_INIT;
static pthread_rwlock_t sysinfo_lock = PTHREAD_RWLOCK_INITIALIZER;
static time_t cached_boottime_sec = 0;
Expand Down
Loading