diff --git a/src/syscall/mem.c b/src/syscall/mem.c index ad584d6..06556f9 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -297,10 +297,10 @@ int64_t sys_mmap(guest_t *g, bool is_noreserve = is_anon && (flags & LINUX_MAP_NORESERVE) != 0; host_fd_ref_t backing_ref = {.fd = -1, .owned = 0}; int host_backing_fd = -1, track_backing_fd = -1; - int track_flags = is_anon - ? (LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS) - : ((flags & LINUX_MAP_SHARED) ? LINUX_MAP_SHARED - : LINUX_MAP_PRIVATE); + int track_flags = + ((flags & LINUX_MAP_SHARED) ? LINUX_MAP_SHARED : LINUX_MAP_PRIVATE); + if (is_anon) + track_flags |= LINUX_MAP_ANONYMOUS; /* Preserve MAP_NORESERVE in region metadata before merge checks run. */ if (is_noreserve) @@ -1018,6 +1018,29 @@ int64_t sys_mremap(guest_t *g, /* sys_madvise. */ +/* Returns true if [off, off+length) is fully covered by mapped regions. + * Mirrors Linux madvise_walk_vmas, which returns -ENOMEM whenever it would step + * over an unmapped sub-range. Caller holds mmap_lock. + */ +static bool madvise_range_mapped(const guest_t *g, + uint64_t off, + uint64_t length) +{ + uint64_t end = off + length; + uint64_t covered = off; + for (int i = 0; i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->end <= covered) + continue; + if (r->start > covered) + return false; + covered = r->end; + } + return covered >= end; +} + int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) { if (addr & 4095) @@ -1029,75 +1052,110 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) if (length == 0) return 0; + /* Range must lie within the guest IPA window. Linux returns -ENOMEM + * (not -EINVAL) for addresses outside the process address space — see + * madvise(2): "Addresses in the specified range are not currently + * mapped, or are outside the address space of the process." 
+ */ + uint64_t off = addr - g->ipa_base; + if (off > g->guest_size || length > g->guest_size - off) + return -LINUX_ENOMEM; + switch (advice) { case LINUX_MADV_DONTNEED: { - /* MADV_DONTNEED: zero the pages so next access sees zero-fill. - * Linux guarantees zero-fill on next access for anonymous pages. + /* MADV_DONTNEED: zero anon pages so next access sees zero-fill, + * restore file-backed pages from the current backing file contents. * Linux returns -ENOMEM if any part of the range is unmapped. + * + * Writable MAP_SHARED file-backed regions are preserved so elfuse + * does not overwrite unsynced in-memory writes with backing-file + * contents. Read-only MAP_SHARED mappings can be invalidated safely + * and should refault from the current file image. + * + * PROT_NONE anonymous regions still get zeroed so the guest's next + * mprotect-and-read sees zero-fill (Linux semantics: pages are + * detached, faulted in lazily as zero on re-grant). */ - uint64_t off = addr - g->ipa_base; - if (off > g->guest_size || length > g->guest_size - off) - return -LINUX_EINVAL; + if (!madvise_range_mapped(g, off, length)) + return -LINUX_ENOMEM; uint64_t end = off + length; - - /* Verify the entire range is covered by regions (Linux returns - * -ENOMEM for unmapped holes). Walk regions and check for gaps. - */ - uint64_t covered = off; - for (int i = 0; i < g->nregions; i++) { - const guest_region_t *r = &g->regions[i]; - if (r->start >= end) - break; - if (r->end <= covered) - continue; - if (r->start > covered) - return -LINUX_ENOMEM; /* Unmapped hole */ - covered = r->end; - } - if (covered < end) - return -LINUX_ENOMEM; /* Tail unmapped */ - - /* Anonymous pages become zero-fill. MAP_PRIVATE file mappings discard - * private pages so later reads see the backing file contents again. 
- */ for (int i = 0; i < g->nregions; i++) { const guest_region_t *r = &g->regions[i]; if (r->start >= end) break; if (r->end <= off) continue; - if (r->prot == LINUX_PROT_NONE) - continue; if (!(r->flags & LINUX_MAP_ANONYMOUS) && r->backing_fd < 0) continue; + if (r->shared && r->backing_fd >= 0 && (r->prot & LINUX_PROT_WRITE)) + continue; - /* Compute overlap with the requested range */ uint64_t zstart = (r->start > off) ? r->start : off; uint64_t zend = (r->end < end) ? r->end : end; memset((uint8_t *) g->host_base + zstart, 0, zend - zstart); if (!(r->flags & LINUX_MAP_ANONYMOUS)) { uint64_t file_off = r->offset + (zstart - r->start); - ssize_t nr = - pread(r->backing_fd, (uint8_t *) g->host_base + zstart, - zend - zstart, (off_t) file_off); - if (nr < 0) - return linux_errno(); + uint8_t *dst = (uint8_t *) g->host_base + zstart; + size_t remaining = zend - zstart; + while (remaining > 0) { + ssize_t nr = + pread(r->backing_fd, dst, remaining, (off_t) file_off); + if (nr < 0) { + if (errno == EINTR) + continue; + return linux_errno(); + } + if (nr == 0) + break; /* EOF: tail stays zero per mmap rules. */ + dst += nr; + file_off += nr; + remaining -= (size_t) nr; + } } } return 0; } + case LINUX_MADV_FREE: { + /* MADV_FREE: only valid for private anonymous mappings. Linux returns + * -EINVAL for any non-anonymous vma (vma_is_anonymous check), even if + * the region is tracked without a live backing fd. Subsequent reads may + * legally return either old data or zero, so a no-op satisfies the spec + * for the anon case. 
+ */ + if (!madvise_range_mapped(g, off, length)) + return -LINUX_ENOMEM; + + uint64_t end = off + length; + for (int i = 0; i < g->nregions; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->start >= end) + break; + if (r->end <= off) + continue; + if (!(r->flags & LINUX_MAP_ANONYMOUS) || + (r->flags & LINUX_MAP_SHARED)) + return -LINUX_EINVAL; + } + return 0; + } + case LINUX_MADV_NORMAL: case LINUX_MADV_RANDOM: case LINUX_MADV_SEQUENTIAL: case LINUX_MADV_WILLNEED: - case LINUX_MADV_FREE: case LINUX_MADV_HUGEPAGE: case LINUX_MADV_NOHUGEPAGE: case LINUX_MADV_COLD: case LINUX_MADV_PAGEOUT: - /* Advisory hints: no-op in emulation */ + /* Advisory hints: accept silently. Linux walks vmas and returns + * -ENOMEM for any unmapped sub-range; mirror that for fidelity. + * No host swap means PAGEOUT/COLD do not actually evict — keeping + * data in place is a stricter guarantee than Linux's. + */ + if (!madvise_range_mapped(g, off, length)) + return -LINUX_ENOMEM; return 0; default: diff --git a/tests/test-madvise.c b/tests/test-madvise.c index 0b153d4..cb518f2 100644 --- a/tests/test-madvise.c +++ b/tests/test-madvise.c @@ -1,15 +1,17 @@ -/* MADV_DONTNEED semantics tests +/* MADV_DONTNEED and madvise parity tests * * Copyright 2026 elfuse contributors * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Tests: MADV_DONTNEED zero-fill guarantee, page-aligned and - * multi-page ranges, advisory hints accepted. + * Tests: MADV_DONTNEED zero-fill guarantee, page-aligned and multi-page ranges, + * advisory hints accepted, MADV_FREE on anon vs file-backed, hole + * detection across all advices, unknown advice rejection. 
* * Syscalls exercised: mmap(222), madvise(233), munmap(215) */ +#include #include #include #include @@ -20,6 +22,22 @@ #include "test-harness.h" +#ifndef MADV_FREE +#define MADV_FREE 8 +#endif +#ifndef MADV_HUGEPAGE +#define MADV_HUGEPAGE 14 +#endif +#ifndef MADV_NOHUGEPAGE +#define MADV_NOHUGEPAGE 15 +#endif +#ifndef MADV_COLD +#define MADV_COLD 20 +#endif +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif + int passes = 0, fails = 0; /* Test 1: MADV_DONTNEED zeros a single page */ @@ -314,9 +332,450 @@ static void test_dontneed_file_backed(void) close(fd); } +/* Test 9: MADV_DONTNEED across an unmapped hole returns ENOMEM */ + +static void test_dontneed_hole(void) +{ + TEST("MADV_DONTNEED across unmapped hole"); + void *p = mmap(NULL, 4096 * 3, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + + /* Punch out the middle page so [p, p+12k) covers a hole */ + if (munmap((char *) p + 4096, 4096) != 0) { + FAIL("munmap failed"); + return; + } + + errno = 0; + int rc = madvise(p, 4096 * 3, MADV_DONTNEED); + EXPECT_TRUE(rc < 0 && errno == ENOMEM, "expected ENOMEM for hole"); + + munmap(p, 4096); + munmap((char *) p + 4096 * 2, 4096); +} + +/* Test 10: MADV_FREE on anonymous private mapping */ + +static void test_free_anon(void) +{ + TEST("MADV_FREE on anonymous mapping"); + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + + memset(p, 0x77, 4096); + int rc = madvise(p, 4096, MADV_FREE); + + /* Per spec the read may return either old data or zero, so we only + * check the syscall succeeds and a write/read round-trip works. 
+ */ + if (rc != 0) { + FAIL("madvise MADV_FREE rejected anon mapping"); + munmap(p, 4096); + return; + } + + /* Subsequent write must stick */ + memset(p, 0x88, 4096); + unsigned char *cp = (unsigned char *) p; + bool ok = true; + for (int i = 0; i < 4096; i++) { + if (cp[i] != 0x88) { + ok = false; + break; + } + } + EXPECT_TRUE(ok, "write after MADV_FREE did not persist"); + + munmap(p, 4096); +} + +/* Test 11: MADV_FREE on file-backed mapping returns EINVAL */ + +static void test_free_file_backed(void) +{ + TEST("MADV_FREE rejects file-backed mapping"); + + char template[] = "/tmp/elfuse-madv-free-XXXXXX"; + int fd = mkstemp(template); + if (fd < 0) { + FAIL("mkstemp failed"); + return; + } + unlink(template); + + char buf[4096]; + memset(buf, 0x11, sizeof(buf)); + if (write(fd, buf, sizeof(buf)) != (ssize_t) sizeof(buf)) { + FAIL("write"); + close(fd); + return; + } + + void *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap file"); + close(fd); + return; + } + + errno = 0; + int rc = madvise(p, 4096, MADV_FREE); + EXPECT_TRUE(rc < 0 && errno == EINVAL, + "expected EINVAL for file-backed MADV_FREE"); + + munmap(p, 4096); + close(fd); +} + +/* Test 12: MADV_FREE rejects private file mappings after close(fd) */ + +static void test_free_file_backed_closed_fd(void) +{ + TEST("MADV_FREE rejects closed-fd file mapping"); + + char template[] = "/tmp/elfuse-madv-free-closed-XXXXXX"; + int fd = mkstemp(template); + if (fd < 0) { + FAIL("mkstemp failed"); + return; + } + unlink(template); + + char buf[4096]; + memset(buf, 0x22, sizeof(buf)); + if (write(fd, buf, sizeof(buf)) != (ssize_t) sizeof(buf)) { + FAIL("write"); + close(fd); + return; + } + + void *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + if (p == MAP_FAILED) { + FAIL("mmap file"); + return; + } + + errno = 0; + int rc = madvise(p, 4096, MADV_FREE); + EXPECT_TRUE(rc < 0 && errno == EINVAL, + "expected EINVAL for closed-fd file-backed MADV_FREE"); 
+ + munmap(p, 4096); +} + +/* Test 13: MADV_HUGEPAGE / MADV_NOHUGEPAGE / MADV_COLD / MADV_PAGEOUT */ + +static void test_extra_hints(void) +{ + TEST("madvise hugepage/cold/pageout hints"); + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + + bool ok = true; + if (madvise(p, 4096, MADV_HUGEPAGE) != 0) + ok = false; + if (madvise(p, 4096, MADV_NOHUGEPAGE) != 0) + ok = false; + if (madvise(p, 4096, MADV_COLD) != 0) + ok = false; + if (madvise(p, 4096, MADV_PAGEOUT) != 0) + ok = false; + + EXPECT_TRUE(ok, "hint advice rejected"); + + /* Data must survive PAGEOUT/COLD on this no-swap host */ + memset(p, 0x99, 4096); + if (madvise(p, 4096, MADV_PAGEOUT) != 0) + ok = false; + unsigned char *cp = (unsigned char *) p; + for (int i = 0; i < 4096; i++) { + if (cp[i] != 0x99) { + ok = false; + break; + } + } + EXPECT_TRUE(ok, "data did not survive MADV_PAGEOUT"); + + munmap(p, 4096); +} + +/* Test 14: hint advices on a hole return ENOMEM (Linux parity) */ + +static void test_hint_hole(void) +{ + TEST("madvise hint on hole returns ENOMEM"); + void *p = mmap(NULL, 4096 * 3, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + munmap((char *) p + 4096, 4096); + + errno = 0; + int rc = madvise(p, 4096 * 3, MADV_WILLNEED); + EXPECT_TRUE(rc < 0 && errno == ENOMEM, "expected ENOMEM for WILLNEED hole"); + + munmap(p, 4096); + munmap((char *) p + 4096 * 2, 4096); +} + +/* Test 15: unknown advice value rejected */ + +static void test_unknown_advice(void) +{ + TEST("madvise unknown advice returns EINVAL"); + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + + errno = 0; + /* 9999 is not a defined advice */ + int rc = madvise(p, 4096, 9999); + EXPECT_TRUE(rc < 0 && errno == EINVAL, + "expected EINVAL for unknown advice"); + + 
munmap(p, 4096); +} + +/* Test 16: length=0 succeeds without error */ + +static void test_zero_length(void) +{ + TEST("madvise length=0 returns 0"); + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + + int rc = madvise(p, 0, MADV_DONTNEED); + EXPECT_TRUE(rc == 0, "length=0 should succeed"); + + munmap(p, 4096); +} + +/* Test 17: PROT_NONE region zero-fills on subsequent re-grant. */ + +static void test_dontneed_prot_none_zerofill(void) +{ + TEST("MADV_DONTNEED through PROT_NONE round trip"); + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + + memset(p, 0x42, 4096); + + if (mprotect(p, 4096, PROT_NONE) != 0) { + FAIL("mprotect PROT_NONE"); + munmap(p, 4096); + return; + } + + if (madvise(p, 4096, MADV_DONTNEED) != 0) { + FAIL("madvise"); + munmap(p, 4096); + return; + } + + if (mprotect(p, 4096, PROT_READ | PROT_WRITE) != 0) { + FAIL("mprotect restore"); + munmap(p, 4096); + return; + } + + unsigned char *cp = (unsigned char *) p; + bool ok = true; + for (int i = 0; i < 4096; i++) { + if (cp[i] != 0) { + ok = false; + break; + } + } + EXPECT_TRUE(ok, "PROT_NONE region not zeroed after restore"); + + munmap(p, 4096); +} + +/* Test 18: MADV_DONTNEED on MAP_SHARED file mapping preserves dirty data. + * + * Linux cannot discard pages from a shared page-cache mapping; in elfuse's + * MAP_SHARED-as-CoW model, an in-memory write must not be silently + * overwritten by stale file contents during DONTNEED. 
+ */ + +static void test_dontneed_shared_file_preserved(void) +{ + TEST("MADV_DONTNEED MAP_SHARED preserves in-memory writes"); + + char template[] = "/tmp/elfuse-madv-shared-XXXXXX"; + int fd = mkstemp(template); + if (fd < 0) { + FAIL("mkstemp"); + return; + } + unlink(template); + + char buf[4096]; + memset(buf, 0x11, sizeof(buf)); + if (write(fd, buf, sizeof(buf)) != (ssize_t) sizeof(buf)) { + FAIL("write"); + close(fd); + return; + } + + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap shared"); + close(fd); + return; + } + + /* Dirty the in-memory copy without msync. */ + memset(p, 0x33, 4096); + + if (madvise(p, 4096, MADV_DONTNEED) != 0) { + FAIL("madvise"); + munmap(p, 4096); + close(fd); + return; + } + + unsigned char *cp = (unsigned char *) p; + bool ok = true; + for (int i = 0; i < 4096; i++) { + if (cp[i] != 0x33) { + ok = false; + break; + } + } + EXPECT_TRUE(ok, "MAP_SHARED dirty data lost after DONTNEED"); + + munmap(p, 4096); + close(fd); +} + +/* Test 19: MADV_DONTNEED on clean read-only MAP_SHARED refaults from file. 
*/ + +static void test_dontneed_shared_file_reload(void) +{ + TEST("MADV_DONTNEED MAP_SHARED reloads clean file data"); + + char template[] = "/tmp/elfuse-madv-shared-reload-XXXXXX"; + int fd = mkstemp(template); + if (fd < 0) { + FAIL("mkstemp"); + return; + } + unlink(template); + + char buf[4096]; + memset(buf, 0x44, sizeof(buf)); + if (write(fd, buf, sizeof(buf)) != (ssize_t) sizeof(buf)) { + FAIL("write"); + close(fd); + return; + } + + void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap shared"); + close(fd); + return; + } + + memset(buf, 0x66, sizeof(buf)); + if (pwrite(fd, buf, sizeof(buf), 0) != (ssize_t) sizeof(buf)) { + FAIL("pwrite"); + munmap(p, 4096); + close(fd); + return; + } + + if (madvise(p, 4096, MADV_DONTNEED) != 0) { + FAIL("madvise"); + munmap(p, 4096); + close(fd); + return; + } + + unsigned char *cp = (unsigned char *) p; + bool ok = true; + for (int i = 0; i < 4096; i++) { + if (cp[i] != 0x66) { + ok = false; + break; + } + } + EXPECT_TRUE(ok, "MAP_SHARED clean data did not reload from file"); + + munmap(p, 4096); + close(fd); +} + +/* Test 20: MADV_FREE rejects shared anonymous mappings. */ + +static void test_free_shared_anon(void) +{ + TEST("MADV_FREE rejects shared anonymous mapping"); + + void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap shared anon"); + return; + } + + errno = 0; + int rc = madvise(p, 4096, MADV_FREE); + EXPECT_TRUE(rc < 0 && errno == EINVAL, + "expected EINVAL for MAP_SHARED|MAP_ANONYMOUS MADV_FREE"); + + munmap(p, 4096); +} + +/* Test 21: address outside the guest address space returns ENOMEM. + * + * Per madvise(2), addresses outside the process address space yield + * ENOMEM, not EINVAL. The high half of the 64-bit space is well past + * any IPA window elfuse can be configured with (1 TiB ceiling for + * 40-bit IPA, 64 GiB for 36-bit), so this address is unconditionally + * out of range. 
+ */ + +static void test_oob_address_enomem(void) +{ + TEST("madvise OOB address returns ENOMEM"); + void *p = (void *) 0xFFFFFFFFFFFF0000ULL; + errno = 0; + int rc = madvise(p, 4096, MADV_DONTNEED); + EXPECT_TRUE(rc < 0 && errno == ENOMEM, + "expected ENOMEM for address beyond guest space"); +} + int main(void) { - printf("test-madvise: MADV_DONTNEED semantics tests\n"); + printf("test-madvise: MADV_DONTNEED and parity tests\n"); test_dontneed_single(); test_dontneed_multi(); @@ -326,6 +785,19 @@ int main(void) test_dontneed_large(); test_dontneed_unaligned(); test_dontneed_file_backed(); + test_dontneed_hole(); + test_free_anon(); + test_free_file_backed(); + test_free_file_backed_closed_fd(); + test_extra_hints(); + test_hint_hole(); + test_unknown_advice(); + test_zero_length(); + test_dontneed_prot_none_zerofill(); + test_dontneed_shared_file_preserved(); + test_dontneed_shared_file_reload(); + test_free_shared_anon(); + test_oob_address_enomem(); SUMMARY("test-madvise"); return fails > 0 ? 1 : 0;