From aaf26b4686c744ab5169d60a210a909fdce21241 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 7 Jun 2026 03:47:54 +0800 Subject: [PATCH 1/4] Honor CLOCK_BOOTTIME and TFD_NONBLOCK in timerfd foot fails at startup under elfuse with "failed to create keyboard repeat timer FD: Inappropriate ioctl for device" because timerfd_create( CLOCK_BOOTTIME, TFD_CLOEXEC | TFD_NONBLOCK) returns -1. Two root causes: - The clockid allow-list only accepted CLOCK_REALTIME and CLOCK_MONOTONIC, so CLOCK_BOOTTIME (7) returned -EINVAL. Linux has no suspend-aware equivalent on macOS; treating BOOTTIME as MONOTONIC matches the existing translate_clockid mapping in src/syscall/time.c. - TFD_NONBLOCK was applied via fd_set_nonblock(kq), which issues fcntl(F_SETFL, O_NONBLOCK) on the kqueue host fd. macOS rejects that with ENOTTY (errno 25), so every timerfd_create(..., TFD_NONBLOCK) call failed regardless of clockid. The "Inappropriate ioctl for device" string foot logs is glibc strerror for ENOTTY leaking through. The non-blocking flag now lives in fd_table[gfd].linux_flags alongside the existing CLOEXEC bit, and timerfd_read consults that field after snapshotting it under fd_lock (order 3) before acquiring sfd_lock (order 5a). The lock-order snapshot matches the documented discipline in src/syscall/internal.h and the eventfd_dup_fd pattern. Tests cover create succeeding with CLOCK_BOOTTIME+TFD_NONBLOCK and an armed-but-unfired non-blocking read returning EAGAIN through the shadow rather than the unarmed-timer EAGAIN path. Verified against an ARM64 Linux host that read-before-fire on a non-blocking timerfd returns errno=EAGAIN, matching the elfuse behavior. Close #82 --- src/syscall/fd.c | 33 ++++++++++++++++++++++++++------- tests/test-timerfd.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/syscall/fd.c b/src/syscall/fd.c index f06b0d2..aa9ff9b 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -89,6 +89,12 @@ typedef struct { #define LINUX_CLOCK_REALTIME 0 #define LINUX_CLOCK_MONOTONIC 1 +/* Linux CLOCK_BOOTTIME counts time including suspend; macOS has no equivalent. + * timerfd_settime treats non-REALTIME slots as MONOTONIC for ABSTIME + * conversion, which matches translate_clockid() in time.c. + */ +#define LINUX_CLOCK_BOOTTIME 7 + static struct { int guest_fd; /* Guest fd (-1 if unused) */ int kq_fd; /* kqueue fd for this timer */ @@ -167,7 +173,8 @@ static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns) int64_t sys_timerfd_create(int clockid, int flags) { - if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC) + if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC && + clockid != LINUX_CLOCK_BOOTTIME) return -LINUX_EINVAL; if (flags & ~(LINUX_TFD_CLOEXEC | LINUX_TFD_NONBLOCK)) return -LINUX_EINVAL; @@ -176,8 +183,12 @@ int64_t sys_timerfd_create(int clockid, int flags) if (kq < 0) return linux_errno(); - if (((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) || - ((flags & LINUX_TFD_NONBLOCK) && fd_set_nonblock(kq) < 0)) { + /* macOS kqueue fds reject fcntl(F_SETFL, O_NONBLOCK) with ENOTTY, so + * track the non-blocking mode in fd_table[gfd].linux_flags below and + * let timerfd_read consult that field directly. F_SETFD CLOEXEC still + * works on a kqueue fd. + */ + if ((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) { close(kq); return linux_errno(); } @@ -204,7 +215,8 @@ int64_t sys_timerfd_create(int clockid, int flags) pthread_mutex_unlock(&sfd_lock); fd_table[gfd].linux_flags = - (flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) | + ((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0); return gfd; } @@ -396,6 +408,15 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) if (count < 8) return -LINUX_EINVAL; + /* Snapshot the NONBLOCK status under fd_lock before sfd_lock to match the + * documented lock order (fd_lock=3 < sfd_lock=5a). The kqueue host fd + * rejects fcntl(F_SETFL, O_NONBLOCK) on macOS, so the flag lives in + * fd_table[guest_fd].linux_flags rather than on the host fd. + */ + pthread_mutex_lock(&fd_lock); + bool nonblock = fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK; + pthread_mutex_unlock(&fd_lock); + pthread_mutex_lock(&sfd_lock); int slot = timerfd_find(guest_fd); if (slot < 0) { @@ -409,9 +430,7 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) timerfd_drain_pending_locked(slot); if (timerfd_state[slot].expirations == 0) { - /* No events yet; check if non-blocking */ - int fl = fcntl(kq, F_GETFL); - if (fl >= 0 && (fl & O_NONBLOCK)) { + if (nonblock) { pthread_mutex_unlock(&sfd_lock); return -LINUX_EAGAIN; } diff --git a/tests/test-timerfd.c b/tests/test-timerfd.c index 162b9c2..f8f00c7 100644 --- a/tests/test-timerfd.c +++ b/tests/test-timerfd.c @@ -148,6 +148,42 @@ int main(void) FAIL("timerfd_create failed"); } + /* CLOCK_BOOTTIME backs foot's keyboard repeat timer; on macOS it + * resolves to CLOCK_MONOTONIC since no boottime equivalent exists. + * Paired with TFD_NONBLOCK to mirror foot's actual call. + */ + TEST("create accepts CLOCK_BOOTTIME with TFD_NONBLOCK"); + { + int fd = timerfd_create(CLOCK_BOOTTIME, TFD_CLOEXEC | TFD_NONBLOCK); + EXPECT_TRUE(fd >= 0, "timerfd_create(CLOCK_BOOTTIME) failed"); + if (fd >= 0) + close(fd); + } + + /* NONBLOCK shadow: the kqueue host fd cannot carry O_NONBLOCK, so the + * flag lives in fd_table[gfd].linux_flags. Arm with a 1s deadline so + * the read is non-trivially blocked, then verify EAGAIN comes from the + * NONBLOCK path; an unarmed timer would also return EAGAIN, so the arm + * step is what makes this a real probe of the shadow. + */ + TEST("NONBLOCK: armed-but-unfired returns EAGAIN"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (fd >= 0) { + struct itimerspec its = {.it_value = {.tv_sec = 1, .tv_nsec = 0}, + .it_interval = {0, 0}}; + EXPECT_TRUE(timerfd_settime(fd, 0, &its, NULL) == 0, + "settime failed"); + uint64_t count = 0; + errno = 0; + ssize_t r = read(fd, &count, sizeof(count)); + EXPECT_TRUE(r == -1 && errno == EAGAIN, + "non-blocking read did not return EAGAIN"); + close(fd); + } else + FAIL("timerfd_create"); + } + TEST("timerfd_create invalid clock"); EXPECT_ERRNO(timerfd_create(9999, 0), EINVAL, "expected EINVAL"); From 43a723c0f45e78d84813bde2c92b8a63c83fc0b2 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 7 Jun 2026 03:51:02 +0800 Subject: [PATCH 2/4] Match Linux setfl semantics for timerfd F_{GET,SET}FL Without an FD_TIMERFD branch, fcntl(timerfd, F_GETFL) routed to the kqueue host fd and surfaced macOS-side flags, while F_SETFL hit the same ENOTTY rejection that broke the create path. Wire both branches through fd_table[fd].linux_flags so the shadow is the source of truth. F_GETFL returns O_RDWR plus the writable bits Linux honors on a timerfd inode: O_APPEND, O_NONBLOCK, and O_NOATIME (Linux's full SETFL_MASK minus O_ASYNC, which timerfd_fops drops because it lacks ->fasync, and minus O_DIRECT). O_RDWR is hard-coded because Linux opens the inode O_RDWR via anon_inode_getfd in fs/timerfd.c, and is also stamped into linux_flags at create time so the access mode is visible to other consumers. F_SETFL accepts O_APPEND, O_NONBLOCK, and O_NOATIME, silently drops access mode / CLOEXEC / non-writable bits matching how Linux F_SETFL treats them, and returns -EINVAL on O_DIRECT mirroring Linux's vfs_set_direct_io_flags rejecting it on an inode without FMODE_CAN_ODIRECT. dup(2) preservation: the install_fd_alias_metadata_atomic preserved mask now includes LINUX_O_NONBLOCK, and fuse_dup_fd does the same. Without this a duplicated non-blocking fd of any kind that stores NONBLOCK in linux_flags rather than on the host fd (FUSE files, now also timerfds) silently reverted to blocking. abi.h adds LINUX_O_ASYNC=0x2000 and LINUX_O_NOATIME=0x40000 (octal 020000 and 01000000 from asm-generic/fcntl.h). The translate helpers now use the LINUX_O_ASYNC symbol instead of the 0x2000 inline literal. Tests cover the full fcntl coherence surface: O_RDWR access mode is visible, accepted-plus-stray F_SETFL persists O_APPEND while dropping O_WRONLY and O_CLOEXEC, O_DIRECT returns EINVAL, O_NOATIME round-trips, and dup preserves both NONBLOCK and the access mode. --- src/syscall/abi.h | 2 ++ src/syscall/fd.c | 6 +++- src/syscall/fs.c | 37 ++++++++++++++++++++++- src/syscall/fuse.c | 11 +++++-- src/syscall/translate.c | 6 ++-- tests/test-timerfd.c | 65 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 119 insertions(+), 8 deletions(-) diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 52874a4..ac58574 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -378,6 +378,7 @@ typedef struct { #define LINUX_O_TRUNC 0x0200 #define LINUX_O_APPEND 0x0400 #define LINUX_O_NONBLOCK 0x0800 +#define LINUX_O_ASYNC 0x2000 /* aarch64-linux open flag values (from asm-generic/fcntl.h). * These differ from x86_64-linux values. */ @@ -385,6 +386,7 @@ typedef struct { #define LINUX_O_NOFOLLOW 0x8000 /* 0100000 octal */ #define LINUX_O_DIRECT 0x10000 /* 0200000 octal */ #define LINUX_O_LARGEFILE 0x20000 /* 0400000 octal, ignored on LP64 */ +#define LINUX_O_NOATIME 0x40000 /* 01000000 octal */ #define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */ #define LINUX_O_PATH 0x200000 /* 010000000 octal */ diff --git a/src/syscall/fd.c b/src/syscall/fd.c index aa9ff9b..f9cbf75 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -214,8 +214,12 @@ int64_t sys_timerfd_create(int clockid, int flags) timerfd_state[slot].clockid = clockid; pthread_mutex_unlock(&sfd_lock); + /* Linux opens the timerfd inode O_RDWR (anon_inode_getfd in + * fs/timerfd.c). Stamp O_RDWR into linux_flags so the F_GETFL branch + * below can surface the access mode without re-deriving it. + */ fd_table[gfd].linux_flags = - ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) | + LINUX_O_RDWR | ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) | ((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0); return gfd; } diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 1b7f4d3..0566da1 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -501,10 +501,15 @@ static bool install_fd_alias_metadata_atomic(int dst_fd, int linux_flags, DIR *dir) { + /* LINUX_O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2). + * Required for FD_TIMERFD (and any other type that stores NONBLOCK in + * linux_flags rather than on the host fd) so a duplicated non-blocking + * timerfd does not silently turn blocking. + */ int preserved_flags = src_snap->linux_flags & (LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | - LINUX_O_DIRECT | LINUX_O_LARGEFILE); + LINUX_O_DIRECT | LINUX_O_LARGEFILE | LINUX_O_NONBLOCK); int final_flags = preserved_flags | linux_flags; bool installed = false; @@ -701,6 +706,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) case 3: { /* F_GETFL */ if (fuse_fd) return fd_table[fd].linux_flags; + /* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits + * the kernel honors. Surface only those bits from the shadow rather + * than echoing arbitrary linux_flags bits so stray F_SETFL args + * cannot leak through here. O_ASYNC stays off because timerfd_fops + * lacks ->fasync, so generic_setfl drops it. + */ + if (fd_type == FD_TIMERFD) + return LINUX_O_RDWR | + (fd_table[fd].linux_flags & + (LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME)); fd_entry_t snap; if (!fd_snapshot(fd, &snap)) return -LINUX_EBADF; @@ -734,6 +749,26 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE)); return 0; } + /* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's + * file-status word in the linux_flags shadow. Of Linux's writable + * status flags (O_APPEND, O_ASYNC, O_DIRECT, O_NOATIME, O_NONBLOCK) + * the timerfd kernel object honors O_APPEND, O_NONBLOCK, and + * O_NOATIME. O_ASYNC is silently dropped (timerfd_fops lacks + * ->fasync). O_DIRECT returns -EINVAL because the inode lacks + * FMODE_CAN_ODIRECT. Bits outside the writable set (access mode, + * CLOEXEC, O_PATH/DIRECTORY/NOFOLLOW/etc.) are silently ignored, + * matching how Linux F_SETFL drops them. + */ + if (fd_type == FD_TIMERFD) { + if ((int) arg & LINUX_O_DIRECT) + return -LINUX_EINVAL; + const int setfl_mask = + LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME; + fd_table[fd].linux_flags = + (fd_table[fd].linux_flags & ~setfl_mask) | + ((int) arg & setfl_mask); + return 0; + } host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) return -LINUX_EBADF; diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c index adab4a8..39de346 100644 --- a/src/syscall/fuse.c +++ b/src/syscall/fuse.c @@ -2607,9 +2607,14 @@ int fuse_dup_fd(int src_fd, } } - int preserved_flags = fd_table[src_fd].linux_flags & - (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | - LINUX_O_DIRECT | LINUX_O_LARGEFILE); + /* O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2); without + * it a duplicated non-blocking FUSE fd would silently become blocking + * because nothing else carries the flag forward. + */ + int preserved_flags = + fd_table[src_fd].linux_flags & + (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | + LINUX_O_LARGEFILE | LINUX_O_NONBLOCK); fd_table[guest_fd].linux_flags = preserved_flags | linux_flags; pthread_mutex_unlock(&fuse_lock); return guest_fd; diff --git a/src/syscall/translate.c b/src/syscall/translate.c index e3cf9a8..018690d 100644 --- a/src/syscall/translate.c +++ b/src/syscall/translate.c @@ -219,7 +219,7 @@ int mac_to_linux_status_flags(int mac_flags) if (mac_flags & O_APPEND) linux_flags |= LINUX_O_APPEND; if (mac_flags & O_ASYNC) - linux_flags |= 0x2000; /* LINUX_O_ASYNC */ + linux_flags |= LINUX_O_ASYNC; return linux_flags; } @@ -233,7 +233,7 @@ int linux_to_mac_status_flags(int linux_flags) mac_flags |= O_NONBLOCK; if (linux_flags & LINUX_O_APPEND) mac_flags |= O_APPEND; - if (linux_flags & 0x2000) - mac_flags |= O_ASYNC; /* LINUX_O_ASYNC */ + if (linux_flags & LINUX_O_ASYNC) + mac_flags |= O_ASYNC; return mac_flags; } diff --git a/tests/test-timerfd.c b/tests/test-timerfd.c index f8f00c7..0f58057 100644 --- a/tests/test-timerfd.c +++ b/tests/test-timerfd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "test-harness.h" @@ -184,6 +185,70 @@ int main(void) FAIL("timerfd_create"); } + /* fcntl coherence: F_GETFL must surface O_RDWR plus the writable bits + * Linux honors on a timerfd (O_APPEND, O_NONBLOCK, O_NOATIME). + * F_SETFL persists the writable bits and silently drops everything + * else; O_DIRECT returns EINVAL. + */ + TEST("fcntl F_GETFL/F_SETFL match Linux setfl semantics"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (fd >= 0) { + int fl = fcntl(fd, F_GETFL); + EXPECT_TRUE( + fl >= 0 && (fl & O_NONBLOCK) && (fl & O_ACCMODE) == O_RDWR, + "F_GETFL did not surface O_RDWR | O_NONBLOCK"); + + /* Stray O_CLOEXEC and O_WRONLY must not land in the shadow. */ + EXPECT_TRUE( + fcntl(fd, F_SETFL, + O_APPEND | O_NONBLOCK | O_WRONLY | O_CLOEXEC) == 0, + "F_SETFL accepted-plus-stray bits failed"); + fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & O_APPEND) && (fl & O_NONBLOCK) && + (fl & O_ACCMODE) == O_RDWR && !(fl & O_CLOEXEC), + "F_GETFL leaked stray F_SETFL bits"); + + errno = 0; + EXPECT_TRUE( + fcntl(fd, F_SETFL, fl | O_DIRECT) == -1 && errno == EINVAL, + "F_SETFL accepted unsupported O_DIRECT"); + + /* Round-trip O_NOATIME. */ + EXPECT_TRUE(fcntl(fd, F_SETFL, O_NOATIME) == 0, + "F_SETFL O_NOATIME failed"); + fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & O_NOATIME), + "F_GETFL did not surface O_NOATIME"); + close(fd); + } else + FAIL("timerfd_create"); + } + + /* dup must carry the linux_flags shadow's NONBLOCK bit through; without + * the preserved-mask fix the new fd's F_GETFL would report blocking. + * (The timerfd slot table is keyed by guest_fd with no alias support, + * so read/settime through the dup return EBADF -- that pre-existing + * limitation is outside #82's scope.) + */ + TEST("dup preserves O_NONBLOCK and O_RDWR in linux_flags"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (fd >= 0) { + int dfd = dup(fd); + EXPECT_TRUE(dfd >= 0, "dup failed"); + if (dfd >= 0) { + int fl = fcntl(dfd, F_GETFL); + EXPECT_TRUE( + fl >= 0 && (fl & O_NONBLOCK) && (fl & O_ACCMODE) == O_RDWR, + "dup lost O_NONBLOCK or O_RDWR"); + close(dfd); + } + close(fd); + } else + FAIL("timerfd_create"); + } + TEST("timerfd_create invalid clock"); EXPECT_ERRNO(timerfd_create(9999, 0), EINVAL, "expected EINVAL"); From b526ad77d8a72edf39a7f7a763d6a9ea80f9a59d Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 7 Jun 2026 03:51:43 +0800 Subject: [PATCH 3/4] Preserve access mode in F_SETFL on FUSE fds The FUSE F_SETFL branch's preserved mask omitted LINUX_O_ACCMODE, while the assignment OR'd in arg bits outside the preserved set. As a result, fcntl(fuse_rdwr_fd, F_SETFL, 0) silently turned an O_RDWR FUSE shadow into O_RDONLY, and a subsequent fcntl(fd, F_GETFL) reported the wrong access mode -- breaking the Linux contract that F_SETFL cannot change the access mode. Add LINUX_O_ACCMODE to both the preserved mask and the strip applied to the incoming arg, matching how Linux generic_setfl() preserves the access mode bits outside its SETFL_MASK. --- src/syscall/fs.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 0566da1..8687e3c 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -738,15 +738,20 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) case 4: /* F_SETFL */ { if (fuse_fd) { - int preserved = - fd_table[fd].linux_flags & - (LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | - LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); + /* Preserve LINUX_O_ACCMODE: F_SETFL is not allowed to change the + * access mode in the Linux kernel, and without preserving it + * here a stray F_SETFL(0) would silently flip an O_RDWR FUSE + * shadow to O_RDONLY, surfacing the wrong mode through F_GETFL. + */ + int preserved = fd_table[fd].linux_flags & + (LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH | + LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | + LINUX_O_DIRECT | LINUX_O_LARGEFILE); fd_table[fd].linux_flags = - preserved | - ((int) arg & - ~(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | - LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE)); + preserved | ((int) arg & ~(LINUX_O_ACCMODE | LINUX_O_CLOEXEC | + LINUX_O_PATH | LINUX_O_DIRECTORY | + LINUX_O_NOFOLLOW | LINUX_O_DIRECT | + LINUX_O_LARGEFILE)); return 0; } /* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's From 0a2199c6f2a43cacab9029a8caa022c2141659f0 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 7 Jun 2026 03:56:25 +0800 Subject: [PATCH 4/4] Serialize linux_flags writes on fd_lock Several callers wrote fd_table[gfd].linux_flags under different locks or none at all, so a concurrent fcntl(F_SETFL/F_SETFD) on fd_lock could race a creator's bare assignment. sys_fcntl read the slot's type and flags outside fd_lock and mutated them without revalidation, so a close+reopen between the read and the write could update an unrelated fd. This commit unifies both concerns under fd_lock. fd_publish_linux_flags helper New fdtable helper takes fd_lock around a single linux_flags write. Replaces bare assignments in sys_timerfd_create, sys_eventfd, sys_signalfd, sys_inotify_init1, the FUSE dev mount, and fuse_open. fuse_dup_fd takes fd_lock once for both the source read and the destination write so the preserved-flags snapshot stays consistent with a racing F_SETFL on either fd. The result: every write to fd_table[*].linux_flags is now serialized on the same lock, with no fuse_lock<->fd_lock nesting introduced. sys_fcntl snapshot-then-revalidate sys_fcntl now takes a single fd_snapshot at entry and uses it for F_GETFD, F_GETFL, and F_DUPFD source reads. F_SETFD and the FUSE / timerfd F_SETFL writers reacquire fd_lock and revalidate against fd_snap.generation before mutating linux_flags. fd_alloc bumps a monotonic generation counter per slot reuse, so close+reopen between snapshot and lock is caught and returns EBADF rather than mutating an unrelated fd. The timerfd F_SETFL O_DIRECT EINVAL check moves inside the lock so a stale-snapshot race cannot report EINVAL based on a fd that is no longer a timerfd; the revalidation returns EBADF first instead. A new test exercises the cross-cutting fd_lock RMW: F_SETFL stamps the writable status bits, then F_SETFD toggles CLOEXEC, and F_GETFL must still surface the status bits unperturbed. --- src/syscall/fd.c | 15 +++++----- src/syscall/fdtable.c | 9 ++++++ src/syscall/fs.c | 68 ++++++++++++++++++++++++++++++++---------- src/syscall/fuse.c | 20 +++++++++++-- src/syscall/inotify.c | 2 +- src/syscall/internal.h | 9 ++++++ tests/test-timerfd.c | 27 +++++++++++++++++ 7 files changed, 124 insertions(+), 26 deletions(-) diff --git a/src/syscall/fd.c b/src/syscall/fd.c index f9cbf75..5c53f21 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -218,9 +218,10 @@ int64_t sys_timerfd_create(int clockid, int flags) * fs/timerfd.c). Stamp O_RDWR into linux_flags so the F_GETFL branch * below can surface the access mode without re-deriving it. */ - fd_table[gfd].linux_flags = - LINUX_O_RDWR | ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) | - ((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0); + fd_publish_linux_flags( + gfd, LINUX_O_RDWR | + ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) | + ((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0)); return gfd; } @@ -659,8 +660,8 @@ int64_t sys_eventfd2(unsigned int initval, int flags) eventfd_owner[gfd] = slot; pthread_mutex_unlock(&sfd_lock); - fd_table[gfd].linux_flags = - (flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + fd_publish_linux_flags(gfd, + (flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0); /* If initial counter > 0, make the pipe readable so poll sees it */ if (initval > 0) { @@ -1083,8 +1084,8 @@ int64_t sys_signalfd4(guest_t *g, signalfd_state[slot].nonblock = (flags & LINUX_SFD_NONBLOCK) ? 1 : 0; pthread_mutex_unlock(&sfd_lock); - fd_table[gfd].linux_flags = - (flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + fd_publish_linux_flags(gfd, + (flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0); return gfd; } diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c index 3c061b7..a072cfb 100644 --- a/src/syscall/fdtable.c +++ b/src/syscall/fdtable.c @@ -404,6 +404,15 @@ int fd_get_type(int guest_fd) return type; } +void fd_publish_linux_flags(int guest_fd, int linux_flags) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return; + pthread_mutex_lock(&fd_lock); + fd_table[guest_fd].linux_flags = linux_flags; + pthread_mutex_unlock(&fd_lock); +} + /* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed * by type. Each slot defaults to NULL (no per-type cleanup). Modules that * own a type call fd_register_cleanup() at init time; dup and fork-restore diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 8687e3c..004f3b2 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -668,7 +668,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -LINUX_EBADF; - int fd_type = fd_table[fd].type; + /* Snapshot the slot under fd_lock once; readers use fd_snap below, and + * writers reacquire fd_lock and revalidate against fd_snap.generation + * so a close+reopen between the snapshot and the RMW returns EBADF + * instead of mutating an unrelated fd. + */ + fd_entry_t fd_snap; + if (!fd_snapshot(fd, &fd_snap)) + return -LINUX_EBADF; + + int fd_type = fd_snap.type; bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE || fd_type == FD_FUSE_DIR); @@ -681,7 +690,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if ((int) arg < 0) { return -LINUX_EINVAL; } - int dup_flags = fd_table[fd].linux_flags & ~LINUX_O_CLOEXEC; + int dup_flags = fd_snap.linux_flags & ~LINUX_O_CLOEXEC; if (cmd == 1030) dup_flags |= LINUX_O_CLOEXEC; int gfd = duplicate_guest_fd(fd, (int) arg, -1, false, dup_flags); @@ -695,17 +704,28 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) return gfd; } case 1: /* F_GETFD */ - return (fd_table[fd].linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC - : 0; + return (fd_snap.linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC : 0; case 2: /* F_SETFD */ + /* Hold fd_lock across the read-modify-write so the CLOEXEC flip is + * atomic against a concurrent F_SETFL on the same shadow word and + * against any fd_lock-protected reader. Revalidate against the + * snapshot generation so a close+reopen returns EBADF. + */ + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type == FD_CLOSED || + fd_table[fd].generation != fd_snap.generation) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } if ((int) arg & LINUX_FD_CLOEXEC) fd_table[fd].linux_flags |= LINUX_O_CLOEXEC; else fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC; + pthread_mutex_unlock(&fd_lock); return 0; case 3: { /* F_GETFL */ if (fuse_fd) - return fd_table[fd].linux_flags; + return fd_snap.linux_flags; /* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits * the kernel honors. Surface only those bits from the shadow rather * than echoing arbitrary linux_flags bits so stray F_SETFL args @@ -714,11 +734,8 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) */ if (fd_type == FD_TIMERFD) return LINUX_O_RDWR | - (fd_table[fd].linux_flags & + (fd_snap.linux_flags & (LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME)); - fd_entry_t snap; - if (!fd_snapshot(fd, &snap)) - return -LINUX_EBADF; host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) return -LINUX_EBADF; @@ -727,10 +744,10 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if (mac_fl < 0) return linux_errno(); int linux_fl = mac_to_linux_status_flags(mac_fl); - if (snap.type == FD_REGULAR || snap.type == FD_DIR || - snap.type == FD_PATH || snap.type == FD_URANDOM) - linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3); - linux_fl |= snap.linux_flags & + if (fd_snap.type == FD_REGULAR || fd_snap.type == FD_DIR || + fd_snap.type == FD_PATH || fd_snap.type == FD_URANDOM) + linux_fl = (linux_fl & ~O_ACCMODE) | (fd_snap.linux_flags & 3); + linux_fl |= fd_snap.linux_flags & (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); return linux_fl; @@ -742,7 +759,18 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) * access mode in the Linux kernel, and without preserving it * here a stray F_SETFL(0) would silently flip an O_RDWR FUSE * shadow to O_RDONLY, surfacing the wrong mode through F_GETFL. + * + * Hold fd_lock across the read-modify-write so the update is + * atomic against a concurrent F_SETFD and any fd_lock-protected + * reader. Revalidate against the snapshot generation so a + * close+reopen returns EBADF. */ + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type != fd_type || + fd_table[fd].generation != fd_snap.generation) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } int preserved = fd_table[fd].linux_flags & (LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | @@ -752,6 +780,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE)); + pthread_mutex_unlock(&fd_lock); return 0; } /* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's @@ -765,13 +794,22 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) * matching how Linux F_SETFL drops them. */ if (fd_type == FD_TIMERFD) { - if ((int) arg & LINUX_O_DIRECT) - return -LINUX_EINVAL; const int setfl_mask = LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME; + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type != FD_TIMERFD || + fd_table[fd].generation != fd_snap.generation) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } + if ((int) arg & LINUX_O_DIRECT) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EINVAL; + } fd_table[fd].linux_flags = (fd_table[fd].linux_flags & ~setfl_mask) | ((int) arg & setfl_mask); + pthread_mutex_unlock(&fd_lock); return 0; } host_fd_ref_t host_ref; diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c index 39de346..17c2e3f 100644 --- a/src/syscall/fuse.c +++ b/src/syscall/fuse.c @@ -1329,8 +1329,11 @@ int fuse_proc_open(int linux_flags) errno = EMFILE; return -1; } - fd_table[guest_fd].linux_flags = linux_flags; pthread_mutex_unlock(&fuse_lock); + /* Publish under fd_lock so the write is on the same lock domain as + * sys_fcntl(F_SETFL/F_SETFD), not stranded behind fuse_lock. + */ + fd_publish_linux_flags(guest_fd, linux_flags); return guest_fd; } @@ -1897,8 +1900,11 @@ int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode) fd_mark_closed(guest_fd); return -LINUX_EMFILE; } - fd_table[guest_fd].linux_flags = linux_flags; pthread_mutex_unlock(&fuse_lock); + /* Publish under fd_lock so the open's flags land on the same lock + * domain that sys_fcntl(F_SETFL/F_SETFD) uses. + */ + fd_publish_linux_flags(guest_fd, linux_flags); return guest_fd; } @@ -2607,16 +2613,24 @@ int fuse_dup_fd(int src_fd, } } + pthread_mutex_unlock(&fuse_lock); + /* O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2); without * it a duplicated non-blocking FUSE fd would silently become blocking * because nothing else carries the flag forward. + * + * Take fd_lock once for both the source read and the destination write + * so the dup snapshot is consistent with any concurrent F_SETFL on the + * source and so the destination publish cannot be overwritten by an + * early racing F_SETFL on the new slot. */ + pthread_mutex_lock(&fd_lock); int preserved_flags = fd_table[src_fd].linux_flags & (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE | LINUX_O_NONBLOCK); fd_table[guest_fd].linux_flags = preserved_flags | linux_flags; - pthread_mutex_unlock(&fuse_lock); + pthread_mutex_unlock(&fd_lock); return guest_fd; } diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index d9b54dd..6713a5e 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -402,7 +402,7 @@ int64_t sys_inotify_init1(int flags) memset(inst->watches, 0, sizeof(inst->watches)); pthread_mutex_unlock(&inotify_lock); - fd_table[gfd].linux_flags = (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + fd_publish_linux_flags(gfd, (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0); return gfd; } diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 4e3fa56..3ba5b51 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -109,6 +109,15 @@ int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out); */ int fd_get_type(int guest_fd); +/* Publish linux_flags for a guest fd under fd_lock. Use after fd_alloc when + * the creating syscall needs to set linux_flags atomically with respect to a + * concurrent fcntl(F_SETFL/F_SETFD) on the same slot. The fd_alloc-then- + * publish window is small (the new gfd is not communicated to other threads + * until the syscall returns) but the lock removes the structural race and + * keeps every linux_flags writer on one lock domain. + */ +void fd_publish_linux_flags(int guest_fd, int linux_flags); + /* Republish the EL1 urandom read fast-path bit for this fd from the current * fd_table type and access mode. Only readable /dev/urandom descriptors are * eligible for the bitmap. diff --git a/tests/test-timerfd.c b/tests/test-timerfd.c index 0f58057..6254d22 100644 --- a/tests/test-timerfd.c +++ b/tests/test-timerfd.c @@ -225,6 +225,33 @@ int main(void) FAIL("timerfd_create"); } + /* F_SETFD and F_SETFL touch the same linux_flags shadow but operate on + * disjoint bits. Toggling CLOEXEC via F_SETFD must not perturb the + * status bits surfaced by F_GETFL. Targets the new fd_lock-serialized + * RMW in both branches. + */ + TEST("F_SETFD does not perturb F_GETFL status bits"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, 0); + if (fd >= 0) { + int wanted = O_APPEND | O_NONBLOCK | O_NOATIME; + EXPECT_TRUE(fcntl(fd, F_SETFL, wanted) == 0, "F_SETFL failed"); + + EXPECT_TRUE(fcntl(fd, F_SETFD, FD_CLOEXEC) == 0, + "F_SETFD set failed"); + int fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & wanted) == wanted, + "F_SETFD CLOEXEC perturbed status bits"); + + EXPECT_TRUE(fcntl(fd, F_SETFD, 0) == 0, "F_SETFD clear failed"); + fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & wanted) == wanted, + "F_SETFD clear perturbed status bits"); + close(fd); + } else + FAIL("timerfd_create"); + } + /* dup must carry the linux_flags shadow's NONBLOCK bit through; without * the preserved-mask fix the new fd's F_GETFL would report blocking. * (The timerfd slot table is keyed by guest_fd with no alias support,