diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 52874a4..ac58574 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -378,6 +378,7 @@ typedef struct { #define LINUX_O_TRUNC 0x0200 #define LINUX_O_APPEND 0x0400 #define LINUX_O_NONBLOCK 0x0800 +#define LINUX_O_ASYNC 0x2000 /* aarch64-linux open flag values (from asm-generic/fcntl.h). * These differ from x86_64-linux values. */ @@ -385,6 +386,7 @@ typedef struct { #define LINUX_O_NOFOLLOW 0x8000 /* 0100000 octal */ #define LINUX_O_DIRECT 0x10000 /* 0200000 octal */ #define LINUX_O_LARGEFILE 0x20000 /* 0400000 octal, ignored on LP64 */ +#define LINUX_O_NOATIME 0x40000 /* 01000000 octal */ #define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */ #define LINUX_O_PATH 0x200000 /* 010000000 octal */ diff --git a/src/syscall/fd.c b/src/syscall/fd.c index f06b0d2..5c53f21 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -89,6 +89,12 @@ typedef struct { #define LINUX_CLOCK_REALTIME 0 #define LINUX_CLOCK_MONOTONIC 1 +/* Linux CLOCK_BOOTTIME counts time including suspend; macOS has no equivalent. + * timerfd_settime treats non-REALTIME slots as MONOTONIC for ABSTIME + * conversion, which matches translate_clockid() in time.c. + */ +#define LINUX_CLOCK_BOOTTIME 7 + static struct { int guest_fd; /* Guest fd (-1 if unused) */ int kq_fd; /* kqueue fd for this timer */ @@ -167,7 +173,8 @@ static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns) int64_t sys_timerfd_create(int clockid, int flags) { - if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC) + if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC && + clockid != LINUX_CLOCK_BOOTTIME) return -LINUX_EINVAL; if (flags & ~(LINUX_TFD_CLOEXEC | LINUX_TFD_NONBLOCK)) return -LINUX_EINVAL; @@ -176,8 +183,12 @@ int64_t sys_timerfd_create(int clockid, int flags) if (kq < 0) return linux_errno(); - if (((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) || - ((flags & LINUX_TFD_NONBLOCK) && fd_set_nonblock(kq) < 0)) { + /* macOS kqueue fds reject fcntl(F_SETFL, O_NONBLOCK) with ENOTTY, so + * track the non-blocking mode in fd_table[gfd].linux_flags below and + * let timerfd_read consult that field directly. F_SETFD CLOEXEC still + * works on a kqueue fd. + */ + if ((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) { close(kq); return linux_errno(); } @@ -203,8 +214,14 @@ int64_t sys_timerfd_create(int clockid, int flags) timerfd_state[slot].clockid = clockid; pthread_mutex_unlock(&sfd_lock); - fd_table[gfd].linux_flags = - (flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + /* Linux opens the timerfd inode O_RDWR (anon_inode_getfd in + * fs/timerfd.c). Stamp O_RDWR into linux_flags so the F_GETFL branch + * below can surface the access mode without re-deriving it. + */ + fd_publish_linux_flags( + gfd, LINUX_O_RDWR | + ((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) | + ((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0)); return gfd; } @@ -396,6 +413,15 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) if (count < 8) return -LINUX_EINVAL; + /* Snapshot the NONBLOCK status under fd_lock before sfd_lock to match the + * documented lock order (fd_lock=3 < sfd_lock=5a). The kqueue host fd + * rejects fcntl(F_SETFL, O_NONBLOCK) on macOS, so the flag lives in + * fd_table[guest_fd].linux_flags rather than on the host fd. + */ + pthread_mutex_lock(&fd_lock); + bool nonblock = fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK; + pthread_mutex_unlock(&fd_lock); + pthread_mutex_lock(&sfd_lock); int slot = timerfd_find(guest_fd); if (slot < 0) { @@ -409,9 +435,7 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) timerfd_drain_pending_locked(slot); if (timerfd_state[slot].expirations == 0) { - /* No events yet; check if non-blocking */ - int fl = fcntl(kq, F_GETFL); - if (fl >= 0 && (fl & O_NONBLOCK)) { + if (nonblock) { pthread_mutex_unlock(&sfd_lock); return -LINUX_EAGAIN; } @@ -636,8 +660,8 @@ int64_t sys_eventfd2(unsigned int initval, int flags) eventfd_owner[gfd] = slot; pthread_mutex_unlock(&sfd_lock); - fd_table[gfd].linux_flags = - (flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + fd_publish_linux_flags(gfd, + (flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0); /* If initial counter > 0, make the pipe readable so poll sees it */ if (initval > 0) { @@ -1060,8 +1084,8 @@ int64_t sys_signalfd4(guest_t *g, signalfd_state[slot].nonblock = (flags & LINUX_SFD_NONBLOCK) ? 1 : 0; pthread_mutex_unlock(&sfd_lock); - fd_table[gfd].linux_flags = - (flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + fd_publish_linux_flags(gfd, + (flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0); return gfd; } diff --git a/src/syscall/fdtable.c b/src/syscall/fdtable.c index 3c061b7..a072cfb 100644 --- a/src/syscall/fdtable.c +++ b/src/syscall/fdtable.c @@ -404,6 +404,15 @@ int fd_get_type(int guest_fd) return type; } +void fd_publish_linux_flags(int guest_fd, int linux_flags) +{ + if (!RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) + return; + pthread_mutex_lock(&fd_lock); + fd_table[guest_fd].linux_flags = linux_flags; + pthread_mutex_unlock(&fd_lock); +} + /* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed * by type. Each slot defaults to NULL (no per-type cleanup). Modules that * own a type call fd_register_cleanup() at init time; dup and fork-restore diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 1b7f4d3..004f3b2 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -501,10 +501,15 @@ static bool install_fd_alias_metadata_atomic(int dst_fd, int linux_flags, DIR *dir) { + /* LINUX_O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2). + * Required for FD_TIMERFD (and any other type that stores NONBLOCK in + * linux_flags rather than on the host fd) so a duplicated non-blocking + * timerfd does not silently turn blocking. + */ int preserved_flags = src_snap->linux_flags & (LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | - LINUX_O_DIRECT | LINUX_O_LARGEFILE); + LINUX_O_DIRECT | LINUX_O_LARGEFILE | LINUX_O_NONBLOCK); int final_flags = preserved_flags | linux_flags; bool installed = false; @@ -663,7 +668,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE)) return -LINUX_EBADF; - int fd_type = fd_table[fd].type; + /* Snapshot the slot under fd_lock once; readers use fd_snap below, and + * writers reacquire fd_lock and revalidate against fd_snap.generation + * so a close+reopen between the snapshot and the RMW returns EBADF + * instead of mutating an unrelated fd. + */ + fd_entry_t fd_snap; + if (!fd_snapshot(fd, &fd_snap)) + return -LINUX_EBADF; + + int fd_type = fd_snap.type; bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE || fd_type == FD_FUSE_DIR); @@ -676,7 +690,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if ((int) arg < 0) { return -LINUX_EINVAL; } - int dup_flags = fd_table[fd].linux_flags & ~LINUX_O_CLOEXEC; + int dup_flags = fd_snap.linux_flags & ~LINUX_O_CLOEXEC; if (cmd == 1030) dup_flags |= LINUX_O_CLOEXEC; int gfd = duplicate_guest_fd(fd, (int) arg, -1, false, dup_flags); @@ -690,20 +704,38 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) return gfd; } case 1: /* F_GETFD */ - return (fd_table[fd].linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC - : 0; + return (fd_snap.linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC : 0; case 2: /* F_SETFD */ + /* Hold fd_lock across the read-modify-write so the CLOEXEC flip is + * atomic against a concurrent F_SETFL on the same shadow word and + * against any fd_lock-protected reader. Revalidate against the + * snapshot generation so a close+reopen returns EBADF. + */ + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type == FD_CLOSED || + fd_table[fd].generation != fd_snap.generation) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } if ((int) arg & LINUX_FD_CLOEXEC) fd_table[fd].linux_flags |= LINUX_O_CLOEXEC; else fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC; + pthread_mutex_unlock(&fd_lock); return 0; case 3: { /* F_GETFL */ if (fuse_fd) - return fd_table[fd].linux_flags; - fd_entry_t snap; - if (!fd_snapshot(fd, &snap)) - return -LINUX_EBADF; + return fd_snap.linux_flags; + /* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits + * the kernel honors. Surface only those bits from the shadow rather + * than echoing arbitrary linux_flags bits so stray F_SETFL args + * cannot leak through here. O_ASYNC stays off because timerfd_fops + * lacks ->fasync, so generic_setfl drops it. + */ + if (fd_type == FD_TIMERFD) + return LINUX_O_RDWR | + (fd_snap.linux_flags & + (LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME)); host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) return -LINUX_EBADF; @@ -712,10 +744,10 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) if (mac_fl < 0) return linux_errno(); int linux_fl = mac_to_linux_status_flags(mac_fl); - if (snap.type == FD_REGULAR || snap.type == FD_DIR || - snap.type == FD_PATH || snap.type == FD_URANDOM) - linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3); - linux_fl |= snap.linux_flags & + if (fd_snap.type == FD_REGULAR || fd_snap.type == FD_DIR || + fd_snap.type == FD_PATH || fd_snap.type == FD_URANDOM) + linux_fl = (linux_fl & ~O_ACCMODE) | (fd_snap.linux_flags & 3); + linux_fl |= fd_snap.linux_flags & (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); return linux_fl; @@ -723,15 +755,61 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) case 4: /* F_SETFL */ { if (fuse_fd) { - int preserved = - fd_table[fd].linux_flags & - (LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | - LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE); + /* Preserve LINUX_O_ACCMODE: F_SETFL is not allowed to change the + * access mode in the Linux kernel, and without preserving it + * here a stray F_SETFL(0) would silently flip an O_RDWR FUSE + * shadow to O_RDONLY, surfacing the wrong mode through F_GETFL. + * + * Hold fd_lock across the read-modify-write so the update is + * atomic against a concurrent F_SETFD and any fd_lock-protected + * reader. Revalidate against the snapshot generation so a + * close+reopen returns EBADF. + */ + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type != fd_type || + fd_table[fd].generation != fd_snap.generation) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } + int preserved = fd_table[fd].linux_flags & + (LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH | + LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | + LINUX_O_DIRECT | LINUX_O_LARGEFILE); + fd_table[fd].linux_flags = + preserved | ((int) arg & ~(LINUX_O_ACCMODE | LINUX_O_CLOEXEC | + LINUX_O_PATH | LINUX_O_DIRECTORY | + LINUX_O_NOFOLLOW | LINUX_O_DIRECT | + LINUX_O_LARGEFILE)); + pthread_mutex_unlock(&fd_lock); + return 0; + } + /* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's + * file-status word in the linux_flags shadow. Of Linux's writable + * status flags (O_APPEND, O_ASYNC, O_DIRECT, O_NOATIME, O_NONBLOCK) + * the timerfd kernel object honors O_APPEND, O_NONBLOCK, and + * O_NOATIME. O_ASYNC is silently dropped (timerfd_fops lacks + * ->fasync). O_DIRECT returns -EINVAL because the inode lacks + * FMODE_CAN_ODIRECT. Bits outside the writable set (access mode, + * CLOEXEC, O_PATH/DIRECTORY/NOFOLLOW/etc.) are silently ignored, + * matching how Linux F_SETFL drops them. + */ + if (fd_type == FD_TIMERFD) { + const int setfl_mask = + LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME; + pthread_mutex_lock(&fd_lock); + if (fd_table[fd].type != FD_TIMERFD || + fd_table[fd].generation != fd_snap.generation) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EBADF; + } + if ((int) arg & LINUX_O_DIRECT) { + pthread_mutex_unlock(&fd_lock); + return -LINUX_EINVAL; + } fd_table[fd].linux_flags = - preserved | - ((int) arg & - ~(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY | - LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE)); + (fd_table[fd].linux_flags & ~setfl_mask) | + ((int) arg & setfl_mask); + pthread_mutex_unlock(&fd_lock); return 0; } host_fd_ref_t host_ref; diff --git a/src/syscall/fuse.c b/src/syscall/fuse.c index adab4a8..17c2e3f 100644 --- a/src/syscall/fuse.c +++ b/src/syscall/fuse.c @@ -1329,8 +1329,11 @@ int fuse_proc_open(int linux_flags) errno = EMFILE; return -1; } - fd_table[guest_fd].linux_flags = linux_flags; pthread_mutex_unlock(&fuse_lock); + /* Publish under fd_lock so the write is on the same lock domain as + * sys_fcntl(F_SETFL/F_SETFD), not stranded behind fuse_lock. + */ + fd_publish_linux_flags(guest_fd, linux_flags); return guest_fd; } @@ -1897,8 +1900,11 @@ int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode) fd_mark_closed(guest_fd); return -LINUX_EMFILE; } - fd_table[guest_fd].linux_flags = linux_flags; pthread_mutex_unlock(&fuse_lock); + /* Publish under fd_lock so the open's flags land on the same lock + * domain that sys_fcntl(F_SETFL/F_SETFD) uses. + */ + fd_publish_linux_flags(guest_fd, linux_flags); return guest_fd; } @@ -2607,11 +2613,24 @@ int fuse_dup_fd(int src_fd, } } - int preserved_flags = fd_table[src_fd].linux_flags & - (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | - LINUX_O_DIRECT | LINUX_O_LARGEFILE); - fd_table[guest_fd].linux_flags = preserved_flags | linux_flags; pthread_mutex_unlock(&fuse_lock); + + /* O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2); without + * it a duplicated non-blocking FUSE fd would silently become blocking + * because nothing else carries the flag forward. + * + * Take fd_lock once for both the source read and the destination write + * so the dup snapshot is consistent with any concurrent F_SETFL on the + * source and so the destination publish cannot be overwritten by an + * early racing F_SETFL on the new slot. + */ + pthread_mutex_lock(&fd_lock); + int preserved_flags = + fd_table[src_fd].linux_flags & + (LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT | + LINUX_O_LARGEFILE | LINUX_O_NONBLOCK); + fd_table[guest_fd].linux_flags = preserved_flags | linux_flags; + pthread_mutex_unlock(&fd_lock); return guest_fd; } diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index d9b54dd..6713a5e 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -402,7 +402,7 @@ int64_t sys_inotify_init1(int flags) memset(inst->watches, 0, sizeof(inst->watches)); pthread_mutex_unlock(&inotify_lock); - fd_table[gfd].linux_flags = (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0; + fd_publish_linux_flags(gfd, (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0); return gfd; } diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 4e3fa56..3ba5b51 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -109,6 +109,15 @@ int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out); */ int fd_get_type(int guest_fd); +/* Publish linux_flags for a guest fd under fd_lock. Use after fd_alloc when + * the creating syscall needs to set linux_flags atomically with respect to a + * concurrent fcntl(F_SETFL/F_SETFD) on the same slot. The fd_alloc-then- + * publish window is small (the new gfd is not communicated to other threads + * until the syscall returns) but the lock removes the structural race and + * keeps every linux_flags writer on one lock domain. + */ +void fd_publish_linux_flags(int guest_fd, int linux_flags); + /* Republish the EL1 urandom read fast-path bit for this fd from the current * fd_table type and access mode. Only readable /dev/urandom descriptors are * eligible for the bitmap. diff --git a/src/syscall/translate.c b/src/syscall/translate.c index e3cf9a8..018690d 100644 --- a/src/syscall/translate.c +++ b/src/syscall/translate.c @@ -219,7 +219,7 @@ int mac_to_linux_status_flags(int mac_flags) if (mac_flags & O_APPEND) linux_flags |= LINUX_O_APPEND; if (mac_flags & O_ASYNC) - linux_flags |= 0x2000; /* LINUX_O_ASYNC */ + linux_flags |= LINUX_O_ASYNC; return linux_flags; } @@ -233,7 +233,7 @@ int linux_to_mac_status_flags(int linux_flags) mac_flags |= O_NONBLOCK; if (linux_flags & LINUX_O_APPEND) mac_flags |= O_APPEND; - if (linux_flags & 0x2000) - mac_flags |= O_ASYNC; /* LINUX_O_ASYNC */ + if (linux_flags & LINUX_O_ASYNC) + mac_flags |= O_ASYNC; return mac_flags; } diff --git a/tests/test-timerfd.c b/tests/test-timerfd.c index 162b9c2..6254d22 100644 --- a/tests/test-timerfd.c +++ b/tests/test-timerfd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "test-harness.h" @@ -148,6 +149,133 @@ int main(void) FAIL("timerfd_create failed"); } + /* CLOCK_BOOTTIME backs foot's keyboard repeat timer; on macOS it + * resolves to CLOCK_MONOTONIC since no boottime equivalent exists. + * Paired with TFD_NONBLOCK to mirror foot's actual call. + */ + TEST("create accepts CLOCK_BOOTTIME with TFD_NONBLOCK"); + { + int fd = timerfd_create(CLOCK_BOOTTIME, TFD_CLOEXEC | TFD_NONBLOCK); + EXPECT_TRUE(fd >= 0, "timerfd_create(CLOCK_BOOTTIME) failed"); + if (fd >= 0) + close(fd); + } + + /* NONBLOCK shadow: the kqueue host fd cannot carry O_NONBLOCK, so the + * flag lives in fd_table[gfd].linux_flags. Arm with a 1s deadline so + * the read is non-trivially blocked, then verify EAGAIN comes from the + * NONBLOCK path; an unarmed timer would also return EAGAIN, so the arm + * step is what makes this a real probe of the shadow. + */ + TEST("NONBLOCK: armed-but-unfired returns EAGAIN"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (fd >= 0) { + struct itimerspec its = {.it_value = {.tv_sec = 1, .tv_nsec = 0}, + .it_interval = {0, 0}}; + EXPECT_TRUE(timerfd_settime(fd, 0, &its, NULL) == 0, + "settime failed"); + uint64_t count = 0; + errno = 0; + ssize_t r = read(fd, &count, sizeof(count)); + EXPECT_TRUE(r == -1 && errno == EAGAIN, + "non-blocking read did not return EAGAIN"); + close(fd); + } else + FAIL("timerfd_create"); + } + + /* fcntl coherence: F_GETFL must surface O_RDWR plus the writable bits + * Linux honors on a timerfd (O_APPEND, O_NONBLOCK, O_NOATIME). + * F_SETFL persists the writable bits and silently drops everything + * else; O_DIRECT returns EINVAL. + */ + TEST("fcntl F_GETFL/F_SETFL match Linux setfl semantics"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (fd >= 0) { + int fl = fcntl(fd, F_GETFL); + EXPECT_TRUE( + fl >= 0 && (fl & O_NONBLOCK) && (fl & O_ACCMODE) == O_RDWR, + "F_GETFL did not surface O_RDWR | O_NONBLOCK"); + + /* Stray O_CLOEXEC and O_WRONLY must not land in the shadow. */ + EXPECT_TRUE( + fcntl(fd, F_SETFL, + O_APPEND | O_NONBLOCK | O_WRONLY | O_CLOEXEC) == 0, + "F_SETFL accepted-plus-stray bits failed"); + fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & O_APPEND) && (fl & O_NONBLOCK) && + (fl & O_ACCMODE) == O_RDWR && !(fl & O_CLOEXEC), + "F_GETFL leaked stray F_SETFL bits"); + + errno = 0; + EXPECT_TRUE( + fcntl(fd, F_SETFL, fl | O_DIRECT) == -1 && errno == EINVAL, + "F_SETFL accepted unsupported O_DIRECT"); + + /* Round-trip O_NOATIME. */ + EXPECT_TRUE(fcntl(fd, F_SETFL, O_NOATIME) == 0, + "F_SETFL O_NOATIME failed"); + fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & O_NOATIME), + "F_GETFL did not surface O_NOATIME"); + close(fd); + } else + FAIL("timerfd_create"); + } + + /* F_SETFD and F_SETFL touch the same linux_flags shadow but operate on + * disjoint bits. Toggling CLOEXEC via F_SETFD must not perturb the + * status bits surfaced by F_GETFL. Targets the new fd_lock-serialized + * RMW in both branches. + */ + TEST("F_SETFD does not perturb F_GETFL status bits"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, 0); + if (fd >= 0) { + int wanted = O_APPEND | O_NONBLOCK | O_NOATIME; + EXPECT_TRUE(fcntl(fd, F_SETFL, wanted) == 0, "F_SETFL failed"); + + EXPECT_TRUE(fcntl(fd, F_SETFD, FD_CLOEXEC) == 0, + "F_SETFD set failed"); + int fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & wanted) == wanted, + "F_SETFD CLOEXEC perturbed status bits"); + + EXPECT_TRUE(fcntl(fd, F_SETFD, 0) == 0, "F_SETFD clear failed"); + fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl >= 0 && (fl & wanted) == wanted, + "F_SETFD clear perturbed status bits"); + close(fd); + } else + FAIL("timerfd_create"); + } + + /* dup must carry the linux_flags shadow's NONBLOCK bit through; without + * the preserved-mask fix the new fd's F_GETFL would report blocking. + * (The timerfd slot table is keyed by guest_fd with no alias support, + * so read/settime through the dup return EBADF -- that pre-existing + * limitation is outside #82's scope.) + */ + TEST("dup preserves O_NONBLOCK and O_RDWR in linux_flags"); + { + int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (fd >= 0) { + int dfd = dup(fd); + EXPECT_TRUE(dfd >= 0, "dup failed"); + if (dfd >= 0) { + int fl = fcntl(dfd, F_GETFL); + EXPECT_TRUE( + fl >= 0 && (fl & O_NONBLOCK) && (fl & O_ACCMODE) == O_RDWR, + "dup lost O_NONBLOCK or O_RDWR"); + close(dfd); + } + close(fd); + } else + FAIL("timerfd_create"); + } + TEST("timerfd_create invalid clock"); EXPECT_ERRNO(timerfd_create(9999, 0), EINVAL, "expected EINVAL");