Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,13 +378,15 @@ typedef struct {
#define LINUX_O_TRUNC 0x0200
#define LINUX_O_APPEND 0x0400
#define LINUX_O_NONBLOCK 0x0800
#define LINUX_O_ASYNC 0x2000
/* aarch64-linux open flag values (from asm-generic/fcntl.h).
* These differ from x86_64-linux values.
*/
#define LINUX_O_DIRECTORY 0x4000 /* 040000 octal */
#define LINUX_O_NOFOLLOW 0x8000 /* 0100000 octal */
#define LINUX_O_DIRECT 0x10000 /* 0200000 octal */
#define LINUX_O_LARGEFILE 0x20000 /* 0400000 octal, ignored on LP64 */
#define LINUX_O_NOATIME 0x40000 /* 01000000 octal */
#define LINUX_O_CLOEXEC 0x80000 /* 02000000 octal */
#define LINUX_O_PATH 0x200000 /* 010000000 octal */

Expand Down
48 changes: 36 additions & 12 deletions src/syscall/fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ typedef struct {
#define LINUX_CLOCK_REALTIME 0
#define LINUX_CLOCK_MONOTONIC 1

/* Linux CLOCK_BOOTTIME counts time including suspend; macOS has no equivalent.
* timerfd_settime treats non-REALTIME slots as MONOTONIC for ABSTIME
* conversion, which matches translate_clockid() in time.c.
*/
#define LINUX_CLOCK_BOOTTIME 7

static struct {
int guest_fd; /* Guest fd (-1 if unused) */
int kq_fd; /* kqueue fd for this timer */
Expand Down Expand Up @@ -167,7 +173,8 @@ static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns)

int64_t sys_timerfd_create(int clockid, int flags)
{
if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC)
if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC &&
clockid != LINUX_CLOCK_BOOTTIME)
return -LINUX_EINVAL;
if (flags & ~(LINUX_TFD_CLOEXEC | LINUX_TFD_NONBLOCK))
return -LINUX_EINVAL;
Expand All @@ -176,8 +183,12 @@ int64_t sys_timerfd_create(int clockid, int flags)
if (kq < 0)
return linux_errno();

if (((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) ||
((flags & LINUX_TFD_NONBLOCK) && fd_set_nonblock(kq) < 0)) {
/* macOS kqueue fds reject fcntl(F_SETFL, O_NONBLOCK) with ENOTTY, so
* track the non-blocking mode in fd_table[gfd].linux_flags below and
* let timerfd_read consult that field directly. F_SETFD CLOEXEC still
* works on a kqueue fd.
*/
if ((flags & LINUX_TFD_CLOEXEC) && fd_set_cloexec(kq) < 0) {
close(kq);
return linux_errno();
}
Expand All @@ -203,8 +214,14 @@ int64_t sys_timerfd_create(int clockid, int flags)
timerfd_state[slot].clockid = clockid;
pthread_mutex_unlock(&sfd_lock);

fd_table[gfd].linux_flags =
(flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
/* Linux opens the timerfd inode O_RDWR (anon_inode_getfd in
* fs/timerfd.c). Stamp O_RDWR into linux_flags so the F_GETFL branch
* below can surface the access mode without re-deriving it.
*/
fd_publish_linux_flags(
gfd, LINUX_O_RDWR |
((flags & LINUX_TFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0) |
((flags & LINUX_TFD_NONBLOCK) ? LINUX_O_NONBLOCK : 0));
return gfd;
}

Expand Down Expand Up @@ -396,6 +413,15 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
if (count < 8)
return -LINUX_EINVAL;

/* Snapshot the NONBLOCK status under fd_lock before sfd_lock to match the
* documented lock order (fd_lock=3 < sfd_lock=5a). The kqueue host fd
* rejects fcntl(F_SETFL, O_NONBLOCK) on macOS, so the flag lives in
* fd_table[guest_fd].linux_flags rather than on the host fd.
*/
pthread_mutex_lock(&fd_lock);
bool nonblock = fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK;
pthread_mutex_unlock(&fd_lock);

pthread_mutex_lock(&sfd_lock);
int slot = timerfd_find(guest_fd);
if (slot < 0) {
Expand All @@ -409,9 +435,7 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
timerfd_drain_pending_locked(slot);

if (timerfd_state[slot].expirations == 0) {
/* No events yet; check if non-blocking */
int fl = fcntl(kq, F_GETFL);
if (fl >= 0 && (fl & O_NONBLOCK)) {
if (nonblock) {
pthread_mutex_unlock(&sfd_lock);
return -LINUX_EAGAIN;
}
Expand Down Expand Up @@ -636,8 +660,8 @@ int64_t sys_eventfd2(unsigned int initval, int flags)
eventfd_owner[gfd] = slot;
pthread_mutex_unlock(&sfd_lock);

fd_table[gfd].linux_flags =
(flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
fd_publish_linux_flags(gfd,
(flags & LINUX_EFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0);

/* If initial counter > 0, make the pipe readable so poll sees it */
if (initval > 0) {
Expand Down Expand Up @@ -1060,8 +1084,8 @@ int64_t sys_signalfd4(guest_t *g,
signalfd_state[slot].nonblock = (flags & LINUX_SFD_NONBLOCK) ? 1 : 0;
pthread_mutex_unlock(&sfd_lock);

fd_table[gfd].linux_flags =
(flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
fd_publish_linux_flags(gfd,
(flags & LINUX_SFD_CLOEXEC) ? LINUX_O_CLOEXEC : 0);

return gfd;
}
Expand Down
9 changes: 9 additions & 0 deletions src/syscall/fdtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,15 @@ int fd_get_type(int guest_fd)
return type;
}

/* Store linux_flags into the fd_table entry for guest_fd while holding
 * fd_lock. The slot's previous linux_flags value is replaced, not merged.
 * A guest_fd that fails the RANGE_CHECK against FD_TABLE_SIZE is ignored.
 */
void fd_publish_linux_flags(int guest_fd, int linux_flags)
{
    if (RANGE_CHECK(guest_fd, 0, FD_TABLE_SIZE)) {
        /* Keep the store inside the fd_lock critical section. */
        pthread_mutex_lock(&fd_lock);
        fd_table[guest_fd].linux_flags = linux_flags;
        pthread_mutex_unlock(&fd_lock);
    }
}

/* Sized to cover all FD_* constants in abi.h plus a small headroom. Indexed
* by type. Each slot defaults to NULL (no per-type cleanup). Modules that
* own a type call fd_register_cleanup() at init time; dup and fork-restore
Expand Down
120 changes: 99 additions & 21 deletions src/syscall/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -501,10 +501,15 @@ static bool install_fd_alias_metadata_atomic(int dst_fd,
int linux_flags,
DIR *dir)
{
/* LINUX_O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2).
* Required for FD_TIMERFD (and any other type that stores NONBLOCK in
* linux_flags rather than on the host fd) so a duplicated non-blocking
* timerfd does not silently turn blocking.
*/
int preserved_flags =
src_snap->linux_flags &
(LINUX_O_ACCMODE | LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
LINUX_O_DIRECT | LINUX_O_LARGEFILE | LINUX_O_NONBLOCK);
int final_flags = preserved_flags | linux_flags;

bool installed = false;
Expand Down Expand Up @@ -663,7 +668,16 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
if (!RANGE_CHECK(fd, 0, FD_TABLE_SIZE))
return -LINUX_EBADF;

int fd_type = fd_table[fd].type;
/* Snapshot the slot under fd_lock once; readers use fd_snap below, and
* writers reacquire fd_lock and revalidate against fd_snap.generation
* so a close+reopen between the snapshot and the RMW returns EBADF
* instead of mutating an unrelated fd.
*/
fd_entry_t fd_snap;
if (!fd_snapshot(fd, &fd_snap))
return -LINUX_EBADF;

int fd_type = fd_snap.type;
bool fuse_fd = (fd_type == FD_FUSE_DEV || fd_type == FD_FUSE_FILE ||
fd_type == FD_FUSE_DIR);

Expand All @@ -676,7 +690,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
if ((int) arg < 0) {
return -LINUX_EINVAL;
}
int dup_flags = fd_table[fd].linux_flags & ~LINUX_O_CLOEXEC;
int dup_flags = fd_snap.linux_flags & ~LINUX_O_CLOEXEC;
if (cmd == 1030)
dup_flags |= LINUX_O_CLOEXEC;
int gfd = duplicate_guest_fd(fd, (int) arg, -1, false, dup_flags);
Expand All @@ -690,20 +704,38 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
return gfd;
}
case 1: /* F_GETFD */
return (fd_table[fd].linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC
: 0;
return (fd_snap.linux_flags & LINUX_O_CLOEXEC) ? LINUX_FD_CLOEXEC : 0;
case 2: /* F_SETFD */
/* Hold fd_lock across the read-modify-write so the CLOEXEC flip is
* atomic against a concurrent F_SETFL on the same shadow word and
* against any fd_lock-protected reader. Revalidate against the
* snapshot generation so a close+reopen returns EBADF.
*/
pthread_mutex_lock(&fd_lock);
if (fd_table[fd].type == FD_CLOSED ||
fd_table[fd].generation != fd_snap.generation) {
pthread_mutex_unlock(&fd_lock);
return -LINUX_EBADF;
}
if ((int) arg & LINUX_FD_CLOEXEC)
fd_table[fd].linux_flags |= LINUX_O_CLOEXEC;
else
fd_table[fd].linux_flags &= ~LINUX_O_CLOEXEC;
pthread_mutex_unlock(&fd_lock);
return 0;
case 3: { /* F_GETFL */
if (fuse_fd)
return fd_table[fd].linux_flags;
fd_entry_t snap;
if (!fd_snapshot(fd, &snap))
return -LINUX_EBADF;
return fd_snap.linux_flags;
/* Linux timerfd F_GETFL reports O_RDWR plus the writable status bits
* the kernel honors. Surface only those bits from the shadow rather
* than echoing arbitrary linux_flags bits so stray F_SETFL args
* cannot leak through here. O_ASYNC stays off because timerfd_fops
* lacks ->fasync, so generic_setfl drops it.
*/
if (fd_type == FD_TIMERFD)
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
return LINUX_O_RDWR |
(fd_snap.linux_flags &
(LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME));
host_fd_ref_t host_ref;
if (host_fd_ref_open(fd, &host_ref) < 0)
return -LINUX_EBADF;
Expand All @@ -712,26 +744,72 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg)
if (mac_fl < 0)
return linux_errno();
int linux_fl = mac_to_linux_status_flags(mac_fl);
if (snap.type == FD_REGULAR || snap.type == FD_DIR ||
snap.type == FD_PATH || snap.type == FD_URANDOM)
linux_fl = (linux_fl & ~O_ACCMODE) | (snap.linux_flags & 3);
linux_fl |= snap.linux_flags &
if (fd_snap.type == FD_REGULAR || fd_snap.type == FD_DIR ||
fd_snap.type == FD_PATH || fd_snap.type == FD_URANDOM)
linux_fl = (linux_fl & ~O_ACCMODE) | (fd_snap.linux_flags & 3);
linux_fl |= fd_snap.linux_flags &
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
return linux_fl;
}
case 4: /* F_SETFL */
{
if (fuse_fd) {
int preserved =
fd_table[fd].linux_flags &
(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY |
LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE);
/* Preserve LINUX_O_ACCMODE: F_SETFL is not allowed to change the
* access mode in the Linux kernel, and without preserving it
* here a stray F_SETFL(0) would silently flip an O_RDWR FUSE
* shadow to O_RDONLY, surfacing the wrong mode through F_GETFL.
*
* Hold fd_lock across the read-modify-write so the update is
* atomic against a concurrent F_SETFD and any fd_lock-protected
* reader. Revalidate against the snapshot generation so a
* close+reopen returns EBADF.
*/
pthread_mutex_lock(&fd_lock);
if (fd_table[fd].type != fd_type ||
fd_table[fd].generation != fd_snap.generation) {
pthread_mutex_unlock(&fd_lock);
return -LINUX_EBADF;
}
int preserved = fd_table[fd].linux_flags &
(LINUX_O_ACCMODE | LINUX_O_CLOEXEC | LINUX_O_PATH |
LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
fd_table[fd].linux_flags =
preserved | ((int) arg & ~(LINUX_O_ACCMODE | LINUX_O_CLOEXEC |
LINUX_O_PATH | LINUX_O_DIRECTORY |
LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
LINUX_O_LARGEFILE));
pthread_mutex_unlock(&fd_lock);
return 0;
}
/* Timerfd: kqueue host fd rejects fcntl(F_SETFL), so mirror Linux's
* file-status word in the linux_flags shadow. Of Linux's writable
* status flags (O_APPEND, O_ASYNC, O_DIRECT, O_NOATIME, O_NONBLOCK)
* the timerfd kernel object honors O_APPEND, O_NONBLOCK, and
* O_NOATIME. O_ASYNC is silently dropped (timerfd_fops lacks
* ->fasync). O_DIRECT returns -EINVAL because the inode lacks
* FMODE_CAN_ODIRECT. Bits outside the writable set (access mode,
* CLOEXEC, O_PATH/DIRECTORY/NOFOLLOW/etc.) are silently ignored,
* matching how Linux F_SETFL drops them.
*/
if (fd_type == FD_TIMERFD) {
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
const int setfl_mask =
LINUX_O_APPEND | LINUX_O_NONBLOCK | LINUX_O_NOATIME;
pthread_mutex_lock(&fd_lock);
if (fd_table[fd].type != FD_TIMERFD ||
fd_table[fd].generation != fd_snap.generation) {
pthread_mutex_unlock(&fd_lock);
return -LINUX_EBADF;
}
if ((int) arg & LINUX_O_DIRECT) {
pthread_mutex_unlock(&fd_lock);
return -LINUX_EINVAL;
}
fd_table[fd].linux_flags =
preserved |
((int) arg &
~(LINUX_O_CLOEXEC | LINUX_O_PATH | LINUX_O_DIRECTORY |
LINUX_O_NOFOLLOW | LINUX_O_DIRECT | LINUX_O_LARGEFILE));
(fd_table[fd].linux_flags & ~setfl_mask) |
((int) arg & setfl_mask);
pthread_mutex_unlock(&fd_lock);
return 0;
}
host_fd_ref_t host_ref;
Expand Down
31 changes: 25 additions & 6 deletions src/syscall/fuse.c
Original file line number Diff line number Diff line change
Expand Up @@ -1329,8 +1329,11 @@ int fuse_proc_open(int linux_flags)
errno = EMFILE;
return -1;
}
fd_table[guest_fd].linux_flags = linux_flags;
pthread_mutex_unlock(&fuse_lock);
/* Publish under fd_lock so the write is on the same lock domain as
* sys_fcntl(F_SETFL/F_SETFD), not stranded behind fuse_lock.
*/
fd_publish_linux_flags(guest_fd, linux_flags);
return guest_fd;
}

Expand Down Expand Up @@ -1897,8 +1900,11 @@ int64_t fuse_open_path(guest_t *g, const char *path, int linux_flags, int mode)
fd_mark_closed(guest_fd);
return -LINUX_EMFILE;
}
fd_table[guest_fd].linux_flags = linux_flags;
pthread_mutex_unlock(&fuse_lock);
/* Publish under fd_lock so the open's flags land on the same lock
* domain that sys_fcntl(F_SETFL/F_SETFD) uses.
*/
fd_publish_linux_flags(guest_fd, linux_flags);
return guest_fd;
}

Expand Down Expand Up @@ -2607,11 +2613,24 @@ int fuse_dup_fd(int src_fd,
}
}

int preserved_flags = fd_table[src_fd].linux_flags &
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW |
LINUX_O_DIRECT | LINUX_O_LARGEFILE);
fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
pthread_mutex_unlock(&fuse_lock);

/* O_NONBLOCK is a file-status flag preserved by dup(2)/dup2(2); without
* it a duplicated non-blocking FUSE fd would silently become blocking
* because nothing else carries the flag forward.
*
* Take fd_lock once for both the source read and the destination write
* so the dup snapshot is consistent with any concurrent F_SETFL on the
* source and so the destination publish cannot be overwritten by an
* early racing F_SETFL on the new slot.
*/
pthread_mutex_lock(&fd_lock);
int preserved_flags =
fd_table[src_fd].linux_flags &
(LINUX_O_PATH | LINUX_O_DIRECTORY | LINUX_O_NOFOLLOW | LINUX_O_DIRECT |
LINUX_O_LARGEFILE | LINUX_O_NONBLOCK);
fd_table[guest_fd].linux_flags = preserved_flags | linux_flags;
pthread_mutex_unlock(&fd_lock);
return guest_fd;
}

Expand Down
2 changes: 1 addition & 1 deletion src/syscall/inotify.c
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ int64_t sys_inotify_init1(int flags)
memset(inst->watches, 0, sizeof(inst->watches));
pthread_mutex_unlock(&inotify_lock);

fd_table[gfd].linux_flags = (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0;
fd_publish_linux_flags(gfd, (flags & IN_CLOEXEC) ? LINUX_O_CLOEXEC : 0);

return gfd;
}
Expand Down
9 changes: 9 additions & 0 deletions src/syscall/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,15 @@ int fd_snapshot_and_dup(int guest_fd, fd_entry_t *out);
*/
int fd_get_type(int guest_fd);

/* Publish linux_flags for a guest fd under fd_lock. Use after fd_alloc when
 * the creating syscall needs to set linux_flags atomically with respect to a
 * concurrent fcntl(F_SETFL/F_SETFD) on the same slot. The fd_alloc-then-
 * publish window is small (the new gfd is not communicated to other threads
 * until the syscall returns) but the lock removes the structural race and
 * keeps every linux_flags writer on one lock domain.
 *
 * The slot's previous linux_flags value is overwritten wholesale, not merged
 * with the new value. A guest_fd that fails the range check against
 * FD_TABLE_SIZE is ignored and nothing is reported back to the caller.
 *
 * NOTE(review): the function acquires fd_lock itself, so it presumably must
 * not be called with fd_lock already held -- confirm against the mutex type.
 */
void fd_publish_linux_flags(int guest_fd, int linux_flags);

/* Republish the EL1 urandom read fast-path bit for this fd from the current
* fd_table type and access mode. Only readable /dev/urandom descriptors are
* eligible for the bitmap.
Expand Down
Loading
Loading