From ca9c1be34db4d237935b1d0fa8c5d863513399cb Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Mon, 11 May 2026 13:28:27 +0800 Subject: [PATCH] Plug sysroot bypasses in at-family path syscalls A path-translation audit on the *at()/readlinkat/sysroot surface that does not flow through openat2 found a consistent set of gaps: sys_fchmodat, sys_fchownat, sys_mknodat, sys_utimensat, sys_chdir, and the four sys_*xattr path handlers passed absolute guest paths straight to the host syscall, skipping path_resolve_sysroot_*. A guest running chmod("/etc/foo") under --sysroot=/opt/sr therefore hit the host's /etc/foo rather than /opt/sr/etc/foo, contradicting the redirect contract every other path syscall already implements (sys_openat_path, sys_truncate, sys_statfs, stat_at_path). Route each handler through the matching resolver: nofollow when AT_SYMLINK_NOFOLLOW or the lxattr variant is set, _create for mknodat since the target may not yet exist, _path for everything else. sys_chroot accepted any guest-supplied path, stat()'d it on the host, and stored it via proc_set_sysroot without checking that the new root was reachable from the current one. A guest already running under --sysroot=/opt/sr could call chroot("/etc") and pivot to the host's /etc with no containment check. Reduce to a chroot("/") no-op and return -EPERM for anything else; this still satisfies the original motivation (coreutils stdbuf does fork -> chroot("/") -> exec) without keeping the pivot path. Real Linux chroot requires CAP_SYS_CHROOT, which the guest does not have in elfuse's single-process VM. The sysroot_path static buffer was mutated by proc_set_sysroot and read by proc_get_sysroot with no lock. A sibling vCPU running chroot during another vCPU's path resolution could tear the snprintf source underneath the consumer. Add sysroot_lock and a copying proc_sysroot_snapshot helper, then migrate proc_resolve_sysroot_*, fork-state.c, and exec.c to the snapshot. proc_get_sysroot stays as-is for the NULL-test fast paths (path[0] != '/' early returns) that tolerate a racy read; the docstring spells out the contract. Also canonicalize the sysroot once at proc_set_sysroot time via realpath so subsequent containment checks compare against a stable form. The /lib/basename retry inside proc_resolve_sysroot_path_flags was an early hack that masked containment errors when the dynamic linker walked DT_RPATH/RUNPATH/LD_LIBRARY_PATH for itself; drop it. --- mk/tests.mk | 8 ++- src/runtime/fork-state.c | 4 +- src/runtime/procemu.c | 23 ++++++- src/runtime/procemu.h | 10 +++ src/syscall/exec.c | 5 +- src/syscall/fs-xattr.c | 45 ++++++++++++-- src/syscall/fs.c | 105 +++++++++++++++++++++----------- src/syscall/proc-state.c | 121 +++++++++++++++++++++++++------------ src/syscall/proc.h | 25 ++++++-- tests/test-sysroot-chdir.c | 72 ++++++++++++++++++++++ 10 files changed, 325 insertions(+), 93 deletions(-) create mode 100644 tests/test-sysroot-chdir.c diff --git a/mk/tests.mk b/mk/tests.mk index bddd0e8..2268273 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -7,7 +7,7 @@ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ test-full test-multi-vcpu test-rwx test-sysroot-rename \ test-sysroot-procfs-exec test-timeout-disable \ - test-sysroot-nofollow perf + test-sysroot-nofollow test-sysroot-chdir perf ## Build and run the assembly hello world test test-hello: $(ELFUSE_BIN) $(TEST_HELLO_DEP) @@ -47,6 +47,12 @@ test-sysroot-nofollow: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-nofollow ln -sf /outside-target "$$tmpdir/tmp/elfuse-sysroot-nofollow-link"; \ $(ELFUSE_BIN) --sysroot "$$tmpdir" $(BUILD_DIR)/test-sysroot-nofollow +test-sysroot-chdir: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-chdir + @tmpdir=$$(mktemp -d); \ + trap 'rm -rf "$$tmpdir"' EXIT; \ + mkdir -p "$$tmpdir/bin" "$$tmpdir/lib" "$$tmpdir/lib/elfuse-sysroot-shadow"; \ + $(ELFUSE_BIN) --sysroot "$$tmpdir" $(BUILD_DIR)/test-sysroot-chdir + test-sysroot-procfs-exec: $(ELFUSE_BIN) $(BUILD_DIR)/test-procfs-exec @tmpdir=$$(mktemp -d); \ trap 'rm -rf "$$tmpdir"' EXIT; \ diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 1d56c38..c168f8f 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -440,9 +440,7 @@ int fork_ipc_send_process_state(int ipc_sock, return -1; char sysroot_ipc[LINUX_PATH_MAX] = {0}; - const char *sr = proc_get_sysroot(); - if (sr) - str_copy_trunc(sysroot_ipc, sr, sizeof(sysroot_ipc)); + (void) proc_sysroot_snapshot(sysroot_ipc, sizeof(sysroot_ipc)); if (fork_ipc_write_all(ipc_sock, sysroot_ipc, sizeof(sysroot_ipc)) < 0) return -1; diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 61698b6..c5097d2 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -599,6 +599,13 @@ static int dev_shm_resolve_path(const char *guest_suffix, return 0; } +int proc_dev_shm_resolve(const char *guest_suffix, + char *host_path, + size_t host_path_sz) +{ + return dev_shm_resolve_path(guest_suffix, host_path, host_path_sz); +} + /* Give synthetic procfs nodes stable identities so directory walkers do not * collapse distinct paths into one inode and falsely report filesystem loops. */ @@ -2553,13 +2560,27 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) return proc_intercept_readlink(alias, buf, bufsiz); } - /* /proc/self/exe -> path of current ELF binary */ + /* /proc/self/exe -> path of current ELF binary. + * Strip the sysroot prefix so a guest running under --sysroot=/opt/sr + * sees /bin/ls rather than /opt/sr/bin/ls, matching the chroot-like + * abstraction the rest of the path layer presents. + */ if (!strcmp(path, "/proc/self/exe")) { const char *exe = proc_get_elf_path(); if (!exe) { errno = ENOENT; return -1; } + char sysroot_snap[LINUX_PATH_MAX]; + if (proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap))) { + size_t sr_len = strlen(sysroot_snap); + if (sr_len > 0 && strncmp(exe, sysroot_snap, sr_len) == 0 && + (exe[sr_len] == '/' || exe[sr_len] == '\0')) { + exe += sr_len; + if (*exe == '\0') + exe = "/"; + } + } size_t len = strlen(exe); if (len > bufsiz) len = bufsiz; diff --git a/src/runtime/procemu.h b/src/runtime/procemu.h index ae52de2..58f4f30 100644 --- a/src/runtime/procemu.h +++ b/src/runtime/procemu.h @@ -76,3 +76,13 @@ int proc_intercept_readv(int guest_fd, * Used by sys_unlinkat to rewrite /dev/shm/ paths. */ const char *proc_get_shm_dir(void); + +/* Resolve a /dev/shm/ guest-path suffix to a host path inside the + * per-UID /dev/shm emulation directory. Rejects empty, ".."-bearing, or + * compound suffixes with errno = EACCES. Returns 0 on success and writes the + * full host path to host_path; returns -1 with errno set (EACCES on invalid + * suffix, ENAMETOOLONG on overflow, or as set by shm_dir_path()). + */ +int proc_dev_shm_resolve(const char *guest_suffix, + char *host_path, + size_t host_path_sz); diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 9b2118f..bee7549 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -348,7 +348,10 @@ int64_t sys_execve(hv_vcpu_t vcpu, interp_resolved[0] = '\0'; if (elf_info.interp_path[0] != '\0') { - elf_resolve_interp(proc_get_sysroot(), elf_info.interp_path, + char sysroot_snap[LINUX_PATH_MAX]; + bool have_sr = + proc_sysroot_snapshot(sysroot_snap, sizeof(sysroot_snap)); + elf_resolve_interp(have_sr ? sysroot_snap : NULL, elf_info.interp_path, interp_resolved, sizeof(interp_resolved)); log_debug("execve: pre-validating interpreter: %s", interp_resolved); diff --git a/src/syscall/fs-xattr.c b/src/syscall/fs-xattr.c index 29c3ef5..765f4ac 100644 --- a/src/syscall/fs-xattr.c +++ b/src/syscall/fs-xattr.c @@ -14,6 +14,7 @@ #include "syscall/abi.h" #include "syscall/fs.h" #include "syscall/internal.h" +#include "syscall/path.h" /* macOS xattr API has extra position and options parameters vs Linux. * Linux: getxattr(path, name, value, size) @@ -73,15 +74,23 @@ int64_t sys_getxattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX], name[LINUX_XATTR_NAME_MAX + 1]; + char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) return -LINUX_EFAULT; + const char *target = nofollow ? path_resolve_sysroot_nofollow_path( + path, sysroot_buf, sizeof(sysroot_buf)) + : path_resolve_sysroot_path( + path, sysroot_buf, sizeof(sysroot_buf)); + if (!target) + return -LINUX_ENAMETOOLONG; + int opts = nofollow ? XATTR_NOFOLLOW : 0; if (size == 0) { - ssize_t ret = getxattr(path, name, NULL, 0, 0, opts); + ssize_t ret = getxattr(target, name, NULL, 0, 0, opts); return ret < 0 ? linux_errno() : ret; } @@ -90,7 +99,7 @@ int64_t sys_getxattr(guest_t *g, if (err < 0) return err; - ssize_t ret = getxattr(path, name, buf, (size_t) size, 0, opts); + ssize_t ret = getxattr(target, name, buf, (size_t) size, 0, opts); int64_t result = xattr_copy_out_result(g, value_gva, buf, ret); free(buf); return result; @@ -105,11 +114,19 @@ int64_t sys_setxattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX], name[LINUX_XATTR_NAME_MAX + 1]; + char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) return -LINUX_EFAULT; + const char *target = nofollow ? path_resolve_sysroot_nofollow_path( + path, sysroot_buf, sizeof(sysroot_buf)) + : path_resolve_sysroot_path( + path, sysroot_buf, sizeof(sysroot_buf)); + if (!target) + return -LINUX_ENAMETOOLONG; + void *buf; int64_t err = xattr_alloc_buf(size, &buf); if (err < 0) @@ -126,7 +143,7 @@ int64_t sys_setxattr(guest_t *g, return err; } - int ret = setxattr(path, name, buf, (size_t) size, 0, opts); + int ret = setxattr(target, name, buf, (size_t) size, 0, opts); free(buf); return ret < 0 ? linux_errno() : 0; } @@ -138,13 +155,21 @@ int64_t sys_listxattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX]; + char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; + const char *target = nofollow ? path_resolve_sysroot_nofollow_path( + path, sysroot_buf, sizeof(sysroot_buf)) + : path_resolve_sysroot_path( + path, sysroot_buf, sizeof(sysroot_buf)); + if (!target) + return -LINUX_ENAMETOOLONG; + int opts = nofollow ? XATTR_NOFOLLOW : 0; if (size == 0) { - ssize_t ret = listxattr(path, NULL, 0, opts); + ssize_t ret = listxattr(target, NULL, 0, opts); return ret < 0 ? linux_errno() : ret; } @@ -153,7 +178,7 @@ int64_t sys_listxattr(guest_t *g, if (err < 0) return err; - ssize_t ret = listxattr(path, buf, (size_t) size, opts); + ssize_t ret = listxattr(target, buf, (size_t) size, opts); int64_t result = xattr_copy_out_result(g, list_gva, buf, ret); free(buf); return result; @@ -165,13 +190,21 @@ int64_t sys_removexattr(guest_t *g, int nofollow) { char path[LINUX_PATH_MAX], name[LINUX_XATTR_NAME_MAX + 1]; + char sysroot_buf[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) return -LINUX_EFAULT; + const char *target = nofollow ? path_resolve_sysroot_nofollow_path( + path, sysroot_buf, sizeof(sysroot_buf)) + : path_resolve_sysroot_path( + path, sysroot_buf, sizeof(sysroot_buf)); + if (!target) + return -LINUX_ENAMETOOLONG; + int opts = nofollow ? XATTR_NOFOLLOW : 0; - int ret = removexattr(path, name, opts); + int ret = removexattr(target, name, opts); return ret < 0 ? linux_errno() : 0; } diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 196c3be..6f90719 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -814,7 +814,12 @@ int64_t sys_chdir(guest_t *g, uint64_t path_gva) return 0; } - if (chdir(chdir_path) < 0) + char sysroot_buf[LINUX_PATH_MAX]; + const char *target = + path_resolve_sysroot_path(chdir_path, sysroot_buf, sizeof(sysroot_buf)); + if (!target) + return -LINUX_ENAMETOOLONG; + if (chdir(target) < 0) return linux_errno(); if (proc_cwd_refresh() < 0) @@ -849,23 +854,17 @@ int64_t sys_chroot(guest_t *g, uint64_t path_gva) char path[LINUX_PATH_MAX]; if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) return -LINUX_EFAULT; - /* Emulate chroot by updating the sysroot prefix. All path resolution - * already redirects through sysroot. chroot("/") is a no-op because the - * guest already sees "/" as root, and resetting sysroot to "/" would - * break dynamic linker resolution. Real chroot() requires root and - * does not make sense in elfuse's single-process VM. This enables - * coreutils stdbuf (which does fork -> chroot("/") -> exec) and the - * chroot coreutil itself. + /* Only accept chroot("/") as a no-op. The original motivation was + * coreutils stdbuf (fork -> chroot("/") -> exec) which never changes + * roots in practice. Accepting an arbitrary path was a containment + * escape: a guest already running under --sysroot=/opt/sr could call + * chroot("/etc") and pivot to the host's /etc with no boundary check. + * Real chroot() requires CAP_SYS_CHROOT, which the guest does not have + * in elfuse's single-process VM model. */ - if (strcmp(path, "/") != 0) { - struct stat st; - if (stat(path, &st) < 0) - return linux_errno(); - if (!S_ISDIR(st.st_mode)) - return -LINUX_ENOTDIR; - proc_set_sysroot(path); - } - return 0; + if (strcmp(path, "/") == 0) + return 0; + return -LINUX_EPERM; } /* pipe/seek. */ @@ -977,8 +976,10 @@ int64_t sys_readlinkat(guest_t *g, char sysroot_buf[LINUX_PATH_MAX]; const char *read_path = path_resolve_sysroot_nofollow_path( path, sysroot_buf, sizeof(sysroot_buf)); - if (!read_path) + if (!read_path) { + host_fd_ref_close(&dir_ref); return -LINUX_ENAMETOOLONG; + } ssize_t len = readlinkat(dir_ref.fd, read_path, link, sizeof(link) - 1); host_fd_ref_close(&dir_ref); if (len < 0) @@ -1014,23 +1015,13 @@ int64_t sys_unlinkat(guest_t *g, int dirfd, uint64_t path_gva, int flags) /* Rewrite /dev/shm/ to the host temp directory so shm_unlink works */ const char *unlink_path; - char shm_host[512]; + char shm_host[LINUX_PATH_MAX]; if (!strncmp(unlink_guest_path, "/dev/shm/", 9)) { - const char *shm = proc_get_shm_dir(); - if (!shm) { + if (proc_dev_shm_resolve(unlink_guest_path + 9, shm_host, + sizeof(shm_host)) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } - const char *suffix = unlink_guest_path + 9; - if (strstr(suffix, "..") || strchr(suffix, '/') || suffix[0] == '\0') { - host_fd_ref_close(&dir_ref); - return -LINUX_EACCES; - } - int n = snprintf(shm_host, sizeof(shm_host), "%s/%s", shm, suffix); - if (n < 0 || (size_t) n >= sizeof(shm_host)) { - host_fd_ref_close(&dir_ref); - return -LINUX_ENAMETOOLONG; - } unlink_path = shm_host; host_fd_ref_close(&dir_ref); dir_ref.fd = AT_FDCWD; @@ -1237,9 +1228,17 @@ int64_t sys_mknodat(guest_t *g, int dirfd, uint64_t path_gva, int mode, int dev) if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; + char sysroot_buf[LINUX_PATH_MAX]; + const char *target = path_resolve_sysroot_create_path( + node_path, sysroot_buf, sizeof(sysroot_buf)); + if (!target) { + host_fd_ref_close(&dir_ref); + return -LINUX_ENAMETOOLONG; + } + /* Only support FIFO creation; other node types need root */ if (S_ISFIFO(mode)) { - if (mkfifoat(dir_ref.fd, node_path, mode & 0777) < 0) { + if (mkfifoat(dir_ref.fd, target, mode & 0777) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1249,7 +1248,7 @@ int64_t sys_mknodat(guest_t *g, int dirfd, uint64_t path_gva, int mode, int dev) /* Regular files: create an empty file */ if (S_ISREG(mode) || (mode & S_IFMT) == 0) { - int fd = openat(dir_ref.fd, node_path, O_CREAT | O_WRONLY | O_EXCL, + int fd = openat(dir_ref.fd, target, O_CREAT | O_WRONLY | O_EXCL, mode & 0777); host_fd_ref_close(&dir_ref); if (fd < 0) @@ -1556,8 +1555,20 @@ int64_t sys_fchmodat(guest_t *g, if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; + char sysroot_buf[LINUX_PATH_MAX]; + const char *target = + (flags & LINUX_AT_SYMLINK_NOFOLLOW) + ? path_resolve_sysroot_nofollow_path(chmod_path, sysroot_buf, + sizeof(sysroot_buf)) + : path_resolve_sysroot_path(chmod_path, sysroot_buf, + sizeof(sysroot_buf)); + if (!target) { + host_fd_ref_close(&dir_ref); + return -LINUX_ENAMETOOLONG; + } + int mac_flags = translate_at_flags(flags); - if (fchmodat(dir_ref.fd, chmod_path, mode, mac_flags) < 0) { + if (fchmodat(dir_ref.fd, target, mode, mac_flags) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1592,8 +1603,20 @@ int64_t sys_fchownat(guest_t *g, if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; + char sysroot_buf[LINUX_PATH_MAX]; + const char *target = + (flags & LINUX_AT_SYMLINK_NOFOLLOW) + ? path_resolve_sysroot_nofollow_path(chown_path, sysroot_buf, + sizeof(sysroot_buf)) + : path_resolve_sysroot_path(chown_path, sysroot_buf, + sizeof(sysroot_buf)); + if (!target) { + host_fd_ref_close(&dir_ref); + return -LINUX_ENAMETOOLONG; + } + int mac_flags = translate_at_flags(flags); - if (fchownat(dir_ref.fd, chown_path, owner, group, mac_flags) < 0) { + if (fchownat(dir_ref.fd, target, owner, group, mac_flags) < 0) { host_fd_ref_close(&dir_ref); return linux_errno(); } @@ -1637,6 +1660,7 @@ int64_t sys_utimensat(guest_t *g, const char *path_arg = NULL; char path[LINUX_PATH_MAX]; char proc_path[LINUX_PATH_MAX]; + char sysroot_buf[LINUX_PATH_MAX]; if (path_gva != 0) { if (guest_read_str(g, path_gva, path, sizeof(path)) < 0) { host_fd_ref_close(&dir_ref); @@ -1648,7 +1672,16 @@ int64_t sys_utimensat(guest_t *g, host_fd_ref_close(&dir_ref); return -LINUX_ENAMETOOLONG; } - path_arg = proc_resolved > 0 ? proc_path : path; + const char *src = proc_resolved > 0 ? proc_path : path; + path_arg = (flags & LINUX_AT_SYMLINK_NOFOLLOW) + ? path_resolve_sysroot_nofollow_path(src, sysroot_buf, + sizeof(sysroot_buf)) + : path_resolve_sysroot_path(src, sysroot_buf, + sizeof(sysroot_buf)); + if (!path_arg) { + host_fd_ref_close(&dir_ref); + return -LINUX_ENAMETOOLONG; + } } struct timespec ts[2]; diff --git a/src/syscall/proc-state.c b/src/syscall/proc-state.c index 7bc2fb3..001a8e4 100644 --- a/src/syscall/proc-state.c +++ b/src/syscall/proc-state.c @@ -40,6 +40,12 @@ static uint8_t auxv_buf[1024] = {0}; static size_t auxv_len = 0; static char sysroot_path[LINUX_PATH_MAX] = {0}; +/* Serializes proc_set_sysroot against path-helper snapshots. Without this, + * a sibling vCPU running chroot during another vCPU's path resolution can tear + * the snprintf input buffer underneath that thread. + */ +static pthread_mutex_t sysroot_lock = PTHREAD_MUTEX_INITIALIZER; + /* Cached current working directory for getcwd() and /proc/self/cwd. */ static pthread_mutex_t cwd_lock = PTHREAD_MUTEX_INITIALIZER; static char cwd_path[LINUX_PATH_MAX] = {0}; @@ -67,12 +73,24 @@ void proc_state_init(void) int proc_cwd_refresh(void) { char cwd[LINUX_PATH_MAX]; + const char *guest_cwd = cwd; if (!getcwd(cwd, sizeof(cwd))) return -1; - size_t len = strlen(cwd); + char sr[LINUX_PATH_MAX]; + if (proc_sysroot_snapshot(sr, sizeof(sr))) { + size_t sr_len = strlen(sr); + if (strncmp(cwd, sr, sr_len) == 0 && + (cwd[sr_len] == '\0' || cwd[sr_len] == '/')) { + guest_cwd = cwd + sr_len; + if (*guest_cwd == '\0') + guest_cwd = "/"; + } + } + + size_t len = strlen(guest_cwd); pthread_mutex_lock(&cwd_lock); - memcpy(cwd_path, cwd, len + 1); + memcpy(cwd_path, guest_cwd, len + 1); cwd_len = len; cwd_valid = true; pthread_mutex_unlock(&cwd_lock); @@ -328,21 +346,57 @@ const void *proc_get_auxv(size_t *len_out) void proc_set_sysroot(const char *path) { + pthread_mutex_lock(&sysroot_lock); if (path && path[0]) { str_copy_trunc(sysroot_path, path, sizeof(sysroot_path)); size_t len = strlen(sysroot_path); while (len > 1 && sysroot_path[len - 1] == '/') sysroot_path[--len] = '\0'; + char resolved[LINUX_PATH_MAX]; + if (realpath(sysroot_path, resolved)) + str_copy_trunc(sysroot_path, resolved, sizeof(sysroot_path)); } else { sysroot_path[0] = '\0'; } + pthread_mutex_unlock(&sysroot_lock); } const char *proc_get_sysroot(void) { + /* Boolean-test callers (path[0] != '/' fast paths) tolerate the racy read: + * the first byte transitions atomically and a missed update only causes one + * extra resolution attempt. Callers that consume the string content must + * use proc_sysroot_snapshot() instead. + */ return sysroot_path[0] ? sysroot_path : NULL; } +bool proc_sysroot_snapshot(char *out, size_t outsz) +{ + if (!out || outsz == 0) + return false; + pthread_mutex_lock(&sysroot_lock); + bool ok = false; + if (sysroot_path[0]) { + size_t need = strlen(sysroot_path) + 1; + if (need <= outsz) { + memcpy(out, sysroot_path, need); + ok = true; + } + } + pthread_mutex_unlock(&sysroot_lock); + if (!ok) + out[0] = '\0'; + return ok; +} + +/* Confirm resolved_path canonicalizes inside sysroot. This is a check-then-use + * sequence: callers issue the actual syscall after this returns, so a symlink + * swapped in between will not be re-validated. openat2 + * RESOLVE_{BENEATH,IN_ROOT} close that race for callers willing to opt in. For + * the legacy *at() surface this is best-effort defense at the boundary; the + * guest is in the host user's trust domain. + */ static bool sysroot_path_is_contained(const char *resolved_path, const char *sysroot, bool follow_final) @@ -361,30 +415,23 @@ static bool sysroot_path_is_contained(const char *resolved_path, str_copy_trunc(parent, resolved_path, sizeof(parent)); slash = strrchr(parent, '/'); - if (!slash) + /* resolved_path is always ${sysroot}${guest_path} where sysroot is + * non-empty (caller short-circuits otherwise) and guest_path starts + * with '/'. The result therefore contains at least two '/' bytes, so + * the basename slash is never at parent[0]. Reject anything that + * violates the invariant rather than carrying dead code for it. + */ + if (!slash || slash == parent) return false; - if (slash == parent) { - parent[1] = '\0'; - if (!realpath(parent, real_path)) - return false; - size_t rp_len = strlen(real_path); - const char *tail = resolved_path + 1; - size_t tail_len = strlen(tail); - if (rp_len + 1 + tail_len >= sizeof(real_path)) - return false; - real_path[rp_len] = '/'; - memcpy(real_path + rp_len + 1, tail, tail_len + 1); - } else { - *slash = '\0'; - if (!realpath(parent, real_path)) - return false; - size_t parent_len = strlen(real_path); - if (snprintf(real_path + parent_len, sizeof(real_path) - parent_len, - "/%s", - slash + 1) >= (int) (sizeof(real_path) - parent_len)) { - return false; - } + *slash = '\0'; + if (!realpath(parent, real_path)) + return false; + size_t parent_len = strlen(real_path); + if (snprintf(real_path + parent_len, sizeof(real_path) - parent_len, + "/%s", + slash + 1) >= (int) (sizeof(real_path) - parent_len)) { + return false; } } @@ -405,13 +452,20 @@ static bool sysroot_path_exists(const char *resolved_path, bool follow_final) return lstat(resolved_path, &st) == 0; } +/* Resolve an absolute guest path against --sysroot. This keeps absolute guest + * filesystem syscalls inside the sysroot when the target exists there, and + * otherwise falls back to the literal host path so apps can still reach host + * resources such as /tmp or /etc/resolv.conf. Containment via realpath() is + * enforced only when the path actually resolves under sysroot, to prevent + * symlink escape from a tree the caller intended to stay inside. + */ static const char *proc_resolve_sysroot_path_flags(const char *path, char *buf, size_t bufsz, bool follow_final) { - const char *sr = proc_get_sysroot(); - if (!sr || !path || path[0] != '/') + char sr[LINUX_PATH_MAX]; + if (!proc_sysroot_snapshot(sr, sizeof(sr)) || !path || path[0] != '/') return path; if (bufsz == 0) return NULL; @@ -426,17 +480,6 @@ static const char *proc_resolve_sysroot_path_flags(const char *path, return buf; } - const char *base = strrchr(path, '/'); - if (base) { - n = snprintf(buf, bufsz, "%s/lib/%s", sr, base + 1); - if (n >= 0 && (size_t) n < bufsz && - sysroot_path_exists(buf, follow_final)) { - if (!sysroot_path_is_contained(buf, sr, follow_final)) - return NULL; - return buf; - } - } - return full_path_truncated ? NULL : path; } @@ -456,8 +499,8 @@ const char *proc_resolve_sysroot_create_path(const char *path, char *buf, size_t bufsz) { - const char *sr = proc_get_sysroot(); - if (!sr || !path || path[0] != '/') + char sr[LINUX_PATH_MAX]; + if (!proc_sysroot_snapshot(sr, sizeof(sr)) || !path || path[0] != '/') return path; if (bufsz == 0) return NULL; diff --git a/src/syscall/proc.h b/src/syscall/proc.h index f10393b..6cf2d9a 100644 --- a/src/syscall/proc.h +++ b/src/syscall/proc.h @@ -184,18 +184,31 @@ int rseq_try_abort(guest_t *g, /* Allocate next guest PID (called from sys_clone). */ int64_t proc_alloc_pid(void); -/* Store the sysroot path for dynamic linker library resolution. - * When set, absolute library paths (e.g. /lib/libc.so) are prefixed - * with this path. Pass NULL to clear. +/* Store the sysroot path for absolute guest path resolution. Pass NULL to + * clear. */ void proc_set_sysroot(const char *path); -/* Get the stored sysroot path. Returns NULL if not set. */ +/* Get the stored sysroot path. Returns NULL if not set. + * + * The returned pointer aliases a static buffer mutated by proc_set_sysroot() + * under a private lock; callers that only test for NULL are safe, but any + * caller that reads the string content must use proc_sysroot_snapshot() + * instead to avoid a torn read against a concurrent chroot. + */ const char *proc_get_sysroot(void); +/* Copy the current sysroot into out (NUL-terminated) under the same lock that + * serializes proc_set_sysroot(). Returns true if a sysroot is configured and + * fits in outsz; false if unset, truncating, or out/outsz is invalid (in which + * case out[0] is set to '\0' when possible). + */ +bool proc_sysroot_snapshot(char *out, size_t outsz); + /* Resolve an absolute guest path through the stored sysroot. Returns path - * unchanged when no sysroot applies, buf when a sysroot-backed file exists, - * or NULL if sysroot path construction would truncate. + * unchanged when no sysroot applies or when the sysroot-backed path does not + * exist, buf when a sysroot-backed file exists, or NULL if sysroot path + * construction would truncate or escape containment checks. */ const char *proc_resolve_sysroot_path(const char *path, char *buf, diff --git a/tests/test-sysroot-chdir.c b/tests/test-sysroot-chdir.c new file mode 100644 index 0000000..72e7da9 --- /dev/null +++ b/tests/test-sysroot-chdir.c @@ -0,0 +1,72 @@ +/* Sysroot chdir regression tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +int main(void) +{ + char cwd[256]; + char proc_cwd[256]; + + printf("test-sysroot-chdir: sysroot chdir tests\n"); + + TEST("absolute chdir keeps guest cwd"); + { + ssize_t len; + + if (chdir("/bin") < 0) { + FAIL("chdir /bin failed"); + } else if (!getcwd(cwd, sizeof(cwd))) { + FAIL("getcwd after chdir /bin failed"); + } else if (strcmp(cwd, "/bin") != 0) { + FAIL("getcwd leaked host sysroot path"); + } else if ((len = readlink("/proc/self/cwd", proc_cwd, + sizeof(proc_cwd) - 1)) < 0) { + FAIL("readlink /proc/self/cwd failed"); + } else { + proc_cwd[len] = '\0'; + if (strcmp(proc_cwd, "/bin") != 0) + FAIL("/proc/self/cwd leaked host sysroot path"); + else + PASS(); + } + } + + TEST("relative chdir stays guest-visible under sysroot"); + { + if (chdir("../lib") < 0) { + FAIL("chdir ../lib failed"); + } else if (!getcwd(cwd, sizeof(cwd))) { + FAIL("getcwd after chdir ../lib failed"); + } else if (strcmp(cwd, "/lib") != 0) { + FAIL("relative chdir produced wrong guest cwd"); + } else { + PASS(); + } + } + + TEST("missing absolute path does not fall back to sysroot lib basename"); + { + errno = 0; + if (chdir("/elfuse-sysroot-shadow") == 0) { + FAIL("chdir unexpectedly succeeded via sysroot lib fallback"); + } else if (errno != ENOENT) { + FAIL("chdir failed with wrong errno"); + } else { + PASS(); + } + } + + SUMMARY("test-sysroot-chdir"); + return fails > 0 ? 1 : 0; +}