From 64ea4e3772137622be8489922bf71c9170a9662b Mon Sep 17 00:00:00 2001 From: Max042004 Date: Wed, 27 May 2026 01:01:43 +0800 Subject: [PATCH] Reject CLONE_NEW* namespace flags in legacy clone(2) sys_clone forwarded raw flags to the posix_spawn-based fork path without inspecting CLONE_NEW* bits, so clone(CLONE_NEWPID, ...) silently created an unisolated child and returned a PID as if the namespace had been set up. clone3 already rejected the same flags with EINVAL, so the observable result depended on which entry point the caller used. Mirror the clone3 policy in sys_clone: reject any namespace flag with EINVAL. The exit signal occupies the CSIGNAL low byte in clone(2), so mask it off before testing. CLONE_NEWTIME (0x80) lives in that byte and, like CLONE_INTO_CGROUP (bit 33) and set_tid, cannot be conveyed through clone(2), so only the higher namespace bits are reachable. Move the namespace flag macros to the top of the file so both entry points share them. Add a regression test asserting clone(2) rejects each reachable CLONE_NEW* flag, matching the existing clone3 coverage. Closes #44 --- src/runtime/forkipc.c | 40 ++++++++++++++++++++++++++-------------- tests/test-clone3.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index a2cfe9c..963cb61 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -360,6 +360,23 @@ int fork_child_main(int ipc_fd, #define LINUX_CLONE_CHILD_SETTID 0x01000000 /* LINUX_SIGCHLD defined in syscall_signal.h (included above) */ +/* Namespace flags. elfuse implements no namespace isolation. Both + * sys_clone and sys_clone3 reject them. + */ +#define LINUX_CLONE_NEWTIME 0x00000080 +#define LINUX_CLONE_NEWNS 0x00020000 +#define LINUX_CLONE_NEWCGROUP 0x02000000 +#define LINUX_CLONE_NEWUTS 0x04000000 +#define LINUX_CLONE_NEWIPC 0x08000000 +#define LINUX_CLONE_NEWUSER 0x10000000 +#define LINUX_CLONE_NEWPID 0x20000000 +#define LINUX_CLONE_NEWNET 0x40000000 + +#define LINUX_CLONE3_NS_FLAGS \ + (LINUX_CLONE_NEWNS | LINUX_CLONE_NEWCGROUP | LINUX_CLONE_NEWUTS | \ + LINUX_CLONE_NEWIPC | LINUX_CLONE_NEWUSER | LINUX_CLONE_NEWPID | \ + LINUX_CLONE_NEWNET | LINUX_CLONE_NEWTIME) + /* CLONE_THREAD: create a new guest thread in the same VM. */ /* Arguments passed to the worker pthread. Allocated by sys_clone_thread, freed @@ -1069,6 +1086,14 @@ int64_t sys_clone(hv_vcpu_t vcpu, uint64_t ctid_gva, bool verbose) { + /* Namespaces are not implemented. CLONE_NEWTIME (0x80) lives in the CSIGNAL + * low byte and, like CLONE_INTO_CGROUP (bit 33) and set_tid, cannot be + * conveyed through clone(2) at all, so only the higher namespace bits are + * reachable here. + */ + if ((flags & ~(uint64_t) 0xff) & LINUX_CLONE3_NS_FLAGS) + return -LINUX_EINVAL; + /* CLONE_THREAD: create a new thread in the same VM (not a new process) */ if (flags & LINUX_CLONE_THREAD) { return sys_clone_thread(vcpu, g, flags, child_stack, stack_map_start, @@ -1507,22 +1532,9 @@ struct linux_clone_args { #define CLONE_ARGS_SIZE_VER0 64 /* v5.3: first 8 fields (flags..tls) */ -/* Unsupported clone3 flags: reject early rather than silently ignoring. */ +/* Unsupported clone3-only flags: reject early rather than silently ignoring. */ #define LINUX_CLONE_PIDFD 0x00001000 #define LINUX_CLONE_INTO_CGROUP 0x200000000ULL -#define LINUX_CLONE_NEWNS 0x00020000 -#define LINUX_CLONE_NEWCGROUP 0x02000000 -#define LINUX_CLONE_NEWUTS 0x04000000 -#define LINUX_CLONE_NEWIPC 0x08000000 -#define LINUX_CLONE_NEWUSER 0x10000000 -#define LINUX_CLONE_NEWPID 0x20000000 -#define LINUX_CLONE_NEWNET 0x40000000 -#define LINUX_CLONE_NEWTIME 0x00000080 - -#define LINUX_CLONE3_NS_FLAGS \ - (LINUX_CLONE_NEWNS | LINUX_CLONE_NEWCGROUP | LINUX_CLONE_NEWUTS | \ - LINUX_CLONE_NEWIPC | LINUX_CLONE_NEWUSER | LINUX_CLONE_NEWPID | \ - LINUX_CLONE_NEWNET | LINUX_CLONE_NEWTIME) int64_t sys_clone3(hv_vcpu_t vcpu, guest_t *g, diff --git a/tests/test-clone3.c b/tests/test-clone3.c index dc679b9..c23c18b 100644 --- a/tests/test-clone3.c +++ b/tests/test-clone3.c @@ -658,6 +658,35 @@ static void test_partial_deferred_stack_munmap(void) munmap(reuse_stack, stack_size); } +/* Test 15: legacy clone(2) rejects CLONE_NEW* namespace flags with EINVAL, + * matching clone3 (issue #44). Before the fix these flags fell through to a + * plain fork that falsely appeared to succeed. CLONE_NEWTIME is omitted: it + * lives in the CSIGNAL low byte and is not reachable through clone(2). + */ +static void test_legacy_clone_namespaces(void) +{ + static const struct { + unsigned long flag; + const char *name; + } ns_flags[] = { + {0x00020000, "CLONE_NEWNS"}, {0x02000000, "CLONE_NEWCGROUP"}, + {0x04000000, "CLONE_NEWUTS"}, {0x08000000, "CLONE_NEWIPC"}, + {0x10000000, "CLONE_NEWUSER"}, {0x20000000, "CLONE_NEWPID"}, + {0x40000000, "CLONE_NEWNET"}, + }; + for (size_t i = 0; i < sizeof(ns_flags) / sizeof(ns_flags[0]); i++) { + /* SIGCHLD (17) in the low byte makes this a fork-like clone. */ + long ret = raw_clone(ns_flags[i].flag | 17, NULL, NULL, 0, NULL); + CHECK(ret == -22 /* EINVAL */, + "clone(%s) returned %ld (expected -EINVAL)", ns_flags[i].name, + ret); + if (ret == 0) /* defensive: a leaked child must not run the suite */ + raw_syscall1(__NR_exit, 0); + else if (ret > 0) + raw_syscall4(__NR_wait4, ret, 0, 0, 0); + } +} + int main(int argc, char **argv) { if (argc > 1 && !strcmp(argv[1], "--clone3-vfork-child")) @@ -687,6 +716,7 @@ int main(int argc, char **argv) test_vfork_exec_unblocks_parent(); test_deferred_stack_munmap(); test_partial_deferred_stack_munmap(); + test_legacy_clone_namespaces(); SUMMARY("test-clone3"); return fails > 0 ? 1 : 0;