From d8873a26e08b818091a95c417c4b5bcca542e9e8 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 10 May 2026 19:53:20 +0800 Subject: [PATCH] Stub scheduler policy syscalls Add sched_{setparam,setscheduler,getscheduler,getparam,get_priority_min, get_priority_max,rr_get_interval}. Newer musl and glibc pthread_create probe the parent thread's policy via sched_getscheduler before spawning a worker; returning ENOSYS broke thread startup for guests that exercised that path. Pid arguments are TIDs in Linux scheduler ABI, not TGIDs, so a worker calling sched_*(gettid()) must reach its own thread entry. Live TIDs are matched through thread_tid_alive(); unknown pids return ESRCH. Setter errno ordering tracks Linux 6.x kernel/sched/syscalls.c: copy_from_user runs before find_process_by_pid, so a bad sched_param pointer plus an unknown pid yields EFAULT, not ESRCH. Any policy transition away from SCHED_OTHER is refused. SCHED_BATCH and SCHED_IDLE return EPERM at priority 0 rather than being silently accepted while the stub keeps reporting SCHED_OTHER. SCHED_FIFO and SCHED_RR collapse to EINVAL when priority is outside [1, 99] and to EPERM when valid; SCHED_DEADLINE through the legacy entry is EINVAL because the syscall cannot supply deadline attributes (real Linux requires sched_setattr). SCHED_RESET_ON_FORK (0x40000000) is parsed off the policy before validation; any other high bit is EINVAL. sched_rr_get_interval reports a 100 ms slice, matching the sched_rr_timeslice default and a typical CFS quantum, instead of a zero timespec that would mislead callers branching on the value. sched_getaffinity also runs through the per-thread pid gate so unknown pids return ESRCH instead of silently succeeding. 
--- Makefile | 5 + src/syscall/abi.h | 21 +++ src/syscall/dispatch.tbl | 7 + src/syscall/sys.c | 184 ++++++++++++++++++++++- src/syscall/sys.h | 10 ++ src/syscall/syscall.c | 9 +- tests/manifest.txt | 3 + tests/test-sched-policy.c | 306 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 543 insertions(+), 2 deletions(-) create mode 100644 tests/test-sched-policy.c diff --git a/Makefile b/Makefile index 8690e60..78c96bf 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,11 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-sched-policy spawns a pthread to verify per-thread TID lookup +$(BUILD_DIR)/test-sched-policy: tests/test-sched-policy.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + # test-signalfd-hardening needs -lpthread for the worker-thread tid # regression case in test_rt_sigqueueinfo_rejects_thread_tid. $(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_DIR) diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 12e3a25..0655a18 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -87,9 +87,16 @@ #define SYS_clock_gettime 113 #define SYS_clock_getres 114 #define SYS_clock_nanosleep 115 +#define SYS_sched_setparam 118 +#define SYS_sched_setscheduler 119 +#define SYS_sched_getscheduler 120 +#define SYS_sched_getparam 121 #define SYS_sched_setaffinity 122 #define SYS_sched_getaffinity 123 #define SYS_sched_yield 124 +#define SYS_sched_get_priority_max 125 +#define SYS_sched_get_priority_min 126 +#define SYS_sched_rr_get_interval 127 #define SYS_kill 129 #define SYS_tgkill 131 #define SYS_sigaltstack 132 @@ -479,6 +486,20 @@ typedef struct { int64_t tv_sec, tv_usec; } linux_timeval_t; +/* Linux scheduling policies (asm-generic/sched.h). 
*/ +#define LINUX_SCHED_NORMAL 0 +#define LINUX_SCHED_FIFO 1 +#define LINUX_SCHED_RR 2 +#define LINUX_SCHED_BATCH 3 +#define LINUX_SCHED_IDLE 5 +#define LINUX_SCHED_DEADLINE 6 +#define LINUX_SCHED_RESET_ON_FORK 0x40000000 + +/* Linux struct sched_param (POSIX); only sched_priority is exposed. */ +typedef struct { + int32_t sched_priority; +} linux_sched_param_t; + /* Linux struct statfs (aarch64). */ typedef struct { int64_t f_type, f_bsize; diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index 7ccb457..56857db 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -163,9 +163,16 @@ SYS_setsid sc_setsid 0 SYS_getgroups sc_getgroups 1 SYS_setpriority sc_setpriority 0 SYS_getpriority sc_getpriority 0 +SYS_sched_setparam sc_sched_setparam 0 +SYS_sched_setscheduler sc_sched_setscheduler 0 +SYS_sched_getscheduler sc_sched_getscheduler 0 +SYS_sched_getparam sc_sched_getparam 0 SYS_sched_setaffinity sc_sched_setaffinity 1 SYS_sched_getaffinity sc_sched_getaffinity 1 SYS_sched_yield sc_sched_yield 0 +SYS_sched_get_priority_max sc_sched_get_priority_max 0 +SYS_sched_get_priority_min sc_sched_get_priority_min 0 +SYS_sched_rr_get_interval sc_sched_rr_get_interval 0 SYS_set_tid_address sc_set_tid_address 0 SYS_capget sc_capget 1 SYS_capset sc_capset 0 diff --git a/src/syscall/sys.c b/src/syscall/sys.c index fef7595..ee4cde2 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -74,6 +74,12 @@ _Static_assert(offsetof(struct rusage, ru_maxrss) == offsetof(linux_rusage_t, ru_maxrss), "ru_maxrss offset must stay aligned for fast translation"); +/* Defined below in the scheduler-policy section; forward-declared so + * sys_sched_getaffinity (which sits above the policy stubs) can share the + * same per-thread TID gate. 
+ */ +static bool sched_pid_alive(int pid); + static void groups_init_cached_linux_groups(void) { gid_t groups[64]; @@ -246,10 +252,13 @@ int64_t sys_sched_getaffinity(guest_t *g, uint64_t size, uint64_t mask_gva) { - (void) pid; /* Return a 1-CPU affinity mask for simplicity. * sched_setaffinity is not implemented; all threads see CPU 0. */ + if (pid < 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; if (size < 8) return -LINUX_EINVAL; @@ -283,6 +292,179 @@ int64_t sys_sched_getaffinity(guest_t *g, return 8; /* Returns size of written mask */ } +/* Scheduler policy stubs. + * + * elfuse models a single SCHED_OTHER thread group. Linux scheduler syscalls + * are per-thread: the pid argument is actually a TID, and a worker calling + * sched_getscheduler(gettid()) must reach its own thread entry, not just the + * thread-group leader. Live TIDs are matched via thread_tid_alive(); pid 0 + * means "the calling thread" and is always accepted. + * + * Any policy transition away from SCHED_OTHER is rejected unless the stub can + * model it faithfully. Callers branch on the apparent policy, and silently + * accepting BATCH/IDLE/RT classes while still reporting SCHED_OTHER would + * hide guest bugs. SCHED_DEADLINE through legacy sched_setscheduler is + * -EINVAL because the legacy syscall cannot supply the deadline attributes + * (real Linux requires sched_setattr). + * + * Errno ordering follows Linux 6.x kernel/sched/syscalls.c: + * 1. pid < 0 or NULL param pointer -> EINVAL + * 2. copy_from_user (setters) -> EFAULT before pid lookup + * 3. find_process_by_pid -> ESRCH + * 4. policy/priority validation -> EINVAL or EPERM + * Getters that only write back leave EFAULT for the final copy_to_user step, + * matching the kernel's order. 
+ */ +static bool sched_pid_alive(int pid) +{ + if (pid == 0) + return true; + if (pid == (int) proc_get_pid()) + return true; + return thread_tid_alive((int64_t) pid) != 0; +} + +/* Validate a sched_param for the named policy. Returns 0 if accepted by the + * stub, or a negative Linux errno. EPERM is reserved for "request would be + * valid on Linux but the stub refuses to honor it" -- RT priority elevation + * and BATCH/IDLE class transitions away from SCHED_OTHER. EINVAL covers + * every other out-of-spec input (bad priority range, SCHED_DEADLINE through + * the legacy entry point, unknown policy bits). + */ +static int sched_check_policy_param(int policy, int prio) +{ + int base_policy = policy & ~LINUX_SCHED_RESET_ON_FORK; + + /* Reject any unknown high bit. The mask 0x7 covers every base policy + * id we recognize (NORMAL=0, FIFO=1, RR=2, BATCH=3, IDLE=5, DEADLINE=6); + * the unused 4 and 7 fall through to the default switch arm below. + */ + if (policy & ~(LINUX_SCHED_RESET_ON_FORK | 0x7)) + return -LINUX_EINVAL; + + switch (base_policy) { + case LINUX_SCHED_NORMAL: + return prio == 0 ? 0 : -LINUX_EINVAL; + case LINUX_SCHED_BATCH: + case LINUX_SCHED_IDLE: + return prio == 0 ? 
-LINUX_EPERM : -LINUX_EINVAL; + case LINUX_SCHED_FIFO: + case LINUX_SCHED_RR: + if (prio < 1 || prio > 99) + return -LINUX_EINVAL; + return -LINUX_EPERM; + case LINUX_SCHED_DEADLINE: + return -LINUX_EINVAL; + default: + return -LINUX_EINVAL; + } +} + +int64_t sys_sched_getscheduler(int pid) +{ + if (pid < 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + return LINUX_SCHED_NORMAL; +} + +int64_t sys_sched_getparam(guest_t *g, int pid, uint64_t param_gva) +{ + if (pid < 0 || param_gva == 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + linux_sched_param_t param = {0}; + if (guest_write_small(g, param_gva, ¶m, sizeof(param)) < 0) + return -LINUX_EFAULT; + return 0; +} + +int64_t sys_sched_setscheduler(guest_t *g, + int pid, + int policy, + uint64_t param_gva) +{ + if (pid < 0 || param_gva == 0) + return -LINUX_EINVAL; + linux_sched_param_t param; + if (guest_read_small(g, param_gva, ¶m, sizeof(param)) < 0) + return -LINUX_EFAULT; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + return sched_check_policy_param(policy, param.sched_priority); +} + +int64_t sys_sched_setparam(guest_t *g, int pid, uint64_t param_gva) +{ + if (pid < 0 || param_gva == 0) + return -LINUX_EINVAL; + linux_sched_param_t param; + if (guest_read_small(g, param_gva, ¶m, sizeof(param)) < 0) + return -LINUX_EFAULT; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + /* Current policy is SCHED_OTHER, so only priority 0 is valid. Any other + * value mirrors the kernel's EINVAL for non-RT priority changes. 
+ */ + if (param.sched_priority != 0) + return -LINUX_EINVAL; + return 0; +} + +int64_t sys_sched_get_priority_min(int policy) +{ + switch (policy) { + case LINUX_SCHED_NORMAL: + case LINUX_SCHED_BATCH: + case LINUX_SCHED_IDLE: + case LINUX_SCHED_DEADLINE: + return 0; + case LINUX_SCHED_FIFO: + case LINUX_SCHED_RR: + return 1; + default: + return -LINUX_EINVAL; + } +} + +int64_t sys_sched_get_priority_max(int policy) +{ + switch (policy) { + case LINUX_SCHED_NORMAL: + case LINUX_SCHED_BATCH: + case LINUX_SCHED_IDLE: + case LINUX_SCHED_DEADLINE: + return 0; + case LINUX_SCHED_FIFO: + case LINUX_SCHED_RR: + return 99; + default: + return -LINUX_EINVAL; + } +} + +int64_t sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva) +{ + if (pid < 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + if (ts_gva == 0) + return -LINUX_EFAULT; + /* Linux's fair_sched_class.get_rr_interval returns a CFS-derived slice + * for SCHED_OTHER tasks whenever the runqueue carries load. Reporting + * 100 ms (the sched_rr_timeslice default and a typical CFS quantum) + * gives querying tools a plausible non-zero value without pretending + * the guest is actually under SCHED_RR. 
+ */ + linux_timespec_t ts = {.tv_sec = 0, .tv_nsec = 100 * 1000 * 1000L}; + if (guest_write_small(g, ts_gva, &ts, sizeof(ts)) < 0) + return -LINUX_EFAULT; + return 0; +} + int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva) { int ngroups = get_cached_linux_groups(); diff --git a/src/syscall/sys.h b/src/syscall/sys.h index 8025d2b..d2714bc 100644 --- a/src/syscall/sys.h +++ b/src/syscall/sys.h @@ -30,6 +30,16 @@ int64_t sys_sched_getaffinity(guest_t *g, int pid, uint64_t size, uint64_t mask_gva); +int64_t sys_sched_getscheduler(int pid); +int64_t sys_sched_getparam(guest_t *g, int pid, uint64_t param_gva); +int64_t sys_sched_setscheduler(guest_t *g, + int pid, + int policy, + uint64_t param_gva); +int64_t sys_sched_setparam(guest_t *g, int pid, uint64_t param_gva); +int64_t sys_sched_get_priority_min(int policy); +int64_t sys_sched_get_priority_max(int policy); +int64_t sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva); int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva); int64_t sys_getrusage(guest_t *g, int who, uint64_t usage_gva); int64_t sys_sysinfo(guest_t *g, uint64_t info_gva); diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index a75fa73..5af65cb 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -313,7 +313,14 @@ SC_FORWARD(sc_getrlimit, sys_prlimit64(g, 0, (int) x0, 0, x1)) SC_FORWARD(sc_setrlimit, sys_prlimit64(g, 0, (int) x0, x1, 0)) SC_FORWARD(sc_getgroups, sys_getgroups(g, (int) x0, x1)) SC_FORWARD(sc_getrusage, sys_getrusage(g, (int) x0, x1)) -SC_FORWARD(sc_sched_getaffinity, sys_sched_getaffinity(g, (int) x0, x1, x2)) +SC_FORWARD(sc_sched_getaffinity, sys_sched_getaffinity(g, (int) x0, x1, x2)) +SC_FORWARD(sc_sched_getscheduler, sys_sched_getscheduler((int) x0)) +SC_FORWARD(sc_sched_getparam, sys_sched_getparam(g, (int) x0, x1)) +SC_FORWARD(sc_sched_setscheduler, sys_sched_setscheduler(g, (int) x0, (int) x1, x2)) +SC_FORWARD(sc_sched_setparam, sys_sched_setparam(g, (int) x0, x1)) 
+SC_FORWARD(sc_sched_get_priority_min, sys_sched_get_priority_min((int) x0)) +SC_FORWARD(sc_sched_get_priority_max, sys_sched_get_priority_max((int) x0)) +SC_FORWARD(sc_sched_rr_get_interval, sys_sched_rr_get_interval(g, (int) x0, x1)) /* Process identity is modeled as one Linux process inside this elfuse instance. */ SC_FORWARD(sc_exit, SC_EXIT_SENTINEL | ((int) x0 & 0xFF)) diff --git a/tests/manifest.txt b/tests/manifest.txt index 3fbdb75..49136eb 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -147,6 +147,9 @@ test-sysv-shm [section] Credential/identity emulation tests test-credentials # diff=skip +[section] Scheduler policy stub tests +test-sched-policy + [section] membarrier tests test-membarrier diff --git a/tests/test-sched-policy.c b/tests/test-sched-policy.c new file mode 100644 index 0000000..9e592d5 --- /dev/null +++ b/tests/test-sched-policy.c @@ -0,0 +1,306 @@ +/* Test scheduler policy stub syscalls + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Exercises sched_setparam (118), sched_setscheduler (119), + * sched_getscheduler (120), sched_getparam (121), + * sched_get_priority_min (125), sched_get_priority_max (126), + * sched_rr_get_interval (127). The implementation is a stub: getters + * report SCHED_OTHER and a zero priority, RT priority elevation is + * refused with -EPERM, SCHED_DEADLINE through the legacy entry point + * is -EINVAL, and all entries validate guest pointers (-EFAULT). 
+ */ + +#include <errno.h> +#include <pthread.h> +#include <sched.h> +#include <stdio.h> + +#include "test-harness.h" +#include "raw-syscall.h" + +#define __NR_sched_setparam 118 +#define __NR_sched_setscheduler 119 +#define __NR_sched_getscheduler 120 +#define __NR_sched_getparam 121 +#define __NR_sched_get_priority_max 125 +#define __NR_sched_get_priority_min 126 +#define __NR_sched_rr_get_interval 127 + +#define LINUX_SCHED_DEADLINE 6 +#define LINUX_SCHED_RESET_ON_FORK 0x40000000 + +struct sched_param_compat { + int sched_priority; +}; + +struct ts_compat { + long tv_sec; + long tv_nsec; +}; + +struct worker_result { + long getscheduler_rc; + long getparam_rc; + int param_priority; + long setscheduler_self_rc; + long getaffinity_rc; +}; + +static void *worker_main(void *arg) +{ + struct worker_result *out = arg; + long tid = raw_syscall0(178 /* gettid */); + + out->getscheduler_rc = raw_syscall1(__NR_sched_getscheduler, tid); + + struct sched_param_compat p = {.sched_priority = 0xdead}; + out->getparam_rc = raw_syscall2(__NR_sched_getparam, tid, (long) &p); + out->param_priority = p.sched_priority; + + struct sched_param_compat zero = {.sched_priority = 0}; + out->setscheduler_self_rc = + raw_syscall3(__NR_sched_setscheduler, tid, SCHED_OTHER, (long) &zero); + + unsigned long mask = 0; + out->getaffinity_rc = raw_syscall3(123 /* sched_getaffinity */, tid, + sizeof(mask), (long) &mask); + + return NULL; +} + +int main(void) +{ + int passes = 0, fails = 0; + + printf("test-sched-policy: scheduler policy stub coverage\n"); + + TEST("getscheduler(0) == SCHED_OTHER"); + EXPECT_EQ(raw_syscall1(__NR_sched_getscheduler, 0), SCHED_OTHER, + "default policy is not SCHED_OTHER"); + + TEST("getscheduler(-1) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_getscheduler, -1), -EINVAL, + "negative pid did not return EINVAL"); + + TEST("getscheduler(99999) == -ESRCH"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_getscheduler, 99999), -ESRCH, + "unknown pid did not return ESRCH"); + + TEST("getparam(0, &p) zeroes 
priority"); + { + struct sched_param_compat p = {.sched_priority = 0x5a5a}; + long rc = raw_syscall2(__NR_sched_getparam, 0, (long) &p); + EXPECT_TRUE(rc == 0 && p.sched_priority == 0, + "getparam did not return zero priority"); + } + + TEST("getparam(0, NULL) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_getparam, 0, 0), -EINVAL, + "NULL param pointer did not return EINVAL"); + + TEST("getparam(0, bogus) == -EFAULT"); + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_getparam, 0, (long) 0x1), -EFAULT, + "unmapped param did not return EFAULT"); + + TEST("setscheduler(SCHED_OTHER, prio=0) == 0"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_EQ( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_OTHER, (long) &p), 0, + "SCHED_OTHER prio=0 was not accepted"); + } + + TEST("setscheduler(SCHED_OTHER, prio=1) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 1}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_OTHER, (long) &p), + -EINVAL, "non-zero priority for SCHED_OTHER was accepted"); + } + + TEST("setscheduler(SCHED_BATCH, prio=0) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_BATCH, (long) &p), + -EPERM, "SCHED_BATCH was accepted without state tracking"); + } + + TEST("setscheduler(SCHED_IDLE, prio=0) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_IDLE, (long) &p), + -EPERM, "SCHED_IDLE was accepted without state tracking"); + } + + TEST("setscheduler(SCHED_FIFO, prio=50) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 50}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_FIFO, (long) &p), + -EPERM, "SCHED_FIFO with valid prio did not return EPERM"); + } + + TEST("setscheduler(SCHED_FIFO, prio=0) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 0}; + 
EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_FIFO, (long) &p), + -EINVAL, "SCHED_FIFO with prio=0 did not return EINVAL"); + } + + TEST("setscheduler(SCHED_FIFO, prio=100) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 100}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_FIFO, (long) &p), + -EINVAL, "SCHED_FIFO with prio=100 did not return EINVAL"); + } + + TEST("setscheduler(SCHED_RR, prio=50) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 50}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_RR, (long) &p), + -EPERM, "SCHED_RR with valid prio did not return EPERM"); + } + + TEST("setscheduler(SCHED_DEADLINE) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO(raw_syscall3(__NR_sched_setscheduler, 0, + LINUX_SCHED_DEADLINE, (long) &p), + -EINVAL, + "SCHED_DEADLINE legacy syscall did not return EINVAL"); + } + + TEST("setscheduler(SCHED_OTHER|RESET_ON_FORK, prio=0) == 0"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_EQ( + raw_syscall3(__NR_sched_setscheduler, 0, + SCHED_OTHER | LINUX_SCHED_RESET_ON_FORK, (long) &p), + 0, "SCHED_RESET_ON_FORK flag was not accepted"); + } + + TEST("setscheduler(SCHED_FIFO|RESET_ON_FORK, prio=50) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 50}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, + SCHED_FIFO | LINUX_SCHED_RESET_ON_FORK, (long) &p), + -EPERM, "SCHED_RESET_ON_FORK changed SCHED_FIFO semantics"); + } + + TEST("setscheduler(_, _, bogus) + bad pid -> EFAULT"); + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 99999, SCHED_OTHER, (long) 0x1), + -EFAULT, "bad param pointer did not take precedence over ESRCH"); + + TEST("setscheduler(garbage policy) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, 99, (long) &p), -EINVAL, + 
"unknown policy did not return EINVAL"); + } + + TEST("setscheduler(_, _, NULL) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall3(__NR_sched_setscheduler, 0, SCHED_OTHER, 0), + -EINVAL, "NULL param pointer did not return EINVAL"); + + TEST("setparam(0, prio=0) == 0"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_EQ(raw_syscall2(__NR_sched_setparam, 0, (long) &p), 0, + "setparam(prio=0) was rejected"); + } + + TEST("setparam(0, prio=1) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 1}; + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_setparam, 0, (long) &p), + -EINVAL, "setparam(prio=1) was accepted"); + } + + TEST("get_priority_min(SCHED_OTHER) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_min, SCHED_OTHER), 0, + "min for SCHED_OTHER not 0"); + + TEST("get_priority_max(SCHED_OTHER) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_max, SCHED_OTHER), 0, + "max for SCHED_OTHER not 0"); + + TEST("get_priority_min(SCHED_FIFO) == 1"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_min, SCHED_FIFO), 1, + "min for SCHED_FIFO not 1"); + + TEST("get_priority_max(SCHED_FIFO) == 99"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_max, SCHED_FIFO), 99, + "max for SCHED_FIFO not 99"); + + TEST("get_priority_min(SCHED_DEADLINE) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_min, LINUX_SCHED_DEADLINE), + 0, "min for SCHED_DEADLINE not 0"); + + TEST("get_priority_max(SCHED_DEADLINE) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_max, LINUX_SCHED_DEADLINE), + 0, "max for SCHED_DEADLINE not 0"); + + TEST("get_priority_min(garbage) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_get_priority_min, 42), -EINVAL, + "unknown policy min did not return EINVAL"); + + TEST("get_priority_max(garbage) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_get_priority_max, 42), -EINVAL, + "unknown policy max did not return EINVAL"); + + TEST("rr_get_interval(0, &ts) writes plausible slice"); + { + struct 
ts_compat ts = {.tv_sec = 7, .tv_nsec = 11}; + long rc = raw_syscall2(__NR_sched_rr_get_interval, 0, (long) &ts); + EXPECT_TRUE(rc == 0 && ts.tv_sec == 0 && ts.tv_nsec > 0 && + ts.tv_nsec < 1000 * 1000 * 1000L, + "rr_get_interval did not write a sane sub-second slice"); + } + + TEST("rr_get_interval(0, NULL) == -EFAULT"); + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_rr_get_interval, 0, 0), -EFAULT, + "NULL ts pointer did not return EFAULT"); + + /* Worker-thread coverage: scheduler syscalls take a TID, not a TGID, so a + * worker calling sched_*(gettid()) must succeed. This also exercises + * sched_getaffinity's per-thread pid gate. + */ + { + pthread_t worker; + struct worker_result r = {0}; + if (pthread_create(&worker, NULL, worker_main, &r) != 0) { + TEST("pthread_create(worker)"); + FAIL("pthread_create failed"); + } else { + pthread_join(worker, NULL); + + TEST("worker getscheduler(gettid()) == SCHED_OTHER"); + EXPECT_EQ(r.getscheduler_rc, SCHED_OTHER, + "worker did not see SCHED_OTHER"); + + TEST("worker getparam(gettid()) zeroes priority"); + EXPECT_TRUE(r.getparam_rc == 0 && r.param_priority == 0, + "worker getparam mismatch"); + + TEST("worker setscheduler(gettid(), OTHER, 0) == 0"); + EXPECT_EQ(r.setscheduler_self_rc, 0, + "worker self-setscheduler rejected"); + + TEST("worker getaffinity(gettid()) succeeds"); + EXPECT_TRUE(r.getaffinity_rc > 0, "worker getaffinity failed"); + } + } + + SUMMARY("test-sched-policy"); + return fails == 0 ? 0 : 1; +}