From d8873a26e08b818091a95c417c4b5bcca542e9e8 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 10 May 2026 19:53:20 +0800 Subject: [PATCH] Stub scheduler policy syscalls Add sched_{setparam,setscheduler,getscheduler,getparam,get_priority_min, get_priority_max,rr_get_interval}. Newer musl and glibc pthread_create probe the parent thread's policy via sched_getscheduler before spawning a worker; returning ENOSYS broke thread startup for guests that exercised that path. Pid arguments are TIDs in Linux scheduler ABI, not TGIDs, so a worker calling sched_*(gettid()) must reach its own thread entry. Live TIDs are matched through thread_tid_alive(); unknown pids return ESRCH. Setter errno ordering tracks Linux 6.x kernel/sched/syscalls.c: copy_from_user runs before find_process_by_pid, so a bad sched_param pointer plus an unknown pid yields EFAULT, not ESRCH. Any policy transition away from SCHED_OTHER is refused. SCHED_BATCH and SCHED_IDLE return EPERM at priority 0 rather than being silently accepted while the stub keeps reporting SCHED_OTHER. SCHED_FIFO and SCHED_RR collapse to EINVAL when priority is outside [1, 99] and to EPERM when valid; SCHED_DEADLINE through the legacy entry is EINVAL because the syscall cannot supply deadline attributes (real Linux requires sched_setattr). SCHED_RESET_ON_FORK (0x40000000) is parsed off the policy before validation; any other high bit is EINVAL. sched_rr_get_interval reports a 100 ms slice, matching the sched_rr_timeslice default and a typical CFS quantum, instead of a zero timespec that would mislead callers branching on the value. sched_getaffinity also runs through the per-thread pid gate so unknown pids return ESRCH instead of silently succeeding. 
--- Makefile | 5 + src/syscall/abi.h | 21 +++ src/syscall/dispatch.tbl | 7 + src/syscall/sys.c | 184 ++++++++++++++++++++++- src/syscall/sys.h | 10 ++ src/syscall/syscall.c | 9 +- tests/manifest.txt | 3 + tests/test-sched-policy.c | 306 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 543 insertions(+), 2 deletions(-) create mode 100644 tests/test-sched-policy.c diff --git a/Makefile b/Makefile index 8690e60..78c96bf 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,11 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-sched-policy spawns a pthread to verify per-thread TID lookup +$(BUILD_DIR)/test-sched-policy: tests/test-sched-policy.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + # test-signalfd-hardening needs -lpthread for the worker-thread tid # regression case in test_rt_sigqueueinfo_rejects_thread_tid. $(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_DIR) diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 12e3a25..0655a18 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -87,9 +87,16 @@ #define SYS_clock_gettime 113 #define SYS_clock_getres 114 #define SYS_clock_nanosleep 115 +#define SYS_sched_setparam 118 +#define SYS_sched_setscheduler 119 +#define SYS_sched_getscheduler 120 +#define SYS_sched_getparam 121 #define SYS_sched_setaffinity 122 #define SYS_sched_getaffinity 123 #define SYS_sched_yield 124 +#define SYS_sched_get_priority_max 125 +#define SYS_sched_get_priority_min 126 +#define SYS_sched_rr_get_interval 127 #define SYS_kill 129 #define SYS_tgkill 131 #define SYS_sigaltstack 132 @@ -479,6 +486,20 @@ typedef struct { int64_t tv_sec, tv_usec; } linux_timeval_t; +/* Linux scheduling policies (asm-generic/sched.h). 
*/ +#define LINUX_SCHED_NORMAL 0 +#define LINUX_SCHED_FIFO 1 +#define LINUX_SCHED_RR 2 +#define LINUX_SCHED_BATCH 3 +#define LINUX_SCHED_IDLE 5 +#define LINUX_SCHED_DEADLINE 6 +#define LINUX_SCHED_RESET_ON_FORK 0x40000000 + +/* Linux struct sched_param (POSIX); only sched_priority is exposed. */ +typedef struct { + int32_t sched_priority; +} linux_sched_param_t; + /* Linux struct statfs (aarch64). */ typedef struct { int64_t f_type, f_bsize; diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index 7ccb457..56857db 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -163,9 +163,16 @@ SYS_setsid sc_setsid 0 SYS_getgroups sc_getgroups 1 SYS_setpriority sc_setpriority 0 SYS_getpriority sc_getpriority 0 +SYS_sched_setparam sc_sched_setparam 0 +SYS_sched_setscheduler sc_sched_setscheduler 0 +SYS_sched_getscheduler sc_sched_getscheduler 0 +SYS_sched_getparam sc_sched_getparam 0 SYS_sched_setaffinity sc_sched_setaffinity 1 SYS_sched_getaffinity sc_sched_getaffinity 1 SYS_sched_yield sc_sched_yield 0 +SYS_sched_get_priority_max sc_sched_get_priority_max 0 +SYS_sched_get_priority_min sc_sched_get_priority_min 0 +SYS_sched_rr_get_interval sc_sched_rr_get_interval 0 SYS_set_tid_address sc_set_tid_address 0 SYS_capget sc_capget 1 SYS_capset sc_capset 0 diff --git a/src/syscall/sys.c b/src/syscall/sys.c index fef7595..ee4cde2 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -74,6 +74,12 @@ _Static_assert(offsetof(struct rusage, ru_maxrss) == offsetof(linux_rusage_t, ru_maxrss), "ru_maxrss offset must stay aligned for fast translation"); +/* Defined below in the scheduler-policy section; forward-declared so + * sys_sched_getaffinity (which sits above the policy stubs) can share the + * same per-thread TID gate. 
+ */ +static bool sched_pid_alive(int pid); + static void groups_init_cached_linux_groups(void) { gid_t groups[64]; @@ -246,10 +252,13 @@ int64_t sys_sched_getaffinity(guest_t *g, uint64_t size, uint64_t mask_gva) { - (void) pid; /* Return a 1-CPU affinity mask for simplicity. * sched_setaffinity is not implemented; all threads see CPU 0. */ + if (pid < 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; if (size < 8) return -LINUX_EINVAL; @@ -283,6 +292,179 @@ int64_t sys_sched_getaffinity(guest_t *g, return 8; /* Returns size of written mask */ } +/* Scheduler policy stubs. + * + * elfuse models a single SCHED_OTHER thread group. Linux scheduler syscalls + * are per-thread: the pid argument is actually a TID, and a worker calling + * sched_getscheduler(gettid()) must reach its own thread entry, not just the + * thread-group leader. Live TIDs are matched via thread_tid_alive(); pid 0 + * means "the calling thread" and is always accepted. + * + * Any policy transition away from SCHED_OTHER is rejected unless the stub can + * model it faithfully. Callers branch on the apparent policy, and silently + * accepting BATCH/IDLE/RT classes while still reporting SCHED_OTHER would + * hide guest bugs. SCHED_DEADLINE through legacy sched_setscheduler is + * -EINVAL because the legacy syscall cannot supply the deadline attributes + * (real Linux requires sched_setattr). + * + * Errno ordering follows Linux 6.x kernel/sched/syscalls.c: + * 1. pid < 0 or NULL param pointer -> EINVAL + * 2. copy_from_user (setters) -> EFAULT before pid lookup + * 3. find_process_by_pid -> ESRCH + * 4. policy/priority validation -> EINVAL or EPERM + * Getters that only write back leave EFAULT for the final copy_to_user step, + * matching the kernel's order. 
+ */ +static bool sched_pid_alive(int pid) +{ + if (pid == 0) + return true; + if (pid == (int) proc_get_pid()) + return true; + return thread_tid_alive((int64_t) pid) != 0; +} + +/* Validate a sched_param for the named policy. Returns 0 if accepted by the + * stub, or a negative Linux errno. EPERM is reserved for "request would be + * valid on Linux but the stub refuses to honor it" -- RT priority elevation + * and BATCH/IDLE class transitions away from SCHED_OTHER. EINVAL covers + * every other out-of-spec input (bad priority range, SCHED_DEADLINE through + * the legacy entry point, unknown policy bits). + */ +static int sched_check_policy_param(int policy, int prio) +{ + int base_policy = policy & ~LINUX_SCHED_RESET_ON_FORK; + + /* Reject any unknown high bit. The mask 0x7 covers every base policy + * id we recognize (NORMAL=0, FIFO=1, RR=2, BATCH=3, IDLE=5, DEADLINE=6); + * the unused 4 and 7 fall through to the default switch arm below. + */ + if (policy & ~(LINUX_SCHED_RESET_ON_FORK | 0x7)) + return -LINUX_EINVAL; + + switch (base_policy) { + case LINUX_SCHED_NORMAL: + return prio == 0 ? 0 : -LINUX_EINVAL; + case LINUX_SCHED_BATCH: + case LINUX_SCHED_IDLE: + return prio == 0 ? 
-LINUX_EPERM : -LINUX_EINVAL; + case LINUX_SCHED_FIFO: + case LINUX_SCHED_RR: + if (prio < 1 || prio > 99) + return -LINUX_EINVAL; + return -LINUX_EPERM; + case LINUX_SCHED_DEADLINE: + return -LINUX_EINVAL; + default: + return -LINUX_EINVAL; + } +} + +int64_t sys_sched_getscheduler(int pid) +{ + if (pid < 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + return LINUX_SCHED_NORMAL; +} + +int64_t sys_sched_getparam(guest_t *g, int pid, uint64_t param_gva) +{ + if (pid < 0 || param_gva == 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + linux_sched_param_t param = {0}; + if (guest_write_small(g, param_gva, ¶m, sizeof(param)) < 0) + return -LINUX_EFAULT; + return 0; +} + +int64_t sys_sched_setscheduler(guest_t *g, + int pid, + int policy, + uint64_t param_gva) +{ + if (pid < 0 || param_gva == 0) + return -LINUX_EINVAL; + linux_sched_param_t param; + if (guest_read_small(g, param_gva, ¶m, sizeof(param)) < 0) + return -LINUX_EFAULT; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + return sched_check_policy_param(policy, param.sched_priority); +} + +int64_t sys_sched_setparam(guest_t *g, int pid, uint64_t param_gva) +{ + if (pid < 0 || param_gva == 0) + return -LINUX_EINVAL; + linux_sched_param_t param; + if (guest_read_small(g, param_gva, ¶m, sizeof(param)) < 0) + return -LINUX_EFAULT; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + /* Current policy is SCHED_OTHER, so only priority 0 is valid. Any other + * value mirrors the kernel's EINVAL for non-RT priority changes. 
+ */ + if (param.sched_priority != 0) + return -LINUX_EINVAL; + return 0; +} + +int64_t sys_sched_get_priority_min(int policy) +{ + switch (policy) { + case LINUX_SCHED_NORMAL: + case LINUX_SCHED_BATCH: + case LINUX_SCHED_IDLE: + case LINUX_SCHED_DEADLINE: + return 0; + case LINUX_SCHED_FIFO: + case LINUX_SCHED_RR: + return 1; + default: + return -LINUX_EINVAL; + } +} + +int64_t sys_sched_get_priority_max(int policy) +{ + switch (policy) { + case LINUX_SCHED_NORMAL: + case LINUX_SCHED_BATCH: + case LINUX_SCHED_IDLE: + case LINUX_SCHED_DEADLINE: + return 0; + case LINUX_SCHED_FIFO: + case LINUX_SCHED_RR: + return 99; + default: + return -LINUX_EINVAL; + } +} + +int64_t sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva) +{ + if (pid < 0) + return -LINUX_EINVAL; + if (!sched_pid_alive(pid)) + return -LINUX_ESRCH; + if (ts_gva == 0) + return -LINUX_EFAULT; + /* Linux's fair_sched_class.get_rr_interval returns a CFS-derived slice + * for SCHED_OTHER tasks whenever the runqueue carries load. Reporting + * 100 ms (the sched_rr_timeslice default and a typical CFS quantum) + * gives querying tools a plausible non-zero value without pretending + * the guest is actually under SCHED_RR. 
+ */ + linux_timespec_t ts = {.tv_sec = 0, .tv_nsec = 100 * 1000 * 1000L}; + if (guest_write_small(g, ts_gva, &ts, sizeof(ts)) < 0) + return -LINUX_EFAULT; + return 0; +} + int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva) { int ngroups = get_cached_linux_groups(); diff --git a/src/syscall/sys.h b/src/syscall/sys.h index 8025d2b..d2714bc 100644 --- a/src/syscall/sys.h +++ b/src/syscall/sys.h @@ -30,6 +30,16 @@ int64_t sys_sched_getaffinity(guest_t *g, int pid, uint64_t size, uint64_t mask_gva); +int64_t sys_sched_getscheduler(int pid); +int64_t sys_sched_getparam(guest_t *g, int pid, uint64_t param_gva); +int64_t sys_sched_setscheduler(guest_t *g, + int pid, + int policy, + uint64_t param_gva); +int64_t sys_sched_setparam(guest_t *g, int pid, uint64_t param_gva); +int64_t sys_sched_get_priority_min(int policy); +int64_t sys_sched_get_priority_max(int policy); +int64_t sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva); int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva); int64_t sys_getrusage(guest_t *g, int who, uint64_t usage_gva); int64_t sys_sysinfo(guest_t *g, uint64_t info_gva); diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index a75fa73..5af65cb 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -313,7 +313,14 @@ SC_FORWARD(sc_getrlimit, sys_prlimit64(g, 0, (int) x0, 0, x1)) SC_FORWARD(sc_setrlimit, sys_prlimit64(g, 0, (int) x0, x1, 0)) SC_FORWARD(sc_getgroups, sys_getgroups(g, (int) x0, x1)) SC_FORWARD(sc_getrusage, sys_getrusage(g, (int) x0, x1)) -SC_FORWARD(sc_sched_getaffinity, sys_sched_getaffinity(g, (int) x0, x1, x2)) +SC_FORWARD(sc_sched_getaffinity, sys_sched_getaffinity(g, (int) x0, x1, x2)) +SC_FORWARD(sc_sched_getscheduler, sys_sched_getscheduler((int) x0)) +SC_FORWARD(sc_sched_getparam, sys_sched_getparam(g, (int) x0, x1)) +SC_FORWARD(sc_sched_setscheduler, sys_sched_setscheduler(g, (int) x0, (int) x1, x2)) +SC_FORWARD(sc_sched_setparam, sys_sched_setparam(g, (int) x0, x1)) 
+SC_FORWARD(sc_sched_get_priority_min, sys_sched_get_priority_min((int) x0)) +SC_FORWARD(sc_sched_get_priority_max, sys_sched_get_priority_max((int) x0)) +SC_FORWARD(sc_sched_rr_get_interval, sys_sched_rr_get_interval(g, (int) x0, x1)) /* Process identity is modeled as one Linux process inside this elfuse instance. */ SC_FORWARD(sc_exit, SC_EXIT_SENTINEL | ((int) x0 & 0xFF)) diff --git a/tests/manifest.txt b/tests/manifest.txt index 3fbdb75..49136eb 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -147,6 +147,9 @@ test-sysv-shm [section] Credential/identity emulation tests test-credentials # diff=skip +[section] Scheduler policy stub tests +test-sched-policy + [section] membarrier tests test-membarrier diff --git a/tests/test-sched-policy.c b/tests/test-sched-policy.c new file mode 100644 index 0000000..9e592d5 --- /dev/null +++ b/tests/test-sched-policy.c @@ -0,0 +1,306 @@ +/* Test scheduler policy stub syscalls + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Exercises sched_setparam (118), sched_setscheduler (119), + * sched_getscheduler (120), sched_getparam (121), + * sched_get_priority_min (125), sched_get_priority_max (126), + * sched_rr_get_interval (127). The implementation is a stub: getters + * report SCHED_OTHER and a zero priority, RT priority elevation is + * refused with -EPERM, SCHED_DEADLINE through the legacy entry point + * is -EINVAL, and all entries validate guest pointers (-EFAULT). 
+ */ + +#include <errno.h> +#include <pthread.h> +#include <sched.h> +#include <stdio.h> + +#include "test-harness.h" +#include "raw-syscall.h" + +#define __NR_sched_setparam 118 +#define __NR_sched_setscheduler 119 +#define __NR_sched_getscheduler 120 +#define __NR_sched_getparam 121 +#define __NR_sched_get_priority_max 125 +#define __NR_sched_get_priority_min 126 +#define __NR_sched_rr_get_interval 127 + +#define LINUX_SCHED_DEADLINE 6 +#define LINUX_SCHED_RESET_ON_FORK 0x40000000 + +struct sched_param_compat { + int sched_priority; +}; + +struct ts_compat { + long tv_sec; + long tv_nsec; +}; + +struct worker_result { + long getscheduler_rc; + long getparam_rc; + int param_priority; + long setscheduler_self_rc; + long getaffinity_rc; +}; + +static void *worker_main(void *arg) +{ + struct worker_result *out = arg; + long tid = raw_syscall0(178 /* gettid */); + + out->getscheduler_rc = raw_syscall1(__NR_sched_getscheduler, tid); + + struct sched_param_compat p = {.sched_priority = 0xdead}; + out->getparam_rc = raw_syscall2(__NR_sched_getparam, tid, (long) &p); + out->param_priority = p.sched_priority; + + struct sched_param_compat zero = {.sched_priority = 0}; + out->setscheduler_self_rc = + raw_syscall3(__NR_sched_setscheduler, tid, SCHED_OTHER, (long) &zero); + + unsigned long mask = 0; + out->getaffinity_rc = raw_syscall3(123 /* sched_getaffinity */, tid, + sizeof(mask), (long) &mask); + + return NULL; +} + +int main(void) +{ + int passes = 0, fails = 0; + + printf("test-sched-policy: scheduler policy stub coverage\n"); + + TEST("getscheduler(0) == SCHED_OTHER"); + EXPECT_EQ(raw_syscall1(__NR_sched_getscheduler, 0), SCHED_OTHER, + "default policy is not SCHED_OTHER"); + + TEST("getscheduler(-1) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_getscheduler, -1), -EINVAL, + "negative pid did not return EINVAL"); + + TEST("getscheduler(99999) == -ESRCH"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_getscheduler, 99999), -ESRCH, + "unknown pid did not return ESRCH"); + + TEST("getparam(0, &p) zeroes 
priority"); + { + struct sched_param_compat p = {.sched_priority = 0x5a5a}; + long rc = raw_syscall2(__NR_sched_getparam, 0, (long) &p); + EXPECT_TRUE(rc == 0 && p.sched_priority == 0, + "getparam did not return zero priority"); + } + + TEST("getparam(0, NULL) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_getparam, 0, 0), -EINVAL, + "NULL param pointer did not return EINVAL"); + + TEST("getparam(0, bogus) == -EFAULT"); + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_getparam, 0, (long) 0x1), -EFAULT, + "unmapped param did not return EFAULT"); + + TEST("setscheduler(SCHED_OTHER, prio=0) == 0"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_EQ( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_OTHER, (long) &p), 0, + "SCHED_OTHER prio=0 was not accepted"); + } + + TEST("setscheduler(SCHED_OTHER, prio=1) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 1}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_OTHER, (long) &p), + -EINVAL, "non-zero priority for SCHED_OTHER was accepted"); + } + + TEST("setscheduler(SCHED_BATCH, prio=0) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_BATCH, (long) &p), + -EPERM, "SCHED_BATCH was accepted without state tracking"); + } + + TEST("setscheduler(SCHED_IDLE, prio=0) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_IDLE, (long) &p), + -EPERM, "SCHED_IDLE was accepted without state tracking"); + } + + TEST("setscheduler(SCHED_FIFO, prio=50) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 50}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_FIFO, (long) &p), + -EPERM, "SCHED_FIFO with valid prio did not return EPERM"); + } + + TEST("setscheduler(SCHED_FIFO, prio=0) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 0}; + 
EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_FIFO, (long) &p), + -EINVAL, "SCHED_FIFO with prio=0 did not return EINVAL"); + } + + TEST("setscheduler(SCHED_FIFO, prio=100) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 100}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_FIFO, (long) &p), + -EINVAL, "SCHED_FIFO with prio=100 did not return EINVAL"); + } + + TEST("setscheduler(SCHED_RR, prio=50) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 50}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, SCHED_RR, (long) &p), + -EPERM, "SCHED_RR with valid prio did not return EPERM"); + } + + TEST("setscheduler(SCHED_DEADLINE) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO(raw_syscall3(__NR_sched_setscheduler, 0, + LINUX_SCHED_DEADLINE, (long) &p), + -EINVAL, + "SCHED_DEADLINE legacy syscall did not return EINVAL"); + } + + TEST("setscheduler(SCHED_OTHER|RESET_ON_FORK, prio=0) == 0"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_EQ( + raw_syscall3(__NR_sched_setscheduler, 0, + SCHED_OTHER | LINUX_SCHED_RESET_ON_FORK, (long) &p), + 0, "SCHED_RESET_ON_FORK flag was not accepted"); + } + + TEST("setscheduler(SCHED_FIFO|RESET_ON_FORK, prio=50) == -EPERM"); + { + struct sched_param_compat p = {.sched_priority = 50}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, + SCHED_FIFO | LINUX_SCHED_RESET_ON_FORK, (long) &p), + -EPERM, "SCHED_RESET_ON_FORK changed SCHED_FIFO semantics"); + } + + TEST("setscheduler(_, _, bogus) + bad pid -> EFAULT"); + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 99999, SCHED_OTHER, (long) 0x1), + -EFAULT, "bad param pointer did not take precedence over ESRCH"); + + TEST("setscheduler(garbage policy) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_RAW_ERRNO( + raw_syscall3(__NR_sched_setscheduler, 0, 99, (long) &p), -EINVAL, + 
"unknown policy did not return EINVAL"); + } + + TEST("setscheduler(_, _, NULL) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall3(__NR_sched_setscheduler, 0, SCHED_OTHER, 0), + -EINVAL, "NULL param pointer did not return EINVAL"); + + TEST("setparam(0, prio=0) == 0"); + { + struct sched_param_compat p = {.sched_priority = 0}; + EXPECT_EQ(raw_syscall2(__NR_sched_setparam, 0, (long) &p), 0, + "setparam(prio=0) was rejected"); + } + + TEST("setparam(0, prio=1) == -EINVAL"); + { + struct sched_param_compat p = {.sched_priority = 1}; + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_setparam, 0, (long) &p), + -EINVAL, "setparam(prio=1) was accepted"); + } + + TEST("get_priority_min(SCHED_OTHER) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_min, SCHED_OTHER), 0, + "min for SCHED_OTHER not 0"); + + TEST("get_priority_max(SCHED_OTHER) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_max, SCHED_OTHER), 0, + "max for SCHED_OTHER not 0"); + + TEST("get_priority_min(SCHED_FIFO) == 1"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_min, SCHED_FIFO), 1, + "min for SCHED_FIFO not 1"); + + TEST("get_priority_max(SCHED_FIFO) == 99"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_max, SCHED_FIFO), 99, + "max for SCHED_FIFO not 99"); + + TEST("get_priority_min(SCHED_DEADLINE) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_min, LINUX_SCHED_DEADLINE), + 0, "min for SCHED_DEADLINE not 0"); + + TEST("get_priority_max(SCHED_DEADLINE) == 0"); + EXPECT_EQ(raw_syscall1(__NR_sched_get_priority_max, LINUX_SCHED_DEADLINE), + 0, "max for SCHED_DEADLINE not 0"); + + TEST("get_priority_min(garbage) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_get_priority_min, 42), -EINVAL, + "unknown policy min did not return EINVAL"); + + TEST("get_priority_max(garbage) == -EINVAL"); + EXPECT_RAW_ERRNO(raw_syscall1(__NR_sched_get_priority_max, 42), -EINVAL, + "unknown policy max did not return EINVAL"); + + TEST("rr_get_interval(0, &ts) writes plausible slice"); + { + struct 
ts_compat ts = {.tv_sec = 7, .tv_nsec = 11}; + long rc = raw_syscall2(__NR_sched_rr_get_interval, 0, (long) &ts); + EXPECT_TRUE(rc == 0 && ts.tv_sec == 0 && ts.tv_nsec > 0 && + ts.tv_nsec < 1000 * 1000 * 1000L, + "rr_get_interval did not write a sane sub-second slice"); + } + + TEST("rr_get_interval(0, NULL) == -EFAULT"); + EXPECT_RAW_ERRNO(raw_syscall2(__NR_sched_rr_get_interval, 0, 0), -EFAULT, + "NULL ts pointer did not return EFAULT"); + + /* Worker-thread coverage: scheduler syscalls take a TID, not a TGID, so a + * worker calling sched_*(gettid()) must succeed. This also exercises + * sched_getaffinity's per-thread pid gate. + */ + { + pthread_t worker; + struct worker_result r = {0}; + if (pthread_create(&worker, NULL, worker_main, &r) != 0) { + TEST("pthread_create(worker)"); + FAIL("pthread_create failed"); + } else { + pthread_join(worker, NULL); + + TEST("worker getscheduler(gettid()) == SCHED_OTHER"); + EXPECT_EQ(r.getscheduler_rc, SCHED_OTHER, + "worker did not see SCHED_OTHER"); + + TEST("worker getparam(gettid()) zeroes priority"); + EXPECT_TRUE(r.getparam_rc == 0 && r.param_priority == 0, + "worker getparam mismatch"); + + TEST("worker setscheduler(gettid(), OTHER, 0) == 0"); + EXPECT_EQ(r.setscheduler_self_rc, 0, + "worker self-setscheduler rejected"); + + TEST("worker getaffinity(gettid()) succeeds"); + EXPECT_TRUE(r.getaffinity_rc > 0, "worker getaffinity failed"); + } + } + + SUMMARY("test-sched-policy"); + return fails == 0 ? 0 : 1; +}