Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-sched-policy spawns a pthread to verify per-thread TID lookup, so it
# is linked against libpthread with the same flags as the test-pthread rule.
$(BUILD_DIR)/test-sched-policy: tests/test-sched-policy.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-signalfd-hardening needs -lpthread for the worker-thread tid
# regression case in test_rt_sigqueueinfo_rejects_thread_tid.
$(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_DIR)
Expand Down
21 changes: 21 additions & 0 deletions src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,16 @@
#define SYS_clock_gettime 113
#define SYS_clock_getres 114
#define SYS_clock_nanosleep 115
#define SYS_sched_setparam 118
#define SYS_sched_setscheduler 119
#define SYS_sched_getscheduler 120
#define SYS_sched_getparam 121
#define SYS_sched_setaffinity 122
#define SYS_sched_getaffinity 123
#define SYS_sched_yield 124
#define SYS_sched_get_priority_max 125
#define SYS_sched_get_priority_min 126
#define SYS_sched_rr_get_interval 127
#define SYS_kill 129
#define SYS_tgkill 131
#define SYS_sigaltstack 132
Expand Down Expand Up @@ -479,6 +486,20 @@ typedef struct {
int64_t tv_sec, tv_usec;
} linux_timeval_t;

/* Linux scheduling policies (asm-generic/sched.h). */
#define LINUX_SCHED_NORMAL 0 /* SCHED_OTHER; the only policy the stubs report */
#define LINUX_SCHED_FIFO 1 /* real-time class; stubs validate priority 1..99 */
#define LINUX_SCHED_RR 2 /* real-time class; stubs validate priority 1..99 */
#define LINUX_SCHED_BATCH 3 /* priority must be 0 */
#define LINUX_SCHED_IDLE 5 /* priority must be 0; id 4 is not defined here */
#define LINUX_SCHED_DEADLINE 6 /* rejected via the legacy sched_setscheduler path */
#define LINUX_SCHED_RESET_ON_FORK 0x40000000 /* flag bit OR-ed into the policy argument */

/* Linux struct sched_param (POSIX); only sched_priority is exposed. */
typedef struct {
/* Scheduling priority as read from / written to guest memory by the
 * sched_{get,set}param and sched_setscheduler stubs. The stubs report 0 and
 * treat 1..99 as the SCHED_FIFO/SCHED_RR range.
 */
int32_t sched_priority;
} linux_sched_param_t;

/* Linux struct statfs (aarch64). */
typedef struct {
int64_t f_type, f_bsize;
Expand Down
7 changes: 7 additions & 0 deletions src/syscall/dispatch.tbl
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,16 @@ SYS_setsid sc_setsid 0
SYS_getgroups sc_getgroups 1
SYS_setpriority sc_setpriority 0
SYS_getpriority sc_getpriority 0
SYS_sched_setparam sc_sched_setparam 0
SYS_sched_setscheduler sc_sched_setscheduler 0
SYS_sched_getscheduler sc_sched_getscheduler 0
SYS_sched_getparam sc_sched_getparam 0
SYS_sched_setaffinity sc_sched_setaffinity 1
SYS_sched_getaffinity sc_sched_getaffinity 1
SYS_sched_yield sc_sched_yield 0
SYS_sched_get_priority_max sc_sched_get_priority_max 0
SYS_sched_get_priority_min sc_sched_get_priority_min 0
SYS_sched_rr_get_interval sc_sched_rr_get_interval 0
SYS_set_tid_address sc_set_tid_address 0
SYS_capget sc_capget 1
SYS_capset sc_capset 0
Expand Down
184 changes: 183 additions & 1 deletion src/syscall/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ _Static_assert(offsetof(struct rusage, ru_maxrss) ==
offsetof(linux_rusage_t, ru_maxrss),
"ru_maxrss offset must stay aligned for fast translation");

/* Defined below in the scheduler-policy section; forward-declared so
* sys_sched_getaffinity (which sits above the policy stubs) can share the
* same per-thread TID gate.
*/
static bool sched_pid_alive(int pid);

static void groups_init_cached_linux_groups(void)
{
gid_t groups[64];
Expand Down Expand Up @@ -246,10 +252,13 @@ int64_t sys_sched_getaffinity(guest_t *g,
uint64_t size,
uint64_t mask_gva)
{
(void) pid;
/* Return a 1-CPU affinity mask for simplicity.
* sched_setaffinity is not implemented; all threads see CPU 0.
*/
if (pid < 0)
return -LINUX_EINVAL;
if (!sched_pid_alive(pid))
return -LINUX_ESRCH;
if (size < 8)
return -LINUX_EINVAL;

Expand Down Expand Up @@ -283,6 +292,179 @@ int64_t sys_sched_getaffinity(guest_t *g,
return 8; /* Returns size of written mask */
}

/* Scheduler policy stubs.
*
* elfuse models a single SCHED_OTHER thread group. Linux scheduler syscalls
* are per-thread: the pid argument is actually a TID, and a worker calling
* sched_getscheduler(gettid()) must reach its own thread entry, not just the
* thread-group leader. Live TIDs are matched via thread_tid_alive(); pid 0
* means "the calling thread" and is always accepted.
*
* Any policy transition away from SCHED_OTHER is rejected unless the stub can
* model it faithfully. Callers branch on the apparent policy, and silently
* accepting BATCH/IDLE/RT classes while still reporting SCHED_OTHER would
* hide guest bugs. SCHED_DEADLINE through legacy sched_setscheduler is
* -EINVAL because the legacy syscall cannot supply the deadline attributes
* (real Linux requires sched_setattr).
*
* Errno ordering follows Linux 6.x kernel/sched/syscalls.c:
* 1. pid < 0 or NULL param pointer -> EINVAL
* 2. copy_from_user (setters) -> EFAULT before pid lookup
* 3. find_process_by_pid -> ESRCH
* 4. policy/priority validation -> EINVAL or EPERM
* Getters that only write back leave EFAULT for the final copy_to_user step,
* matching the kernel's order.
*/
static bool sched_pid_alive(int pid)
{
    /* Three ways to be accepted, tried cheapest first: the "calling thread"
     * shorthand (0), the value reported by proc_get_pid(), or any id that
     * thread_tid_alive() answers non-zero for. Short-circuiting keeps the
     * lookups from running when an earlier test already matched.
     */
    const bool names_caller = (pid == 0);

    return names_caller || pid == (int) proc_get_pid() ||
           thread_tid_alive((int64_t) pid) != 0;
}

/* Validate a sched_param for the named policy. Returns 0 if accepted by the
* stub, or a negative Linux errno. EPERM is reserved for "request would be
* valid on Linux but the stub refuses to honor it" -- RT priority elevation
* and BATCH/IDLE class transitions away from SCHED_OTHER. EINVAL covers
* every other out-of-spec input (bad priority range, SCHED_DEADLINE through
* the legacy entry point, unknown policy bits).
*/
static int sched_check_policy_param(int policy, int prio)
{
    /* Bits this stub understands: the low three bits that encode the base
     * policy id (NORMAL=0, FIFO=1, RR=2, BATCH=3, IDLE=5, DEADLINE=6) and
     * the RESET_ON_FORK modifier. Any other set bit is an unknown request.
     */
    const int known_bits = LINUX_SCHED_RESET_ON_FORK | 0x7;
    if ((policy & ~known_bits) != 0)
        return -LINUX_EINVAL;

    const int base = policy & ~LINUX_SCHED_RESET_ON_FORK;

    /* Staying on SCHED_OTHER is the one request the stub can honor. */
    if (base == LINUX_SCHED_NORMAL)
        return prio != 0 ? -LINUX_EINVAL : 0;

    /* Well-formed BATCH/IDLE requests are refused rather than faked. */
    if (base == LINUX_SCHED_BATCH || base == LINUX_SCHED_IDLE)
        return prio != 0 ? -LINUX_EINVAL : -LINUX_EPERM;

    /* Real-time classes: range-check first, then refuse the elevation. */
    if (base == LINUX_SCHED_FIFO || base == LINUX_SCHED_RR) {
        const bool prio_in_range = (prio >= 1) && (prio <= 99);
        return prio_in_range ? -LINUX_EPERM : -LINUX_EINVAL;
    }

    /* SCHED_DEADLINE and the unassigned ids 4 and 7 all end up here. */
    return -LINUX_EINVAL;
}

/* sched_getscheduler stub.
 *
 * Returns LINUX_SCHED_NORMAL for every accepted pid, -LINUX_EINVAL for a
 * negative pid, and -LINUX_ESRCH when sched_pid_alive() rejects the pid.
 */
int64_t sys_sched_getscheduler(int pid)
{
    int64_t rc = LINUX_SCHED_NORMAL;

    if (pid < 0)
        rc = -LINUX_EINVAL;
    else if (!sched_pid_alive(pid))
        rc = -LINUX_ESRCH;

    return rc;
}

/* sched_getparam stub: writes a zeroed linux_sched_param_t to param_gva.
 *
 * Returns 0 on success, -LINUX_EINVAL for a negative pid or a zero
 * param_gva, -LINUX_ESRCH when sched_pid_alive() rejects the pid, and
 * -LINUX_EFAULT when the guest write fails.
 */
int64_t sys_sched_getparam(guest_t *g, int pid, uint64_t param_gva)
{
    const bool bad_args = (pid < 0) || (param_gva == 0);
    if (bad_args)
        return -LINUX_EINVAL;
    if (!sched_pid_alive(pid))
        return -LINUX_ESRCH;

    /* The write-back is the last step, so EFAULT is reported only after
     * the argument and pid checks have passed.
     */
    linux_sched_param_t reply = {0};
    int64_t rc = 0;
    if (guest_write_small(g, param_gva, &reply, sizeof(reply)) < 0)
        rc = -LINUX_EFAULT;
    return rc;
}

/* sched_setscheduler stub.
 *
 * Reads a linux_sched_param_t from param_gva and validates the requested
 * policy/priority pair with sched_check_policy_param(); no scheduler state
 * is changed.
 *
 * Returns, in order of precedence:
 *   -LINUX_EINVAL  policy is negative, pid is negative, or param_gva is 0
 *   -LINUX_EFAULT  the sched_param could not be read from guest memory
 *   -LINUX_ESRCH   sched_pid_alive() rejects the pid
 *   otherwise      the result of sched_check_policy_param()
 */
int64_t sys_sched_setscheduler(guest_t *g,
                               int pid,
                               int policy,
                               uint64_t param_gva)
{
    /* Linux's sched_setscheduler entry point rejects a negative policy
     * before it looks at pid or param, so this EINVAL must take precedence
     * over the EFAULT and ESRCH paths below.
     */
    if (policy < 0)
        return -LINUX_EINVAL;
    if (pid < 0 || param_gva == 0)
        return -LINUX_EINVAL;

    /* Setters copy the parameter in before the pid lookup. */
    linux_sched_param_t param;
    if (guest_read_small(g, param_gva, &param, sizeof(param)) < 0)
        return -LINUX_EFAULT;
    if (!sched_pid_alive(pid))
        return -LINUX_ESRCH;

    return sched_check_policy_param(policy, param.sched_priority);
}

/* sched_setparam stub: accepts only sched_priority == 0.
 *
 * Returns 0 on success, -LINUX_EINVAL for a negative pid, a zero param_gva
 * or a non-zero priority, -LINUX_EFAULT when the guest read fails, and
 * -LINUX_ESRCH when sched_pid_alive() rejects the pid.
 */
int64_t sys_sched_setparam(guest_t *g, int pid, uint64_t param_gva)
{
    const bool bad_args = (pid < 0) || (param_gva == 0);
    if (bad_args)
        return -LINUX_EINVAL;

    /* The parameter is copied in before the pid is looked up. */
    linux_sched_param_t requested;
    if (guest_read_small(g, param_gva, &requested, sizeof(requested)) < 0)
        return -LINUX_EFAULT;
    if (!sched_pid_alive(pid))
        return -LINUX_ESRCH;

    /* The policy never leaves SCHED_OTHER, so 0 is the only priority that
     * can be accepted; anything else is reported as EINVAL.
     */
    return requested.sched_priority == 0 ? 0 : -LINUX_EINVAL;
}

/* sched_get_priority_min stub.
 *
 * Returns 1 for SCHED_FIFO/SCHED_RR, 0 for NORMAL/BATCH/IDLE/DEADLINE, and
 * -LINUX_EINVAL for any other policy value.
 */
int64_t sys_sched_get_priority_min(int policy)
{
    if (policy == LINUX_SCHED_FIFO || policy == LINUX_SCHED_RR)
        return 1;

    const bool zero_prio_class =
        policy == LINUX_SCHED_NORMAL || policy == LINUX_SCHED_BATCH ||
        policy == LINUX_SCHED_IDLE || policy == LINUX_SCHED_DEADLINE;

    return zero_prio_class ? 0 : -LINUX_EINVAL;
}

/* sched_get_priority_max stub.
 *
 * Returns 99 for SCHED_FIFO/SCHED_RR, 0 for NORMAL/BATCH/IDLE/DEADLINE, and
 * -LINUX_EINVAL for any other policy value.
 */
int64_t sys_sched_get_priority_max(int policy)
{
    if (policy == LINUX_SCHED_FIFO || policy == LINUX_SCHED_RR)
        return 99;

    const bool zero_prio_class =
        policy == LINUX_SCHED_NORMAL || policy == LINUX_SCHED_BATCH ||
        policy == LINUX_SCHED_IDLE || policy == LINUX_SCHED_DEADLINE;

    return zero_prio_class ? 0 : -LINUX_EINVAL;
}

/* sched_rr_get_interval stub: writes a fixed 100 ms timespec to ts_gva.
 *
 * Returns 0 on success, -LINUX_EINVAL for a negative pid, -LINUX_ESRCH when
 * sched_pid_alive() rejects the pid, and -LINUX_EFAULT for a zero ts_gva or
 * a failed guest write.
 */
int64_t sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva)
{
    if (pid < 0)
        return -LINUX_EINVAL;
    if (!sched_pid_alive(pid))
        return -LINUX_ESRCH;
    if (ts_gva == 0)
        return -LINUX_EFAULT;

    /* A fixed, plausible non-zero slice is reported so querying tools get
     * a usable value even though the guest is never placed under SCHED_RR.
     */
    const long slice_ns = 100 * 1000 * 1000L;
    linux_timespec_t slice = {.tv_sec = 0, .tv_nsec = slice_ns};

    int64_t rc = 0;
    if (guest_write_small(g, ts_gva, &slice, sizeof(slice)) < 0)
        rc = -LINUX_EFAULT;
    return rc;
}

int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva)
{
int ngroups = get_cached_linux_groups();
Expand Down
10 changes: 10 additions & 0 deletions src/syscall/sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ int64_t sys_sched_getaffinity(guest_t *g,
int pid,
uint64_t size,
uint64_t mask_gva);
int64_t sys_sched_getscheduler(int pid);
int64_t sys_sched_getparam(guest_t *g, int pid, uint64_t param_gva);
int64_t sys_sched_setscheduler(guest_t *g,
int pid,
int policy,
uint64_t param_gva);
int64_t sys_sched_setparam(guest_t *g, int pid, uint64_t param_gva);
int64_t sys_sched_get_priority_min(int policy);
int64_t sys_sched_get_priority_max(int policy);
int64_t sys_sched_rr_get_interval(guest_t *g, int pid, uint64_t ts_gva);
int64_t sys_getgroups(guest_t *g, int size, uint64_t list_gva);
int64_t sys_getrusage(guest_t *g, int who, uint64_t usage_gva);
int64_t sys_sysinfo(guest_t *g, uint64_t info_gva);
Expand Down
9 changes: 8 additions & 1 deletion src/syscall/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,14 @@ SC_FORWARD(sc_getrlimit, sys_prlimit64(g, 0, (int) x0, 0, x1))
SC_FORWARD(sc_setrlimit, sys_prlimit64(g, 0, (int) x0, x1, 0))
SC_FORWARD(sc_getgroups, sys_getgroups(g, (int) x0, x1))
SC_FORWARD(sc_getrusage, sys_getrusage(g, (int) x0, x1))
SC_FORWARD(sc_sched_getaffinity, sys_sched_getaffinity(g, (int) x0, x1, x2))
SC_FORWARD(sc_sched_getaffinity, sys_sched_getaffinity(g, (int) x0, x1, x2))
/* Scheduler policy syscalls. pid and policy arguments are narrowed to int
 * before forwarding; guest address arguments are passed through unchanged.
 */
SC_FORWARD(sc_sched_getscheduler, sys_sched_getscheduler((int) x0))
SC_FORWARD(sc_sched_getparam, sys_sched_getparam(g, (int) x0, x1))
SC_FORWARD(sc_sched_setscheduler, sys_sched_setscheduler(g, (int) x0, (int) x1, x2))
SC_FORWARD(sc_sched_setparam, sys_sched_setparam(g, (int) x0, x1))
SC_FORWARD(sc_sched_get_priority_min, sys_sched_get_priority_min((int) x0))
SC_FORWARD(sc_sched_get_priority_max, sys_sched_get_priority_max((int) x0))
SC_FORWARD(sc_sched_rr_get_interval, sys_sched_rr_get_interval(g, (int) x0, x1))

/* Process identity is modeled as one Linux process inside this elfuse instance. */
SC_FORWARD(sc_exit, SC_EXIT_SENTINEL | ((int) x0 & 0xFF))
Expand Down
3 changes: 3 additions & 0 deletions tests/manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ test-sysv-shm
[section] Credential/identity emulation tests
test-credentials # diff=skip

[section] Scheduler policy stub tests
test-sched-policy

[section] membarrier tests
test-membarrier

Expand Down
Loading
Loading