From 83c45c669e4d31766f7da4303ba9e24dc7022fc7 Mon Sep 17 00:00:00 2001 From: utam0k Date: Fri, 4 Aug 2023 02:04:56 +0000 Subject: [PATCH] Support `process.scheduler` Spec: https://github.com/opencontainers/runtime-spec/pull/1188 Fix: https://github.com/opencontainers/runc/issues/3895 Signed-off-by: utam0k --- docs/spec-conformance.md | 1 - libcontainer/configs/config.go | 14 ++++ libcontainer/configs/validate/validator.go | 12 +++ .../configs/validate/validator_test.go | 30 +++++++ libcontainer/process.go | 2 + libcontainer/process_linux.go | 7 ++ libcontainer/specconv/spec_linux.go | 11 +++ libcontainer/standard_init_linux.go | 8 ++ libcontainer/utils/utils_unix.go | 80 +++++++++++++++++++ tests/integration/scheduler.bats | 23 ++++++ utils_linux.go | 12 +++ 11 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 tests/integration/scheduler.bats diff --git a/docs/spec-conformance.md b/docs/spec-conformance.md index b512e5f5bb1..b7579fdb469 100644 --- a/docs/spec-conformance.md +++ b/docs/spec-conformance.md @@ -14,7 +14,6 @@ v1.1.0 | `.linux.resources.cpu.burst` | [#3749](https://github v1.1.0 | `SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV` | [#3862](https://github.com/opencontainers/runc/pull/3862) v1.1.0 | time namespaces | [#3876](https://github.com/opencontainers/runc/pull/3876) v1.1.0 | rsvd hugetlb cgroup | TODO ([#3859](https://github.com/opencontainers/runc/issues/3859)) -v1.1.0 | `.process.scheduler` | TODO ([#3895](https://github.com/opencontainers/runc/issues/3895)) v1.1.0 | `.process.ioPriority` | [#3783](https://github.com/opencontainers/runc/pull/3783) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index bb5dbba6588..842b1c6ea39 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -216,6 +216,20 @@ type Config struct { // Do not try to remount a bind mount again after the first attempt failed on source // filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, 
nodiratime set
 	NoMountFallback bool `json:"no_mount_fallback,omitempty"`
+
+	// Scheduler represents the scheduling attributes for a process.
+	Scheduler *Scheduler `json:"scheduler,omitempty"`
+}
+
+// Scheduler is based on the Linux sched_setattr(2) syscall.
+type Scheduler struct {
+	Policy   specs.LinuxSchedulerPolicy `json:"policy"`
+	Nice     int32                      `json:"nice,omitempty"`
+	Priority int32                      `json:"priority,omitempty"`
+	Flags    []specs.LinuxSchedulerFlag `json:"flags,omitempty"`
+	Runtime  uint64                     `json:"runtime,omitempty"`
+	Deadline uint64                     `json:"deadline,omitempty"`
+	Period   uint64                     `json:"period,omitempty"`
 }
 
 type (
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 196b431dba1..3300a90697a 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -29,6 +29,7 @@ func Validate(config *configs.Config) error {
 		intelrdtCheck,
 		rootlessEUIDCheck,
 		mounts,
+		scheduler,
 	}
 	for _, c := range checks {
 		if err := c(config); err != nil {
@@ -333,3 +334,16 @@ func isHostNetNS(path string) (bool, error) {
 
 	return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
 }
+
+// scheduler validates the optional process scheduler settings. A nil Scheduler
+// is valid; otherwise the nice value must lie in [-20, 19], see sched_setattr(2).
+func scheduler(config *configs.Config) error {
+	if config.Scheduler == nil {
+		return nil
+	}
+	niceValue := config.Scheduler.Nice
+	if niceValue < -20 || niceValue > 19 {
+		return fmt.Errorf("invalid scheduler.nice: %d", niceValue)
+	}
+	return nil
+}
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index 58aae7a68f8..d0e1e765ebc 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -582,3 +582,33 @@ func TestValidateIDMapMounts(t *testing.T) {
 		})
 	}
 }
+
+func TestValidateScheduler(t *testing.T) {
+	testCases := []struct {
+		isErr     bool
+		niceValue int32
+	}{
+		{isErr: false, niceValue: 19},
+		{isErr: false, niceValue: -20},
+		{isErr: true, niceValue: 20},
+		{isErr: true, niceValue: -21},
+	}
+
+ for _, tc := range testCases { + scheduler := configs.Scheduler{ + Nice: tc.niceValue, + } + config := &configs.Config{ + Rootfs: "/var", + Scheduler: &scheduler, + } + + err := Validate(config) + if tc.isErr && err == nil { + t.Errorf("scheduler: %d, expected error, got nil", tc.niceValue) + } + if !tc.isErr && err != nil { + t.Errorf("scheduler: %d, expected nil, got error %v", tc.niceValue, err) + } + } +} diff --git a/libcontainer/process.go b/libcontainer/process.go index 4de4a9e75c2..fc9b89611ee 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -92,6 +92,8 @@ type Process struct { // // For cgroup v2, the only key allowed is "". SubCgroupPaths map[string]string + + Scheduler *configs.Scheduler } // Wait waits for the process to exit. diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 48861406dba..3046c19b2db 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -80,6 +80,13 @@ func (p *setnsProcess) signal(sig os.Signal) error { func (p *setnsProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() + + if p.process.Scheduler != nil { + if err := utils.SetSchedAttr(p.pid(), p.process.Scheduler); err != nil { + return fmt.Errorf("error setting scheduler: %w", err) + } + } + // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() err := p.cmd.Start() diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index d3938da516c..7becfb56fa0 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -493,6 +493,17 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { Ambient: spec.Process.Capabilities.Ambient, } } + if spec.Process.Scheduler != nil { + config.Scheduler = &configs.Scheduler{ + Policy: spec.Process.Scheduler.Policy, + Nice: spec.Process.Scheduler.Nice, + Priority: spec.Process.Scheduler.Priority, + Flags: spec.Process.Scheduler.Flags, + Runtime: 
spec.Process.Scheduler.Runtime, + Deadline: spec.Process.Scheduler.Deadline, + Period: spec.Process.Scheduler.Period, + } + } } createHooks(spec, config) config.Version = specs.Version diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index f3d04282362..908f5a77010 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -158,6 +159,13 @@ func (l *linuxStandardInit) Init() error { return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err} } } + + if l.config.Config.Scheduler != nil { + if err := utils.SetSchedAttr(0, l.config.Config.Scheduler); err != nil { + return fmt.Errorf("error setting scheduler: %w", err) + } + } + // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 6b9a7be038f..a6df8054fde 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -9,10 +9,19 @@ import ( "os" "strconv" "sync" + "syscall" + "unsafe" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" ) +/* +#include +*/ +import "C" + // EnsureProcHandle returns whether or not the given file handle is on procfs. 
func EnsureProcHandle(fh *os.File) error { var buf unix.Statfs_t @@ -98,3 +107,74 @@ func NewSockPair(name string) (parent *os.File, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +type schedAttr struct { + Size uint32 + SchedPolicy uint32 + SchedFlags uint64 + SchedNice int32 + SchedPriority uint32 + SchedRuntime uint64 + SchedDeadline uint64 + SchedPeriod uint64 +} + +// SetSchedAttr sets the scheduler attributes for the process with the given pid. +// Please refer to the following link for kernel-specific values: +// https://github.com/torvalds/linux/blob/c1a515d3c0270628df8ae5f5118ba859b85464a2/include/uapi/linux/sched.h#L111-L134 +func SetSchedAttr(pid int, scheduler *configs.Scheduler) error { + var policy uint32 + switch scheduler.Policy { + case specs.SchedOther: + policy = 0 + case specs.SchedFIFO: + policy = 1 + case specs.SchedRR: + policy = 2 + case specs.SchedBatch: + policy = 3 + case specs.SchedISO: + policy = 4 + case specs.SchedIdle: + policy = 5 + case specs.SchedDeadline: + policy = 6 + } + + var flags uint64 + for _, flag := range scheduler.Flags { + switch flag { + case specs.SchedFlagResetOnFork: + flags |= 0x01 + case specs.SchedFlagReclaim: + flags |= 0x02 + case specs.SchedFlagDLOverrun: + flags |= 0x04 + case specs.SchedFlagKeepPolicy: + flags |= 0x08 + case specs.SchedFlagKeepParams: + flags |= 0x10 + case specs.SchedFlagUtilClampMin: + flags |= 0x20 + case specs.SchedFlagUtilClampMax: + flags |= 0x40 + } + } + + attr := &schedAttr{ + Size: uint32(unsafe.Sizeof(schedAttr{})), + SchedPolicy: policy, + SchedFlags: flags, + SchedNice: scheduler.Nice, + SchedPriority: uint32(scheduler.Priority), + SchedRuntime: scheduler.Runtime, + SchedDeadline: scheduler.Deadline, + SchedPeriod: scheduler.Period, + } + _, _, errno := syscall.Syscall(C.SYS_sched_setattr, uintptr(pid), uintptr(unsafe.Pointer(attr)), uintptr(0)) + if errno != 0 { + return errno + } + + return nil +} 
diff --git a/tests/integration/scheduler.bats b/tests/integration/scheduler.bats new file mode 100644 index 00000000000..922cbbf45a7 --- /dev/null +++ b/tests/integration/scheduler.bats @@ -0,0 +1,23 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + requires root + setup_debian +} + +function teardown() { + teardown_bundle +} + +@test "scheduler is applied" { + update_config ' .process.args |= ["chrt", "-p", "1"] + | .process.scheduler = {"policy": "SCHED_DEADLINE", "runtime": 42000, "deadline": 1000000, "period": 1000000, }' + + runc run test_scheduler + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *"scheduling policy: SCHED_DEADLINE"* ]] + [[ "${lines[1]}" == *"priority: 0"* ]] + [[ "${lines[2]}" == *"runtime/deadline/period parameters: 42000/1000000/1000000"* ]] +} diff --git a/utils_linux.go b/utils_linux.go index 0f787cb3387..8e1d4f6ae76 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -61,6 +61,18 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) { lp.ConsoleHeight = uint16(p.ConsoleSize.Height) } + if p.Scheduler != nil { + lp.Scheduler = &configs.Scheduler{ + Policy: p.Scheduler.Policy, + Nice: p.Scheduler.Nice, + Priority: p.Scheduler.Priority, + Flags: p.Scheduler.Flags, + Runtime: p.Scheduler.Runtime, + Deadline: p.Scheduler.Deadline, + Period: p.Scheduler.Period, + } + } + if p.Capabilities != nil { lp.Capabilities = &configs.Capabilities{} lp.Capabilities.Bounding = p.Capabilities.Bounding