-
Notifications
You must be signed in to change notification settings - Fork 64
/
standard_init_linux.go
192 lines (166 loc) · 6.33 KB
/
standard_init_linux.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
// SPDX-License-Identifier: Apache-2.0
// Copyright 2014 Docker, Inc.
// Copyright 2023 Unikraft GmbH and The KraftKit Authors
package libmocktainer
import (
"errors"
"fmt"
"os"
"os/exec"
"strconv"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/system"
"kraftkit.sh/libmocktainer/configs"
"kraftkit.sh/libmocktainer/unikraft"
)
// linuxStandardInit bundles the descriptors and configuration consumed by
// Init when bringing up the container's init process.
type linuxStandardInit struct {
	// pipe is the synchronisation pipe shared with the parent. Init passes it
	// to prepareRootfs and syncParentReady, then closes it to signal that
	// initialisation has completed.
	pipe *os.File

	// parentPid is compared against getppid() before exec to detect that the
	// original parent has gone away.
	// fifoFd is the exec fifo descriptor, re-opened through /proc/self/fd.
	// logFd is the log pipe descriptor, closed by Init before waiting on the
	// exec fifo.
	parentPid, fifoFd, logFd int

	// config holds the init configuration (labels, arguments, hooks, ...)
	// read throughout Init.
	config *initConfig
}
// getSessionRingParams returns, in order, the name of the session keyring to
// join and the two permission values that Init forwards to
// keys.ModKeyringPerm as its "keep" and "new" permission arguments.
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
	const (
		// keepAllPerms is the value handed back as the "keep" permissions.
		keepAllPerms uint32 = 0xffffffff
		// uidSearchPerm: without user ns we need 'UID' search permissions.
		uidSearchPerm uint32 = 0x80000
	)

	// The ring name is unique per container so that it can be joined in
	// setns; note that other containers are able to join it as well.
	ringName := "_ses." + l.config.ContainerID

	return ringName, keepAllPerms, uidSearchPerm
}
// Init runs the standard init sequence for the container process and ends by
// handing control to the user's command through system.Exec.
//
// The steps are strictly ordered:
//  1. set the SELinux key label and join a per-container session keyring;
//  2. set up networking and routes, then prepare the root filesystem;
//  3. apply the AppArmor profile and record the parent death signal;
//  4. tell the parent we are ready, set the SELinux exec label, finalize the
//     namespace and restore the parent death signal;
//  5. verify the parent has not changed and resolve the binary to execute;
//  6. close the sync pipe and the log pipe, then write to the exec fifo
//     (see the comment on that step for the blocking behaviour);
//  7. for unikernel containers, append the QEMU network arguments;
//  8. close the fifo fd, run the StartContainer hooks and exec.
//
// Any failing step returns its error (wrapped with context where the code
// does so). NOTE(review): on success system.Exec is presumably not expected
// to return, since it should replace the process image — confirm against the
// system package.
func (l *linuxStandardInit) Init() error {
	// The key label is set for the duration of this function only; the
	// deferred call resets it on every return path.
	if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	defer selinux.SetKeyLabel("") //nolint: errcheck

	ringname, keepperms, newperms := l.getSessionRingParams()

	// Do not inherit the parent's session keyring.
	if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
		// If keyrings aren't supported then it is likely we are on an
		// older kernel (or inside an LXC container). While we could bail,
		// the security feature we are using here is best-effort (it only
		// really provides marginal protection since VFS credentials are
		// the only significant protection of keyrings).
		//
		// TODO(cyphar): Log this so people know what's going on, once we
		// have proper logging in 'runc init'.
		//
		// Only ENOSYS is tolerated; every other error aborts init.
		if !errors.Is(err, unix.ENOSYS) {
			return fmt.Errorf("unable to join session keyring: %w", err)
		}
	} else {
		// Make session keyring searchable. If we've gotten this far we
		// bail on any error -- we don't want to have a keyring with bad
		// permissions.
		if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
			return fmt.Errorf("unable to mod keyring permissions: %w", err)
		}
	}

	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	// initialises the labeling system
	selinux.GetEnabled()

	// A zero-value mountFds is passed here, so this function holds no mount
	// or idmap fds that would need closing afterwards, whether or not
	// prepareRootfs fails.
	err := prepareRootfs(l.pipe, l.config, mountFds{})
	if err != nil {
		return err
	}

	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return fmt.Errorf("unable to apply apparmor profile: %w", err)
	}

	// Capture the parent death signal now so that it can be restored after
	// finalizeNamespace below.
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return fmt.Errorf("can't get pdeath signal: %w", err)
	}

	// Tell our parent that we're ready to Execv. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return fmt.Errorf("sync ready: %w", err)
	}

	// As with the key label above, the exec label is reset by the deferred
	// call on every return path of this function.
	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return fmt.Errorf("can't set process label: %w", err)
	}
	defer selinux.SetExecLabel("") //nolint: errcheck

	if err := finalizeNamespace(l.config); err != nil {
		return err
	}

	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return fmt.Errorf("can't restore pdeath signal: %w", err)
	}

	// Compare the parent from the initial start of the init process and make
	// sure that it did not change. if the parent changes that means it died
	// and we were reparented to something else so we should just kill ourself
	// and not cause problems for someone else.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	}

	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	// NOTE(review): this indexes Args[0] without a length check, so it
	// assumes the caller has already rejected an empty argument list — confirm.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}

	// Close the pipe to signal that we have completed our init.
	// The error from Close is deliberately ignored.
	logrus.Debugf("init: closing the pipe to signal completion")
	_ = l.pipe.Close()

	// Close the log pipe fd so the parent's ForwardLogs can exit.
	if err := unix.Close(l.logFd); err != nil {
		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
	}

	// Wait for the FIFO to be opened on the other side before exec-ing the
	// user process. We open it through /proc/self/fd/$fd, because the fd that
	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
	// re-open an O_PATH fd through /proc.
	//
	// The resulting fd is opened with O_CLOEXEC and is never closed
	// explicitly in this function.
	fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd)
	fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
	if err != nil {
		return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
	}
	if _, err := unix.Write(fd, []byte("0")); err != nil {
		return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
	}

	// -- BEGIN Unikraft

	// Labels are matched as whole strings: only an entry that is exactly
	// "org.unikraft.kernel=" marks the container as a unikernel.
	var isUnikernel bool
	for _, lbl := range l.config.Config.Labels {
		if lbl == "org.unikraft.kernel=" { // injected by `runu create`
			isUnikernel = true
			break
		}
	}

	if isUnikernel {
		// This must happen in the Start phase of the OCI startup flow, right
		// before exec(), because the setup of the container's network interfaces
		// typically happens between the Create and the Start phases (e.g. CNI).
		qemuNetArgs, err := unikraft.SetupQemuNet()
		if err != nil {
			return fmt.Errorf("setting up qemu network: %w", err)
		}
		// The extra arguments are appended after the path lookup above, so
		// they only affect the argv passed to exec, not the resolved binary.
		l.config.Args = append(l.config.Args, qemuNetArgs...)
	}

	// -- END Unikraft

	// Close the O_PATH fifofd fd before exec because the kernel resets
	// dumpable in the wrong order. This has been fixed in newer kernels, but
	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
	// N.B. the core issue itself (passing dirfds to the host filesystem) has
	// since been resolved.
	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
	_ = unix.Close(l.fifoFd)

	// Hand the StartContainer hooks a state carrying our pid and the
	// "created" status.
	// NOTE(review): if SpecState is a pointer these writes also mutate the
	// state stored in l.config — confirm against initConfig.
	s := l.config.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreated
	if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
		return err
	}

	// Execute the resolved binary with the full argument list (including any
	// QEMU network arguments appended above) and the current environment.
	return system.Exec(name, l.config.Args[0:], os.Environ())
}