-
Notifications
You must be signed in to change notification settings - Fork 93
/
nvidia.go
211 lines (185 loc) · 7.06 KB
/
nvidia.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved.
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.
package gpu
import (
"errors"
"fmt"
"os"
"os/exec"
"strings"
"syscall"
"github.com/sylabs/singularity/v4/internal/pkg/util/bin"
"github.com/sylabs/singularity/v4/internal/pkg/util/env"
"github.com/sylabs/singularity/v4/internal/pkg/util/fs"
"github.com/sylabs/singularity/v4/pkg/sylog"
"github.com/sylabs/singularity/v4/pkg/util/capabilities"
"github.com/sylabs/singularity/v4/pkg/util/slice"
)
// Sentinel errors returned when a binary that would be executed as host root
// (in the non-user-namespace flow) is not owned by root, and is therefore
// not trusted for privileged execution.
var (
	errNvCCLIInsecure   = errors.New("nvidia-container-cli is not owned by root user")
	errLdconfigInsecure = errors.New("ldconfig is not owned by root user")
)
// nVDriverCapabilities is the full set of driver capabilities supported by
// nvidia-container-cli. Values requested via NVIDIA_DRIVER_CAPABILITIES are
// validated against this list before being passed through as flags.
// See: https://github.com/nvidia/nvidia-container-runtime#nvidia_driver_capabilities
var nVDriverCapabilities = []string{
	"compute",
	"compat32",
	"graphics",
	"utility",
	"video",
	"display",
}
// nVDriverDefaultCapabilities is the default set of nvidia-container-cli
// driver capabilities. It is used if NVIDIA_DRIVER_CAPABILITIES is not set
// (or is empty) in the environment passed to NVCLIEnvToFlags.
// See: https://github.com/nvidia/nvidia-container-runtime#nvidia_driver_capabilities
var nVDriverDefaultCapabilities = []string{
	"compute",
	"utility",
}
// nVCLIAmbientCaps is the ambient capability set required by
// nvidia-container-cli. It is assigned to cmd.SysProcAttr.AmbientCaps in
// NVCLIConfigure; the capabilities must already be present in the caller's
// bounding set for this to succeed.
var nVCLIAmbientCaps = []uintptr{
	// Set by default in starter bounding set
	uintptr(capabilities.Map["CAP_SYS_ADMIN"].Value),
	uintptr(capabilities.Map["CAP_MKNOD"].Value),
	// Additionally set in starter with nvCCLICaps
	uintptr(capabilities.Map["CAP_CHOWN"].Value),
	uintptr(capabilities.Map["CAP_DAC_OVERRIDE"].Value),
	uintptr(capabilities.Map["CAP_DAC_READ_SEARCH"].Value),
	uintptr(capabilities.Map["CAP_FOWNER"].Value),
	uintptr(capabilities.Map["CAP_KILL"].Value),
	uintptr(capabilities.Map["CAP_NET_ADMIN"].Value),
	uintptr(capabilities.Map["CAP_SETGID"].Value),
	uintptr(capabilities.Map["CAP_SETPCAP"].Value),
	uintptr(capabilities.Map["CAP_SETUID"].Value),
	uintptr(capabilities.Map["CAP_SYS_CHROOT"].Value),
	uintptr(capabilities.Map["CAP_SYS_PTRACE"].Value),
}
// NVCLIConfigure calls out to the nvidia-container-cli configure operation.
// This sets up the GPU with the container. Note that the ability to set a
// fairly broad range of ambient capabilities is required. This function will
// error if the bounding set does not include NvidiaContainerCLIAmbientCaps.
//
// When userNS is true, we are running in a user namespace and don't require
// any privilege escalation when calling out to `nvidia-container-cli`.
//
// When userNS is false, we are not running in a user namespace, but are in
// setuid mode or directly called as `sudo singularity` etc. In this case we
// exec `nvidia-container-cli` as root via SysProcAttr, having first ensured
// that it and `ldconfig` are root-owned.
func NVCLIConfigure(nvidiaEnv []string, rootfs string, userNS bool) error {
	cliPath, err := bin.FindBin("nvidia-container-cli")
	if err != nil {
		return err
	}
	// Outside a user namespace the CLI will be executed as host root, so it
	// must be trusted, i.e. owned by root.
	if !userNS && !fs.IsOwner(cliPath, 0) {
		return errNvCCLIInsecure
	}

	// Convert the NVIDIA_* environment variables into CLI option flags.
	opts, err := NVCLIEnvToFlags(nvidiaEnv)
	if err != nil {
		return err
	}

	// The --ldconfig flag is constructed here, as the specified binary
	// will be called as root in the set-uid flow, so the user should not
	// be able to influence it from the CLI code.
	ldconfigPath, err := bin.FindBin("ldconfig")
	if err != nil {
		return err
	}
	// Same trust requirement as for the CLI binary itself.
	if !userNS && !fs.IsOwner(ldconfigPath, 0) {
		return errLdconfigInsecure
	}
	opts = append(opts, "--ldconfig=@"+ldconfigPath)

	// Inside a user namespace the global --user flag must precede the
	// configure subcommand, or nvidia-container-cli will fail.
	var args []string
	if userNS {
		args = append(args, "--user")
	}
	args = append(args, "configure")
	args = append(args, opts...)
	args = append(args, rootfs)

	sylog.Debugf("nvidia-container-cli binary: %q args: %q", cliPath, args)
	cmd := exec.Command(cliPath, args...)

	// We are called from the RPC server which has an empty PATH.
	// nvidia-container-cli requires a default sensible PATH to work correctly.
	cmd.Env = append(os.Environ(), "PATH="+env.DefaultPath)

	// We need to run nvidia-container-cli as host root when there is no user
	// namespace in play.
	if userNS {
		sylog.Debugf("Running nvidia-container-cli in user namespace")
		cmd.SysProcAttr = &syscall.SysProcAttr{}
	} else {
		sylog.Debugf("Running nvidia-container-cli as uid=0 gid=0")
		cmd.SysProcAttr = &syscall.SysProcAttr{
			Credential: &syscall.Credential{Uid: 0, Gid: 0},
		}
	}
	cmd.SysProcAttr.AmbientCaps = nVCLIAmbientCaps

	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("nvidia-container-cli failed with %v: %s", err, output)
	}
	return nil
}
// NVCLIEnvToFlags reads the passed in NVIDIA_ environment variables supported
// by nvidia-container-runtime and converts them to flags for
// nvidia-container-cli. Each entry must be of the form KEY=VALUE; an entry
// without an '=' results in an error. See:
// https://github.com/nvidia/nvidia-container-runtime#environment-variables-oci-spec
func NVCLIEnvToFlags(nvidiaEnv []string) (flags []string, err error) {
	// We don't support cgroups related usage yet.
	flags = []string{"--no-cgroups"}
	requireFlags := []string{}
	disableRequire := false
	defaultDriverCaps := true

	for _, e := range nvidiaEnv {
		key, value, found := strings.Cut(e, "=")
		if !found {
			return nil, fmt.Errorf("can't process environment variable %s", e)
		}

		// The recognized keys are mutually exclusive, so a switch is safe.
		switch {
		case key == "NVIDIA_VISIBLE_DEVICES" && value != "":
			flags = append(flags, "--device="+value)
		case key == "NVIDIA_MIG_CONFIG_DEVICES" && value != "":
			flags = append(flags, "--mig-config="+value)
		case key == "NVIDIA_MIG_MONITOR_DEVICES" && value != "":
			flags = append(flags, "--mig-monitor="+value)
		case key == "NVIDIA_DRIVER_CAPABILITIES" && value != "":
			// Driver capabilities have a default, but can be overridden.
			defaultDriverCaps = false
			// Note: `capability` rather than `cap`, which shadows the builtin.
			for _, capability := range strings.Split(value, ",") {
				if !slice.ContainsString(nVDriverCapabilities, capability) {
					return nil, fmt.Errorf("unknown NVIDIA_DRIVER_CAPABILITIES value: %s", capability)
				}
				flags = append(flags, "--"+capability)
			}
		case key == "NVIDIA_DISABLE_REQUIRE":
			disableRequire = true
		}

		// One --require flag for each NVIDIA_REQUIRE_* environment variable.
		// https://github.com/nvidia/nvidia-container-runtime#nvidia_require_
		if strings.HasPrefix(key, "NVIDIA_REQUIRE_") {
			requireFlags = append(requireFlags, "--require="+value)
		}
	}

	// Apply the default driver capabilities only if they were not overridden
	// by NVIDIA_DRIVER_CAPABILITIES above.
	if defaultDriverCaps {
		for _, capability := range nVDriverDefaultCapabilities {
			flags = append(flags, "--"+capability)
		}
	}

	// NVIDIA_DISABLE_REQUIRE suppresses all --require constraints.
	if !disableRequire {
		flags = append(flags, requireFlags...)
	}

	return flags, nil
}