Skip to content

Commit

Permalink
fix: Add gpu error message (#217)
Browse files Browse the repository at this point in the history
* fix: Add gpu error message

Signed-off-by: Ce Gao <cegao@tensorchord.ai>

* Update cmd/envd/up.go

Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>

Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
  • Loading branch information
gaocegege and VoVAllen committed May 30, 2022
1 parent 93face5 commit 855a5f5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
10 changes: 10 additions & 0 deletions cmd/envd/up.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,16 @@ func up(clicontext *cli.Context) error {
return err
}

if gpu {
nvruntimeExists, err := dockerClient.GPUEnabled(clicontext.Context)
if err != nil {
return err
}
if !nvruntimeExists {
return errors.New("GPU is required but nvidia container runtime is not installed, please refer to https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker")
}
}

containerID, containerIP, err := dockerClient.StartEnvd(clicontext.Context,
tag, ctr, buildContext, gpu, *ir.DefaultGraph, clicontext.Duration("timeout"),
clicontext.StringSlice("volume"))
Expand Down
14 changes: 14 additions & 0 deletions pkg/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ type Client interface {
Exec(ctx context.Context, cname string, cmd []string) error
Destroy(ctx context.Context, name string) error
List(ctx context.Context) ([]types.Container, error)
// GPUEnabled returns true if the nvidia container runtime exists in the docker daemon.
GPUEnabled(ctx context.Context) (bool, error)
}

type generalClient struct {
Expand All @@ -70,6 +72,18 @@ func NewClient(ctx context.Context) (Client, error) {
return generalClient{cli}, nil
}

// GPUEnabled reports whether the docker daemon lists a runtime named
// "nvidia" whose path is non-empty.
//
// It queries the daemon info through the embedded client, logs the full
// info at debug level, and returns a wrapped error if the query fails.
func (g generalClient) GPUEnabled(ctx context.Context) (bool, error) {
	daemonInfo, err := g.Info(ctx)
	if err != nil {
		return false, errors.Wrap(err, "failed to get docker info")
	}
	logrus.WithField("info", daemonInfo).Debug("docker info")

	// No "nvidia" entry in the runtime table means GPU support is absent.
	nvRuntime, found := daemonInfo.Runtimes["nvidia"]
	if !found {
		return false, nil
	}
	// An entry with an empty path is treated as not installed.
	return nvRuntime.Path != "", nil
}

func (g generalClient) WaitUntilRunning(ctx context.Context,
name string, timeout time.Duration) error {
logger := logrus.WithField("container", name)
Expand Down

0 comments on commit 855a5f5

Please sign in to comment.