Commit: Init commit

Signed-off-by: mYmNeo <thomassong2012@gmail.com>
mYmNeo committed Nov 5, 2019
0 parents commit cb32733
Showing 27,408 changed files with 8,423,714 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
go/
32 changes: 32 additions & 0 deletions Makefile
@@ -0,0 +1,32 @@
.PHONY: all
all:
	hack/build.sh manager client

.PHONY: clean
clean:
	rm -rf ./go

.PHONY: vendor
vendor:
	rm -rf vendor
	hack/glide.sh

.PHONY: test
test:
	hack/build.sh "test"

.PHONY: proto
proto:
	hack/build.sh "proto"

.PHONY: img
img:
	hack/build.sh "img"

.PHONY: fmt
fmt:
	hack/build.sh "fmt"

.PHONY: lint
lint:
	@revive -config revive.toml -exclude vendor/... -exclude pkg/api/runtime/... ./...
35 changes: 35 additions & 0 deletions README.md
@@ -0,0 +1,35 @@
# GPU Manager

GPU Manager is used for managing the NVIDIA GPU devices in a Kubernetes cluster. It implements the Kubernetes `DevicePlugin`
interface, so it is compatible with Kubernetes 1.9 and later releases.

Compared with the combination of `nvidia-docker` and `nvidia-k8s-plugin`, GPU Manager uses native `runc`
without modification, while the NVIDIA solution requires a modified runtime.
Besides, we also support metrics reporting without deploying any new components.

To schedule a GPU payload correctly, GPU Manager should work together with `gpu-quota-admission`, which is a Kubernetes scheduler plugin.
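
`gpu-quota-admission` is typically hooked into kube-scheduler as a scheduler extender. The sketch below uses the standard Kubernetes scheduler extender policy format; the address, port, path and verb values are placeholder assumptions and are not shipped by this commit — consult the `gpu-quota-admission` project for the real settings.

```
{
  "kind": "Policy",
  "apiVersion": "v1",
  "extenders": [
    {
      "urlPrefix": "http://<gpu-quota-admission-address>:<port>/scheduler",
      "filterVerb": "predicates",
      "enableHttps": false,
      "nodeCacheCapable": false
    }
  ]
}
```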

GPU Manager also supports payloads that use a fraction of a GPU device, such as 0.1 of a card or 100 MiB of GPU device memory.
If you want this kind of feature, please refer to the `vcuda` project.
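
As a rough sketch only — the resource names and units below (`tencent.com/vcuda-core`, where 100 units correspond to one physical card, and `tencent.com/vcuda-memory`, counted in 256 MiB blocks) are assumptions drawn from the `vcuda` ecosystem rather than from this commit — a pod requesting a fractional GPU might look like:

```
apiVersion: v1
kind: Pod
metadata:
  name: vcuda-demo
spec:
  containers:
    - name: cuda
      image: <your cuda image>
      resources:
        limits:
          tencent.com/vcuda-core: 10    # assumed: 10 == 0.1 of a card
          tencent.com/vcuda-memory: 4   # assumed: 4 x 256 MiB
```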

# How to deploy GPU Manager

GPU Manager runs as a DaemonSet. Because of RBAC restrictions and hybrid clusters,
you need to do the following steps to make this DaemonSet run correctly.

- create a service account and cluster role binding

```
kubectl create sa gpu-manager -n kube-system
kubectl create clusterrolebinding gpu-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-manager
```

- label node with `nvidia-device-enable=enable`

```
kubectl label node <node> nvidia-device-enable=enable
```

- change gpu-manager.yaml and submit

Change `--incluster-mode` from `false` to `true`, change the image field to `<your repository>/public/gpu-manager:latest`, and set the serviceAccount field to the `gpu-manager` service account created above, as sketched below.
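
A minimal sketch of the affected DaemonSet fields — the layout is illustrative; everything except the three values above is an assumption, not the actual gpu-manager.yaml:

```
spec:
  template:
    spec:
      serviceAccount: gpu-manager
      containers:
        - name: gpu-manager
          image: <your repository>/public/gpu-manager:latest
          args:
            - --incluster-mode=true
```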
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.2.0
68 changes: 68 additions & 0 deletions build/Dockerfile
@@ -0,0 +1,68 @@
ARG base_img
FROM nvidia/cuda:10.1-devel-centos7 as build

ARG version
ARG commit

RUN yum install -y rpm-build make git

ENV GOLANG_VERSION 1.12.4
RUN curl -sSL https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz \
| tar -C /usr/local -xz
ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH

RUN mkdir -p /root/rpmbuild/{SPECS,SOURCES}

COPY gpu-manager.spec /root/rpmbuild/SPECS
COPY gpu-manager-source.tar.gz /root/rpmbuild/SOURCES

RUN echo '%_topdir /root/rpmbuild' > /root/.rpmmacros \
&& echo '%__os_install_post %{nil}' >> /root/.rpmmacros \
&& echo '%debug_package %{nil}' >> /root/.rpmmacros
WORKDIR /root/rpmbuild/SPECS
RUN rpmbuild -bb --quiet \
--define 'version '${version}'' \
--define 'commit '${commit}'' \
gpu-manager.spec

FROM $base_img

ARG version
ARG commit

COPY --from=build /root/rpmbuild/RPMS/x86_64/gpu-manager-${version}-${commit}.el7.x86_64.rpm /tmp

RUN yum install epel-release -y && \
yum install -y which jq

# Install packages
RUN rpm -ivh /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm \
&& rm -rf /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm

# kubelet
VOLUME ["/var/lib/kubelet/device-plugins"]

# gpu manager storage
VOLUME ["/etc/gpu-manager/vm"]
VOLUME ["/etc/gpu-manager/vdriver"]
VOLUME ["/var/log/gpu-manager"]

# nvidia library search location
VOLUME ["/usr/local/host"]

RUN echo "/usr/local/nvidia/lib" > /etc/ld.so.conf.d/nvidia.conf && \
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf

ENV PATH=$PATH:/usr/local/nvidia/bin

# cgroup
VOLUME ["/sys/fs/cgroup"]

# display
EXPOSE 5678

COPY start.sh /
COPY copy-bin-lib.sh /

CMD ["/start.sh"]
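
The Dockerfile above expects `gpu-manager.spec`, `gpu-manager-source.tar.gz`, `start.sh` and `copy-bin-lib.sh` in its build context and is parameterized by `base_img`, `version` and `commit`. The official image build goes through `make img` (`hack/build.sh "img"`); purely as an illustrative sketch, a manual invocation could look like this, with the tag and base image as assumptions:

```
docker build \
  --build-arg base_img=centos:7 \
  --build-arg version=$(cat VERSION) \
  --build-arg commit=$(git rev-parse --short HEAD) \
  -t <your repository>/public/gpu-manager:$(cat VERSION) \
  -f build/Dockerfile <build context>
```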
83 changes: 83 additions & 0 deletions build/copy-bin-lib.sh
@@ -0,0 +1,83 @@
#!/bin/bash

set -o pipefail
set -o errexit
set -o nounset

FILE=${FILE:-"/etc/gpu-manager/volume.conf"}
LIB_FILES=$(jq -r '.volume[1].components.libraries[]' "${FILE}")
BIN_FILES=$(jq -r '.volume[1].components.binaries[]' "${FILE}")
readonly NV_DIR="/usr/local/nvidia"
readonly FIND_BASE=${FIND_BASE:-"/usr/local/host"}

function check_arch() {
  local -r lib=$1
  if [[ $(objdump -f "${lib}" | grep -o "elf64-x86-64") == "elf64-x86-64" ]]; then
    echo "64"
  else
    echo ""
  fi
}

function copy_lib() {
  for target in $(find /usr -name "${1}*" | grep -v "stubs"); do
    if [[ $(objdump -p "${target}" 2>/dev/null | grep -o "SONAME") == "SONAME" ]]; then
      copy_directory "${target}" "${NV_DIR}/lib$(check_arch "${target}")"
    fi
  done
}

function copy_bin() {
  for target in $(find /usr -name "${1}"); do
    copy_directory "${target}" "${NV_DIR}/bin/"
  done
}

function copy_directory() {
  local -r lib=$1
  local -r path=$2

  echo "copy ${lib} to ${path}"
  cp -Pf "${lib}" "${path}"
}

rm -rf ${NV_DIR}
mkdir -p ${NV_DIR}/{bin,lib,lib64}

for file in ${LIB_FILES[@]}; do
  copy_lib "${file}"
done

for file in ${BIN_FILES[@]}; do
  copy_bin "${file}"
done

# fix libvdpau_nvidia.so
(
  cd ${NV_DIR}/lib
  rm -rf libvdpau_nvidia.so
  rel_path=$(readlink -f libvdpau_nvidia.so.1)
  ln -s $(basename ${rel_path}) libvdpau_nvidia.so
)

(
  cd ${NV_DIR}/lib64
  rm -rf libvdpau_nvidia.so
  rel_path=$(readlink -f libvdpau_nvidia.so.1)
  ln -s $(basename ${rel_path}) libvdpau_nvidia.so
)

# fix libnvidia-ml.so
(
  cd ${NV_DIR}/lib
  rm -rf libnvidia-ml.so
  rel_path=$(readlink -f libnvidia-ml.so.1)
  ln -s $(basename ${rel_path}) libnvidia-ml.so
)

(
  cd ${NV_DIR}/lib64
  rm -rf libnvidia-ml.so
  rel_path=$(readlink -f libnvidia-ml.so.1)
  ln -s $(basename ${rel_path}) libnvidia-ml.so
)
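
The script assumes `/etc/gpu-manager/volume.conf` (installed by the RPM spec below) is a JSON file whose second `volume` entry lists the library name prefixes and binary names to collect. The entries below are purely illustrative assumptions, not the file shipped with this commit:

```
{
  "volume": [
    {
      "components": {
        "libraries": [],
        "binaries": []
      }
    },
    {
      "components": {
        "libraries": ["libcuda", "libnvidia-ml"],
        "binaries": ["nvidia-smi"]
      }
    }
  ]
}
```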
1 change: 1 addition & 0 deletions build/extra-config.json
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions build/gpu-manager.conf
@@ -0,0 +1 @@
GPU_MANAGER_ARGS="--extra-config=/etc/gpu-manager/extra-config.json --addr=/var/run/gpu-manager.sock --v=2 --logtostderr"
34 changes: 34 additions & 0 deletions build/gpu-manager.service
@@ -0,0 +1,34 @@
[Unit]
Description=GPU Manager Runtime
After=network-online.target docker.socket kubelet.service
Wants=network-online.target kubelet.service

[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues
# still exist and systemd currently does not support the cgroup feature set
# required for containers run by docker
EnvironmentFile=-/etc/gpu-manager/gpu-manager.conf
ExecStart=/usr/bin/gpu-manager $GPU_MANAGER_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this option.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the gpu-manager process, not all processes in the cgroup
KillMode=process
# restart the gpu-manager process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
UMask=0000

[Install]
WantedBy=multi-user.target
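
Once the RPM below has installed the unit, it can be enabled on a node with the usual systemd commands (a sketch):

```
systemctl daemon-reload
systemctl enable --now gpu-manager
journalctl -u gpu-manager -f
```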
48 changes: 48 additions & 0 deletions build/gpu-manager.spec
@@ -0,0 +1,48 @@
Name: gpu-manager
Version: %{version}
Release: %{commit}%{?dist}
Summary: GPU Manager Plugin for Kubernetes

License: MIT
Source: gpu-manager-source.tar.gz

Requires: systemd-units

%define pkgname %{name}-%{version}-%{release}

%description
GPU Manager Plugin for Kubernetes

%prep
%setup -n gpu-manager-%{version}


%build
make all

%install
install -d $RPM_BUILD_ROOT/%{_bindir}
install -d $RPM_BUILD_ROOT/%{_unitdir}
install -d $RPM_BUILD_ROOT/etc/gpu-manager

install -p -m 755 ./go/bin/gpu-manager $RPM_BUILD_ROOT/%{_bindir}/
install -p -m 755 ./go/bin/gpu-client $RPM_BUILD_ROOT/%{_bindir}/

install -p -m 644 ./build/extra-config.json $RPM_BUILD_ROOT/etc/gpu-manager/
install -p -m 644 ./build/gpu-manager.conf $RPM_BUILD_ROOT/etc/gpu-manager/
install -p -m 644 ./build/volume.conf $RPM_BUILD_ROOT/etc/gpu-manager/

install -p -m 644 ./build/gpu-manager.service $RPM_BUILD_ROOT/%{_unitdir}/

%clean
rm -rf $RPM_BUILD_ROOT

%files
%config(noreplace,missingok) /etc/gpu-manager/extra-config.json
%config(noreplace,missingok) /etc/gpu-manager/gpu-manager.conf
%config(noreplace,missingok) /etc/gpu-manager/volume.conf

%{_bindir}/gpu-manager
%{_bindir}/gpu-client

%{_unitdir}/gpu-manager.service
13 changes: 13 additions & 0 deletions build/start.sh
@@ -0,0 +1,13 @@
#!/bin/bash

set -o errexit
set -o pipefail
set -o nounset

source copy-bin-lib.sh

echo "rebuild ldcache"
/usr/sbin/ldconfig

echo "launch gpu manager"
/usr/bin/gpu-manager \
  --extra-config=/etc/gpu-manager/extra-config.json \
  --v=${LOG_LEVEL} \
  --hostname-override=${NODE_NAME} \
  --kubeconfig=/root/.kube/config \
  --share-mode=true \
  --volume-config=/etc/gpu-manager/volume.conf \
  --log-dir=/var/log/gpu-manager \
  --query-addr=0.0.0.0 \
  ${EXTRA_FLAGS:-""}
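
start.sh expects LOG_LEVEL and NODE_NAME (and optionally EXTRA_FLAGS) in the container environment. A sketch of how the DaemonSet could supply them — the surrounding manifest structure is an assumption; only the variable names come from this script:

```
env:
  - name: LOG_LEVEL
    value: "2"
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName   # standard downward API field
```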
