Commit: Init commit

Signed-off-by: mYmNeo <thomassong2012@gmail.com>
mYmNeo committed Nov 5, 2019
0 parents commit cb32733
Showing 27,408 changed files with 8,423,714 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
go/
32 changes: 32 additions & 0 deletions Makefile
@@ -0,0 +1,32 @@
.PHONY: all
all:
	hack/build.sh manager client

.PHONY: clean
clean:
	rm -rf ./go

.PHONY: vendor
vendor:
	rm -rf vendor
	hack/glide.sh

.PHONY: test
test:
	hack/build.sh "test"

.PHONY: proto
proto:
	hack/build.sh "proto"

.PHONY: img
img:
	hack/build.sh "img"

.PHONY: fmt
fmt:
	hack/build.sh "fmt"

.PHONY: lint
lint:
	@revive -config revive.toml -exclude vendor/... -exclude pkg/api/runtime/... ./...
35 changes: 35 additions & 0 deletions README.md
@@ -0,0 +1,35 @@
# GPU Manager

GPU Manager is used for managing the NVIDIA GPU devices in a Kubernetes cluster. It implements the Kubernetes `DevicePlugin`
interface, so it is compatible with Kubernetes 1.9 and later releases.

Compared with the combination of `nvidia-docker` and `nvidia-k8s-plugin`, GPU Manager uses native `runc`
without modification, while the NVIDIA solution requires a modified runtime.
Besides, we also support metrics reporting without deploying any new components.

To schedule a GPU payload correctly, GPU Manager should work together with `gpu-quota-admission`, which is a Kubernetes scheduler plugin.
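
`gpu-quota-admission` is typically hooked into kube-scheduler as a scheduler extender. The sketch below uses the standard Kubernetes scheduler extender policy format; the address, port, path and verb values are placeholder assumptions and are not shipped by this commit — consult the `gpu-quota-admission` project for the real settings.

```
{
  "kind": "Policy",
  "apiVersion": "v1",
  "extenders": [
    {
      "urlPrefix": "http://<gpu-quota-admission-address>:<port>/scheduler",
      "filterVerb": "predicates",
      "enableHttps": false,
      "nodeCacheCapable": false
    }
  ]
}
```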

GPU Manager also supports payloads that use a fraction of a GPU device, such as 0.1 of a card or 100 MiB of GPU device memory.
If you want this kind of feature, please refer to the `vcuda` project.
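
As a rough sketch only — the resource names and units below (`tencent.com/vcuda-core`, where 100 units correspond to one physical card, and `tencent.com/vcuda-memory`, counted in 256 MiB blocks) are assumptions drawn from the `vcuda` ecosystem rather than from this commit — a pod requesting a fractional GPU might look like:

```
apiVersion: v1
kind: Pod
metadata:
  name: vcuda-demo
spec:
  containers:
    - name: cuda
      image: <your cuda image>
      resources:
        limits:
          tencent.com/vcuda-core: 10    # assumed: 10 == 0.1 of a card
          tencent.com/vcuda-memory: 4   # assumed: 4 x 256 MiB
```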

# How to deploy GPU Manager

GPU Manager runs as a DaemonSet. Because of RBAC restrictions and hybrid clusters,
you need to do the following steps to make this DaemonSet run correctly.

- create a service account and cluster role binding

```
kubectl create sa gpu-manager -n kube-system
kubectl create clusterrolebinding gpu-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-manager
```

- label node with `nvidia-device-enable=enable`

```
kubectl label node <node> nvidia-device-enable=enable
```

- change gpu-manager.yaml and submit

Change `--incluster-mode` from `false` to `true`, change the image field to `<your repository>/public/gpu-manager:latest`, and set the serviceAccount field to the `gpu-manager` service account created above, as sketched below.
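
A minimal sketch of the affected DaemonSet fields — the layout is illustrative; everything except the three values above is an assumption, not the actual gpu-manager.yaml:

```
spec:
  template:
    spec:
      serviceAccount: gpu-manager
      containers:
        - name: gpu-manager
          image: <your repository>/public/gpu-manager:latest
          args:
            - --incluster-mode=true
```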
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.2.0
68 changes: 68 additions & 0 deletions build/Dockerfile
@@ -0,0 +1,68 @@
ARG base_img
FROM nvidia/cuda:10.1-devel-centos7 as build

ARG version
ARG commit

RUN yum install -y rpm-build make git

ENV GOLANG_VERSION 1.12.4
RUN curl -sSL https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz \
| tar -C /usr/local -xz
ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH

RUN mkdir -p /root/rpmbuild/{SPECS,SOURCES}

COPY gpu-manager.spec /root/rpmbuild/SPECS
COPY gpu-manager-source.tar.gz /root/rpmbuild/SOURCES

RUN echo '%_topdir /root/rpmbuild' > /root/.rpmmacros \
&& echo '%__os_install_post %{nil}' >> /root/.rpmmacros \
&& echo '%debug_package %{nil}' >> /root/.rpmmacros
WORKDIR /root/rpmbuild/SPECS
RUN rpmbuild -bb --quiet \
--define 'version '${version}'' \
--define 'commit '${commit}'' \
gpu-manager.spec

FROM $base_img

ARG version
ARG commit

COPY --from=build /root/rpmbuild/RPMS/x86_64/gpu-manager-${version}-${commit}.el7.x86_64.rpm /tmp

RUN yum install epel-release -y && \
yum install -y which jq

# Install packages
RUN rpm -ivh /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm \
&& rm -rf /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm

# kubelet
VOLUME ["/var/lib/kubelet/device-plugins"]

# gpu manager storage
VOLUME ["/etc/gpu-manager/vm"]
VOLUME ["/etc/gpu-manager/vdriver"]
VOLUME ["/var/log/gpu-manager"]

# nvidia library search location
VOLUME ["/usr/local/host"]

RUN echo "/usr/local/nvidia/lib" > /etc/ld.so.conf.d/nvidia.conf && \
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf

ENV PATH=$PATH:/usr/local/nvidia/bin

# cgroup
VOLUME ["/sys/fs/cgroup"]

# display
EXPOSE 5678

COPY start.sh /
COPY copy-bin-lib.sh /

CMD ["/start.sh"]
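
The Dockerfile above expects `gpu-manager.spec`, `gpu-manager-source.tar.gz`, `start.sh` and `copy-bin-lib.sh` in its build context and is parameterized by `base_img`, `version` and `commit`. The official image build goes through `make img` (`hack/build.sh "img"`); purely as an illustrative sketch, a manual invocation could look like this, with the tag and base image as assumptions:

```
docker build \
  --build-arg base_img=centos:7 \
  --build-arg version=$(cat VERSION) \
  --build-arg commit=$(git rev-parse --short HEAD) \
  -t <your repository>/public/gpu-manager:$(cat VERSION) \
  -f build/Dockerfile <build context>
```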
83 changes: 83 additions & 0 deletions build/copy-bin-lib.sh
@@ -0,0 +1,83 @@
#!/bin/bash

set -o pipefail
set -o errexit
set -o nounset

FILE=${FILE:-"/etc/gpu-manager/volume.conf"}
LIB_FILES=$(jq -r '.volume[1].components.libraries[]' "${FILE}")
BIN_FILES=$(jq -r '.volume[1].components.binaries[]' "${FILE}")
readonly NV_DIR="/usr/local/nvidia"
readonly FIND_BASE=${FIND_BASE:-"/usr/local/host"}

function check_arch() {
  local -r lib=$1
  if [[ $(objdump -f "${lib}" | grep -o "elf64-x86-64") == "elf64-x86-64" ]]; then
    echo "64"
  else
    echo ""
  fi
}

function copy_lib() {
  for target in $(find /usr -name "${1}*" | grep -v "stubs"); do
    if [[ $(objdump -p "${target}" 2>/dev/null | grep -o "SONAME") == "SONAME" ]]; then
      copy_directory "${target}" "${NV_DIR}/lib$(check_arch "${target}")"
    fi
  done
}

function copy_bin() {
  for target in $(find /usr -name "${1}"); do
    copy_directory "${target}" "${NV_DIR}/bin/"
  done
}

function copy_directory() {
  local -r lib=$1
  local -r path=$2

  echo "copy ${lib} to ${path}"
  cp -Pf "${lib}" "${path}"
}

rm -rf ${NV_DIR}
mkdir -p ${NV_DIR}/{bin,lib,lib64}

for file in ${LIB_FILES[@]}; do
  copy_lib "${file}"
done

for file in ${BIN_FILES[@]}; do
  copy_bin "${file}"
done

# fix libvdpau_nvidia.so
(
  cd ${NV_DIR}/lib
  rm -rf libvdpau_nvidia.so
  rel_path=$(readlink -f libvdpau_nvidia.so.1)
  ln -s $(basename ${rel_path}) libvdpau_nvidia.so
)

(
  cd ${NV_DIR}/lib64
  rm -rf libvdpau_nvidia.so
  rel_path=$(readlink -f libvdpau_nvidia.so.1)
  ln -s $(basename ${rel_path}) libvdpau_nvidia.so
)

# fix libnvidia-ml.so
(
  cd ${NV_DIR}/lib
  rm -rf libnvidia-ml.so
  rel_path=$(readlink -f libnvidia-ml.so.1)
  ln -s $(basename ${rel_path}) libnvidia-ml.so
)

(
  cd ${NV_DIR}/lib64
  rm -rf libnvidia-ml.so
  rel_path=$(readlink -f libnvidia-ml.so.1)
  ln -s $(basename ${rel_path}) libnvidia-ml.so
)
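
The script assumes `/etc/gpu-manager/volume.conf` (installed by the RPM spec below) is a JSON file whose second `volume` entry lists the library name prefixes and binary names to collect. The entries below are purely illustrative assumptions, not the file shipped with this commit:

```
{
  "volume": [
    {
      "components": {
        "libraries": [],
        "binaries": []
      }
    },
    {
      "components": {
        "libraries": ["libcuda", "libnvidia-ml"],
        "binaries": ["nvidia-smi"]
      }
    }
  ]
}
```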
1 change: 1 addition & 0 deletions build/extra-config.json
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions build/gpu-manager.conf
@@ -0,0 +1 @@
GPU_MANAGER_ARGS="--extra-config=/etc/gpu-manager/extra-config.json --addr=/var/run/gpu-manager.sock --v=2 --logtostderr"
34 changes: 34 additions & 0 deletions build/gpu-manager.service
@@ -0,0 +1,34 @@
[Unit]
Description=GPU Manager Runtime
After=network-online.target docker.socket kubelet.service
Wants=network-online.target kubelet.service

[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues
# still exist and systemd currently does not support the cgroup feature set
# required for containers run by docker
EnvironmentFile=-/etc/gpu-manager/gpu-manager.conf
ExecStart=/usr/bin/gpu-manager $GPU_MANAGER_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this option.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the gpu-manager process, not all processes in the cgroup
KillMode=process
# restart the gpu-manager process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
UMask=0000

[Install]
WantedBy=multi-user.target
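
Once the RPM below has installed the unit, it can be enabled on a node with the usual systemd commands (a sketch):

```
systemctl daemon-reload
systemctl enable --now gpu-manager
journalctl -u gpu-manager -f
```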
48 changes: 48 additions & 0 deletions build/gpu-manager.spec
@@ -0,0 +1,48 @@
Name: gpu-manager
Version: %{version}
Release: %{commit}%{?dist}
Summary: GPU Manager Plugin for Kubernetes

License: MIT
Source: gpu-manager-source.tar.gz

Requires: systemd-units

%define pkgname %{name}-%{version}-%{release}

%description
GPU Manager Plugin for Kubernetes

%prep
%setup -n gpu-manager-%{version}


%build
make all

%install
install -d $RPM_BUILD_ROOT/%{_bindir}
install -d $RPM_BUILD_ROOT/%{_unitdir}
install -d $RPM_BUILD_ROOT/etc/gpu-manager

install -p -m 755 ./go/bin/gpu-manager $RPM_BUILD_ROOT/%{_bindir}/
install -p -m 755 ./go/bin/gpu-client $RPM_BUILD_ROOT/%{_bindir}/

install -p -m 644 ./build/extra-config.json $RPM_BUILD_ROOT/etc/gpu-manager/
install -p -m 644 ./build/gpu-manager.conf $RPM_BUILD_ROOT/etc/gpu-manager/
install -p -m 644 ./build/volume.conf $RPM_BUILD_ROOT/etc/gpu-manager/

install -p -m 644 ./build/gpu-manager.service $RPM_BUILD_ROOT/%{_unitdir}/

%clean
rm -rf $RPM_BUILD_ROOT

%files
%config(noreplace,missingok) /etc/gpu-manager/extra-config.json
%config(noreplace,missingok) /etc/gpu-manager/gpu-manager.conf
%config(noreplace,missingok) /etc/gpu-manager/volume.conf

%{_bindir}/gpu-manager
%{_bindir}/gpu-client

%{_unitdir}/gpu-manager.service
13 changes: 13 additions & 0 deletions build/start.sh
@@ -0,0 +1,13 @@
#!/bin/bash

set -o errexit
set -o pipefail
set -o nounset

source copy-bin-lib.sh

echo "rebuild ldcache"
/usr/sbin/ldconfig

echo "launch gpu manager"
/usr/bin/gpu-manager \
  --extra-config=/etc/gpu-manager/extra-config.json \
  --v=${LOG_LEVEL} \
  --hostname-override=${NODE_NAME} \
  --kubeconfig=/root/.kube/config \
  --share-mode=true \
  --volume-config=/etc/gpu-manager/volume.conf \
  --log-dir=/var/log/gpu-manager \
  --query-addr=0.0.0.0 \
  ${EXTRA_FLAGS:-""}
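
start.sh expects LOG_LEVEL and NODE_NAME (and optionally EXTRA_FLAGS) in the container environment. A sketch of how the DaemonSet could supply them — the surrounding manifest structure is an assumption; only the variable names come from this script:

```
env:
  - name: LOG_LEVEL
    value: "2"
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName   # standard downward API field
```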
