Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: wip max in-flight pulls #81

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .github/workflows/containerd.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: containerd-11mins
name: containerd-16mins
on:
push:
branches: [master]
Expand Down Expand Up @@ -36,3 +36,18 @@ jobs:
run: ./hack/ci/test.sh
- name: Uninstall the CSI Driver
run: helm uninstall -n kube-system ${HELM_NAME} --wait
- name: Install the CSI Driver (for `--max-in-flight-pulls`)
run: |
helm install ${HELM_NAME} charts/warm-metal-csi-driver -n kube-system \
-f ${VALUE_FILE} \
--set csiPlugin.image.tag=${IMAGE_TAG} \
--set csiPlugin.maxInFlightPulls=1 \
--set csiPlugin.asyncPullTimeout=1ms \
--set csiPlugin.enableAsyncPullMount=true \
--wait \
--debug
- name: Run Tests
run: test/integration/test-max-in-flight-pulls.sh
- name: Uninstall the CSI Driver (for `--max-in-flight-pulls`)
run: helm uninstall -n kube-system ${HELM_NAME} --wait

18 changes: 17 additions & 1 deletion .github/workflows/cri-o.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: cri-o-10mins
name: cri-o-15mins
on:
push:
branches: [master]
Expand Down Expand Up @@ -37,3 +37,19 @@ jobs:
run: ./hack/ci/test.sh
- name: Uninstall the CSI Driver
run: helm uninstall -n kube-system ${HELM_NAME} --wait
- name: Install the CSI Driver (for `--max-in-flight-pulls`)
run: |
helm install ${HELM_NAME} charts/warm-metal-csi-driver -n kube-system \
-f ${VALUE_FILE} \
--set csiPlugin.image.tag=${IMAGE_TAG} \
--set csiPlugin.maxInFlightPulls=1 \
--set csiPlugin.asyncPullTimeout=1ms \
--set csiPlugin.enableAsyncPullMount=true \
--wait \
--debug
# - name: debug using ssh
# uses: lhotari/action-upterm@v1
- name: Run Tests
run: test/integration/test-max-in-flight-pulls.sh
- name: Uninstall the CSI Driver (for `--max-in-flight-pulls`)
run: helm uninstall -n kube-system ${HELM_NAME} --wait
37 changes: 0 additions & 37 deletions .github/workflows/metrics.yaml

This file was deleted.

9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,15 @@ sanity:
e2e:
cd ./test/e2e && KUBECONFIG=~/.kube/config go run .

# to run unit tests
# PHONY signifies the make recipe is not a file
# more info: https://stackoverflow.com/questions/2145590/what-is-the-purpose-of-phony-in-a-makefile
.PHONY: unit-tests
unit-tests:
# -count=1 forces re-test everytime (instead of caching the results)
# more info: https://stackoverflow.com/questions/48882691/force-retesting-or-disable-test-caching
go test -count=1 ./cmd/plugin

.PHONY: integration
integration:
./test/integration/test-backward-compatability.sh
Expand Down
8 changes: 7 additions & 1 deletion charts/warm-metal-csi-driver/templates/nodeplugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,15 @@ spec:
{{- if .Values.enableDaemonImageCredentialCache }}
- --enable-daemon-image-credential-cache
{{- end }}
{{- if .Values.enableAsyncPullMount }}
{{- if .Values.csiPlugin.enableAsyncPullMount }}
- --async-pull-mount=true
{{- end }}
{{- if .Values.csiPlugin.asyncPullTimeout }}
- --async-pull-timeout={{ .Values.csiPlugin.asyncPullTimeout }}
{{- end }}
{{- if .Values.csiPlugin.maxInFlightPulls }}
- --max-in-flight-pulls={{ .Values.csiPlugin.maxInFlightPulls }}
{{- end }}
- "-v={{ .Values.logLevel }}"
- "--mode=node"
env:
Expand Down
4 changes: 3 additions & 1 deletion charts/warm-metal-csi-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ kubeletRoot: /var/lib/kubelet
snapshotRoot: /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs
logLevel: 4
enableDaemonImageCredentialCache:
enableAsyncPullMount: false
pullImageSecretForDaemonset:

csiPlugin:
hostNetwork: false
maxInFlightPulls: -1 # i.e., no pull limit
asyncPullTimeout:
enableAsyncPullMount: false
resources: {}
image:
tag: ""
Expand Down
22 changes: 20 additions & 2 deletions cmd/plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/warm-metal/csi-driver-image/pkg/backend/crio"
"github.com/warm-metal/csi-driver-image/pkg/cri"
"github.com/warm-metal/csi-driver-image/pkg/metrics"
"github.com/warm-metal/csi-driver-image/pkg/pullexecutor"
"github.com/warm-metal/csi-driver-image/pkg/secret"
"github.com/warm-metal/csi-driver-image/pkg/watcher"
csicommon "github.com/warm-metal/csi-drivers/pkg/csi-common"
Expand Down Expand Up @@ -57,6 +58,8 @@ var (
watcherResyncPeriod = flag.Duration("watcher-resync-period", 30*time.Minute, "The resync period of the pvc watcher.")
mode = flag.String("mode", "", "The mode of the driver. Valid values are: node, controller")
nodePluginSA = flag.String("node-plugin-sa", "csi-image-warm-metal", "The name of the ServiceAccount used by the node plugin.")
maxInflightPulls = flag.Int("max-in-flight-pulls", -1, "The maximum number of image pull operations that can happen at the same time. Only works if --async-pull-mount is set to true. (default: -1 which means there is no limit)")
asyncPullTimeout = flag.Duration("async-pull-timeout", pullexecutor.DefaultPullPollTimeout, "(EXPERIMENTAL) Synchronous wait timeout for image to get pulled after which a response is returned and image pull happens in the background. Only works if --async-pull-mount is set to true. (default: 2 minutes)")
)

func main() {
Expand All @@ -81,6 +84,20 @@ func main() {
klog.Fatalf("The mode of the driver is required.")
}

if *maxInflightPulls == 0 {
klog.Fatalf("--max-in-flight-pulls cannot be zero (current value: '%v')", *maxInflightPulls)
}

if !*asyncImagePullMount && *maxInflightPulls > 0 {
klog.Fatalf("--max-in-flight-pulls (current value: '%v') can only be used with --async-pull-mount=true (current value: %v)", *maxInflightPulls, *asyncImagePullMount)
}

// `.Changed` is used to check if the flag was specified by the user
// more info: https://github.com/spf13/pflag/issues/293#issuecomment-719996662
if !*asyncImagePullMount && flag.Lookup("async-pull-timeout").Changed {
klog.Fatalf("--async-pull-timeout (current value: '%v') can only be used with --async-pull-mount=true (current value: %v)", *asyncPullTimeout, *asyncImagePullMount)
}

server := csicommon.NewNonBlockingGRPCServer()

switch *mode {
Expand All @@ -99,7 +116,7 @@ func main() {
*runtimeAddr = addr.String()
}

var mounter *backend.SnapshotMounter
var mounter backend.Mounter
if len(*runtimeAddr) > 0 {
addr, err := url.Parse(*runtimeAddr)
if err != nil {
Expand Down Expand Up @@ -130,7 +147,8 @@ func main() {
server.Start(*endpoint,
NewIdentityServer(driverVersion),
nil,
NewNodeServer(driver, mounter, criClient, secretStore, *asyncImagePullMount))
NewNodeServer(driver, mounter, criClient, secretStore,
*asyncImagePullMount, *maxInflightPulls, *asyncPullTimeout))
case controllerMode:
watcher, err := watcher.New(context.Background(), *watcherResyncPeriod)
if err != nil {
Expand Down
14 changes: 12 additions & 2 deletions cmd/plugin/node_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"os"
"strings"
"time"

"github.com/container-storage-interface/spec/lib/go/csi"
"github.com/containerd/containerd/reference/docker"
Expand All @@ -26,11 +27,12 @@ const (
ctxKeyImage = "image"
ctxKeyPullAlways = "pullAlways"
ctxKeyEphemeralVolume = "csi.storage.k8s.io/ephemeral"
ctxKeyPodUid = "csi.storage.k8s.io/pod.uid"
)

type ImagePullStatus int

func NewNodeServer(driver *csicommon.CSIDriver, mounter *backend.SnapshotMounter, imageSvc cri.ImageServiceClient, secretStore secret.Store, asyncImagePullMount bool) *NodeServer {
func NewNodeServer(driver *csicommon.CSIDriver, mounter backend.Mounter, imageSvc cri.ImageServiceClient, secretStore secret.Store, asyncImagePullMount bool, maxInflightPulls int, asyncPullTimeout time.Duration) *NodeServer {
return &NodeServer{
DefaultNodeServer: csicommon.NewDefaultNodeServer(driver),
mounter: mounter,
Expand All @@ -45,13 +47,15 @@ func NewNodeServer(driver *csicommon.CSIDriver, mounter *backend.SnapshotMounter
ImageServiceClient: imageSvc,
SecretStore: secretStore,
Mounter: mounter,
MaxInflightPulls: maxInflightPulls,
AsyncPullTimeout: asyncPullTimeout,
}),
}
}

type NodeServer struct {
*csicommon.DefaultNodeServer
mounter *backend.SnapshotMounter
mounter backend.Mounter
secretStore secret.Store
asyncImagePullMount bool
mountExecutor *mountexecutor.MountExecutor
Expand All @@ -75,6 +79,11 @@ func (n NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishV
return
}

if len(req.VolumeContext[ctxKeyPodUid]) == 0 {
err = status.Error(codes.InvalidArgument, "Pod Uid is missing")
return
}

if _, isBlock := req.VolumeCapability.AccessType.(*csi.VolumeCapability_Block); isBlock {
err = status.Error(codes.InvalidArgument, "unable to mount as a block device")
return
Expand Down Expand Up @@ -134,6 +143,7 @@ func (n NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishV
PullAlways: pullAlways,
Image: image,
PullSecrets: req.Secrets,
PodUid: req.VolumeContext[ctxKeyPodUid],
}

if e := n.pullExecutor.StartPulling(po); e != nil {
Expand Down
Loading