From 364ef5a5598774078e4c253372aa723c8bb71c6d Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 7 Dec 2020 11:38:43 -0800
Subject: [PATCH 1/2] Support multiple concurrent builder deployments

This fixes a glitch in the way builders were looked up, where multiple
builders running on the same cluster wound up incorrectly identified
for transfer during builds.  It also refines the way we define the
buildkit state directory for containerd based builds.  This state
directory must be on the host so containerd can mount it into child
containers, but multiple instances of buildkitd cannot share it.

Most of the tests have been switched to support parallel runs.  While
testing, however, I noticed `kind` does not work in parallel (seemingly
random low-level containerd failures to start containers), so CI forces
serial testing for kind.  Native dockerd and containerd clusters work
correctly in parallel.
---
 .github/workflows/pull_request.yaml        |  5 ++--
 Makefile                                   |  7 +++--
 integration/common/basesuites.go           |  8 +++---
 integration/suites/configmap_test.go       |  4 +--
 integration/suites/default_test.go         |  2 +-
 integration/suites/randomlb_test.go        |  2 +-
 pkg/build/build.go                         | 30 ++++++++++++----------
 pkg/cmd/build.go                           |  2 +-
 pkg/driver/kubernetes/driver.go            |  5 ++--
 pkg/driver/kubernetes/factory.go           | 17 ++----------
 pkg/driver/kubernetes/manifest/manifest.go |  9 +++----
 11 files changed, 42 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml
index 83611a14..a5f378c3 100644
--- a/.github/workflows/pull_request.yaml
+++ b/.github/workflows/pull_request.yaml
@@ -40,7 +40,7 @@ jobs:
       - name: Setup kind cluster
         run: kind create cluster --config ./integration/kind-ci.yaml
       - name: Run integration tests
-        run: make integration EXTRA_GO_TEST_FLAGS=-v
+        run: make integration EXTRA_GO_TEST_FLAGS=-v TEST_PARALLELISM=1
       - name: Gather integration coverage results
         uses: codecov/codecov-action@v1
         with:
@@ -69,7 +69,8 @@ jobs:
          kubectl taint nodes --all node-role.kubernetes.io/master-
          kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
          kubectl wait --for=condition=ready --timeout=30s node --all
-         kubectl get nodes
+         kubectl get nodes -o wide
+         docker version
       - name: Run integration tests
         run: make integration EXTRA_GO_TEST_FLAGS=-v
diff --git a/Makefile b/Makefile
index 62feb578..4e4a1c48 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@
 VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always)
 
 TEST_KUBECONFIG?=$(HOME)/.kube/config
-TEST_TIMEOUT=300s
+TEST_TIMEOUT=600s
+TEST_PARALLELISM=3
 
 # Verify Go in PATH
 ifeq (, $(shell which go))
@@ -71,7 +72,9 @@ test:
 integration:
 	@echo "Running integration tests with $(TEST_KUBECONFIG)"
 	@kubectl config get-contexts
-	TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) $(EXTRA_GO_TEST_FLAGS) \
+	TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) \
+		-parallel $(TEST_PARALLELISM) \
+		$(EXTRA_GO_TEST_FLAGS) \
 		$(GO_COVER_FLAGS) -coverprofile=./cover-int.out \
 		./integration/...
diff --git a/integration/common/basesuites.go b/integration/common/basesuites.go
index 717c3049..dbbffbba 100644
--- a/integration/common/basesuites.go
+++ b/integration/common/basesuites.go
@@ -26,7 +26,7 @@ type BaseSuite struct {
 	Namespace string
 }
 
-func (s *BaseSuite) SetupTest() {
+func (s *BaseSuite) SetupSuite() {
 	var err error
 	if !s.SkipSetupCreate {
 		logrus.Infof("%s: Setting up builder", s.Name)
@@ -44,7 +44,7 @@ func (s *BaseSuite) SetupTest() {
 	require.NoError(s.T(), err, "%s: kube client failed", s.Name)
 }
 
-func (s *BaseSuite) TearDownTest() {
+func (s *BaseSuite) TearDownSuite() {
 	logrus.Infof("%s: Removing builder", s.Name)
 	err := RunBuildkit("rm", []string{
 		s.Name,
@@ -62,7 +62,7 @@ func (s *BaseSuite) TestSimpleBuild() {
 	dir, cleanup, err := NewSimpleBuildContext()
 	defer cleanup()
 	require.NoError(s.T(), err, "Failed to set up temporary build context")
-	args := []string{}
+	args := []string{"--progress=plain"}
 	if s.Name != "buildkit" { // TODO wire up the default name variable
 		args = append(
 			args,
@@ -102,7 +102,7 @@ func (s *BaseSuite) TestLocalOutputTarBuild() {
 	dir, cleanup, err := NewSimpleBuildContext()
 	defer cleanup()
 	require.NoError(s.T(), err, "Failed to set up temporary build context")
-	args := []string{}
+	args := []string{"--progress=plain"}
 	if s.Name != "buildkit" { // TODO wire up the default name variable
 		args = append(
 			args,
diff --git a/integration/suites/configmap_test.go b/integration/suites/configmap_test.go
index 4359b3d0..0292d4ed 100644
--- a/integration/suites/configmap_test.go
+++ b/integration/suites/configmap_test.go
@@ -30,7 +30,7 @@ type configMapSuite struct {
 	configMapClient v1.ConfigMapInterface
 }
 
-func (s *configMapSuite) SetupTest() {
+func (s *configMapSuite) SetupSuite() {
 	var err error
 	s.ClientSet, s.Namespace, err = common.GetKubeClientset()
 	require.NoError(s.T(), err, "%s: kube client failed", s.Name)
@@ -200,7 +200,7 @@ func (s *configMapSuite) TestPreExistingWithCustomCreate() {
 
 func TestConfigMapSuite(t *testing.T) {
 	common.Skipper(t)
-	//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
+	t.Parallel()
 	suite.Run(t, &configMapSuite{
 		Name: "configmaptest",
 	})
diff --git a/integration/suites/default_test.go b/integration/suites/default_test.go
index d6dc0b65..e9c905b0 100644
--- a/integration/suites/default_test.go
+++ b/integration/suites/default_test.go
@@ -14,7 +14,7 @@ type DefaultSuite struct{ common.BaseSuite }
 
 func TestDefaultSuite(t *testing.T) {
 	common.Skipper(t)
-	//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
+	t.Parallel()
 	suite.Run(t, &DefaultSuite{
 		BaseSuite: common.BaseSuite{
 			Name: "buildkit", // TODO pull this from the actual default name
diff --git a/integration/suites/randomlb_test.go b/integration/suites/randomlb_test.go
index 9421bb81..9c65d095 100644
--- a/integration/suites/randomlb_test.go
+++ b/integration/suites/randomlb_test.go
@@ -16,7 +16,7 @@ type RandomLBSuite struct{ common.BaseSuite }
 
 func TestRandomLBSuite(t *testing.T) {
 	common.Skipper(t)
-	//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
+	t.Parallel()
 	suite.Run(t, &RandomLBSuite{
 		BaseSuite: common.BaseSuite{
 			Name: "randomlb",
diff --git a/pkg/build/build.go b/pkg/build/build.go
index a322e386..610f2829 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -581,14 +581,9 @@ func Build(ctx context.Context, drivers []DriverInfo, opt map[string]Options, ku
 		multiDriver := len(m[k]) > 1
 		for i, dp := range m[k] {
 			d := drivers[dp.driverIndex].Driver
+			driverName := drivers[dp.driverIndex].Name
 			opt.Platforms = dp.platforms
-			// TODO - this is a little messy - some recactoring of this routine would be worthwhile
-			var chosenNodeName string
-			for name := range clients[d.Factory().Name()] {
-				chosenNodeName = name // TODO - this doesn't actually seem to be working!!!
-				break
-			}
 			// TODO - this is also messy and wont work for multi-driver scenarios (no that it's possible yet...)
 			if auth == nil {
 				auth = d.GetAuthWrapper(registrySecretName)
@@ -598,9 +593,9 @@ func Build(ctx context.Context, drivers []DriverInfo, opt map[string]Options, ku
 				// Set up loader based on first found type (only 1 supported)
 				for _, entry := range opt.Exports {
 					if entry.Type == "docker" {
-						return newDockerLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
+						return newDockerLoader(ctx, d, kubeClientConfig, driverName, mw)
 					} else if entry.Type == "oci" {
-						return newContainerdLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
+						return newContainerdLoader(ctx, d, kubeClientConfig, driverName, mw)
 					}
 				}
 				// TODO - Push scenario? (or is this a "not reached" scenario now?)
@@ -899,7 +894,7 @@ func notSupported(d driver.Driver, f driver.Feature) error {
 
 type dockerLoadCallback func(name string) (io.WriteCloser, func(), error)
 
-func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
+func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
 	nodeNames := []string{}
 	// TODO this isn't quite right - we need a better "list only pods from one instance" func
 	builders, err := d.List(ctx)
@@ -907,10 +902,16 @@ func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clie
 		return nil, nil, err
 	}
 	for _, builder := range builders {
+		if builder.Name != builderName {
+			continue
+		}
 		for _, node := range builder.Nodes {
 			nodeNames = append(nodeNames, node.Name)
 		}
 	}
+	if len(nodeNames) == 0 {
+		return nil, nil, fmt.Errorf("no builders found for %s", builderName)
+	}
 
 	// TODO revamp this flow to return a list of pods
 	readers := make([]*io.PipeReader, len(nodeNames))
@@ -1019,7 +1020,7 @@ func (w *waitingWriter) Close() error {
 	return err
 }
 
-func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
+func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
 	nodeNames := []string{}
 	// TODO this isn't quite right - we need a better "list only pods from one instance" func
 	// TODO revamp this flow to return a list of pods
@@ -1029,12 +1030,15 @@ func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig
 		return nil, nil, err
 	}
 	for _, builder := range builders {
+		if builder.Name != builderName {
+			continue
+		}
 		for _, node := range builder.Nodes {
 			// For containerd, we never have to load on the node where it was built
 			// TODO - this doesn't actually work yet, but when we switch the output.Type to "oci" we need to load it everywhere anyway
-			if node.Name == chosenNodeName {
-				continue
-			}
+			// if node.Name == chosenNodeName {
+			// 	continue
+			// }
 			nodeNames = append(nodeNames, node.Name)
 		}
 	}
diff --git a/pkg/cmd/build.go b/pkg/cmd/build.go
index bade0ddd..b11d6700 100644
--- a/pkg/cmd/build.go
+++ b/pkg/cmd/build.go
@@ -232,7 +232,7 @@ func runBuild(streams genericclioptions.IOStreams, in buildOptions) error {
 func buildTargets(ctx context.Context, kubeClientConfig clientcmd.ClientConfig, streams genericclioptions.IOStreams, opts map[string]build.Options, progressMode, contextPathHash, registrySecretName, instance string) error {
 	driverName := instance
 	if driverName == "" {
-		driverName = "buildx_buildkit_kubernetes"
+		driverName = "buildkit"
 	}
 	d, err := driver.GetDriver(ctx, driverName, nil, kubeClientConfig, []string{} /* TODO what BuildkitFlags are these? */, "" /* unused config file */, map[string]string{} /* DriverOpts unused */, contextPathHash)
 	if err != nil {
diff --git a/pkg/driver/kubernetes/driver.go b/pkg/driver/kubernetes/driver.go
index e76f8d58..cd5f7831 100644
--- a/pkg/driver/kubernetes/driver.go
+++ b/pkg/driver/kubernetes/driver.go
@@ -51,6 +51,7 @@ const (
 	// TODO - consider adding other default values here to aid users in fine-tuning by editing the configmap post deployment
 	DefaultConfigFileTemplate = `# Default buildkitd configuration. Use --config to override during create
 debug = false
+root = "/var/lib/buildkit/{{ .Name }}"
 [worker.containerd]
   namespace = "{{ .ContainerdNamespace }}"
 `
@@ -103,7 +104,7 @@ func (d *Driver) Bootstrap(ctx context.Context, l progress.Logger) error {
 		}
 	}
 	return sub.Wrap(
-		fmt.Sprintf("waiting for %d pods to be ready", d.minReplicas),
+		fmt.Sprintf("waiting for %d pods to be ready for %s", d.minReplicas, d.deployment.Name),
 		func() error {
 			if err := d.wait(ctx, sub); err != nil {
 				return err
@@ -137,7 +138,7 @@ func (d *Driver) wait(ctx context.Context, sub progress.SubLogger) error {
 	for try := 0; try < 100; try++ {
 		if err == nil {
 			if depl.Status.ReadyReplicas >= int32(d.minReplicas) {
-				sub.Log(1, []byte(fmt.Sprintf("All %d replicas online\n", d.minReplicas)))
+				sub.Log(1, []byte(fmt.Sprintf("All %d replicas for %s online\n", d.minReplicas, d.deployment.Name)))
 				return nil
 			}
 			deploymentUID = string(depl.ObjectMeta.GetUID())
diff --git a/pkg/driver/kubernetes/factory.go b/pkg/driver/kubernetes/factory.go
index 949fed9e..71251989 100644
--- a/pkg/driver/kubernetes/factory.go
+++ b/pkg/driver/kubernetes/factory.go
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"io/ioutil"
 	"strconv"
-	"strings"
 	"text/template"
 
 	"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/driver"
@@ -243,21 +242,9 @@ func (f *factory) AllowsInstances() bool {
 }
 
 // buildxNameToDeploymentName converts buildx name to Kubernetes Deployment name.
-//
-// eg. "buildx_buildkit_loving_mendeleev0" -> "loving-mendeleev0"
 func buildxNameToDeploymentName(bx string) string {
-	// TODO: commands.util.go should not pass "buildx_buildkit_" prefix to drivers
-	s := bx
-	if strings.HasPrefix(s, "buildx_buildkit_") {
-		s = strings.TrimPrefix(s, "buildx_buildkit_")
-	}
-	s = strings.ReplaceAll(s, "_", "-")
-	if s == "kubernetes" {
-		// Having the default name of the deployment for buildkit called "kubernetes" is confusing, use something better
-		return "buildkit"
-	}
-	if s == "" {
+	if bx == "" {
 		return "buildkit"
 	}
-	return s
+	return bx
 }
diff --git a/pkg/driver/kubernetes/manifest/manifest.go b/pkg/driver/kubernetes/manifest/manifest.go
index 56743da4..a160f732 100644
--- a/pkg/driver/kubernetes/manifest/manifest.go
+++ b/pkg/driver/kubernetes/manifest/manifest.go
@@ -160,7 +160,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
 			},
 			corev1.VolumeMount{
 				Name:             "var-lib-buildkit",
-				MountPath:        "/var/lib/buildkit",
+				MountPath:        "/var/lib/buildkit/" + opt.Name,
 				MountPropagation: &mountPropagationBidirectional,
 			},
 			corev1.VolumeMount{
@@ -204,7 +204,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
 				Name: "var-lib-buildkit",
 				VolumeSource: corev1.VolumeSource{
 					HostPath: &corev1.HostPathVolumeSource{
-						Path: "/var/lib/buildkit",
+						Path: "/var/lib/buildkit/" + opt.Name,
 						Type: &hostPathDirectoryOrCreate,
 					}},
 			},
@@ -246,10 +246,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
 		},
 	)
 
-	// If this is a multi-user cluster and other users are running
-	// containerd based builders in other namespaces, we'll fail to start due to
-	// shared mounts for the buildkit cache
-	// buildkitd: could not lock /var/lib/buildkit/buildkitd.lock, another instance running?
+	// Spread our builders out on a multi-node cluster
 	d.Spec.Template.Spec.Affinity = &corev1.Affinity{
 		PodAntiAffinity: &corev1.PodAntiAffinity{
 			RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{

From edb8d429062b2fadc0c7eae99dbd558ebe931b30 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 7 Dec 2020 15:55:04 -0800
Subject: [PATCH 2/2] Switch CI to use native containerd instead of kind

The GitHub Actions Ubuntu image has a recent enough containerd version
installed that we can run Kubernetes directly on containerd without
having to use kind nesting.
---
 .github/workflows/pull_request.yaml | 25 ++++++++++++++++++++-----
 pkg/build/build.go                  |  5 +----
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml
index a5f378c3..8305c01f 100644
--- a/.github/workflows/pull_request.yaml
+++ b/.github/workflows/pull_request.yaml
@@ -28,7 +28,7 @@ jobs:
           flags: unit-tests
           name: codecov-unit-test
 
-  test-integration-kind-containerd:
+  test-integration-containerd:
     runs-on: ubuntu-latest
     steps:
       - name: Set up Go ${{ env.GO_VERSION }}
         uses: actions/setup-go@v2
         with:
@@ -37,16 +37,31 @@ jobs:
           go-version: ^${{ env.GO_VERSION }}
       - uses: actions/checkout@v2
         name: Check out code into the Go module directory
-      - name: Setup kind cluster
-        run: kind create cluster --config ./integration/kind-ci.yaml
+      - name: Setup containerd cluster
+        run: |
+          set -x
+          sudo apt-get install -y kubeadm kubelet
+          sudo swapoff -a
+          # Ensure dockerd isn't running
+          sudo systemctl stop docker.socket
+          sudo systemctl stop docker
+          sudo rm -f /var/run/docker.sock
+          sudo kubeadm init --cri-socket /run/containerd/containerd.sock
+          mkdir -p $HOME/.kube/
+          sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
+          sudo chown $USER $HOME/.kube/config
+          kubectl taint nodes --all node-role.kubernetes.io/master-
+          kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
+          kubectl wait --for=condition=ready --timeout=30s node --all
+          kubectl get nodes -o wide
       - name: Run integration tests
-        run: make integration EXTRA_GO_TEST_FLAGS=-v TEST_PARALLELISM=1
+        run: make integration EXTRA_GO_TEST_FLAGS=-v
       - name: Gather integration coverage results
         uses: codecov/codecov-action@v1
         with:
           file: cover-int.out
           flags: integration-tests
-          name: codecov-integration-test-kind
+          name: codecov-integration-test-containerd
 
   test-integration-dockerd:
     runs-on: ubuntu-latest
diff --git a/pkg/build/build.go b/pkg/build/build.go
index 610f2829..92141178 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -1035,10 +1035,7 @@ func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig
 		}
 		for _, node := range builder.Nodes {
 			// For containerd, we never have to load on the node where it was built
-			// TODO - this doesn't actually work yet, but when we switch the output.Type to "oci" we need to load it everywhere anyway
-			// if node.Name == chosenNodeName {
-			// 	continue
-			// }
+			// TODO - we may want to filter the source node, but when we switch the output.Type to "oci" we need to load it everywhere anyway
 			nodeNames = append(nodeNames, node.Name)
 		}
 	}
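
Editor's note for reviewers: the builder-lookup fix in both newDockerLoader and
newContainerdLoader reduces to filtering the listed builders by name before
collecting pod (node) names. The sketch below is a minimal standalone
illustration of that filtering, not code from the patch itself: Builder and
Node are simplified stand-ins for the types returned by d.List(ctx), and
nodeNamesForBuilder is a hypothetical helper name.

package main

import "fmt"

// Node and Builder are simplified stand-ins for the driver package types;
// only the fields needed to illustrate the lookup are included.
type Node struct{ Name string }

type Builder struct {
	Name  string
	Nodes []Node
}

// nodeNamesForBuilder mirrors the filtering both loaders now perform: only
// pods belonging to the named builder are considered for image transfer, and
// an empty result is reported as an error instead of silently picking a pod
// from another builder on the same cluster.
func nodeNamesForBuilder(builders []Builder, builderName string) ([]string, error) {
	nodeNames := []string{}
	for _, builder := range builders {
		if builder.Name != builderName {
			continue
		}
		for _, node := range builder.Nodes {
			nodeNames = append(nodeNames, node.Name)
		}
	}
	if len(nodeNames) == 0 {
		return nil, fmt.Errorf("no builders found for %s", builderName)
	}
	return nodeNames, nil
}

func main() {
	// Two builders deployed on the same cluster; only the requested one is used.
	builders := []Builder{
		{Name: "buildkit", Nodes: []Node{{Name: "buildkit-abc"}}},
		{Name: "randomlb", Nodes: []Node{{Name: "randomlb-xyz"}}},
	}
	names, err := nodeNamesForBuilder(builders, "randomlb")
	fmt.Println(names, err) // [randomlb-xyz] <nil>
}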