Support multiple concurrent builder deployments
This fixes a glitch in the way builders were looked up: when multiple builders
ran on the same cluster, the wrong builder's pods could be identified for image
transfer during builds. It also refines how the buildkit state directory is
defined for containerd-based builds. That state directory must live on the host
so containerd can mount it into child containers, but multiple buildkitd
instances cannot share it. Most of the tests have been switched to support
parallel runs; however, while testing I noticed `kind` does not work in parallel
(seemingly random low-level containerd failures to start containers), so CI
forces serial testing for kind. Native dockerd and containerd clusters work
correctly.
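
To illustrate the lookup fix (see the pkg/build/build.go changes below), here is a minimal, self-contained Go sketch of the idea — filter the listed builders by name before collecting their node/pod names, instead of taking whichever builder is listed first. The `Builder`/`Node` types and the values in `main` are simplified illustrations, not the repo's actual driver interfaces:

```go
package main

import "fmt"

// Simplified shapes mirroring the builder/node listing used in pkg/build/build.go.
type Node struct{ Name string }

type Builder struct {
	Name  string
	Nodes []Node
}

// nodesForBuilder keeps only the nodes belonging to the named builder —
// the essence of the lookup fix in newDockerLoader/newContainerdLoader.
func nodesForBuilder(builders []Builder, builderName string) ([]string, error) {
	nodeNames := []string{}
	for _, b := range builders {
		if b.Name != builderName {
			continue // skip builders owned by other deployments on the same cluster
		}
		for _, n := range b.Nodes {
			nodeNames = append(nodeNames, n.Name)
		}
	}
	if len(nodeNames) == 0 {
		return nil, fmt.Errorf("no builders found for %s", builderName)
	}
	return nodeNames, nil
}

func main() {
	builders := []Builder{
		{Name: "buildkit", Nodes: []Node{{Name: "buildkit-abc"}}},
		{Name: "configmaptest", Nodes: []Node{{Name: "configmaptest-xyz"}}},
	}
	names, err := nodesForBuilder(builders, "configmaptest")
	fmt.Println(names, err) // [configmaptest-xyz] <nil>
}
```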
Daniel Hiltgen committed Dec 9, 2020
1 parent 2848b46 commit 364ef5a
Showing 11 changed files with 42 additions and 49 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/pull_request.yaml
@@ -40,7 +40,7 @@ jobs:
- name: Setup kind cluster
run: kind create cluster --config ./integration/kind-ci.yaml
- name: Run integration tests
run: make integration EXTRA_GO_TEST_FLAGS=-v
run: make integration EXTRA_GO_TEST_FLAGS=-v TEST_PARALLELISM=1
- name: Gather integration coverage results
uses: codecov/codecov-action@v1
with:
@@ -69,7 +69,8 @@ jobs:
kubectl taint nodes --all node-role.kubernetes.io/master-
kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
kubectl wait --for=condition=ready --timeout=30s node --all
kubectl get nodes
kubectl get nodes -o wide
docker version
- name: Run integration tests
run: make integration EXTRA_GO_TEST_FLAGS=-v
7 changes: 5 additions & 2 deletions Makefile
@@ -3,7 +3,8 @@

VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always)
TEST_KUBECONFIG?=$(HOME)/.kube/config
TEST_TIMEOUT=300s
TEST_TIMEOUT=600s
TEST_PARALLELISM=3

# Verify Go in PATH
ifeq (, $(shell which go))
@@ -71,7 +72,9 @@ test:
integration:
@echo "Running integration tests with $(TEST_KUBECONFIG)"
@kubectl config get-contexts
TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) $(EXTRA_GO_TEST_FLAGS) \
TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) \
-parallel $(TEST_PARALLELISM) \
$(EXTRA_GO_TEST_FLAGS) \
$(GO_COVER_FLAGS) -coverprofile=./cover-int.out \
./integration/...

8 changes: 4 additions & 4 deletions integration/common/basesuites.go
@@ -26,7 +26,7 @@ type BaseSuite struct {
Namespace string
}

func (s *BaseSuite) SetupTest() {
func (s *BaseSuite) SetupSuite() {
var err error
if !s.SkipSetupCreate {
logrus.Infof("%s: Setting up builder", s.Name)
@@ -44,7 +44,7 @@ func (s *BaseSuite) SetupTest() {
require.NoError(s.T(), err, "%s: kube client failed", s.Name)
}

func (s *BaseSuite) TearDownTest() {
func (s *BaseSuite) TearDownSuite() {
logrus.Infof("%s: Removing builder", s.Name)
err := RunBuildkit("rm", []string{
s.Name,
@@ -62,7 +62,7 @@ func (s *BaseSuite) TestSimpleBuild() {
dir, cleanup, err := NewSimpleBuildContext()
defer cleanup()
require.NoError(s.T(), err, "Failed to set up temporary build context")
args := []string{}
args := []string{"--progress=plain"}
if s.Name != "buildkit" { // TODO wire up the default name variable
args = append(
args,
@@ -102,7 +102,7 @@ func (s *BaseSuite) TestLocalOutputTarBuild() {
dir, cleanup, err := NewSimpleBuildContext()
defer cleanup()
require.NoError(s.T(), err, "Failed to set up temporary build context")
args := []string{}
args := []string{"--progress=plain"}
if s.Name != "buildkit" { // TODO wire up the default name variable
args = append(
args,
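
The switch above from SetupTest/TearDownTest to SetupSuite/TearDownSuite means each suite creates and removes its builder once rather than once per test, which is what makes the `t.Parallel()` calls enabled in the suites below safe. A minimal sketch of the pattern, assuming testify's suite package (names here are illustrative, not from this repo):

```go
package suites_test

import (
	"testing"

	"github.com/stretchr/testify/suite"
)

// Illustrative parallel-suite pattern: SetupSuite/TearDownSuite run once per
// suite (one builder created and removed per suite), and t.Parallel() lets
// independent suites run concurrently, bounded by `go test -parallel N`
// (TEST_PARALLELISM in the Makefile).
type ExampleSuite struct {
	suite.Suite
	Name string
}

func (s *ExampleSuite) SetupSuite()    { /* create one builder for the whole suite */ }
func (s *ExampleSuite) TearDownSuite() { /* remove that builder after all tests finish */ }

func (s *ExampleSuite) TestSomething() {
	s.Require().NotEmpty(s.Name)
}

func TestExampleSuite(t *testing.T) {
	t.Parallel() // runs alongside other suites, each with its own builder
	suite.Run(t, &ExampleSuite{Name: "example"})
}
```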
4 changes: 2 additions & 2 deletions integration/suites/configmap_test.go
@@ -30,7 +30,7 @@ type configMapSuite struct {
configMapClient v1.ConfigMapInterface
}

func (s *configMapSuite) SetupTest() {
func (s *configMapSuite) SetupSuite() {
var err error
s.ClientSet, s.Namespace, err = common.GetKubeClientset()
require.NoError(s.T(), err, "%s: kube client failed", s.Name)
@@ -200,7 +200,7 @@ func (s *configMapSuite) TestPreExistingWithCustomCreate() {

func TestConfigMapSuite(t *testing.T) {
common.Skipper(t)
//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
t.Parallel()
suite.Run(t, &configMapSuite{
Name: "configmaptest",
})
2 changes: 1 addition & 1 deletion integration/suites/default_test.go
@@ -14,7 +14,7 @@ type DefaultSuite struct{ common.BaseSuite }

func TestDefaultSuite(t *testing.T) {
common.Skipper(t)
//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
t.Parallel()
suite.Run(t, &DefaultSuite{
BaseSuite: common.BaseSuite{
Name: "buildkit", // TODO pull this from the actual default name
2 changes: 1 addition & 1 deletion integration/suites/randomlb_test.go
@@ -16,7 +16,7 @@ type RandomLBSuite struct{ common.BaseSuite }

func TestRandomLBSuite(t *testing.T) {
common.Skipper(t)
//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
t.Parallel()
suite.Run(t, &RandomLBSuite{
BaseSuite: common.BaseSuite{
Name: "randomlb",
30 changes: 17 additions & 13 deletions pkg/build/build.go
@@ -581,14 +581,9 @@ func Build(ctx context.Context, drivers []DriverInfo, opt map[string]Options, ku
multiDriver := len(m[k]) > 1
for i, dp := range m[k] {
d := drivers[dp.driverIndex].Driver
driverName := drivers[dp.driverIndex].Name
opt.Platforms = dp.platforms

// TODO - this is a little messy - some refactoring of this routine would be worthwhile
var chosenNodeName string
for name := range clients[d.Factory().Name()] {
chosenNodeName = name // TODO - this doesn't actually seem to be working!!!
break
}
// TODO - this is also messy and won't work for multi-driver scenarios (not that it's possible yet...)
if auth == nil {
auth = d.GetAuthWrapper(registrySecretName)
@@ -598,9 +593,9 @@
// Set up loader based on first found type (only 1 supported)
for _, entry := range opt.Exports {
if entry.Type == "docker" {
return newDockerLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
return newDockerLoader(ctx, d, kubeClientConfig, driverName, mw)
} else if entry.Type == "oci" {
return newContainerdLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
return newContainerdLoader(ctx, d, kubeClientConfig, driverName, mw)
}
}
// TODO - Push scenario? (or is this a "not reached" scenario now?)
@@ -899,18 +894,24 @@ func notSupported(d driver.Driver, f driver.Feature) error {

type dockerLoadCallback func(name string) (io.WriteCloser, func(), error)

func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
nodeNames := []string{}
// TODO this isn't quite right - we need a better "list only pods from one instance" func
builders, err := d.List(ctx)
if err != nil {
return nil, nil, err
}
for _, builder := range builders {
if builder.Name != builderName {
continue
}
for _, node := range builder.Nodes {
nodeNames = append(nodeNames, node.Name)
}
}
if len(nodeNames) == 0 {
return nil, nil, fmt.Errorf("no builders found for %s", builderName)
}
// TODO revamp this flow to return a list of pods

readers := make([]*io.PipeReader, len(nodeNames))
@@ -1019,7 +1020,7 @@ func (w *waitingWriter) Close() error {
return err
}

func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
nodeNames := []string{}
// TODO this isn't quite right - we need a better "list only pods from one instance" func
// TODO revamp this flow to return a list of pods
@@ -1029,12 +1030,15 @@ func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig
return nil, nil, err
}
for _, builder := range builders {
if builder.Name != builderName {
continue
}
for _, node := range builder.Nodes {
// For containerd, we never have to load on the node where it was built
// TODO - this doesn't actually work yet, but when we switch the output.Type to "oci" we need to load it everywhere anyway
if node.Name == chosenNodeName {
continue
}
// if node.Name == chosenNodeName {
// continue
// }
nodeNames = append(nodeNames, node.Name)
}
}
2 changes: 1 addition & 1 deletion pkg/cmd/build.go
@@ -232,7 +232,7 @@ func runBuild(streams genericclioptions.IOStreams, in buildOptions) error {
func buildTargets(ctx context.Context, kubeClientConfig clientcmd.ClientConfig, streams genericclioptions.IOStreams, opts map[string]build.Options, progressMode, contextPathHash, registrySecretName, instance string) error {
driverName := instance
if driverName == "" {
driverName = "buildx_buildkit_kubernetes"
driverName = "buildkit"
}
d, err := driver.GetDriver(ctx, driverName, nil, kubeClientConfig, []string{} /* TODO what BuildkitFlags are these? */, "" /* unused config file */, map[string]string{} /* DriverOpts unused */, contextPathHash)
if err != nil {
5 changes: 3 additions & 2 deletions pkg/driver/kubernetes/driver.go
@@ -51,6 +51,7 @@ const (
// TODO - consider adding other default values here to aid users in fine-tuning by editing the configmap post deployment
DefaultConfigFileTemplate = `# Default buildkitd configuration. Use --config <path/to/file> to override during create
debug = false
root = "/var/lib/buildkit/{{ .Name }}"
[worker.containerd]
namespace = "{{ .ContainerdNamespace }}"
`
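
The new `root` line gives each builder its own buildkitd state directory. A small, self-contained Go sketch of how that template renders — "mybuilder" and "k8s.io" are example values only, not defaults taken from this repo:

```go
package main

import (
	"os"
	"text/template"
)

// Each builder renders its own buildkitd config, so concurrent buildkitd
// instances on the same host get distinct state directories under
// /var/lib/buildkit/<name> and never contend for the same root.
const configTemplate = `debug = false
root = "/var/lib/buildkit/{{ .Name }}"
[worker.containerd]
namespace = "{{ .ContainerdNamespace }}"
`

func main() {
	tmpl := template.Must(template.New("buildkitd").Parse(configTemplate))
	// Example values for illustration only.
	cfg := struct{ Name, ContainerdNamespace string }{"mybuilder", "k8s.io"}
	if err := tmpl.Execute(os.Stdout, cfg); err != nil {
		panic(err)
	}
	// Output includes: root = "/var/lib/buildkit/mybuilder"
}
```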
@@ -103,7 +104,7 @@ func (d *Driver) Bootstrap(ctx context.Context, l progress.Logger) error {
}
}
return sub.Wrap(
fmt.Sprintf("waiting for %d pods to be ready", d.minReplicas),
fmt.Sprintf("waiting for %d pods to be ready for %s", d.minReplicas, d.deployment.Name),
func() error {
if err := d.wait(ctx, sub); err != nil {
return err
@@ -137,7 +138,7 @@ func (d *Driver) wait(ctx context.Context, sub progress.SubLogger) error {
for try := 0; try < 100; try++ {
if err == nil {
if depl.Status.ReadyReplicas >= int32(d.minReplicas) {
sub.Log(1, []byte(fmt.Sprintf("All %d replicas online\n", d.minReplicas)))
sub.Log(1, []byte(fmt.Sprintf("All %d replicas for %s online\n", d.minReplicas, d.deployment.Name)))
return nil
}
deploymentUID = string(depl.ObjectMeta.GetUID())
17 changes: 2 additions & 15 deletions pkg/driver/kubernetes/factory.go
@@ -8,7 +8,6 @@ import (
"fmt"
"io/ioutil"
"strconv"
"strings"
"text/template"

"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/driver"
@@ -243,21 +242,9 @@ func (f *factory) AllowsInstances() bool {
}

// buildxNameToDeploymentName converts buildx name to Kubernetes Deployment name.
//
// eg. "buildx_buildkit_loving_mendeleev0" -> "loving-mendeleev0"
func buildxNameToDeploymentName(bx string) string {
// TODO: commands.util.go should not pass "buildx_buildkit_" prefix to drivers
s := bx
if strings.HasPrefix(s, "buildx_buildkit_") {
s = strings.TrimPrefix(s, "buildx_buildkit_")
}
s = strings.ReplaceAll(s, "_", "-")
if s == "kubernetes" {
// Having the default name of the deployment for buildkit called "kubernetes" is confusing, use something better
return "buildkit"
}
if s == "" {
if bx == "" {
return "buildkit"
}
return s
return bx
}
9 changes: 3 additions & 6 deletions pkg/driver/kubernetes/manifest/manifest.go
@@ -160,7 +160,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
},
corev1.VolumeMount{
Name: "var-lib-buildkit",
MountPath: "/var/lib/buildkit",
MountPath: "/var/lib/buildkit/" + opt.Name,
MountPropagation: &mountPropagationBidirectional,
},
corev1.VolumeMount{
@@ -204,7 +204,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
Name: "var-lib-buildkit",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/lib/buildkit",
Path: "/var/lib/buildkit/" + opt.Name,
Type: &hostPathDirectoryOrCreate,
}},
},
@@ -246,10 +246,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
},
)

// If this is a multi-user cluster and other users are running
// containerd based builders in other namespaces, we'll fail to start due to
// shared mounts for the buildkit cache
// buildkitd: could not lock /var/lib/buildkit/buildkitd.lock, another instance running?
// Spread our builders out on a multi-node cluster
d.Spec.Template.Spec.Affinity = &corev1.Affinity{
PodAntiAffinity: &corev1.PodAntiAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
