From 364ef5a5598774078e4c253372aa723c8bb71c6d Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 7 Dec 2020 11:38:43 -0800
Subject: [PATCH 1/2] Support multiple concurrent builder deployments

This fixes a glitch in the way builders were looked up, where multiple
builders running on the same cluster wound up incorrectly identified
for transfer during builds.  It also refines the way we define the
buildkit state directory for containerd based builds.  This state
directory must be on the host so containerd can mount it into child
containers, but multiple instances of buildkitd cannot share it.

Most of the tests have been switched to support parallel runs.  While
testing, however, I noticed `kind` does not work in parallel (seemingly
random low-level containerd failures to start containers), so CI forces
serial testing for kind.  Native dockerd and containerd clusters work
correctly in parallel.
---
 .github/workflows/pull_request.yaml        |  5 ++--
 Makefile                                   |  7 +++--
 integration/common/basesuites.go           |  8 +++---
 integration/suites/configmap_test.go       |  4 +--
 integration/suites/default_test.go         |  2 +-
 integration/suites/randomlb_test.go        |  2 +-
 pkg/build/build.go                         | 30 ++++++++++++----------
 pkg/cmd/build.go                           |  2 +-
 pkg/driver/kubernetes/driver.go            |  5 ++--
 pkg/driver/kubernetes/factory.go           | 17 ++----------
 pkg/driver/kubernetes/manifest/manifest.go |  9 +++----
 11 files changed, 42 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml
index 83611a14..a5f378c3 100644
--- a/.github/workflows/pull_request.yaml
+++ b/.github/workflows/pull_request.yaml
@@ -40,7 +40,7 @@ jobs:
       - name: Setup kind cluster
         run: kind create cluster --config ./integration/kind-ci.yaml
       - name: Run integration tests
-        run: make integration EXTRA_GO_TEST_FLAGS=-v
+        run: make integration EXTRA_GO_TEST_FLAGS=-v TEST_PARALLELISM=1
       - name: Gather integration coverage results
         uses: codecov/codecov-action@v1
         with:
@@ -69,7 +69,8 @@ jobs:
          kubectl taint nodes --all node-role.kubernetes.io/master-
          kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
          kubectl wait --for=condition=ready --timeout=30s node --all
-         kubectl get nodes
+         kubectl get nodes -o wide
+         docker version
       - name: Run integration tests
         run: make integration EXTRA_GO_TEST_FLAGS=-v
diff --git a/Makefile b/Makefile
index 62feb578..4e4a1c48 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@
 VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always)
 
 TEST_KUBECONFIG?=$(HOME)/.kube/config
-TEST_TIMEOUT=300s
+TEST_TIMEOUT=600s
+TEST_PARALLELISM=3
 
 # Verify Go in PATH
 ifeq (, $(shell which go))
@@ -71,7 +72,9 @@ test:
 integration:
 	@echo "Running integration tests with $(TEST_KUBECONFIG)"
 	@kubectl config get-contexts
-	TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) $(EXTRA_GO_TEST_FLAGS) \
+	TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) \
+		-parallel $(TEST_PARALLELISM) \
+		$(EXTRA_GO_TEST_FLAGS) \
 		$(GO_COVER_FLAGS) -coverprofile=./cover-int.out \
 		./integration/...
diff --git a/integration/common/basesuites.go b/integration/common/basesuites.go
index 717c3049..dbbffbba 100644
--- a/integration/common/basesuites.go
+++ b/integration/common/basesuites.go
@@ -26,7 +26,7 @@ type BaseSuite struct {
 	Namespace string
 }
 
-func (s *BaseSuite) SetupTest() {
+func (s *BaseSuite) SetupSuite() {
 	var err error
 	if !s.SkipSetupCreate {
 		logrus.Infof("%s: Setting up builder", s.Name)
@@ -44,7 +44,7 @@ func (s *BaseSuite) SetupTest() {
 	require.NoError(s.T(), err, "%s: kube client failed", s.Name)
 }
 
-func (s *BaseSuite) TearDownTest() {
+func (s *BaseSuite) TearDownSuite() {
 	logrus.Infof("%s: Removing builder", s.Name)
 	err := RunBuildkit("rm", []string{
 		s.Name,
@@ -62,7 +62,7 @@ func (s *BaseSuite) TestSimpleBuild() {
 	dir, cleanup, err := NewSimpleBuildContext()
 	defer cleanup()
 	require.NoError(s.T(), err, "Failed to set up temporary build context")
-	args := []string{}
+	args := []string{"--progress=plain"}
 	if s.Name != "buildkit" { // TODO wire up the default name variable
 		args = append(
 			args,
@@ -102,7 +102,7 @@ func (s *BaseSuite) TestLocalOutputTarBuild() {
 	dir, cleanup, err := NewSimpleBuildContext()
 	defer cleanup()
 	require.NoError(s.T(), err, "Failed to set up temporary build context")
-	args := []string{}
+	args := []string{"--progress=plain"}
 	if s.Name != "buildkit" { // TODO wire up the default name variable
 		args = append(
 			args,
diff --git a/integration/suites/configmap_test.go b/integration/suites/configmap_test.go
index 4359b3d0..0292d4ed 100644
--- a/integration/suites/configmap_test.go
+++ b/integration/suites/configmap_test.go
@@ -30,7 +30,7 @@ type configMapSuite struct {
 	configMapClient v1.ConfigMapInterface
 }
 
-func (s *configMapSuite) SetupTest() {
+func (s *configMapSuite) SetupSuite() {
 	var err error
 	s.ClientSet, s.Namespace, err = common.GetKubeClientset()
 	require.NoError(s.T(), err, "%s: kube client failed", s.Name)
@@ -200,7 +200,7 @@ func (s *configMapSuite) TestPreExistingWithCustomCreate() {
 
 func TestConfigMapSuite(t *testing.T) {
 	common.Skipper(t)
-	//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
+	t.Parallel()
 	suite.Run(t, &configMapSuite{
 		Name: "configmaptest",
 	})
diff --git a/integration/suites/default_test.go b/integration/suites/default_test.go
index d6dc0b65..e9c905b0 100644
--- a/integration/suites/default_test.go
+++ b/integration/suites/default_test.go
@@ -14,7 +14,7 @@ type DefaultSuite struct{ common.BaseSuite }
 
 func TestDefaultSuite(t *testing.T) {
 	common.Skipper(t)
-	//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
+	t.Parallel()
 	suite.Run(t, &DefaultSuite{
 		BaseSuite: common.BaseSuite{
 			Name: "buildkit", // TODO pull this from the actual default name
diff --git a/integration/suites/randomlb_test.go b/integration/suites/randomlb_test.go
index 9421bb81..9c65d095 100644
--- a/integration/suites/randomlb_test.go
+++ b/integration/suites/randomlb_test.go
@@ -16,7 +16,7 @@ type RandomLBSuite struct{ common.BaseSuite }
 
 func TestRandomLBSuite(t *testing.T) {
 	common.Skipper(t)
-	//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
+	t.Parallel()
 	suite.Run(t, &RandomLBSuite{
 		BaseSuite: common.BaseSuite{
 			Name: "randomlb",
diff --git a/pkg/build/build.go b/pkg/build/build.go
index a322e386..610f2829 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -581,14 +581,9 @@ func Build(ctx context.Context, drivers []DriverInfo, opt map[string]Options, ku
 		multiDriver := len(m[k]) > 1
 		for i, dp := range m[k] {
 			d := drivers[dp.driverIndex].Driver
+			driverName := drivers[dp.driverIndex].Name
 			opt.Platforms = dp.platforms
-			// TODO - this is a little messy - some recactoring of this routine would be worthwhile
-			var chosenNodeName string
-			for name := range clients[d.Factory().Name()] {
-				chosenNodeName = name // TODO - this doesn't actually seem to be working!!!
-				break
-			}
 			// TODO - this is also messy and wont work for multi-driver scenarios (no that it's possible yet...)
 			if auth == nil {
 				auth = d.GetAuthWrapper(registrySecretName)
@@ -598,9 +593,9 @@ func Build(ctx context.Context, drivers []DriverInfo, opt map[string]Options, ku
 				// Set up loader based on first found type (only 1 supported)
 				for _, entry := range opt.Exports {
 					if entry.Type == "docker" {
-						return newDockerLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
+						return newDockerLoader(ctx, d, kubeClientConfig, driverName, mw)
 					} else if entry.Type == "oci" {
-						return newContainerdLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
+						return newContainerdLoader(ctx, d, kubeClientConfig, driverName, mw)
 					}
 				}
 				// TODO - Push scenario? (or is this a "not reached" scenario now?)
@@ -899,7 +894,7 @@ func notSupported(d driver.Driver, f driver.Feature) error {
 
 type dockerLoadCallback func(name string) (io.WriteCloser, func(), error)
 
-func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
+func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
 	nodeNames := []string{}
 	// TODO this isn't quite right - we need a better "list only pods from one instance" func
 	builders, err := d.List(ctx)
@@ -907,10 +902,16 @@ func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clie
 		return nil, nil, err
 	}
 	for _, builder := range builders {
+		if builder.Name != builderName {
+			continue
+		}
 		for _, node := range builder.Nodes {
 			nodeNames = append(nodeNames, node.Name)
 		}
 	}
+	if len(nodeNames) == 0 {
+		return nil, nil, fmt.Errorf("no builders found for %s", builderName)
+	}
 
 	// TODO revamp this flow to return a list of pods
 	readers := make([]*io.PipeReader, len(nodeNames))
@@ -1019,7 +1020,7 @@ func (w *waitingWriter) Close() error {
 	return err
 }
 
-func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
+func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
 	nodeNames := []string{}
 	// TODO this isn't quite right - we need a better "list only pods from one instance" func
 	// TODO revamp this flow to return a list of pods
@@ -1029,12 +1030,15 @@ func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig
 		return nil, nil, err
 	}
 	for _, builder := range builders {
+		if builder.Name != builderName {
+			continue
+		}
 		for _, node := range builder.Nodes {
 			// For containerd, we never have to load on the node where it was built
 			// TODO - this doesn't actually work yet, but when we switch the output.Type to "oci" we need to load it everywhere anyway
-			if node.Name == chosenNodeName {
-				continue
-			}
+			// if node.Name == chosenNodeName {
+			// 	continue
+			// }
 			nodeNames = append(nodeNames, node.Name)
 		}
 	}
diff --git a/pkg/cmd/build.go b/pkg/cmd/build.go
index bade0ddd..b11d6700 100644
--- a/pkg/cmd/build.go
+++ b/pkg/cmd/build.go
@@ -232,7 +232,7 @@ func runBuild(streams genericclioptions.IOStreams, in buildOptions) error {
 func buildTargets(ctx context.Context, kubeClientConfig clientcmd.ClientConfig, streams genericclioptions.IOStreams, opts map[string]build.Options, progressMode, contextPathHash, registrySecretName, instance string) error {
 	driverName := instance
 	if driverName == "" {
-		driverName = "buildx_buildkit_kubernetes"
+		driverName = "buildkit"
 	}
 	d, err := driver.GetDriver(ctx, driverName, nil, kubeClientConfig, []string{} /* TODO what BuildkitFlags are these? */, "" /* unused config file */, map[string]string{} /* DriverOpts unused */, contextPathHash)
 	if err != nil {
diff --git a/pkg/driver/kubernetes/driver.go b/pkg/driver/kubernetes/driver.go
index e76f8d58..cd5f7831 100644
--- a/pkg/driver/kubernetes/driver.go
+++ b/pkg/driver/kubernetes/driver.go
@@ -51,6 +51,7 @@ const (
 	// TODO - consider adding other default values here to aid users in fine-tuning by editing the configmap post deployment
 	DefaultConfigFileTemplate = `# Default buildkitd configuration. Use --config to override during create
 debug = false
+root = "/var/lib/buildkit/{{ .Name }}"
 [worker.containerd]
   namespace = "{{ .ContainerdNamespace }}"
 `
@@ -103,7 +104,7 @@ func (d *Driver) Bootstrap(ctx context.Context, l progress.Logger) error {
 		}
 	}
 	return sub.Wrap(
-		fmt.Sprintf("waiting for %d pods to be ready", d.minReplicas),
+		fmt.Sprintf("waiting for %d pods to be ready for %s", d.minReplicas, d.deployment.Name),
 		func() error {
 			if err := d.wait(ctx, sub); err != nil {
 				return err
@@ -137,7 +138,7 @@ func (d *Driver) wait(ctx context.Context, sub progress.SubLogger) error {
 	for try := 0; try < 100; try++ {
 		if err == nil {
 			if depl.Status.ReadyReplicas >= int32(d.minReplicas) {
-				sub.Log(1, []byte(fmt.Sprintf("All %d replicas online\n", d.minReplicas)))
+				sub.Log(1, []byte(fmt.Sprintf("All %d replicas for %s online\n", d.minReplicas, d.deployment.Name)))
 				return nil
 			}
 			deploymentUID = string(depl.ObjectMeta.GetUID())
diff --git a/pkg/driver/kubernetes/factory.go b/pkg/driver/kubernetes/factory.go
index 949fed9e..71251989 100644
--- a/pkg/driver/kubernetes/factory.go
+++ b/pkg/driver/kubernetes/factory.go
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"io/ioutil"
 	"strconv"
-	"strings"
 	"text/template"
 
 	"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/driver"
@@ -243,21 +242,9 @@ func (f *factory) AllowsInstances() bool {
 }
 
 // buildxNameToDeploymentName converts buildx name to Kubernetes Deployment name.
-//
-// eg. "buildx_buildkit_loving_mendeleev0" -> "loving-mendeleev0"
 func buildxNameToDeploymentName(bx string) string {
-	// TODO: commands.util.go should not pass "buildx_buildkit_" prefix to drivers
-	s := bx
-	if strings.HasPrefix(s, "buildx_buildkit_") {
-		s = strings.TrimPrefix(s, "buildx_buildkit_")
-	}
-	s = strings.ReplaceAll(s, "_", "-")
-	if s == "kubernetes" {
-		// Having the default name of the deployment for buildkit called "kubernetes" is confusing, use something better
-		return "buildkit"
-	}
-	if s == "" {
+	if bx == "" {
 		return "buildkit"
 	}
-	return s
+	return bx
 }
diff --git a/pkg/driver/kubernetes/manifest/manifest.go b/pkg/driver/kubernetes/manifest/manifest.go
index 56743da4..a160f732 100644
--- a/pkg/driver/kubernetes/manifest/manifest.go
+++ b/pkg/driver/kubernetes/manifest/manifest.go
@@ -160,7 +160,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
 			},
 			corev1.VolumeMount{
 				Name:             "var-lib-buildkit",
-				MountPath:        "/var/lib/buildkit",
+				MountPath:        "/var/lib/buildkit/" + opt.Name,
 				MountPropagation: &mountPropagationBidirectional,
 			},
 			corev1.VolumeMount{
@@ -204,7 +204,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
 				Name: "var-lib-buildkit",
 				VolumeSource: corev1.VolumeSource{
 					HostPath: &corev1.HostPathVolumeSource{
-						Path: "/var/lib/buildkit",
+						Path: "/var/lib/buildkit/" + opt.Name,
 						Type: &hostPathDirectoryOrCreate,
 					}},
 			},
@@ -246,10 +246,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
 		},
 	)
 
-	// If this is a multi-user cluster and other users are running
-	// containerd based builders in other namespaces, we'll fail to start due to
-	// shared mounts for the buildkit cache
-	// buildkitd: could not lock /var/lib/buildkit/buildkitd.lock, another instance running?
+	// Spread our builders out on a multi-node cluster
 	d.Spec.Template.Spec.Affinity = &corev1.Affinity{
 		PodAntiAffinity: &corev1.PodAntiAffinity{
 			RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{

From edb8d429062b2fadc0c7eae99dbd558ebe931b30 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 7 Dec 2020 15:55:04 -0800
Subject: [PATCH 2/2] Switch CI to use native containerd instead of kind

The GitHub Actions Ubuntu image has a recent enough containerd version
installed that we can run Kubernetes directly on containerd without
having to use kind nesting.
---
 .github/workflows/pull_request.yaml | 25 ++++++++++++++++++++-----
 pkg/build/build.go                  |  5 +----
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml
index a5f378c3..8305c01f 100644
--- a/.github/workflows/pull_request.yaml
+++ b/.github/workflows/pull_request.yaml
@@ -28,7 +28,7 @@ jobs:
           flags: unit-tests
           name: codecov-unit-test
 
-  test-integration-kind-containerd:
+  test-integration-containerd:
     runs-on: ubuntu-latest
     steps:
       - name: Set up Go ${{ env.GO_VERSION }}
         uses: actions/setup-go@v2
         with:
@@ -37,16 +37,31 @@ jobs:
           go-version: ^${{ env.GO_VERSION }}
       - uses: actions/checkout@v2
         name: Check out code into the Go module directory
-      - name: Setup kind cluster
-        run: kind create cluster --config ./integration/kind-ci.yaml
+      - name: Setup containerd cluster
+        run: |
+          set -x
+          sudo apt-get install -y kubeadm kubelet
+          sudo swapoff -a
+          # Ensure dockerd isn't running
+          sudo systemctl stop docker.socket
+          sudo systemctl stop docker
+          sudo rm -f /var/run/docker.sock
+          sudo kubeadm init --cri-socket /run/containerd/containerd.sock
+          mkdir -p $HOME/.kube/
+          sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
+          sudo chown $USER $HOME/.kube/config
+          kubectl taint nodes --all node-role.kubernetes.io/master-
+          kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
+          kubectl wait --for=condition=ready --timeout=30s node --all
+          kubectl get nodes -o wide
       - name: Run integration tests
-        run: make integration EXTRA_GO_TEST_FLAGS=-v TEST_PARALLELISM=1
+        run: make integration EXTRA_GO_TEST_FLAGS=-v
       - name: Gather integration coverage results
         uses: codecov/codecov-action@v1
         with:
           file: cover-int.out
           flags: integration-tests
-          name: codecov-integration-test-kind
+          name: codecov-integration-test-containerd
 
   test-integration-dockerd:
     runs-on: ubuntu-latest
diff --git a/pkg/build/build.go b/pkg/build/build.go
index 610f2829..92141178 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -1035,10 +1035,7 @@ func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig
 		}
 		for _, node := range builder.Nodes {
 			// For containerd, we never have to load on the node where it was built
-			// TODO - this doesn't actually work yet, but when we switch the output.Type to "oci" we need to load it everywhere anyway
-			// if node.Name == chosenNodeName {
-			// 	continue
-			// }
+			// TODO - we may want to filter the source node, but when we switch the output.Type to "oci" we need to load it everywhere anyway
 			nodeNames = append(nodeNames, node.Name)
 		}
 	}
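
Editor's note for reviewers: the builder-lookup fix in both newDockerLoader and
newContainerdLoader reduces to filtering the listed builders by name before
collecting pod (node) names. The sketch below is a minimal standalone
illustration of that filtering, not code from the patch itself: Builder and
Node are simplified stand-ins for the types returned by d.List(ctx), and
nodeNamesForBuilder is a hypothetical helper name.

package main

import "fmt"

// Node and Builder are simplified stand-ins for the driver package types;
// only the fields needed to illustrate the lookup are included.
type Node struct{ Name string }

type Builder struct {
	Name  string
	Nodes []Node
}

// nodeNamesForBuilder mirrors the filtering both loaders now perform: only
// pods belonging to the named builder are considered for image transfer, and
// an empty result is reported as an error instead of silently picking a pod
// from another builder on the same cluster.
func nodeNamesForBuilder(builders []Builder, builderName string) ([]string, error) {
	nodeNames := []string{}
	for _, builder := range builders {
		if builder.Name != builderName {
			continue
		}
		for _, node := range builder.Nodes {
			nodeNames = append(nodeNames, node.Name)
		}
	}
	if len(nodeNames) == 0 {
		return nil, fmt.Errorf("no builders found for %s", builderName)
	}
	return nodeNames, nil
}

func main() {
	// Two builders deployed on the same cluster; only the requested one is used.
	builders := []Builder{
		{Name: "buildkit", Nodes: []Node{{Name: "buildkit-abc"}}},
		{Name: "randomlb", Nodes: []Node{{Name: "randomlb-xyz"}}},
	}
	names, err := nodeNamesForBuilder(builders, "randomlb")
	fmt.Println(names, err) // [randomlb-xyz] <nil>
}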