Support multiple concurrent builder deployments
This fixes a glitch in the way builders were looked up: when multiple builders
ran on the same cluster, the wrong builder's pods could be identified for image
transfer during builds. It also refines how the buildkit state directory is
defined for containerd-based builds. That state directory must live on the host
so containerd can mount it into child containers, but multiple buildkitd
instances cannot share it. Most of the tests have been switched to support
parallel runs; however, while testing I noticed `kind` does not work in parallel
(seemingly random low-level containerd failures to start containers), so CI
forces serial testing for kind. Native dockerd and containerd clusters work
correctly.
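
To illustrate the lookup fix (see the pkg/build/build.go changes below), here is a minimal, self-contained Go sketch of the idea — filter the listed builders by name before collecting their node/pod names, instead of taking whichever builder is listed first. The `Builder`/`Node` types and the values in `main` are simplified illustrations, not the repo's actual driver interfaces:

```go
package main

import "fmt"

// Simplified shapes mirroring the builder/node listing used in pkg/build/build.go.
type Node struct{ Name string }

type Builder struct {
	Name  string
	Nodes []Node
}

// nodesForBuilder keeps only the nodes belonging to the named builder —
// the essence of the lookup fix in newDockerLoader/newContainerdLoader.
func nodesForBuilder(builders []Builder, builderName string) ([]string, error) {
	nodeNames := []string{}
	for _, b := range builders {
		if b.Name != builderName {
			continue // skip builders owned by other deployments on the same cluster
		}
		for _, n := range b.Nodes {
			nodeNames = append(nodeNames, n.Name)
		}
	}
	if len(nodeNames) == 0 {
		return nil, fmt.Errorf("no builders found for %s", builderName)
	}
	return nodeNames, nil
}

func main() {
	builders := []Builder{
		{Name: "buildkit", Nodes: []Node{{Name: "buildkit-abc"}}},
		{Name: "configmaptest", Nodes: []Node{{Name: "configmaptest-xyz"}}},
	}
	names, err := nodesForBuilder(builders, "configmaptest")
	fmt.Println(names, err) // [configmaptest-xyz] <nil>
}
```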
Daniel Hiltgen committed Dec 9, 2020
1 parent 2848b46 commit 364ef5a
Showing 11 changed files with 42 additions and 49 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/pull_request.yaml
@@ -40,7 +40,7 @@ jobs:
- name: Setup kind cluster
run: kind create cluster --config ./integration/kind-ci.yaml
- name: Run integration tests
run: make integration EXTRA_GO_TEST_FLAGS=-v
run: make integration EXTRA_GO_TEST_FLAGS=-v TEST_PARALLELISM=1
- name: Gather integration coverage results
uses: codecov/codecov-action@v1
with:
@@ -69,7 +69,8 @@ jobs:
kubectl taint nodes --all node-role.kubernetes.io/master-
kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
kubectl wait --for=condition=ready --timeout=30s node --all
kubectl get nodes
kubectl get nodes -o wide
docker version
- name: Run integration tests
run: make integration EXTRA_GO_TEST_FLAGS=-v
7 changes: 5 additions & 2 deletions Makefile
@@ -3,7 +3,8 @@

VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always)
TEST_KUBECONFIG?=$(HOME)/.kube/config
TEST_TIMEOUT=300s
TEST_TIMEOUT=600s
TEST_PARALLELISM=3

# Verify Go in PATH
ifeq (, $(shell which go))
@@ -71,7 +72,9 @@ test:
integration:
@echo "Running integration tests with $(TEST_KUBECONFIG)"
@kubectl config get-contexts
TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) $(EXTRA_GO_TEST_FLAGS) \
TEST_KUBECONFIG=$(TEST_KUBECONFIG) go test -timeout $(TEST_TIMEOUT) $(GO_FLAGS) \
-parallel $(TEST_PARALLELISM) \
$(EXTRA_GO_TEST_FLAGS) \
$(GO_COVER_FLAGS) -coverprofile=./cover-int.out \
./integration/...

8 changes: 4 additions & 4 deletions integration/common/basesuites.go
@@ -26,7 +26,7 @@ type BaseSuite struct {
Namespace string
}

func (s *BaseSuite) SetupTest() {
func (s *BaseSuite) SetupSuite() {
var err error
if !s.SkipSetupCreate {
logrus.Infof("%s: Setting up builder", s.Name)
@@ -44,7 +44,7 @@ func (s *BaseSuite) SetupTest() {
require.NoError(s.T(), err, "%s: kube client failed", s.Name)
}

func (s *BaseSuite) TearDownTest() {
func (s *BaseSuite) TearDownSuite() {
logrus.Infof("%s: Removing builder", s.Name)
err := RunBuildkit("rm", []string{
s.Name,
@@ -62,7 +62,7 @@ func (s *BaseSuite) TestSimpleBuild() {
dir, cleanup, err := NewSimpleBuildContext()
defer cleanup()
require.NoError(s.T(), err, "Failed to set up temporary build context")
args := []string{}
args := []string{"--progress=plain"}
if s.Name != "buildkit" { // TODO wire up the default name variable
args = append(
args,
@@ -102,7 +102,7 @@ func (s *BaseSuite) TestLocalOutputTarBuild() {
dir, cleanup, err := NewSimpleBuildContext()
defer cleanup()
require.NoError(s.T(), err, "Failed to set up temporary build context")
args := []string{}
args := []string{"--progress=plain"}
if s.Name != "buildkit" { // TODO wire up the default name variable
args = append(
args,
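
The switch above from SetupTest/TearDownTest to SetupSuite/TearDownSuite means each suite creates and removes its builder once rather than once per test, which is what makes the `t.Parallel()` calls enabled in the suites below safe. A minimal sketch of the pattern, assuming testify's suite package (names here are illustrative, not from this repo):

```go
package suites_test

import (
	"testing"

	"github.com/stretchr/testify/suite"
)

// Illustrative parallel-suite pattern: SetupSuite/TearDownSuite run once per
// suite (one builder created and removed per suite), and t.Parallel() lets
// independent suites run concurrently, bounded by `go test -parallel N`
// (TEST_PARALLELISM in the Makefile).
type ExampleSuite struct {
	suite.Suite
	Name string
}

func (s *ExampleSuite) SetupSuite()    { /* create one builder for the whole suite */ }
func (s *ExampleSuite) TearDownSuite() { /* remove that builder after all tests finish */ }

func (s *ExampleSuite) TestSomething() {
	s.Require().NotEmpty(s.Name)
}

func TestExampleSuite(t *testing.T) {
	t.Parallel() // runs alongside other suites, each with its own builder
	suite.Run(t, &ExampleSuite{Name: "example"})
}
```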
4 changes: 2 additions & 2 deletions integration/suites/configmap_test.go
@@ -30,7 +30,7 @@ type configMapSuite struct {
configMapClient v1.ConfigMapInterface
}

func (s *configMapSuite) SetupTest() {
func (s *configMapSuite) SetupSuite() {
var err error
s.ClientSet, s.Namespace, err = common.GetKubeClientset()
require.NoError(s.T(), err, "%s: kube client failed", s.Name)
@@ -200,7 +200,7 @@ func (s *configMapSuite) TestPreExistingWithCustomCreate() {

func TestConfigMapSuite(t *testing.T) {
common.Skipper(t)
//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
t.Parallel()
suite.Run(t, &configMapSuite{
Name: "configmaptest",
})
2 changes: 1 addition & 1 deletion integration/suites/default_test.go
@@ -14,7 +14,7 @@ type DefaultSuite struct{ common.BaseSuite }

func TestDefaultSuite(t *testing.T) {
common.Skipper(t)
//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
t.Parallel()
suite.Run(t, &DefaultSuite{
BaseSuite: common.BaseSuite{
Name: "buildkit", // TODO pull this from the actual default name
2 changes: 1 addition & 1 deletion integration/suites/randomlb_test.go
@@ -16,7 +16,7 @@ type RandomLBSuite struct{ common.BaseSuite }

func TestRandomLBSuite(t *testing.T) {
common.Skipper(t)
//t.Parallel() // TODO - tests fail if run in parallel, may be actual race bug
t.Parallel()
suite.Run(t, &RandomLBSuite{
BaseSuite: common.BaseSuite{
Name: "randomlb",
30 changes: 17 additions & 13 deletions pkg/build/build.go
@@ -581,14 +581,9 @@ func Build(ctx context.Context, drivers []DriverInfo, opt map[string]Options, ku
multiDriver := len(m[k]) > 1
for i, dp := range m[k] {
d := drivers[dp.driverIndex].Driver
driverName := drivers[dp.driverIndex].Name
opt.Platforms = dp.platforms

// TODO - this is a little messy - some refactoring of this routine would be worthwhile
var chosenNodeName string
for name := range clients[d.Factory().Name()] {
chosenNodeName = name // TODO - this doesn't actually seem to be working!!!
break
}
// TODO - this is also messy and won't work for multi-driver scenarios (not that it's possible yet...)
if auth == nil {
auth = d.GetAuthWrapper(registrySecretName)
@@ -598,9 +593,9 @@
// Set up loader based on first found type (only 1 supported)
for _, entry := range opt.Exports {
if entry.Type == "docker" {
return newDockerLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
return newDockerLoader(ctx, d, kubeClientConfig, driverName, mw)
} else if entry.Type == "oci" {
return newContainerdLoader(ctx, d, kubeClientConfig, chosenNodeName, mw)
return newContainerdLoader(ctx, d, kubeClientConfig, driverName, mw)
}
}
// TODO - Push scenario? (or is this a "not reached" scenario now?)
@@ -899,18 +894,24 @@ func notSupported(d driver.Driver, f driver.Feature) error {

type dockerLoadCallback func(name string) (io.WriteCloser, func(), error)

func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
func newDockerLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
nodeNames := []string{}
// TODO this isn't quite right - we need a better "list only pods from one instance" func
builders, err := d.List(ctx)
if err != nil {
return nil, nil, err
}
for _, builder := range builders {
if builder.Name != builderName {
continue
}
for _, node := range builder.Nodes {
nodeNames = append(nodeNames, node.Name)
}
}
if len(nodeNames) == 0 {
return nil, nil, fmt.Errorf("no builders found for %s", builderName)
}
// TODO revamp this flow to return a list of pods

readers := make([]*io.PipeReader, len(nodeNames))
@@ -1019,7 +1020,7 @@ func (w *waitingWriter) Close() error {
return err
}

func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, chosenNodeName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig clientcmd.ClientConfig, builderName string, mw *progress.MultiWriter) (io.WriteCloser, func(), error) {
nodeNames := []string{}
// TODO this isn't quite right - we need a better "list only pods from one instance" func
// TODO revamp this flow to return a list of pods
@@ -1029,12 +1030,15 @@ func newContainerdLoader(ctx context.Context, d driver.Driver, kubeClientConfig
return nil, nil, err
}
for _, builder := range builders {
if builder.Name != builderName {
continue
}
for _, node := range builder.Nodes {
// For containerd, we never have to load on the node where it was built
// TODO - this doesn't actually work yet, but when we switch the output.Type to "oci" we need to load it everywhere anyway
if node.Name == chosenNodeName {
continue
}
// if node.Name == chosenNodeName {
// continue
// }
nodeNames = append(nodeNames, node.Name)
}
}
2 changes: 1 addition & 1 deletion pkg/cmd/build.go
@@ -232,7 +232,7 @@ func runBuild(streams genericclioptions.IOStreams, in buildOptions) error {
func buildTargets(ctx context.Context, kubeClientConfig clientcmd.ClientConfig, streams genericclioptions.IOStreams, opts map[string]build.Options, progressMode, contextPathHash, registrySecretName, instance string) error {
driverName := instance
if driverName == "" {
driverName = "buildx_buildkit_kubernetes"
driverName = "buildkit"
}
d, err := driver.GetDriver(ctx, driverName, nil, kubeClientConfig, []string{} /* TODO what BuildkitFlags are these? */, "" /* unused config file */, map[string]string{} /* DriverOpts unused */, contextPathHash)
if err != nil {
5 changes: 3 additions & 2 deletions pkg/driver/kubernetes/driver.go
@@ -51,6 +51,7 @@ const (
// TODO - consider adding other default values here to aid users in fine-tuning by editing the configmap post deployment
DefaultConfigFileTemplate = `# Default buildkitd configuration. Use --config <path/to/file> to override during create
debug = false
root = "/var/lib/buildkit/{{ .Name }}"
[worker.containerd]
namespace = "{{ .ContainerdNamespace }}"
`
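
The new `root` line gives each builder its own buildkitd state directory. A small, self-contained Go sketch of how that template renders — "mybuilder" and "k8s.io" are example values only, not defaults taken from this repo:

```go
package main

import (
	"os"
	"text/template"
)

// Each builder renders its own buildkitd config, so concurrent buildkitd
// instances on the same host get distinct state directories under
// /var/lib/buildkit/<name> and never contend for the same root.
const configTemplate = `debug = false
root = "/var/lib/buildkit/{{ .Name }}"
[worker.containerd]
namespace = "{{ .ContainerdNamespace }}"
`

func main() {
	tmpl := template.Must(template.New("buildkitd").Parse(configTemplate))
	// Example values for illustration only.
	cfg := struct{ Name, ContainerdNamespace string }{"mybuilder", "k8s.io"}
	if err := tmpl.Execute(os.Stdout, cfg); err != nil {
		panic(err)
	}
	// Output includes: root = "/var/lib/buildkit/mybuilder"
}
```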
@@ -103,7 +104,7 @@ func (d *Driver) Bootstrap(ctx context.Context, l progress.Logger) error {
}
}
return sub.Wrap(
fmt.Sprintf("waiting for %d pods to be ready", d.minReplicas),
fmt.Sprintf("waiting for %d pods to be ready for %s", d.minReplicas, d.deployment.Name),
func() error {
if err := d.wait(ctx, sub); err != nil {
return err
@@ -137,7 +138,7 @@ func (d *Driver) wait(ctx context.Context, sub progress.SubLogger) error {
for try := 0; try < 100; try++ {
if err == nil {
if depl.Status.ReadyReplicas >= int32(d.minReplicas) {
sub.Log(1, []byte(fmt.Sprintf("All %d replicas online\n", d.minReplicas)))
sub.Log(1, []byte(fmt.Sprintf("All %d replicas for %s online\n", d.minReplicas, d.deployment.Name)))
return nil
}
deploymentUID = string(depl.ObjectMeta.GetUID())
17 changes: 2 additions & 15 deletions pkg/driver/kubernetes/factory.go
@@ -8,7 +8,6 @@ import (
"fmt"
"io/ioutil"
"strconv"
"strings"
"text/template"

"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/driver"
@@ -243,21 +242,9 @@ func (f *factory) AllowsInstances() bool {
}

// buildxNameToDeploymentName converts buildx name to Kubernetes Deployment name.
//
// eg. "buildx_buildkit_loving_mendeleev0" -> "loving-mendeleev0"
func buildxNameToDeploymentName(bx string) string {
// TODO: commands.util.go should not pass "buildx_buildkit_" prefix to drivers
s := bx
if strings.HasPrefix(s, "buildx_buildkit_") {
s = strings.TrimPrefix(s, "buildx_buildkit_")
}
s = strings.ReplaceAll(s, "_", "-")
if s == "kubernetes" {
// Having the default name of the deployment for buildkit called "kubernetes" is confusing, use something better
return "buildkit"
}
if s == "" {
if bx == "" {
return "buildkit"
}
return s
return bx
}
9 changes: 3 additions & 6 deletions pkg/driver/kubernetes/manifest/manifest.go
@@ -160,7 +160,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
},
corev1.VolumeMount{
Name: "var-lib-buildkit",
MountPath: "/var/lib/buildkit",
MountPath: "/var/lib/buildkit/" + opt.Name,
MountPropagation: &mountPropagationBidirectional,
},
corev1.VolumeMount{
@@ -204,7 +204,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
Name: "var-lib-buildkit",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/lib/buildkit",
Path: "/var/lib/buildkit/" + opt.Name,
Type: &hostPathDirectoryOrCreate,
}},
},
@@ -246,10 +246,7 @@ func toContainerdWorker(d *appsv1.Deployment, opt *DeploymentOpt) error {
},
)

// If this is a multi-user cluster and other users are running
// containerd based builders in other namespaces, we'll fail to start due to
// shared mounts for the buildkit cache
// buildkitd: could not lock /var/lib/buildkit/buildkitd.lock, another instance running?
// Spread our builders out on a multi-node cluster
d.Spec.Template.Spec.Affinity = &corev1.Affinity{
PodAntiAffinity: &corev1.PodAntiAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
