From 2f598d2f554d01a87d7b24742055b170b0a062cd Mon Sep 17 00:00:00 2001 From: Antoni Ivanov Date: Tue, 8 Mar 2022 15:04:55 +0200 Subject: [PATCH] vdk-server: status returns if server is not running (#754) When running `vdk server --status` command it was returning that the control service was installed but it was not running (the process was down). So added a check to explicitly verify that. Added a few more debug messages. Testing Done: vdk server -s locally and verified message. Signed-off-by: Antoni Ivanov --- .../src/vdk/plugin/server/installer.py | 165 ++++++++++-------- .../vdk/plugin/server/server_plugin_utils.py | 25 +++ 2 files changed, 121 insertions(+), 69 deletions(-) create mode 100644 projects/vdk-plugins/vdk-server/src/vdk/plugin/server/server_plugin_utils.py diff --git a/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/installer.py b/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/installer.py index 1c1f8b7647..30f18be57b 100644 --- a/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/installer.py +++ b/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/installer.py @@ -23,6 +23,7 @@ ) from vdk.internal.core.errors import BaseVdkError from vdk.internal.core.errors import ErrorMessage +from vdk.plugin.server import server_plugin_utils log = logging.getLogger(__name__) @@ -102,13 +103,15 @@ def check_status(self): self.__get_kind_cluster() and self.__docker_container_exists(self.git_server_container_name) and self.__docker_container_exists(self.docker_registry_container_name) + and self.__control_service_is_up() ): - log.info("The Versatile Data Kit Control Service is installed") + log.info("The Versatile Data Kit Control Service is installed and running.") log.info( "Access the REST API at http://localhost:8092/data-jobs/swagger-ui.html\n" ) else: - log.info("No installation found") + log.info("No running installation found.") + sys.exit(1) @staticmethod def __get_current_directory() -> pathlib.Path: @@ -121,7 +124,7 @@ def __docker_container_exists(container_name) -> bool: """ docker_client = docker.from_env() try: - return next( + container_exists = next( ( c for c in docker_client.api.containers(all=True) @@ -129,6 +132,10 @@ def __docker_container_exists(container_name) -> bool: ), None, ) + log.debug( + f"Container {container_name} is {'' if container_exists else 'not'} present" + ) + return container_exists except Exception as ex: log.error(f"Failed to search for a Docker container. {str(ex)}") sys.exit(1) @@ -531,7 +538,12 @@ def __get_kind_cluster(self) -> bool: log.error(f"Stderr output: {stderr_as_str}") sys.exit(result.returncode) stdout_as_str = result.stdout.decode("utf-8") - return self.kind_cluster_name in stdout_as_str.splitlines() + kind_cluster_exists = self.kind_cluster_name in stdout_as_str.splitlines() + if kind_cluster_exists: + log.debug(f"Kind cluster {self.kind_cluster_name} is present.") + else: + log.debug(f"Kind cluster {self.kind_cluster_name} is not present.") + return kind_cluster_exists except Exception as ex: log.error( f'Failed to obtain information about the Kind cluster "{self.kind_cluster_name}". ' @@ -604,71 +616,7 @@ def __install_helm_chart(self): log.error(f"Stderr output: {stderr_as_str}") exit(result.returncode) result = subprocess.run( - [ - "helm", - "install", - self.helm_installation_name, - self.helm_chart_name, - "--atomic", - "--set", - "service.type=ClusterIP", - "--set", - "deploymentBuilderResourcesDefault.limits.cpu=0", - "--set", - "deploymentBuilderResourcesDefault.requests.cpu=0", - "--set", - "deploymentBuilderResourcesDefault.limits.memory=0", - "--set", - "deploymentBuilderResourcesDefault.requests.memory=0", - "--set", - "deploymentDefaultDataJobsResources.limits.cpu=0", - "--set", - "deploymentDefaultDataJobsResources.requests.cpu=0", - "--set", - "deploymentDefaultDataJobsResources.limits.memory=0", - "--set", - "deploymentDefaultDataJobsResources.requests.memory=0", - "--set", - "resources.limits.cpu=0", - "--set", - "resources.requests.cpu=0", - "--set", - "resources.limits.memory=0", - "--set", - "resources.requests.memory=0", - "--set", - "cockroachdb.statefulset.replicas=1", - "--set", - "replicas=1", - "--set", - "ingress.enabled=true", - "--set", - "deploymentGitBranch=master", - "--set", - "deploymentDockerRegistryType=generic", - "--set", - f"deploymentDockerRepository={self.docker_registry_container_name}:5000", - "--set", - "proxyRepositoryURL=localhost:5000", - "--set", - f"deploymentGitUrl={git_server_ip}/{self.git_server_admin_user}/{self.git_server_repository_name}.git", - "--set", - f"deploymentGitUsername={self.git_server_admin_user}", - "--set", - f"deploymentGitPassword={self.git_server_admin_password}", - "--set", - f"uploadGitReadWriteUsername={self.git_server_admin_user}", - "--set", - f"uploadGitReadWritePassword={self.git_server_admin_password}", - "--set", - "extraEnvVars.GIT_SSL_ENABLED=false", - "--set", - "extraEnvVars.DATAJOBS_DEPLOYMENT_BUILDER_EXTRAARGS=--insecure", - "--set", - "datajobTemplate.template.spec.successfulJobsHistoryLimit=5", - "--set", - "datajobTemplate.template.spec.failedJobsHistoryLimit=5", - ], + self.__helm_install_command(git_server_ip), capture_output=True, ) if result.returncode != 0: @@ -683,6 +631,73 @@ def __install_helm_chart(self): ) sys.exit(1) + def __helm_install_command(self, git_server_ip): + return [ + "helm", + "install", + self.helm_installation_name, + self.helm_chart_name, + "--atomic", + "--set", + "service.type=ClusterIP", + "--set", + "deploymentBuilderResourcesDefault.limits.cpu=0", + "--set", + "deploymentBuilderResourcesDefault.requests.cpu=0", + "--set", + "deploymentBuilderResourcesDefault.limits.memory=0", + "--set", + "deploymentBuilderResourcesDefault.requests.memory=0", + "--set", + "deploymentDefaultDataJobsResources.limits.cpu=0", + "--set", + "deploymentDefaultDataJobsResources.requests.cpu=0", + "--set", + "deploymentDefaultDataJobsResources.limits.memory=0", + "--set", + "deploymentDefaultDataJobsResources.requests.memory=0", + "--set", + "resources.limits.cpu=0", + "--set", + "resources.requests.cpu=0", + "--set", + "resources.limits.memory=0", + "--set", + "resources.requests.memory=0", + "--set", + "cockroachdb.statefulset.replicas=1", + "--set", + "replicas=1", + "--set", + "ingress.enabled=true", + "--set", + "deploymentGitBranch=master", + "--set", + "deploymentDockerRegistryType=generic", + "--set", + f"deploymentDockerRepository={self.docker_registry_container_name}:5000", + "--set", + "proxyRepositoryURL=localhost:5000", + "--set", + f"deploymentGitUrl={git_server_ip}/{self.git_server_admin_user}/{self.git_server_repository_name}.git", + "--set", + f"deploymentGitUsername={self.git_server_admin_user}", + "--set", + f"deploymentGitPassword={self.git_server_admin_password}", + "--set", + f"uploadGitReadWriteUsername={self.git_server_admin_user}", + "--set", + f"uploadGitReadWritePassword={self.git_server_admin_password}", + "--set", + "extraEnvVars.GIT_SSL_ENABLED=false", + "--set", + "extraEnvVars.DATAJOBS_DEPLOYMENT_BUILDER_EXTRAARGS=--insecure", + "--set", + "datajobTemplate.template.spec.successfulJobsHistoryLimit=5", + "--set", + "datajobTemplate.template.spec.failedJobsHistoryLimit=5", + ] + def __uninstall_helm_chart(self): log.info("Uninstalling Control Service...") try: @@ -711,6 +726,7 @@ def __install_ingress_prerequisites(self): config.load_kube_config() with client.ApiClient() as k8s_client: try: + log.debug("Deploy ingress controller...") utils.create_from_yaml( k8s_client, self.__current_directory.joinpath("ingress-nginx-deploy.yaml"), @@ -727,6 +743,7 @@ def __install_ingress_prerequisites(self): w = watch.Watch() k8s_client = client.CoreV1Api() try: + log.debug("Wait for ingress controller to be ready ...") for event in w.stream( func=k8s_client.list_namespaced_pod, namespace="ingress-nginx", @@ -772,3 +789,13 @@ def __cleanup_configuration(): log.error(f"Failed to clean up. {str(ex)}") exit(1) log.info("Done") + + @staticmethod + def __control_service_is_up(): + with server_plugin_utils.requests_retry_session() as s: + response: requests.Response = s.get("http://localhost:8092") + if response.status_code < 300: + log.debug("Control Service at http://localhost:8092 is UP.") + else: + log.debug("Control Service at http://localhost:8092 is DOWN.") + return response.status_code < 300 diff --git a/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/server_plugin_utils.py b/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/server_plugin_utils.py new file mode 100644 index 0000000000..31058dd106 --- /dev/null +++ b/projects/vdk-plugins/vdk-server/src/vdk/plugin/server/server_plugin_utils.py @@ -0,0 +1,25 @@ +# Copyright 2021 VMware, Inc. +# SPDX-License-Identifier: Apache-2.0 +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + + +def requests_retry_session( + retries=3, + backoff_factor=0.3, + status_forcelist=(500, 502, 503, 504), + session=None, +) -> requests.Session: + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session