From 5c294f475e85081e85ecc82462f20129c3a55ee0 Mon Sep 17 00:00:00 2001 From: Nick Vrvilo Date: Thu, 10 Jan 2019 17:43:35 -0600 Subject: [PATCH] S3 backup for Waiter Service logs on K8s --- kitchen/.gitignore | 5 + kitchen/bin/test-waiter-init | 31 +++++++ kitchen/bin/waiter-init | 91 +++++++++++++++--- .../fileserver-container/fileserver-start | 4 +- .../ci/run-integration-tests-k8s-scheduler.sh | 11 ++- waiter/bin/ci/s3-server-setup.sh | 26 ++++++ waiter/config-full.edn | 8 ++ waiter/config-k8s.edn | 2 + .../kubernetes_scheduler_integration_test.clj | 93 ++++++++++++++++++- waiter/src/waiter/scheduler/kubernetes.clj | 78 ++++++++++++---- waiter/src/waiter/settings.clj | 5 +- waiter/src/waiter/util/client_tools.clj | 5 + .../test/waiter/scheduler/kubernetes_test.clj | 10 +- 13 files changed, 327 insertions(+), 42 deletions(-) create mode 100755 waiter/bin/ci/s3-server-setup.sh diff --git a/kitchen/.gitignore b/kitchen/.gitignore index 3b9992af2..3709e3533 100644 --- a/kitchen/.gitignore +++ b/kitchen/.gitignore @@ -1,3 +1,8 @@ .cache __pycache__ kitchen.log + +/latest +/r[0-9] +/.r[0-9].index.json +/.waiter-container-runs diff --git a/kitchen/bin/test-waiter-init b/kitchen/bin/test-waiter-init index 0573170c3..bfd653253 100755 --- a/kitchen/bin/test-waiter-init +++ b/kitchen/bin/test-waiter-init @@ -13,6 +13,18 @@ start_test() { working_dir=r$(( $(cat .waiter-container-runs) - 1 )) } +start_guarded_test() { + test_desc="$1" + shift + if test "$@"; then + start_test "$test_desc" + test_started=true + else + printf '\nSkipping Test %s' "$test_desc" + test_started=false + fi +} + send_sigterm() { sleep 0.1 kill $test_pid @@ -136,4 +148,23 @@ assert $working_dir != $old_working_dir assert -f $working_dir/$fresh_file_name assert $(cat $working_dir/$fresh_file_name) == hello +# +# Ensure that the stdout file captured by the wrapper script +# is uploaded to S3 when the script is terminated. +# This test runs iff WAITER_LOG_BUCKET_URL is non-null. +# +start_guarded_test 'stdout uploaded to s3 bucket' "$WAITER_LOG_BUCKET_URL" <&2 fi - # Wait for a long time (15 minutes), awaiting another signal from Kubernetes. - # This delay gives Waiter time to safely update the desired replica count - # before the pod actually terminates, avoiding a race to replace this pod. - # If we receive a second SIGTERM from Kubernetes, then the sleep period is canceled, - # and we simply wait for the user's process to complete (or get SIGKILLed). - # The main point here is to NOT exit before the second SIGTERM is received. - # If for some reason the second SIGTERM never arrives, the sleep will eventually expire, - # or the pod's grace period will expire (resulting in a SIGKILL from Kubernetes). - # Likewise, if the user's process takes too long to terminate gracefully, - # the pod's grace period will expire (resulting in a SIGKILL from Kubernetes). - sleep 900 & - wait # wait for sleep to complete, or a second SIGTERM - kill %2 # cancel sleep - wait # wait for graceful termination of user's process + waiter_init_pid=$$ + + { + # Give the user's application a few seconds to gracefully exit, + # then forcefully terminate with a SIGKILL if it's still running. + sleep ${WAITER_GRACE_SECS:=3} + kill -9 -- -$waiter_child_pid + + # Wait for another signal from Kubernetes. + # This delay gives Waiter time to safely update the desired replica count + # before the pod actually terminates, avoiding a race to replace this pod. 
+    # If we receive a second SIGTERM from Kubernetes, then the sleep period is canceled,
+    # and we simply wait for the user's process to complete (or get SIGKILLed).
+    # The main point here is to NOT exit before the second SIGTERM is received.
+    # If for some reason the second SIGTERM never arrives, the sleep will eventually expire,
+    # or the pod's grace period will expire (resulting in a SIGKILL from Kubernetes).
+    # Likewise, if the user's process takes too long to terminate gracefully,
+    # the pod's grace period will expire (resulting in a SIGKILL from Kubernetes).
+    # However, if we don't see the second SIGTERM after a reasonable delay,
+    # we assume we missed it (due to the asynchronous nature of the system),
+    # and that it is now safe to terminate this pod.
+    sleep ${WAITER_SYNC_MAX_SECS:=10}
+    kill -15 $waiter_init_pid
+  } &
+
+  # Wait for graceful termination of the user's process.
+  while kill -0 $waiter_child_pid; do
+    wait %1
+  done &>/dev/null
+
+  # Send logs to S3 (if enabled).
+  if [ "$WAITER_LOG_BUCKET_URL" ]; then
+    # Extract this pod's name from the hostname
+    # (this is used to create a unique path in S3).
+    pod_name=$(hostname --short)
+    base_url="$WAITER_LOG_BUCKET_URL/$pod_name"
+    # For each ./r* directory created by a container restart,
+    # we upload the stdout and stderr, and build an index.json file
+    # that is also uploaded to the target directory in the S3 bucket
+    # (e.g., ./r1/stdout is uploaded to $base_url/r1/stdout).
+    # We work backwards from the most recent run down to 0 to increase the odds
+    # that our most recent runs' logs are successfully persisted before a SIGKILL.
+    cd "$waiter_sandbox_base_dir"
+    for i in $(seq $waiter_restart_count -1 0); do
+      waiter_log_files='stdout stderr'
+      indextmp=".r$i.index.json"
+      rm -f $indextmp
+      separator='['
+      for f in $waiter_log_files; do
+        logfile="r$i/$f"
+        # Using the -T option with curl PUTs the target file to the given URL,
+        # and avoids loading the full file into memory when sending the payload.
+        curl -s -T "$logfile" "$base_url/$logfile"
+        printf '%s\n{"name":"%s","type":"file","size":%d}' "$separator" "$f" "$(stat -c%s "$logfile")" >> $indextmp
+        separator=','
+      done
+      printf '\n]\n' >> $indextmp
+      curl -s -T "$indextmp" "$base_url/r$i/index.json"
+    done
+  fi
+
+  # Wait for the second SIGTERM to arrive.
+  while [ "$waiter_2nd_sigterm" != true ]; do
+    sleep 0.1
+  done
 
   # Exit container with code 128+15=143, indicating termination via SIGTERM.
   exit 143
 }
+
+# Catch the second SIGTERM sent by Kubernetes on pod deletion.
+# This double-termination is an important part of our Waiter scale-down logic,
+# and the mechanics are described in more detail above (in handle_k8s_terminate).
+handle_2nd_k8s_terminate() {
+  trap : SIGTERM  # reset the SIGTERM handler to a no-op
+  waiter_2nd_sigterm=true
+}
+
 trap handle_k8s_terminate SIGTERM
 
 # Track container restart count
@@ -53,6 +113,7 @@
 echo $(( $waiter_restart_count + 1 )) > .waiter-container-runs
 
 # Ensure that HOME is set to the fresh working directory for this container instance.
 # HOME should be a symlink ./latest, which points to the new working directory.
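+# For illustration (hypothetical run numbers): after two container restarts,
+# the sandbox base directory would contain
+#     .waiter-container-runs    (run counter, holding "3")
+#     r0/  r1/  r2/             (one working directory per container run)
+#     latest -> r2              (the symlink that HOME points into)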
+waiter_sandbox_base_dir="$(pwd -P)" waiter_sandbox_dir="./r${waiter_restart_count}" mkdir -p "$waiter_sandbox_dir" ln -Tsf $waiter_sandbox_dir latest diff --git a/kubernetes/fileserver-container/fileserver-start b/kubernetes/fileserver-container/fileserver-start index 6c8be8eef..09e0b237c 100755 --- a/kubernetes/fileserver-container/fileserver-start +++ b/kubernetes/fileserver-container/fileserver-start @@ -7,5 +7,5 @@ export WAITER_FILESERVER_PORT # Generate server config from template envsubst /root/nginx.conf -# Start server in non-daemon mode -nginx -c /root/nginx.conf +# Start server in non-daemon mode, replacing current process +exec nginx -c /root/nginx.conf diff --git a/waiter/bin/ci/run-integration-tests-k8s-scheduler.sh b/waiter/bin/ci/run-integration-tests-k8s-scheduler.sh index 035883a9a..755973cca 100755 --- a/waiter/bin/ci/run-integration-tests-k8s-scheduler.sh +++ b/waiter/bin/ci/run-integration-tests-k8s-scheduler.sh @@ -2,8 +2,8 @@ # Usage: run-integration-tests-k8s-scheduler.sh [TEST_COMMAND] [TEST_SELECTOR] # # Examples: -# run-integration-tests-k8s-scheduler.sh parallel-test integration-fast -# run-integration-tests-k8s-scheduler.sh parallel-test integration-slow +# run-integration-tests-k8s-scheduler.sh parallel-test integration-heavy +# run-integration-tests-k8s-scheduler.sh parallel-test integration-lite # run-integration-tests-k8s-scheduler.sh parallel-test # run-integration-tests-k8s-scheduler.sh # @@ -21,6 +21,13 @@ KITCHEN_DIR=${WAITER_DIR}/../kitchen # Start minikube ${DIR}/minikube-setup.sh +# Start S3 test server +if [[ $TEST_SELECTOR =~ heavy$ ]]; then + ${DIR}/s3-server-setup.sh + S3SERVER_IP=$(docker inspect s3server | jq -r '.[0].NetworkSettings.Networks.bridge.IPAddress') + export WAITER_S3_BUCKET=http://$S3SERVER_IP:8000/waiter-service-logs +fi + # Ensure we have the docker image for the pods ${KITCHEN_DIR}/bin/build-docker-image.sh diff --git a/waiter/bin/ci/s3-server-setup.sh b/waiter/bin/ci/s3-server-setup.sh new file mode 100755 index 000000000..f68dbe550 --- /dev/null +++ b/waiter/bin/ci/s3-server-setup.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +# Install AWS CLI (for s3api commands) via pip +# We use this to handle S3 authentication for bucket creation +# https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html +pip install awscli --upgrade --user + +# Run cloudserver (S3 compatible test server) via docker +# The API server endpoint is accessible via localhost:8888 +# https://hub.docker.com/r/scality/s3server +echo Starting S3 server docker container +docker run --name s3server --detach --rm --publish=8888:8000 scality/s3server:6018536a +echo -n Waiting for S3 server +while ! curl localhost:8888 &>/dev/null; do + echo -n . 
+ sleep 1 +done +echo + +# Create a public RW bucket for waiter app logs, using the default cloudserver credentials +# https://github.com/scality/cloudserver#run-it-with-a-file-backend +AWS_ACCESS_KEY_ID=accessKey1 AWS_SECRET_ACCESS_KEY=verySecretKey1 \ + aws s3api create-bucket --endpoint-url=http://localhost:8888 \ + --acl=public-read-write --bucket=waiter-service-logs --output=table diff --git a/waiter/config-full.edn b/waiter/config-full.edn index ba363f793..340a3521d 100644 --- a/waiter/config-full.edn +++ b/waiter/config-full.edn @@ -349,6 +349,14 @@ :authorizer {:kind :default :default {:factory-fn waiter.authorization/noop-authorizer}} + ;; The :log-bucket-url setting is optional, but if it's non-nil, it should be the URL (string) of an S3 bucket + ;; where log files should be copied when a waiter service pod is terminated. + ;; When a :log-bucket-url is given, the pod waits up to an additional :log-bucket-sync-secs before terminating + ;; (starting after :pod-sigkill-delay-secs or the user process's graceful exit). + ;; The log-bucket-sync-secs option should be set between 0 and 300 seconds, inclusive (default is 180). + :log-bucket-sync-secs 180 + :log-bucket-url "http://s3.example.com/waiter-service-logs" + ;; String value used to annotate Kubernetes objects that are orchestrated by this Waiter instantiation: :cluster-name "waiter" diff --git a/waiter/config-k8s.edn b/waiter/config-k8s.edn index 569ec8a09..68fb7cb84 100644 --- a/waiter/config-k8s.edn +++ b/waiter/config-k8s.edn @@ -24,6 +24,8 @@ :kubernetes {:authorizer {:kind :sanity-check :sanity-check {:factory-fn waiter.authorization/sanity-check-authorizer}} :fileserver {:port 591} + :log-bucket-sync-secs 10 + :log-bucket-url #config/env-default ["WAITER_S3_BUCKET" nil] :url "http://localhost:8001"}} ; ---------- Error Handling ---------- diff --git a/waiter/integration/waiter/kubernetes_scheduler_integration_test.clj b/waiter/integration/waiter/kubernetes_scheduler_integration_test.clj index 24dde128d..953ebf9c2 100644 --- a/waiter/integration/waiter/kubernetes_scheduler_integration_test.clj +++ b/waiter/integration/waiter/kubernetes_scheduler_integration_test.clj @@ -1,5 +1,10 @@ (ns waiter.kubernetes-scheduler-integration-test - (:require [clojure.test :refer :all] + (:require [clojure.data.json :as json] + [clojure.set :as set] + [clojure.string :as string] + [clojure.walk :as walk] + [clojure.test :refer :all] + [clojure.tools.logging :as log] [waiter.util.client-tools :refer :all])) (deftest ^:parallel ^:integration-fast test-kubernetes-watch-state-update @@ -34,3 +39,89 @@ (< initial-rs-snapshot-version initial-rs-watch-version))) (is (<= initial-rs-snapshot-version rs-snapshot-version')) (is (< rs-snapshot-version' rs-watch-version')))))))) + +(deftest ^:parallel ^:integration-slow ^:resource-heavy test-s3-logs + (testing-using-waiter-url + (when (using-k8s? waiter-url) + (let [headers {:x-waiter-name (rand-name) + :x-waiter-max-instances 2 + :x-waiter-scale-up-factor 0.99 + :x-waiter-scale-down-factor 0.99 + :x-kitchen-delay-ms 5000} + _ (log/info "making canary request...") + {:keys [cookies instance-id service-id]} (make-request-with-debug-info headers #(make-kitchen-request waiter-url %)) + request-fn (fn [] (->> #(make-kitchen-request waiter-url %) + (make-request-with-debug-info headers) + :instance-id))] + (with-service-cleanup + service-id + (assert-service-on-all-routers waiter-url service-id cookies) + ;; Get a service with at least one active and one killed instance. 
;; This portion of the test logic was copied from basic-test/test-killed-instances
+          (log/info "starting parallel requests")
+          (let [instance-ids-atom (atom #{})
+                instance-request-fn (fn []
+                                      (let [instance-id (request-fn)]
+                                        (swap! instance-ids-atom conj instance-id)))
+                instance-ids (->> (parallelize-requests 4 10 instance-request-fn
+                                                        :canceled? (fn [] (> (count @instance-ids-atom) 2))
+                                                        :verbose true
+                                                        :service-id service-id)
+                                  (reduce set/union))]
+            (is (> (count instance-ids) 1) (str instance-ids)))
+
+          (log/info "waiting for at least one instance to get killed")
+          (is (wait-for #(->> (get-in (service-settings waiter-url service-id) [:instances :killed-instances])
+                              (map :id)
+                              set
+                              seq)
+                        :interval 2 :timeout 45)
+              (str "No killed instances found for " service-id))
+          ;; Test that the active instances' logs are available.
+          ;; This portion of the test logic was copied from basic-test/test-basic-logs
+          (let [active-instances (get-in (service-settings waiter-url service-id :cookies cookies)
+                                         [:instances :active-instances])
+                log-url (:log-url (first active-instances))
+                _ (log/debug "Log Url Active:" log-url)
+                make-request-fn (fn [url] (make-request url "" :verbose true))
+                {:keys [body] :as logs-response} (make-request-fn log-url)
+                _ (assert-response-status logs-response 200)
+                _ (log/debug "Response body:" body)
+                log-files-list (walk/keywordize-keys (json/read-str body))
+                stdout-file-link (:url (first (filter #(= (:name %) "stdout") log-files-list)))
+                stderr-file-link (:url (first (filter #(= (:name %) "stderr") log-files-list)))]
+            (is (every? #(string/includes? body %) ["stderr" "stdout"])
+                (str "Directory listing is missing entries: stderr and stdout, got response: " logs-response))
+            (doseq [file-link [stderr-file-link stdout-file-link]]
+              (if (string/starts-with? (str file-link) "http")
+                (assert-response-status (make-request-fn file-link) 200)
+                (log/warn "test-s3-logs did not verify file link:" file-link))))
+          (delete-service waiter-url service-id)
+          ;; Test that the killed instances' logs were persisted to S3.
+          ;; This portion of the test logic was modified from the active-instances tests above.
+          (log/info "waiting for s3 logs to appear")
+          (let [log-bucket-url (k8s-log-bucket-url waiter-url)]
+            (is (wait-for
+                  #(let [killed-instances (get-in (service-settings waiter-url service-id :cookies cookies)
+                                                  [:instances :killed-instances])
+                         log-url (:log-url (first killed-instances))]
+                     (string/starts-with? log-url log-bucket-url))
+                  :interval 1 :timeout 15)
+                (str "Log URL never pointed to S3 bucket " log-bucket-url)))
+          (let [killed-instances (get-in (service-settings waiter-url service-id :cookies cookies)
+                                         [:instances :killed-instances])
+                log-url (:log-url (first killed-instances))
+                _ (log/debug "Log Url Killed:" log-url)
+                make-request-fn (fn [url] (make-request url "" :verbose true))
+                {:keys [body] :as logs-response} (make-request-fn log-url)
+                _ (assert-response-status logs-response 200)
+                _ (log/debug "Response body:" body)
+                log-files-list (walk/keywordize-keys (json/read-str body))
+                stdout-file-link (:url (first (filter #(= (:name %) "stdout") log-files-list)))
+                stderr-file-link (:url (first (filter #(= (:name %) "stderr") log-files-list)))]
+            (is (every? #(string/includes? body %) ["stderr" "stdout"])
+                (str "Directory listing is missing entries: stderr and stdout, got response: " logs-response))
+            (doseq [file-link [stderr-file-link stdout-file-link]]
+              (if (string/starts-with? (str file-link) "http")
+                (assert-response-status (make-request-fn file-link) 200)
+                (log/warn "test-s3-logs did not verify file link:" file-link)))))))))
diff --git a/waiter/src/waiter/scheduler/kubernetes.clj b/waiter/src/waiter/scheduler/kubernetes.clj
index 5b287dffb..106434fa4 100644
--- a/waiter/src/waiter/scheduler/kubernetes.clj
+++ b/waiter/src/waiter/scheduler/kubernetes.clj
@@ -135,6 +135,14 @@
         service-id (k8s-object->service-id pod)]
     (str service-id \. pod-name \- restart-count))))
 
+(defn- unpack-instance-id
+  "Extract the service-id, pod name, and pod restart-number from an instance-id."
+  [instance-id]
+  (let [[_ service-id pod-name restart-number] (re-find #"^([-a-z0-9]+)\.([-a-z0-9]+)-(\d+)$" instance-id)]
+    {:service-id service-id
+     :pod-name pod-name
+     :restart-number restart-number}))
+
 (defn- log-dir-path [namespace restart-count]
   "Build log directory path string for a container run."
   (str "/home/" namespace "/r" restart-count))
@@ -444,6 +452,7 @@
            daemon-state
            fileserver
            http-client
+           log-bucket-url
            max-patch-retries
            max-name-length
            pod-base-port
@@ -543,14 +552,13 @@
                            :message "Error while scaling waiter service"})))
 
   (retrieve-directory-content
-    [{:keys [http-client] {:keys [port scheme]} :fileserver}
-     _ instance-id host browse-path]
+    [{:keys [http-client log-bucket-url service-id->service-description-fn watch-state]
+      {:keys [port scheme]} :fileserver}
+     service-id instance-id host browse-path]
     (let [auth-str @k8s-api-auth-str
          headers (when auth-str {"Authorization" auth-str})
-          instance-restart-number (->> (string/last-index-of instance-id \-)
-                                       inc
-                                       (subs instance-id))
-          instance-base-dir (str "/r" instance-restart-number)
+          {:keys [pod-name restart-number]} (unpack-instance-id instance-id)
+          instance-base-dir (str "/r" restart-number)
          browse-path (if (string/blank? browse-path)
                        "/"
                        browse-path)
          browse-path (cond-> browse-path
                        (not (string/ends-with? browse-path "/"))
                        (str "/")
                        (not (string/starts-with?
browse-path "/"))
                        (->> (str "/")))
-          target-url (str scheme "://" host ":" port instance-base-dir browse-path)]
-      (when port
+          run-as-user (-> service-id
+                          service-id->service-description-fn
+                          service-description->namespace)
+          pod (get-in @watch-state [:service-id->pod-id->pod service-id pod-name])
+          target-url (if pod
+                       ;; the pod is live: try accessing logs through the fileserver sidecar
+                       (when port
+                         (str scheme "://" host ":" port instance-base-dir browse-path))
+                       ;; the pod is not live: try accessing logs through S3
+                       (when log-bucket-url
+                         (str log-bucket-url "/" run-as-user "/" service-id "/"
+                              pod-name instance-base-dir browse-path)))]
+      (when target-url
        (ss/try+
          (let [result (http-utils/http-request
                         http-client
@@ -586,7 +605,9 @@
              :syncer (retrieve-syncer-state-fn)})
 
   (validate-service
     [_ service-id]
-    (let [{:strs [run-as-user]} (service-id->service-description-fn service-id)]
+    (let [run-as-user (-> service-id
+                          service-id->service-description-fn
+                          service-description->namespace)]
      (authz/check-user authorizer run-as-user service-id))))
 
 (defn default-replicaset-builder
@@ -597,19 +618,32 @@
   {:strs [backend-proto cmd cpus grace-period-secs health-check-interval-secs
           health-check-max-consecutive-failures mem min-instances ports run-as-user]
    :as service-description}
-   {:keys [default-container-image] :as context}]
+   {:keys [default-container-image log-bucket-url] :as context}]
  (let [work-path (str "/home/" run-as-user)
        home-path (str work-path "/latest")
        base-env (scheduler/environment service-id service-description
                                        service-id->password-fn home-path)
+        ;; We include the default log-bucket-sync-secs value in the total-sigkill-delay-secs
+        ;; delay iff the log-bucket-url setting was given in the scheduler config.
+        log-bucket-sync-secs (if log-bucket-url (:log-bucket-sync-secs context) 0)
+        total-sigkill-delay-secs (+ pod-sigkill-delay-secs log-bucket-sync-secs)
        ;; Make $PORT0 value pseudo-random to ensure clients can't hardcode it.
        ;; Helps maintain compatibility with Marathon, where port assignment is dynamic.
        port0 (-> service-id hash (mod 100) (* 10) (+ pod-base-port))
        env (into [;; We set these two "MESOS_*" variables to improve interoperability.
                   ;; New clients should prefer using WAITER_SANDBOX.
                   {:name "MESOS_DIRECTORY" :value home-path}
-                   {:name "MESOS_SANDBOX" :value home-path}]
+                   {:name "MESOS_SANDBOX" :value home-path}
+                   ;; Number of seconds to wait after receiving a SIGTERM
+                   ;; before sending a SIGKILL to the user's process.
+                   ;; This is handled by the waiter-init script,
+                   ;; separately from the pod's grace period,
+                   ;; in order to provide extra time for logs to sync to an S3 bucket.
+                   {:name "WAITER_GRACE_SECS" :value (str pod-sigkill-delay-secs)}]
            (concat
+              (when log-bucket-url
+                [{:name "WAITER_LOG_BUCKET_URL"
+                  :value (str log-bucket-url "/" run-as-user "/" service-id)}])
              (for [[k v] base-env]
                {:name k :value v})
              (for [i (range ports)]
@@ -663,7 +697,7 @@
            :workingDir work-path}]
          :volumes [{:name "user-home"
                     :emptyDir {}}]
-          :terminationGracePeriodSeconds pod-sigkill-delay-secs}}}}
+          :terminationGracePeriodSeconds total-sigkill-delay-secs}}}}
      ;; Optional fileserver sidecar container
      (integer?
(:port fileserver)) (update-in @@ -672,8 +706,7 @@ (let [{:keys [cmd image port] {:keys [cpu mem]} :resources} fileserver memory (str mem "Mi")] {:command cmd - :env [{:name "WAITER_FILESERVER_PORT" - :value (str port)}] + :env [{:name "WAITER_FILESERVER_PORT" :value (str port)}] :image image :imagePullPolicy "IfNotPresent" :name "waiter-fileserver" @@ -861,10 +894,11 @@ (defn kubernetes-scheduler "Returns a new KubernetesScheduler with the provided configuration. Validates the configuration against kubernetes-scheduler-schema and throws if it's not valid." - [{:keys [authentication authorizer cluster-name http-options max-patch-retries max-name-length - pod-base-port pod-sigkill-delay-secs pod-suffix-length replicaset-api-version replicaset-spec-builder - scheduler-name scheduler-state-chan scheduler-syncer-interval-secs service-id->service-description-fn - service-id->password-fn url start-scheduler-syncer-fn watch-retries] + [{:keys [authentication authorizer cluster-name http-options log-bucket-sync-secs log-bucket-url + max-patch-retries max-name-length pod-base-port pod-sigkill-delay-secs pod-suffix-length + replicaset-api-version replicaset-spec-builder scheduler-name scheduler-state-chan + scheduler-syncer-interval-secs service-id->service-description-fn service-id->password-fn + url start-scheduler-syncer-fn watch-retries] {fileserver-port :port fileserver-scheme :scheme :as fileserver} :fileserver}] {:pre [(schema/contains-kind-sub-map? authorizer) (or (nil? fileserver-port) @@ -873,6 +907,8 @@ (re-matches #"https?" fileserver-scheme) (utils/pos-int? (:socket-timeout http-options)) (utils/pos-int? (:conn-timeout http-options)) + (and (number? log-bucket-sync-secs) (<= 0 log-bucket-sync-secs 300)) + (or (nil? log-bucket-url) (some? (io/as-url log-bucket-url))) (utils/non-neg-int? max-patch-retries) (utils/pos-int? max-name-length) (not (string/blank? cluster-name)) @@ -896,13 +932,16 @@ (utils/assoc-if-absent :user-agent "waiter-k8s") http-utils/http-client-factory) service-id->failed-instances-transient-store (atom {}) + replicaset-spec-builder-ctx (assoc replicaset-spec-builder + :log-bucket-sync-secs log-bucket-sync-secs + :log-bucket-url log-bucket-url) replicaset-spec-builder-fn (let [f (-> replicaset-spec-builder :factory-fn utils/resolve-symbol deref)] (assert (fn? f) "ReplicaSet spec function must be a Clojure fn") (fn [scheduler service-id service-description] - (f scheduler service-id service-description replicaset-spec-builder))) + (f scheduler service-id service-description replicaset-spec-builder-ctx))) watch-options (cond-> default-watch-options (some? 
watch-retries) (assoc :watch-retries watch-retries)) @@ -928,6 +967,7 @@ daemon-state fileserver http-client + log-bucket-url max-patch-retries max-name-length pod-base-port diff --git a/waiter/src/waiter/settings.clj b/waiter/src/waiter/settings.clj index 812383269..e50207f2c 100644 --- a/waiter/src/waiter/settings.clj +++ b/waiter/src/waiter/settings.clj @@ -176,6 +176,8 @@ (log/info "reading settings from file:" config-file-path) (let [edn-readers {:readers {'config/regex (fn [expr] (re-pattern expr)) 'config/env #(env % config-file-path) + 'config/env-default (fn [[var-name default]] + (or (System/getenv var-name) default)) 'config/env-int #(Integer/parseInt (env % config-file-path))}} settings (edn/read-string edn-readers (slurp config-file-path))] (log/info "configured settings:\n" (with-out-str (clojure.pprint/pprint settings))) @@ -299,7 +301,7 @@ :mesos-slave-port 5051 :search-interval-days 10} :kubernetes {; Default values are not provided below for the following keys: - ; :authentication [:fileserver :port] :url + ; :authentication [:fileserver :port] :log-bucket-url :url :factory-fn 'waiter.scheduler.kubernetes/kubernetes-scheduler :authorizer {:kind :default :default {:factory-fn 'waiter.authorization/noop-authorizer}} @@ -310,6 +312,7 @@ :scheme "http"} :http-options {:conn-timeout 10000 :socket-timeout 10000} + :log-bucket-sync-secs 180 :max-patch-retries 5 :max-name-length 63 :pod-base-port 31000 diff --git a/waiter/src/waiter/util/client_tools.clj b/waiter/src/waiter/util/client_tools.clj index 442464ff2..a1490087b 100644 --- a/waiter/src/waiter/util/client_tools.clj +++ b/waiter/src/waiter/util/client_tools.clj @@ -484,6 +484,11 @@ [waiter-url & {:keys [verbose] :or {verbose false}}] (setting waiter-url [:scheduler-config :marathon :url] :verbose verbose)) +(defn k8s-log-bucket-url + "Returns the S3 bucket url for K8s service log backups" + [waiter-url & {:keys [verbose] :or {verbose false}}] + (setting waiter-url [:scheduler-config :kubernetes :log-bucket-url] :verbose verbose)) + (defn num-tasks-running [waiter-url service-id & {:keys [verbose prev-tasks-running] :or {verbose false prev-tasks-running -1}}] (let [http-options {:conn-timeout 10000, :socket-timeout 10000, :spnego-auth use-spnego} marathon-url (marathon-url waiter-url :verbose verbose) diff --git a/waiter/test/waiter/scheduler/kubernetes_test.clj b/waiter/test/waiter/scheduler/kubernetes_test.clj index d433583c8..de3c4bd8d 100644 --- a/waiter/test/waiter/scheduler/kubernetes_test.clj +++ b/waiter/test/waiter/scheduler/kubernetes_test.clj @@ -51,6 +51,8 @@ :cluster-name "waiter" :fileserver {:port 9090 :scheme "http"} + :log-bucket-sync-secs 60 + :log-bucket-url nil :max-patch-retries 5 :max-name-length 63 :pod-base-port 8080 @@ -664,11 +666,12 @@ (deftest test-retrieve-directory-content (let [service-id "test-service-id" - instance-id "test-service-instance-id-0" + instance-id "test-service-id.instance-id-0" instance-base-dir "/r0" + pod-name "instance-id" host "host.local" path "/some/path/" - dummy-scheduler (make-dummy-scheduler [service-id]) + {:keys [watch-state] :as dummy-scheduler} (make-dummy-scheduler [service-id]) port (get-in dummy-scheduler [:fileserver :port]) make-file (fn [file-name size] {:url (str "http://" host ":" port instance-base-dir path file-name) @@ -695,6 +698,7 @@ (make-dir "x") (make-dir "y") (make-dir "z")]}]] + (swap! 
watch-state assoc-in [:service-id->pod-id->pod service-id pod-name] ::pod) (doseq [{:keys [description expected]} inputs] (testing description (let [actual (with-redefs [hu/http-request (constantly (strip-links expected))] @@ -725,6 +729,8 @@ :watch-state (atom nil) :http-options {:conn-timeout 10000 :socket-timeout 10000} + :log-bucket-sync-secs 60 + :log-bucket-url nil :max-patch-retries 5 :max-name-length 63 :pod-base-port 8080
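
As a quick end-to-end sanity check of the changes above, something like the following shell session should work against the local cloudserver instance (the user, service-id, and pod name below are hypothetical; the URL layout mirrors what waiter-init and retrieve-directory-content construct):

    # Start the S3 test server and create the public waiter-service-logs bucket
    waiter/bin/ci/s3-server-setup.sh

    # Point the Kubernetes scheduler at the bucket (read via #config/env-default)
    export WAITER_S3_BUCKET=http://localhost:8888/waiter-service-logs

    # After a pod is deleted, its logs should be browsable in the bucket under
    # <run-as-user>/<service-id>/<pod-name>/rN/, e.g.:
    curl "$WAITER_S3_BUCKET/testuser/my-service-abc123/my-service-abc123-xyz99-0/r0/index.json"
    curl "$WAITER_S3_BUCKET/testuser/my-service-abc123/my-service-abc123-xyz99-0/r0/stdout"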