Skip to content
Permalink
Browse files

Use compute-clusters in config (#1160)

  • Loading branch information...
pschorf authored and dposada committed Jun 25, 2019
1 parent f55b899 commit bb5e44268ce589f1191b4f06aff8db546dd23518
@@ -214,8 +214,8 @@ def test_compute_cluster(self):
# For now, we only have one compute cluster. This could be wrong with future
# refactors, but for now, this is the only compute cluster we have, so we should
# expect it.
expected_compute_cluster = settings_dict['mesos-compute-cluster-name']
expected_mesos_framework = settings_dict['mesos-framework-id']
expected_compute_cluster = settings_dict['compute-clusters'][0]['config']['compute-cluster-name']
expected_mesos_framework = settings_dict['compute-clusters'][0]['config']['framework-id']
job_uuid, resp = util.submit_job(self.cook_url)

try:
@@ -325,7 +325,7 @@ def retrieve_mesos_url(varname='MESOS_PORT', value='5050'):
if os.getenv('COOK_TEST_DERIVE_MESOS_HOST') is not None:
cook_url = retrieve_cook_url()
_wait_for_cook(cook_url)
mesos_master = settings(cook_url).get('mesos-master')
mesos_master = settings(cook_url)['compute-clusters'][0]['config'].get('master')
if not mesos_master:
raise RuntimeError('Unable to derive Mesos host, mesos-master is not present in settings')

@@ -16,6 +16,12 @@
:admins #{"root" "travis"}
;; users that are allowed to do things on behalf of others
:impersonators #{"poser" "travis"}}
:compute-clusters [{:factory-fn cook.mesos.mesos-compute-cluster/factory-fn
:config {:framework-id #config/env "COOK_FRAMEWORK_ID"
:compute-cluster-name "local-mesos"
:master #config/env "MINIMESOS_ZOOKEEPER"
:failover-timeout-ms nil
:role "cook"}}]
:container-defaults {:volumes [{:host-path "/tmp/cook-integration-mount"
:container-path "/mnt/cook-integration"
:mode "RW"}]}
@@ -58,11 +64,7 @@
:max-preemption 500.0
:min-dru-diff 1.0
:safe-dru-threshold 1.0}
:mesos {:master #config/env "MINIMESOS_ZOOKEEPER"
:failover-timeout-ms nil
:leader-path "/cook-scheduler"
:role "cook"
:framework-id #config/env "COOK_FRAMEWORK_ID"}
:mesos {:leader-path "/cook-scheduler"}
:executor {:command #config/env "COOK_EXECUTOR_COMMAND"
:environment {"EXECUTOR_DEFAULT_PROGRESS_OUTPUT_NAME" "stdout"}
:portion #config/env-int-default ["COOK_EXECUTOR_PORTION" 0]}
@@ -15,6 +15,12 @@
:container-defaults {:volumes [{:host-path "/tmp/cook-integration-mount"
:container-path "/mnt/cook-integration"
:mode "RW"}]}
:mesos {:leader-path "/cook-scheduler"}
:compute-clusters [{:factory-fn cook.mesos.mesos-compute-cluster/factory-fn
:config {:failover-timeout-ms nil ; When we close the instance of Cook, all its tasks are killed by Mesos
:master #config/env "MESOS_MASTER"
:framework-id #config/env "COOK_FRAMEWORK_ID"
:compute-cluster-name "local-mesos"}}]
:cors-origins ["https?://cors.example.com"]
:data-local {:fitness-calculator {:cache-ttl-ms 60000
:cost-endpoint #config/env "DATA_LOCAL_ENDPOINT"
@@ -45,10 +51,6 @@
:default :info}}
:metrics {:jmx true
:user-metrics-interval-seconds 60}
:mesos {:failover-timeout-ms nil ; When we close the instance of Cook, all its tasks are killed by Mesos
:framework-id #config/env "COOK_FRAMEWORK_ID"
:leader-path "/cook-scheduler"
:master #config/env "MESOS_MASTER"}
:nrepl {:enabled? true
:port #config/env-int "COOK_NREPL_PORT"}
:pools {:default "gamma"}
@@ -261,11 +261,11 @@
:configurator configure-jet-logging
:max-threads 200
:request-header-size 32768}
server-port (assoc :port server-port)
server-https-port (assoc :ssl-port server-https-port)
server-keystore-pass (assoc :key-password server-keystore-pass)
server-keystore-path (assoc :keystore server-keystore-path)
server-keystore-type (assoc :keystore-type server-keystore-type)))]
server-port (assoc :port server-port)
server-https-port (assoc :ssl-port server-https-port)
server-keystore-pass (assoc :key-password server-keystore-pass)
server-keystore-path (assoc :keystore server-keystore-path)
server-keystore-type (assoc :keystore-type server-keystore-type)))]
(fn [] (.stop jetty))))
; If the framework id was not found in the configuration settings, we attempt reading it from
; ZooKeeper. The read from ZK is present for backwards compatibility (the framework id used to
@@ -285,18 +285,14 @@
sandbox-syncer-state
settings
trigger-chans]
(let [constructor (util/lazy-load-var 'cook.mesos.mesos-compute-cluster/->MesosComputeCluster)
create-mesos-compute-cluster (fn [compute-cluster-name framework-id db-id driver-atom]
(constructor compute-cluster-name
framework-id
db-id
driver-atom
sandbox-syncer-state
exit-code-syncer-state
mesos-heartbeat-chan
trigger-chans))]
((util/lazy-load-var 'cook.mesos.mesos-compute-cluster/setup-compute-cluster-map-from-config) datomic/conn settings
create-mesos-compute-cluster)))
(doall (map (fn [{:keys [factory-fn config]}]
(let [resolved (util/lazy-load-var factory-fn)]
(log/info "Calling compute cluster factory fn" factory-fn "with config" config)
(resolved config {:exit-code-syncer-state exit-code-syncer-state
:mesos-heartbeat-chan mesos-heartbeat-chan
:sandbox-syncer-state sandbox-syncer-state
:trigger-chans trigger-chans})))
(:compute-clusters settings))))
:mesos-datomic-mult (fnk []
(first ((util/lazy-load-var 'cook.datomic/create-tx-report-mult) datomic/conn)))
; TODO(pschorf): Remove heartbeat support
@@ -44,9 +44,11 @@
"Create a missing compute-cluster for one that's not yet in the database."
[conn compute-cluster]
(log/info "Installing a new compute cluster in datomic for " compute-cluster)
@(d/transact
conn
[(assoc compute-cluster :db/id (d/tempid :db.part/user))]))
(let [tempid (d/tempid :db.part/user)
result @(d/transact
conn
[(assoc compute-cluster :db/id tempid)])]
(d/resolve-tempid (d/db conn) (:tempids result) tempid)))

; Internal variable
(def cluster-name->compute-cluster-atom (atom {}))
@@ -77,8 +79,10 @@
TODO: Will want this to be configurable when we support multiple mesos clusters."
[]
{:post [%]} ; Never returns nil.
(-> config/config
:settings
:mesos-compute-cluster-name
compute-cluster-name->ComputeCluster))
(let [first-cluster-name (->> config/config
:settings
:compute-clusters
(map (fn [{:keys [config]}] (:compute-cluster-name config)))
first)]
(compute-cluster-name->ComputeCluster first-cluster-name)))

@@ -139,6 +139,19 @@
{:max-size 5000
:ttl-ms (* 60 1000)}
agent-query-cache))
:compute-clusters (fnk [[:config {compute-clusters []}
                         {mesos nil}]]
                    ; Explicitly configured compute clusters are returned untouched. When none
                    ; are configured, a single mesos compute cluster entry is synthesized from
                    ; the legacy top-level :mesos map.
                    (if (empty? compute-clusters)
                      (let [cluster-name (or (:compute-cluster-name mesos)
                                             "default-compute-cluster-from-config-defaulting")
                            ; Note: the legacy :failover-timeout-ms value is handed to the
                            ; factory under the :failover-timeout key.
                            legacy-config {:compute-cluster-name cluster-name
                                           :framework-id (:framework-id mesos)
                                           :master (:master mesos)
                                           :failover-timeout (:failover-timeout-ms mesos)
                                           :principal (:principal mesos)
                                           :role (:role mesos)
                                           :framework-name (:framework-name mesos)}]
                        [{:factory-fn 'cook.mesos.mesos-compute-cluster/factory-fn
                          :config legacy-config}])
                      compute-clusters))
:container-defaults (fnk [[:config {container-defaults {}}]]
container-defaults)
:cors-origins (fnk [[:config {cors-origins nil}]]
@@ -314,32 +327,21 @@
:good-enough-fitness (fnk [[:config {scheduler nil}]]
(when scheduler
(or (:good-enough-fitness scheduler) 0.8)))
:mesos-master (fnk [[:config {mesos nil}]]
(when (:master-hosts mesos)
(log/warn "The :master-hosts configuration field is no longer used"))
(when mesos
(:master mesos)))
:mesos-failover-timeout (fnk [[:config {mesos nil}]]
(:failover-timeout-ms mesos))
; TODO(pschorf): Rename
:mesos-leader-path (fnk [[:config {mesos nil}]]
(:leader-path mesos))
:mesos-principal (fnk [[:config {mesos nil}]]
(:principal mesos))
:mesos-role (fnk [[:config {mesos nil}]]
(when mesos
(or (:role mesos) "*")))
; TODO(pschorf): Rename
:mesos-run-as-user (fnk [[:config {mesos nil}]]
(when (:run-as-user mesos)
(log/warn "Tasks launched in Mesos will ignore user specified in the job and run as" (:run-as-user mesos)))
(:run-as-user mesos))
:mesos-framework-name (fnk [[:config {mesos nil}]]
(when mesos
(or (:framework-name mesos) "Cook")))
:mesos-framework-id (fnk [[:config {mesos nil}]]
(:framework-id mesos))
:mesos-compute-cluster-name (fnk [[:config {mesos nil}]]
(or (:compute-cluster-name mesos) "default-compute-cluster-from-config-defaulting"))

; TODO(pschorf): Remove
:mesos-framework-id (fnk [[:config {mesos nil} {compute-clusters []}]]
                      ; Prefer the framework id from the legacy :mesos map; otherwise take it
                      ; from the first compute cluster whose :config contains :framework-id.
                      (let [legacy-framework-id (:framework-id mesos)]
                        (if legacy-framework-id
                          legacy-framework-id
                          (first (for [{cluster-config :config} compute-clusters
                                       :when (contains? cluster-config :framework-id)]
                                   (:framework-id cluster-config))))))
:jmx-metrics (fnk [[:config [:metrics {jmx false}]]]
(when jmx
((util/lazy-load-var 'cook.reporter/jmx-reporter))))
@@ -143,7 +143,7 @@
{:keys [hostname server-port server-https-port]} server-config
datomic-report-chan (async/chan (async/sliding-buffer 4096))

compute-cluster (mcc/mesos-cluster-hack)
compute-cluster (cc/get-default-cluster-for-legacy)
rebalancer-reservation-atom (atom {})
leader-selector (LeaderSelector.
curator-framework
@@ -174,7 +174,7 @@

(defrecord MesosComputeCluster [compute-cluster-name framework-id db-id driver-atom
sandbox-syncer-state exit-code-syncer-state mesos-heartbeat-chan
trigger-chans]
trigger-chans mesos-config]
cc/ComputeCluster
(compute-cluster-name [this]
compute-cluster-name)
@@ -192,12 +192,7 @@

(initialize-cluster [this pool->fenzo pool->offers-chan]
(let [settings (:settings config/config)
mesos-config (select-keys settings [:mesos-master
:mesos-failover-timeout
:mesos-principal
:mesos-role
:mesos-framework-name
:gpu-enabled?])

progress-config (:progress settings)
conn cook.datomic/conn
{:keys [match-trigger-chan progress-updater-trigger-chan]} trigger-chans
@@ -250,77 +245,63 @@
framework-id]}
(let [query-result
(d/q '[:find [?c]
:in $ ?cluster-name? ?mesos-id?
:in $ ?cluster-name? ?framework-id?
:where
[?c :compute-cluster/type :compute-cluster.type/mesos]
[?c :compute-cluster/cluster-name ?cluster-name?]
[?c :compute-cluster/mesos-framework-id ?framework-id?]]
unfiltered-db compute-cluster-name framework-id)]
(first query-result)))

; Internal method.
(defn get-mesos-compute-cluster
  "Handles a single mesos cluster specification (a map with :compute-cluster-name and
  :framework-id). Looks up the matching compute-cluster entity in datomic, writes it if the
  lookup finds nothing, and then invokes mesos-compute-cluster-factory with the cluster name,
  the framework id, the entity id and an atom holding the driver.
  Warning: Not idempotent. Only call once.

  The 3-arity form passes nil as the driver; the driver argument exists for unit tests."
  ([conn mesos-compute-cluster-factory mesos-cluster]
   (get-mesos-compute-cluster conn mesos-compute-cluster-factory mesos-cluster nil))
  ([conn mesos-compute-cluster-factory {:keys [compute-cluster-name framework-id] :as mesos-cluster} driver]
   {:pre [compute-cluster-name
          framework-id]}
   (let [existing-entity-id (get-mesos-cluster-entity-id (d/db conn) mesos-cluster)
         entity-id (or existing-entity-id
                       (do
                         ; Not in the database yet: install it, then re-query for its entity id.
                         (cc/write-compute-cluster conn (mesos-cluster->compute-cluster-map-for-datomic mesos-cluster))
                         (get-mesos-cluster-entity-id (d/db conn) mesos-cluster)))]
     (mesos-compute-cluster-factory compute-cluster-name
                                    framework-id
                                    entity-id
                                    (atom driver)))))

(defn- get-mesos-clusters-from-config
  "Returns the mesos clusters described by the settings, as a vector of cluster
  specification maps (currently always exactly one).

  In config.edn, all of the mesos keys live under one toplevel dictionary, e.g.:
     {:failover-timeout-ms nil
      :framework-id #config/env \"COOK_FRAMEWORK_ID\"
      :master #config/env \"MESOS_MASTER\"
      ...
     }
  config.clj, however, splits that dictionary into many separate toplevel settings
  (:mesos-master, :mesos-framework-id, ...). This function reassembles the pieces it
  needs: it reads :mesos-compute-cluster-name and :mesos-framework-id from the settings
  and returns them as {:compute-cluster-name ... :framework-id ...}. Long term, config.clj
  should stop doing that split, at which point this function probably no longer needs
  to exist."
  [settings]
  (let [{cluster-name :mesos-compute-cluster-name
         framework-id :mesos-framework-id} settings]
    [{:compute-cluster-name cluster-name
      :framework-id framework-id}]))


(defn setup-compute-cluster-map-from-config
  "Builds and registers a compute cluster for every mesos cluster found in the settings.
  Each cluster specification is turned into a compute cluster via get-mesos-compute-cluster
  (using create-mesos-compute-cluster as the factory) and then passed to
  cc/register-compute-cluster!. Returns the fully realized sequence of registration results."
  [conn settings create-mesos-compute-cluster]
  (let [build-cluster #(get-mesos-compute-cluster conn create-mesos-compute-cluster %)
        cluster-specs (get-mesos-clusters-from-config settings)
        built-clusters (map build-cluster cluster-specs)
        registered-clusters (map cc/register-compute-cluster! built-clusters)]
    ; Force the lazy pipeline so creation and registration happen now.
    (doall registered-clusters)))


; A hack to store the mesos cluster, until we refactor the code so that we support multiple clusters. In the long term future
; this is probably replaced with a function from driver->cluster-id, or the cluster name is propagated by function arguments and
; closed over.
(defn mesos-cluster-hack
  "Stop-gap accessor for the single mesos compute cluster: reads :mesos-compute-cluster-name
  from the :settings of config/config and resolves it with
  cc/compute-cluster-name->ComputeCluster. Intended to go away once the cluster is propagated
  through function arguments and closed over."
  []
  {:post [%]} ; A nil result fails the postcondition.
  (let [settings (:settings config/config)
        cluster-name (:mesos-compute-cluster-name settings)]
    (cc/compute-cluster-name->ComputeCluster cluster-name)))
(defn get-or-create-cluster-entity-id
  "Looks in datomic for a mesos compute cluster with the given name and framework-id and
  returns its entity id when found. When no such cluster exists, installs one through
  cc/write-compute-cluster and returns that call's result (expected to be the entity id of
  the newly written cluster)."
  [conn compute-cluster-name framework-id]
  (let [cluster-spec {:compute-cluster-name compute-cluster-name
                      :framework-id framework-id}]
    (or (get-mesos-cluster-entity-id (d/db conn) cluster-spec)
        (cc/write-compute-cluster conn (mesos-cluster->compute-cluster-map-for-datomic cluster-spec)))))

(defn factory-fn
  "Constructs a new MesosComputeCluster, registers it with cc/register-compute-cluster!,
  and returns it.

  First argument: the compute cluster's config map. Recognized keys:
    :compute-cluster-name, :framework-id - identify the cluster; also used to look up (or
                                           create) its entity in datomic
    :master                              - stored as :mesos-master
    :failover-timeout-ms                 - stored as :mesos-failover-timeout; this is the key
                                           name used in the config files
    :failover-timeout                    - older spelling of the same setting; takes precedence
                                           when both are present
    :principal                           - stored as :mesos-principal
    :role                                - stored as :mesos-role, defaults to \"*\"
    :framework-name                      - stored as :mesos-framework-name, defaults to \"Cook\"
    :gpu-enabled?                        - stored as :gpu-enabled?

  Second argument: a map of shared runtime state (:exit-code-syncer-state,
  :mesos-heartbeat-chan, :sandbox-syncer-state, :trigger-chans) that is passed straight
  through to the MesosComputeCluster record.

  Any Throwable raised during construction or registration is logged and rethrown."
  [{:keys [compute-cluster-name
           framework-id
           master
           failover-timeout
           failover-timeout-ms
           principal
           role
           framework-name
           gpu-enabled?]}
   {:keys [exit-code-syncer-state
           mesos-heartbeat-chan
           sandbox-syncer-state
           trigger-chans]}]
  (try
    (let [conn cook.datomic/conn
          cluster-entity-id (get-or-create-cluster-entity-id conn compute-cluster-name framework-id)
          ; Config files spell this setting :failover-timeout-ms, while the defaulting code in
          ; config.clj passes :failover-timeout. Honor both so a configured value is never
          ; silently dropped.
          effective-failover-timeout (or failover-timeout failover-timeout-ms)
          mesos-config {:mesos-master master
                        :mesos-failover-timeout effective-failover-timeout
                        :mesos-principal principal
                        :mesos-role (or role "*")
                        :mesos-framework-name (or framework-name "Cook")
                        :gpu-enabled? gpu-enabled?}
          mesos-compute-cluster (->MesosComputeCluster compute-cluster-name
                                                       framework-id
                                                       cluster-entity-id
                                                       (atom nil) ; driver-atom, starts empty
                                                       sandbox-syncer-state
                                                       exit-code-syncer-state
                                                       mesos-heartbeat-chan
                                                       trigger-chans
                                                       mesos-config)]
      (log/info "Registering compute cluster" mesos-compute-cluster)
      (cc/register-compute-cluster! mesos-compute-cluster)
      mesos-compute-cluster)
    (catch Throwable t
      ; Log here so the failure is visible even if the caller swallows it, then propagate.
      (log/error t "Failed to construct mesos compute cluster")
      (throw t))))

0 comments on commit bb5e442

Please sign in to comment.
You can’t perform that action at this time.