This repository has been archived by the owner on Mar 22, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
config-full.edn
1235 lines (1025 loc) · 80.4 KB
/
config-full.edn
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
; ========== Waiter Configuration ==========
;
; NOTE: Waiter comes with reasonable defaults for most settings. We have added some hints and
; reminders for the various settings to try and clarify the effects of making changes.
; Nevertheless, tread lightly if you are tweaking and tuning the Waiter configuration.
;
; ---------- Custom Extensions ----------
;
; Several of the configuration settings below allow for extending Waiter by providing custom
; implementations of a protocol. For example, if you want to have a custom way of providing
; entitlements in Waiter, you might specify:
;
; {
; ; ...
; :entitlement-config {:kind :custom
; :custom {:factory-fn waiter-extensions.authz/my-entitlement-manager
; :foo "bar"
; :baz "qux"}}
; ; ...
; }
;
; Which would invoke your custom implementation of the EntitlementManager protocol:
;
; (ns waiter-extensions.authz
; (:require [waiter.authorization :as authz]))
;
; (defrecord MyEntitlementManager [foo baz]
; authz/EntitlementManager
;
; (authorized? [this subject action resource]
; ; your custom entitlements code goes here
; ))
;
; (defn my-entitlement-manager [{:keys [foo baz]}]
; (MyEntitlementManager. foo baz))
;
{
; ---------- Cluster ----------
;; To be considered part of the same cluster, Waiter routers need to:
;; 1. Have the same :cluster-config :name
;; 2. Have the same :zookeeper :base-path and :discovery-relative-path to allow computing router endpoints
;; 3. Have the same :zookeeper :base-path and :leader-latch-relative-path to participate in leadership election
:cluster-config {;; Determines if Waiter supports requests bypassing the waiter routers.
:bypass-supported? false
;; Number of routers required in order for leader election to take place;
;; it's a good idea to set this to an odd number:
:min-routers 3
;; Use a descriptive name for your cluster;
;; this name is also used in the token metadata as the root to identify the
;; cluster in which a token was first created.
;; It's best to keep this unique across clusters:
:name "my-waiter-cluster"
;; Use a descriptive prefix for services running in your cluster;
;; Waiter uses this prefix to identify services to manage in a given cluster.
;; It's best to keep this unique across clusters:
:service-prefix "waiter-c1-"}
;; The principal to use for requests, e.g. async request status checks, originating in Waiter.
;; It is expected to be of the format name@domain.
:waiter-principal "waiter@example.com"
:zookeeper {
;; The root path to which Waiter will write data:
:base-path "/waiter"
;; Valid values are:
;; - a valid ZooKeeper connect string (example below)
;; - :in-process, which simply uses an in-process ZK (not for production use)
:connect-string "zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181"
;; Retry policy that retries a set number of times with an increasing
;; (up to a maximum bound) sleep time between retries
:curator-retry-policy {
;; Initial amount of time (milliseconds) to wait between retries
:base-sleep-time-ms 100
;; Maximum number of times to retry
:max-retries 10
;; Maximum amount of time (milliseconds) to wait between retries
:max-sleep-time-ms 120000}
;; The path used for service discovery:
:discovery-relative-path "discovery"
;; The path used for service garbage collection state:
:gc-relative-path "gc-state"
;; The path used for leader election:
:leader-latch-relative-path "leader-latch"
;; The timeout for acquiring a mutex
:mutex-timeout-ms 1000}
;; Waiter starts a loop to query for the list of currently running routers:
:router-syncer {
;; The amount of time (milliseconds) to wait before starting the loop:
:delay-ms 750
;; The interval (milliseconds) on which to query:
:interval-ms 1500}
; ---------- Node ----------
; Use a descriptive name for this router; this is useful in multi-router
; scenarios for identifying which router handled a request:
:router-id-prefix "router-1"
; ---------- Network ----------
;; Set the bind address to a specific IP:
:host "192.168.0.1"
;; Set the port:
:port 9091
;; The DNS name for the Waiter cluster, which can be:
;; - a non-empty string
;; - a non-empty vector of strings, in which case the first element is the primary hostname
:hostname ["waiter.example.com"
"waiter-alternate.example.com"]
;; The configuration for the Waiter server that will be launched.
;; Unless specified otherwise, all the entries below are optional:
:server-options {;; The timeout in ms for blocking I/O operations, default 15 minutes (in millis)
:blocking-timeout 900000
;; The buffer size used while draining request bytes before emitting a response, the default is 16384.
;; Draining request bytes is necessary for clients that are sensitive to the request being streamed completely before receiving a response.
;; Waiter, conservatively, drains all request bytes before emitting the response line.
:drain-request-buffer-size 16384
;; Whether or not to enable secure HTTP2 transport, default false
:http2? false
;; Whether or not to enable HTTP2 cleartext transport, default true
:http2c? true
;; When present, provides the base url for images used in error responses
:images-url "http://cdn.waiter.com/images"
;; When SSL is enabled, the keystore to use for SSL connections
:keystore "/path/to/keystore.p12"
;; When SSL is enabled, the scanning interval to detect changes to and reload the keystore
:keystore-scan-interval-secs 43200
;; When SSL is enabled, the format of the keystore
:keystore-type "pkcs12"
;; When SSL is enabled, the password to the keystore
:key-password "keystore-password"
;; The maximum number of threads to use (default 200)
:max-threads 200
;; The maximum size in bytes of the request header.
;; Larger headers will allow for more and/or larger cookies plus larger form content encoded in a URL.
;; However, larger headers consume more memory and can make a server more vulnerable to denial of service attacks.
:request-header-size 32768
;; The maximum size in bytes of the response header.
;; Larger headers will allow for more and/or larger cookies and longer HTTP headers (eg for redirection).
;; However, larger headers will also consume more memory.
:response-header-size 8192
;; Whether to send the Date header in responses
:send-date-header? false
;; The SSL port to listen on.
;; To disable SSL support, do not include this entry in the config.
;; Specifying a value for this port (which must be different from port above) enables SSL support.
:ssl-port 9443
;; When SSL is enabled, the truststore to use for SSL connections
:truststore "/path/to/truststore.p12"
;; When SSL is enabled, the format of the truststore
:truststore-type "pkcs12"
;; When SSL is enabled, the password to the truststore
:trust-password "truststore-password"}
; ---------- Token Storage ----------
:kv-config {
;; :kind :zk uses Apache ZooKeeper (https://zookeeper.apache.org/):
:kind :zk
:zk {
;; Custom implementations should specify a :factory-fn
;; that returns an instance of waiter.kv.KeyValueStore:
:factory-fn waiter.kv/new-zk-kv-store
;; How long (milliseconds) to await a response from ZK synchronize call:
:sync-timeout-ms 2000}
;; Alternatively, :kind :local simply uses an in-memory store
;; (not for production use, and obviously does not work with multiple Waiter routers):
;:kind :local
:local {:factory-fn waiter.kv/new-local-kv-store}
;; The path used for token storage:
:relative-path "tokens"
;; Whether or not to store the data encrypted:
:encrypt true
:cache {
;; The maximum number of elements in the cache before FIFO semantics apply:
:threshold 1000
;; The time (seconds) that entries are allowed to reside in the cache:
:ttl 60}}
; ---------- Security ----------
;; Waiter supports authenticating requests to identify the principal of the requesting user.
;; The principal is then used to validate access to services and also to launch services.
:authenticator-config {
;; set :kind to one of the configured authenticators
:kind :one-user
;; :kind :composite enables using the "authentication" service parameter to
;; specify an authentication scheme. If the authentication scheme is not known or not specified
;; then the :default-authentication-provider is used
;:kind :composite
:composite {:factory-fn waiter.auth.composite/composite-authenticator
:authentication-providers {"kerberos" {;; see the :kerberos entry for configuration options
:todo :config}}
;; The authentication scheme to use if one is not specified
:default-authentication-provider "kerberos"}
;; Waiter supports JWT access token-based authentication before trying the configured authenticator.
;; JWT authentication can be disabled by specifying a value of :disabled instead of configuring the map below:
;; :jwt :disabled
:jwt {;; Whether JWT authentication is allowed for Waiter API requests.
;; When not configured, the value is assumed to be true.
:allow-bearer-auth-api? true
;; Whether JWT authentication is allowed for service (proxy) requests.
;; When not configured, the value is assumed to be false.
:allow-bearer-auth-services? false
;; Whether OIDC+PKCE authentication is allowed for Waiter API requests.
;; When not configured, the value is assumed to be false.
:allow-oidc-auth-api? false
;; Whether OIDC+PKCE authentication is allowed for service (proxy) requests.
;; The value of USE_OIDC_AUTH environment variable of the service overrides this value.
;; When not configured, the value is assumed to be false.
:allow-oidc-auth-services? false
;; Whether or not to include the www-authenticate header on 401 responses
;; even if a Bearer token is not provided in the request authorization header.
;; When not configured, the value is assumed to be true.
:attach-www-authenticate-on-missing-bearer-token? true
;; JWT relies on periodically refreshing the list of available keys from a JWKS
:http-options {;; The HTTP options that will be used when accessing the authorization server:
:conn-timeout 10000
:socket-timeout 10000
:spnego-auth false}
;; The issuer constraint expected on the iss field of access token claims.
;; The field can be a string, a regex pattern or a vector consisting of any mix of string or patterns.
;; The constraints in a vector are disjunctive, any matching constraint will validate a JWT iss claim.
:issuer ["test.com" #config/regex "https://test.com/.*"]
;; url of the authorization server where the public keys (JWKS) are available:
:jwks-url "http://127.0.0.1:8040/jwks"
;; The maximum duration from current time allowed for expiry time on the JWT claim.
;; The value defaults to 24 hours, i.e. 86400000
:max-expiry-duration-ms 86400000
;; (optional) url of the authorization server authorize endpoint:
:oidc-authorize-uri "http://127.0.0.1:8040/authorize"
;; (optional) one of :relaxed or :strict
;; in relaxed mode, the Waiter auth cookie after OIDC auth has an expiry time of 1 day
;; in strict mode, the Waiter auth cookie after OIDC auth has an expiry time provided in the JWT access token
:oidc-default-mode :relaxed
;; (optional) maximum number of OIDC challenge cookies in the request beyond which
;; the OIDC auth flow will not be triggered on 401 responses.
;; The default value is 20
:oidc-num-challenge-cookies-allowed-in-request 20
;; (optional) the user-agent header substrings to determine if the request came from a browser.
;; Relies on the heuristic of inspecting common user-agents to determine if it is a browser.
;; The default value is #{"chrome" "mozilla"}
:oidc-redirect-user-agent-products #{"chrome" "mozilla"}
;; (optional) the same-site attribute configured on cookies emitted as result of OIDC auth
;; Valid values are "Lax", "None", "Strict" or nil. The default value is "None"
:oidc-same-site-attribute "None"
;; (optional) url of the authorization server get token endpoint:
:oidc-token-uri "http://127.0.0.1:8040/id-token"
;; The keyword used to retrieve the subject from the access token claims
:subject-key :sub
;; The regex used to validate the subject from the claims
:subject-regex #config/regex "([a-zA-Z0-9]+)@([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*\\.)+([a-zA-Z]{2,})$"
;; The supported algorithms while validating access tokens, must be subset of #{:eddsa :rs256}
:supported-algorithms #{:eddsa}
;; The token type expected in the access token header
:token-type "JWT"
;; The interval at which to refresh the keys from the authorization server:
:update-interval-ms 60000}
;; :kind :one-user allows anonymous access to services (for testing purposes only):
:one-user {;; Custom implementations should specify a :factory-fn
;; that returns an instance of waiter.auth.authentication.Authenticator:
:factory-fn waiter.auth.authentication/one-user-authenticator
;; the user account used to launch services
:run-as-user "launch-username"}
;; :kind :kerberos enables authentication using the Kerberos protocol
;:kind :kerberos
:kerberos {:factory-fn waiter.auth.kerberos/kerberos-authenticator
;; the maximum number of concurrent Kerberos library (GSSLibStub) invocations allowed
:concurrency-level 20
;; the idle time before cached threads from the invocation throttler are terminated
:keep-alive-mins 5
;; the maximum number of requests waiting for Kerberos auth before the server responds
;; with a 503 temporarily unavailable
:max-queue-length 1000}}
;; Waiter supports the run-as-requester feature to launch a service as the requesting user.
;; Triggering this feature without passing explicit headers requires providing an explicit consent and storing this in a cookie.
;; The explicit consent is required to defend against CSRF attacks.
;; consent-expiry-days is used to configure how long to remember this consent and honor the presence of the cookie.
:consent-expiry-days 90
;; CORS (https://developer.mozilla.org/en-US/docs/Web/HTTP/Access_control_CORS) governs whether a web application
;; running in the browser may make AJAX requests to a different web application. Configuring CORS will be required for
;; clients who wish to interact with Waiter via Javascript in a web browser. CORS is not required to make requests to
;; your own application running on Waiter (because they will have the same origin) or to communicate with Waiter
;; outside the browser.
:cors-config {
;; :kind :patterns takes a simple list of regular expressions (see below):
:kind :patterns
:patterns {
;; Custom implementations should specify a :factory-fn
;; that returns an instance of waiter.cors.CorsValidator:
:factory-fn waiter.cors/pattern-based-validator
;; List of regular expressions representing origins to allow:
:allowed-origins [#config/regex "^http://[^\\.]+\\.example\\.org(:80)?$"
#config/regex "^https://anotherapp.example.org:12345$"]}
;; :kind :allow-all allows all cross-origin requests
:allow-all {:factory-fn waiter.cors/allow-all-validator}
;; Headers exposed to CORS clients for Waiter API requests
;; read more at https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Expose-Headers
:exposed-headers ["etag"]
;; The value to use for the access-control-max-age header:
:max-age 3600
;; :kind :token-parameter uses allowed list that's set on the token
:token-parameter {:factory-fn waiter.cors/token-parameter-based-validator
:supports-token-parameter? true}}
:entitlement-config {
;; :kind :simple requires that the requesting user be the same as the run-as-user:
:kind :simple
:simple {
;; Custom implementations should specify a :factory-fn
;; that returns an instance of waiter.authorization.EntitlementManager:
:factory-fn waiter.authorization/->SimpleEntitlementManager}}
:password-store-config {
;; :kind :configured takes a simple list of passwords:
:kind :configured
:configured {
;; Custom implementations should specify a :factory-fn that
;; returns an instance of waiter.password-store.PasswordProvider:
:factory-fn waiter.password-store/configured-provider
;; List of passwords:
:passwords ["open-sesame"]}}
; ---------- Scheduling ----------
:scheduler-config {
;; :kind :marathon uses Marathon (https://mesosphere.github.io/marathon/) for scheduling instances:
:kind :marathon
;; :kind :composite uses a composite scheduler that in this example has two component
;; schedulers: kubernetes and marathon. The composite scheduler delegates to one of these
;; schedulers based on the "scheduler" service parameter.
;:kind :composite
:composite {
            ;; Custom implementations should specify a :factory-fn
            ;; that returns an instance of waiter.scheduler.ServiceScheduler:
            :factory-fn waiter.scheduler.composite/create-composite-scheduler
            ;; The component schedulers to which the scheduler operations will be delegated.
            ;; The scheduler parameter in the service description must match the scheduler keys
            ;; for one of the component schedulers, e.g. "marathon" or "kubernetes".
            :components {:marathon {;; a factory function must be provided
                                    ;; see the :marathon entry for other configuration options
                                    :factory-fn waiter.scheduler.marathon/marathon-scheduler
                                    :todo :additional-config}
                         :kubernetes {;; a factory function must be provided
                                      ;; see the :kubernetes entry for other configuration options
                                      :factory-fn waiter.scheduler.kubernetes/kubernetes-scheduler
                                      :todo :additional-config}}
            ;; The scheduler to use by default if the service does not specify one explicitly.
            ;; This is an optional config and can be nil. When provided it must match one of
            ;; the component schedulers that have been configured.
            :default-scheduler :marathon
            ;; The selection function used to determine a scheduler for a given service id can be configured.
            ;; This is an optional config and can be nil.
            :selector-context {;; a factory function may optionally be provided which accepts a context map
                               ;; and returns a function that takes a service-id as input and returns a scheduler.
                               ;; The symbol is written bare (no leading quote), like every other :factory-fn
                               ;; in this file: EDN defines no quote (') reader macro.
                               :factory-fn waiter.scheduler.composite/create-scheduler-parameter-based-selector
                               ;; additional name-value pairs may be provided as context to the factory function
                               :todo :additional-config}}
;; :kind :kubernetes uses Kubernetes (https://kubernetes.io/) for scheduling Waiter services and instances:
;:kind :kubernetes
:kubernetes {;; Custom implementations should specify a :factory-fn
;; that returns an instance of waiter.scheduler.ServiceScheduler:
:factory-fn waiter.scheduler.kubernetes/kubernetes-scheduler
;; Whether to always use authenticated health checks on pods readiness and liveness checks
;; regardless of the health-check-authentication parameter in the service description.
:authenticate-health-checks? false
;; Configuration for obtaining the credentials to authenticate with the Kubernetes API server.
;; If authentication isn't required (e.g., using kubectl proxy), then this can be omitted.
:authentication {;; Unary function which can be called periodically (or just once if :refresh-delay-mins is omitted)
;; to obtain an Authorization string for use when authenticating with the Kubernetes API server.
;; This map is passed to the function as its single `context` argument.
;; The function must return a map with an entry for :auth-token which maps
;; to a string that can be used as the value for the Authorization HTTP header.
;; The sample function given below simply returns the string from the
;; WAITER_K8S_AUTH_STRING environment variable.
:action-fn waiter.scheduler.kubernetes/authorization-from-environment
;; Number of minutes to wait between attempts to refresh the credentials.
;; If omitted, then the :action-fn value is only called once at Waiter startup.
:refresh-delay-mins 15}
:replicaset-spec-builder {;; Factory function which creates a Kubernetes ReplicaSet spec
;; (with an embedded Pod spec) for the given Waiter Service.
;; The referenced factory function takes the current Kubernetes scheduler,
;; the Waiter service-id, the Waiter service description, and this context map as arguments.
;; Since this map is passed as the context argument to the factory function,
;; any additional configuration data needed for the implementation should be included here.
:factory-fn waiter.scheduler.kubernetes/default-replicaset-builder
;; Vector of commands with which to prefix the user's cmd string when launching the container.
;; E.g., it may be necessary to delegate launching the user's command to a waiter-init script,
;; which is in turn launched by dumb-init for zombie process handling.
:container-init-commands ["waiter-k8s-init"]
;; The default factory function accepts an option for the docker container to use in the pod.
:default-container-image "twosigma/waiter-test-apps:latest"
;; The default namespace to use for Kubernetes objects created by Waiter
:default-namespace "waiter"
;; Map of image aliases. One of these aliases can be specified in the `image` field and the
;; resolved image will be used.
:image-aliases {"alias/sample-alias" "real/image"}
;; Optionally apply a pod anti-affinity strategy to prevent replicas of the same Waiter service
;; from being scheduled on the same Kubernetes node. This is helpful to avoid losing multiple
;; replicas when a node crashes or goes down for maintenance.
;; Valid values are :preferred, :required, or nil (default).
;; https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#inter-pod-affinity-and-anti-affinity
:pod-anti-affinity :preferred}
;; The authorizer is used by the scheduler to verify that a user has the proper permissions
;; to create services on the underlying scheduler platform.
;; We specify the default no-op authorizer here.
;; See the :marathon :authorizer settings for a full example.
:authorizer {:kind :default
:default {:factory-fn waiter.authorization/noop-authorizer}}
;; The :log-bucket-url setting is optional, but if it's non-nil, it should be the URL (string) of an S3 bucket
;; where log files should be copied when a waiter service pod is terminated.
;; When a :log-bucket-url is given, the pod waits up to an additional :log-bucket-sync-secs before terminating
;; (starting after :pod-sigkill-delay-secs or the user process's graceful exit).
;; The log-bucket-sync-secs option should be set between 0 and 300 seconds, inclusive (default is 180).
:log-bucket-sync-secs 180
:log-bucket-url "http://s3.example.com/waiter-service-logs"
;; String value used to annotate Kubernetes objects that are orchestrated by this Waiter instantiation:
:cluster-name "waiter"
;; Duration (seconds) to wait before marking a pod as expired when it hasn't transitioned to running state (default is 90).
;; Set the value to 0 to disable expiring instances and remove the duration constraint on pods
;; before they transition to the running state.
:container-running-grace-secs 90
;; The minimum age in seconds that a k8s object must be before events may be fetched
:fetch-events-k8s-object-minimum-age-secs 30
;; Configuration for creating and querying a sidecar container in each Waiter Service Instance's Kubernetes pod
;; running a server for directory listings (in JSON) and serving file contents.
;; Waiter expects the directory listings in the format returned by nginx's autoindex module when configured for JSON.
;; See the behavior of the default container image below for reference.
;; The sidecar container is only added to Waiter-managed pods if the :port number is set in this map,
;; otherwise, the file browsing API is disabled (always returning an empty array).
;; You must ensure that the fileserver port number does not intersect with the pod-base-port range.
:fileserver {:cmd ["/bin/fileserver-start"]
:image "twosigma/waiter-fileserver:latest"
:port 9090
:predicate-fn waiter.scheduler.kubernetes/fileserver-container-enabled?
:resources {:cpu 0.1 :mem 128}
:scheme "http"}
;; HTTP options that will be used when accessing the Kubernetes API:
:http-options {:conn-timeout 10000
:socket-timeout 10000}
;; Cache configuration for k8s-object-key->event-cache
;; This cache represents Kubernetes Events for ReplicaSets and Pods
:k8s-object-key->event-cache {;; max number of cached objects with events
:threshold 5000
;; time (seconds) to live for an object with events
:ttl 120}
;; Number of retry attempts to make when a PATCH request fails with a 409 Conflict:
:max-patch-retries 5
;; Maximum number of characters allowed for a Kubernetes object name:
:max-name-length 63
;; Target Kubernetes namespace for all Waiter-managed objects.
;; When nil, Waiter operates at the global scope with access to all namespaces.
:namespace nil
;; Port number used to set the $PORT0 (service port) for Waiter service instances.
;; Waiter generates a random offset in the range [0, 990] each time it creates a service,
;; adding that offset to the pod-base-port value to get that Waiter service's $PORT0 value.
;; If more than one port is requested, $PORT1 thru $PORT9 will be defined sequentially after $PORT0;
;; e.g., if pod-base-port is 31000, and the random offset was 390, then $PORT0==31390 and $PORT9==31399:
:pod-base-port 31000
:pod-bypass {;; The maximum amount of time before the SIGTERM signal will be sent in each container of the pod.
;; This will allow requests to drain during this period without killing any containers.
:force-sigterm-secs 120
;; The preStop command put on all containers that gets configured for pods of bypass services
;; This command prevents the SIGTERM signal from reaching the containers, or the containers from doing various cleanup work, while it runs.
;; preStop commands do not log anywhere by default in k8s, so append logs to the container's main process stdout
;; which is symlinked with "/proc/1/fd/1" https://github.com/kubernetes/kubernetes/issues/54247#issuecomment-827694001
:pre-stop-cmd ["/bin/sh"
"-c"
"echo $(date +%Y-%m-%dT%H:%M:%S.%3NZ) INFO preStop: sleeping for ${WAITER_CONFIG_BYPASS_FORCE_SIGTERM_SECS} secs >> /proc/1/fd/1 ; sleep ${WAITER_CONFIG_BYPASS_FORCE_SIGTERM_SECS} ; echo $(date +%Y-%m-%dT%H:%M:%S.%3NZ) INFO preStop: ending sleep, and container will receive SIGTERM shortly >> /proc/1/fd/1"]
;; The grace period that allows containers to handle SIGTERM before getting fully deleted.
:sigterm-grace-period-secs 30}
;; The number of seconds between the SIGTERM and SIGKILL signals sent to a pod on shutdown.
;; This value should be placed in the terminationGracePeriodSeconds field in the pod template.
;; Should be set between 0 and 300 seconds, inclusive.
:pod-sigkill-delay-secs 3
;; The number of characters (not including the preceding dash) in the unique suffix appended to each generated Pod name in a ReplicaSet.
;; Unless you've custom-patched your Kubernetes code, then this is probably hard-coded to 5 in the controller code.
;; https://github.com/kubernetes/kubernetes/blob/207e9d1/staging/src/k8s.io/apiserver/pkg/storage/names/generate.go#L45
:pod-suffix-length 5
:raven-sidecar {:cmd ["/opt/waiter/raven/bin/raven-start"]
;; environment variables used by the provided predicate functions
:env-vars {;; The default env vars configured in the Raven sidecar.
;; These may be overridden by user-provided env vars in the service description.
:defaults {}
;; flag env vars set to a "true"-like value enable raven,
;; "false"-like values disable raven
:flags ["RAVEN_ENABLED"]
;; configuring a raven feature implicitly enables raven
:features ["RAVEN_OVERRIDE_IMAGE"]
;; flag env vars for specifically toggling tls downstream on/off
;; (these do not enable the raven sidecar on their own)
:tls-flags ["RAVEN_FORCE_INGRESS_TLS"]}
:image "twosigma/waiter-raven"
:predicate-fn waiter.scheduler.kubernetes/raven-sidecar-opt-in?
:resources {:cpu 0.1 :mem 256}}
;; Version string used for creating and operating on ReplicaSet objects:
:replicaset-api-version "apps/v1"
;; The number of pod restarts at which the instance will start being treated as expired:
:restart-expiry-threshold 2
;; The number of pod restarts at which the pod will be killed:
:restart-kill-threshold 8
;; The deployment error message transform function for generating a user friendly message from a failing
;; kubernetes api request
:response->deployment-error-msg-fn waiter.scheduler.kubernetes/default-k8s-message-transform-fn
;; Cache configuration for service-id->deployment-error-cache
;; This cache represents deployment errors at the service level (e.g. k8s api errors)
:service-id->deployment-error-cache {;; max number of services with a deployment-error
:threshold 5000
;; time (seconds) to live for a service with deployment-error
:ttl 120}
;; Base URL for accessing the Kubernetes API server.
;; The URL used below is the default proxy URL bound to by the `kubectl proxy` command:
:url "http://localhost:8001"
;; Determines, in milliseconds, the timeout until a connection is established.
;; Defaults to 10 seconds.
:watch-connect-timeout-ms 10000
;; Determines, in milliseconds, how long to block scheduler initialization waiting for initial watch state.
;; When 0, the scheduler factory returns immediately (maybe without initial Kubernetes scheduler state knowledge).
;; Defaults to 2 minutes.
:watch-init-timeout-ms 120000
;; Number of times to retry a watch before falling back to a global state query.
;; These retries only apply to a closed watch connection, not to connections returning an HTTP error code.
;; Defaults to 0 (one attempt, no retries) if not provided.
:watch-retries 10
;; Defines, in milliseconds, the maximum period of inactivity between two consecutive data packets on watch connections.
;; Defaults to 15 minutes.
:watch-socket-timeout-ms 900000
;; Whether to validate the SSL certificate of the API server on the k8s watch connections.
;; Defaults to false (same as the http client used for all other k8s api calls).
:watch-validate-ssl false}
;; :kind :marathon uses Marathon (https://mesosphere.github.io/marathon/) for scheduling instances:
;:kind :marathon
:marathon {
;; Custom implementations should specify a :factory-fn
;; that returns an instance of waiter.scheduler.ServiceScheduler:
:factory-fn waiter.scheduler.marathon/marathon-scheduler
;; The authorizer is used by the scheduler to verify that a user has the proper permissions
;; to create services on the underlying scheduler platform.
:authorizer {:kind :kerberos
:kerberos {;; kerberos-prestash determines user authorization via the presence of prestashed tickets
:factory-fn waiter.auth.kerberos/kerberos-authorizer
;; pluggable function used to authenticate requests
:authenticate-request-fn waiter.auth.spnego/authenticate-request
;; pluggable parameters passed to the function used to authenticate requests
:authenticate-request-fn-context {}
;; the minimum interval after which prestash kerberos tickets need to be refreshed
:prestash-cache-min-refresh-ms 100
;; the maximum interval after which prestash kerberos tickets are forcibly refreshed
:prestash-cache-refresh-ms 1000
;; the prestash query host (where to look for the tickets)
:prestash-query-host "www.example.com"}}
;; The location of home directories on your Mesos agents;
;; the HOME environment variable is set by joining this and the run-as-user:
:home-path-prefix "/home/"
;; The HTTP options that will be used when accessing Marathon:
:http-options {:conn-timeout 10000
:socket-timeout 10000}
;; Marathon will fail deployments (e.g. scale-down) if there is an existing deployment in
;; flight; this represents the amount of time (milliseconds) Waiter will tolerate failed
;; scale-down deployments before using the force flag when scaling down:
:force-kill-after-ms 60000
;; How long (milliseconds) to cache the framework id:
:framework-id-ttl 900000
:marathon-descriptor-builder {;; Factory function which creates a descriptor used by Marathon to create new apps.
;; Any additional configuration data needed for the implementation should be included here.
:factory-fn waiter.scheduler.marathon/default-marathon-descriptor-builder
;; Vector of commands with which to prefix the user's cmd string when launching the container.
;; E.g., it may be necessary to delegate launching the user's command to a waiter-init script.
:container-init-commands ["waiter-mesos-init"]}
;; The base directory where the agent directories are being written.
;; This is an optional parameter.
;; When absent log directory and log url support will be disabled.
:mesos-agent-directory "/home/mesos/agent/workspace/"
;; The port on which the mesos agents are listening.
;; This is an optional parameter.
;; When absent log directory and log url support will be disabled.
:mesos-agent-port 5051
;; Controls the rate at which any mismatch in task count and instances is
;; corrected by triggering new deployments
:sync-deployment {;; Specifies the intervals at which the syncer runs to
;; check for mismatches in task and instance counts.
:interval-ms 15000
;; Specifies the number of cycles for which a service is
;; determined to be in need of a sync deployment before
;; such a deployment is actually triggered.
:timeout-cycles 4}
;; The URL for your Marathon HTTP API:
:url "http://marathon.example.com:8080"}
;; :kind :shell simply schedules instances on your local machine (for testing purposes only):
;:kind :shell
:shell {
:factory-fn waiter.scheduler.shell/shell-scheduler
;; The shell scheduler can persist its state and use this state when it restarts.
;; This is an optional config, when not provided no backup state will be persisted.
;; The file in the work directory which contains the scheduler state:
:backup-file-name "backup.json"
;; The interval (milliseconds) at which instance health will be checked:
:health-check-interval-ms 10000
;; The HTTP connect timeout and idle timeout (milliseconds) for instance health checks:
:health-check-timeout-ms 200
;; The amount of time (milliseconds) after an instance is
;; killed before its port will become available for use:
:port-grace-period-ms 120000
;; Defines the port range from which we allocate ports:
:port-range [10000 10999]
;; The base work directory to use; both absolute and relative paths are supported:
:work-directory "scheduler"}}
;; Waiter performs garbage collection of services by tracking two things:
;;
;; 1. Services that are "broken" (i.e. have no healthy instance, but at
;; least one failed instance, possibly due to a broken command).
;;
;; 2. Services that are "idle" (i.e. have no outstanding requests and have
;; a positive value for idle-timeout-mins parameter value).
;; Idle services are detected based on no changes to the metrics state
;; past the idle-timeout-mins period set on the service description.
;;
;; If an error occurs while deleting a service, there will be repeated
;; attempts to delete it later.
:scheduler-gc-config {
;; The number of hosts on which health checks
;; must fail in order to consider a service broken:
:broken-service-min-hosts 2
;; Faulty services are detected based on no changes to healthy/failed
;; instance state past this amount of time (minutes):
:broken-service-timeout-mins 30
;; Timeout intervals (milliseconds) used as a refractory period to
;; allow effects of any GC run to propagate through the system,
;; for broken and idle services, respectively:
:scheduler-gc-broken-service-interval-ms 60000
:scheduler-gc-interval-ms 60000}
;; How often (milliseconds) to check and start new services for tokens with implicit changes that are receiving requests
:scheduler-start-new-services-interval-ms 5000
;; How often (seconds) to query the scheduler for the service and instance statuses:
:scheduler-syncer-interval-secs 5
:scaling {
;; How often (milliseconds) to run the autoscaling algorithm:
:autoscaler-interval-ms 1000
;; Throttles the rate at which kill requests are sent to the scheduler:
:inter-kill-request-wait-time-ms 1000
;; The autoscaler accounts for expired (both healthy and unhealthy) instances while scaling.
;; Limits the number of unhealthy expired instances taken into account by the scaling algorithm:
:max-expired-unhealthy-instances-to-consider 2
;; Individual scale-up operations are throttled so that we can minimize overshoot of instances.
;; The number of new instances started per scale-up is the maximum of 1 and the resource consumed
;; limit calculated based on the cpus and mem specified in the quanta-constraints.
:quanta-constraints {:cpus 64
:mem 524288}}
; ---------- Service Descriptions ----------
;; Profiles allow creating different sets of defaults for service parameters.
;; The profile config maps a profile name to overridden default parameters.
;; In the example below:
;; - the 'webapp' profile overrides the default configuration defined in :service-description-defaults
;; by increasing the concurrency-level and changing the load balancing scheme to random
:profile-config {"webapp" {:defaults {"concurrency-level" 120
"load-balancing" "random"}}}
:service-description-builder-config {
;; :kind :default invokes the DefaultServiceDescriptionBuilder
:kind :default
:default {
;; Custom implementations should specify a
;; :factory-fn that returns an instance of
;; waiter.service-description.ServiceDescriptionBuilder:
:factory-fn waiter.service-description/->DefaultServiceDescriptionBuilder}}
;; Additional configurable constraints on description parameters.
;; Only max is supported currently.
:service-description-constraints {"cpus" {:max 40}
"mem" {:max 32768}} ;; mem is in megabytes
;; The following service description parameters are required and
;; therefore don't have default values:
;;
;; - cpus
;; - mem
;; - cmd
;; - version
;;
;; For the other parameters, if the user does not provide a
;; value for the parameter when constructing her service
;; description, these defaults will be used:
:service-description-defaults {"allowed-params" #{}
"authentication" "standard"
"backend-proto" "http"
"concurrency-level" 1
"distribution-scheme" "balanced"
"env" {"FOO" "bar"
"BAZ" "qux"}
"expired-instance-restart-rate" 0.1
"grace-period-secs" 30
"health-check-authentication" "disabled"
"health-check-interval-secs" 10
"health-check-max-consecutive-failures" 5
"health-check-port-index" 0
"health-check-proto" nil
"health-check-url" "/status"
"idle-timeout-mins" 30
"instance-expiry-mins" 7200
"interstitial-secs" 0
"jitter-threshold" 0.5
"load-balancing" "oldest"
"max-instances" 500
"max-queue-length" 1000000
"metadata" {}
"min-instances" 1
"permitted-user" "*"
"ports" 1
"restart-backoff-factor" 2
"routing-mode" "default"
"scale-down-factor" 0.001
"scale-factor" 1
"scale-up-factor" 0.1
"termination-grace-period-secs" 0}
; ---------- Timeouts ----------
; Waiter maintains a list of recently "killed" and "erroneous" (e.g. an error occurred while streaming the response)
; instances. Such instances are guaranteed not to be served up as an available instance until sufficient time has
; elapsed since their last use. This is referred to as ejecting an instance:
:ejection-config {
;; Erroneous instances are ejected using an exponential delay based on the number of successive
;; failures and eject-backoff-base-time-ms (milliseconds):
:eject-backoff-base-time-ms 10000
;; Ejected instances that are consistently failing requests despite being deemed as healthy
;; by health checks are marked as expired when the number of consecutive failed requests
;; reaches the expiry-threshold.
:expiry-threshold 5
;; Killed instances are ejected for max-eject-time-ms (milliseconds):
:max-eject-time-ms 300000}
:instance-request-properties {
;; Waiter monitors the state of an async request at specified intervals.
;; It makes calls to the backend instance and inspects the responses to
;; decide when to treat the request as complete. A request is not
;; considered complete as long as the backend keeps returning a 200
;; response. This dictates the default value of the interval (milliseconds)
;; at which Waiter will poll the backend instance:
:async-check-interval-ms 3000
;; Waiter monitors the state of an async request at specified intervals.
;; However, using too low an async-check-interval-ms can lead to many requests
;; in short durations. We can restrict the maximum number of such status check
;; requests per async request using the following value:
:async-request-max-status-checks 50
;; The maximum amount of time (milliseconds) before the async request will be
;; considered timed out, and Waiter will release the allocated instance.
:async-request-max-timeout-ms 14400000
;; After this amount of time (milliseconds) the async request will be
;; considered timed out, and Waiter will release the allocated
;; instance. The value is capped by the value of async-request-max-timeout-ms.
;; This is the default value and can be overridden via the
;; x-waiter-async-request-timeout request header:
:async-request-timeout-ms 60000
;; The capacity factor (an int) of the MappedByteBufferPool.
:byte-buffer-capacity-factor 2048
;; The percentage of the JVM heap that can be used in the MappedByteBufferPool max heap memory in individual clients.
:byte-buffer-heap-percent 0.25
;; The maximum MappedByteBufferPool queue length.
:byte-buffer-max-queue-length 32
;; Size in bytes of the buffer used to write/read individual request/response fragments to/from the backend.
;; The value must be a positive integer, defaults to 32 KiB.
:client-buffer-size 32768
;; The max time (milliseconds) a connection can be idle in the client connection pool:
:client-connection-idle-timeout-ms 10000
;; The HTTP connect timeout (milliseconds) for instance requests:
:connection-timeout-ms 5000
;; Configures the delay introduced in generating some error responses to clients
;; e.g. using the below settings, an error rate of 500 would introduce a delay of 250 ms
;; while an error rate of 4000 or more would introduce the maximum delay of 2000 ms.
:error-response-throttle { ;; the maximum delay in generating the error response
:max-delay-ms 2000
;; the increment in delay per step
:step-delay-ms 50
;; step size used for per minute error rate
:step-size-per-min 100}
;; The HTTP idle timeout (milliseconds) for instance requests:
:initial-socket-timeout-ms 900000
;; The timeout for in-flight requests to an expired instance after which
;; expired instances will be eligible for killing:
:lingering-request-threshold-ms 60000
;; Size in bytes of the output buffer used to aggregate fragments of HTTP responses from a backend.
;; The value must be a positive integer, defaults to 4 KiB.
:output-buffer-size 4096
;; The default amount of time (milliseconds) each request will wait in
;; the Waiter queue before an instance is available to process it.
:queue-timeout-ms 300000
;; Configures the idle timeout (milliseconds) in the response output stream:
:streaming-timeout-ms 20000
;; unsupported waiter headers which will cause requests to fail if these headers
;; are provided
:unsupported-headers #{"x-waiter-maintenance"}}
; ---------- Load Balancing ----------
:work-stealing {
;; The maximum number of outstanding work-stealing offers per router:
:max-in-flight-offers 4000
;; The interval (milliseconds) on which Waiter makes work-stealing offers:
:offer-help-interval-ms 100
;; The duration (milliseconds) after which Waiter times out work-stealing offers:
:offer-idle-timeout-ms 900000
;; The timeout (milliseconds) used internally by the
;; Waiter router to reserve an instance for offering:
:reserve-timeout-ms 1000
;; The distribution schemes of services for which to enable work-stealing support:
:supported-distribution-schemes #{"balanced" "simple"}}
; ---------- Metrics - StatsD ----------
;; Waiter has support for publishing certain metrics via the StatsD (https://github.com/etsy/statsd) protocol.
;; You can either disable this feature, which will result in no StatsD metrics being published:
;:statsd :disabled
;; or you can configure it as such:
:statsd {;; must provide one or both of (1) a statsd server via :host + :port
;; or (2) :dd-agent pointing at a datadog agent with dogstatsd enabled
;; The location of your StatsD server daemon:
:host "statsd.example.com"
;; The port to publish metrics on:
:port 8125
;; datadog agent config for dogstatsd client
:dd-agent {;; host (usually local)
:host "127.0.0.1"
;; port number for agent's dogstatsd udp listener
:port 8126
;; optional predicate function for filtering which metrics
;; get sent to the local datadog agent
;; predicate takes two arguments: metric-group and metric-name
:predicate-fn waiter.statsd/keep-all-metrics}
;; The environment, cluster, and server will all
;; be included in the full name of the statsd metric,
;; or as tags on datadog metrics sent via dogstatsd:
:environment "prod"
:cluster "my_waiter_cluster"
:server "router_1"
;; (option ignored for dogstatsd)
;; Waiter aggregates metrics locally and publishes
;; to StatsD on the following interval (milliseconds):
:publish-interval-ms 10000
;; (option ignored for dogstatsd)
;; Waiter polls the router state for instances and resource
;; usage on the following interval (milliseconds):
:sync-instances-interval-ms 5000}
;; Waiter allows a "metric group" string to be associated with a service so that related
;; services can be grouped together in the Waiter Stats environment. One or multiple
;; services can use the same metric group string depending on how you want to group metrics together.
;; Waiter can be configured to automatically map certain well-known services to metric groups,
;; using a regular expression on the service name. These mappings only take effect if no
;; metric group has been explicitly provided in the service description. If a service
;; description does not explicitly provide a metric group, and Waiter is not automatically
;; mapping it to one, then the service's metrics will show up as part of the "other" metric