forked from schrej/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.go
924 lines (816 loc) · 33.7 KB
/
util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"fmt"
"hash/fnv"
"io/ioutil"
"math/rand"
"os"
"path"
"path/filepath"
"reflect"
"strconv"
"strings"
v1 "k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
utypes "k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/api/legacyscheme"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/pkg/volume"
"k8s.io/kubernetes/pkg/volume/util/types"
"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
utilstrings "k8s.io/utils/strings"
)
const (
// GB - GigaByte size
GB = 1000 * 1000 * 1000
// GIB - GibiByte size
GIB = 1024 * 1024 * 1024
readyFileName = "ready"
// ControllerManagedAttachAnnotation is the key of the annotation on Node
// objects that indicates attach/detach operations for the node should be
// managed by the attach/detach controller
ControllerManagedAttachAnnotation string = "volumes.kubernetes.io/controller-managed-attach-detach"
// KeepTerminatedPodVolumesAnnotation is the key of the annotation on Node
// that decides if pod volumes are unmounted when pod is terminated
KeepTerminatedPodVolumesAnnotation string = "volumes.kubernetes.io/keep-terminated-pod-volumes"
// VolumeGidAnnotationKey is the of the annotation on the PersistentVolume
// object that specifies a supplemental GID.
VolumeGidAnnotationKey = "pv.beta.kubernetes.io/gid"
// VolumeDynamicallyCreatedByKey is the key of the annotation on PersistentVolume
// object created dynamically
VolumeDynamicallyCreatedByKey = "kubernetes.io/createdby"
// LabelMultiZoneDelimiter separates zones for volumes
LabelMultiZoneDelimiter = "__"
)
// VolumeZoneConfig contains config information about zonal volume.
type VolumeZoneConfig struct {
ZonePresent bool
ZonesPresent bool
ReplicaZoneFromNodePresent bool
Zone string
Zones string
ReplicaZoneFromNode string
}
// IsReady checks for the existence of a regular file
// called 'ready' in the given directory and returns
// true if that file exists.
func IsReady(dir string) bool {
readyFile := path.Join(dir, readyFileName)
s, err := os.Stat(readyFile)
if err != nil {
return false
}
if !s.Mode().IsRegular() {
klog.Errorf("ready-file is not a file: %s", readyFile)
return false
}
return true
}
// SetReady creates a file called 'ready' in the given
// directory. It logs an error if the file cannot be
// created.
func SetReady(dir string) {
if err := os.MkdirAll(dir, 0750); err != nil && !os.IsExist(err) {
klog.Errorf("Can't mkdir %s: %v", dir, err)
return
}
readyFile := path.Join(dir, readyFileName)
file, err := os.Create(readyFile)
if err != nil {
klog.Errorf("Can't touch %s: %v", readyFile, err)
return
}
file.Close()
}
// GetSecretForPod locates secret by name in the pod's namespace and returns secret map
func GetSecretForPod(pod *v1.Pod, secretName string, kubeClient clientset.Interface) (map[string]string, error) {
secret := make(map[string]string)
if kubeClient == nil {
return secret, fmt.Errorf("Cannot get kube client")
}
secrets, err := kubeClient.CoreV1().Secrets(pod.Namespace).Get(secretName, metav1.GetOptions{})
if err != nil {
return secret, err
}
for name, data := range secrets.Data {
secret[name] = string(data)
}
return secret, nil
}
// GetSecretForPV locates secret by name and namespace, verifies the secret type, and returns secret map
func GetSecretForPV(secretNamespace, secretName, volumePluginName string, kubeClient clientset.Interface) (map[string]string, error) {
secret := make(map[string]string)
if kubeClient == nil {
return secret, fmt.Errorf("Cannot get kube client")
}
secrets, err := kubeClient.CoreV1().Secrets(secretNamespace).Get(secretName, metav1.GetOptions{})
if err != nil {
return secret, err
}
if secrets.Type != v1.SecretType(volumePluginName) {
return secret, fmt.Errorf("Cannot get secret of type %s", volumePluginName)
}
for name, data := range secrets.Data {
secret[name] = string(data)
}
return secret, nil
}
// GetClassForVolume locates storage class by persistent volume
func GetClassForVolume(kubeClient clientset.Interface, pv *v1.PersistentVolume) (*storage.StorageClass, error) {
if kubeClient == nil {
return nil, fmt.Errorf("Cannot get kube client")
}
className := v1helper.GetPersistentVolumeClass(pv)
if className == "" {
return nil, fmt.Errorf("Volume has no storage class")
}
class, err := kubeClient.StorageV1().StorageClasses().Get(className, metav1.GetOptions{})
if err != nil {
return nil, err
}
return class, nil
}
// CheckNodeAffinity looks at the PV node affinity, and checks if the node has the same corresponding labels
// This ensures that we don't mount a volume that doesn't belong to this node
func CheckNodeAffinity(pv *v1.PersistentVolume, nodeLabels map[string]string) error {
return checkVolumeNodeAffinity(pv, nodeLabels)
}
func checkVolumeNodeAffinity(pv *v1.PersistentVolume, nodeLabels map[string]string) error {
if pv.Spec.NodeAffinity == nil {
return nil
}
if pv.Spec.NodeAffinity.Required != nil {
terms := pv.Spec.NodeAffinity.Required.NodeSelectorTerms
klog.V(10).Infof("Match for Required node selector terms %+v", terms)
if !v1helper.MatchNodeSelectorTerms(terms, labels.Set(nodeLabels), nil) {
return fmt.Errorf("No matching NodeSelectorTerms")
}
}
return nil
}
// LoadPodFromFile will read, decode, and return a Pod from a file.
func LoadPodFromFile(filePath string) (*v1.Pod, error) {
if filePath == "" {
return nil, fmt.Errorf("file path not specified")
}
podDef, err := ioutil.ReadFile(filePath)
if err != nil {
return nil, fmt.Errorf("failed to read file path %s: %+v", filePath, err)
}
if len(podDef) == 0 {
return nil, fmt.Errorf("file was empty: %s", filePath)
}
pod := &v1.Pod{}
codec := legacyscheme.Codecs.UniversalDecoder()
if err := runtime.DecodeInto(codec, podDef, pod); err != nil {
return nil, fmt.Errorf("failed decoding file: %v", err)
}
return pod, nil
}
// SelectZoneForVolume is a wrapper around SelectZonesForVolume
// to select a single zone for a volume based on parameters
func SelectZoneForVolume(zoneParameterPresent, zonesParameterPresent bool, zoneParameter string, zonesParameter, zonesWithNodes sets.String, node *v1.Node, allowedTopologies []v1.TopologySelectorTerm, pvcName string) (string, error) {
zones, err := SelectZonesForVolume(zoneParameterPresent, zonesParameterPresent, zoneParameter, zonesParameter, zonesWithNodes, node, allowedTopologies, pvcName, 1)
if err != nil {
return "", err
}
zone, ok := zones.PopAny()
if !ok {
return "", fmt.Errorf("could not determine a zone to provision volume in")
}
return zone, nil
}
// SelectZonesForVolume selects zones for a volume based on several factors:
// node.zone, allowedTopologies, zone/zones parameters from storageclass,
// zones with active nodes from the cluster. The number of zones = replicas.
func SelectZonesForVolume(zoneParameterPresent, zonesParameterPresent bool, zoneParameter string, zonesParameter, zonesWithNodes sets.String, node *v1.Node, allowedTopologies []v1.TopologySelectorTerm, pvcName string, numReplicas uint32) (sets.String, error) {
if zoneParameterPresent && zonesParameterPresent {
return nil, fmt.Errorf("both zone and zones StorageClass parameters must not be used at the same time")
}
var zoneFromNode string
// pick one zone from node if present
if node != nil {
// VolumeScheduling implicit since node is not nil
if zoneParameterPresent || zonesParameterPresent {
return nil, fmt.Errorf("zone[s] cannot be specified in StorageClass if VolumeBindingMode is set to WaitForFirstConsumer. Please specify allowedTopologies in StorageClass for constraining zones")
}
// pick node's zone for one of the replicas
var ok bool
zoneFromNode, ok = node.ObjectMeta.Labels[v1.LabelZoneFailureDomain]
if !ok {
return nil, fmt.Errorf("%s Label for node missing", v1.LabelZoneFailureDomain)
}
// if single replica volume and node with zone found, return immediately
if numReplicas == 1 {
return sets.NewString(zoneFromNode), nil
}
}
// pick zone from allowedZones if specified
allowedZones, err := ZonesFromAllowedTopologies(allowedTopologies)
if err != nil {
return nil, err
}
if (len(allowedTopologies) > 0) && (allowedZones.Len() == 0) {
return nil, fmt.Errorf("no matchLabelExpressions with %s key found in allowedTopologies. Please specify matchLabelExpressions with %s key", v1.LabelZoneFailureDomain, v1.LabelZoneFailureDomain)
}
if allowedZones.Len() > 0 {
// VolumeScheduling implicit since allowedZones present
if zoneParameterPresent || zonesParameterPresent {
return nil, fmt.Errorf("zone[s] cannot be specified in StorageClass if allowedTopologies specified")
}
// scheduler will guarantee if node != null above, zoneFromNode is member of allowedZones.
// so if zoneFromNode != "", we can safely assume it is part of allowedZones.
zones, err := chooseZonesForVolumeIncludingZone(allowedZones, pvcName, zoneFromNode, numReplicas)
if err != nil {
return nil, fmt.Errorf("cannot process zones in allowedTopologies: %v", err)
}
return zones, nil
}
// pick zone from parameters if present
if zoneParameterPresent {
if numReplicas > 1 {
return nil, fmt.Errorf("zone cannot be specified if desired number of replicas for pv is greather than 1. Please specify zones or allowedTopologies to specify desired zones")
}
return sets.NewString(zoneParameter), nil
}
if zonesParameterPresent {
if uint32(zonesParameter.Len()) < numReplicas {
return nil, fmt.Errorf("not enough zones found in zones parameter to provision a volume with %d replicas. Found %d zones, need %d zones", numReplicas, zonesParameter.Len(), numReplicas)
}
// directly choose from zones parameter; no zone from node need to be considered
return ChooseZonesForVolume(zonesParameter, pvcName, numReplicas), nil
}
// pick zone from zones with nodes
if zonesWithNodes.Len() > 0 {
// If node != null (and thus zoneFromNode != ""), zoneFromNode will be member of zonesWithNodes
zones, err := chooseZonesForVolumeIncludingZone(zonesWithNodes, pvcName, zoneFromNode, numReplicas)
if err != nil {
return nil, fmt.Errorf("cannot process zones where nodes exist in the cluster: %v", err)
}
return zones, nil
}
return nil, fmt.Errorf("cannot determine zones to provision volume in")
}
// ZonesFromAllowedTopologies returns a list of zones specified in allowedTopologies
func ZonesFromAllowedTopologies(allowedTopologies []v1.TopologySelectorTerm) (sets.String, error) {
zones := make(sets.String)
for _, term := range allowedTopologies {
for _, exp := range term.MatchLabelExpressions {
if exp.Key == v1.LabelZoneFailureDomain {
for _, value := range exp.Values {
zones.Insert(value)
}
} else {
return nil, fmt.Errorf("unsupported key found in matchLabelExpressions: %s", exp.Key)
}
}
}
return zones, nil
}
// ZonesSetToLabelValue converts zones set to label value
func ZonesSetToLabelValue(strSet sets.String) string {
return strings.Join(strSet.UnsortedList(), LabelMultiZoneDelimiter)
}
// ZonesToSet converts a string containing a comma separated list of zones to set
func ZonesToSet(zonesString string) (sets.String, error) {
zones, err := stringToSet(zonesString, ",")
if err != nil {
return nil, fmt.Errorf("error parsing zones %s, must be strings separated by commas: %v", zonesString, err)
}
return zones, nil
}
// LabelZonesToSet converts a PV label value from string containing a delimited list of zones to set
func LabelZonesToSet(labelZonesValue string) (sets.String, error) {
return stringToSet(labelZonesValue, LabelMultiZoneDelimiter)
}
// StringToSet converts a string containing list separated by specified delimiter to a set
func stringToSet(str, delimiter string) (sets.String, error) {
zonesSlice := strings.Split(str, delimiter)
zonesSet := make(sets.String)
for _, zone := range zonesSlice {
trimmedZone := strings.TrimSpace(zone)
if trimmedZone == "" {
return make(sets.String), fmt.Errorf(
"%q separated list (%q) must not contain an empty string",
delimiter,
str)
}
zonesSet.Insert(trimmedZone)
}
return zonesSet, nil
}
// LabelZonesToList converts a PV label value from string containing a delimited list of zones to list
func LabelZonesToList(labelZonesValue string) ([]string, error) {
return stringToList(labelZonesValue, LabelMultiZoneDelimiter)
}
// StringToList converts a string containing list separated by specified delimiter to a list
func stringToList(str, delimiter string) ([]string, error) {
zonesSlice := make([]string, 0)
for _, zone := range strings.Split(str, delimiter) {
trimmedZone := strings.TrimSpace(zone)
if trimmedZone == "" {
return nil, fmt.Errorf(
"%q separated list (%q) must not contain an empty string",
delimiter,
str)
}
zonesSlice = append(zonesSlice, trimmedZone)
}
return zonesSlice, nil
}
// CalculateTimeoutForVolume calculates time for a Recycler pod to complete a
// recycle operation. The calculation and return value is either the
// minimumTimeout or the timeoutIncrement per Gi of storage size, whichever is
// greater.
func CalculateTimeoutForVolume(minimumTimeout, timeoutIncrement int, pv *v1.PersistentVolume) int64 {
giQty := resource.MustParse("1Gi")
pvQty := pv.Spec.Capacity[v1.ResourceStorage]
giSize := giQty.Value()
pvSize := pvQty.Value()
timeout := (pvSize / giSize) * int64(timeoutIncrement)
if timeout < int64(minimumTimeout) {
return int64(minimumTimeout)
}
return timeout
}
// RoundUpSize calculates how many allocation units are needed to accommodate
// a volume of given size. E.g. when user wants 1500MiB volume, while AWS EBS
// allocates volumes in gibibyte-sized chunks,
// RoundUpSize(1500 * 1024*1024, 1024*1024*1024) returns '2'
// (2 GiB is the smallest allocatable volume that can hold 1500MiB)
func RoundUpSize(volumeSizeBytes int64, allocationUnitBytes int64) int64 {
roundedUp := volumeSizeBytes / allocationUnitBytes
if volumeSizeBytes%allocationUnitBytes > 0 {
roundedUp++
}
return roundedUp
}
// RoundUpToGB rounds up given quantity to chunks of GB
func RoundUpToGB(size resource.Quantity) int64 {
requestBytes := size.Value()
return RoundUpSize(requestBytes, GB)
}
// RoundUpToGiB rounds up given quantity upto chunks of GiB
func RoundUpToGiB(size resource.Quantity) int64 {
requestBytes := size.Value()
return RoundUpSize(requestBytes, GIB)
}
// RoundUpSizeInt calculates how many allocation units are needed to accommodate
// a volume of given size. It returns an int instead of an int64 and an error if
// there's overflow
func RoundUpSizeInt(volumeSizeBytes int64, allocationUnitBytes int64) (int, error) {
roundedUp := RoundUpSize(volumeSizeBytes, allocationUnitBytes)
roundedUpInt := int(roundedUp)
if int64(roundedUpInt) != roundedUp {
return 0, fmt.Errorf("capacity %v is too great, casting results in integer overflow", roundedUp)
}
return roundedUpInt, nil
}
// RoundUpToGBInt rounds up given quantity to chunks of GB. It returns an
// int instead of an int64 and an error if there's overflow
func RoundUpToGBInt(size resource.Quantity) (int, error) {
requestBytes := size.Value()
return RoundUpSizeInt(requestBytes, GB)
}
// RoundUpToGiBInt rounds up given quantity upto chunks of GiB. It returns an
// int instead of an int64 and an error if there's overflow
func RoundUpToGiBInt(size resource.Quantity) (int, error) {
requestBytes := size.Value()
return RoundUpSizeInt(requestBytes, GIB)
}
// GenerateVolumeName returns a PV name with clusterName prefix. The function
// should be used to generate a name of GCE PD or Cinder volume. It basically
// adds "<clusterName>-dynamic-" before the PV name, making sure the resulting
// string fits given length and cuts "dynamic" if not.
func GenerateVolumeName(clusterName, pvName string, maxLength int) string {
prefix := clusterName + "-dynamic"
pvLen := len(pvName)
// cut the "<clusterName>-dynamic" to fit full pvName into maxLength
// +1 for the '-' dash
if pvLen+1+len(prefix) > maxLength {
prefix = prefix[:maxLength-pvLen-1]
}
return prefix + "-" + pvName
}
// GetPath checks if the path from the mounter is empty.
func GetPath(mounter volume.Mounter) (string, error) {
path := mounter.GetPath()
if path == "" {
return "", fmt.Errorf("Path is empty %s", reflect.TypeOf(mounter).String())
}
return path, nil
}
// ChooseZoneForVolume implements our heuristics for choosing a zone for volume creation based on the volume name
// Volumes are generally round-robin-ed across all active zones, using the hash of the PVC Name.
// However, if the PVCName ends with `-<integer>`, we will hash the prefix, and then add the integer to the hash.
// This means that a StatefulSet's volumes (`claimname-statefulsetname-id`) will spread across available zones,
// assuming the id values are consecutive.
func ChooseZoneForVolume(zones sets.String, pvcName string) string {
// No zones available, return empty string.
if zones.Len() == 0 {
return ""
}
// We create the volume in a zone determined by the name
// Eventually the scheduler will coordinate placement into an available zone
hash, index := getPVCNameHashAndIndexOffset(pvcName)
// Zones.List returns zones in a consistent order (sorted)
// We do have a potential failure case where volumes will not be properly spread,
// if the set of zones changes during StatefulSet volume creation. However, this is
// probably relatively unlikely because we expect the set of zones to be essentially
// static for clusters.
// Hopefully we can address this problem if/when we do full scheduler integration of
// PVC placement (which could also e.g. avoid putting volumes in overloaded or
// unhealthy zones)
zoneSlice := zones.List()
zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
klog.V(2).Infof("Creating volume for PVC %q; chose zone=%q from zones=%q", pvcName, zone, zoneSlice)
return zone
}
// chooseZonesForVolumeIncludingZone is a wrapper around ChooseZonesForVolume that ensures zoneToInclude is chosen
// zoneToInclude can either be empty in which case it is ignored. If non-empty, zoneToInclude is expected to be member of zones.
// numReplicas is expected to be > 0 and <= zones.Len()
func chooseZonesForVolumeIncludingZone(zones sets.String, pvcName, zoneToInclude string, numReplicas uint32) (sets.String, error) {
if numReplicas == 0 {
return nil, fmt.Errorf("invalid number of replicas passed")
}
if uint32(zones.Len()) < numReplicas {
return nil, fmt.Errorf("not enough zones found to provision a volume with %d replicas. Need at least %d distinct zones for a volume with %d replicas", numReplicas, numReplicas, numReplicas)
}
if zoneToInclude != "" && !zones.Has(zoneToInclude) {
return nil, fmt.Errorf("zone to be included: %s needs to be member of set: %v", zoneToInclude, zones)
}
if uint32(zones.Len()) == numReplicas {
return zones, nil
}
if zoneToInclude != "" {
zones.Delete(zoneToInclude)
numReplicas = numReplicas - 1
}
zonesChosen := ChooseZonesForVolume(zones, pvcName, numReplicas)
if zoneToInclude != "" {
zonesChosen.Insert(zoneToInclude)
}
return zonesChosen, nil
}
// ChooseZonesForVolume is identical to ChooseZoneForVolume, but selects a multiple zones, for multi-zone disks.
func ChooseZonesForVolume(zones sets.String, pvcName string, numZones uint32) sets.String {
// No zones available, return empty set.
replicaZones := sets.NewString()
if zones.Len() == 0 {
return replicaZones
}
// We create the volume in a zone determined by the name
// Eventually the scheduler will coordinate placement into an available zone
hash, index := getPVCNameHashAndIndexOffset(pvcName)
// Zones.List returns zones in a consistent order (sorted)
// We do have a potential failure case where volumes will not be properly spread,
// if the set of zones changes during StatefulSet volume creation. However, this is
// probably relatively unlikely because we expect the set of zones to be essentially
// static for clusters.
// Hopefully we can address this problem if/when we do full scheduler integration of
// PVC placement (which could also e.g. avoid putting volumes in overloaded or
// unhealthy zones)
zoneSlice := zones.List()
startingIndex := index * numZones
for index = startingIndex; index < startingIndex+numZones; index++ {
zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
replicaZones.Insert(zone)
}
klog.V(2).Infof("Creating volume for replicated PVC %q; chosen zones=%q from zones=%q",
pvcName, replicaZones.UnsortedList(), zoneSlice)
return replicaZones
}
func getPVCNameHashAndIndexOffset(pvcName string) (hash uint32, index uint32) {
if pvcName == "" {
// We should always be called with a name; this shouldn't happen
klog.Warningf("No name defined during volume create; choosing random zone")
hash = rand.Uint32()
} else {
hashString := pvcName
// Heuristic to make sure that volumes in a StatefulSet are spread across zones
// StatefulSet PVCs are (currently) named ClaimName-StatefulSetName-Id,
// where Id is an integer index.
// Note though that if a StatefulSet pod has multiple claims, we need them to be
// in the same zone, because otherwise the pod will be unable to mount both volumes,
// and will be unschedulable. So we hash _only_ the "StatefulSetName" portion when
// it looks like `ClaimName-StatefulSetName-Id`.
// We continue to round-robin volume names that look like `Name-Id` also; this is a useful
// feature for users that are creating statefulset-like functionality without using statefulsets.
lastDash := strings.LastIndexByte(pvcName, '-')
if lastDash != -1 {
statefulsetIDString := pvcName[lastDash+1:]
statefulsetID, err := strconv.ParseUint(statefulsetIDString, 10, 32)
if err == nil {
// Offset by the statefulsetID, so we round-robin across zones
index = uint32(statefulsetID)
// We still hash the volume name, but only the prefix
hashString = pvcName[:lastDash]
// In the special case where it looks like `ClaimName-StatefulSetName-Id`,
// hash only the StatefulSetName, so that different claims on the same StatefulSet
// member end up in the same zone.
// Note that StatefulSetName (and ClaimName) might themselves both have dashes.
// We actually just take the portion after the final - of ClaimName-StatefulSetName.
// For our purposes it doesn't much matter (just suboptimal spreading).
lastDash := strings.LastIndexByte(hashString, '-')
if lastDash != -1 {
hashString = hashString[lastDash+1:]
}
klog.V(2).Infof("Detected StatefulSet-style volume name %q; index=%d", pvcName, index)
}
}
// We hash the (base) volume name, so we don't bias towards the first N zones
h := fnv.New32()
h.Write([]byte(hashString))
hash = h.Sum32()
}
return hash, index
}
// UnmountViaEmptyDir delegates the tear down operation for secret, configmap, git_repo and downwardapi
// to empty_dir
func UnmountViaEmptyDir(dir string, host volume.VolumeHost, volName string, volSpec volume.Spec, podUID utypes.UID) error {
klog.V(3).Infof("Tearing down volume %v for pod %v at %v", volName, podUID, dir)
// Wrap EmptyDir, let it do the teardown.
wrapped, err := host.NewWrapperUnmounter(volName, volSpec, podUID)
if err != nil {
return err
}
return wrapped.TearDownAt(dir)
}
// MountOptionFromSpec extracts and joins mount options from volume spec with supplied options
func MountOptionFromSpec(spec *volume.Spec, options ...string) []string {
pv := spec.PersistentVolume
if pv != nil {
// Use beta annotation first
if mo, ok := pv.Annotations[v1.MountOptionAnnotation]; ok {
moList := strings.Split(mo, ",")
return JoinMountOptions(moList, options)
}
if len(pv.Spec.MountOptions) > 0 {
return JoinMountOptions(pv.Spec.MountOptions, options)
}
}
return options
}
// JoinMountOptions joins mount options eliminating duplicates
func JoinMountOptions(userOptions []string, systemOptions []string) []string {
allMountOptions := sets.NewString()
for _, mountOption := range userOptions {
if len(mountOption) > 0 {
allMountOptions.Insert(mountOption)
}
}
for _, mountOption := range systemOptions {
allMountOptions.Insert(mountOption)
}
return allMountOptions.List()
}
// ValidateZone returns:
// - an error in case zone is an empty string or contains only any combination of spaces and tab characters
// - nil otherwise
func ValidateZone(zone string) error {
if strings.TrimSpace(zone) == "" {
return fmt.Errorf("the provided %q zone is not valid, it's an empty string or contains only spaces and tab characters", zone)
}
return nil
}
// AccessModesContains returns whether the requested mode is contained by modes
func AccessModesContains(modes []v1.PersistentVolumeAccessMode, mode v1.PersistentVolumeAccessMode) bool {
for _, m := range modes {
if m == mode {
return true
}
}
return false
}
// AccessModesContainedInAll returns whether all of the requested modes are contained by modes
func AccessModesContainedInAll(indexedModes []v1.PersistentVolumeAccessMode, requestedModes []v1.PersistentVolumeAccessMode) bool {
for _, mode := range requestedModes {
if !AccessModesContains(indexedModes, mode) {
return false
}
}
return true
}
// GetWindowsPath get a windows path
func GetWindowsPath(path string) string {
windowsPath := strings.Replace(path, "/", "\\", -1)
if strings.HasPrefix(windowsPath, "\\") {
windowsPath = "c:" + windowsPath
}
return windowsPath
}
// GetUniquePodName returns a unique identifier to reference a pod by
func GetUniquePodName(pod *v1.Pod) types.UniquePodName {
return types.UniquePodName(pod.UID)
}
// GetUniqueVolumeName returns a unique name representing the volume/plugin.
// Caller should ensure that volumeName is a name/ID uniquely identifying the
// actual backing device, directory, path, etc. for a particular volume.
// The returned name can be used to uniquely reference the volume, for example,
// to prevent operations (attach/detach or mount/unmount) from being triggered
// on the same volume.
func GetUniqueVolumeName(pluginName, volumeName string) v1.UniqueVolumeName {
return v1.UniqueVolumeName(fmt.Sprintf("%s/%s", pluginName, volumeName))
}
// GetUniqueVolumeNameFromSpecWithPod returns a unique volume name with pod
// name included. This is useful to generate different names for different pods
// on same volume.
func GetUniqueVolumeNameFromSpecWithPod(
podName types.UniquePodName, volumePlugin volume.VolumePlugin, volumeSpec *volume.Spec) v1.UniqueVolumeName {
return v1.UniqueVolumeName(
fmt.Sprintf("%s/%v-%s", volumePlugin.GetPluginName(), podName, volumeSpec.Name()))
}
// GetUniqueVolumeNameFromSpec uses the given VolumePlugin to generate a unique
// name representing the volume defined in the specified volume spec.
// This returned name can be used to uniquely reference the actual backing
// device, directory, path, etc. referenced by the given volumeSpec.
// If the given plugin does not support the volume spec, this returns an error.
func GetUniqueVolumeNameFromSpec(
volumePlugin volume.VolumePlugin,
volumeSpec *volume.Spec) (v1.UniqueVolumeName, error) {
if volumePlugin == nil {
return "", fmt.Errorf(
"volumePlugin should not be nil. volumeSpec.Name=%q",
volumeSpec.Name())
}
volumeName, err := volumePlugin.GetVolumeName(volumeSpec)
if err != nil || volumeName == "" {
return "", fmt.Errorf(
"failed to GetVolumeName from volumePlugin for volumeSpec %q err=%v",
volumeSpec.Name(),
err)
}
return GetUniqueVolumeName(
volumePlugin.GetPluginName(),
volumeName),
nil
}
// IsPodTerminated checks if pod is terminated
func IsPodTerminated(pod *v1.Pod, podStatus v1.PodStatus) bool {
return podStatus.Phase == v1.PodFailed || podStatus.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(podStatus.ContainerStatuses))
}
// notRunning returns true if every status is terminated or waiting, or the status list
// is empty.
func notRunning(statuses []v1.ContainerStatus) bool {
for _, status := range statuses {
if status.State.Terminated == nil && status.State.Waiting == nil {
return false
}
}
return true
}
// SplitUniqueName splits the unique name to plugin name and volume name strings. It expects the uniqueName to follow
// the format plugin_name/volume_name and the plugin name must be namespaced as described by the plugin interface,
// i.e. namespace/plugin containing exactly one '/'. This means the unique name will always be in the form of
// plugin_namespace/plugin/volume_name, see k8s.io/kubernetes/pkg/volume/plugins.go VolumePlugin interface
// description and pkg/volume/util/volumehelper/volumehelper.go GetUniqueVolumeNameFromSpec that constructs
// the unique volume names.
func SplitUniqueName(uniqueName v1.UniqueVolumeName) (string, string, error) {
components := strings.SplitN(string(uniqueName), "/", 3)
if len(components) != 3 {
return "", "", fmt.Errorf("cannot split volume unique name %s to plugin/volume components", uniqueName)
}
pluginName := fmt.Sprintf("%s/%s", components[0], components[1])
return pluginName, components[2], nil
}
// NewSafeFormatAndMountFromHost creates a new SafeFormatAndMount with Mounter
// and Exec taken from given VolumeHost.
func NewSafeFormatAndMountFromHost(pluginName string, host volume.VolumeHost) *mount.SafeFormatAndMount {
mounter := host.GetMounter(pluginName)
exec := host.GetExec(pluginName)
return &mount.SafeFormatAndMount{Interface: mounter, Exec: exec}
}
// GetVolumeMode retrieves VolumeMode from pv.
// If the volume doesn't have PersistentVolume, it's an inline volume,
// should return volumeMode as filesystem to keep existing behavior.
func GetVolumeMode(volumeSpec *volume.Spec) (v1.PersistentVolumeMode, error) {
if volumeSpec == nil || volumeSpec.PersistentVolume == nil {
return v1.PersistentVolumeFilesystem, nil
}
if volumeSpec.PersistentVolume.Spec.VolumeMode != nil {
return *volumeSpec.PersistentVolume.Spec.VolumeMode, nil
}
return "", fmt.Errorf("cannot get volumeMode for volume: %v", volumeSpec.Name())
}
// GetPersistentVolumeClaimVolumeMode retrieves VolumeMode from pvc.
func GetPersistentVolumeClaimVolumeMode(claim *v1.PersistentVolumeClaim) (v1.PersistentVolumeMode, error) {
if claim.Spec.VolumeMode != nil {
return *claim.Spec.VolumeMode, nil
}
return "", fmt.Errorf("cannot get volumeMode from pvc: %v", claim.Name)
}
// GetPersistentVolumeClaimQualifiedName returns a qualified name for pvc.
func GetPersistentVolumeClaimQualifiedName(claim *v1.PersistentVolumeClaim) string {
return utilstrings.JoinQualifiedName(claim.GetNamespace(), claim.GetName())
}
// CheckVolumeModeFilesystem checks VolumeMode.
// If the mode is Filesystem, return true otherwise return false.
func CheckVolumeModeFilesystem(volumeSpec *volume.Spec) (bool, error) {
if utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
volumeMode, err := GetVolumeMode(volumeSpec)
if err != nil {
return true, err
}
if volumeMode == v1.PersistentVolumeBlock {
return false, nil
}
}
return true, nil
}
// CheckPersistentVolumeClaimModeBlock checks VolumeMode.
// If the mode is Block, return true otherwise return false.
func CheckPersistentVolumeClaimModeBlock(pvc *v1.PersistentVolumeClaim) bool {
return utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) && pvc.Spec.VolumeMode != nil && *pvc.Spec.VolumeMode == v1.PersistentVolumeBlock
}
// IsWindowsUNCPath checks if path is prefixed with \\
// This can be used to skip any processing of paths
// that point to SMB shares, local named pipes and local UNC path
func IsWindowsUNCPath(goos, path string) bool {
if goos != "windows" {
return false
}
// Check for UNC prefix \\
if strings.HasPrefix(path, `\\`) {
return true
}
return false
}
// IsWindowsLocalPath checks if path is a local path
// prefixed with "/" or "\" like "/foo/bar" or "\foo\bar"
func IsWindowsLocalPath(goos, path string) bool {
if goos != "windows" {
return false
}
if IsWindowsUNCPath(goos, path) {
return false
}
if strings.Contains(path, ":") {
return false
}
if !(strings.HasPrefix(path, `/`) || strings.HasPrefix(path, `\`)) {
return false
}
return true
}
// MakeAbsolutePath convert path to absolute path according to GOOS
func MakeAbsolutePath(goos, path string) string {
if goos != "windows" {
return filepath.Clean("/" + path)
}
// These are all for windows
// If there is a colon, give up.
if strings.Contains(path, ":") {
return path
}
// If there is a slash, but no drive, add 'c:'
if strings.HasPrefix(path, "/") || strings.HasPrefix(path, "\\") {
return "c:" + path
}
// Otherwise, add 'c:\'
return "c:\\" + path
}
// MapBlockVolume is a utility function to provide a common way of mounting
// block device path for a specified volume and pod. This function should be
// called by volume plugins that implements volume.BlockVolumeMapper.Map() method.
func MapBlockVolume(
devicePath,
globalMapPath,
podVolumeMapPath,
volumeMapName string,
podUID utypes.UID,
) error {
blkUtil := volumepathhandler.NewBlockVolumePathHandler()
// map devicePath to global node path
mapErr := blkUtil.MapDevice(devicePath, globalMapPath, string(podUID))
if mapErr != nil {
return mapErr
}
// map devicePath to pod volume path
mapErr = blkUtil.MapDevice(devicePath, podVolumeMapPath, volumeMapName)
if mapErr != nil {
return mapErr
}
return nil
}