forked from schrej/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pet.go
326 lines (292 loc) · 10.6 KB
/
pet.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package petset
import (
"fmt"
"strconv"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/errors"
"k8s.io/kubernetes/pkg/apis/apps"
"k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
"k8s.io/kubernetes/pkg/client/record"
"k8s.io/kubernetes/pkg/runtime"
"github.com/golang/glog"
)
// petLifeCycleEvent is used to communicate high level actions the controller
// needs to take on a given pet. It's recorded in the pcb. The recognized values
// are listed below.
type petLifeCycleEvent string
const (
syncPet petLifeCycleEvent = "sync"
deletePet petLifeCycleEvent = "delete"
// updateRetries is the number of Get/Update cycles we perform when an
// update fails.
updateRetries = 3
// StatefulSetInitAnnotation is an annotation which when set, indicates that the
// pet has finished initializing itself.
// TODO: Replace this with init container status.
StatefulSetInitAnnotation = "pod.alpha.kubernetes.io/initialized"
)
// pcb is the control block used to transmit all updates about a single pet.
// It serves as the manifest for a single pet. Users must populate the pod
// and parent fields to pass it around safely.
type pcb struct {
// pod is the desired pet pod.
pod *api.Pod
// pvcs is a list of desired persistent volume claims for the pet pod.
pvcs []api.PersistentVolumeClaim
// event is the lifecycle event associated with this update.
event petLifeCycleEvent
// id is the identity index of this pet.
id string
// parent is a pointer to the parent statefulset.
parent *apps.StatefulSet
}
// pvcClient is a client for managing persistent volume claims.
type pvcClient interface {
// DeletePVCs deletes the pvcs in the given pcb.
DeletePVCs(*pcb) error
// SyncPVCs creates/updates pvcs in the given pcb.
SyncPVCs(*pcb) error
}
// petSyncer syncs a single pet.
type petSyncer struct {
petClient
// blockingPet is an unhealthy pet either from this iteration or a previous
// iteration, either because it is not yet Running, or being Deleted, that
// prevents other creates/deletions.
blockingPet *pcb
}
// errUnhealthyPet is returned when a we either know for sure a pet is unhealthy,
// or don't know its state but assume it is unhealthy. It's used as a signal to the caller for further operations like updating status.replicas.
// This is not a fatal error.
type errUnhealthyPet string
func (e errUnhealthyPet) Error() string {
return string(e)
}
// Sync syncs the given pet.
func (p *petSyncer) Sync(pet *pcb) error {
if pet == nil {
return nil
}
realPet, exists, err := p.Get(pet)
if err != nil {
return err
}
// There is not constraint except quota on the number of pvcs created.
// This is done per pet so we get a working cluster ASAP, even if user
// runs out of quota.
if err := p.SyncPVCs(pet); err != nil {
return err
}
// if pet failed - we need to remove old one because of consistent naming
if exists && realPet.pod.Status.Phase == api.PodFailed {
glog.V(2).Infof("Deleting evicted pod %v/%v", realPet.pod.Namespace, realPet.pod.Name)
if err := p.petClient.Delete(realPet); err != nil {
return err
}
} else if exists {
if !p.isHealthy(realPet.pod) {
glog.V(4).Infof("StatefulSet %v waiting on unhealthy pet %v", pet.parent.Name, realPet.pod.Name)
}
return p.Update(realPet, pet)
}
if p.blockingPet != nil {
message := errUnhealthyPet(fmt.Sprintf("Create of %v in StatefulSet %v blocked by unhealthy pet %v", pet.pod.Name, pet.parent.Name, p.blockingPet.pod.Name))
glog.V(4).Infof(message.Error())
return message
}
// This is counted as a create, even if it fails. We can't skip indices
// because some pets might allocate a special role to earlier indices.
// The returned error will force a requeue.
// TODO: What's the desired behavior if pet-0 is deleted while pet-1 is
// not yet healthy? currently pet-0 will wait till pet-1 is healthy,
// this feels safer, but might lead to deadlock.
p.blockingPet = pet
if err := p.Create(pet); err != nil {
return err
}
return nil
}
// Delete deletes the given pet, if no other pet in the statefulset is blocking a
// scale event.
func (p *petSyncer) Delete(pet *pcb) error {
if pet == nil {
return nil
}
realPet, exists, err := p.Get(pet)
if err != nil {
return err
}
if !exists {
return nil
}
if p.blockingPet != nil {
glog.V(4).Infof("Delete of %v in StatefulSet %v blocked by unhealthy pet %v", realPet.pod.Name, pet.parent.Name, p.blockingPet.pod.Name)
return nil
}
// This is counted as a delete, even if it fails.
// The returned error will force a requeue.
p.blockingPet = realPet
if !p.isDying(realPet.pod) {
glog.V(2).Infof("StatefulSet %v deleting pet %v/%v", pet.parent.Name, pet.pod.Namespace, pet.pod.Name)
return p.petClient.Delete(pet)
}
glog.V(4).Infof("StatefulSet %v waiting on pet %v to die in %v", pet.parent.Name, realPet.pod.Name, realPet.pod.DeletionTimestamp)
return nil
}
// petClient is a client for managing pets.
type petClient interface {
pvcClient
petHealthChecker
Delete(*pcb) error
Get(*pcb) (*pcb, bool, error)
Create(*pcb) error
Update(*pcb, *pcb) error
}
// apiServerPetClient is a statefulset aware Kubernetes client.
type apiServerPetClient struct {
c internalclientset.Interface
recorder record.EventRecorder
petHealthChecker
}
// Get gets the pet in the pcb from the apiserver.
func (p *apiServerPetClient) Get(pet *pcb) (*pcb, bool, error) {
ns := pet.parent.Namespace
pod, err := p.c.Core().Pods(ns).Get(pet.pod.Name)
if errors.IsNotFound(err) {
return nil, false, nil
}
if err != nil {
return nil, false, err
}
realPet := *pet
realPet.pod = pod
return &realPet, true, nil
}
// Delete deletes the pet in the pcb from the apiserver.
func (p *apiServerPetClient) Delete(pet *pcb) error {
err := p.c.Core().Pods(pet.parent.Namespace).Delete(pet.pod.Name, nil)
if errors.IsNotFound(err) {
err = nil
}
p.event(pet.parent, "Delete", fmt.Sprintf("pet: %v", pet.pod.Name), err)
return err
}
// Create creates the pet in the pcb.
func (p *apiServerPetClient) Create(pet *pcb) error {
_, err := p.c.Core().Pods(pet.parent.Namespace).Create(pet.pod)
p.event(pet.parent, "Create", fmt.Sprintf("pet: %v", pet.pod.Name), err)
return err
}
// Update updates the pet in the 'pet' pcb to match the pet in the 'expectedPet' pcb.
// If the pod object of a pet which to be updated has been changed in server side, we
// will get the actual value and set pet identity before retries.
func (p *apiServerPetClient) Update(pet *pcb, expectedPet *pcb) (updateErr error) {
pc := p.c.Core().Pods(pet.parent.Namespace)
for i := 0; ; i++ {
updatePod, needsUpdate, err := copyPetID(pet, expectedPet)
if err != nil || !needsUpdate {
return err
}
glog.V(4).Infof("Resetting pet %v/%v to match StatefulSet %v spec", pet.pod.Namespace, pet.pod.Name, pet.parent.Name)
_, updateErr = pc.Update(&updatePod)
if updateErr == nil || i >= updateRetries {
return updateErr
}
getPod, getErr := pc.Get(updatePod.Name)
if getErr != nil {
return getErr
}
pet.pod = getPod
}
}
// DeletePVCs should delete PVCs, when implemented.
func (p *apiServerPetClient) DeletePVCs(pet *pcb) error {
// TODO: Implement this when we delete pvcs.
return nil
}
func (p *apiServerPetClient) getPVC(pvcName, pvcNamespace string) (*api.PersistentVolumeClaim, error) {
pvc, err := p.c.Core().PersistentVolumeClaims(pvcNamespace).Get(pvcName)
return pvc, err
}
func (p *apiServerPetClient) createPVC(pvc *api.PersistentVolumeClaim) error {
_, err := p.c.Core().PersistentVolumeClaims(pvc.Namespace).Create(pvc)
return err
}
// SyncPVCs syncs pvcs in the given pcb.
func (p *apiServerPetClient) SyncPVCs(pet *pcb) error {
errmsg := ""
// Create new claims.
for i, pvc := range pet.pvcs {
_, err := p.getPVC(pvc.Name, pet.parent.Namespace)
if err != nil {
if errors.IsNotFound(err) {
var err error
if err = p.createPVC(&pet.pvcs[i]); err != nil {
errmsg += fmt.Sprintf("Failed to create %v: %v", pvc.Name, err)
}
p.event(pet.parent, "Create", fmt.Sprintf("pvc: %v", pvc.Name), err)
} else {
errmsg += fmt.Sprintf("Error trying to get pvc %v, %v.", pvc.Name, err)
}
}
// TODO: Check resource requirements and accessmodes, update if necessary
}
if len(errmsg) != 0 {
return fmt.Errorf("%v", errmsg)
}
return nil
}
// event formats an event for the given runtime object.
func (p *apiServerPetClient) event(obj runtime.Object, reason, msg string, err error) {
if err != nil {
p.recorder.Eventf(obj, api.EventTypeWarning, fmt.Sprintf("Failed%v", reason), fmt.Sprintf("%v, error: %v", msg, err))
} else {
p.recorder.Eventf(obj, api.EventTypeNormal, fmt.Sprintf("Successful%v", reason), msg)
}
}
// petHealthChecker is an interface to check pet health. It makes a boolean
// decision based on the given pod.
type petHealthChecker interface {
isHealthy(*api.Pod) bool
isDying(*api.Pod) bool
}
// defaultPetHealthChecks does basic health checking.
// It doesn't update, probe or get the pod.
type defaultPetHealthChecker struct{}
// isHealthy returns true if the pod is ready & running. If the pod has the
// "pod.alpha.kubernetes.io/initialized" annotation set to "false", pod state is ignored.
func (d *defaultPetHealthChecker) isHealthy(pod *api.Pod) bool {
if pod == nil || pod.Status.Phase != api.PodRunning {
return false
}
podReady := api.IsPodReady(pod)
// User may have specified a pod readiness override through a debug annotation.
initialized, ok := pod.Annotations[StatefulSetInitAnnotation]
if ok {
if initAnnotation, err := strconv.ParseBool(initialized); err != nil {
glog.V(4).Infof("Failed to parse %v annotation on pod %v: %v", StatefulSetInitAnnotation, pod.Name, err)
} else if !initAnnotation {
glog.V(4).Infof("StatefulSet pod %v waiting on annotation %v", pod.Name, StatefulSetInitAnnotation)
podReady = initAnnotation
}
}
return podReady
}
// isDying returns true if the pod has a non-nil deletion timestamp. Since the
// timestamp can only decrease, once this method returns true for a given pet, it
// will never return false.
func (d *defaultPetHealthChecker) isDying(pod *api.Pod) bool {
return pod != nil && pod.DeletionTimestamp != nil
}