forked from pachyderm/pachyderm
-
Notifications
You must be signed in to change notification settings - Fork 1
/
etcd_client.go
284 lines (263 loc) · 6.78 KB
/
etcd_client.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
package discovery
import (
"fmt"
"io/ioutil"
"net/http"
"strings"
"time"
"github.com/coreos/go-etcd/etcd"
)
// etcdClient wraps a go-etcd client; the methods defined on it in this file
// delegate to the wrapped client.
type etcdClient struct {
// client is the underlying go-etcd client that every method forwards to.
client *etcd.Client
}
// customCheckRetry is a fork of etcd's DefaultCheckRetry that allows many more
// retries before giving up. Pachyderm often starts before etcd is ready, and
// retrying pachd's connection to etcd in a tight loop is usually much faster
// than waiting for kubernetes to restart the pachd pod.
//
// It returns nil when the request should be retried and a non-nil
// *etcd.EtcdError when the caller should give up:
//   - more than max(600, 2*len(cluster.Machines)) requests were already made;
//   - the last response carried a status code other than 0 or 500.
//
// A status code of 0 (no response received) is retried immediately; a 500 is
// retried after a 500ms pause and a warning printed to stdout.
func customCheckRetry(cluster *etcd.Cluster, numReqs int, lastResp http.Response,
	err error) error {
	// At least 600 attempts (five minutes' worth when each attempt hits the
	// 500ms sleep below), or two per machine if the cluster is very large.
	retryBudget := 600
	if perMachine := 2 * len(cluster.Machines); perMachine > retryBudget {
		retryBudget = perMachine
	}
	if numReqs > retryBudget {
		return &etcd.EtcdError{
			ErrorCode: etcd.ErrCodeEtcdNotReachable,
			Message:   "All the given peers are not reachable",
			Cause:     fmt.Sprintf("failed to propose on members %v [last error: %v]", cluster.Machines, err),
			Index:     0,
		}
	}

	switch lastResp.StatusCode {
	case 0:
		// No response at all: always retry.
		return nil
	case http.StatusInternalServerError:
		// Sleep for a while and expect leader election to finish.
		time.Sleep(500 * time.Millisecond)
		fmt.Println("Warning: bad response status code ", lastResp.StatusCode)
		return nil
	}

	// Any other status means etcd is no longer in leader election, so
	// something is wrong. Report the status together with whatever body can
	// be read ("nil" if there is none or reading fails).
	payload := []byte("nil")
	if lastResp.Body != nil {
		if raw, readErr := ioutil.ReadAll(lastResp.Body); readErr == nil {
			payload = raw
		}
	}
	return &etcd.EtcdError{
		ErrorCode: etcd.ErrCodeUnhandledHTTPStatus,
		Message:   "Unhandled HTTP Status",
		Cause:     fmt.Sprintf("unhandled http status [%s] with body [%s]", http.StatusText(lastResp.StatusCode), payload),
		Index:     0,
	}
}
// newEtcdClient builds an etcdClient for the given etcd addresses and installs
// customCheckRetry as the underlying client's retry policy.
func newEtcdClient(addresses ...string) *etcdClient {
	wrapped := &etcdClient{client: etcd.NewClient(addresses)}
	wrapped.client.CheckRetry = customCheckRetry
	return wrapped
}
// Close closes the underlying etcd client. It always returns nil.
func (c *etcdClient) Close() error {
	c.client.Close()
	var noErr error
	return noErr
}
// Get returns the value of the node stored at key, or the error reported by
// the underlying client. Both boolean options of the client's Get are passed
// as false (presumably sort and recursive in go-etcd — confirm).
func (c *etcdClient) Get(key string) (string, error) {
	resp, getErr := c.client.Get(key, false, false)
	if getErr == nil {
		return resp.Node.Value, nil
	}
	return "", getErr
}
// GetAll reads the node at key and flattens it, via nodeToMap, into a map from
// key (leading "/" removed) to value.
//
// If the lookup fails with an error whose text starts with
// "100: Key not found", an empty non-nil map is returned with a nil error.
// Any other error is returned with a nil map.
func (c *etcdClient) GetAll(key string) (map[string]string, error) {
	resp, getErr := c.client.Get(key, false, true)
	flattened := make(map[string]string, 0)
	switch {
	case getErr == nil:
		nodeToMap(resp.Node, flattened)
		return flattened, nil
	case strings.HasPrefix(getErr.Error(), "100: Key not found"):
		// A missing key is reported as "nothing stored", not as a failure.
		return flattened, nil
	default:
		return nil, getErr
	}
}
// Watch runs watchWithoutRetry in a loop, restarting it whenever it fails with
// an *etcd.EtcdError whose code is 401 or 501, and returning any other error.
// The retry is needed for when the etcd cluster gets overloaded.
func (c *etcdClient) Watch(key string, cancel chan bool, callBack func(string) error) error {
	for {
		watchErr := c.watchWithoutRetry(key, cancel, callBack)
		if watchErr == nil {
			continue
		}
		if etcdErr, isEtcdErr := watchErr.(*etcd.EtcdError); isEtcdErr {
			switch etcdErr.ErrorCode {
			// NOTE(review): 401 is presumably etcd's "event index cleared"
			// code and 501 presumably etcd.ErrCodeEtcdNotReachable — confirm.
			case 401, 501:
				continue
			}
		}
		return watchErr
	}
}
// WatchAll runs watchAllWithoutRetry in a loop, restarting it whenever it
// fails with an *etcd.EtcdError whose code is 401 or 501, and returning any
// other error.
func (c *etcdClient) WatchAll(key string, cancel chan bool, callBack func(map[string]string) error) error {
	for {
		watchErr := c.watchAllWithoutRetry(key, cancel, callBack)
		if watchErr == nil {
			continue
		}
		if etcdErr, isEtcdErr := watchErr.(*etcd.EtcdError); isEtcdErr {
			switch etcdErr.ErrorCode {
			// NOTE(review): 401 is presumably etcd's "event index cleared"
			// code and 501 presumably etcd.ErrCodeEtcdNotReachable — confirm.
			case 401, 501:
				continue
			}
		}
		return watchErr
	}
}
// Set forwards key, value and ttl to the underlying client's Set and returns
// only its error, discarding the response.
func (c *etcdClient) Set(key string, value string, ttl uint64) error {
	_, setErr := c.client.Set(key, value, ttl)
	return setErr
}
// Create forwards key, value and ttl to the underlying client's Create and
// returns only its error, discarding the response.
func (c *etcdClient) Create(key string, value string, ttl uint64) error {
	_, createErr := c.client.Create(key, value, ttl)
	return createErr
}
// CreateInDir forwards dir, value and ttl to the underlying client's
// CreateInOrder and returns only its error, discarding the response.
func (c *etcdClient) CreateInDir(dir string, value string, ttl uint64) error {
	_, createErr := c.client.CreateInOrder(dir, value, ttl)
	return createErr
}
// Delete calls the underlying client's Delete for key, passing false as its
// boolean option (presumably "recursive" in go-etcd — confirm), and returns
// only its error.
func (c *etcdClient) Delete(key string) error {
	_, deleteErr := c.client.Delete(key, false)
	return deleteErr
}
// CheckAndDelete calls the underlying client's CompareAndDelete with key and
// oldValue and returns only its error. The trailing 0 is presumably a
// "previous index" argument that disables index comparison — confirm.
func (c *etcdClient) CheckAndDelete(key string, oldValue string) error {
	_, deleteErr := c.client.CompareAndDelete(key, oldValue, 0)
	return deleteErr
}
// CheckAndSet writes value at key with the given ttl.
//
// An empty oldValue means "no previous value": the key is written with
// Create. Otherwise the write goes through CompareAndSwap against oldValue.
// Only the error from the underlying call is returned.
func (c *etcdClient) CheckAndSet(key string, value string, ttl uint64, oldValue string) error {
	if oldValue != "" {
		_, swapErr := c.client.CompareAndSwap(key, value, ttl, oldValue, 0)
		return swapErr
	}
	_, createErr := c.client.Create(key, value, ttl)
	return createErr
}
// nodeToMap folds the contents of node into out and reports whether out was
// modified. It may be called repeatedly on the same map with successive watch
// results to accumulate the current state.
//
// For a directory node it recurses into every child. For a leaf node the map
// key is node.Key with a leading "/" removed; an empty Value removes that key
// from out, and a non-empty Value is stored under it.
func nodeToMap(node *etcd.Node, out map[string]string) bool {
	if node.Dir {
		modified := false
		for _, child := range node.Nodes {
			// Every child is visited, even once a change has been seen.
			if nodeToMap(child, out) {
				modified = true
			}
		}
		return modified
	}

	name := strings.TrimPrefix(node.Key, "/")
	current, present := out[name]
	if node.Value == "" {
		// An empty value is treated as "key is gone".
		if !present {
			return false
		}
		delete(out, name)
		return true
	}
	if present && current == node.Value {
		return false
	}
	out[name] = node.Value
	return true
}
// watchWithoutRetry reports the value at key to callBack, first as it is now
// and then after every change seen by the underlying client's Watch.
//
// The initial Get decides the starting point: on success callBack receives the
// current value and watching resumes just after the node's ModifiedIndex; if
// the error text starts with "100: Key not found", callBack receives "" and
// watching starts at index 1; any other error is returned.
//
// The function only returns on error: a callBack error, a Watch error, or
// ErrCancelled when the watch is stopped through cancel.
func (c *etcdClient) watchWithoutRetry(key string, cancel chan bool, callBack func(string) error) error {
	var nextIndex uint64 = 1

	// First deliver the starting value of the key.
	initial, getErr := c.client.Get(key, false, false)
	switch {
	case getErr == nil:
		if cbErr := callBack(initial.Node.Value); cbErr != nil {
			return cbErr
		}
		nextIndex = initial.Node.ModifiedIndex + 1
	case strings.HasPrefix(getErr.Error(), "100: Key not found"):
		if cbErr := callBack(""); cbErr != nil {
			return cbErr
		}
	default:
		return getErr
	}

	for {
		event, watchErr := c.client.Watch(key, nextIndex, false, nil, cancel)
		if watchErr == etcd.ErrWatchStoppedByUser {
			return ErrCancelled
		}
		if watchErr != nil {
			return watchErr
		}
		if cbErr := callBack(event.Node.Value); cbErr != nil {
			return cbErr
		}
		// Resume right after the event that was just delivered.
		nextIndex = event.Node.ModifiedIndex + 1
	}
}
// watchAllWithoutRetry keeps a map of everything stored under key and passes
// it to callBack whenever nodeToMap reports that the map changed.
//
// The initial state comes from a recursive Get, matching GetAll and the
// recursive Watch that follows, so nested keys are part of the starting map.
// If the Get fails with an error whose text starts with "100: Key not found",
// callBack receives nil; any other Get error is returned.
//
// The function only returns on error: a callBack error, a Watch error, or
// ErrCancelled when the watch is stopped through cancel.
func (c *etcdClient) watchAllWithoutRetry(key string, cancel chan bool, callBack func(map[string]string) error) error {
	var waitIndex uint64 = 1
	value := make(map[string]string)

	// First get the starting value of the whole subtree under key.
	response, err := c.client.Get(key, false, true)
	switch {
	case err == nil:
		// After a Get, the response's EtcdIndex is the point the snapshot was
		// taken at, so watching resumes right after it.
		waitIndex = response.EtcdIndex + 1
		if nodeToMap(response.Node, value) {
			if err := callBack(value); err != nil {
				return err
			}
		}
	case strings.HasPrefix(err.Error(), "100: Key not found"):
		// NOTE(review): waitIndex stays at 1 here, so the watch replays from
		// the oldest retained history — confirm this is intended.
		if err := callBack(nil); err != nil {
			return err
		}
	default:
		return err
	}

	for {
		response, err := c.client.Watch(key, waitIndex, true, nil, cancel)
		if err != nil {
			if err == etcd.ErrWatchStoppedByUser {
				return ErrCancelled
			}
			return err
		}
		// Resume right after the event just received. The response's
		// EtcdIndex must not be used here: for a watch it is the index at the
		// time the watch started, not the index of the delivered event, so it
		// could skip or re-deliver events. This matches watchWithoutRetry.
		waitIndex = response.Node.ModifiedIndex + 1
		if nodeToMap(response.Node, value) {
			if err := callBack(value); err != nil {
				return err
			}
		}
	}
}