forked from go-distributed/meritop
/
healthy.go
106 lines (98 loc) · 2.67 KB
/
healthy.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package etcdutil
import (
"fmt"
"log"
"math/rand"
"path"
"strconv"
"time"
"github.com/coreos/go-etcd/etcd"
)
// Heartbeat repeatedly writes the value "health" to TaskHealthyPath(name, taskID)
// with a TTL derived from interval (see computeTTL), pausing for interval
// between writes.
//
// It returns the first error reported by client.Set, or nil once a value can be
// received from stop (including when stop is closed).
func Heartbeat(client *etcd.Client, name string, taskID uint64, interval time.Duration, stop chan struct{}) error {
	// The TTL depends only on interval, so compute it once up front.
	ttl := computeTTL(interval)
	for {
		if _, err := client.Set(TaskHealthyPath(name, taskID), "health", ttl); err != nil {
			return err
		}
		select {
		case <-stop:
			return nil
		case <-time.After(interval):
			// Interval elapsed; refresh the key on the next iteration.
		}
	}
}
// DetectFailure watches HealthyPath(name) recursively and, for every "expire"
// or "delete" event, calls ReportFailure with the last path element of the
// event's key as the failed task.
//
// Errors from ReportFailure are written to logger and do not stop the loop.
// The function returns nil once the event channel handed to client.Watch is
// closed (presumably when the watch ends, e.g. via stop — confirm against the
// etcd client). The error returned by client.Watch itself is not inspected.
func DetectFailure(client *etcd.Client, name string, stop chan bool, logger *log.Logger) error {
	events := make(chan *etcd.Response, 1)
	go client.Watch(HealthyPath(name), 0, true, events, stop)
	for ev := range events {
		switch ev.Action {
		case "expire", "delete":
			failed := path.Base(ev.Node.Key)
			if err := ReportFailure(client, name, failed); err != nil {
				logger.Printf("ReportFailure returns error: %v", err)
			}
		}
	}
	return nil
}
// ReportFailure reports a failed task to the etcd cluster by setting the key
// FreeTaskPath(name, failedTask) to "failed" with a TTL argument of 0.
// If a framework detects a failure, it tries to report it under
// /FreeTasks/{taskID} (exact layout is determined by FreeTaskPath).
//
// It returns whatever error client.Set reports, or nil on success.
func ReportFailure(client *etcd.Client, name, failedTask string) error {
	if _, err := client.Set(FreeTaskPath(name, failedTask), "failed", 0); err != nil {
		return err
	}
	return nil
}
// WaitFreeTask blocks until it gets a hint of a free task and returns that
// task's ID.
//
// It first lists FreeTaskDir(name). If entries already exist, one is chosen at
// random and its ID is returned immediately. Otherwise it watches the directory
// recursively, starting just past the etcd index of the listing, until a "set"
// event arrives, and returns the ID taken from that event's key.
//
// IDs are the last path element of the key, parsed as base-10 unsigned
// integers. An error is returned if the listing fails, if an ID cannot be
// parsed, or if no "set" event is seen within 10 seconds. A failing watch is
// only logged; the caller still observes it as the timeout error.
func WaitFreeTask(client *etcd.Client, name string, logger *log.Logger) (uint64, error) {
	slots, err := client.Get(FreeTaskDir(name), false, true)
	if err != nil {
		return 0, err
	}
	if total := len(slots.Node.Nodes); total > 0 {
		ri := rand.Intn(total)
		s := slots.Node.Nodes[ri]
		// Parse as base 10, the same as the watch path below. Base 0 would
		// interpret a zero-prefixed ID such as "010" as octal and make the two
		// paths disagree on the same key.
		id, err := strconv.ParseUint(path.Base(s.Key), 10, 64)
		if err != nil {
			return 0, err
		}
		logger.Printf("got failures %v at index %d, randomly choose %d to try...", ListKeys(slots.Node.Nodes), slots.EtcdIndex, ri)
		return id, nil
	}

	// Closing stop on return cancels any in-flight watch, so the goroutine
	// below (and its long-poll request) does not outlive this call when the
	// timeout fires.
	stop := make(chan bool)
	defer close(stop)

	// Buffered so the watcher can always deliver its result and exit, even if
	// this function has already returned.
	respChan := make(chan *etcd.Response, 1)
	go func(watchIndex uint64) {
		for {
			logger.Printf("start to wait failure at index %d", watchIndex)
			resp, err := client.Watch(FreeTaskDir(name), watchIndex, true, nil, stop)
			if err != nil {
				select {
				case <-stop:
					// Cancelled by our own return; not worth a warning.
				default:
					logger.Printf("WARN: WaitFailure watch failed: %v", err)
				}
				return
			}
			if resp.Action == "set" {
				respChan <- resp
				return
			}
			// Resume right after the event just delivered. The response's
			// EtcdIndex is the cluster index at request time and may be ahead
			// of this event, so using it could skip a "set" in between.
			if resp.Node != nil {
				watchIndex = resp.Node.ModifiedIndex + 1
			} else {
				watchIndex = resp.EtcdIndex + 1
			}
		}
	}(slots.EtcdIndex + 1)

	var resp *etcd.Response
	select {
	case resp = <-respChan:
	case <-time.After(10 * time.Second):
		return 0, fmt.Errorf("WaitFailure timeout!")
	}
	id, err := strconv.ParseUint(path.Base(resp.Node.Key), 10, 64)
	if err != nil {
		return 0, err
	}
	return id, nil
}
// computeTTL derives the TTL value passed to client.Set from a heartbeat
// interval: three times the interval's whole seconds, with a floor of 3 for
// any interval shorter than one second (including zero and negative values).
// Fractional seconds are truncated before multiplying.
func computeTTL(interval time.Duration) uint64 {
	secs := interval / time.Second
	if secs >= 1 {
		return uint64(secs) * 3
	}
	return 3
}