-
Notifications
You must be signed in to change notification settings - Fork 315
/
schedule_job.go
160 lines (143 loc) · 5.58 KB
/
schedule_job.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// Copyright 2019 The Vearch Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"encoding/binary"
"errors"
"time"
"github.com/vearch/vearch/internal/client"
"github.com/vearch/vearch/internal/entity"
"github.com/vearch/vearch/internal/pkg/log"
"github.com/vearch/vearch/internal/proto/vearchpb"
"go.etcd.io/etcd/client/v3/concurrency"
)
// CronInterval is the offset that CleanTask adds to the current UnixNano
// timestamp when it stores the next clean-job deadline.
// NOTE(review): because it is added to a nanosecond timestamp without scaling,
// the effective offset is 60ns — confirm whether seconds were intended.
const CronInterval = 60
// walkPartitions checks every given partition against its owning space and
// deletes the partition record from the master store when the space no longer
// exists (either the lookup reports SPACE_NOT_EXIST or it returns a nil space).
// Any other lookup failure is only logged; the partition is left untouched.
// Deletion failures are logged and do not stop the walk.
func walkPartitions(masterServer *Server, partitions []*entity.Partition) {
	ctx := masterServer.ctx

	// removeOrphan drops the stored record of a partition whose space is gone.
	removeOrphan := func(partition *entity.Partition) {
		log.Warnf("Could not find Space contains partition,PartitionID:[%d] so remove it from etcd!", partition.Id)
		partitionKey := entity.PartitionKey(partition.Id)
		if err := masterServer.client.Master().Delete(ctx, partitionKey); err != nil {
			// Always log through an explicit format string: the error text
			// may itself contain '%' and must not be interpreted as verbs.
			log.Warnf("error:%s", err.Error())
		}
	}

	log.Debug("Start Walking Partitions!")
	for _, partition := range partitions {
		space, err := masterServer.client.Master().QuerySpaceByID(ctx, partition.DBId, partition.SpaceId)
		switch {
		case err == nil:
			if space == nil {
				removeOrphan(partition)
			}
		case vearchpb.NewError(vearchpb.ErrorEnum_INTERNAL_ERROR, err).GetError().Code == vearchpb.ErrorEnum_SPACE_NOT_EXIST:
			removeOrphan(partition)
		default:
			log.Warnf("Failed to find space according dbid:[%d] spaceid:[%d] partitionID:[%d] err:[%s]", partition.DBId, partition.SpaceId, partition.Id, err.Error())
		}
	}
	log.Debug("Complete Walking Partitions!")
}
// walkSpaces checks every given space against its owning database key and
// deletes the space record when the database lookup succeeds but returns
// nothing. Lookup and deletion failures are logged and the walk continues.
func walkSpaces(masterServer *Server, spaces []*entity.Space) {
	ctx := masterServer.ctx
	log.Debug("Start Walking Spaces!")
	for _, space := range spaces {
		dbKey := entity.DBKeyBody(space.DBId)
		db, err := masterServer.client.Master().Get(ctx, dbKey)
		if err != nil {
			log.Warnf("Failed to get key[%s] from etcd, err: [%s]", dbKey, err.Error())
			continue
		}
		if db != nil {
			continue
		}
		// %v rather than %s for the ID: sibling code formats space IDs with
		// %d, so %s would render as "%!s(...)" for a numeric ID.
		log.Warnf("Could not find database contains space, SpaceName: %s, SpaceID: %v, so remove it!", space.Name, space.Id)
		spaceKey := entity.SpaceKey(space.DBId, space.Id)
		if err := masterServer.client.Master().Delete(ctx, spaceKey); err != nil {
			log.Warnf("error: %s", err.Error())
		}
	}
	log.Debug("Complete Walking Spaces!")
}
// removePartition asks the partition server at partitionServerRpcAddr to
// delete partition pid and returns whatever error client.DeletePartition
// reports.
func removePartition(partitionServerRpcAddr string, pid entity.PartitionID) error {
	// %v for pid: callers in this file format partition IDs with %d/%v, so
	// %s would print "%!s(...)" for a numeric ID.
	log.Debugf("Removing partition:[%v] from ps:[%s]", pid, partitionServerRpcAddr)
	return client.DeletePartition(partitionServerRpcAddr, pid)
}
// walkServers goes through every partition ID listed on each given server and
// looks the partition up through the master client. When the lookup reports
// PARTITION_NOT_EXIST the partition is removed from that server; any other
// lookup error is logged only. Removal failures are logged and do not stop
// the walk.
func walkServers(masterServer *Server, servers []*entity.Server) {
	ctx := masterServer.ctx
	log.Debug("Start Walking Servers!")
	for _, srv := range servers {
		for _, partitionID := range srv.PartitionIds {
			_, queryErr := masterServer.client.Master().QueryPartition(ctx, partitionID)
			if queryErr == nil {
				// Partition is known to the master; nothing to clean up.
				continue
			}
			code := vearchpb.NewError(vearchpb.ErrorEnum_INTERNAL_ERROR, queryErr).GetError().Code
			if code != vearchpb.ErrorEnum_PARTITION_NOT_EXIST {
				log.Warnf("Failed to find partition: %v, allocated on server: %v, err: %v", partitionID, srv.ID, queryErr)
				continue
			}
			log.Warnf("to remove partition:%d", partitionID)
			if rmErr := removePartition(srv.RpcAddr(), partitionID); rmErr != nil {
				log.Warnf("Failed to remove partition: %v allocated on server: %v, and err is:%v", partitionID, srv.ID, rmErr)
			}
		}
	}
	log.Debug("Complete Walking Servers!")
}
// errSkipJob is returned from the CleanTask transaction to signal that the
// stored deadline has not passed yet and this run must be skipped.
var errSkipJob = errors.New("skip job")

// CleanTask runs one round of cluster cleanup: it walks all partitions, spaces
// and servers known to the master and removes records that no longer have an
// owner.
//
// Before doing any work it consults entity.ClusterCleanJobKey inside an STM
// transaction. If the key holds a deadline that is still in the future the
// round is skipped; otherwise a new deadline is stored and the round proceeds.
// If the key is empty the round proceeds without storing a deadline.
// Failures of the individual query steps are logged and do not prevent the
// remaining steps from running.
func CleanTask(masterServer *Server) {
	err := masterServer.client.Master().STM(masterServer.ctx, func(stm concurrency.STM) error {
		timeBytes := stm.Get(entity.ClusterCleanJobKey)
		if len(timeBytes) == 0 {
			return nil
		}

		now := time.Now().UnixNano()
		// The deadline is stored below as 8 little-endian bytes, so it must be
		// decoded with Uint64; decoding only 16 bits would make the deadline
		// always look expired. A shorter (malformed) value is treated as
		// expired and overwritten instead of panicking on the decode.
		if len(timeBytes) >= 8 {
			deadline := int64(binary.LittleEndian.Uint64([]byte(timeBytes)))
			if now <= deadline {
				return errSkipJob
			}
		}

		// NOTE(review): CronInterval is added to a nanosecond timestamp
		// unscaled, so the next deadline is only 60ns away — confirm whether
		// the interval was meant to be in seconds.
		next := make([]byte, 8)
		binary.LittleEndian.PutUint64(next, uint64(now+int64(CronInterval)))
		stm.Put(entity.ClusterCleanJobKey, string(next))
		return nil
	})
	if errors.Is(err, errSkipJob) {
		log.Debug("skip clean task .....")
		return
	}
	if err != nil {
		log.Errorf("clean task has err for get ClusterCleanJobKey err: %s", err.Error())
		return
	}

	log.Debug("Start clean task")

	// process partitions
	if partitions, err := masterServer.client.Master().QueryPartitions(masterServer.ctx); err != nil {
		log.Errorf("Failed to get all partitions,err: %s", err.Error())
	} else {
		walkPartitions(masterServer, partitions)
	}

	// process spaces
	if spaces, err := masterServer.client.Master().QuerySpacesByKey(masterServer.ctx, entity.PrefixSpace); err != nil {
		log.Errorf("Failed to get all spaces,err: %s", err.Error())
	} else {
		walkSpaces(masterServer, spaces)
	}

	// process servers
	if servers, err := masterServer.client.Master().QueryServers(masterServer.ctx); err != nil {
		log.Errorf("Failed to get all servers,err: %s", err.Error())
	} else {
		walkServers(masterServer, servers)
	}
}
// WatchServerJob sets up the watch-server cache for cli by calling
// client.NewWatchServerCache with ctx. It returns the error from that call,
// or nil when the call succeeds.
func (s *Server) WatchServerJob(ctx context.Context, cli *client.Client) error {
	if watchErr := client.NewWatchServerCache(ctx, cli); watchErr != nil {
		return watchErr
	}
	return nil
}