-
Notifications
You must be signed in to change notification settings - Fork 714
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
mcs: fix watch primary address revision and update cache when meets not leader #6279
Changes from 5 commits
2c43e7a
e34a074
1b09ef4
642872e
719b6f9
4a6cb7f
c5c6b5d
596d6e4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -104,6 +104,8 @@ const ( | |
maxRetryTimesGetServicePrimary = 25 | ||
// retryIntervalGetServicePrimary is the retry interval for getting primary addr. | ||
retryIntervalGetServicePrimary = 100 * time.Millisecond | ||
// TODO: move it to etcdutil | ||
watchEtcdChangeRetryInterval = 1 * time.Second | ||
) | ||
|
||
// EtcdStartTimeout the timeout of the startup etcd. | ||
|
@@ -215,6 +217,9 @@ type Server struct { | |
registry *registry.ServiceRegistry | ||
mode string | ||
servicePrimaryMap sync.Map /* Store as map[string]string */ | ||
// updateServicePrimaryAddrCh is used to notify the server to update the service primary address. | ||
// Note: it is only used in API service mode. | ||
updateServicePrimaryAddrCh chan struct{} | ||
} | ||
|
||
// HandlerBuilder builds a server HTTP handler. | ||
|
@@ -558,7 +563,7 @@ func (s *Server) startServerLoop(ctx context.Context) { | |
go s.encryptionKeyManagerLoop() | ||
if s.IsAPIServiceMode() { // disable tso service in api server | ||
s.serverLoopWg.Add(1) | ||
go s.watchServicePrimaryAddrLoop(mcs.TSOServiceName) | ||
go s.startWatchServicePrimaryAddrLoop(mcs.TSOServiceName) | ||
} | ||
} | ||
|
||
|
@@ -1716,43 +1721,98 @@ func (s *Server) GetServicePrimaryAddr(ctx context.Context, serviceName string) | |
return "", false | ||
} | ||
|
||
func (s *Server) watchServicePrimaryAddrLoop(serviceName string) { | ||
// startWatchServicePrimaryAddrLoop starts a loop to watch the primary address of a given service. | ||
func (s *Server) startWatchServicePrimaryAddrLoop(serviceName string) { | ||
defer logutil.LogPanic() | ||
defer s.serverLoopWg.Done() | ||
ctx, cancel := context.WithCancel(s.serverLoopCtx) | ||
defer cancel() | ||
|
||
serviceKey := fmt.Sprintf("/ms/%d/%s/%s/%s", s.clusterID, serviceName, fmt.Sprintf("%05d", 0), "primary") | ||
log.Info("start to watch", zap.String("service-key", serviceKey)) | ||
|
||
primary := &tsopb.Participant{} | ||
ok, rev, err := etcdutil.GetProtoMsgWithModRev(s.client, serviceKey, primary) | ||
if err != nil { | ||
log.Error("get service primary addr failed", zap.String("service-key", serviceKey), zap.Error(err)) | ||
s.updateServicePrimaryAddrCh = make(chan struct{}, 1) | ||
serviceKey := s.servicePrimaryKey(serviceName) | ||
var ( | ||
revision int64 | ||
err error | ||
) | ||
for i := 0; i < maxRetryTimesGetServicePrimary; i++ { | ||
select { | ||
case <-ctx.Done(): | ||
return | ||
case <-time.After(retryIntervalGetServicePrimary): | ||
} | ||
Comment on lines
+1739
to
+1743
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we call |
||
revision, err = s.updateServicePrimaryAddr(serviceName) | ||
if revision != 0 && err == nil { // update success | ||
break | ||
} | ||
} | ||
listenUrls := primary.GetListenUrls() | ||
if ok && len(listenUrls) > 0 { | ||
// listenUrls[0] is the primary service endpoint of the keyspace group | ||
s.servicePrimaryMap.Store(serviceName, listenUrls[0]) | ||
} else { | ||
log.Warn("service primary addr doesn't exist", zap.String("service-key", serviceKey)) | ||
if err != nil { | ||
log.Warn("service primary addr doesn't exist", zap.String("service-key", serviceKey), zap.Error(err)) | ||
} | ||
|
||
watchChan := s.client.Watch(ctx, serviceKey, clientv3.WithPrefix(), clientv3.WithRev(rev)) | ||
log.Info("start to watch service primary addr", zap.String("service-key", serviceKey)) | ||
for { | ||
select { | ||
case <-ctx.Done(): | ||
log.Info("server is closed, exist watch service primary addr loop", zap.String("service", serviceName)) | ||
return | ||
case res := <-watchChan: | ||
for _, event := range res.Events { | ||
default: | ||
} | ||
nextRevision, err := s.watchServicePrimaryAddr(ctx, serviceName, revision) | ||
if err != nil { | ||
log.Error("watcher canceled unexpectedly and a new watcher will start after a while", | ||
zap.Int64("next-revision", nextRevision), | ||
zap.Time("retry-at", time.Now().Add(watchEtcdChangeRetryInterval)), | ||
zap.Error(err)) | ||
revision = nextRevision | ||
time.Sleep(watchEtcdChangeRetryInterval) | ||
} | ||
} | ||
} | ||
|
||
// SetServicePrimaryAddr sets the primary address directly. | ||
// Note: This function is only used for test. | ||
func (s *Server) SetServicePrimaryAddr(serviceName, addr string) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where do we use it? |
||
s.servicePrimaryMap.Store(serviceName, addr) | ||
} | ||
|
||
// watchServicePrimaryAddr watches the primary address on etcd. | ||
func (s *Server) watchServicePrimaryAddr(ctx context.Context, serviceName string, revision int64) (nextRevision int64, err error) { | ||
serviceKey := s.servicePrimaryKey(serviceName) | ||
watcher := clientv3.NewWatcher(s.client) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a lot of code with the same logic; the only difference is the key. How about abstracting it into a shared function? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, I will try it later. |
||
defer watcher.Close() | ||
|
||
for { | ||
WatchChan: | ||
watchChan := watcher.Watch(s.serverLoopCtx, serviceKey, clientv3.WithPrefix(), clientv3.WithRev(revision)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need the prefix? |
||
select { | ||
case <-ctx.Done(): | ||
return revision, nil | ||
case <-s.updateServicePrimaryAddrCh: | ||
revision, err = s.updateServicePrimaryAddr(serviceName) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Most likely, even after we update the service primary address, we still have one more problem: s.clientConns stores the forwarded hosts' grpc.ClientConn, and we never update broken connections. If a forwarded host's connection breaks, e.g., the forwarded host restarted and broke the existing connection, we'll keep retrieving the broken connection for this forwarded host. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It will try to create a new connection automatically when the existing connection is closed. |
||
if err != nil { | ||
log.Warn("update service primary addr failed", zap.String("service-key", serviceKey), zap.Error(err)) | ||
} | ||
goto WatchChan | ||
rleungx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
case wresp := <-watchChan: | ||
if wresp.CompactRevision != 0 { | ||
log.Warn("required revision has been compacted, use the compact revision", | ||
zap.Int64("required-revision", revision), | ||
zap.Int64("compact-revision", wresp.CompactRevision)) | ||
revision = wresp.CompactRevision | ||
goto WatchChan | ||
} | ||
if wresp.Err() != nil { | ||
log.Error("watcher is canceled with", | ||
zap.Int64("revision", revision), | ||
errs.ZapError(errs.ErrEtcdWatcherCancel, wresp.Err())) | ||
return revision, wresp.Err() | ||
} | ||
for _, event := range wresp.Events { | ||
lhy1024 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
switch event.Type { | ||
case clientv3.EventTypePut: | ||
primary.ListenUrls = nil // reset the field | ||
primary := &tsopb.Participant{} | ||
if err := proto.Unmarshal(event.Kv.Value, primary); err != nil { | ||
log.Error("watch service primary addr failed", zap.String("service-key", serviceKey), zap.Error(err)) | ||
} else { | ||
listenUrls = primary.GetListenUrls() | ||
listenUrls := primary.GetListenUrls() | ||
if len(listenUrls) > 0 { | ||
// listenUrls[0] is the primary service endpoint of the keyspace group | ||
s.servicePrimaryMap.Store(serviceName, listenUrls[0]) | ||
|
@@ -1761,13 +1821,34 @@ func (s *Server) watchServicePrimaryAddrLoop(serviceName string) { | |
} | ||
} | ||
case clientv3.EventTypeDelete: | ||
log.Warn("service primary is deleted", zap.String("service-key", serviceKey)) | ||
s.servicePrimaryMap.Delete(serviceName) | ||
} | ||
} | ||
revision = wresp.Header.Revision | ||
} | ||
} | ||
} | ||
|
||
// updateServicePrimaryAddr updates the primary address from etcd with get operation. | ||
func (s *Server) updateServicePrimaryAddr(serviceName string) (nextRevision int64, err error) { | ||
serviceKey := s.servicePrimaryKey(serviceName) | ||
primary := &tsopb.Participant{} | ||
ok, revision, err := etcdutil.GetProtoMsgWithModRev(s.client, serviceKey, primary) | ||
listenUrls := primary.GetListenUrls() | ||
if !ok || err != nil || len(listenUrls) == 0 { | ||
return 0, err | ||
} | ||
// listenUrls[0] is the primary service endpoint of the keyspace group | ||
s.servicePrimaryMap.Store(serviceName, listenUrls[0]) | ||
log.Info("update service primary addr", zap.String("service-key", serviceKey), zap.String("primary-addr", listenUrls[0])) | ||
return revision, nil | ||
} | ||
|
||
func (s *Server) servicePrimaryKey(serviceName string) string { | ||
return fmt.Sprintf("/ms/%d/%s/%s/%s", s.clusterID, serviceName, fmt.Sprintf("%05d", 0), "primary") | ||
} | ||
|
||
// RecoverAllocID recover alloc id. set current base id to input id | ||
func (s *Server) RecoverAllocID(ctx context.Context, id uint64) error { | ||
return s.idAllocator.SetBase(id) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that the channel is always passed to the function, then why use an optional parameter?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DispatchRequest
is also used by the TSO server, and the TSO server does not need to watch the API key.