Skip to content

Commit

Permalink
scheduler: bugfix for slow-trend-scheduler when timestamp of heartbea…
Browse files Browse the repository at this point in the history
…ts are equal (#6273)

close #6307

Update the strategy on the judgement whether all timestamps have been updated or not when detecting slow stores.

Co-authored-by: Ti Chi Robot <ti-community-prow-bot@tidb.io>
  • Loading branch information
LykxSassinator and ti-chi-bot committed Apr 13, 2023
1 parent cb3bd5f commit 4201238
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions pkg/schedule/schedulers/evict_slow_trend.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ func (conf *evictSlowTrendSchedulerConfig) candidate() uint64 {
return conf.evictCandidate
}

func (conf *evictSlowTrendSchedulerConfig) captureTS() time.Time {
return conf.candidateCaptureTime
}

func (conf *evictSlowTrendSchedulerConfig) candidateCapturedSecs() uint64 {
return uint64(time.Since(conf.candidateCaptureTime).Seconds())
}
Expand Down Expand Up @@ -237,7 +241,8 @@ func (s *evictSlowTrendScheduler) Schedule(cluster schedule.Cluster, dryRun bool
storeSlowTrendActionStatusGauge.WithLabelValues("cand.cancel:too-faster").Inc()
return ops, nil
}
if !checkStoresAreUpdated(cluster, slowStore) {
slowStoreRecordTS := s.conf.captureTS()
if !checkStoresAreUpdated(cluster, slowStoreID, slowStoreRecordTS) {
log.Info("slow store candidate waiting for other stores to update heartbeats",
zap.Uint64("store-id", slowStoreID))
storeSlowTrendActionStatusGauge.WithLabelValues("cand.wait").Inc()
Expand Down Expand Up @@ -322,18 +327,17 @@ func chooseEvictCandidate(cluster schedule.Cluster) (slowStore *core.StoreInfo)
}

storeSlowTrendActionStatusGauge.WithLabelValues("cand.add").Inc()
log.Info("evict-slow-trend-scheduler canptured candidate", zap.Uint64("store-id", store.GetID()))
log.Info("evict-slow-trend-scheduler captured candidate", zap.Uint64("store-id", store.GetID()))
return store
}

func checkStoresAreUpdated(cluster schedule.Cluster, baseline *core.StoreInfo) bool {
func checkStoresAreUpdated(cluster schedule.Cluster, slowStoreID uint64, slowStoreRecordTS time.Time) bool {
stores := cluster.GetStores()
if len(stores) <= 1 {
return false
}
expected := (len(stores) + 1) / 2
updatedStores := 0
baselineTS := baseline.GetLastHeartbeatTS()
for _, store := range stores {
if store.IsRemoved() {
updatedStores += 1
Expand All @@ -343,11 +347,11 @@ func checkStoresAreUpdated(cluster schedule.Cluster, baseline *core.StoreInfo) b
updatedStores += 1
continue
}
if store.GetID() == baseline.GetID() {
if store.GetID() == slowStoreID {
updatedStores += 1
continue
}
if baselineTS.Before(store.GetLastHeartbeatTS()) {
if slowStoreRecordTS.Before(store.GetLastHeartbeatTS()) {
updatedStores += 1
}
}
Expand Down

0 comments on commit 4201238

Please sign in to comment.