Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tools/heartbeat: support to collect metrics #8235

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tools/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ require (
github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.19.0
github.com/prometheus/common v0.51.1
github.com/spf13/cobra v1.8.0
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.8.4
Expand Down Expand Up @@ -132,7 +133,6 @@ require (
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b // indirect
github.com/prometheus/client_model v0.6.0 // indirect
github.com/prometheus/common v0.51.1 // indirect
github.com/prometheus/procfs v0.13.0 // indirect
github.com/rs/cors v1.7.0 // indirect
github.com/samber/lo v1.37.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions tools/pd-heartbeat-bench/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ type Config struct {
ReportRatio float64 `toml:"report-ratio" json:"report-ratio"`
Sample bool `toml:"sample" json:"sample"`
Round int `toml:"round" json:"round"`
MetricsAddr string `toml:"metrics-addr" json:"metrics-addr"`
}

// NewConfig return a set of settings.
Expand All @@ -69,6 +70,7 @@ func NewConfig() *Config {
fs.StringVar(&cfg.Security.CertPath, "cert", "", "path of file that contains X509 certificate in PEM format")
fs.StringVar(&cfg.Security.KeyPath, "key", "", "path of file that contains X509 key in PEM format")
fs.Uint64Var(&cfg.InitEpochVer, "epoch-ver", 1, "the initial epoch version value")
fs.StringVar(&cfg.MetricsAddr, "metrics-addr", "127.0.0.1:9090", "the address to pull metrics")

return cfg
}
Expand Down
22 changes: 22 additions & 0 deletions tools/pd-heartbeat-bench/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net/http"
"os"
"os/signal"
"strconv"
"sync"
"sync/atomic"
"syscall"
Expand All @@ -46,6 +47,7 @@ import (
"github.com/tikv/pd/pkg/statistics"
"github.com/tikv/pd/pkg/utils/logutil"
"github.com/tikv/pd/tools/pd-heartbeat-bench/config"
"github.com/tikv/pd/tools/pd-heartbeat-bench/metrics"
"go.etcd.io/etcd/pkg/report"
"go.uber.org/zap"
)
Expand Down Expand Up @@ -482,6 +484,7 @@ func main() {
log.Fatal("initialize logger error", zap.Error(err))
}

metrics.InitMetric2Collect(cfg.MetricsAddr)
maxVersion = cfg.InitEpochVer
options := config.NewOptions(cfg)
// let PD have enough time to start
Expand Down Expand Up @@ -532,6 +535,7 @@ func main() {
select {
case <-heartbeatTicker.C:
if cfg.Round != 0 && regions.updateRound > cfg.Round {
metrics.OutputConclusion()
exit(0)
}
rep := newReport(cfg)
Expand All @@ -544,6 +548,7 @@ func main() {
wg.Add(1)
go regions.handleRegionHeartbeat(wg, streams[id], id, rep)
}
go metrics.CollectMetrics(regions.updateRound, 1*time.Second)
wg.Wait()

since := time.Since(startTime).Seconds()
Expand All @@ -559,6 +564,7 @@ func main() {
zap.Uint64("max-epoch-version", maxVersion),
)
log.Info("store heartbeat stats", zap.String("max", fmt.Sprintf("%.4fs", since)))
metrics.CollectRegionAndStoreStats(&stats, &since)
regions.update(cfg, options)
go stores.update(regions) // update stores in background, unusually region heartbeat is slower than store update.
case <-resolvedTSTicker.C:
Expand Down Expand Up @@ -689,6 +695,22 @@ func runHTTPServer(cfg *config.Config, options *config.Options) {

c.IndentedJSON(http.StatusOK, output)
})
engine.GET("metrics_collect", func(c *gin.Context) {
HuSharp marked this conversation as resolved.
Show resolved Hide resolved
second := c.Query("second")
if second == "" {
c.String(http.StatusBadRequest, "missing second")
return
}
secondInt, err := strconv.Atoi(second)
if err != nil {
c.String(http.StatusBadRequest, "invalid second")
return
}
metrics.InitMetric2Collect(cfg.MetricsAddr)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe there is no monitor if it only runs binaries.

metrics.CollectMetrics(metrics.WarmUpRound, time.Duration(secondInt)*time.Second)
c.IndentedJSON(http.StatusOK, "Successfully collect metrics")
})

engine.Run(cfg.StatusAddr)
}

Expand Down
206 changes: 206 additions & 0 deletions tools/pd-heartbeat-bench/metrics/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
// Copyright 2024 TiKV Project Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
"context"
"fmt"
"math"
"net/url"
"strings"
"time"

"github.com/pingcap/log"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"go.etcd.io/etcd/pkg/report"
"go.uber.org/zap"
)

var (
prometheusCli api.Client
avgMetrics2Collect []Metric
avgRegionStats report.Stats
avgStoreTime float64
collectRound = 1.0

metrics2Collect = []Metric{
{promSQL: cpuMetric, name: "max cpu usage(%)"},
{promSQL: memoryMetric, name: "max memory usage(G)"},
{promSQL: goRoutineMetric, name: "max go routines"},
{promSQL: hbLatency99Metric, name: "99% Heartbeat Latency(ms)"},
{promSQL: hbLatencyAvgMetric, name: "Avg Heartbeat Latency(ms)"},
}

// Prometheus SQL
cpuMetric = `max_over_time(irate(process_cpu_seconds_total{job=~".*pd.*"}[30s])[1h:30s]) * 100`
memoryMetric = `max_over_time(go_memstats_heap_inuse_bytes{job=~".*pd.*"}[1h])/1024/1024/1024`
goRoutineMetric = `max_over_time(go_goroutines{job=~".*pd.*"}[1h])`
hbLatency99Metric = `histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{}[1m])) by (le))`
hbLatencyAvgMetric = `sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_sum{}[1m])) / sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_count{}[1m])) * 1000`

// Heartbeat Performance Duration BreakDown
hbBreakdownName = "Heartbeat Performance Duration BreakDown (Accumulation)(ms)"
breakdownNames = []string{
"AsyncHotStatsDuration",
"CollectRegionStats",
"Other",
"PreCheck",
"RegionGuide",
"SaveCache_CheckOverlaps",
"SaveCache_InvalidRegion",
"SaveCache_SetRegion",
"SaveCache_UpdateSubTree",
}
hbBreakdownMetricByName = func(name string) string {
return fmt.Sprintf(`sum(rate(pd_core_region_heartbeat_breakdown_handle_duration_seconds_sum{name="%s"}[1m]))`, name)
}
)

type Metric struct {
promSQL string
name string
value float64
}

func InitMetric2Collect(endpoint string) {
for _, name := range breakdownNames {
metrics2Collect = append(metrics2Collect, Metric{
promSQL: hbBreakdownMetricByName(name),
name: fmt.Sprintf("%s with %s", hbBreakdownName, name),
})
}
avgMetrics2Collect = metrics2Collect

if j := strings.Index(endpoint, "//"); j == -1 {
endpoint = "http://" + endpoint
}
cu, err := url.Parse(endpoint)
if err != nil {
log.Error("parse prometheus url error", zap.Error(err))
return
}
prometheusCli, err = NewPrometheusClient(*cu)
if err != nil {
log.Error("create prometheus client error", zap.Error(err))
return
}
}

func NewPrometheusClient(prometheusURL url.URL) (api.Client, error) {
client, err := api.NewClient(api.Config{
Address: prometheusURL.String(),
})
if err != nil {
return nil, err
}

return client, nil
}

// wait for the first round to warm up
const WarmUpRound = 1

func CollectMetrics(curRound int, wait time.Duration) {
if curRound < WarmUpRound {
return
}
// retry 5 times to get average value
res := make([]struct {
sum float64
count int
}, len(metrics2Collect))
for i := 0; i < 5; i++ {
for j, m := range metrics2Collect {
r, err := getMetric(prometheusCli, m.promSQL, time.Now())
if err != nil {
log.Error("get metric error", zap.String("name", m.name), zap.String("prom sql", m.promSQL), zap.Error(err))
} else if len(r) > 0 {
res[j].sum += r[0]
res[j].count += 1
}
}
time.Sleep(wait)
}
getRes := func(index int) float64 {
if res[index].count == 0 {
return 0
}
return res[index].sum / float64(res[index].count)
}
for i := 0; i < len(metrics2Collect); i++ {
metrics2Collect[i].value = getRes(i)
avgMetrics2Collect[i].value = (avgMetrics2Collect[i].value*collectRound + metrics2Collect[i].value) / (collectRound + 1)
}

collectRound += 1
log.Info("metrics collected", zap.String("metrics", formatMetrics(metrics2Collect)))
}

func getMetric(cli api.Client, query string, ts time.Time) ([]float64, error) {
httpAPI := v1.NewAPI(cli)
val, _, err := httpAPI.Query(context.Background(), query, ts)
if err != nil {
return nil, err
}
valMatrix := val.(model.Vector)
if len(valMatrix) == 0 {
return nil, nil
}
var value []float64
for i := range valMatrix {
value = append(value, float64(valMatrix[i].Value))
// judge whether exceeded float maximum value
if math.IsNaN(value[i]) {
return nil, fmt.Errorf("prometheus query result exceeded float maximum value, result=%s", valMatrix[i].String())
}
}
return value, nil
}

func CollectRegionAndStoreStats(regionStats *report.Stats, storeTime *float64) {
if regionStats != nil && storeTime != nil {
collect(*regionStats, *storeTime)
}
}

func collect(regionStats report.Stats, storeTime float64) {
average := func(avg, new float64) float64 {
return (avg*collectRound + new) / (collectRound + 1)
}

avgRegionStats.Total = time.Duration(average(float64(avgRegionStats.Total), float64(regionStats.Total)))
avgRegionStats.Average = average(avgRegionStats.Average, regionStats.Average)
avgRegionStats.Stddev = average(avgRegionStats.Stddev, regionStats.Stddev)
avgRegionStats.Fastest = average(avgRegionStats.Fastest, regionStats.Fastest)
avgRegionStats.Slowest = average(avgRegionStats.Slowest, regionStats.Slowest)
avgRegionStats.RPS = average(avgRegionStats.RPS, regionStats.RPS)
avgStoreTime = average(avgStoreTime, storeTime)
}

func formatMetrics(ms []Metric) string {
res := ""
for _, m := range ms {
res += "[" + m.name + "]" + " " + fmt.Sprintf("%.10f", m.value) + " "
}
return res
}

func OutputConclusion() {
log.Info("average metrics", zap.Float64("avg store time", avgStoreTime),
zap.Any("avg region stats", avgRegionStats),
zap.String("metrics", formatMetrics(avgMetrics2Collect)))
}