Fixed Consistent Log Levelling
Migrated the existing codebase from format logs (`klog.Infof`) to structured logs (`klog.InfoS`). [Github issue ref](#1469).
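For illustration, here is a minimal sketch of the pattern applied throughout this commit; the messages and key-value pairs below are invented for the example, not taken from the Kepler codebase:

```go
package main

import (
	"errors"
	"flag"
	"time"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Parse()
	defer klog.Flush()

	start := time.Now()
	err := errors.New("connection refused") // placeholder error

	// Before: format logs fuse the message and its values into one string,
	// which downstream consumers have to parse back apart.
	klog.Infof("failed to scrape node after %v: %v", time.Since(start), err)

	// After: structured logs keep a constant message and attach the values
	// as key-value pairs that log aggregators can index and query.
	klog.InfoS("failed to scrape node", "duration", time.Since(start), "err", err)
}
```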

Added a "Logging Kepler with Klog" section to doc/dev/README.md, [see
here](https://github.com/sustainable-computing-io/kepler/blob/main/doc/dev/README.md).

Signed-off-by: Andy Xie <anxie@redhat.com>
PalmPalm7 committed Jun 20, 2024
1 parent 344f369 commit 77d4005
Showing 34 changed files with 372 additions and 150 deletions.
2 changes: 2 additions & 0 deletions .golangci.yml
@@ -24,8 +24,10 @@ linters-settings:
printf:
funcs:
- (github.com/golangci/golangci-lint/pkg/logutils.Log).Infof
- (github.com/golangci/golangci-lint/pkg/logutils.Log).InfoS
- (github.com/golangci/golangci-lint/pkg/logutils.Log).Warnf
- (github.com/golangci/golangci-lint/pkg/logutils.Log).Errorf
- (github.com/golangci/golangci-lint/pkg/logutils.Log).ErrorS
- (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf
lll:
line-length: 140
31 changes: 23 additions & 8 deletions cmd/exporter/exporter.go
@@ -92,12 +92,27 @@ func healthProbe(w http.ResponseWriter, req *http.Request) {
}
}

func finalizing() {
klog.InfoS("exit stack", "stack", string(debug.Stack()))
exitCode := 10
klog.Infoln(finishingMsg)
klog.FlushAndExit(klog.ExitFlushTimeout, exitCode)
}

func main() {
start := time.Now()
klog.InitFlags(nil)

// for log debugging issues
flag.Set("logtostderr", "false")
flag.Set("alsologtostderr", "true")
flag.Set("log_dir", "/root/kepler/kepler_prime/kepler_debug/logs1.txt")
flag.Set("v", "2")

flag.Parse()

klog.Infof("Kepler running on version: %s", Version)
klog.InfoS("Kepler running on version", "version", kversion.Version)

registry := metrics.GetRegistry()
registry.MustRegister(prometheus.NewGaugeFunc(
@@ -144,7 +159,7 @@ func main() {
stats.InitAvailableParamAndMetrics()

if config.EnabledGPU {
klog.Infof("Initializing the GPU collector")
klog.InfoS("Initializing the GPU collector")
// the GPU operators typically takes longer time to initialize than kepler resulting in error to start the gpu driver
// therefore, we wait up to 1 min to allow the gpu operator initialize
for i := 0; i <= maxGPUInitRetry; i++ {
@@ -158,16 +173,16 @@
if err == nil {
defer gpu.Shutdown()
} else {
klog.Infof("Failed to initialize the GPU collector: %v. Have the GPU operator initialize?", err)
klog.InfoS("Failed to initialize the GPU collector. Have the GPU operator initialize?", "err", err)
}
}

if config.IsExposeQATMetricsEnabled() {
klog.Infof("Initializing the QAT collector")
klog.InfoS("Initializing the QAT collector")
if qatErr := qat.Init(); qatErr == nil {
defer qat.Shutdown()
} else {
klog.Infof("Failed to initialize the QAT collector: %v", qatErr)
klog.InfoS("Failed to initialize the QAT collector", "err", qatErr)
}
}

@@ -178,7 +193,7 @@ func main() {
// starting a new gorotine to collect data and report metrics
// BPF is attached here
if startErr := m.Start(); startErr != nil {
klog.Infof("%s", fmt.Sprintf("failed to start : %v", startErr))
klog.InfoS("failed to start", "err", startErr)
}
metricPathConfig := config.GetMetricPath(*metricsPath)
bindAddressConfig := config.GetBindAddress(*address)
@@ -198,7 +213,7 @@ func main() {
Handler: &handler,
}

klog.Infof("starting to listen on %s", bindAddressConfig)
klog.InfoS("starting to listen on", "address" bindAddressConfig)
errChan := make(chan error)

signalChan := make(chan os.Signal, 1)
@@ -212,7 +227,7 @@ func main() {
errChan <- err
}
}()
klog.Infof(startedMsg, time.Since(start))
klog.InfoS(startedMsg, "duration", time.Since(start))
klog.Flush() // force flush to parse the start msg in the e2e test

// Wait for an exit signal
36 changes: 36 additions & 0 deletions doc/dev/README.md
@@ -156,3 +156,39 @@ or
go tool pprof -http 0.0.0.0:8000 <path/to/pprof-capture>.pb.gz

```

## Logging Kepler with Klog
When adding logging to Kepler with klog, please follow the guidelines in this [best practices article](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md).

Highlights of the guidelines:

### What method to use?

* `klog.ErrorS` - Errors should be used to indicate unexpected behaviours in code, like unexpected errors returned by subroutine function calls.
Logs generated by `ErrorS` may be enhanced with additional debug information by the logging library. Calling `ErrorS` with `nil` as the error may be acceptable if there is an error condition that deserves a stack trace at this origin point.

* `klog.InfoS` - Structured logs to the INFO log. `InfoS` should be used for routine logging. It can also be used to log warnings for expected errors (errors that can happen during routine operations).
It is important to pick the proper verbosity level so that the consumer is neither starved of nor overwhelmed by log volume:
* `klog.V(0).InfoS` = `klog.InfoS` - Generally useful for this to **always** be visible to a cluster operator
* Programmer errors
* Logging extra info about a panic
* CLI argument handling
* `klog.V(1).InfoS` - A reasonable default log level if you don't want verbosity.
* Information about config (listening on X, watching Y)
* Errors that repeat frequently that relate to conditions that can be corrected (pod detected as unhealthy)
* `klog.V(2).InfoS` - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems.
* Logging HTTP requests and their exit code
* System state changing (killing pod)
* Controller state change events (starting pods)
* Scheduler log messages
* `klog.V(3).InfoS` - Extended information about changes
* More info about system state changes
* `klog.V(4).InfoS` - Debug level verbosity
* Logging in particularly thorny parts of code where you may want to come back later and check it
* `klog.V(5).InfoS` - Trace level verbosity
* Context to understand the steps leading up to errors and warnings
* More information for troubleshooting reported issues

As the comments above suggest, the practical default level is V(2). Developer and QE
environments may wish to run at V(3) or V(4). To change the log level, pass
`-v=X`, where X is the desired maximum level to log. A short usage sketch follows.
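The sketch below shows how these levels look in practice; it is illustrative only, and the messages and key-value pairs are invented for the example rather than taken from Kepler:

```go
package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Parse() // pass -v=2 to see the V(2) message, -v=5 to also see V(5)
	defer klog.Flush()

	// Always visible to a cluster operator, regardless of -v.
	klog.InfoS("exporter starting", "version", "v0.0.0-example")

	// V(2): useful steady-state information, the recommended default level.
	klog.V(2).InfoS("http request served", "path", "/metrics", "code", 200)

	// V(5): trace-level context for troubleshooting reported issues.
	klog.V(5).InfoS("map lookup retried", "retry", 3)

	// Unexpected failures go to the ERROR log; a nil error is acceptable
	// when the condition still deserves a stack trace at this origin point.
	klog.ErrorS(nil, "unexpected state, continuing with defaults")
}
```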
2 changes: 1 addition & 1 deletion e2e/integration-test/e2e_metric_test.go
@@ -252,7 +252,7 @@ func checkPodMetricValues(keplerMetric *TestKeplerMetric, metricName string, zer
break
} else {
retry += 1
log.Infof("Retrying to get metrics for %s, retry count: %d", metricName, retry)
log.InfoS("Retrying to get metrics", "metricsName", metricName, "retryCount", retry)
// exponential backoff
time.Sleep(time.Duration(math.Pow(2, float64(retry))) * time.Second)
}
88 changes: 79 additions & 9 deletions pkg/bpf/exporter.go
@@ -119,7 +119,7 @@ func getLibbpfObjectFilePath(byteOrder binary.ByteOrder) (string, error) {
if len(matches) < 1 {
return "", fmt.Errorf("failed to find bpf object file: no matches found")
}
klog.Infof("found bpf object file: %s", matches[0])
klog.InfoS("found bpf object file", "objectFile", matches[0])
return matches[0], nil
}
return bpfassetsPath, nil
@@ -137,7 +137,7 @@ func (e *exporter) attach() error {
}

// resize array entries
klog.Infof("%d CPU cores detected. Resizing eBPF Perf Event Arrays", e.cpuCores)
klog.InfoS("CPU cores detected. Resizing eBPF Perf Event Arrays", "cpuCores", e.cpuCores)
toResize := []string{
"cpu_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader",
"cpu_cycles", "cpu_instructions", "cache_miss",
@@ -165,7 +165,7 @@ func (e *exporter) attach() error {
}

if _, err = prog.AttachGeneric(); err != nil {
klog.Infof("failed to attach tracepoint/sched/sched_switch: %v", err)
klog.InfoS("failed to attach tracepoint/sched/sched_switch", "err", err)
} else {
e.enabledSoftwareCounters[config.CPUTime] = struct{}{}
}
@@ -220,7 +220,7 @@ func (e *exporter) attach() error {
}

if !config.ExposeHardwareCounterMetrics {
klog.Infof("Hardware counter metrics are disabled")
klog.InfoS("Hardware counter metrics are disabled")
return nil
}

@@ -261,7 +261,31 @@ func (e *exporter) attach() error {
e.enabledHardwareCounters[arrayName] = struct{}{}
}

// attach task clock perf event. this is a software counter, not a hardware counter
bpfPerfArrayName := config.TaskClock + bpfPerfArraySuffix
bpfMap, err := e.module.GetMap(bpfPerfArrayName)
if err != nil {
return fmt.Errorf("could not get ebpf map for perf event %s: %w", bpfPerfArrayName, err)
}
fds, perfErr := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_TASK_CLOCK, e.cpuCores)
if perfErr != nil {
return fmt.Errorf("could not attach perf event %s: %w", bpfPerfArrayName, perfErr)
}
for i, fd := range fds {
err = bpfMap.Update(unsafe.Pointer(&i), unsafe.Pointer(&fd))
if err != nil {
klog.Warningf("failed to update bpf map: %v", err)
return cleanup()
}
}
e.perfEventFds = append(e.perfEventFds, fds...)
e.enabledSoftwareCounters[config.TaskClock] = struct{}{}

klog.InfoS("Successfully loaded eBPF module from libbpf object")
return nil
}

@@ -299,6 +323,52 @@ func (e *exporter) CollectProcesses() (processesData []ProcessBPFMetrics, err er
return
}

func (e *exporter) CollectCPUFreq() (cpuFreqData map[int32]uint64, err error) {
cpuFreqData = make(map[int32]uint64)
var cpuFreq *bpf.BPFMap
cpuFreq, err = e.module.GetMap(TableCPUFreqName)
if err != nil {
return
}
iterator := cpuFreq.Iterator()
var freq uint32
retry := 0
next := iterator.Next()
for next {
keyBytes := iterator.Key()
cpu := int32(e.byteOrder.Uint32(keyBytes))
data, getErr := cpuFreq.GetValue(unsafe.Pointer(&cpu))
if getErr != nil {
retry += 1
if retry > config.MaxLookupRetry {
klog.V(5).InfoS("failed to get data with max retry", "err", getErr, "maxRetry", config.MaxLookupRetry)
next = iterator.Next()
retry = 0
}
continue
}
getErr = binary.Read(bytes.NewReader(data), e.byteOrder, &freq)
if getErr != nil {
klog.V(5).InfoS("failed to decode received data", "err", getErr)
next = iterator.Next()
retry = 0
continue
}
if retry > 0 {
klog.V(5).InfoS("successfully get data with retry", "retry", retry)
}
cpuFreqData[cpu] = uint64(freq)
next = iterator.Next()
retry = 0
}
return
}

///////////////////////////////////////////////////////////////////////////
// utility functions

@@ -367,7 +437,7 @@ func (e *exporter) libbpfCollectProcessBatchSingleHash(processes *bpf.BPFMap) ([
if err != nil {
// os.IsNotExist means we reached the end of the table
if !os.IsNotExist(err) {
klog.V(5).Infof("GetValueAndDeleteBatch failed: %v. A partial value might have been collected.", err)
klog.V(5).InfoS("GetValueAndDeleteBatch failed. A partial value might have been collected.", "err", err)
}
}

@@ -383,7 +453,7 @@ func (e *exporter) libbpfCollectProcessBatchSingleHash(processes *bpf.BPFMap) ([
processesData = append(processesData, ct)
}
}
klog.V(5).Infof("successfully get data with batch get and delete with %d pids in %v", len(processesData), time.Since(start))
klog.V(5).InfoS("successfully get data with batch get and delete", "pidCount", len(processesData), "duration", time.Since(start))
return processesData, err
}

@@ -400,21 +470,21 @@ func (e *exporter) libbpfCollectProcessSingleHash(processes *bpf.BPFMap) (proces
if getErr != nil {
retry += 1
if retry > config.MaxLookupRetry {
klog.V(5).Infof("failed to get data: %v with max retry: %d \n", getErr, config.MaxLookupRetry)
klog.V(5).InfoS("failed to get data with max retry", "err", getErr, "maxRetry", config.MaxLookupRetry)
next = iterator.Next()
retry = 0
}
continue
}
getErr = binary.Read(bytes.NewReader(data), e.byteOrder, &ct)
if getErr != nil {
klog.V(5).Infof("failed to decode received data: %v\n", getErr)
klog.V(5).InfoS("failed to decode received data", "err" getErr)
next = iterator.Next()
retry = 0
continue
}
if retry > 0 {
klog.V(5).Infof("successfully get data with retry=%d \n", retry)
klog.V(5).InfoS("successfully get data with retry", "retry", retry)
}
processesData = append(processesData, ct)
keys = append(keys, key)
2 changes: 1 addition & 1 deletion pkg/collector/metric_collector.go
@@ -103,7 +103,7 @@ func (c *Collector) Update() {
c.UpdateEnergyUtilizationMetrics()

c.printDebugMetrics()
klog.V(5).Infof("Collector Update elapsed time: %s", time.Since(start))
klog.V(5).InfoS("Collector Update elapsed time", "duration", time.Since(start))
}

// resetDeltaValue resets existing podEnergy previous curr value
@@ -80,7 +80,7 @@ func addGPUUtilizationToProcessStats(processStats map[uint64]*stats.ProcessStats
// if the pid is within a container, it will have an container ID
if config.IsExposeContainerStatsEnabled() {
if containerID, err = cgroup.GetContainerIDFromPID(uintPid); err != nil {
klog.V(6).Infof("failed to resolve container for Pid %v (command=%s): %v, set containerID=%s", pid, command, err, containerID)
klog.V(5).InfoS("failed to resolve container for PID", "pid", pid, "command", command, "err", err, "containerID", containerID)
}
}

Expand All @@ -90,7 +90,7 @@ func addGPUUtilizationToProcessStats(processStats map[uint64]*stats.ProcessStats
if config.IsExposeVMStatsEnabled() {
vmID, err = libvirt.GetVMID(uintPid)
if err != nil {
klog.V(6).Infof("failed to resolve VM ID for PID %v (command=%s): %v", pid, command, err)
klog.V(5).InfoS("failed to resolve VM ID for PID", "pid", pid, "command", command, "err", err)
}
}
}
@@ -90,8 +90,7 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]*
comm := C.GoString((*C.char)(unsafe.Pointer(&ct.Command)))

if ct.PID != 0 {
klog.V(6).Infof("process %s (pid=%d, cgroup=%d) has %d cpu time %d CPU cycles, %d instructions, %d cache misses, %d page cache hits",
comm, ct.PID, ct.CGroupID, ct.ProcessRunTime, ct.CPUCycles, ct.CPUInstr, ct.CacheMisses, ct.PageCacheHit)
klog.V(5).InfoS("Process metrics collected", "process", comm, "pid", ct.PID, "cgroupID", ct.CGroupID, "taskClockTime", ct.TaskClockTime, "cpuCycles", ct.CPUCycles, "cpuInstructions", ct.CPUInstr, "cacheMisses", ct.CacheMisses, "pageCacheHits", ct.PageCacheHit)
}
// skip process without resource utilization
if ct.ProcessRunTime == 0 && ct.CacheMisses == 0 && ct.PageCacheHit == 0 {
Expand All @@ -101,15 +100,15 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]*
// if the pid is within a container, it will have a container ID
containerID, err := cgroup.GetContainerID(ct.CGroupID, ct.PID, config.EnabledEBPFCgroupID)
if err != nil {
klog.V(6).Infof("failed to resolve container for PID %v (command=%s): %v, set containerID=%s", ct.PID, comm, err, utils.SystemProcessName)
klog.V(5).InfoS("Failed to resolve container for PID", "pid", ct.PID, "command", comm, "err", err, "defaultContainerID", utils.SystemProcessName)
}

// if the pid is within a VM, it will have an VM ID
vmID := utils.EmptyString
if config.IsExposeVMStatsEnabled() {
vmID, err = libvirt.GetVMID(ct.PID)
if err != nil {
klog.V(6).Infof("failed to resolve VM ID for PID %v (command=%s): %v", ct.PID, comm, err)
klog.V(5).InfoS("Failed to resolve VM ID for PID", "pid", ct.PID, "command", comm, "err", err)
}
}
