Skip to content

Commit

Permalink
redfish: access model
Browse files Browse the repository at this point in the history
Signed-off-by: Huamin Chen <hchen@redhat.com>
  • Loading branch information
rootfs committed Jun 20, 2023
1 parent f0e6459 commit 559b0d7
Show file tree
Hide file tree
Showing 14 changed files with 863 additions and 4 deletions.
13 changes: 11 additions & 2 deletions cmd/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ var (
kubeconfig = flag.String("kubeconfig", "", "absolute path to the kubeconfig file, if empty we use the in-cluster configuration")
apiserverEnabled = flag.Bool("apiserver", true, "if apiserver is disabled, we collect pod information from kubelet")
kernelSourceDirPath = flag.String("kernel-source-dir", "", "path to the kernel source directory")
redfishCredFilePath = flag.String("redfish-cred-file-path", "", "path to the redfish credential file")
)

func healthProbe(w http.ResponseWriter, req *http.Request) {
Expand Down Expand Up @@ -154,10 +155,13 @@ func main() {
config.SetEnabledHardwareCounterMetrics(*exposeHardwareCounterMetrics)
config.SetEnabledGPU(*enableGPU)
config.EnabledMSR = *enabledMSR

config.SetKubeConfig(*kubeconfig)
config.SetEnableAPIServer(*apiserverEnabled)
if err := config.SetKernelSourceDir(*kernelSourceDirPath); err != nil {
klog.Warningf("failed to set kernel source dir to %q: %v", *kernelSourceDirPath, err)
if kernelSourceDirPath != nil && len(*kernelSourceDirPath) > 0 {
if err := config.SetKernelSourceDir(*kernelSourceDirPath); err != nil {
klog.Warningf("failed to set kernel source dir to %q: %v", *kernelSourceDirPath, err)
}
}

// the ebpf batch deletion operation was introduced in linux kernel 5.6, which provides better performance to delete keys.
Expand All @@ -174,6 +178,11 @@ func main() {

collector_metric.InitAvailableParamAndMetrics()

// set redfish credential file path
if *redfishCredFilePath != "" {
config.SetRedfishCredFilePath(*redfishCredFilePath)
}

// For local estimator, there is endpoint provided, thus we should let
// model component decide whether/how to init
model.InitEstimateFunctions(collector_metric.ContainerMetricNames, collector_metric.NodeMetadataNames, collector_metric.NodeMetadataValues)
Expand Down
47 changes: 46 additions & 1 deletion pkg/collector/metric/node_metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ const (
GPU = "gpu"
OTHER = "other"
PLATFORM = "platform"
REDFISH = "redfish"
FREQUENCY = "frequency"
)

Expand All @@ -61,6 +62,7 @@ type NodeMetrics struct {
TotalEnergyInGPU *types.UInt64StatCollection
TotalEnergyInOther *types.UInt64StatCollection
TotalEnergyInPlatform *types.UInt64StatCollection
TotalEnergyInRedfish *types.UInt64StatCollection

DynEnergyInCore *types.UInt64StatCollection
DynEnergyInDRAM *types.UInt64StatCollection
Expand All @@ -69,6 +71,7 @@ type NodeMetrics struct {
DynEnergyInGPU *types.UInt64StatCollection
DynEnergyInOther *types.UInt64StatCollection
DynEnergyInPlatform *types.UInt64StatCollection
DynEnergyInRedfish *types.UInt64StatCollection

IdleEnergyInCore *types.UInt64StatCollection
IdleEnergyInDRAM *types.UInt64StatCollection
Expand All @@ -77,6 +80,7 @@ type NodeMetrics struct {
IdleEnergyInGPU *types.UInt64StatCollection
IdleEnergyInOther *types.UInt64StatCollection
IdleEnergyInPlatform *types.UInt64StatCollection
IdleEnergyInRedfish *types.UInt64StatCollection

CPUFrequency map[int32]uint64

Expand Down Expand Up @@ -109,6 +113,9 @@ func NewNodeMetrics() *NodeMetrics {
TotalEnergyInPlatform: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
},
TotalEnergyInRedfish: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
},

DynEnergyInCore: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
Expand All @@ -131,6 +138,9 @@ func NewNodeMetrics() *NodeMetrics {
DynEnergyInPlatform: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
},
DynEnergyInRedfish: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
},

IdleEnergyInCore: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
Expand All @@ -153,6 +163,9 @@ func NewNodeMetrics() *NodeMetrics {
IdleEnergyInPlatform: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
},
IdleEnergyInRedfish: &types.UInt64StatCollection{
Stat: make(map[string]*types.UInt64Stat),
},
}
}

Expand All @@ -163,6 +176,7 @@ func (ne *NodeMetrics) ResetDeltaValues() {
ne.TotalEnergyInPkg.ResetDeltaValues()
ne.TotalEnergyInGPU.ResetDeltaValues()
ne.TotalEnergyInPlatform.ResetDeltaValues()
ne.TotalEnergyInRedfish.ResetDeltaValues()
ne.DynEnergyInCore.ResetDeltaValues()
ne.DynEnergyInDRAM.ResetDeltaValues()
ne.DynEnergyInUncore.ResetDeltaValues()
Expand Down Expand Up @@ -207,6 +221,17 @@ func (ne *NodeMetrics) SetLastestPlatformEnergy(platformEnergy map[string]float6
}
}

// SetLastestRedfishEnergy stores the latest energy readings reported by the node's Redfish BMC.
// redfishEnergy is keyed by system; each reading is rounded up with math.Ceil and converted
// to uint64 before being stored in TotalEnergyInRedfish. When gauge is true the reading is
// stored with SetDeltaStat, otherwise with SetAggrStat.
func (ne *NodeMetrics) SetLastestRedfishEnergy(redfishEnergy map[string]float64, gauge bool) {
	if gauge {
		for systemID, reading := range redfishEnergy {
			ne.TotalEnergyInRedfish.SetDeltaStat(systemID, uint64(math.Ceil(reading)))
		}
		return
	}
	for systemID, reading := range redfishEnergy {
		ne.TotalEnergyInRedfish.SetAggrStat(systemID, uint64(math.Ceil(reading)))
	}
}

// SetNodeComponentsEnergy adds the latest energy consumption collected from the node's components (e.g., using RAPL)
func (ne *NodeMetrics) SetNodeComponentsEnergy(componentsEnergy map[int]source.NodeComponentsEnergy, gauge bool) {
for pkgID, energy := range componentsEnergy {
Expand Down Expand Up @@ -244,6 +269,7 @@ func (ne *NodeMetrics) UpdateIdleEnergy() {
ne.CalcIdleEnergy(GPU)
}
ne.CalcIdleEnergy(PLATFORM)
ne.CalcIdleEnergy(REDFISH)
// reset
ne.FoundNewIdleState = false
}
Expand Down Expand Up @@ -278,6 +304,9 @@ func (ne *NodeMetrics) UpdateDynEnergy() {
for sensorID := range ne.TotalEnergyInPlatform.Stat {
ne.CalcDynEnergy(PLATFORM, sensorID)
}
for system := range ne.TotalEnergyInRedfish.Stat {
ne.CalcDynEnergy(REDFISH, system)
}
// gpu metric
if config.EnabledGPU && accelerator.IsGPUCollectionSupported() {
for gpuID := range ne.TotalEnergyInGPU.Stat {
Expand Down Expand Up @@ -306,12 +335,17 @@ func (ne *NodeMetrics) SetNodeOtherComponentsEnergy() {
dynCPUComponentsEnergy := ne.DynEnergyInPkg.SumAllDeltaValues() +
ne.DynEnergyInDRAM.SumAllDeltaValues() +
ne.DynEnergyInGPU.SumAllDeltaValues()
// other component can be either platform or redfish
dynPlatformEnergy := ne.DynEnergyInPlatform.SumAllDeltaValues()
if dynPlatformEnergy > dynCPUComponentsEnergy {
otherComponentEnergy := dynPlatformEnergy - dynCPUComponentsEnergy
ne.DynEnergyInOther.SetDeltaStat(OTHER, otherComponentEnergy)
}

dynRedfishEnergy := ne.DynEnergyInRedfish.SumAllDeltaValues()
if dynRedfishEnergy > dynCPUComponentsEnergy {
otherComponentEnergy := dynRedfishEnergy - dynCPUComponentsEnergy
ne.DynEnergyInOther.SetDeltaStat(OTHER, otherComponentEnergy)
}
idleCPUComponentsEnergy := ne.IdleEnergyInPkg.SumAllDeltaValues() +
ne.IdleEnergyInDRAM.SumAllDeltaValues() +
ne.IdleEnergyInGPU.SumAllDeltaValues()
Expand All @@ -320,6 +354,11 @@ func (ne *NodeMetrics) SetNodeOtherComponentsEnergy() {
otherComponentEnergy := idlePlatformEnergy - idleCPUComponentsEnergy
ne.IdleEnergyInOther.SetDeltaStat(OTHER, otherComponentEnergy)
}
idleRedfishEnergy := ne.IdleEnergyInRedfish.SumAllDeltaValues()
if idleRedfishEnergy > idleCPUComponentsEnergy {
otherComponentEnergy := idleRedfishEnergy - idleCPUComponentsEnergy
ne.IdleEnergyInOther.SetDeltaStat(OTHER, otherComponentEnergy)
}
}

func (ne *NodeMetrics) GetNodeResUsagePerResType(resource string) (float64, error) {
Expand Down Expand Up @@ -424,6 +463,8 @@ func (ne *NodeMetrics) getTotalEnergyStatCollection(component string) (energySta
return ne.TotalEnergyInOther
case PLATFORM:
return ne.TotalEnergyInPlatform
case REDFISH:
return ne.TotalEnergyInRedfish
default:
klog.Fatalf("TotalEnergy component type %s is unknown\n", component)
}
Expand All @@ -446,6 +487,8 @@ func (ne *NodeMetrics) getDynEnergyStatCollection(component string) (energyStat
return ne.DynEnergyInOther
case PLATFORM:
return ne.DynEnergyInPlatform
case REDFISH:
return ne.DynEnergyInRedfish
default:
klog.Fatalf("DynEnergy component type %s is unknown\n", component)
}
Expand All @@ -468,6 +511,8 @@ func (ne *NodeMetrics) getIdleEnergyStatCollection(component string) (energyStat
return ne.IdleEnergyInOther
case PLATFORM:
return ne.IdleEnergyInPlatform
case REDFISH:
return ne.IdleEnergyInRedfish
default:
klog.Fatalf("IdleEnergy component type %s is unknown\n", component)
}
Expand Down
10 changes: 10 additions & 0 deletions pkg/collector/metric_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/sustainable-computing-io/kepler/pkg/config"
"github.com/sustainable-computing-io/kepler/pkg/power/accelerator"
"github.com/sustainable-computing-io/kepler/pkg/power/acpi"
"github.com/sustainable-computing-io/kepler/pkg/power/redfish"
"github.com/sustainable-computing-io/kepler/pkg/utils"

collector_metric "github.com/sustainable-computing-io/kepler/pkg/collector/metric"
Expand All @@ -43,6 +44,8 @@ type Collector struct {
bpfHCMeter *attacher.BpfModuleTables
// instance that collects the node energy consumption
acpiPowerMeter *acpi.ACPI
// instance that collects the node redfish power consumption
redfishClient *redfish.RedFishClient

// NodeMetrics holds all node energy and resource usage metrics
NodeMetrics collector_metric.NodeMetrics
Expand Down Expand Up @@ -87,13 +90,20 @@ func (c *Collector) Initialize() error {
c.updateNodeEnergyMetrics()
c.acpiPowerMeter.Run(attacher.HardwareCountersEnabled)

if str := config.GetRedfishCredFilePath(); str != "" {
if err := c.initRedFishCollector(str); err != nil {
return fmt.Errorf("%s", fmt.Sprintf("failed to init redfish collector from %s: %v", str, err))
}
}

return nil
}

// Destroy releases the resources held by the collector: it detaches the BPF
// modules when a BPF meter is present, then stops the redfish collector.
func (c *Collector) Destroy() {
	if meter := c.bpfHCMeter; meter != nil {
		attacher.DetachBPFModules(meter)
	}
	c.stopRedfishCollector()
}

// Update updates the node and container energy and resource usage metrics
Expand Down
12 changes: 11 additions & 1 deletion pkg/collector/node_energy_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ func (c *Collector) updatePlatformEnergy(wg *sync.WaitGroup) {
}
}

// updateRedfishEnergy updates the node redfish power consumption, i.e., the BMC power consumption.
// It always marks wg as done on return. When no redfish client is configured it does nothing;
// otherwise the result of GetPower is stored in NodeMetrics with gauge set to true.
func (c *Collector) updateRedfishEnergy(wg *sync.WaitGroup) {
	defer wg.Done()
	if c.redfishClient == nil {
		return
	}
	readings := c.redfishClient.GetPower()
	c.NodeMetrics.SetLastestRedfishEnergy(readings, true)
}

// updateNodeComponentsEnergy updates each node component power consumption, i.e., the CPU core, uncore, package/socket and DRAM
func (c *Collector) updateNodeComponentsEnergy(wg *sync.WaitGroup) {
defer wg.Done()
Expand Down Expand Up @@ -95,11 +104,12 @@ func (c *Collector) updateNodeAvgCPUFrequency(wg *sync.WaitGroup) {
// updateNodeEnergyMetrics updates the node energy consumption of each component
func (c *Collector) updateNodeEnergyMetrics() {
var wgNode sync.WaitGroup
wgNode.Add(4)
wgNode.Add(5)
go c.updatePlatformEnergy(&wgNode)
go c.updateNodeComponentsEnergy(&wgNode)
go c.updateNodeAvgCPUFrequency(&wgNode)
go c.updateNodeGPUEnergy(&wgNode)
go c.updateRedfishEnergy(&wgNode)
wgNode.Wait()
// after updating the total energy we calculate the dynamic energy
// the idle energy is only updated if we find the node using less resources than previously observed
Expand Down
23 changes: 23 additions & 0 deletions pkg/collector/prometheus_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ type NodeDesc struct {
nodePlatformJoulesTotal *prometheus.Desc
nodeOtherComponentsJoulesTotal *prometheus.Desc
nodeGPUJoulesTotal *prometheus.Desc
nodeRedfishJoulesTotal *prometheus.Desc

// Additional metrics (gauge)
// TODO: review if we really need to expose this metric.
Expand Down Expand Up @@ -210,6 +211,8 @@ func (p *PrometheusCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- p.nodeDesc.nodePackageJoulesTotal
ch <- p.nodeDesc.nodePlatformJoulesTotal
ch <- p.nodeDesc.nodeOtherComponentsJoulesTotal
ch <- p.nodeDesc.nodeRedfishJoulesTotal

if config.EnabledGPU {
ch <- p.nodeDesc.nodeGPUJoulesTotal
}
Expand Down Expand Up @@ -319,6 +322,11 @@ func (p *PrometheusCollector) newNodeMetrics() {
"Current GPU value in joules",
[]string{"index", "instance", "source", "mode"}, nil,
)
nodeRedfishJoulesTotal := prometheus.NewDesc(
prometheus.BuildFQName(namespace, "node", "redfish_joules_total"),
"Current Redfish power value in joules",
[]string{"instance", "source", "mode"}, nil,
)

// Additional metrics (gauge)
NodeCPUFrequency := prometheus.NewDesc(
Expand Down Expand Up @@ -346,6 +354,7 @@ func (p *PrometheusCollector) newNodeMetrics() {
nodeDramJoulesTotal: nodeDramJoulesTotal,
nodePackageJoulesTotal: nodePackageJoulesTotal,
nodePlatformJoulesTotal: nodePlatformJoulesTotal,
nodeRedfishJoulesTotal: nodeRedfishJoulesTotal,
nodeOtherComponentsJoulesTotal: nodeOtherComponentsJoulesTotal,
nodeGPUJoulesTotal: nodeGPUJoulesTotal,
NodeCPUFrequency: NodeCPUFrequency,
Expand Down Expand Up @@ -661,6 +670,20 @@ func (p *PrometheusCollector) updateNodeMetrics(wg *sync.WaitGroup, ch chan<- pr
idlePower,
collector_metric.NodeName, "acpi", "idle",
)
dynPower = (float64(p.NodeMetrics.GetSumAggrDynEnergyFromAllSources(collector_metric.REDFISH)) / miliJouleToJoule)
ch <- prometheus.MustNewConstMetric(
p.nodeDesc.nodeRedfishJoulesTotal,
prometheus.CounterValue,
dynPower,
collector_metric.NodeName, "redfish", "dynamic",
)
idlePower = (float64(p.NodeMetrics.GetSumAggrIdleEnergyromAllSources(collector_metric.REDFISH)) / miliJouleToJoule)
ch <- prometheus.MustNewConstMetric(
p.nodeDesc.nodeRedfishJoulesTotal,
prometheus.CounterValue,
idlePower,
collector_metric.NodeName, "redfish", "idle",
)

if config.EnabledGPU {
for gpuID := range p.NodeMetrics.TotalEnergyInGPU.Stat {
Expand Down
57 changes: 57 additions & 0 deletions pkg/collector/redfish_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
Copyright 2021.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package collector

import (
"fmt"
"os"

"github.com/sustainable-computing-io/kepler/pkg/nodecred"
"github.com/sustainable-computing-io/kepler/pkg/power/redfish"
"k8s.io/klog/v2"
)

// initRedFishCollector initializes the node credential store from credPath and,
// when a complete set of redfish credentials is found for this node, creates the
// redfish client used to collect power readings.
//
// The node is looked up by the NODE_NAME environment variable, falling back to
// "localhost" when that variable is unset or empty.
//
// It returns an error when the credential store cannot be initialized or the
// node's redfish credential cannot be retrieved; the underlying error is wrapped
// so callers can inspect it with errors.Is / errors.As. When the username,
// password or host is missing, no client is created, a warning is logged and nil
// is returned, leaving redfish collection disabled.
func (c *Collector) initRedFishCollector(credPath string) error {
	if err := nodecred.InitNodeCredImpl(map[string]string{"redfish_cred_file_path": credPath}); err != nil {
		return fmt.Errorf("failed to init node credential: %w", err)
	}
	klog.V(5).Infof("Initialized node credential")

	nodeName := os.Getenv("NODE_NAME")
	if nodeName == "" {
		nodeName = "localhost"
	}

	redfishCred, err := nodecred.GetNodeCredByNodeName(nodeName, "redfish")
	if err != nil {
		return fmt.Errorf("failed to get node credential: %w", err)
	}

	userName := redfishCred["redfish_username"]
	password := redfishCred["redfish_password"]
	host := redfishCred["redfish_host"]
	if userName == "" || password == "" || host == "" {
		// Do not fail initialization, but make the disabled state visible.
		// Only the node name is logged; credential values are never printed.
		klog.Warningf("incomplete redfish credential for node %q, redfish power collection is disabled", nodeName)
		return nil
	}

	klog.V(5).Infof("Initialized redfish credential")
	c.redfishClient = redfish.NewRedfishClient(userName, password, host)
	return nil
}

// stopRedfishCollector calls StopPower on the redfish client.
// It is a no-op when no redfish client has been created.
func (c *Collector) stopRedfishCollector() {
	if c.redfishClient == nil {
		return
	}
	c.redfishClient.StopPower()
}

0 comments on commit 559b0d7

Please sign in to comment.