Skip to content

Commit

Permalink
add support for operator metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Charlie Doern <cbddoern@gmail.com>
  • Loading branch information
cdoern committed Mar 28, 2023
1 parent bd68b53 commit c02a9d2
Show file tree
Hide file tree
Showing 21 changed files with 511 additions and 17 deletions.
3 changes: 3 additions & 0 deletions cmd/machine-config-operator/bootstrap.go
Expand Up @@ -38,6 +38,7 @@ var (
kubeCAFile string
mcoImage string
oauthProxyImage string
kubeRbacProxyImage string
networkConfigFile string
oscontentImage string
pullSecretFile string
Expand Down Expand Up @@ -81,6 +82,7 @@ func init() {
bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.oauthProxyImage, "oauth-proxy-image", "", "Image for origin oauth proxy.")
bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.baseOSContainerImage, "baseos-image", "", "ostree-bootable container image reference")
bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.baseOSExtensionsContainerImage, "baseos-extensions-image", "", "Image with extensions")
bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.kubeRbacProxyImage, "kube-rbac-proxy-image", "", "Image for origin kube-rbac proxy.")
bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.imageReferences, "image-references", "", "File containing imagestreams (from cluster-version-operator)")
bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.cloudProviderCAFile, "cloud-provider-ca-file", "", "path to cloud provider CA certificate")

Expand Down Expand Up @@ -136,6 +138,7 @@ func runBootstrapCmd(cmd *cobra.Command, args []string) {
bootstrapOpts.baremetalRuntimeCfgImage = findImageOrDie(imgstream, "baremetal-runtimecfg")
// TODO: Hmm, this one doesn't actually seem to be passed right now at bootstrap time by the installer
bootstrapOpts.oauthProxyImage = findImageOrDie(imgstream, "oauth-proxy")
bootstrapOpts.kubeRbacProxyImage = findImageOrDie(imgstream, "kube-rbac-proxy")
bootstrapOpts.infraImage = findImageOrDie(imgstream, "pod")
bootstrapOpts.haproxyImage = findImageOrDie(imgstream, "haproxy-router")
bootstrapOpts.baseOSContainerImage, err = findImage(imgstream, baseOSContainerImageTag)
Expand Down
13 changes: 11 additions & 2 deletions cmd/machine-config-operator/start.go
Expand Up @@ -24,15 +24,17 @@ var (
}

startOpts struct {
kubeconfig string
imagesFile string
kubeconfig string
imagesFile string
promMetricsURL string
}
)

// init attaches the start subcommand to rootCmd and declares the persistent
// flags whose values are stored in startOpts.
func init() {
	rootCmd.AddCommand(startCmd)

	flags := startCmd.PersistentFlags()
	flags.StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
	flags.StringVar(&startOpts.imagesFile, "images-json", "", "images.json file for MCO.")
	// The default is a loopback address, so the listener is not reachable from
	// other hosts unless the flag is overridden.
	flags.StringVar(&startOpts.promMetricsURL, "metrics-listen-address", "127.0.0.1:8797", "Listen address for prometheus metrics listener")
}

func runStartCmd(cmd *cobra.Command, args []string) {
Expand All @@ -54,6 +56,13 @@ func runStartCmd(cmd *cobra.Command, args []string) {
if err != nil {
glog.Fatalf("error creating clients: %v", err)
}

stopCh := make(chan struct{})
defer close(stopCh)

// start metrics listener
go ctrlcommon.StartMetricsListener(startOpts.promMetricsURL, stopCh, operator.RegisterMCOMetrics)

run := func(ctx context.Context) {
go common.SignalHandler(runCancel)

Expand Down
@@ -0,0 +1,17 @@
# ConfigMap holding the kube-rbac-proxy configuration file (config-file.yaml).
# The operator Deployment mounts a ConfigMap of this name at /etc/kube-rbac-proxy
# and points the proxy at it with --config-file=/etc/kube-rbac-proxy/config-file.yaml.
# The resource/subresource pair below (namespace / metrics) lines up with the
# "namespace/metrics" get rule added to the RBAC manifest in the same commit.
# NOTE(review): presumably a scraper needs that permission to pass the proxy's
# authorization check — confirm against the kube-rbac-proxy documentation.
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-rbac-proxy
namespace: openshift-machine-config-operator
annotations:
include.release.openshift.io/ibm-cloud-managed: "true"
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
data:
config-file.yaml: |+
authorization:
resourceAttributes:
apiVersion: v1
resource: namespace
subresource: metrics
namespace: openshift-machine-config-operator
Expand Up @@ -9,6 +9,19 @@ metadata:
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
rules:
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
- subjectaccessreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
- apiGroups:
- machineconfiguration.openshift.io
resources:
Expand Down
21 changes: 21 additions & 0 deletions install/0000_80_machine-config-operator_00_service.yaml
@@ -1,5 +1,26 @@
# ClusterIP Service in front of pods labelled k8s-app=machine-config-operator.
# It publishes a single TCP port named "metrics" on 9001, the same port number
# and name the kube-rbac-proxy container declares in the operator Deployment.
apiVersion: v1
kind: Service
metadata:
name: machine-config-operator
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-operator
annotations:
include.release.openshift.io/ibm-cloud-managed: "true"
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
# Names the secret (mco-proxy-tls) that the Deployment mounts as its proxy-tls
# volume. NOTE(review): presumably filled in by the cluster's service
# serving-cert controller — confirm.
service.beta.openshift.io/serving-cert-secret-name: mco-proxy-tls
spec:
type: ClusterIP
selector:
k8s-app: machine-config-operator
ports:
- name: metrics
port: 9001
protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
Expand Down
Expand Up @@ -17,5 +17,6 @@ data:
"corednsImage": "placeholder.url.oc.will.replace.this.org/placeholdernamespace:coredns",
"haproxyImage": "placeholder.url.oc.will.replace.this.org/placeholdernamespace:haproxy-router",
"baremetalRuntimeCfgImage": "placeholder.url.oc.will.replace.this.org/placeholdernamespace:baremetal-runtimecfg",
"oauthProxy": "placeholder.url.oc.will.replace.this.org/placeholdernamespace:oauth-proxy"
"oauthProxy": "placeholder.url.oc.will.replace.this.org/placeholdernamespace:oauth-proxy",
"kubeRbacProxy": "placeholder.url.oc.will.replace.this.org/placeholdernamespace:kube-rbac-proxy"
}
6 changes: 6 additions & 0 deletions install/0000_80_machine-config-operator_03_rbac.yaml
Expand Up @@ -27,6 +27,12 @@ metadata:
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
rules:
- apiGroups:
- ""
resources:
- namespace/metrics
verbs:
- get
- apiGroups:
- ""
resources:
Expand Down
30 changes: 30 additions & 0 deletions install/0000_80_machine-config-operator_04_deployment.yaml
Expand Up @@ -40,6 +40,29 @@ spec:
mountPath: /etc/ssl/kubernetes/ca.crt
- name: images
mountPath: /etc/mco/images
- name: kube-rbac-proxy
image: placeholder.url.oc.will.replace.this.org/placeholdernamespace:kube-rbac-proxy
ports:
- containerPort: 9001
name: metrics
protocol: TCP
args:
- --secure-listen-address=0.0.0.0:9001
- --config-file=/etc/kube-rbac-proxy/config-file.yaml
- --upstream=http://127.0.0.1:8797
- --logtostderr=true
- --tls-cert-file=/etc/tls/private/tls.crt
- --tls-private-key-file=/etc/tls/private/tls.key
resources:
requests:
cpu: 20m
memory: 50Mi
volumeMounts:
- mountPath: /etc/tls/private
name: proxy-tls
- mountPath: /etc/kube-rbac-proxy
name: auth-proxy-config
serviceAccountName: default
nodeSelector:
node-role.kubernetes.io/master: ""
priorityClassName: "system-cluster-critical"
Expand All @@ -66,3 +89,10 @@ spec:
- name: root-ca
hostPath:
path: /etc/kubernetes/ca.crt
- name: proxy-tls
secret:
secretName: mco-proxy-tls
- configMap:
name: kube-rbac-proxy
name: auth-proxy-config

36 changes: 36 additions & 0 deletions install/0000_90_machine-config-operator_00_servicemonitor.yaml
@@ -1,5 +1,41 @@
# ServiceMonitor for the machine-config-operator Service: selects Services
# labelled k8s-app=machine-config-operator in the
# openshift-machine-config-operator namespace and scrapes their "metrics" port
# at /metrics over HTTPS every 30s, presenting the service account bearer token.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: machine-config-operator
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-operator
annotations:
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
spec:
endpoints:
- interval: 30s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: metrics
scheme: https
path: /metrics
# Source labels "node" and "__meta_kubernetes_pod_node_name" are joined with ";"
# and matched against ";(.*)"; on a match the captured text becomes the "node"
# label. NOTE(review): this appears to fill "node" from the pod's node name only
# when "node" is empty — confirm against the Prometheus relabel_config docs.
relabelings:
- action: replace
regex: ;(.*)
replacement: $1
separator: ";"
sourceLabels:
- node
- __meta_kubernetes_pod_node_name
targetLabel: node
# The server certificate is verified against the service CA bundle and must be
# valid for the Service's in-cluster DNS name.
tlsConfig:
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
serverName: machine-config-operator.openshift-machine-config-operator.svc
namespaceSelector:
matchNames:
- openshift-machine-config-operator
selector:
matchLabels:
k8s-app: machine-config-operator
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
Expand Down
4 changes: 4 additions & 0 deletions install/image-references
Expand Up @@ -16,6 +16,10 @@ spec:
from:
kind: DockerImage
name: placeholder.url.oc.will.replace.this.org/placeholdernamespace:oauth-proxy
- name: kube-rbac-proxy
from:
kind: DockerImage
name: placeholder.url.oc.will.replace.this.org/placeholdernamespace:kube-rbac-proxy
# This one is special, it's the OS payload
# https://github.com/openshift/machine-config-operator/issues/183
# See the machine-config-osimageurl configmap.
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/node/status.go
Expand Up @@ -67,7 +67,6 @@ func calculateStatus(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node) mcfgv
UnavailableMachineCount: unavailableMachineCount,
DegradedMachineCount: degradedMachineCount,
}

status.Configuration = pool.Status.Configuration

conditions := pool.Status.Conditions
Expand Down Expand Up @@ -123,6 +122,7 @@ func calculateStatus(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node) mcfgv
if nodeDegraded || renderDegraded {
sdegraded := mcfgv1.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolDegraded, corev1.ConditionTrue, "", "")
mcfgv1.SetMachineConfigPoolCondition(&status, *sdegraded)

} else {
sdegraded := mcfgv1.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolDegraded, corev1.ConditionFalse, "", "")
mcfgv1.SetMachineConfigPoolCondition(&status, *sdegraded)
Expand Down
50 changes: 50 additions & 0 deletions pkg/operator/metrics.go
@@ -0,0 +1,50 @@
package operator

import (
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"

"github.com/prometheus/client_golang/prometheus"
)

const (
	// DefaultBindAddress is a port-only listen address (":8797") for the
	// operator metrics endpoint.
	// NOTE(review): the start command's --metrics-listen-address flag defaults
	// to "127.0.0.1:8797" rather than to this constant — confirm whether this
	// value is referenced anywhere.
	DefaultBindAddress = ":8797"
)

// MCO Metrics
//
// Gauge vectors exported by the operator. Every series carries a "pool" label;
// mcoState additionally carries "state" and "reason".
var (
	// mcoState reports the state of a machine config pool.
	// Per the original note the states are: pause, updated, updating, degraded.
	mcoState = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "mco_state",
			Help: "state of a specified pool",
		}, []string{"pool", "state", "reason"})
	// mcoMachineCount is the total number of machines in the pool
	mcoMachineCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "mco_machine_count",
			Help: "total number of machines in a specified pool",
		}, []string{"pool"})
	// mcoUpdatedMachineCount is the number of updated machines in the pool
	mcoUpdatedMachineCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "mco_updated_machine_count",
			Help: "total number of updated machines in specified pool",
		}, []string{"pool"})
	// mcoDegradedMachineCount is the number of degraded machines in the pool
	mcoDegradedMachineCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "mco_degraded_machine_count",
			Help: "total number of degraded machines in specified pool",
		}, []string{"pool"})
	// mcoUnavailableMachineCount is the number of unavailable machines in the pool
	mcoUnavailableMachineCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "mco_unavailable_machine_count",
			Help: "total number of unavailable machines in specified pool",
		}, []string{"pool"})
)

func RegisterMCOMetrics() error {
return ctrlcommon.RegisterMetrics([]prometheus.Collector{mcoState, mcoMachineCount, mcoUpdatedMachineCount, mcoDegradedMachineCount, mcoUnavailableMachineCount})
}
76 changes: 76 additions & 0 deletions pkg/operator/operator_test.go
@@ -0,0 +1,76 @@
package operator

import (
"fmt"
"testing"

configv1 "github.com/openshift/api/config/v1"
fakeconfigclientset "github.com/openshift/client-go/config/clientset/versioned/fake"
mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
"github.com/openshift/machine-config-operator/test/helpers"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
corelisterv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
)

// TestMetrics runs one syncAll pass against fake listers and clients, then
// checks that the per-pool machine-count gauges for the "worker" pool match
// the counts set on that pool's status.
func TestMetrics(t *testing.T) {
	optr := &Operator{
		eventRecorder: &record.FakeRecorder{},
	}
	optr.vStore = newVersionStore()

	// Two pools are listed; only "worker" gets non-zero counts, and it is the
	// pool the assertions below read back.
	p1, p2 := helpers.NewMachineConfigPool("master", nil, helpers.MasterSelector, "v0"), helpers.NewMachineConfigPool("worker", nil, helpers.WorkerSelector, "v0")
	p2.Status.MachineCount = 2
	p2.Status.UpdatedMachineCount = 1
	p2.Status.DegradedMachineCount = 1
	optr.mcpLister = &mockMCPLister{
		pools: []*mcfgv1.MachineConfigPool{p1, p2},
	}

	nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
	optr.nodeLister = corelisterv1.NewNodeLister(nodeIndexer)
	// Fail fast if the fixture node cannot be stored instead of silently
	// running the sync against an empty node lister.
	require.NoError(t, nodeIndexer.Add(&corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "first-node", Labels: map[string]string{"node-role/worker": ""}},
		Status: corev1.NodeStatus{
			NodeInfo: corev1.NodeSystemInfo{
				KubeletVersion: "v1.21",
			},
		},
	}))

	// A uniquely named ClusterOperator for this operator, plus a
	// kube-apiserver ClusterOperator, seed the fake config client.
	coName := fmt.Sprintf("test-%s", uuid.NewUUID())
	co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: coName}}
	optr.name = coName
	kasOperator := &configv1.ClusterOperator{
		ObjectMeta: metav1.ObjectMeta{Name: "kube-apiserver"},
		Status: configv1.ClusterOperatorStatus{
			Versions: []configv1.OperandVersion{
				{Name: "kube-apiserver", Version: "1.21"},
			},
		},
	}

	optr.configClient = fakeconfigclientset.NewSimpleClientset(co, kasOperator)
	err := optr.syncAll([]syncFunc{
		{name: "fn1",
			fn: func(config *renderConfig) error { return nil },
		},
	})
	require.NoError(t, err)

	// testify's Equal takes (expected, actual); keeping that order makes the
	// failure output label the values correctly.
	metric := testutil.ToFloat64(mcoMachineCount.WithLabelValues("worker"))
	assert.Equal(t, float64(2), metric)

	metric = testutil.ToFloat64(mcoUpdatedMachineCount.WithLabelValues("worker"))
	assert.Equal(t, float64(1), metric)

	metric = testutil.ToFloat64(mcoDegradedMachineCount.WithLabelValues("worker"))
	assert.Equal(t, float64(1), metric)
}

0 comments on commit c02a9d2

Please sign in to comment.