From a81cc66effc0a5a0131d7bda7f9777c7a732e483 Mon Sep 17 00:00:00 2001 From: Tesifonte Belda Date: Fri, 6 Jan 2023 19:13:06 +0100 Subject: [PATCH] [feature] New optional metrics for host services --- METRICS.md | 12 +++++ README.md | 4 ++ etc/vcstat.conf | 2 + internal/vccollector/host.go | 86 +++++++++++++++++++++++++++++++++ plugins/inputs/vcstat/vcstat.go | 18 +++++-- 5 files changed, 118 insertions(+), 4 deletions(-) diff --git a/METRICS.md b/METRICS.md index f573110..f4a4734 100644 --- a/METRICS.md +++ b/METRICS.md @@ -125,6 +125,18 @@ - duplex (string) - speed (int) - mac (string) +- vcstat_host_service + - tags: + - key + - esxhostname + - vcenter + - dcname + - clustername + - fields: + - label (string) + - policy (string) + - required (boolean) + - running (boolean) - vcstat_net_dvs - tags: - dvs diff --git a/README.md b/README.md index 14dd155..5433939 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,8 @@ Use telegraf v1.14 or above so that execd input is available. # host_hba_instances = false ## collect host network interface measurement (vcstat_host_nic) # host_nic_instances = false + ## collect host services measurement (vcstat_host_service) + # host_service_instances = false ## collect network distributed virtual switch measurement (vcstat_net_dvs) # net_dvs_instances = true ## collect network distributed virtual portgroup measurement (vcstat_net_dvp) @@ -105,6 +107,8 @@ vcstat_host_firewall,dcname=MyDC,clustername=MyCluster-01,esxhostname=myesxi01.l vcstat_host_hba,dcname=MyDC,clustername=MyCluster-01,device=vmhba0,driver=lpfc,esxhostname=myesxi01.local,vcenter=vcenter.local status="link-n/a",status_code=1i 1653060681000000000 vcstat_host_nic,dcname=MyDC,clustername=MyCluster-01,device=vmnic0,driver=ntg3,esxhostname=myesxi01.local,vcenter=vcenter.local link_status="Down",link_status_code=2i 1653060681000000000 vcstat_host_esxcli,dcname=MyDC,clustername=MyCluster-01,esxhostname=myesxi01.local,moid=host-706,vcenter=vcenter.local 
responding_code=0i,response_time_ns=109185876i 1653060681000000000
+vcstat_host_service,dcname=MyDC,clustername=MyCluster-01,esxhostname=myesxi01.local,key=ntpd,vcenter=vcenter.local label="NTP Daemon",policy="on",required=false,running=true 1653060681000000000
+vcstat_host_service,dcname=MyDC,clustername=MyCluster-01,esxhostname=myesxi01.local,key=vpxa,vcenter=vcenter.local label="VMware vCenter Agent",policy="on",required=false,running=true 1653060681000000000
 vcstat_net_dvs,dcname=MyDC,dvs=DSwitch-E1,moid=dvs-e1,vcenter=vcenter.local num_standalone_ports=0i,status="green",status_code=0i,num_ports=421i,max_ports=2147483647i 1653060682000000000
 vcstat_net_dvp,dcname=MyDC,dvp=DSwitch-E1-DVUplinks-e1,moid=dvportgroup-e1,uplink=true,vcenter=vcenter.local status="green",status_code=0i,num_ports=16i 1653060682000000000
 vcstat_datastore,dcname=MyDC,dsname=DS_Departement1,moid=datastore-725,type=VMFS,vcenter=vcenter.local accessible=true,capacity=2198754820096i,freespace=730054262784i,uncommitted=20511i,maintenance_mode="normal"
diff --git a/etc/vcstat.conf b/etc/vcstat.conf
index 11fdc77..3016a4f 100644
--- a/etc/vcstat.conf
+++ b/etc/vcstat.conf
@@ -31,6 +31,8 @@
   # host_hba_instances = false
   ## collect host network interface measurement (vcstat_host_nic)
   # host_nic_instances = false
+  ## collect host services measurement (vcstat_host_service)
+  # host_service_instances = false
   ## collect network distributed virtual switch measurement (vcstat_net_dvs)
   # net_dvs_instances = true
   ## collect network distributed virtual portgroup measurement (vcstat_net_dvp)
diff --git a/internal/vccollector/host.go b/internal/vccollector/host.go
index 9d053f3..8d07288 100644
--- a/internal/vccollector/host.go
+++ b/internal/vccollector/host.go
@@ -386,6 +386,116 @@ func (c *VcCollector) CollectHostFw(
 	return nil
 }
 
+// CollectHostServices gathers host services info (like govc: host.service.ls)
+// and reports one vcstat_host_service metric per service of each host.
+//
+// For every datacenter it resolves the HostServiceSystem of each host, queries
+// the "serviceInfo.service" property in chunks of c.queryBulkSize references
+// and adds the label, policy, required and running fields tagged with service
+// key, host, cluster, datacenter and vCenter.
+//
+// It returns an error when there is no client, when the inventory cannot be
+// listed or when govplus.IsHardQueryError flags a query error as fatal; other
+// failures are reported with acc.AddError and the host or chunk is skipped.
+func (c *VcCollector) CollectHostServices(
+	ctx context.Context,
+	acc telegraf.Accumulator,
+) error {
+	var (
+		hstags       = make(map[string]string)
+		hsfields     = make(map[string]interface{})
+		hrefs, srefs []types.ManagedObjectReference
+		s            *object.HostServiceSystem
+		hostSt       *hostState
+		t            time.Time
+		err          error
+	)
+
+	if c.client == nil || c.coll == nil {
+		return fmt.Errorf("Could not get host services info: %w", govplus.ErrorNoClient)
+	}
+	if err = c.getAllDatacentersClustersAndHosts(ctx); err != nil {
+		return fmt.Errorf("Could not get cluster and host entity list: %w", err)
+	}
+
+	for i, dc := range c.dcs {
+		// start from empty lists so that references gathered for a previous
+		// datacenter are neither queried again nor reported under this one
+		hrefs, srefs = hrefs[:0], srefs[:0]
+
+		// get HostServiceSystem references and split the list into chunks
+		for j, host := range c.hosts[i] {
+			if hostSt = c.getHostStateIdx(i, j); hostSt == nil {
+				acc.AddError(fmt.Errorf("Could not find host state for %s", host.Name()))
+				continue
+			}
+			s, err = host.ConfigManager().ServiceSystem(ctx)
+			if err != nil {
+				// a single failing host should not abort the whole collection
+				if err, exit := govplus.IsHardQueryError(err); exit {
+					return err
+				}
+				acc.AddError(fmt.Errorf("Could not get host service system: %w", err))
+				continue
+			}
+			hrefs = append(hrefs, host.Reference())
+			srefs = append(srefs, s.Reference())
+		}
+		chunks := chunckMoRefSlice(srefs, c.queryBulkSize)
+
+		for _, refs := range chunks {
+			// use a fresh destination per chunk so that objects retrieved for
+			// an earlier chunk cannot be reported a second time
+			var hsMos []mo.HostServiceSystem
+			err = c.coll.Retrieve(ctx, refs, []string{"serviceInfo.service"}, &hsMos)
+			if err != nil {
+				if err, exit := govplus.IsHardQueryError(err); exit {
+					return err
+				}
+				acc.AddError(
+					fmt.Errorf("Could not retrieve info for host service reference list: %w", err),
+				)
+				continue
+			}
+			t = time.Now()
+
+			for _, hsMo := range hsMos {
+				// find host of this service
+				var host *object.HostSystem
+				sref := hsMo.Self.Reference()
+				for k, r := range srefs {
+					if r == sref {
+						hsref := hrefs[k]
+						host = c.getHostObjectFromReference(i, &hsref)
+						break
+					}
+				}
+				if host == nil {
+					// never fall back to the host matched for a previous
+					// object: that would tag the services with the wrong host
+					acc.AddError(fmt.Errorf("Could not find host for service system %s", sref.Value))
+					continue
+				}
+				hstags["clustername"] = c.getClusternameFromHost(i, host)
+				hstags["dcname"] = dc.Name()
+				hstags["esxhostname"] = host.Name()
+				hstags["vcenter"] = c.client.Client.URL().Host
+
+				for _, service := range hsMo.ServiceInfo.Service {
+					hstags["key"] = service.Key
+					hsfields["label"] = service.Label
+					hsfields["policy"] = service.Policy
+					hsfields["required"] = service.Required
+					hsfields["running"] = service.Running
+					acc.AddFields("vcstat_host_service", hsfields, hstags, t)
+				}
+			}
+		}
+	}
+
+	return nil
+}
+
 // ReportHostEsxcliResponse reports metrics about host esxcli command responses
 func (c *VcCollector) ReportHostEsxcliResponse(
 	ctx context.Context,
diff --git a/plugins/inputs/vcstat/vcstat.go b/plugins/inputs/vcstat/vcstat.go
index 87bb1bd..eaee09f 100644
--- a/plugins/inputs/vcstat/vcstat.go
+++ b/plugins/inputs/vcstat/vcstat.go
@@ -29,7 +29,7 @@ type VCstatConfig struct {
 	Password            string `toml:"password"`
 	Timeout             config.Duration
 	IntSkipNotRespondig int16           `toml:"intervals_skip_notresponding_esxcli_hosts"`
-	QueryBulkSize int `toml:"query_bulk_size"`
+	QueryBulkSize       int             `toml:"query_bulk_size"`
 	Log                 telegraf.Logger `toml:"-"`
 
 	ClusterInstances   bool `toml:"cluster_instances"`
@@ -39,6 +39,7 @@ type VCstatConfig struct {
 	HostNICInstances   bool `toml:"host_nic_instances"`
 	HostFwInstances    bool `toml:"host_firewall_instances"`
 	HostGraphics       bool `toml:"host_graphics_instances"`
+	HostServices       bool `toml:"host_service_instances"`
 	NetDVSInstances    bool `toml:"net_dvs_instances"`
 	NetDVPInstances    bool `toml:"net_dvp_instances"`
 	VMInstances        bool `toml:"vm_instances"`
@@ -85,6 +86,8 @@ var sampleConfig = `
   # host_hba_instances = false
   ## collect host network interface measurement (vcstat_host_nic)
   # host_nic_instances = false
+  ## collect host services measurement (vcstat_host_service)
+  # host_service_instances = false
   ## collect network distributed virtual switch measurement (vcstat_net_dvs)
   # net_dvs_instances = true
   ## collect network distributed virtual portgroup measurement (vcstat_net_dvp)
@@ -101,13 +104,14 @@ func init() {
 			Username:            "user@corp.local",
 			Password:            "secret",
 			Timeout:             config.Duration(time.Second * 0),
-			QueryBulkSize: 100,
+			QueryBulkSize:       100,
 			IntSkipNotRespondig: 20,
 			ClusterInstances:    true,
DatastoreInstances: false, HostInstances: true, HostFwInstances: false, HostGraphics: false, + HostServices: false, HostHBAInstances: false, HostNICInstances: false, NetDVSInstances: true, @@ -231,7 +235,7 @@ func (vcs *VCstatConfig) Gather(acc telegraf.Accumulator) error { // selfmonitoring vcs.GatherTime.Set(int64(time.Since(startTime).Nanoseconds())) - if vcs.HostHBAInstances || vcs.HostNICInstances || vcs.HostFwInstances || vcs.HostGraphics { + if vcs.HostHBAInstances || vcs.HostNICInstances || vcs.HostFwInstances { vcs.NotRespondingHosts.Set(int64(vcs.vcc.GetNumberNotRespondingHosts())) } for _, m := range selfstat.Metrics() { @@ -338,7 +342,13 @@ func (vcs *VCstatConfig) gatherHost(ctx context.Context, acc telegraf.Accumulato } } - if vcs.HostHBAInstances || vcs.HostNICInstances || vcs.HostFwInstances || vcs.HostGraphics { + if vcs.HostServices { + if err = col.CollectHostServices(ctx, acc); err != nil { + return err + } + } + + if vcs.HostHBAInstances || vcs.HostNICInstances || vcs.HostFwInstances { if err = col.ReportHostEsxcliResponse(ctx, acc); err != nil { return err }