#!/usr/bin/env python
#
# perfmon - a daemon for monitoring performance of the host on which it is run
# and of all the local VMs, and for generating events based on configurable
# triggers
#
# Notes:
# ======
# The XAPI instance running on localhost monitors a number of variables
# for each VM running locally (i.e. not on other pool members) and
# for the host itself. Each variable is stored in 16 RRDs (Round Robin Databases).
#
# Consolidation            Number of samples in RRD
#   function      5s/sample    1m/sample    1hr/sample   1day/sample
#   AVERAGE       120 (10m)    120 (2h)     ?            ?
#   MIN           120 (10m)    120 (2h)     ?            ?
#   MAX           120 (10m)    120 (2h)     ?            ?
#   LAST          120 (10m)    120 (2h)     ?            ?
#
# The "Consolidation function" tells how that RRD is built up from the
# one with the next highest sample rate. E.g. in the 1m/sample "AVERAGE" RRD
# each sample is the average of 12 from the 5s/sample "AVERAGE" RRD, whereas
# in the 1m/sample "MIN" RRD each sample is the minimum of 12 from the 5s/sample
# "AVERAGE" RRD.
#
# When XAPI is queried over http it selects the column (e.g. "1hr/sample")
# based on the "start" CGI param. It will return the highest level of granularity
# available for the period requested.
#
# The "cf" CGI param specifies the row. (All rows are returned if it's missing.)
import sys | |
import os | |
import getopt | |
import traceback | |
import XenAPI | |
import urllib | |
from xml import sax # used to parse rrd_updates because this may be large and sax is more efficient | |
from xml.dom import minidom # used to parse other-config:perfmon. Efficiency is less important than reliability here | |
from xml.parsers.expat import ExpatError | |
import time | |
import re | |
import random | |
import syslog | |
import socket | |
import gc | |
import signal | |
import commands | |
def print_debug(string): | |
if debug: | |
print >>sys.stderr, "DEBUG:", string | |
syslog.syslog(syslog.LOG_USER | syslog.LOG_INFO, "PERFMON(DEBUG): %s" % string) | |
def log_err(string):
    print >>sys.stderr, string
    syslog.syslog(syslog.LOG_USER | syslog.LOG_ERR, "PERFMON: %s" % string)

def log_info(string):
    print >>sys.stderr, string
    syslog.syslog(syslog.LOG_USER | syslog.LOG_INFO, "PERFMON: %s" % string)
def debug_mem(): | |
objCount = {} | |
gc.collect() | |
objList = gc.get_objects() | |
for obj in objList: | |
if getattr(obj, "__class__", None): | |
name = obj.__class__.__name__ | |
else: | |
name = type(obj) | |
if objCount.has_key(name): | |
objCount[name] += 1 | |
else: | |
objCount[name] = 1 | |
output = [] | |
for name in objCount: | |
output.append("%s :%s" % (name, objCount[name])) | |
log_info("\n".join(output)) | |
class PerfMonException(Exception): | |
pass | |
class XmlConfigException(PerfMonException): | |
pass | |
class UsageException(Exception): | |
pass | |
# Start a session with the master of a pool.
# Note: when calling http://localhost/rrd_updates we must pass the session
# ID as a param. The host then uses this to verify our validity with
# the master before responding.
# If the verification fails we should get a 401 response
class XapiSession(XenAPI.Session): | |
""" Object that represents a XenAPI session with the pool master | |
One of these is needed to refresh a VMMonitor or HOSTMonitor config, or | |
to refresh an RRDUpdates object | |
""" | |
def __init__(self): | |
XenAPI.Session.__init__(self, "http://_var_xapi_xapi", transport=XenAPI.UDSTransport()) | |
self.xenapi.login_with_password("", "", "1.0", "xen-api-scripts-perfmon") | |
def __del__ (self): | |
self.xenapi.session.logout() | |
def id(self): | |
return self._session | |
class ObjectReport: | |
def __init__(self, objtype, uuid): | |
self.objtype = objtype # a string like "vm", or "host" taken from an <entry> tag | |
self.uuid = uuid # the object's uuid | |
self.vars = {} # maps rrd variable name to array of floats | |
def get_uuid(self): | |
return self.uuid | |
def get_var_names(self): | |
return self.vars.keys() | |
def get_value(self, var_name, row): | |
try: | |
return (self.vars[var_name])[row] | |
except: | |
return 0.0 | |
def insert_value(self, var_name, index, value): | |
if not self.vars.has_key(var_name): | |
self.vars[var_name] = [] | |
self.vars[var_name].insert(index, value) | |
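# Note: rrd_updates returns its rows newest-first (see the RRDContentHandler
# docstring below), so inserting each new sample at index 0 leaves the arrays in
# ObjectReport.vars in chronological order.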
class RRDReport: | |
"This is just a data structure passed that is completed by RRDContentHandler" | |
def __init__(self): | |
self.reset() | |
def reset(self): | |
self.columns = 0 # num xapi vars in xml | |
self.rows = 0 # num samples in xml | |
self.start_time = 0 # timestamp of 1st sample in xml | |
self.end_time = 0 # timestamp of last sample in xml | |
self.step_time = 0 # seconds between each pair of samples | |
self.obj_reports = {} # maps uuids to ObjectReports, built from xml | |
class RRDColumn: | |
"class used internally by RRDContentHandler" | |
def __init__(self, paramname, obj_report): | |
self.paramname = paramname | |
self.obj_report = obj_report | |
class RRDContentHandler(sax.ContentHandler): | |
""" Handles data in this format: | |
<xport> | |
<meta> | |
<start>INTEGER</start> | |
<step>INTEGER</step> | |
<end>INTEGER</end> | |
<rows>INTEGER</rows> | |
<columns>INTEGER</columns> | |
<legend> | |
          <entry>IGNOREME:(host|vm|sr):UUID:PARAMNAME</entry>
... another COLUMNS-1 entries ... | |
</legend> | |
</meta> | |
<data> | |
<row> | |
<t>INTEGER(END_TIME)</t> | |
<v>FLOAT</v> | |
... another COLUMNS-1 values ... | |
</row> | |
... another ROWS-2 rows | |
<row> | |
<t>INTEGER(START_TIME)</t> | |
<v>FLOAT</v> | |
... another COLUMNS-1 values ... | |
</row> | |
</data> | |
</xport> | |
""" | |
    def __init__(self, report):
        "report is saved and later updated by this object. report should contain defaults already"
        self.report = report
        self.in_start_tag = False
        self.in_step_tag = False
        self.in_end_tag = False
        self.in_rows_tag = False
        self.in_columns_tag = False
        self.in_entry_tag = False
        self.in_row_tag = False
        self.in_t_tag = False
        self.in_v_tag = False
        self.column_details = []
        self.row = 0
def startElement(self, name, attrs): | |
self.raw_text = "" | |
if name == 'start': | |
self.in_start_tag = True | |
elif name == 'step': | |
self.in_step_tag = True | |
elif name == 'end': | |
self.in_end_tag = True | |
elif name == 'rows': | |
self.in_rows_tag = True | |
elif name == 'columns': | |
self.in_columns_tag = True | |
elif name == 'entry': | |
self.in_entry_tag = True | |
elif name == 'row': | |
self.in_row_tag = True | |
self.col = 0 | |
if self.in_row_tag: | |
if name == 't': | |
self.in_t_tag = True | |
elif name == 'v': | |
self.in_v_tag = True | |
def characters(self, chars): | |
if (self.in_start_tag or | |
self.in_step_tag or | |
self.in_end_tag or | |
self.in_rows_tag or | |
self.in_columns_tag or | |
self.in_entry_tag or | |
#self.in_row_tag # ignore text under row tag, <row>s are just for holding <t> and <v> nodes | |
self.in_t_tag or | |
self.in_v_tag): | |
self.raw_text += chars | |
def endElement(self, name): | |
if name == 'start': | |
            # This is overwritten later if there are any rows
self.report.start_time = int(self.raw_text) | |
self.in_start_tag = False | |
elif name == 'step': | |
self.report.step_time = int(self.raw_text) | |
self.in_step_tag = False | |
elif name == 'end': | |
            # This is overwritten later if there are any rows
self.report.end_time = int(self.raw_text) | |
self.in_end_tag = False | |
elif name == 'rows': | |
self.report.rows = int(self.raw_text) | |
self.in_rows_tag = False | |
elif name == 'columns': | |
self.report.columns = int(self.raw_text) | |
self.in_columns_tag = False | |
elif name == 'entry': | |
(_, objtype, uuid, paramname) = self.raw_text.split(':') | |
# lookup the obj_report corresponding to this uuid, or create if it does not exist | |
if not self.report.obj_reports.has_key(uuid): | |
self.report.obj_reports[uuid] = ObjectReport(objtype, uuid) | |
obj_report = self.report.obj_reports[uuid] | |
# save the details of this column | |
self.column_details.append(RRDColumn(paramname, obj_report)) | |
self.in_entry_tag = False | |
elif name == 'row': | |
self.in_row_tag = False | |
self.row += 1 | |
elif name == 't': | |
# Extract start and end time from row data as it's more reliable than the values in the meta data | |
t = int(self.raw_text) | |
# Last row corresponds to start time | |
self.report.start_time = t | |
if self.row == 0: | |
# First row corresponds to end time | |
self.report.end_time = t | |
self.in_t_tag = False | |
elif name == 'v': | |
v = float(self.raw_text) | |
# Find object report and paramname for this col | |
col_details = self.column_details[self.col] | |
obj_report = col_details.obj_report | |
paramname = col_details.paramname | |
# Update object_report | |
obj_report.insert_value(paramname, index=0, value=v) # use index=0 as this is the earliest sample so far | |
# Update position in row | |
self.col += 1 | |
            self.in_v_tag = False
# An object of this class should persist for the lifetime of the program
class RRDUpdates:
    """ Object used to get and parse the output of http://localhost/rrd_updates?...
""" | |
def __init__(self): | |
# params are what get passed to the CGI executable in the URL | |
self.params = dict() | |
self.params['start'] = int(time.time()) - interval # interval seconds ago | |
self.params['host'] = 'true' # include data for host (as well as for VMs) | |
self.params['sr_uuid'] = 'all' # include data for all SRs attached to this host | |
self.params['cf'] = 'AVERAGE' # consolidation function, each sample averages 12 from the 5 second RRD | |
self.params['interval'] = str(rrd_step) # distinct from the perfmon interval | |
self.report = RRDReport() # data structure updated by RRDContentHandler | |
def __repr__(self): | |
return '<RRDUpdates object: params=%s>' % str(self.params) | |
    def refresh(self, session, override_params = None):
        "reread the rrd_updates over CGI and parse"
        # Copy so we never mutate the caller's dict (and avoid a shared mutable default arg)
        params = dict(override_params or {})
        params['session_id'] = session.id()
        params.update(self.params)
paramstr = "&".join(["%s=%s" % (k,params[k]) for k in params]) | |
print_debug("Calling http://localhost/rrd_updates?%s" % paramstr) | |
# this is better than urllib.urlopen() as it raises an Exception on http 401 'Unauthorised' error | |
# rather than drop into interactive mode | |
sock = urllib.URLopener().open("http://localhost/rrd_updates?%s" % paramstr) | |
xmlsource = sock.read() | |
sock.close() | |
        # Use sax rather than minidom and save vast amounts of time and memory.
self.report.reset() | |
sax.parseString(xmlsource, RRDContentHandler(self.report)) | |
# Update the time used on the next run | |
self.params['start'] = self.report.end_time + 1 # avoid retrieving same data twice | |
print_debug("Refreshed rrd_updates, start = %d, end = %d, rows = %d" % \ | |
(self.report.start_time, self.report.end_time, self.report.rows)) | |
def get_num_rows(self): | |
"Return the number of samples of each parameter" | |
return self.report.rows | |
def get_obj_report_by_uuid(self, uuid): | |
"Return an ObjectReport for the object with this uuid" | |
try: | |
return self.report.obj_reports[uuid] | |
except: | |
return None | |
def get_uuid_list_by_objtype(self, objtype): | |
"Return a list of uuids corresonding to the objects of this type for which we have ObjectReports" | |
return [ objrep.uuid | |
for objrep in self.report.obj_reports.values() | |
if objrep.objtype == objtype ] | |
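# A sketch of how this class is used (it mirrors what main() below does):
#   rrd_updates = RRDUpdates()
#   rrd_updates.refresh(session)
#   for uuid in rrd_updates.get_uuid_list_by_objtype('vm'):
#       obj_report = rrd_updates.get_obj_report_by_uuid(uuid)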
# Consolidation functions: | |
supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage', 'get_percent_log_fs_usage', 'get_percent_mem_usage', 'get_percent_sr_usage' ] | |
def average(mylist): | |
return sum(mylist)/float(len(mylist)) | |
def get_percent_log_fs_usage(ignored): | |
"Get the percent usage of the host filesystem for logs partition. Input list is ignored and should be empty" | |
fs_output = commands.getoutput('df /etc/passwd') | |
log_fs_output = commands.getoutput('df /var/log') | |
fs_output = ' '.join(fs_output.splitlines()[1:]) | |
log_fs_output = ' '.join(log_fs_output.splitlines()[1:]) | |
# Get the percent usage only when there is a separate logs partition | |
if (fs_output.split()[0] != log_fs_output.split()[0]): | |
percentage = log_fs_output.split()[4] | |
# remove % character and convert to float | |
return float(percentage[0:-1])/100.0 | |
else: | |
return float('NaN') | |
def get_percent_fs_usage(ignored): | |
"Get the percent usage of the host filesystem. Input list is ignored and should be empty" | |
# this file is on the filesystem of interest in both OEM and Retail | |
output = commands.getoutput('df /etc/passwd') | |
output = ' '.join(output.splitlines()[1:]) # remove header line and rewrap on single line | |
percentage = output.split()[4] | |
# remove % character and convert to float | |
return float(percentage[0:-1])/100.0 | |
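# Illustrative 'df /etc/passwd' output (hypothetical numbers) showing why field
# index 4 of the rewrapped data line is the "Use%" column parsed above:
#   Filesystem     1K-blocks    Used Available Use% Mounted on
#   /dev/sda1        4193130 2243480   1949650  54% /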
def get_percent_mem_usage(ignored): | |
"Get the percent usage of Dom0 memory/swap. Input list is ignored and should be empty" | |
try: | |
memfd = open('/proc/meminfo', 'r') | |
memlist = memfd.readlines() | |
memfd.close() | |
memdict = [ m.split(':', 1) for m in memlist ] | |
memdict = dict([(k.strip(), float(re.search('\d+', v.strip()).group(0))) for (k,v) in memdict]) | |
# We consider the sum of res memory and swap in use as the hard demand | |
# of mem usage, it is bad if this number is beyond the physical mem, as | |
# in such case swapping is obligatory rather than voluntary, hence | |
# degrading the performance. We define the percentage metrics as | |
# (res_mem + swap_in_use) / phy_mem, which could potentially go beyond | |
# 100% (but is considered bad when it does) | |
mem_in_use = memdict['MemTotal'] - memdict['MemFree'] - memdict['Buffers'] - memdict['Cached'] | |
swap_in_use = memdict['SwapTotal'] - memdict['SwapFree'] | |
return float(mem_in_use + swap_in_use) / memdict['MemTotal'] | |
except Exception, e: | |
log_err("Error %s in get_percent_mem_usage, return 0.0 instead" % e) | |
return 0.0 | |
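# Worked example with hypothetical /proc/meminfo values (all in kB):
#   MemTotal=4194304 MemFree=524288 Buffers=262144 Cached=1048576 SwapTotal=1048576 SwapFree=786432
#   mem_in_use  = 4194304 - 524288 - 262144 - 1048576 = 2359296
#   swap_in_use = 1048576 - 786432 = 262144
#   result      = (2359296 + 262144) / 4194304 = 0.625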
def get_percent_sr_usage(mylist): | |
"""Get the percent usage of the SR. Input list should be exactly two items: [physical_utilisation, size]""" | |
try: | |
if len(mylist) != 2: | |
raise Exception("Incorrect number of values to consolidate: %d (exactly 2 values)" % len(mylist)) | |
physical_utilisation, size = mylist[0:2] | |
return float(physical_utilisation) / size | |
except Exception, e: | |
log_err("Error %s in get_percent_sr_usage, return 0.0 instead" % e) | |
return 0.0 | |
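# Example (hypothetical values): mylist = [physical_utilisation, size] =
# [161061273600.0, 322122547200.0], i.e. 150 GiB used of a 300 GiB SR, returns 0.5.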
class VariableConfig: | |
"""Object storing the configuration of a Variable | |
Initialisation parameters: | |
xmldoc = dom object representing the <variable> nodes in the ObjectMonitor config strings. | |
See VMMonitor.__doc__ and HOSTMonitor.__doc__ | |
alarm_create_callback = | |
callback called by Variable.update() to create and send an alarm | |
get_default_variable_config = | |
a function that VariableConfig.__init__() uses to lookup default tag values | |
by variable name | |
""" | |
def __init__(self, xmldoc, alarm_create_callback, get_default_variable_config): | |
try: name = xmldoc.getElementsByTagName('name')[0].getAttribute('value') | |
except IndexError: raise XmlConfigException, "variable missing 'name' tag" | |
def get_value(tag): | |
try: | |
return xmldoc.getElementsByTagName(tag)[0].getAttribute('value') | |
except: | |
return get_default_variable_config(name, tag) | |
rrd_regex = get_value('rrd_regex') | |
consolidation_fn = get_value('consolidation_fn') | |
alarm_trigger_level = get_value('alarm_trigger_level') | |
alarm_trigger_period = get_value('alarm_trigger_period') | |
alarm_auto_inhibit_period = get_value('alarm_auto_inhibit_period') | |
alarm_trigger_sense = get_value('alarm_trigger_sense') | |
alarm_priority = get_value('alarm_priority') | |
# Save xmldoc: we need this when creating the body of the alarms | |
self.xmldoc = xmldoc | |
self.name = name | |
try: | |
self.rrd_regex = re.compile("^%s$" % rrd_regex) | |
except: | |
raise XmlConfigException, "variable %s: regex %s does not compile" % (name, rrd_regex) | |
if consolidation_fn not in supported_consolidation_functions: | |
raise XmlConfigException, "variable %s: consolidation function %s not supported" \ | |
% (name, consolidation_fn) | |
self.consolidation_fn = eval(consolidation_fn) | |
try: | |
self.alarm_trigger_period = int(alarm_trigger_period) | |
except: | |
raise XmlConfigException, "variable %s: alarm_trigger_period %s not an int" % \ | |
(name, alarm_trigger_period) | |
try: | |
self.alarm_auto_inhibit_period = int(alarm_auto_inhibit_period) | |
except: | |
raise XmlConfigException, "variable %s: alarm_auto_inhibit_period %s not an int" % \ | |
(name, alarm_auto_inhibit_period) | |
try: | |
trigger_level = float(alarm_trigger_level) | |
except: | |
raise XmlConfigException, "variable %s: alarm_trigger_level %s not a float" % \ | |
(name, alarm_trigger_level) | |
self.alarm_priority = alarm_priority | |
if alarm_trigger_sense == "high": | |
self.test_level = lambda : (self.value > trigger_level) | |
else: | |
self.test_level = lambda : (self.value < trigger_level) | |
self.alarm_create_callback = alarm_create_callback | |
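# Illustrative fully-specified <variable> node as consumed by VariableConfig
# (all values here are hypothetical; any tag other than <name> may be omitted,
# in which case the owning monitor's get_default_variable_config() supplies it):
#
#   <variable>
#     <name value="cpu_usage"/>
#     <alarm_trigger_level value="0.5"/>
#     <alarm_trigger_sense value="high"/>
#     <alarm_trigger_period value="60"/>
#     <alarm_auto_inhibit_period value="3600"/>
#     <consolidation_fn value="average"/>
#     <rrd_regex value="cpu[0-9]+"/>
#     <alarm_priority value="3"/>
#   </variable>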
def variable_configs_differ(vc1, vc2): | |
"Say whether configuration of one variable differs from that of another" | |
return vc1.xmldoc.toxml() != vc2.xmldoc.toxml() | |
class VariableState: | |
""" Object storing the state of a Variable | |
""" | |
def __init__(self): | |
self.value = None | |
self.timeof_last_alarm = time.time() - self.alarm_auto_inhibit_period | |
self.trigger_down_counter = self.alarm_trigger_period | |
class Variable(VariableConfig, VariableState): | |
""" Variable() is used by ObjectMonitor to create one Variable object for each | |
    variable specified in its config string
""" | |
def __init__(self, *args): | |
VariableConfig.__init__(self, *args) | |
VariableState.__init__(self) | |
self.active = True | |
print_debug("Created Variable %s" % self.name) | |
def set_active(self, active): | |
print_debug("set_active on %s. (old, new) = (%s, %s)" % (self.name, self.active, active)) | |
if active == self.active: | |
return # nothing to do | |
self.active = active | |
if active: | |
VariableState.__init__(self) # reset when reactivating | |
def __generate_alarm(self, session): | |
""" Generate an alarm using callback provided by creator | |
... provided that one has not been generated in the last | |
self.alarm_auto_inhibit_period seconds | |
""" | |
t = time.time() | |
delta = t - self.timeof_last_alarm | |
print_debug("Time since last alarm for var %s is %d - %d = %d. Refractory period = %d." % (self.name, t, self.timeof_last_alarm, delta, self.alarm_auto_inhibit_period)) | |
if delta < self.alarm_auto_inhibit_period: | |
return # we are in the auto inhibit period - do nothing | |
self.timeof_last_alarm = t | |
message = "value: %f\nconfig:\n%s" % (self.value, self.xmldoc.toprettyxml()) | |
self.alarm_create_callback(self, session, message) | |
def update(self, value, session): | |
"""Update the value of the variable using an RRDUpdates object | |
Calls self.__generate_alarm() if level has been 'bad' for more than | |
self.alarm_trigger_period seconds | |
""" | |
self.value = value | |
print_debug("Variable %s set to %f" % (self.name, value)) | |
if self.test_level(): | |
# level is bad | |
self.trigger_down_counter -= rrd_step | |
if self.trigger_down_counter <= 0: | |
self.__generate_alarm(session) | |
# reset trigger counter | |
self.trigger_down_counter = self.alarm_trigger_period | |
else: | |
# level good - reset trigger counter | |
self.trigger_down_counter = self.alarm_trigger_period | |
class ObjectMonitor: | |
"""Abstract class, used as base for VMMonitor and HOSTMonitor | |
Public attributes are uuid, refresh_config() | |
Inherited classes must implement a public attribute process_rrd_updates() | |
""" | |
def __init__(self, uuid): | |
self.uuid = uuid | |
self.xmlconfig = None | |
# "variables" is the public attribute of interest | |
self.variables = [] | |
self.refresh_config() | |
def refresh_config(self): | |
if self.__update_xmlconfig(): | |
# config has changed - reparse it | |
try: | |
self.__parse_xmlconfig() | |
except XmlConfigException, e: | |
errmsg = "\n".join([ str(x) for x in e.args ]) | |
log_err("%s %s config error: %s" % (self.monitortype, self.uuid, errmsg)) | |
except ExpatError, e: | |
errmsg = "\n".join([ str(x) for x in e.args ]) | |
log_err("%s %s XML parse error: %s" % (self.monitortype, self.uuid, errmsg)) | |
return True | |
else: | |
return False # config unchanged | |
def __update_xmlconfig(self): | |
if not all_xmlconfigs.has_key(self.uuid): | |
xmlconfig = None | |
else: | |
xmlconfig = all_xmlconfigs[self.uuid] | |
changed = False | |
if xmlconfig != self.xmlconfig: | |
self.xmlconfig = xmlconfig | |
changed = True | |
return changed | |
def __parse_xmlconfig(self): | |
if not self.xmlconfig: | |
# Possible if this VM/host is not configured yet | |
self.variables = [] | |
return | |
xmldoc = minidom.parseString(self.xmlconfig) | |
variable_nodes = xmldoc.getElementsByTagName('variable') | |
variable_names = [] | |
for vn in variable_nodes: | |
# create a variable using the config in vn | |
var = Variable(vn, self.alarm_create, self.get_default_variable_config) | |
# Update list of variable names | |
if var.name not in variable_names: | |
variable_names.append(var.name) | |
# build list of variables already present with same name | |
vars_with_same_name = [ v for v in self.variables if v.name == var.name ] | |
count = 0 | |
append_var = True | |
for v in vars_with_same_name: | |
# this list should be 0 or 1 long! | |
if count > 0: | |
log_err("programmer error: found duplicate variable %s (uuid %s)" % (var.name, self.uuid)) | |
self.variables.remove(v) | |
continue | |
count += 1 | |
# only replace variable in self.variables if its config has changed. | |
# This way we don't reset its state | |
if variable_configs_differ(var, v): | |
self.variables.remove(v) | |
else: | |
append_var = False | |
if append_var: | |
print_debug("Appending %s to list of variables for %s UUID=%s" % (var.name, self.monitortype, self.uuid)) | |
self.variables.append(var) | |
# Now delete any old variables that do not appear in the new variable_nodes | |
variables_to_remove = [ v for v in self.variables if v.name not in variable_names ] | |
for v in variables_to_remove: | |
print_debug("Deleting %s from list of variables for UUID=%s" % (v.name, self.uuid)) | |
self.variables.remove(v) | |
def get_active_variables(self): | |
return self.variables | |
def process_rrd_updates(self, rrd_updates, session): | |
print_debug("%sMonitor processing rrd_updates for %s" % (self.monitortype, self.uuid)) | |
obj_report = rrd_updates.get_obj_report_by_uuid(self.uuid) | |
num_rows = rrd_updates.get_num_rows() | |
if not obj_report: | |
return | |
params_in_obj_report = obj_report.get_var_names() | |
for var in self.get_active_variables(): | |
# find the subset of the params returned for this object that we need to consolidate into var | |
params_to_consolidate = filter(var.rrd_regex.match, params_in_obj_report) | |
for row in range(num_rows): | |
# Get the values to consolidate | |
values_to_consolidate = map(lambda param: obj_report.get_value(param, row), params_to_consolidate) | |
# Consolidate them | |
value = var.consolidation_fn(values_to_consolidate) | |
# Pass result on to the variable object - this may result in an alarm being generated | |
var.update(value, session) | |
def alarm_create(self, var, session, message): | |
"Callback used by Variable var to actually send an alarm" | |
print_debug("Creating an alarm for %s %s, message: %s" % (self.monitortype, self.uuid, message)) | |
session.xenapi.message.create("ALARM", var.alarm_priority, self.monitortype, self.uuid, message) | |
class VMMonitor(ObjectMonitor): | |
"""Object that maintains state of one VM | |
Configured by writing an xml string into an other-config key, e.g. | |
xe vm-param-set uuid=$vmuuid other-config:perfmon=\ | |
'<config><variable><name value="cpu_usage"/><alarm_trigger_level value="0.5"/></variable></config>' | |
Notes: | |
- Multiple <variable> nodes allowed | |
- full list of child nodes is | |
* name: what to call the variable (no default) | |
* alarm_priority: the priority of the messages generated (default '3') | |
* alarm_trigger_level: level of value that triggers an alarm (no default) | |
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') | |
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') | |
* alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600') | |
* consolidation_fn: how to combine variables from rrd_updates into one value | |
(default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', 'get_percent_log_fs_usage' for 'log_fs_usage', 'get_percent_mem_usage' for 'mem_usage', & 'sum' for everything else) | |
      * rrd_regex matches the names of variables from (xe vm-data-source-list uuid=$vmuuid) used to compute value
(only has defaults for "cpu_usage", "network_usage", and "disk_usage") | |
""" | |
def __init__(self, *args): | |
self.monitortype = "VM" | |
ObjectMonitor.__init__(self, *args) | |
print_debug("Created VMMonitor with uuid %s" % self.uuid) | |
def get_default_variable_config(self, variable_name, config_tag): | |
"This allows user to not specify full set of tags for each variable in xml config" | |
if config_tag == 'consolidation_fn': | |
if variable_name == "cpu_usage": return 'average' | |
elif variable_name == "fs_usage": return 'get_percent_fs_usage' | |
elif variable_name == "log_fs_usage": return 'get_percent_log_fs_usage' | |
elif variable_name == "mem_usage": return 'get_percent_mem_usage' | |
else: return 'sum' | |
elif config_tag == 'rrd_regex': | |
if variable_name == "cpu_usage": return "cpu[0-9]+" | |
elif variable_name == "network_usage": return "vif_[0-9]+_[rt]x" | |
elif variable_name == "disk_usage": return "vbd_(xvd|hd)[a-z]+_(read|write)" | |
elif variable_name == "fs_usage": return "_$_DUMMY__" # match nothing | |
elif variable_name == "log_fs_usage": return "_$_DUMMY__" # match nothing | |
elif variable_name == "mem_usage": return "_$_DUMMY__" # match nothing | |
else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name | |
elif config_tag == 'alarm_trigger_period': return '60' # 1 minute | |
elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour | |
elif config_tag == 'alarm_trigger_level': | |
if variable_name == "fs_usage": return '0.9' # trigger when 90% full | |
elif variable_name == "log_fs_usage": return '0.9' # trigger when 90% full | |
elif variable_name == "mem_usage": return '0.95' # tigger when mem demanded is close to phy_mem | |
else: raise XmlConfigException, "variable %s: no default alarm_trigger_level - please specify one" % variable_name | |
elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above* | |
elif config_tag == 'alarm_priority': return '3' # Service degradation level defined in PR-1455 | |
else: raise XmlConfigException, "variable %s: no default available for tag %s" % (variable_name, config_tag) | |
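# For example, the minimal config shown in the VMMonitor docstring,
#   <config><variable><name value="cpu_usage"/><alarm_trigger_level value="0.5"/></variable></config>
# picks up the defaults above: average all cpu[0-9]+ data sources and raise a
# priority-3 alarm once the value has stayed above 0.5 for 60s, then stay quiet
# for 3600s before alarming again.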
class SRMonitor(ObjectMonitor): | |
"""Object that maintains state of one SR | |
Configured by writing an xml string into an other-config key, e.g. | |
    xe sr-param-set uuid=$sruuid other-config:perfmon=\
'<config><variable><name value="physical_utilisation"/><alarm_trigger_level value="0.8"/></variable></config>' | |
Notes: | |
- Multiple <variable> nodes allowed | |
- full list of child nodes is | |
* name: what to call the variable (no default) | |
* alarm_priority: the priority of the messages generated (default '3') | |
* alarm_trigger_level: level of value that triggers an alarm (no default) | |
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') | |
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') | |
* alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600') | |
* consolidation_fn: how to combine variables from rrd_updates into one value | |
          (default is 'get_percent_sr_usage' for 'physical_utilisation', & 'sum' for everything else)
      * rrd_regex matches the names of variables from (xe sr-data-source-list uuid=$sruuid) used to compute value
          (has default for "physical_utilisation")
""" | |
def __init__(self, *args): | |
self.monitortype = "SR" | |
ObjectMonitor.__init__(self, *args) | |
print_debug("Created SRMonitor with uuid %s" % self.uuid) | |
def get_default_variable_config(self, variable_name, config_tag): | |
"This allows user to not specify full set of tags for each variable in xml config" | |
if config_tag == 'consolidation_fn': | |
if variable_name == 'physical_utilisation': return 'get_percent_sr_usage' | |
else: return 'sum' | |
elif config_tag == 'rrd_regex': | |
if variable_name == 'physical_utilisation': return 'physical_utilisation|size' | |
elif variable_name == "sr_io_throughput_total_per_host": return '_$_DUMMY__' # (these are to drive Host RRDs and so are handled by the HOSTMonitor) | |
else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name | |
elif config_tag == 'alarm_trigger_period': return '60' # 1 minute | |
elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour | |
elif config_tag == 'alarm_trigger_level': | |
if variable_name == "physical_utilistaion": return '0.8' # trigger when 80% full | |
else: raise XmlConfigException, "variable %s: no default alarm_trigger_level - please specify one" % variable_name | |
elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above* | |
elif config_tag == 'alarm_priority': return '3' # Service degradation level defined in PR-1455 | |
else: raise XmlConfigException, "variable %s: no default available for tag %s" % (variable_name, config_tag) | |
class HOSTMonitor(ObjectMonitor): | |
"""Object that maintains state of one Host | |
Configured by writing an xml string into an other-config key, e.g. | |
xe host-param-set uuid=$hostuuid other-config:perfmon=\ | |
'<config><variable><name value="cpu_usage"/><alarm_trigger_level value="0.5"/></variable></config>' | |
Notes: | |
- Multiple <variable> nodes allowed | |
- full list of child nodes is | |
* name: what to call the variable (no default) | |
* alarm_priority: the priority of the messages generated (default '3') | |
* alarm_trigger_level: level of value that triggers an alarm (no default) | |
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') | |
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') | |
* alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600') | |
* consolidation_fn: how to combine variables from rrd_updates into one value | |
(default is 'average' for 'cpu_usage' & 'sum' for everything else) | |
* rrd_regex matches the names of variables from (xe host-data-source-list uuid=$hostuuid) used to compute value | |
(only has defaults for "cpu_usage", "network_usage", "memory_free_kib" and "sr_io_throughput_total_xxxxxxxx" | |
where that last one ends with the first eight characters of the SR uuid) | |
Also, as a special case for SR throughput, it is possible to configure a Host by | |
writing xml into the other-config key of an SR connected to it, e.g. | |
xe sr-param-set uuid=$sruuid other-config:perfmon=\ | |
'<config><variable><name value="sr_io_throughput_total_per_host"/><alarm_trigger_level value="0.01"/></variable></config> | |
This only works for that one specific variable-name, and rrd_regex must not be specified. | |
Configuration done on the host directly (variable-name sr_io_throughput_total_xxxxxxxx) takes priority. | |
""" | |
def __init__(self, *args): | |
self.monitortype = "Host" | |
self.secondary_variables = set() | |
self.secondary_xmlconfigs = {} # map of sr uuid to xml text | |
ObjectMonitor.__init__(self, *args) | |
print_debug("Created HOSTMonitor with uuid %s" % self.uuid) | |
def get_default_variable_config(self, variable_name, config_tag): | |
"This allows user to not specify full set of tags for each variable in xml config" | |
if config_tag == 'consolidation_fn': | |
if variable_name == "cpu_usage": return 'average' | |
else: return 'sum' | |
elif config_tag == 'rrd_regex': | |
if variable_name == "cpu_usage": return "cpu[0-9]+" | |
elif variable_name == "network_usage": return "pif_eth[0-9]+_[rt]x" | |
elif variable_name == "memory_free_kib": return variable_name | |
elif re.match("sr_io_throughput_total_[0-9a-f]{8}$", variable_name): return variable_name[3:] | |
else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name | |
elif config_tag == 'alarm_trigger_period': return '60' # 1 minute | |
elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour | |
elif config_tag == 'alarm_trigger_sense': | |
if variable_name == "memory_free_kib": return "low" | |
else: return 'high' # trigger if *above* level | |
elif config_tag == 'alarm_priority': return '3' # Service degradation level defined in PR-1455 | |
else: raise XmlConfigException, "variable %s: no default available for tag %s" % (variable_name, config_tag) | |
def get_active_variables(self): | |
r = self.variables + [v for v in self.secondary_variables if v.active] | |
print_debug("Returning active variables: %d main, %d total" % (len(self.variables), len(r))) | |
return r | |
def refresh_config(self): | |
main_changed = ObjectMonitor.refresh_config(self) | |
# Now handle any extra config from SRs. | |
# This functionality makes this file inelegant but means that it is | |
# possible to set up an alarm on each host that uses an SR by setting | |
# appropriate configuration in the SR's other-config. | |
if self.uuid not in sruuids_by_hostuuid: | |
print_debug("%s not in sruuids_by_hostuuid" % self.uuid) | |
self.secondary_variables.clear() | |
self.secondary_xmlconfigs.clear() | |
return | |
secondary_changed = False | |
old_sruuids = set(self.secondary_xmlconfigs) # create set of keys | |
current_sruuids = sruuids_by_hostuuid[self.uuid] # a set already | |
if old_sruuids != current_sruuids: | |
print_debug("Changed set of perfmon sruuids for host %s" % self.uuid) | |
secondary_changed = True | |
else: | |
for sruuid in sruuids_by_hostuuid[self.uuid]: | |
sr_xmlconfig = all_xmlconfigs[sruuid] | |
# As an optimisation, if xml unchanged then do not re-parse. | |
# Otherwise we would create Variables which would turn out to be same as existing ones so we would ignore them. | |
if sruuid in self.secondary_xmlconfigs and self.secondary_xmlconfigs[sruuid] == sr_xmlconfig: | |
print_debug("Unchanged sr_xmlconfig for sruuid %s" % sruuid) | |
else: | |
print_debug("Found new/different sr_xmlconfig for sruuid %s" % sruuid) | |
secondary_changed = True | |
break | |
if secondary_changed: | |
try: | |
self.__parse_secondary_xmlconfigs() | |
except XmlConfigException, e: | |
errmsg = "\n".join([ str(x) for x in e.args ]) | |
log_err("%s %s secondary config error: %s" % (self.monitortype, self.uuid, errmsg)) | |
except ExpatError, e: | |
errmsg = "\n".join([ str(x) for x in e.args ]) | |
log_err("%s %s secondary XML parse error: %s" % (self.monitortype, self.uuid, errmsg)) | |
if main_changed or secondary_changed: | |
# Calculate which secondary variables are active, i.e. not overridden by ones configured on the host rather than the SR. | |
main_names = set([v.name for v in self.variables]) | |
for v in self.secondary_variables: | |
v.set_active(v.name not in main_names) | |
def __parse_secondary_xmlconfigs(self): | |
variable_names = set() # Names of the Variable objects we create based on the xml nodes we find | |
self.secondary_xmlconfigs.clear() | |
for sruuid in sruuids_by_hostuuid[self.uuid]: | |
print_debug("Looking for config on SR uuid %s" % sruuid) | |
sr_xmlconfig = all_xmlconfigs[sruuid] | |
self.secondary_xmlconfigs[sruuid] = sr_xmlconfig | |
xmldoc = minidom.parseString(sr_xmlconfig) | |
variable_nodes = xmldoc.getElementsByTagName('variable') | |
found = False | |
for vn in variable_nodes: | |
try: | |
name_element = vn.getElementsByTagName('name')[0] | |
name = name_element.getAttribute('value') | |
except IndexError: | |
log_err("variable missing 'name' tag in perfmon xml config of SR %s" % sruuid) | |
continue # perhaps other nodes are valid | |
print_debug("Found variable with name %s on SR uuid %s" % (name, sruuid)) | |
if name != 'sr_io_throughput_total_per_host': | |
continue # Do nothing unless the variable is meant for the host | |
if len(vn.getElementsByTagName('rrd_regex')) > 0: | |
log_err("Configuration error: rrd_regex must not be specified in config on SR meant for each host") | |
continue # perhaps another node is valid | |
if found: | |
log_err("Configuration error: duplicate variable %s on SR %s" % (name, sruuid)) | |
# A host can only have one Variable from a given SR since we only accept one kind (one name). | |
break | |
found = True | |
name_override = 'sr_io_throughput_total_%s' % sruuid[0:8] | |
name_element.setAttribute('value', name_override) | |
provenance_element = xmldoc.createElement('configured_on') | |
provenance_element.setAttribute('class', 'SR') | |
provenance_element.setAttribute('uuid', sruuid) | |
vn.appendChild(provenance_element) | |
var = Variable(vn, self.alarm_create, self.get_default_variable_config) | |
variable_names.add(var.name) | |
append_var = True | |
vars_with_same_name = [ v for v in self.secondary_variables if v.name == var.name ] | |
for v in vars_with_same_name: | |
# this list should be 0 or 1 long! | |
# only replace variable in self.secondary_variables if its config has changed. | |
# This way we don't reset its state | |
if variable_configs_differ(var, v): | |
print_debug("Removing existing secondary variable to replace with new: %s" % v.name) | |
self.secondary_variables.remove(v) | |
else: | |
print_debug("Found existing secondary variable with same config: %s" % v.name) | |
append_var = False | |
if append_var: | |
print_debug("Adding %s to set of secondary variables for host UUID=%s" % (var.name, self.uuid)) | |
self.secondary_variables.add(var) | |
# Now that we have read all the xml items, | |
# delete any old variables that do not appear in the new variable_nodes | |
print_debug("Going to delete any secondary_variables not in %s" % variable_names) | |
variables_to_remove = [ v for v in self.secondary_variables if v.name not in variable_names ] | |
for v in variables_to_remove: | |
print_debug("Deleting %s from set of secondary variables for UUID=%s" % (v.name, self.uuid)) | |
self.secondary_variables.remove(v) | |
all_xmlconfigs = {} | |
sruuids_by_hostuuid = {} # Maps host uuid to a set of the uuids of the host's SRs that have other-config:perfmon | |
def update_all_xmlconfigs(session): | |
"""Update all_xmlconfigs, a global dictionary that maps any uuid | |
(SR, host or VM) to the xml config string in other-config:perfmon keys | |
and update sruuids_by_hostuuid which together with all_xmlconfigs allows | |
lookup of the other-config:perfmon xml of the SRs connected to a host""" | |
global all_xmlconfigs | |
global sruuids_by_hostuuid | |
all_host_recs = session.xenapi.host.get_all_records() | |
all_vm_recs = session.xenapi.VM.get_all_records() | |
all_sr_recs = session.xenapi.SR.get_all_records() | |
# build dictionary mapping uuids to other_configs | |
all_otherconfigs = {} | |
for recs in (all_host_recs, all_vm_recs, all_sr_recs): | |
all_otherconfigs.update([ | |
(recs[ref]['uuid'], recs[ref]['other_config']) | |
for ref in recs.keys() | |
]) | |
# rebuild dictionary mapping uuids to xmlconfigs | |
all_xmlconfigs.clear() | |
all_xmlconfigs.update([ | |
(uuid, other_config['perfmon']) | |
for (uuid, other_config) in all_otherconfigs.items() | |
if other_config.has_key('perfmon') | |
]) | |
# Rebuild another map | |
sruuids_by_hostuuid.clear() | |
for (sr, rec) in all_sr_recs.items(): | |
if rec['other_config'].has_key('perfmon'): | |
sruuid = rec['uuid'] | |
# If we hadn't done SR.get_all_records we would now do SR.get_PBDs. | |
host_refs = [session.xenapi.PBD.get_host(pbd) for pbd in rec['PBDs']] | |
host_uuids = [all_host_recs[ref]['uuid'] for ref in host_refs] | |
for hu in host_uuids: | |
if hu in sruuids_by_hostuuid: | |
sruuids_by_hostuuid[hu].add(sruuid) | |
else: | |
sruuids_by_hostuuid[hu] = set([sruuid]) | |
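# After a call to update_all_xmlconfigs() the two globals look like this
# (uuids and xml abbreviated/hypothetical):
#   all_xmlconfigs      = {uuid: '<config>...</config>', ...}
#   sruuids_by_hostuuid = {host_uuid: set([sr_uuid_1, sr_uuid_2]), ...}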
# 5 minute default interval | |
interval = 300 | |
interval_percent_dither = 5 | |
rrd_step = 60 | |
debug = False | |
# rate to call update_all_xmlconfigs() | |
config_update_period = 1800 | |
cmdsockname = "\0perfmon" # an af_unix socket name (the "\0" stops socket.bind() creating a fs node) | |
cmdmaxlen = 256 | |
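# A sketch of how another local process can poke the daemon over this abstract
# AF_UNIX socket, e.g. to force a config re-read on the next loop:
#   import socket
#   s = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
#   s.sendto("refresh", "\0perfmon")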
def main(): | |
global interval | |
global interval_percent_dither | |
global rrd_step | |
global debug | |
global config_update_period | |
maxruns=None | |
try: | |
argv = sys.argv[1:] | |
opts, args = getopt.getopt(argv, "i:n:ds:c:D:", | |
["interval=", "numloops=","debug","rrdstep=","config_update_period=","interval_percent_dither="]) | |
except getopt.GetoptError: | |
raise UsageException | |
configfname = None | |
for opt, arg in opts: | |
if opt == '-i' or opt == '--interval': | |
interval = int(arg) | |
elif opt == '-n' or opt == '--numloops': | |
maxruns = int(arg) | |
elif opt == '-d' or opt == '--debug': | |
debug = True | |
elif opt == '-s' or opt == '--rrdstep': | |
rrd_step = int(arg) | |
if rrd_step != 5 and rrd_step != 60: | |
raise UsageException | |
elif opt == '-c' or opt == '--config_update_period': | |
config_update_period = int(arg) | |
elif opt == '-D' or opt == '--interval_percent_dither': | |
interval_percent_dither = int(arg) | |
else: | |
raise UsageException | |
# open the cmd socket (over which we listen for commands such as "refresh") | |
cmdsock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) | |
cmdsock.bind(cmdsockname) | |
# The dither on each loop (prevents stampede on master) | |
rand = random.Random().uniform | |
dither = (interval * interval_percent_dither)/100.0 | |
# Create a XAPI session on first run | |
restart_session = True | |
# Create a client for getting the rrd_updates over HTTP | |
rrd_updates = RRDUpdates() | |
# Work out when next to update all the xmlconfigs for all the | |
# hosts and all the VMs. This causes a lot of data to be retrieved | |
# from the master, so we only do it once every config_update_period | |
# and we cache the results | |
next_config_update = time.time() | |
# monitors for vms running on this host. | |
# This dictionary uses uuids to lookup each monitor object | |
vm_mon_lookup = {} | |
# monitors for srs plugged on this host | |
# This dictionary uses uuids to lookup each monitor object | |
sr_mon_lookup = {} | |
# The monitor for the host | |
host_mon = None | |
runs = 0 | |
while True: | |
print_debug("Run: %d" % runs) | |
# Get new updates - and catch any http errors | |
try: | |
# if session has failed on last run we need to restart it | |
if restart_session: | |
session = XapiSession() | |
restart_session = False | |
rrd_updates.refresh(session) | |
# Should we update all_xmlconfigs | |
if time.time() >= next_config_update: | |
print_debug("Updating all_xmlconfigs") | |
# yes - update all the xml configs: this generates a few LARGE xapi messages from the master | |
update_all_xmlconfigs(session) | |
# Set time when to do this next | |
next_config_update = time.time() + config_update_period | |
# List of VMs present in rrd_updates | |
vm_uuid_list = rrd_updates.get_uuid_list_by_objtype('vm') | |
# Remove any monitors for VMs no longer listed in rrd_updates page | |
for uuid in vm_mon_lookup.keys(): | |
if uuid not in vm_uuid_list: | |
vm_mon_lookup.pop(uuid) | |
# Create monitors for VMs that have just appeared in rrd_updates page | |
for uuid in vm_uuid_list: | |
if uuid not in vm_mon_lookup.keys(): | |
vm_mon_lookup[uuid] = VMMonitor(uuid) | |
else: | |
# check if the config has changed, e.g. by XenCenter | |
vm_mon_lookup[uuid].refresh_config() | |
# Remove monitor for the host if it's no longer listed in rrd_updates page | |
# Create monitor for the host if it has just appeared in rrd_updates page | |
try: | |
host_uuid = rrd_updates.get_uuid_list_by_objtype('host')[0] # should only ever be one of these | |
except: | |
# list may be empty! | |
host_uuid = None | |
if not host_uuid: | |
host_mon = None | |
elif not host_mon: | |
host_mon = HOSTMonitor(host_uuid) | |
elif host_mon.uuid != host_uuid: | |
raise PerfMonException, "host uuid in rrd_updates changed (old: %s, new %s)" % \ | |
(host_mon.uuid, host_uuid) | |
else: | |
# check if the config has changed, e.g. by XenCenter | |
host_mon.refresh_config() | |
# List of SRs present in rrd_updates | |
sr_uuid_list = rrd_updates.get_uuid_list_by_objtype('sr') | |
print_debug("sr_uuid_list = %s" % sr_uuid_list) | |
# Remove monitors for SRs no longer listed in the rrd_updates page | |
for uuid in sr_mon_lookup.keys(): | |
if uuid not in sr_uuid_list: | |
sr_mon_lookup.pop(uuid) | |
# Create monitors for SRs that have just appeared in rrd_updates page | |
for uuid in sr_uuid_list: | |
if uuid not in sr_mon_lookup.keys(): | |
sr_mon_lookup[uuid] = SRMonitor(uuid) | |
else: | |
sr_mon_lookup[uuid].refresh_config() | |
# Go through each vm_mon and update it using the rrd_udpates - this may generate alarms | |
for vm_mon in vm_mon_lookup.values(): | |
vm_mon.process_rrd_updates(rrd_updates, session) | |
# Ditto for the host_mon | |
if host_mon: | |
host_mon.process_rrd_updates(rrd_updates, session) | |
# And for the sr_mons | |
for sr_mon in sr_mon_lookup.values(): | |
sr_mon.process_rrd_updates(rrd_updates, session) | |
except socket.error, e: | |
if e.args[0] == 111: | |
# "Connection refused" - this happens when we try to restart session and *that* fails | |
time.sleep(2) | |
pass | |
log_err("caught socket.error: (%s) - restarting XAPI session" % " ".join([str(x) for x in e.args])) | |
restart_session = True | |
except IOError, e: | |
if e.args[0] == 'http error' and e.args[1] in (401, 500): | |
# Error getting rrd_updates: 401=Unauthorised, 500=Internal - start new session | |
pass | |
elif e.args[0] == 'socket error': | |
# This happens if we send messages or read other-config:perfmon after xapi is restarted | |
pass | |
else: | |
# Don't know why we got this error - crash, die and look at logs later | |
raise | |
log_err("caught IOError: (%s) - restarting XAPI session" % " ".join([str(x) for x in e.args])) | |
restart_session = True | |
runs += 1 | |
if maxruns is not None and runs >= maxruns: | |
break | |
# Force collection of cyclically referenced objects cos we don't | |
# trust GC to do it on its own | |
gc.collect() | |
# Sleep for interval + dither, exiting early if we recv a cmd | |
timeout = rand(interval, interval + dither) | |
cmdsock.settimeout(timeout) | |
try: | |
cmd = cmdsock.recv(cmdmaxlen) | |
except socket.timeout: | |
pass | |
else: | |
if cmd == "refresh": | |
# This forces a re-read of all the configs on the next loop | |
next_config_update = time.time() | |
elif cmd == "debug_mem": | |
debug_mem() | |
else: | |
log_err("received unhandled command %s" % cmd) | |
# continue to next run | |
return 0 | |
def sigterm_handler(sig, stack_frame): | |
log_err("Caught signal %d - exiting" % sig) | |
sys.exit(1) | |
pidfile = "/var/run/perfmon.pid" | |
if __name__ == "__main__": | |
# setup signal handler to print out notice when killed | |
signal.signal(signal.SIGTERM, sigterm_handler) | |
if '--daemon' in sys.argv[1:]: | |
sys.argv.remove('--daemon') | |
if os.fork() != 0: | |
sys.exit(0) | |
os.setsid() | |
sys.stdout=open("/dev/null", 'w') | |
sys.stdin=open("/dev/null", 'r') | |
sys.stderr=sys.stdout | |
# Exit if perfmon already running | |
if os.path.exists(pidfile): | |
pid = open(pidfile).read() | |
if os.path.exists("/proc/%s" % pid): | |
log_err("perfmon already running - exiting") | |
sys.exit(3) | |
try: | |
# Write out pidfile | |
fd = open(pidfile,"w") | |
fd.write("%d" % os.getpid()) | |
fd.close() | |
# run the main loop | |
rc = main() | |
except UsageException, e: | |
# Print the usage | |
log_err("usage: %s [-i <interval> -n <loops> -d -s <rrd_step> -c <config_update_period> -D <interval_percent_dither>] \\\n" \ | |
"\t[--interval=<interval> --numloops=<loops> --debug \\\n" \ | |
"\t --rrdstep=<rrd_step> --daemon]\n" \ | |
"\t --config_update_period=<config_update_period>\n" \ | |
"\t --interval_percent_dither=<interval_percent_dither>\n" \ | |
" interval:\tseconds between reads of http://localhost/rrd_updates?...\n" \ | |
" loops:\tnumber of times to run before exiting\n" \ | |
" rrd_step:\tseconds between samples provided by rrd_updates. Valid values are 5 or 60\n" \ | |
" config_update_period:\tseconds between getting updates of all VM/host records from master\n" \ | |
" interval_percent_dither:\tmax percent dither in each loop - prevents stampede on master\n" \ | |
% (sys.argv[0])) | |
rc = 1 | |
except SystemExit: | |
# we caught a signal which we have already logged | |
pass | |
except Exception, e: | |
rc = 2 | |
log_err("FATAL ERROR: perfmon will exit") | |
log_err("Exception is of class %s" % e.__class__) | |
ex = sys.exc_info() | |
err = traceback.format_exception(*ex) | |
# Python built-in Exception has args, | |
# but XenAPI.Failure has details instead. Sigh. | |
try: | |
errmsg = "\n".join([ str(x) for x in e.args ]) | |
# print the exception args nicely | |
log_err(errmsg) | |
except Exception, ignored: | |
try: | |
errmsg = "\n".join([ str(x) for x in e.details ]) | |
# print the exception args nicely | |
log_err(errmsg) | |
except Exception, ignored: | |
pass | |
# now log the traceback to syslog | |
for exline in err: | |
log_err(exline) | |
# remove pidfile and exit | |
os.unlink(pidfile) | |
sys.exit(rc) |