
Replace deprecated lastUpdate field in whisper data format with a customizable aggregationMethod
1 parent 4a5f4ea commit 75340fb56da50b1e6cf061a24ef66387e9611b35 @tmm1 committed Jul 25, 2011
@@ -3,8 +3,10 @@
#
# [name]
# pattern = regex
+# aggregate = (average|sum|last|min|max)
# retentions = timePerPoint:timeToStore, timePerPoint:timeToStore, ...
[default_1min_for_1day]
pattern = .*
+aggregate = average
retentions = 60s:1d
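
As an illustration (this schema name, pattern, and retention are invented, not part of the commit), a section for counter-style metrics could roll up by summing instead of averaging:

[counters]
pattern = \.count$
aggregate = sum
retentions = 10s:6h, 1m:30d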
@@ -79,31 +79,34 @@ def matches(self, metric):
class DefaultSchema(Schema):
- def __init__(self, name, archives):
+ def __init__(self, name, archives, aggregate):
self.name = name
self.archives = archives
+ self.aggregate = aggregate
def test(self, metric):
return True
class PatternSchema(Schema):
- def __init__(self, name, pattern, archives):
+ def __init__(self, name, pattern, archives, aggregate):
self.name = name
self.pattern = pattern
self.regex = re.compile(pattern)
self.archives = archives
+ self.aggregate = aggregate
def test(self, metric):
return self.regex.search(metric)
class ListSchema(Schema):
- def __init__(self, name, listName, archives):
+ def __init__(self, name, listName, archives, aggregate):
self.name = name
self.listName = listName
self.archives = archives
self.path = join(settings.WHITELISTS_DIR, listName)
+ self.aggregate = aggregate
if exists(self.path):
self.mtime = os.stat(self.path).st_mtime
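
As a rough sketch of how the new parameter rides along with a schema (the name, pattern, and metric below are invented; archives would come from Archive.fromString as in the loader that follows):

schema = PatternSchema('counters', r'\.count$', archives, 'sum')
if schema.test('app.requests.count'):
    print schema.aggregate  # -> 'sum'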
@@ -156,18 +159,19 @@ def loadStorageSchemas():
matchAll = options.get('match-all')
pattern = options.get('pattern')
listName = options.get('list')
+ aggregate = options.get('aggregate')
retentions = options['retentions'].split(',')
archives = [ Archive.fromString(s) for s in retentions ]
if matchAll:
- mySchema = DefaultSchema(section, archives)
+ mySchema = DefaultSchema(section, archives, aggregate)
elif pattern:
- mySchema = PatternSchema(section, pattern, archives)
+ mySchema = PatternSchema(section, pattern, archives, aggregate)
elif listName:
- mySchema = ListSchema(section, listName, archives)
+ mySchema = ListSchema(section, listName, archives, aggregate)
else:
raise ValueError('schema "%s" has no pattern or list parameter configured' % section)
@@ -179,4 +183,4 @@ def loadStorageSchemas():
defaultArchive = Archive(60, 60 * 24 * 7) #default retention for unclassified data (7 days of minutely data)
-defaultSchema = DefaultSchema('default', [defaultArchive])
+defaultSchema = DefaultSchema('default', [defaultArchive], 'average')
@@ -94,11 +94,13 @@ def writeCachedDataPoints():
if not dbFileExists:
archiveConfig = None
+ aggregationMethod = None
for schema in schemas:
if schema.matches(metric):
log.creates('new metric %s matched schema %s' % (metric, schema.name))
archiveConfig = [archive.getTuple() for archive in schema.archives]
+ aggregationMethod = schema.aggregate
break
if not archiveConfig:
@@ -108,7 +110,7 @@ def writeCachedDataPoints():
os.system("mkdir -p -m 755 '%s'" % dbDir)
log.creates("creating database file %s" % dbFilePath)
- whisper.create(dbFilePath, archiveConfig)
+ whisper.create(dbFilePath, archiveConfig, aggregationMethod=aggregationMethod)
os.chmod(dbFilePath, 0755)
increment('creates')
@@ -45,6 +45,7 @@ def parseRetentionDef(retentionDef):
option_parser = OptionParser(usage='''%prog path secondsPerPoint:pointsToStore [secondsPerPoint:pointsToStore]* ''')
option_parser.add_option('--xFilesFactor', default=0.5, type='float')
+option_parser.add_option('--aggregationMethod', default='average', type='string', help="Method to use when aggregating values into lower precisions (average, sum, last, min, max)")
option_parser.add_option('--overwrite', default=False, action='store_true')
(options, args) = option_parser.parse_args()
@@ -60,7 +61,7 @@ def parseRetentionDef(retentionDef):
print 'Overwriting existing file: %s' % path
os.unlink(path)
-whisper.create(path, archives, xFilesFactor=options.xFilesFactor)
+whisper.create(path, archives, xFilesFactor=options.xFilesFactor, aggregationMethod=options.aggregationMethod)
size = os.stat(path).st_size
print 'Created: %s (%d bytes)' % (path,size)
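
With the new flag, creating a sum-aggregated database from the shell might look like this (the path and retentions are illustrative):

whisper-create.py counters/requests.wsp 10s:6h 1m:30d --aggregationMethod=sum

Omitting the flag keeps the previous behavior, since it defaults to 'average'.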
@@ -56,6 +56,7 @@ def parseRetentionDef(retentionDef):
12h:2y 12 hours per datapoint, 2 years of retention
''')
option_parser.add_option('--xFilesFactor', default=None, type='float', help="Change the xFilesFactor")
+option_parser.add_option('--aggregationMethod', default=None, type='string', help="Change the aggregation method used (average, sum, last, min, max)")
option_parser.add_option('--force', default=False, action='store_true', help="Perform a destructive change")
option_parser.add_option('--newfile', default=None, action='store', help="Create a new database file without removing the existing one")
option_parser.add_option('--nobackup', action='store_true', help='Delete the .bak file after successful execution')
@@ -78,6 +79,11 @@ def parseRetentionDef(retentionDef):
else:
xff = options.xFilesFactor
+if options.aggregationMethod is None:
+ aggregationMethod = info['aggregationMethod']
+else:
+ aggregationMethod = options.aggregationMethod
+
print 'Retrieving all data from the archives'
for archive in old_archives:
fromTime = now - archive['retention'] + archive['secondsPerPoint']
@@ -95,7 +101,7 @@ def parseRetentionDef(retentionDef):
newfile = options.newfile
print 'Creating new whisper database: %s' % newfile
-whisper.create(newfile, new_archives, xFilesFactor=xff)
+whisper.create(newfile, new_archives, xFilesFactor=xff, aggregationMethod=aggregationMethod)
size = os.stat(newfile).st_size
print 'Created: %s (%d bytes)' % (newfile,size)
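
A hypothetical invocation switching an existing file over to max rollups (path and retention are illustrative):

whisper-resize.py load/cpu.wsp 60s:30d --aggregationMethod=max

When the flag is omitted, the resize keeps the file's existing method, since aggregationMethod falls back to info['aggregationMethod'] as shown above.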
@@ -19,13 +19,11 @@
#
# File = Header,Data
# Header = Metadata,ArchiveInfo+
-# Metadata = lastUpdate,maxRetention,xFilesFactor,archiveCount
+# Metadata = aggregationType,maxRetention,xFilesFactor,archiveCount
# ArchiveInfo = Offset,SecondsPerPoint,Points
# Data = Archive+
# Archive = Point+
# Point = timestamp,value
-#
-# NOTE: the lastUpdate field is deprecated, do not use it!
import os, struct, time
try:
@@ -43,8 +41,6 @@
longSize = struct.calcsize(longFormat)
floatFormat = "!f"
floatSize = struct.calcsize(floatFormat)
-timestampFormat = "!L"
-timestampSize = struct.calcsize(timestampFormat)
valueFormat = "!d"
valueSize = struct.calcsize(valueFormat)
pointFormat = "!Ld"
@@ -54,6 +50,15 @@
archiveInfoFormat = "!3L"
archiveInfoSize = struct.calcsize(archiveInfoFormat)
+aggregationTypeToMethod = dict({
+ 1: 'average',
+ 2: 'sum',
+ 3: 'last',
+ 4: 'max',
+ 5: 'min'
+})
+aggregationMethodToType = dict([[v,k] for k,v in aggregationTypeToMethod.items()])
+
debug = startBlock = endBlock = lambda *a,**k: None
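
The second dict is built as the inverse of the first, so the two round-trip; a quick illustrative sanity check:

assert aggregationMethodToType['sum'] == 2
assert aggregationTypeToMethod[aggregationMethodToType['min']] == 'min'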
@@ -65,6 +70,10 @@ class InvalidConfiguration(WhisperException):
"""Invalid configuration."""
+class InvalidAggregationMethod(WhisperException):
+ """Invalid aggregation method."""
+
+
class InvalidTimeInterval(WhisperException):
"""Invalid time interval."""
@@ -122,7 +131,7 @@ def __readHeader(fh):
packedMetadata = fh.read(metadataSize)
try:
- (lastUpdate,maxRetention,xff,archiveCount) = struct.unpack(metadataFormat,packedMetadata)
+ (aggregationType,maxRetention,xff,archiveCount) = struct.unpack(metadataFormat,packedMetadata)
except:
raise CorruptWhisperFile("Unable to read header", fh.name)
@@ -146,7 +155,7 @@ def __readHeader(fh):
fh.seek(originalOffset)
info = {
- #'lastUpdate' : lastUpdate, # Deprecated
+ 'aggregationMethod' : aggregationTypeToMethod.get(aggregationType, 'average'),
'maxRetention' : maxRetention,
'xFilesFactor' : xff,
'archives' : archives,
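
Because __readHeader now surfaces the method in the info dict, callers can inspect it via whisper's info() helper (info() itself is outside this diff; the path is illustrative):

import whisper
print whisper.info('counters/requests.wsp')['aggregationMethod']  # e.g. 'sum'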
@@ -159,20 +168,15 @@ def __readHeader(fh):
def __changeLastUpdate(fh):
return #XXX Make this a NOP, use os.stat(filename).st_mtime instead
- originalOffset = fh.tell()
- fh.seek(0) #Based on assumption that first field is lastUpdate
- now = int( time.time() )
- packedTime = struct.pack(timestampFormat,now)
- fh.write(packedTime)
- fh.seek(originalOffset)
-def create(path,archiveList,xFilesFactor=0.5):
- """create(path,archiveList,xFilesFactor=0.5)
+def create(path,archiveList,xFilesFactor=0.5,aggregationMethod='average'):
+ """create(path,archiveList,xFilesFactor=0.5,aggregationMethod='average')
path is a string
archiveList is a list of archives, each of which is of the form (secondsPerPoint,numberOfPoints)
xFilesFactor specifies the fraction of data points in a propagation interval that must have known values for a propagation to occur
+aggregationMethod specifies the method to use when propagating data (average, sum, last, min, max)
"""
#Validate archive configurations...
if not archiveList:
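
A minimal call against the new signature (file name and retention invented for illustration):

import whisper
# one datapoint per minute, kept for a day, rolled up by summing
whisper.create('requests.wsp', [(60, 1440)], xFilesFactor=0.5, aggregationMethod='sum')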
@@ -210,12 +214,12 @@ def create(path,archiveList,xFilesFactor=0.5):
if LOCK:
fcntl.flock( fh.fileno(), fcntl.LOCK_EX )
- lastUpdate = struct.pack( timestampFormat, int(time.time()) )
+ aggregationType = struct.pack( longFormat, aggregationMethodToType[aggregationMethod] )
oldest = sorted([secondsPerPoint * points for secondsPerPoint,points in archiveList])[-1]
maxRetention = struct.pack( longFormat, oldest )
xFilesFactor = struct.pack( floatFormat, float(xFilesFactor) )
archiveCount = struct.pack(longFormat, len(archiveList))
- packedMetadata = lastUpdate + maxRetention + xFilesFactor + archiveCount
+ packedMetadata = aggregationType + maxRetention + xFilesFactor + archiveCount
fh.write(packedMetadata)
headerSize = metadataSize + (archiveInfoSize * len(archiveList))
archiveOffsetPointer = headerSize
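
On disk the metadata block is now aggregationType, maxRetention, xFilesFactor, archiveCount. A sketch of decoding it by hand, assuming whisper's metadataFormat of "!2LfL" (the format string itself does not appear in this diff):

import struct

metadataFormat = "!2LfL"  # assumed: aggregationType, maxRetention, xFilesFactor, archiveCount

fh = open('requests.wsp', 'rb')
packed = fh.read(struct.calcsize(metadataFormat))
fh.close()
(aggregationType, maxRetention, xff, archiveCount) = struct.unpack(metadataFormat, packed)
print aggregationTypeToMethod.get(aggregationType, 'average')  # e.g. 'sum'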
@@ -234,8 +238,22 @@ def create(path,archiveList,xFilesFactor=0.5):
fh.close()
+def __aggregate(aggregationMethod, knownValues):
+ if aggregationMethod == 'average':
+ return float(sum(knownValues)) / float(len(knownValues))
+ elif aggregationMethod == 'sum':
+ return float(sum(knownValues))
+ elif aggregationMethod == 'last':
+ return knownValues[len(knownValues)-1]
+ elif aggregationMethod == 'max':
+ return max(knownValues)
+ elif aggregationMethod == 'min':
+ return min(knownValues)
+ else:
+ raise InvalidAggregationMethod("Unrecognized aggregation method")
+
-def __propagate(fh,timestamp,xff,higher,lower):
+def __propagate(fh,timestamp,aggregationMethod,xff,higher,lower):
lowerIntervalStart = timestamp - (timestamp % lower['secondsPerPoint'])
lowerIntervalEnd = lowerIntervalStart + lower['secondsPerPoint']
@@ -290,7 +308,7 @@ def __propagate(fh,timestamp,xff,higher,lower):
knownPercent = float(len(knownValues)) / float(len(neighborValues))
if knownPercent >= xff: #we have enough data to propagate a value!
- aggregateValue = float(sum(knownValues)) / float(len(knownValues)) #TODO another CF besides average?
+ aggregateValue = __aggregate(aggregationMethod, knownValues)
myPackedPoint = struct.pack(pointFormat,lowerIntervalStart,aggregateValue)
fh.seek(lower['offset'])
packedPoint = fh.read(pointSize)
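
To make the dispatch concrete, this is how the helper behaves on a small window of values (inputs invented for illustration):

__aggregate('average', [1.0, 2.0, 6.0])  # -> 3.0
__aggregate('sum',     [1.0, 2.0, 6.0])  # -> 9.0
__aggregate('last',    [1.0, 2.0, 6.0])  # -> 6.0
__aggregate('min',     [1.0, 2.0, 6.0])  # -> 1.0

Whether any aggregation happens at all is still gated by xFilesFactor: with xff = 0.5 and a 60s-to-300s rollup, at least 3 of the 5 higher-precision points must be known before a lower-precision point is written.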
@@ -367,7 +385,7 @@ def file_update(fh, value, timestamp):
#Now we propagate the update to lower-precision archives
higher = archive
for lower in lowerArchives:
- if not __propagate(fh, myInterval, header['xFilesFactor'], higher, lower):
+ if not __propagate(fh, myInterval, header['aggregationMethod'], header['xFilesFactor'], higher, lower):
break
higher = lower
@@ -493,7 +511,7 @@ def __archive_update_many(fh,header,archive,points):
uniqueLowerIntervals = set(lowerIntervals)
propagateFurther = False
for interval in uniqueLowerIntervals:
- if __propagate(fh,interval,header['xFilesFactor'],higher,lower):
+ if __propagate(fh,interval,header['aggregationMethod'],header['xFilesFactor'],higher,lower):
propagateFurther = True
if not propagateFurther:
