verify and polish a couple of optimizations and reject another.
include code used to evaluate optimizations.
garyposter committed Jun 29, 2007
1 parent 1a6af35 commit 2da3f83
Showing 3 changed files with 281 additions and 44 deletions.
81 changes: 37 additions & 44 deletions src/zc/relationship/index.py
@@ -129,7 +129,8 @@ def __init__(self, attrs, defaultTransitiveQueriesFactory=None,
res['multiple'] = data.get('multiple', False)
if (res['dump'] is None) ^ (res['load'] is None):
raise ValueError(
"either both of 'dump' and 'load' must be None, or neither")
"either both of 'dump' and 'load' must be None, or "
"neither")
# when both load and dump are None, this is a small
# optimization that can be a large optimization if the returned
# value is one of the main four options of the selected btree
@@ -154,6 +155,7 @@ def __init__(self, attrs, defaultTransitiveQueriesFactory=None,
raise ValueError('no name specified')
if res['name'] in _attrs or val in seen:
raise ValueError('Duplicate in attrs', res['name'], val)
seen.add(val)
if res['TreeSet'].__name__[0] == 'I':
Mapping = BTrees.family32.IO.BTree
elif res['TreeSet'].__name__[0] == 'L':
@@ -163,8 +165,8 @@ def __init__(self, attrs, defaultTransitiveQueriesFactory=None,
Mapping = family.OO.BTree
self._name_TO_mapping[res['name']] = Mapping()
# these are objtoken to (relcount, relset)
seen.add(val)
_attrs[res['name']] = res


def _getValuesAndTokens(self, rel, data):
values = None
@@ -255,19 +257,25 @@ def index_doc(self, relToken, rel):
# over essentially generating a new TreeSet and
# updating it with *all* values. On the other
# hand, if there are a lot of removals, it's
# probably quicker just to make a new one." We
# pick >25 removals, mostly arbitrarily, as our
# "cut bait" cue. We don't just do a len of
# removed because lens of btrees are potentially
# expensive.
for ix, t in enumerate(removed):
if ix >= 25: # arbitrary cut-off
newTokens = data['TreeSet'](newTokens)
break
oldTokens.remove(t)
# probably quicker just to make a new one." See
# timeit/set_creation_vs_removal.py for details.
# A len is pretty cheap--all the buckets should
# already be in memory.
len_removed = len(removed)
if len_removed < 5:
recycle = True
else:
len_old = len(oldTokens)
ratio = float(len_removed)/len_old
recycle = (ratio <= 0.1 or len_old > 500
and ratio < 0.2)
if recycle:
for t in removed:
oldTokens.remove(t)
oldTokens.update(added)
newTokens = oldTokens
else:
newTokens = data['TreeSet'](newTokens)
else:
removed = oldTokens
added = newTokens
@@ -425,7 +433,7 @@ def findValueTokenSet(self, reltoken, name):

def _relData(self, searchTerms):
data = []
if getattr(searchTerms, 'items', None) is not None:
if getattr(searchTerms, 'items', None) is not None: # quack
searchTerms = searchTerms.items()
searchTerms = tuple(searchTerms)
if not searchTerms:
@@ -443,40 +451,25 @@ def _relData(self, searchTerms):
relData = self._name_TO_mapping[nm].get(token)
if relData is None or relData[0].value == 0:
return None
data.append((relData[0].value, nm, token, relData[1]))
data.append((relData[0].value, relData[1])) # length, set
# we don't want to sort on the set values!! just the lengths.
data.sort(key=lambda i: i[0])
if rel is not None:
for ct, nm, tk, st in data:
if rel not in st:
for l, s in data:
if rel not in s:
return None
return self._relTools['TreeSet']((rel,))
data.sort()
while len(data) > 1:
first_count, _ignore1, _ignore2, first_set = data[0]
second_count, second_name, second_token, second_set = data[1]
if first_count <= second_count/10:
# we'll just test this by hand: intended to be an optimization.
# should be tested and the factor adjusted (or this
# code path removed, relying only on the standard BTree
# intersection code). The theory behind this is that the
# standard BTree intersection code just iterates over the sets
# to find matches. Therefore, if you have one set of
# range(100000) and another of (99999,) then it will be fairly
# inefficient. walking a BTree to find membership is very
# cheap, so if the first_count is significantly smaller than
# the second_count we should just check whether each
# member of the smaller set is in the larger, one at a time.
intersection = self._relTools['TreeSet'](
i for i in first_set if i in second_set)
else:
intersection = self._relTools['intersection'](
first_set, second_set)
if not intersection:
return None
data = data[2:]
# we can't know how many are in a new set without a possibly
# expensive len; however, the len should be <= first_count
data.insert(0, (first_count, None, None, intersection))
return data[0][3]
# we had an untested optimization attempt here before. It has
# been tested now (see timeit/manual_intersection.py), found
# useless, and removed.
#
# we know we have at least one result now. intersect all. Work
# from smallest to largest, until we're done or we don't have any
# more results.
res = data.pop(0)[1]
while res and data:
res = self._relTools['intersection'](res, data.pop(0)[1])
return res

def _parse(self, query, maxDepth, filter, targetQuery, targetFilter,
transitiveQueriesFactory):
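The net effect of the index.py changes above: instead of the old arbitrary 25-removal cut-off, index_doc now decides between mutating the existing TreeSet and rebuilding it using measured thresholds. A minimal standalone sketch of that decision (the helper name should_recycle is illustrative only, not part of the index):

def should_recycle(old_tokens, removed):
    # Decide whether to mutate old_tokens in place ("recycle") or to build
    # a brand-new TreeSet from the new values.  Thresholds follow
    # timeit/set_creation_vs_removal.py below: removal is cheaper when the
    # removals are a small fraction of the set (about 10%, or 20% for sets
    # larger than 500 tokens).
    len_removed = len(removed)
    if len_removed < 5:
        return True
    len_old = len(old_tokens)
    ratio = float(len_removed) / len_old
    return ratio <= 0.1 or (len_old > 500 and ratio < 0.2)

When this returns True, index_doc removes the removed tokens, updates with the added ones, and reuses the old set; otherwise it builds a fresh TreeSet from the new tokens.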
96 changes: 96 additions & 0 deletions src/zc/relationship/timeit/manual_intersection.py
@@ -0,0 +1,96 @@
"""
This one is not very surprising, in retrospect.
Example result:
[('control_result', 0.000244140625),
('manual one ten', 0.028602838516235352),
('intersect one ten', 0.031899929046630859),
('intersect ten one', 0.031992912292480469),
('manual one hundred', 0.028395891189575195),
('intersect one hundred', 0.031839847564697266),
('intersect hundred one', 0.031901836395263672),
('manual ten hundred', 0.061377763748168945),
('intersect ten hundred', 0.038860797882080078),
('intersect hundred ten', 0.038875818252563477),
('manual one thousand', 0.028687715530395508),
('intersect one thousand', 0.03191065788269043),
('intersect thousand one', 0.03180384635925293),
('manual ten thousand', 0.061729907989501953),
('intersect ten thousand', 0.039285898208618164),
('intersect thousand ten', 0.039031744003295898),
('manual hundred thousand', 0.39881396293640137),
('intersect hundred thousand', 0.12407588958740234),
('intersect thousand hundred', 0.12287592887878418),
('manual one tenthousand', 0.028566837310791016),
('intersect one tenthousand', 0.031667947769165039),
('intersect tenthousand one', 0.031734943389892578),
('manual ten tenthousand', 0.062446832656860352),
('intersect ten tenthousand', 0.039116859436035156),
('intersect tenthousand ten', 0.039546728134155273),
('manual hundred tenthousand', 0.40611386299133301),
('intersect hundred tenthousand', 0.1228797435760498),
('intersect tenthousand hundred', 0.12310886383056641),
('manual thousand tenthousand', 3.989987850189209),
('intersect thousand tenthousand', 0.91191983222961426),
('intersect tenthousand thousand', 0.91330981254577637)]
Conclusion: just use intersection.  Manual intersection with the one-element
set on the left is always slightly better than the C intersection, but it is
not worth the extra complexity.
While argument order is irrelevant for an individual C intersection, it is
very relevant for a manual intersection, and for a chain of intersections
generally: unsurprisingly, you want to start with the smaller sets, because
as long as one of the two sets involved is small, you go pretty fast.
Therefore, given a multi-intersection of [small, medium, large] it makes the
most sense to order them that way:
intersect(intersect(small, medium), large)
"""

import timeit
import pprint

setup = '''
import BTrees
one = BTrees.family32.IO.TreeSet((0,))
ten = BTrees.family32.IO.TreeSet(range(10))
hundred = BTrees.family32.IO.TreeSet(range(100))
thousand = BTrees.family32.IO.TreeSet(range(1000))
tenthousand = BTrees.family32.IO.TreeSet(range(10000))
'''

manual_template = 'BTrees.family32.IO.TreeSet(i for i in %s if i in %s)'

intersect_template = 'BTrees.family32.IO.intersection(%s, %s)'


control = timeit.Timer('pass', setup)

options = ('one', 'ten', 'hundred', 'thousand', 'tenthousand')

runs = 10000

control_result = min(control.repeat(3, runs))

d = [('control_result', control_result)]

for i, big in enumerate(options):
for little in options[:i]:
d.append((
'manual %s %s' % (little, big),
min(timeit.Timer(
manual_template % (little, big),
setup).repeat(3, runs)) - control_result))
d.append((
'intersect %s %s' % (little, big),
min(timeit.Timer(
intersect_template % (little, big),
setup).repeat(3, runs)) - control_result))
d.append((
'intersect %s %s' % (big, little),
min(timeit.Timer(
intersect_template % (big, little),
setup).repeat(3, runs)) - control_result))

pprint.pprint(d)
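The conclusion above is what the reworked _relData in index.py relies on: drop the hand-rolled membership loop, let the C intersection do the work, but intersect the smaller sets first and stop as soon as the running result is empty. A minimal sketch of that pattern (intersect_all is an illustrative helper, not index API):

import BTrees

def intersect_all(sets, intersection=BTrees.family32.IO.intersection):
    # Sort by length so the cheap, small intersections happen first, then
    # fold the C intersection over the rest, bailing out once empty.
    ordered = sorted(sets, key=len)
    if not ordered:
        return None
    res = ordered.pop(0)
    while res and ordered:
        res = intersection(res, ordered.pop(0))
    return res

small = BTrees.family32.IO.TreeSet((3,))
medium = BTrees.family32.IO.TreeSet(range(0, 1000, 3))
large = BTrees.family32.IO.TreeSet(range(100000))
result = intersect_all([small, medium, large])  # contains just 3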
148 changes: 148 additions & 0 deletions src/zc/relationship/timeit/set_creation_vs_removal.py
@@ -0,0 +1,148 @@
"""
Example output:
----8<----8<----8<----
control: 0.0847041606903
(the following results have the control subtracted)
create_empty: 0.00116777420044
create_one: 0.00260376930237
create_ten: 0.00407481193542
create_hundred: 0.0167508125305
create_thousand: 0.149729013443
create_tenthousand: 1.55575275421
remove_one: 0.00317978858948
remove_ten: 0.0110769271851
remove_hundred: 0.0708107948303
remove_thousand: 0.52201294899
remove_tenthousand: 5.27875876427
len_one: 0.00183796882629
len_ten: 0.001877784729
len_hundred: 0.00301790237427
len_thousand: 0.00213479995728
len_tenthousand: 0.00438284873962
----8<----8<----8<----
remove one/create one: roughly similar.
At ten, creation starts giving obvious wins (create_ten beats remove_ten);
removing 1 item (10% of ten) is still slightly cheaper than creating ten.
create 100/remove 10 gives a slight win to removal (again about 10%).
create 1000/remove 100 gives a significant win to removal.  Progression for
both patterns begins to be linear with larger numbers, which makes sense.
That suggests 20% might be the approximate break-even point.
create 10000/remove 1000 suggests a break-even point of about 26%.
The cost of the len is not nothing, but begins to pay for itself with
larger numbers.
This will be our heuristic then, rather than trying to calculate a curve I
couldn't make anyway:
given len_removal, len_set, and ratio (len_removal/len_set),
if len_removal < 5 or ratio <= .1 or len_set > 500 and ratio <= .2:
    remove rather than create
"""

import timeit

setup = '''
import BTrees
one = BTrees.family32.IO.TreeSet((0,))
ten = BTrees.family32.IO.TreeSet(range(10))
hundred = BTrees.family32.IO.TreeSet(range(100))
thousand = BTrees.family32.IO.TreeSet(range(1000))
tenthousand = BTrees.family32.IO.TreeSet(range(10000))
sets = [BTrees.family32.IO.TreeSet(tenthousand) for i in range(1000)]
'''

base = 's = sets.pop()'

create_template = base + '''
BTrees.family32.IO.TreeSet(%s)'''

remove_template = base + '''
for i in range(0, 10000, %d):
s.remove(i)'''

len_template = base + '''
len(%s)'''

control = timeit.Timer(base, setup)

create_empty = timeit.Timer(create_template % ('',), setup)
create_one = timeit.Timer(create_template % ('one',), setup)
create_ten = timeit.Timer(create_template % ('ten',), setup)
create_hundred = timeit.Timer(create_template % ('hundred',), setup)
create_thousand = timeit.Timer(create_template % ('thousand',), setup)
create_tenthousand = timeit.Timer(create_template % ('tenthousand',), setup)

remove_one = timeit.Timer(remove_template % (10000,), setup)
remove_ten = timeit.Timer(remove_template % (1000,), setup)
remove_hundred = timeit.Timer(remove_template % (100,), setup)
remove_thousand = timeit.Timer(remove_template % (10,), setup)
remove_tenthousand = timeit.Timer(remove_template % (1,), setup)

len_one = timeit.Timer(len_template % ('one',), setup)
len_ten = timeit.Timer(len_template % ('ten',), setup)
len_hundred = timeit.Timer(len_template % ('hundred',), setup)
len_thousand = timeit.Timer(len_template % ('thousand',), setup)
len_tenthousand = timeit.Timer(len_template % ('tenthousand',), setup)

d = {}

control_result = d['control_result'] = min(control.repeat(3, 1000))
d['create_empty_result'] = min(create_empty.repeat(3, 1000))
d['create_one_result'] = min(create_one.repeat(3, 1000))
d['create_ten_result'] = min(create_ten.repeat(3, 1000))
d['create_hundred_result'] = min(create_hundred.repeat(3, 1000))
d['create_thousand_result'] = min(create_thousand.repeat(3, 1000))
d['create_tenthousand_result'] = min(create_tenthousand.repeat(3, 1000))
d['remove_one_result'] = min(remove_one.repeat(3, 1000))
d['remove_ten_result'] = min(remove_ten.repeat(3, 1000))
d['remove_hundred_result'] = min(remove_hundred.repeat(3, 1000))
d['remove_thousand_result'] = min(remove_thousand.repeat(3, 1000))
d['remove_tenthousand_result'] = min(remove_tenthousand.repeat(3, 1000))
d['len_one_result'] = min(len_one.repeat(3, 1000))
d['len_ten_result'] = min(len_ten.repeat(3, 1000))
d['len_hundred_result'] = min(len_hundred.repeat(3, 1000))
d['len_thousand_result'] = min(len_thousand.repeat(3, 1000))
d['len_tenthousand_result'] = min(len_tenthousand.repeat(3, 1000))

for k, v in d.items():
if k == 'control_result':
continue
d[k[:-7]] = v - control_result

print '''
control: %(control_result)s
(the following results have the control subtracted)
create_empty: %(create_empty)s
create_one: %(create_one)s
create_ten: %(create_ten)s
create_hundred: %(create_hundred)s
create_thousand: %(create_thousand)s
create_tenthousand: %(create_tenthousand)s
remove_one: %(remove_one)s
remove_ten: %(remove_ten)s
remove_hundred: %(remove_hundred)s
remove_thousand: %(remove_thousand)s
remove_tenthousand: %(remove_tenthousand)s
len_one: %(len_one)s
len_ten: %(len_ten)s
len_hundred: %(len_hundred)s
len_thousand: %(len_thousand)s
len_tenthousand: %(len_tenthousand)s
''' % d
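To make the break-even reasoning in the docstring concrete, here is a back-of-the-envelope check using the example output quoted above (the numbers are the sample timings, not fresh measurements, and the names exist only in this sketch):

# Sample timings from the example output above (seconds per 1000 runs).
create_tenthousand = 1.55575275421
remove_tenthousand = 5.27875876427

# Per-item costs: building a new 10000-item TreeSet vs. removing 10000
# items one at a time from an existing one.
create_per_item = create_tenthousand / 10000
remove_per_item = remove_tenthousand / 10000

# Removal pays off while len_removed * remove_per_item stays below
# len_set * create_per_item, i.e. while the removal fraction is below
# create_per_item / remove_per_item -- roughly 0.3 with these numbers,
# in the same ballpark as the ~26% estimated above.  The heuristic rounds
# well down (10%, or 20% for sets over 500) to stay safely on the cheap side.
break_even_fraction = create_per_item / remove_per_item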
