verify and polish a couple of optimizations and reject another.
include code used to evaluate optimizations.
garyposter committed Jun 29, 2007
1 parent 1a6af35 commit 2da3f83
Showing 3 changed files with 281 additions and 44 deletions.
81 changes: 37 additions & 44 deletions src/zc/relationship/index.py
@@ -129,7 +129,8 @@ def __init__(self, attrs, defaultTransitiveQueriesFactory=None,
res['multiple'] = data.get('multiple', False)
if (res['dump'] is None) ^ (res['load'] is None):
raise ValueError(
"either both of 'dump' and 'load' must be None, or neither")
"either both of 'dump' and 'load' must be None, or "
"neither")
# when both load and dump are None, this is a small
# optimization that can be a large optimization if the returned
# value is one of the main four options of the selected btree
@@ -154,6 +155,7 @@ def __init__(self, attrs, defaultTransitiveQueriesFactory=None,
raise ValueError('no name specified')
if res['name'] in _attrs or val in seen:
raise ValueError('Duplicate in attrs', res['name'], val)
seen.add(val)
if res['TreeSet'].__name__[0] == 'I':
Mapping = BTrees.family32.IO.BTree
elif res['TreeSet'].__name__[0] == 'L':
@@ -163,8 +165,8 @@ def __init__(self, attrs, defaultTransitiveQueriesFactory=None,
Mapping = family.OO.BTree
self._name_TO_mapping[res['name']] = Mapping()
# these are objtoken to (relcount, relset)
seen.add(val)
_attrs[res['name']] = res


def _getValuesAndTokens(self, rel, data):
values = None
@@ -255,19 +257,25 @@ def index_doc(self, relToken, rel):
# over essentially generating a new TreeSet and
# updating it with *all* values. On the other
# hand, if there are a lot of removals, it's
# probably quicker just to make a new one." We
# pick >25 removals, mostly arbitrarily, as our
# "cut bait" cue. We don't just do a len of
# removed because lens of btrees are potentially
# expensive.
for ix, t in enumerate(removed):
if ix >= 25: # arbitrary cut-off
newTokens = data['TreeSet'](newTokens)
break
oldTokens.remove(t)
# probably quicker just to make a new one." See
# timeit/set_creation_vs_removal.py for details.
# A len is pretty cheap--all the buckets should
# already be in memory.
len_removed = len(removed)
if len_removed < 5:
recycle = True
else:
len_old = len(oldTokens)
ratio = float(len_removed)/len_old
recycle = (ratio <= 0.1 or len_old > 500
and ratio < 0.2)
if recycle:
for t in removed:
oldTokens.remove(t)
oldTokens.update(added)
newTokens = oldTokens
else:
newTokens = data['TreeSet'](newTokens)
else:
removed = oldTokens
added = newTokens
@@ -425,7 +433,7 @@ def findValueTokenSet(self, reltoken, name):

def _relData(self, searchTerms):
data = []
if getattr(searchTerms, 'items', None) is not None:
if getattr(searchTerms, 'items', None) is not None: # quack
searchTerms = searchTerms.items()
searchTerms = tuple(searchTerms)
if not searchTerms:
@@ -443,40 +451,25 @@ def _relData(self, searchTerms):
relData = self._name_TO_mapping[nm].get(token)
if relData is None or relData[0].value == 0:
return None
data.append((relData[0].value, nm, token, relData[1]))
data.append((relData[0].value, relData[1])) # length, set
# we don't want to sort on the set values!! just the lengths.
data.sort(key=lambda i: i[0])
if rel is not None:
for ct, nm, tk, st in data:
if rel not in st:
for l, s in data:
if rel not in s:
return None
return self._relTools['TreeSet']((rel,))
data.sort()
while len(data) > 1:
first_count, _ignore1, _ignore2, first_set = data[0]
second_count, second_name, second_token, second_set = data[1]
if first_count <= second_count/10:
# we'll just test this by hand: intended to be an optimization.
# should be tested and the factor adjusted (or this
# code path removed, relying only on the standard BTree
# intersection code). The theory behind this is that the
# standard BTree intersection code just iterates over the sets
# to find matches. Therefore, if you have one set of
# range(100000) and another of (99999,) then it will be fairly
# inefficient. walking a BTree to find membership is very
# cheap, so if the first_count is significantly smaller than
# the second_count we should just check whether each
# member of the smaller set is in the larger, one at a time.
intersection = self._relTools['TreeSet'](
i for i in first_set if i in second_set)
else:
intersection = self._relTools['intersection'](
first_set, second_set)
if not intersection:
return None
data = data[2:]
# we can't know how many are in a new set without a possibly
# expensive len; however, the len should be <= first_count
data.insert(0, (first_count, None, None, intersection))
return data[0][3]
# we had an untested optimization attempt here before. It has
# been tested now (see timeit/manual_intersection.py), found
# useless, and removed.
#
# we know we have at least one result now. intersect all. Work
# from smallest to largest, until we're done or we don't have any
# more results.
res = data.pop(0)[1]
while res and data:
res = self._relTools['intersection'](res, data.pop(0)[1])
return res

def _parse(self, query, maxDepth, filter, targetQuery, targetFilter,
transitiveQueriesFactory):
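The net effect of the index.py changes above: instead of the old arbitrary 25-removal cut-off, index_doc now decides between mutating the existing TreeSet and rebuilding it using measured thresholds. A minimal standalone sketch of that decision (the helper name should_recycle is illustrative only, not part of the index):

def should_recycle(old_tokens, removed):
    # Decide whether to mutate old_tokens in place ("recycle") or to build
    # a brand-new TreeSet from the new values.  Thresholds follow
    # timeit/set_creation_vs_removal.py below: removal is cheaper when the
    # removals are a small fraction of the set (about 10%, or 20% for sets
    # larger than 500 tokens).
    len_removed = len(removed)
    if len_removed < 5:
        return True
    len_old = len(old_tokens)
    ratio = float(len_removed) / len_old
    return ratio <= 0.1 or (len_old > 500 and ratio < 0.2)

When this returns True, index_doc removes the removed tokens, updates with the added ones, and reuses the old set; otherwise it builds a fresh TreeSet from the new tokens.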
96 changes: 96 additions & 0 deletions src/zc/relationship/timeit/manual_intersection.py
@@ -0,0 +1,96 @@
"""
This one is not very surprising, in retrospect.
Example result:
[('control_result', 0.000244140625),
('manual one ten', 0.028602838516235352),
('intersect one ten', 0.031899929046630859),
('intersect ten one', 0.031992912292480469),
('manual one hundred', 0.028395891189575195),
('intersect one hundred', 0.031839847564697266),
('intersect hundred one', 0.031901836395263672),
('manual ten hundred', 0.061377763748168945),
('intersect ten hundred', 0.038860797882080078),
('intersect hundred ten', 0.038875818252563477),
('manual one thousand', 0.028687715530395508),
('intersect one thousand', 0.03191065788269043),
('intersect thousand one', 0.03180384635925293),
('manual ten thousand', 0.061729907989501953),
('intersect ten thousand', 0.039285898208618164),
('intersect thousand ten', 0.039031744003295898),
('manual hundred thousand', 0.39881396293640137),
('intersect hundred thousand', 0.12407588958740234),
('intersect thousand hundred', 0.12287592887878418),
('manual one tenthousand', 0.028566837310791016),
('intersect one tenthousand', 0.031667947769165039),
('intersect tenthousand one', 0.031734943389892578),
('manual ten tenthousand', 0.062446832656860352),
('intersect ten tenthousand', 0.039116859436035156),
('intersect tenthousand ten', 0.039546728134155273),
('manual hundred tenthousand', 0.40611386299133301),
('intersect hundred tenthousand', 0.1228797435760498),
('intersect tenthousand hundred', 0.12310886383056641),
('manual thousand tenthousand', 3.989987850189209),
('intersect thousand tenthousand', 0.91191983222961426),
('intersect tenthousand thousand', 0.91330981254577637)]
Conclusion: just use intersection.  Manual intersection with the one-element
set on the left is always slightly better than the C intersection, but it is
not worth the extra complexity.
While argument order is irrelevant for an individual C intersection, it is
very relevant for a manual intersection, and for a chain of intersections
generally: unsurprisingly, you want to start with the smaller sets, because
as long as one of the two sets involved is small, you go pretty fast.
Therefore, given a multi-intersection of [small, medium, large] it makes the
most sense to order them that way:
intersect(intersect(small, medium), large)
"""

import timeit
import pprint

setup = '''
import BTrees
one = BTrees.family32.IO.TreeSet((0,))
ten = BTrees.family32.IO.TreeSet(range(10))
hundred = BTrees.family32.IO.TreeSet(range(100))
thousand = BTrees.family32.IO.TreeSet(range(1000))
tenthousand = BTrees.family32.IO.TreeSet(range(10000))
'''

manual_template = 'BTrees.family32.IO.TreeSet(i for i in %s if i in %s)'

intersect_template = 'BTrees.family32.IO.intersection(%s, %s)'


control = timeit.Timer('pass', setup)

options = ('one', 'ten', 'hundred', 'thousand', 'tenthousand')

runs = 10000

control_result = min(control.repeat(3, runs))

d = [('control_result', control_result)]

for i, big in enumerate(options):
for little in options[:i]:
d.append((
'manual %s %s' % (little, big),
min(timeit.Timer(
manual_template % (little, big),
setup).repeat(3, runs)) - control_result))
d.append((
'intersect %s %s' % (little, big),
min(timeit.Timer(
intersect_template % (little, big),
setup).repeat(3, runs)) - control_result))
d.append((
'intersect %s %s' % (big, little),
min(timeit.Timer(
intersect_template % (big, little),
setup).repeat(3, runs)) - control_result))

pprint.pprint(d)
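The conclusion above is what the reworked _relData in index.py relies on: drop the hand-rolled membership loop, let the C intersection do the work, but intersect the smaller sets first and stop as soon as the running result is empty. A minimal sketch of that pattern (intersect_all is an illustrative helper, not index API):

import BTrees

def intersect_all(sets, intersection=BTrees.family32.IO.intersection):
    # Sort by length so the cheap, small intersections happen first, then
    # fold the C intersection over the rest, bailing out once empty.
    ordered = sorted(sets, key=len)
    if not ordered:
        return None
    res = ordered.pop(0)
    while res and ordered:
        res = intersection(res, ordered.pop(0))
    return res

small = BTrees.family32.IO.TreeSet((3,))
medium = BTrees.family32.IO.TreeSet(range(0, 1000, 3))
large = BTrees.family32.IO.TreeSet(range(100000))
result = intersect_all([small, medium, large])  # contains just 3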
148 changes: 148 additions & 0 deletions src/zc/relationship/timeit/set_creation_vs_removal.py
@@ -0,0 +1,148 @@
"""
Example output:
----8<----8<----8<----
control: 0.0847041606903
(the following results have the control subtracted)
create_empty: 0.00116777420044
create_one: 0.00260376930237
create_ten: 0.00407481193542
create_hundred: 0.0167508125305
create_thousand: 0.149729013443
create_tenthousand: 1.55575275421
remove_one: 0.00317978858948
remove_ten: 0.0110769271851
remove_hundred: 0.0708107948303
remove_thousand: 0.52201294899
remove_tenthousand: 5.27875876427
len_one: 0.00183796882629
len_ten: 0.001877784729
len_hundred: 0.00301790237427
len_thousand: 0.00213479995728
len_tenthousand: 0.00438284873962
----8<----8<----8<----
remove one/create one: roughly similar.
At ten, creation starts giving obvious wins (create_ten beats remove_ten);
removing 1 item (10% of ten) is still slightly cheaper than creating ten.
create 100/remove 10 gives a slight win to removal (again about 10%).
create 1000/remove 100 gives a significant win to removal.  Progression for
both patterns begins to be linear with larger numbers, which makes sense.
That suggests 20% might be the approximate break-even point.
create 10000/remove 1000 suggests a break-even point of about 26%.
The cost of the len is not nothing, but begins to pay for itself with
larger numbers.
This will be our heuristic then, rather than trying to calculate a curve I
couldn't make anyway:
given len_removal, len_set, and ratio (len_removal/len_set),
if len_removal < 5 or ratio <= .1 or len_set > 500 and ratio <= .2:
    remove rather than create
"""

import timeit

setup = '''
import BTrees
one = BTrees.family32.IO.TreeSet((0,))
ten = BTrees.family32.IO.TreeSet(range(10))
hundred = BTrees.family32.IO.TreeSet(range(100))
thousand = BTrees.family32.IO.TreeSet(range(1000))
tenthousand = BTrees.family32.IO.TreeSet(range(10000))
sets = [BTrees.family32.IO.TreeSet(tenthousand) for i in range(1000)]
'''

base = 's = sets.pop()'

create_template = base + '''
BTrees.family32.IO.TreeSet(%s)'''

remove_template = base + '''
for i in range(0, 10000, %d):
s.remove(i)'''

len_template = base + '''
len(%s)'''

control = timeit.Timer(base, setup)

create_empty = timeit.Timer(create_template % ('',), setup)
create_one = timeit.Timer(create_template % ('one',), setup)
create_ten = timeit.Timer(create_template % ('ten',), setup)
create_hundred = timeit.Timer(create_template % ('hundred',), setup)
create_thousand = timeit.Timer(create_template % ('thousand',), setup)
create_tenthousand = timeit.Timer(create_template % ('tenthousand',), setup)

remove_one = timeit.Timer(remove_template % (10000,), setup)
remove_ten = timeit.Timer(remove_template % (1000,), setup)
remove_hundred = timeit.Timer(remove_template % (100,), setup)
remove_thousand = timeit.Timer(remove_template % (10,), setup)
remove_tenthousand = timeit.Timer(remove_template % (1,), setup)

len_one = timeit.Timer(len_template % ('one',), setup)
len_ten = timeit.Timer(len_template % ('ten',), setup)
len_hundred = timeit.Timer(len_template % ('hundred',), setup)
len_thousand = timeit.Timer(len_template % ('thousand',), setup)
len_tenthousand = timeit.Timer(len_template % ('tenthousand',), setup)

d = {}

control_result = d['control_result'] = min(control.repeat(3, 1000))
d['create_empty_result'] = min(create_empty.repeat(3, 1000))
d['create_one_result'] = min(create_one.repeat(3, 1000))
d['create_ten_result'] = min(create_ten.repeat(3, 1000))
d['create_hundred_result'] = min(create_hundred.repeat(3, 1000))
d['create_thousand_result'] = min(create_thousand.repeat(3, 1000))
d['create_tenthousand_result'] = min(create_tenthousand.repeat(3, 1000))
d['remove_one_result'] = min(remove_one.repeat(3, 1000))
d['remove_ten_result'] = min(remove_ten.repeat(3, 1000))
d['remove_hundred_result'] = min(remove_hundred.repeat(3, 1000))
d['remove_thousand_result'] = min(remove_thousand.repeat(3, 1000))
d['remove_tenthousand_result'] = min(remove_tenthousand.repeat(3, 1000))
d['len_one_result'] = min(len_one.repeat(3, 1000))
d['len_ten_result'] = min(len_ten.repeat(3, 1000))
d['len_hundred_result'] = min(len_hundred.repeat(3, 1000))
d['len_thousand_result'] = min(len_thousand.repeat(3, 1000))
d['len_tenthousand_result'] = min(len_tenthousand.repeat(3, 1000))

for k, v in d.items():
if k == 'control_result':
continue
d[k[:-7]] = v - control_result

print '''
control: %(control_result)s
(the following results have the control subtracted)
create_empty: %(create_empty)s
create_one: %(create_one)s
create_ten: %(create_ten)s
create_hundred: %(create_hundred)s
create_thousand: %(create_thousand)s
create_tenthousand: %(create_tenthousand)s
remove_one: %(remove_one)s
remove_ten: %(remove_ten)s
remove_hundred: %(remove_hundred)s
remove_thousand: %(remove_thousand)s
remove_tenthousand: %(remove_tenthousand)s
len_one: %(len_one)s
len_ten: %(len_ten)s
len_hundred: %(len_hundred)s
len_thousand: %(len_thousand)s
len_tenthousand: %(len_tenthousand)s
''' % d
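To make the break-even reasoning in the docstring concrete, here is a back-of-the-envelope check using the example output quoted above (the numbers are the sample timings, not fresh measurements, and the names exist only in this sketch):

# Sample timings from the example output above (seconds per 1000 runs).
create_tenthousand = 1.55575275421
remove_tenthousand = 5.27875876427

# Per-item costs: building a new 10000-item TreeSet vs. removing 10000
# items one at a time from an existing one.
create_per_item = create_tenthousand / 10000
remove_per_item = remove_tenthousand / 10000

# Removal pays off while len_removed * remove_per_item stays below
# len_set * create_per_item, i.e. while the removal fraction is below
# create_per_item / remove_per_item -- roughly 0.3 with these numbers,
# in the same ballpark as the ~26% estimated above.  The heuristic rounds
# well down (10%, or 20% for sets over 500) to stay safely on the cheap side.
break_even_fraction = create_per_item / remove_per_item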
