
Commit 0d7210f
strichter committed Apr 2, 2012
1 parent acafea3

- Feature: Massively improved performance on all levels. This was mainly
  accomplished by removing unnecessary database accesses, better caching, and
  more efficient algorithms. This results in speedups between 4 and 25 times.
Showing 6 changed files with 301 additions and 33 deletions.
44 changes: 35 additions & 9 deletions CHANGES.txt
@@ -50,15 +50,41 @@ CHANGES
- Feature: Added a little script to test performance. It is not very
  sophisticated, but it is sufficient for a first round of optimizations.

- Performance: Drastically improved performance for collections that store
  only one type of object and where the documents do not store the type
  (i.e. it is stored in the name map collection).

- Performance: The Zope Container fast load via find() did not work correctly,
  since setstate() did not change the state from ghost to active and thus the
  state was loaded again from MongoDB and set on the object. Now we use the
  new ``_latest_states`` cache to look up a document when ``setstate()`` is
  called through the proper channels.
- Feature: Massively improved performance on all levels. This was mainly
  accomplished by removing unnecessary database accesses, better caching, and
  more efficient algorithms. This results in speedups between 4 and 25 times.

- When resolving the path to a klass, the result is now cached. More
  importantly, lookup failures are also cached by mapping the path to
  ``None``. This matters because an optimization in the ``resolve()``
  method causes a lot of failing lookups.

- When resolving the dbref to a type, we try to resolve the dbref early
  using the document, if we know that the documents within the collection
  store their type path. This avoids unnecessary queries of the name map
  collection.

- When getting the object's document to read the class path, the entire
  document is now read and stored in the ``_latest_states`` dictionary, so
  that other code may pick it up and use it. This should avoid superfluous
  reads from MongoDB.

- Drastically improved performance for collections that store only one type
  of object and where the documents do not store the type (i.e. it is
  stored in the name map collection).

- The Zope Container fast load via find() did not work correctly, since
  setstate() did not change the state from ghost to active and thus the
  state was loaded again from MongoDB and set on the object. Now we use the
  new ``_latest_states`` cache to look up a document when ``setstate()`` is
  called through the proper channels. Now this "fast load" method truly
  causes O(1) database lookups. (See the sketch after this CHANGES excerpt.)

- Whenever the Mongo Object Id is used as a hash key, use the hash of the id
  instead. The ``__cmp__()`` method of the ``ObjectId`` class is way too
  slow.

- Cache collection name lookup from objects in the ``ObjectWriter`` class.

- Bug: We have seen several occasions in production where we suddenly lost
  some state in some documents, which prohibited the objects from being
...
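
A minimal sketch of the fast-load pattern described in the change log above,
assuming a hypothetical data manager with a ``_latest_states`` mapping keyed
by document id; all names here are illustrative, since the actual container
and data manager code is not part of this diff:

    class FastLoadSketch(object):
        # Sketch only: prime() and setstate() are assumed names, not the
        # real mongopersist API.

        def __init__(self, collection):
            self._collection = collection
            self._latest_states = {}  # document id -> full document

        def prime(self, spec):
            # "Fast load": a single find() call fetches and caches the
            # full documents of all matching objects.
            for doc in self._collection.find(spec):
                self._latest_states[doc['_id']] = doc

        def setstate(self, obj):
            # O(1) per object: use the cached document instead of another
            # MongoDB round trip, then mark the object active (not a ghost).
            doc = self._latest_states.get(obj._p_oid)
            if doc is None:
                # Fallback: one extra round trip for an uncached object.
                doc = self._collection.find_one(obj._p_oid)
            for key, value in doc.items():
                if not key.startswith('_'):
                    setattr(obj, key, value)
            obj._p_changed = False

With prime() called once per batch, each subsequent setstate() touches only
the in-memory cache rather than the database.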
26 changes: 19 additions & 7 deletions src/mongopersist/performance.py
@@ -25,14 +25,16 @@
from mongopersist import conflict, datamanager
from mongopersist.zope import container

MULTIPLE_CLASSES = True
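# When True, the inserts in run_basic_crud() alternate between two Person
# classes, so the collection stores multiple types and the type-resolution
# code paths are exercised.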

class People(container.AllItemsMongoContainer):
    _p_mongo_collection = 'people'
    _m_database = 'performance'
    _m_collection = 'person'

class Person(persistent.Persistent, container.MongoContained):
    _p_mongo_collection = 'person'
    #_p_mongo_store_type = True
    _p_mongo_store_type = True

    def __init__(self, name, age):
        self.name = name
@@ -42,6 +44,10 @@ def __repr__(self):
        return '<%s %s @ %i [%s]>' %(
            self.__class__.__name__, self.name, self.age, self.__name__)

class Person2(Person):
    pass


def run_basic_crud(options):
    conn = pymongo.Connection('localhost', 27017, tz_aware=False)
    dm = datamanager.MongoDataManager(
@@ -57,7 +63,8 @@ def run_basic_crud(options):
    transaction.begin()
    t1 = time.time()
    for idx in xrange(options.size):
        people[None] = Person('Mr Number %.5i' %idx, random.randint(0, 100))
        klass = Person if (MULTIPLE_CLASSES and idx % 2) else Person2
        people[None] = klass('Mr Number %.5i' %idx, random.randint(0, 100))
    transaction.commit()
    t2 = time.time()
    print 'Insert: %.4f secs' % (t2-t1)
@@ -80,16 +87,21 @@ def run_basic_crud(options):
    transaction.begin()
    t1 = time.time()
    [person.name for person in people.find()]
    #cProfile.runctx('[person.name for person in people.find()]', globals(), locals())
    #cProfile.runctx(
    #    '[person.name for person in people.find()]', globals(), locals())
    t2 = time.time()
    print 'Fast Read: %.4f secs' % (t2-t1)

    # Profile modification
    t1 = time.time()
    for person in people.find():
        person.name += 'X'
        person.age += 1
    transaction.commit()
    def modify():
        for person in list(people.find()):
            person.name += 'X'
            person.age += 1
        transaction.commit()
    modify()
    #cProfile.runctx(
    #    'modify()', globals(), locals())
    t2 = time.time()
    print 'Modification: %.4f secs' % (t2-t1)

...
101 changes: 84 additions & 17 deletions src/mongopersist/serialize.py
@@ -28,10 +28,14 @@

from mongopersist import interfaces

IGNORE_IDENTICAL_DOCUMENTS = True
ALWAYS_READ_FULL_DOC = True

SERIALIZERS = []
OID_CLASS_LRU = lru.LRUCache(20000)

IGNORE_IDENTICAL_DOCUMENTS = True
COLLECTIONS_WITH_TYPE = set()
AVAILABLE_NAME_MAPPINGS = set()
PATH_RESOLVE_CACHE = {}

def get_dotted_name(obj):
    return obj.__module__+'.'+obj.__name__
@@ -73,11 +77,15 @@ def get_collection_name(self, obj):
        except AttributeError:
            return db_name, get_dotted_name(obj.__class__)
        # Make sure that the coll_name to class path mapping is available.
        # Let's make sure we do the lookup only once, since the info will
        # never change.
        path = get_dotted_name(obj.__class__)
        map = {'collection': coll_name, 'database': db_name, 'path': path}
        map_hash = (db_name, coll_name, path)
        if map_hash in AVAILABLE_NAME_MAPPINGS:
            return db_name, coll_name
        db = self._jar._conn[self._jar.default_database]
        coll = db[self._jar.name_map_collection]
        map = {'collection': coll_name,
               'database': db_name,
               'path': get_dotted_name(obj.__class__)}
        result = coll.find_one(map)
        if result is None:
            # If there is already a map for this collection, the next map must
@@ -88,6 +96,7 @@ def get_collection_name(self, obj):
                setattr(obj, '_p_mongo_store_type', True)
            map['doc_has_type'] = getattr(obj, '_p_mongo_store_type', False)
            coll.save(map)
        AVAILABLE_NAME_MAPPINGS.add(map_hash)
        return db_name, coll_name

    def get_non_persistent_state(self, obj, seen):
@@ -281,55 +290,113 @@ def __init__(self, jar):
        self._single_map_cache = {}
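        # Maps (database, collection) -> class for collections that store a
        # single class. Per the change log, this is only safe to cache for
        # the length of the transaction, since the number of name-map
        # entries may change later.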

    def simple_resolve(self, path):
        return resolve(path)
        # We try to look up the klass from a cache. The important part here is
        # that we also cache lookup failures as None, since they actually
        # happen more frequently than a hit due to an optimization in the
        # resolve() function.
        try:
            klass = PATH_RESOLVE_CACHE[path]
        except KeyError:
            try:
                klass = resolve(path)
            except ImportError:
                PATH_RESOLVE_CACHE[path] = klass = None
            else:
                PATH_RESOLVE_CACHE[path] = klass
        if klass is None:
            raise ImportError(path)
        return klass
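        # Example (sketch, hypothetical reader instance): the first failing
        # lookup of a bare collection name pays the import cost; later
        # lookups hit the cached ``None`` entry and skip the import
        # machinery entirely.
        #
        #   reader.simple_resolve('person')   # ImportError; caches None
        #   reader.simple_resolve('person')   # ImportError from the cache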

    def resolve(self, dbref):
        __traceback_info__ = dbref
        # 1. Check the global oid-based lookup cache.
        # 1. Check the global oid-based lookup cache. Use the hash of the id,
        # since otherwise the comparison is way too expensive.
        try:
            return OID_CLASS_LRU[dbref.id]
            return OID_CLASS_LRU[hash(dbref.id)]
        except KeyError:
            pass
        # 2. Check the transient single map entry lookup cache.
        try:
            return self._single_map_cache[(dbref.database, dbref.collection)]
        except KeyError:
            pass
        # 3. Try to resolve the path directly.
        # 3. If we have found the type within the document for a collection
        # before, let's try again. This branch can only hit if the collection
        # stores more than one type; otherwise the single map entry lookup
        # above would have succeeded.
        coll_key = (dbref.database, dbref.collection)
        if coll_key in COLLECTIONS_WITH_TYPE:
            if dbref in self._jar._latest_states:
                obj_doc = self._jar._latest_states[dbref]
            elif ALWAYS_READ_FULL_DOC:
                obj_doc = self._jar.get_collection(
                    dbref.database, dbref.collection).find_one(dbref.id)
                self._jar._latest_states[dbref] = obj_doc
            else:
                obj_doc = self._jar\
                    .get_collection(dbref.database, dbref.collection)\
                    .find_one(dbref.id, fields=('_py_persistent_type',))
            if '_py_persistent_type' in obj_doc:
                klass = self.simple_resolve(obj_doc['_py_persistent_type'])
                OID_CLASS_LRU[hash(dbref.id)] = klass
                return klass
        # 4. Try to resolve the path directly. We want to do this optimization
        # after all others, because trying it a lot is very expensive.
        try:
            return self.simple_resolve(dbref.collection)
        except ImportError:
            pass
        # 4. No simple hits, so we have to do some leg work.
        # 5. No simple hits, so we have to do some leg work.
        # Let's now try to look up the path in the collection-to-path
        # mapping.
        db = self._jar._conn[self._jar.default_database]
        coll = db[self._jar.name_map_collection]
        result = coll.find(
            {'collection': dbref.collection, 'database': dbref.database})
        count = result.count()
        result = tuple(coll.find(
            {'collection': dbref.collection, 'database': dbref.database}))
        # Calling count() on a query result causes another database
        # access. Since the result sets are typically very small, let's
        # load them all.
        count = len(result)
        if count == 0:
            raise ImportError(dbref)
        elif count == 1:
            # Do not add these results to the LRU cache, since the count might
            # change later. But storing it for the length of the transaction
            # is fine, which is really useful if you load a lot of objects of
            # the same type.
            klass = self.simple_resolve(result.next()['path'])
            klass = self.simple_resolve(result[0]['path'])
            self._single_map_cache[(dbref.database, dbref.collection)] = klass
            return klass
        else:
            if dbref.id is None:
                raise ImportError(dbref)
            # Multiple object types are stored in the collection. We have to
            # look at the object to find out the type.
            obj_doc = self._jar\
                .get_collection(dbref.database, dbref.collection).find_one(
                    dbref.id, fields=('_py_persistent_type',))
            if dbref in self._jar._latest_states:
                # Optimization: If we have the latest state, then we just get
                # this object document. This is used for fast loading or when
                # resolving the same object path a second time. (The latter
                # should never happen due to the object cache.)
                obj_doc = self._jar._latest_states[dbref]
            elif ALWAYS_READ_FULL_DOC:
                # Optimization: Read the entire doc and stick it in the right
                # place so that unghostifying the object later will not cause
                # another database access.
                obj_doc = self._jar\
                    .get_collection(dbref.database, dbref.collection)\
                    .find_one(dbref.id)
                self._jar._latest_states[dbref] = obj_doc
            else:
                obj_doc = self._jar\
                    .get_collection(dbref.database, dbref.collection)\
                    .find_one(dbref.id, fields=('_py_persistent_type',))
            if '_py_persistent_type' in obj_doc:
                COLLECTIONS_WITH_TYPE.add(coll_key)
                klass = self.simple_resolve(obj_doc['_py_persistent_type'])
            else:
                # Find the name-map entry where "doc_has_type" is False.
                # Note: This case is really inefficient and does not allow any
                # optimization. It should be avoided as much as possible.
                for name_map_item in result:
                    if not name_map_item['doc_has_type']:
                        klass = self.simple_resolve(name_map_item['path'])
...
4 changes: 4 additions & 0 deletions src/mongopersist/testing.py
@@ -53,3 +53,7 @@ def tearDown(test):
    test.globs['conn'].drop_database(test.globs['DBNAME'])
    test.globs['conn'].disconnect()
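    # Reset the module-level caches between tests. Calling __init__() on
    # the existing objects empties them in place, so modules holding a
    # reference keep seeing the (now empty) cache objects.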
    serialize.SERIALIZERS.__init__()
    serialize.OID_CLASS_LRU.__init__(20000)
    serialize.COLLECTIONS_WITH_TYPE.__init__()
    serialize.AVAILABLE_NAME_MAPPINGS.__init__()
    serialize.PATH_RESOLVE_CACHE = {}
