
Commit 0d7210f
strichter committed Apr 2, 2012
1 parent acafea3

- Feature: Massively improved performance on all levels. This was mainly
  accomplished by removing unnecessary database accesses, better caching, and
  more efficient algorithms. This results in speedups between 4 and 25 times.
Showing 6 changed files with 301 additions and 33 deletions.
44 changes: 35 additions & 9 deletions CHANGES.txt
@@ -50,15 +50,41 @@ CHANGES
- Feature: Added a little script to test performance. It is not very
  sophisticated, but it is sufficient for a first round of optimizations.

- Performance: Drastically improved performance for collections that store
  only one type of object and where the documents do not store the type
  (i.e. it is stored in the name map collection).

- Performance: The Zope Container fast load via find() did not work correctly,
  since setstate() did not change the state from ghost to active and thus the
  state was loaded again from MongoDB and set on the object. Now we use the
  new ``_latest_states`` cache to look up a document when ``setstate()`` is
  called through the proper channels.
- Feature: Massively improved performance on all levels. This was mainly
  accomplished by removing unnecessary database accesses, better caching, and
  more efficient algorithms. This results in speedups between 4 and 25 times.

- When resolving the path to a klass, the result is now cached. More
  importantly, lookup failures are also cached by mapping the path to
  ``None``. This matters because an optimization in the ``resolve()``
  method causes a lot of failing lookups.

- When resolving the dbref to a type, we try to resolve the dbref early
  using the document, if we know that the documents within the collection
  store their type path. This avoids unnecessary queries of the name map
  collection.

- When getting the object's document to read the class path, the entire
  document is now read and stored in the ``_latest_states`` dictionary, so
  that other code may pick it up and use it. This should avoid superfluous
  reads from MongoDB.

- Drastically improved performance for collections that store only one type
  of object and where the documents do not store the type (i.e. it is
  stored in the name map collection).

- The Zope Container fast load via find() did not work correctly, since
  setstate() did not change the state from ghost to active and thus the
  state was loaded again from MongoDB and set on the object. Now we use the
  new ``_latest_states`` cache to look up a document when ``setstate()`` is
  called through the proper channels. Now this "fast load" method truly
  causes O(1) database lookups. (See the sketch after this CHANGES excerpt.)

- Whenever the Mongo Object Id is used as a hash key, use the hash of the id
  instead. The ``__cmp__()`` method of the ``ObjectId`` class is way too
  slow.

- Cache collection name lookup from objects in the ``ObjectWriter`` class.

- Bug: We have seen several occasions in production where we suddenly lost
  some state in some documents, which prohibited the objects from being
...
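
A minimal sketch of the fast-load pattern described in the change log above,
assuming a hypothetical data manager with a ``_latest_states`` mapping keyed
by document id; all names here are illustrative, since the actual container
and data manager code is not part of this diff:

    class FastLoadSketch(object):
        # Sketch only: prime() and setstate() are assumed names, not the
        # real mongopersist API.

        def __init__(self, collection):
            self._collection = collection
            self._latest_states = {}  # document id -> full document

        def prime(self, spec):
            # "Fast load": a single find() call fetches and caches the
            # full documents of all matching objects.
            for doc in self._collection.find(spec):
                self._latest_states[doc['_id']] = doc

        def setstate(self, obj):
            # O(1) per object: use the cached document instead of another
            # MongoDB round trip, then mark the object active (not a ghost).
            doc = self._latest_states.get(obj._p_oid)
            if doc is None:
                # Fallback: one extra round trip for an uncached object.
                doc = self._collection.find_one(obj._p_oid)
            for key, value in doc.items():
                if not key.startswith('_'):
                    setattr(obj, key, value)
            obj._p_changed = False

With prime() called once per batch, each subsequent setstate() touches only
the in-memory cache rather than the database.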
26 changes: 19 additions & 7 deletions src/mongopersist/performance.py
@@ -25,14 +25,16 @@
from mongopersist import conflict, datamanager
from mongopersist.zope import container

MULTIPLE_CLASSES = True
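# When True, the inserts in run_basic_crud() alternate between two Person
# classes, so the collection stores multiple types and the type-resolution
# code paths are exercised.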

class People(container.AllItemsMongoContainer):
    _p_mongo_collection = 'people'
    _m_database = 'performance'
    _m_collection = 'person'

class Person(persistent.Persistent, container.MongoContained):
    _p_mongo_collection = 'person'
    #_p_mongo_store_type = True
    _p_mongo_store_type = True

    def __init__(self, name, age):
        self.name = name
@@ -42,6 +44,10 @@ def __repr__(self):
        return '<%s %s @ %i [%s]>' %(
            self.__class__.__name__, self.name, self.age, self.__name__)

class Person2(Person):
    pass


def run_basic_crud(options):
    conn = pymongo.Connection('localhost', 27017, tz_aware=False)
    dm = datamanager.MongoDataManager(
@@ -57,7 +63,8 @@ def run_basic_crud(options):
    transaction.begin()
    t1 = time.time()
    for idx in xrange(options.size):
        people[None] = Person('Mr Number %.5i' %idx, random.randint(0, 100))
        klass = Person if (MULTIPLE_CLASSES and idx % 2) else Person2
        people[None] = klass('Mr Number %.5i' %idx, random.randint(0, 100))
    transaction.commit()
    t2 = time.time()
    print 'Insert: %.4f secs' % (t2-t1)
@@ -80,16 +87,21 @@ def run_basic_crud(options):
    transaction.begin()
    t1 = time.time()
    [person.name for person in people.find()]
    #cProfile.runctx('[person.name for person in people.find()]', globals(), locals())
    #cProfile.runctx(
    #    '[person.name for person in people.find()]', globals(), locals())
    t2 = time.time()
    print 'Fast Read: %.4f secs' % (t2-t1)

    # Profile modification
    t1 = time.time()
    for person in people.find():
        person.name += 'X'
        person.age += 1
    transaction.commit()
    def modify():
        for person in list(people.find()):
            person.name += 'X'
            person.age += 1
        transaction.commit()
    modify()
    #cProfile.runctx(
    #    'modify()', globals(), locals())
    t2 = time.time()
    print 'Modification: %.4f secs' % (t2-t1)

...
101 changes: 84 additions & 17 deletions src/mongopersist/serialize.py
@@ -28,10 +28,14 @@

from mongopersist import interfaces

IGNORE_IDENTICAL_DOCUMENTS = True
ALWAYS_READ_FULL_DOC = True

SERIALIZERS = []
OID_CLASS_LRU = lru.LRUCache(20000)

IGNORE_IDENTICAL_DOCUMENTS = True
COLLECTIONS_WITH_TYPE = set()
AVAILABLE_NAME_MAPPINGS = set()
PATH_RESOLVE_CACHE = {}

def get_dotted_name(obj):
    return obj.__module__+'.'+obj.__name__
@@ -73,11 +77,15 @@ def get_collection_name(self, obj):
        except AttributeError:
            return db_name, get_dotted_name(obj.__class__)
        # Make sure that the coll_name to class path mapping is available.
        # Let's make sure we do the lookup only once, since the info will
        # never change.
        path = get_dotted_name(obj.__class__)
        map = {'collection': coll_name, 'database': db_name, 'path': path}
        map_hash = (db_name, coll_name, path)
        if map_hash in AVAILABLE_NAME_MAPPINGS:
            return db_name, coll_name
        db = self._jar._conn[self._jar.default_database]
        coll = db[self._jar.name_map_collection]
        map = {'collection': coll_name,
               'database': db_name,
               'path': get_dotted_name(obj.__class__)}
        result = coll.find_one(map)
        if result is None:
            # If there is already a map for this collection, the next map must
@@ -88,6 +96,7 @@ def get_collection_name(self, obj):
                setattr(obj, '_p_mongo_store_type', True)
            map['doc_has_type'] = getattr(obj, '_p_mongo_store_type', False)
            coll.save(map)
        AVAILABLE_NAME_MAPPINGS.add(map_hash)
        return db_name, coll_name

    def get_non_persistent_state(self, obj, seen):
@@ -281,55 +290,113 @@ def __init__(self, jar):
        self._single_map_cache = {}
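        # Maps (database, collection) -> class for collections that store a
        # single class. Per the change log, this is only safe to cache for
        # the length of the transaction, since the number of name-map
        # entries may change later.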

    def simple_resolve(self, path):
        return resolve(path)
        # We try to look up the klass from a cache. The important part here is
        # that we also cache lookup failures as None, since they actually
        # happen more frequently than a hit due to an optimization in the
        # resolve() function.
        try:
            klass = PATH_RESOLVE_CACHE[path]
        except KeyError:
            try:
                klass = resolve(path)
            except ImportError:
                PATH_RESOLVE_CACHE[path] = klass = None
            else:
                PATH_RESOLVE_CACHE[path] = klass
        if klass is None:
            raise ImportError(path)
        return klass
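        # Example (sketch, hypothetical reader instance): the first failing
        # lookup of a bare collection name pays the import cost; later
        # lookups hit the cached ``None`` entry and skip the import
        # machinery entirely.
        #
        #   reader.simple_resolve('person')   # ImportError; caches None
        #   reader.simple_resolve('person')   # ImportError from the cache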

    def resolve(self, dbref):
        __traceback_info__ = dbref
        # 1. Check the global oid-based lookup cache.
        # 1. Check the global oid-based lookup cache. Use the hash of the id,
        # since otherwise the comparison is way too expensive.
        try:
            return OID_CLASS_LRU[dbref.id]
            return OID_CLASS_LRU[hash(dbref.id)]
        except KeyError:
            pass
        # 2. Check the transient single map entry lookup cache.
        try:
            return self._single_map_cache[(dbref.database, dbref.collection)]
        except KeyError:
            pass
        # 3. Try to resolve the path directly.
        # 3. If we have found the type within the document for a collection
        # before, let's try again. This branch can only hit if the collection
        # stores more than one type; otherwise the single map entry lookup
        # above would have succeeded.
        coll_key = (dbref.database, dbref.collection)
        if coll_key in COLLECTIONS_WITH_TYPE:
            if dbref in self._jar._latest_states:
                obj_doc = self._jar._latest_states[dbref]
            elif ALWAYS_READ_FULL_DOC:
                obj_doc = self._jar.get_collection(
                    dbref.database, dbref.collection).find_one(dbref.id)
                self._jar._latest_states[dbref] = obj_doc
            else:
                obj_doc = self._jar\
                    .get_collection(dbref.database, dbref.collection)\
                    .find_one(dbref.id, fields=('_py_persistent_type',))
            if '_py_persistent_type' in obj_doc:
                klass = self.simple_resolve(obj_doc['_py_persistent_type'])
                OID_CLASS_LRU[hash(dbref.id)] = klass
                return klass
        # 4. Try to resolve the path directly. We want to do this optimization
        # after all others, because trying it a lot is very expensive.
        try:
            return self.simple_resolve(dbref.collection)
        except ImportError:
            pass
        # 4. No simple hits, so we have to do some leg work.
        # 5. No simple hits, so we have to do some leg work.
        # Let's now try to look up the path in the collection-to-path
        # mapping.
        db = self._jar._conn[self._jar.default_database]
        coll = db[self._jar.name_map_collection]
        result = coll.find(
            {'collection': dbref.collection, 'database': dbref.database})
        count = result.count()
        result = tuple(coll.find(
            {'collection': dbref.collection, 'database': dbref.database}))
        # Calling count() on a query result causes another database
        # access. Since the result sets are typically very small, let's
        # load them all.
        count = len(result)
        if count == 0:
            raise ImportError(dbref)
        elif count == 1:
            # Do not add these results to the LRU cache, since the count might
            # change later. But storing it for the length of the transaction
            # is fine, which is really useful if you load a lot of objects of
            # the same type.
            klass = self.simple_resolve(result.next()['path'])
            klass = self.simple_resolve(result[0]['path'])
            self._single_map_cache[(dbref.database, dbref.collection)] = klass
            return klass
        else:
            if dbref.id is None:
                raise ImportError(dbref)
            # Multiple object types are stored in the collection. We have to
            # look at the object to find out the type.
            obj_doc = self._jar\
                .get_collection(dbref.database, dbref.collection).find_one(
                    dbref.id, fields=('_py_persistent_type',))
            if dbref in self._jar._latest_states:
                # Optimization: If we have the latest state, then we just get
                # this object document. This is used for fast loading or when
                # resolving the same object path a second time. (The latter
                # should never happen due to the object cache.)
                obj_doc = self._jar._latest_states[dbref]
            elif ALWAYS_READ_FULL_DOC:
                # Optimization: Read the entire doc and stick it in the right
                # place so that unghostifying the object later will not cause
                # another database access.
                obj_doc = self._jar\
                    .get_collection(dbref.database, dbref.collection)\
                    .find_one(dbref.id)
                self._jar._latest_states[dbref] = obj_doc
            else:
                obj_doc = self._jar\
                    .get_collection(dbref.database, dbref.collection)\
                    .find_one(dbref.id, fields=('_py_persistent_type',))
            if '_py_persistent_type' in obj_doc:
                COLLECTIONS_WITH_TYPE.add(coll_key)
                klass = self.simple_resolve(obj_doc['_py_persistent_type'])
            else:
                # Find the name-map entry where "doc_has_type" is False.
                # Note: This case is really inefficient and does not allow any
                # optimization. It should be avoided as much as possible.
                for name_map_item in result:
                    if not name_map_item['doc_has_type']:
                        klass = self.simple_resolve(name_map_item['path'])
...
4 changes: 4 additions & 0 deletions src/mongopersist/testing.py
@@ -53,3 +53,7 @@ def tearDown(test):
    test.globs['conn'].drop_database(test.globs['DBNAME'])
    test.globs['conn'].disconnect()
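    # Reset the module-level caches between tests. Calling __init__() on
    # the existing objects empties them in place, so modules holding a
    # reference keep seeing the (now empty) cache objects.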
    serialize.SERIALIZERS.__init__()
    serialize.OID_CLASS_LRU.__init__(20000)
    serialize.COLLECTIONS_WITH_TYPE.__init__()
    serialize.AVAILABLE_NAME_MAPPINGS.__init__()
    serialize.PATH_RESOLVE_CACHE = {}
