-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Generate collection names using a hash function #84
Changes from 4 commits
da9caa8
8455e39
855a0ac
63c914b
0b832b3
7f4824a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,9 +4,15 @@ | |
"use strict"; | ||
|
||
var mongoose = require('mongoose'); | ||
var boom = require('boom'); | ||
var crypto = require('crypto'); | ||
var bytesCounter = require('bytes-counter'); | ||
|
||
var sthConfig, sthLogger, sthHelper, connectionURL, eventSchema, aggregatedSchema; | ||
|
||
var MAX_NAMESPACE_SIZE_IN_BYTES = 120, | ||
MIN_HASH_SIZE_IN_BYTES = 20; | ||
|
||
/** | ||
* Declares the Mongoose schemas. | ||
*/ | ||
|
@@ -169,52 +175,117 @@ | |
* @return {string} The database name | ||
*/ | ||
function getDatabase(service) { | ||
return sthConfig.DB_PREFIX + '_' + service; | ||
return sthConfig.DB_PREFIX + service; | ||
} | ||
|
||
/** | ||
* Return the name of the collection which will store the raw events | ||
* @param {string} databaseName The database name | ||
* @param {string} servicePath The service path of the entity the event is related to | ||
* @param {string} entityId The entity id related to the event | ||
* @param {string} entityType The type of entity related to the event | ||
* @param {string} attrName The attribute id related to the event | ||
* @returns {string} The collection name | ||
*/ | ||
function getCollectionName4Events(servicePath, entityId, entityType, attrName) { | ||
function getCollectionName4Events(databaseName, servicePath, entityId, entityType, attrName) { | ||
var collectionName4Events; | ||
switch(sthConfig.DATA_MODEL) { | ||
case sthConfig.DATA_MODELS.COLLECTIONS_PER_SERVICE_PATH: | ||
return sthConfig.COLLECTION_PREFIX + '_' + servicePath; | ||
collectionName4Events = servicePath; | ||
break; | ||
case sthConfig.DATA_MODELS.COLLECTIONS_PER_ENTITY: | ||
return sthConfig.COLLECTION_PREFIX + '_' + servicePath + '_' + entityId + (entityType ? '_' + entityType : ''); | ||
collectionName4Events = servicePath + '_' + entityId + (entityType ? '_' + entityType : ''); | ||
break; | ||
case sthConfig.DATA_MODELS.COLLECTIONS_PER_ATTRIBUTE: | ||
return sthConfig.COLLECTION_PREFIX + '_' + servicePath + '_' + entityId + (entityType ? '_' + entityType : '') + | ||
collectionName4Events = servicePath + '_' + entityId + (entityType ? '_' + entityType : '') + | ||
'_' + attrName; | ||
break; | ||
} | ||
if (sthConfig.SHOULD_HASH) { | ||
var limit = getHashSizeInBytes(databaseName); | ||
if (limit < MIN_HASH_SIZE_IN_BYTES) { | ||
sthLogger.warn('The available bytes for the hashes to be used as part of the collection names is not big enough (' + | ||
'at least ' + MIN_HASH_SIZE_IN_BYTES + ' bytes are needed), ' + | ||
'please reduce the size of the DB_PREFIX ("' + sthConfig.DB_PREFIX + '" = ' + bytesCounter.count(sthConfig.DB_PREFIX) + ' bytes), ' + | ||
'the service ("' + databaseName.substring(sthConfig.DB_PREFIX.length, databaseName.length) + '" = ' + bytesCounter.count(databaseName) + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At...
You are talking about the size of the fiware-service but calculating the size of the database. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed ;) Good catch! Updating it ;) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 0b832b3 |
||
' bytes) and/or the COLLECTION_PREFIX ("' + sthConfig.COLLECTION_PREFIX + '" = ' + bytesCounter.count(sthConfig.COLLECTION_PREFIX) + | ||
' bytes)', | ||
{ | ||
operationType: sthConfig.OPERATION_TYPE.DB_LOG | ||
} | ||
); | ||
return null; | ||
} | ||
return sthConfig.COLLECTION_PREFIX + generateHash(collectionName4Events, limit); | ||
} else { | ||
return sthConfig.COLLECTION_PREFIX + collectionName4Events; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems we are not controlling the size of the collection in this case. I think we agreed on truncating the collection name if it exceeded 120 bytes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops, sorry! I didn't understand it that way :p In fact, in my opinion it would be kind of problematic for us to truncate the collection name (magically from a user point of view) due to possible collisions in the collection names :) In fact, I would be more in favor of logging an informative message in case the namespace takes more than 120 bytes. Currently, we are logging the message MongoDB returns but we could do something similar to what we do with the hashing option. I mean, informing the user that the namespace (DB_PREFIX + service + COLLECTION_PREFIX + collectionName + '.aggr') takes more than 120 bytes and consequently the MongoDB collection won't be created :) What do you think @frbattid and @iariasleon ? Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's OK for me. I mean, truncation was never implemented in Cygnus when the maximum length was reached with any other backend, and a log message was printed. Nevertheless, I thought we had decided to start truncating, my fault! Thus, the only thing missing is checking the length and warning about that, right? Or is it done outside this function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
In fact, I just did it since it was straight forward until we decide about it :) Currently, for the hashing case, the following message is logged in case the bytes available for the hash are less than 20 bytes:
On the other hand, for the other case, the following message is logged in case the namespace (the concatenation of the database name and the collection name for the aggregated data) is bigger than 120 bytes:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have not pushed these last changes until we decide about it ;) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Push them! :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pushed and included in 7f4824a |
||
} | ||
} | ||
|
||
/**
 * Computes how many bytes are left over for the hash that is used as part of
 * the collection names, once the fixed parts of the namespace are accounted for.
 * The budget starts at MAX_NAMESPACE_SIZE_IN_BYTES and is reduced by the byte
 * size of the database name, of the configured COLLECTION_PREFIX and of the
 * '.aggr' suffix, plus one extra reserved byte.
 * @param {string} databaseName The database name
 * @return {number} The number of bytes available for the hash (it may be
 *  smaller than the minimum required, callers are expected to check it)
 */
function getHashSizeInBytes(databaseName) {
  var databaseNameBytes = bytesCounter.count(databaseName);
  var collectionPrefixBytes = bytesCounter.count(sthConfig.COLLECTION_PREFIX);
  var aggregatedSuffixBytes = bytesCounter.count('.aggr');
  // NOTE(review): presumably the '.' separating the database name from the
  // collection name in the MongoDB namespace - confirm
  var reservedBytes = 1;

  return MAX_NAMESPACE_SIZE_IN_BYTES - databaseNameBytes - collectionPrefixBytes -
    aggregatedSuffixBytes - reservedBytes;
}
|
||
/** | ||
* Return the name of the collection which will store the aggregated data | ||
* @param {string} databaseName The database name | ||
* @param {string} servicePath The service path of the entity the event is related to | ||
* @param {string} entityId The entity id related to the event | ||
* @param {string} entityType The type of entity related to the event | ||
* @param {string} attrName The attribute id related to the event | ||
* @returns {string} The collection name | ||
*/ | ||
function getCollectionName4Aggregated(servicePath, entityId, entityType, | ||
function getCollectionName4Aggregated(databaseName, servicePath, entityId, entityType, | ||
attrName) { | ||
return getCollectionName4Events( | ||
servicePath, entityId, entityType, attrName) + '.aggr'; | ||
var collectionName4Events = getCollectionName4Events( | ||
databaseName, servicePath, entityId, entityType, attrName); | ||
if (collectionName4Events) { | ||
return collectionName4Events + '.aggr'; | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
/** | ||
* Returns a reference to a collection of the database asynchronously | ||
* @param {string} databaseName The database's name | ||
* @param {string} collectionName The collection's name | ||
* @param {object} params Params (service, service path, entity, attribute or collection) for which the collection | ||
* should be returned | ||
* @param {boolean} isAggregated Flag indicating if the aggregated collection is desired. If false, the raw data | ||
* collection is the one required | ||
* @param {boolean} shouldCreate Flag indicating if the collection should be created | ||
* if it does not exist | ||
* @param {boolean} shouldStoreHash Flag indicating if the collection hash should be stored in case the collection | ||
* is created | ||
* @param {Function} callback Callback function to be called with the results | ||
*/ | ||
function getCollection(databaseName, collectionName, shouldCreate, callback) { | ||
function getCollection(params, isAggregated, shouldCreate, shouldStoreHash, callback) { | ||
var databaseName = getDatabase(params.service); | ||
|
||
shouldStoreHash = sthConfig.SHOULD_HASH && shouldStoreHash; | ||
|
||
var collectionName; | ||
if (params.collection) { | ||
collectionName = params.collection; | ||
} else { | ||
collectionName = isAggregated ? | ||
getCollectionName4Aggregated(databaseName, params.servicePath, params.entityId, params.entityType, | ||
params.attrName) : | ||
getCollectionName4Events(databaseName, params.servicePath, params.entityId, params.entityType, | ||
params.attrName); | ||
} | ||
|
||
if (!collectionName) { | ||
var error = boom.badRequest('The collection name could not be generated'); | ||
return process.nextTick(callback.bind(null, error)); | ||
} | ||
|
||
// Switch to the right database | ||
var connection = mongoose.connection.useDb(databaseName); | ||
|
||
|
@@ -226,6 +297,17 @@ | |
shouldCreate) { | ||
connection.db.createCollection(collectionName, | ||
function (err, collection) { | ||
if (!err && shouldStoreHash) { | ||
storeCollectionHash(params, isAggregated, collectionName, function(err) { | ||
if (err) { | ||
// There was an error when storing the collection hash | ||
// Do nothing | ||
sthLogger.warn('Error when storing the hash generated as part of the collection name into the database', { | ||
operationType: sthConfig.OPERATION_TYPE.DB_LOG | ||
}); | ||
} | ||
}); | ||
} | ||
if (err && err.message === 'collection already exists') { | ||
// We have observed that although leaving the strict option to the default value, sometimes | ||
// we get a 'collection already exists' error when executing connection.db#createCollection() | ||
|
@@ -809,6 +891,84 @@ | |
}); | ||
} | ||
|
||
/**
 * Generates a hash (SHA-512, hex encoded) based on an input and, optionally,
 * a maximum number of bytes.
 * @param {string} input The input to generate the hash from
 * @param {number} [limit] The maximum number of bytes of the hash. If it is
 *  falsy (not provided, 0 or NaN) the complete 128 character digest is returned
 * @return {string} The hash, truncated to at most `limit` characters
 */
function generateHash(input, limit) {
  var hash = crypto.createHash('sha512').update(input).digest('hex');
  if (limit) {
    // The hex digest only contains ASCII characters, so 1 character equals 1 byte.
    // substring() is used instead of the deprecated substr(); both return an
    // empty string for a negative limit and the whole digest for an oversized one.
    hash = hash.substring(0, limit);
  }
  return hash;
}
|
||
/** | ||
* Stores the collection name (hash) in the database | ||
* @param params The params used to generate the collection name (hash) | ||
* @param hash The generated hash used as part of the collection names | ||
* @param callback A callback function | ||
*/ | ||
function storeCollectionHash(params, isAggregated, hash, callback) { | ||
getCollection({ | ||
service: params.service, | ||
collection: sthConfig.COLLECTION_PREFIX + 'collection_names' | ||
}, false, true, false, function(err, collection) { | ||
if (err) { | ||
return callback(err); | ||
} | ||
// 2 updates operations are needed since MongoDB currently does not support the possibility | ||
// to address the same field in a $set operation as a $setOnInsert operation | ||
collection.update( | ||
{ | ||
_id: hash | ||
}, | ||
{ | ||
'$setOnInsert': { | ||
dataModel: sthConfig.DATA_MODEL, | ||
isAggregated: isAggregated, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is :) I only included it to ease and to speed up queries :) Otherwise, users would have to use text searching in the _id property of the collection which maps hashes to service path, entity, etc., values... This would force us to create a text index on that property (see http://docs.mongodb.org/manual/reference/operator/query/text/) which, apart from consuming a namespace (this is not important :D), can end up consuming a lot of disk and memory (this is the drawback of text indexes, and the reason why they are not recommended if not forced to really use them :)) What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I see... You convinced me :) NTC |
||
service: params.service, | ||
servicePath: params.servicePath, | ||
entityId: params.entityId, | ||
entityType: params.entityType, | ||
attrName: params.attrName | ||
} | ||
}, { | ||
upsert: true | ||
}, | ||
function(err) { | ||
if (err && callback) { | ||
return callback(err); | ||
} | ||
collection.update( | ||
{ | ||
_id: hash | ||
}, | ||
{ | ||
'$set': { | ||
dataModel: sthConfig.DATA_MODEL, | ||
isAggregated: isAggregated, | ||
service: params.service, | ||
servicePath: params.servicePath, | ||
entityId: params.entityId, | ||
entityType: params.entityType, | ||
attrName: params.attrName | ||
} | ||
}, | ||
function(err) { | ||
if (callback) { | ||
return callback(err); | ||
} | ||
} | ||
); | ||
} | ||
); | ||
}); | ||
} | ||
|
||
module.exports = function (theSthConfig, theSthLogger, theSthHelper) { | ||
sthConfig = theSthConfig; | ||
sthLogger = theSthLogger; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We agreed to eliminate all information related to data models
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But not in this PR, right? :) I wanted to include everything needed and in a new PR delete all the information just to have that information together in case someone asks us to get it back... :p
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 not in this PR